From fff2bad766ef6e20a8d308f2d22cb126df98d61f Mon Sep 17 00:00:00 2001 From: Johan Engelen Date: Sat, 2 Dec 2023 09:20:09 +0100 Subject: [PATCH] Add ldc-profgen tool (#4536) Also fixes cross build action: asserts yes/no must be same as host package (never asserts for android cross build) --- .github/actions/3-build-cross/action.yml | 16 +- .github/actions/5-install/action.yml | 2 +- CHANGELOG.md | 3 +- CMakeLists.txt | 2 +- tests/CMakeLists.txt | 1 + tests/lit.site.cfg.in | 2 + tools/CMakeLists.txt | 26 + tools/README.md | 2 + .../ldc-profgen-14.0/CMakeLists.txt | 23 + .../ldc-profgen-14.0/CSPreInliner.cpp | 285 ++++ .../ldc-profgen-14.0/CSPreInliner.h | 95 ++ .../ldc-profgen-14.0/CallContext.h | 59 + .../ldc-profgen-14.0/ErrorHandling.h | 56 + .../ldc-profgen-14.0/PerfReader.cpp | 1222 ++++++++++++++++ .../ldc-profgen/ldc-profgen-14.0/PerfReader.h | 728 ++++++++++ .../ldc-profgen-14.0/ProfileGenerator.cpp | 979 +++++++++++++ .../ldc-profgen-14.0/ProfileGenerator.h | 312 ++++ .../ldc-profgen-14.0/ProfiledBinary.cpp | 790 ++++++++++ .../ldc-profgen-14.0/ProfiledBinary.h | 541 +++++++ .../ldc-profgen-14.0/llvm-profgen.cpp | 164 +++ .../ldc-profgen-15.0/CMakeLists.txt | 23 + .../ldc-profgen-15.0/CSPreInliner.cpp | 303 ++++ .../ldc-profgen-15.0/CSPreInliner.h | 90 ++ .../ldc-profgen-15.0/CallContext.h | 59 + .../ldc-profgen-15.0/ErrorHandling.h | 56 + .../ldc-profgen-15.0/PerfReader.cpp | 1196 ++++++++++++++++ .../ldc-profgen/ldc-profgen-15.0/PerfReader.h | 742 ++++++++++ .../ldc-profgen-15.0/ProfileGenerator.cpp | 1243 ++++++++++++++++ .../ldc-profgen-15.0/ProfileGenerator.h | 383 +++++ .../ldc-profgen-15.0/ProfiledBinary.cpp | 889 ++++++++++++ .../ldc-profgen-15.0/ProfiledBinary.h | 577 ++++++++ .../ldc-profgen-15.0/llvm-profgen.cpp | 192 +++ .../ldc-profgen-16.0/CMakeLists.txt | 25 + .../ldc-profgen-16.0/CSPreInliner.cpp | 303 ++++ .../ldc-profgen-16.0/CSPreInliner.h | 90 ++ .../ldc-profgen-16.0/CallContext.h | 59 + .../ldc-profgen-16.0/ErrorHandling.h | 56 + 
.../ldc-profgen-16.0/MissingFrameInferrer.cpp | 318 ++++ .../ldc-profgen-16.0/MissingFrameInferrer.h | 116 ++ .../ldc-profgen-16.0/PerfReader.cpp | 1208 ++++++++++++++++ .../ldc-profgen/ldc-profgen-16.0/PerfReader.h | 742 ++++++++++ .../ldc-profgen-16.0/ProfileGenerator.cpp | 1270 ++++++++++++++++ .../ldc-profgen-16.0/ProfileGenerator.h | 390 +++++ .../ldc-profgen-16.0/ProfiledBinary.cpp | 978 +++++++++++++ .../ldc-profgen-16.0/ProfiledBinary.h | 585 ++++++++ .../ldc-profgen-16.0/llvm-profgen.cpp | 190 +++ .../ldc-profgen-17.0/CMakeLists.txt | 25 + .../ldc-profgen-17.0/CSPreInliner.cpp | 323 +++++ .../ldc-profgen-17.0/CSPreInliner.h | 97 ++ .../ldc-profgen-17.0/CallContext.h | 59 + .../ldc-profgen-17.0/ErrorHandling.h | 56 + .../ldc-profgen-17.0/MissingFrameInferrer.cpp | 316 ++++ .../ldc-profgen-17.0/MissingFrameInferrer.h | 116 ++ .../ldc-profgen-17.0/PerfReader.cpp | 1206 ++++++++++++++++ .../ldc-profgen/ldc-profgen-17.0/PerfReader.h | 742 ++++++++++ .../ldc-profgen-17.0/ProfileGenerator.cpp | 1274 +++++++++++++++++ .../ldc-profgen-17.0/ProfileGenerator.h | 390 +++++ .../ldc-profgen-17.0/ProfiledBinary.cpp | 977 +++++++++++++ .../ldc-profgen-17.0/ProfiledBinary.h | 589 ++++++++ .../ldc-profgen-17.0/llvm-profgen.cpp | 193 +++ 60 files changed, 23749 insertions(+), 5 deletions(-) create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/CMakeLists.txt create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.h create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/CallContext.h create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/ErrorHandling.h create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/PerfReader.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/PerfReader.h create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.h create mode 100644 
tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.h create mode 100644 tools/ldc-profgen/ldc-profgen-14.0/llvm-profgen.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/CMakeLists.txt create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/CallContext.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/ErrorHandling.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/PerfReader.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/PerfReader.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.h create mode 100644 tools/ldc-profgen/ldc-profgen-15.0/llvm-profgen.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/CMakeLists.txt create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/CallContext.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/ErrorHandling.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/PerfReader.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/PerfReader.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.h create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.h create mode 100644 
tools/ldc-profgen/ldc-profgen-16.0/llvm-profgen.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/CMakeLists.txt create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/CallContext.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/ErrorHandling.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/PerfReader.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/PerfReader.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.cpp create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.h create mode 100644 tools/ldc-profgen/ldc-profgen-17.0/llvm-profgen.cpp diff --git a/.github/actions/3-build-cross/action.yml b/.github/actions/3-build-cross/action.yml index fb1473281da..51acd708d69 100644 --- a/.github/actions/3-build-cross/action.yml +++ b/.github/actions/3-build-cross/action.yml @@ -33,8 +33,20 @@ runs: else tag=CI fi + + if [[ '${{ inputs.os }}' == android ]]; then + assertsSuffix="" + else + # Use assertions for untagged builds. Must be the same as for the host LLVM package, because + # llvm-config of host package will be used for the cross build configuration. 
+ assertsSuffix="-withAsserts" + if [[ '${{ github.ref }}' = refs/tags/* ]]; then + assertsSuffix="" + fi + fi + curl -fL --retry 3 --max-time 300 -o llvm-cross.tar.xz \ - https://github.com/ldc-developers/llvm-project/releases/download/$tag/llvm-$version-${{ inputs.os }}-${{ inputs.arch }}.tar.xz + https://github.com/ldc-developers/llvm-project/releases/download/$tag/llvm-$version-${{ inputs.os }}-${{ inputs.arch }}$assertsSuffix.tar.xz mkdir llvm-cross tar -xf llvm-cross.tar.xz --strip 1 -C llvm-cross rm llvm-cross.tar.xz @@ -145,4 +157,4 @@ runs: ${{ inputs.cmake_flags }} ${{ inputs.with_pgo == 'true' && '-DDFLAGS_LDC=-fprofile-use=../pgo-ldc/merged.profdata' || '' }} ${{ env.CROSS_CMAKE_FLAGS }} - build_targets: ldc2 ldmd2 ldc-build-runtime ldc-build-plugin ldc-profdata ldc-prune-cache timetrace2txt + build_targets: ldc2 ldmd2 ldc-build-runtime ldc-build-plugin ldc-profdata ldc-profgen ldc-prune-cache timetrace2txt diff --git a/.github/actions/5-install/action.yml b/.github/actions/5-install/action.yml index f9388388db1..5f600fdca9a 100644 --- a/.github/actions/5-install/action.yml +++ b/.github/actions/5-install/action.yml @@ -21,7 +21,7 @@ runs: cd .. else mkdir -p install/bin - cp build-cross/bin/{ldc2,ldmd2,ldc-build-runtime,ldc-profdata,ldc-prune-cache,timetrace2txt} install/bin/ + cp build-cross/bin/{ldc2,ldmd2,ldc-build-runtime,ldc-profdata,ldc-profgen,ldc-prune-cache,timetrace2txt} install/bin/ cp build-cross/bin/ldc-build-plugin install/bin/ || true cp -R build-cross-libs/lib install/ cp build-cross/lib/{libldc_rt.*,libLTO-ldc.dylib,LLVMgold-ldc.so} install/lib/ || true diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e8b7007822..406b415215c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ #### Big news - Frontend, druntime and Phobos are at version [2.106.0](https://dlang.org/changelog/2.106.0.html). (#4522) - New command-line options `-fno-{exceptions,moduleinfo,rtti}` to selectively enable some `-betterC` effects. 
(#4522) -- New command-line option `-fprofile-sample-use` for using sample-based profile data for optimization. Functionality and usage is identical to Clang's option with same name. (#4531). +- New command-line option `-fprofile-sample-use` for using sample-based profile data for optimization. Functionality and usage is identical to Clang's option with same name. (#4531) +- New `ldc-profgen` tool for sample-based PGO, a copy of LLVM's [llvm-profgen](https://llvm.org/docs/CommandGuide/llvm-profgen.html). (#4536) #### Platform support diff --git a/CMakeLists.txt b/CMakeLists.txt index d42012cad6b..a942156dc28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ find_package(LLVM 11.0 REQUIRED instcombine ipo instrumentation irreader libdriver linker lto mc mcdisassembler mcparser objcarcopts object option profiledata scalaropts selectiondag support tablegen target transformutils vectorize - windowsdriver windowsmanifest ${EXTRA_LLVM_MODULES}) + windowsdriver windowsmanifest symbolize ${EXTRA_LLVM_MODULES}) math(EXPR LDC_LLVM_VER ${LLVM_VERSION_MAJOR}*100+${LLVM_VERSION_MINOR}) message(STATUS "Using LLVM Version ${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}") # Remove LLVMTableGen library from list of libraries diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e6ce8521428..fc988f59560 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,6 @@ set( LDC2_BIN ${PROJECT_BINARY_DIR}/bin/${LDC_EXE} ) set( LDCPROFDATA_BIN ${PROJECT_BINARY_DIR}/bin/ldc-profdata ) +set( LDCPROFGEN_BIN ${PROJECT_BINARY_DIR}/bin/ldc-profgen ) set( LDCPRUNECACHE_BIN ${PROJECT_BINARY_DIR}/bin/${LDCPRUNECACHE_EXE} ) set( LDCBUILDPLUGIN_BIN ${PROJECT_BINARY_DIR}/bin/${LDC_BUILD_PLUGIN_EXE} ) set( TIMETRACE2TXT_BIN ${PROJECT_BINARY_DIR}/bin/${TIMETRACE2TXT_EXE} ) diff --git a/tests/lit.site.cfg.in b/tests/lit.site.cfg.in index 90b81aa4600..9b0b9c84d81 100644 --- a/tests/lit.site.cfg.in +++ b/tests/lit.site.cfg.in @@ -16,6 +16,7 @@ OFF = False ## 
Auto-initialized variables by cmake: config.ldc2_bin = "@LDC2_BIN@" config.ldcprofdata_bin = "@LDCPROFDATA_BIN@" +config.ldcprofgen_bin = "@LDCPROFGEN_BIN@" config.ldcprunecache_bin = "@LDCPRUNECACHE_BIN@" config.ldcbuildplugin_bin = "@LDCBUILDPLUGIN_BIN@" config.timetrace2txt_bin = "@TIMETRACE2TXT_BIN@" @@ -156,6 +157,7 @@ config.environment['PATH'] = path config.substitutions.append( ('%ldc', config.ldc2_bin) ) config.substitutions.append( ('%gnu_make', config.gnu_make_bin) ) config.substitutions.append( ('%profdata', config.ldcprofdata_bin) ) +config.substitutions.append( ('%profgen', config.ldcprofgen_bin) ) config.substitutions.append( ('%prunecache', config.ldcprunecache_bin) ) config.substitutions.append( ('%buildplugin', config.ldcbuildplugin_bin + " --ldcSrcDir=" + config.ldc2_source_dir ) ) config.substitutions.append( ('%timetrace2txt', config.timetrace2txt_bin) ) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index eb510f3c940..facb939472d 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -40,6 +40,32 @@ else() message(WARNING "ldc-profdata source (${LDCPROFDATA_SRC}) not found") endif() +############################################################################# +# Build ldc-profgen utility that generates a profile data file from given perf script +# data files for sample-based profile guided optimization (-fprofile-sample-use). +# https://llvm.org/docs/CommandGuide/llvm-profgen.html +# The source in ldc-profgen/ldc-profgen-xx.x is an unmodified copy of llvm's llvm-profgen source dir. 
+if(LDC_LLVM_VER GREATER_EQUAL 1400) + macro(add_llvm_tool llvm_name) + string(REPLACE "llvm-" "ldc-" ldc_name ${llvm_name}) + message(STATUS "Configuring ${ldc_name} build target") + add_executable(${ldc_name} ${ARGN}) + set_target_properties( + ${ldc_name} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin + COMPILE_FLAGS "${LLVM_CXXFLAGS} ${LDC_CXXFLAGS}" + LINK_FLAGS "${SANITIZE_LDFLAGS}" + ) + target_link_libraries(${ldc_name} ${LLVM_LIBRARIES} ${CMAKE_DL_LIBS} ${LLVM_LDFLAGS}) + install(TARGETS ${ldc_name} DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) + endmacro() + if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ldc-profgen/ldc-profgen-${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}) + add_subdirectory(ldc-profgen/ldc-profgen-${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}) + else() + message(WARNING "ldc-profgen source not found (${CMAKE_CURRENT_SOURCE_DIR}/ldc-profgen/ldc-profgen-${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR})") + endif() +endif() + ############################################################################# # Build timetrace2txt set(TIMETRACE2TXT_EXE timetrace2txt) diff --git a/tools/README.md b/tools/README.md index 53d3c480e11..c4cd887af88 100644 --- a/tools/README.md +++ b/tools/README.md @@ -7,4 +7,6 @@ The `/tools` directory contains user tools that accompany LDC and that should be `ldc-profdata` converts raw profiling data to a profile data format that can be used by LDC. The source is copied from LLVM (`llvm-profdata`), and is versioned for each LLVM version that we support because the version has to match exactly with LDC's LLVM version. +`ldc-profgen` converts perf sample profiling data to a profile data format that can be used by LDC. The source is copied from LLVM (`llvm-profgen`), and is versioned for each LLVM version that we support because the version has to match exactly with LDC's LLVM version. 
+ `timetrace2txt` converts the .timetrace output of `--ftime-trace` (which is in [Chromium's trace event JSON format](https://www.chromium.org/developers/how-tos/trace-event-profiling-tool/)) to a text file that is easier for humans to read. diff --git a/tools/ldc-profgen/ldc-profgen-14.0/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-14.0/CMakeLists.txt new file mode 100644 index 00000000000..b3e05a94856 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/CMakeLists.txt @@ -0,0 +1,23 @@ + +set(LLVM_LINK_COMPONENTS + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + DebugInfoDWARF + Core + MC + IPO + MCDisassembler + Object + ProfileData + Support + Symbolize + ) + +add_llvm_tool(llvm-profgen + llvm-profgen.cpp + PerfReader.cpp + CSPreInliner.cpp + ProfiledBinary.cpp + ProfileGenerator.cpp + ) diff --git a/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.cpp b/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.cpp new file mode 100644 index 00000000000..1e642639902 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.cpp @@ -0,0 +1,285 @@ +//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include +#include + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. 
+extern cl::opt SampleHotCallSiteThreshold; +extern cl::opt SampleColdCallSiteThreshold; +extern cl::opt ProfileInlineGrowthLimit; +extern cl::opt ProfileInlineLimitMin; +extern cl::opt ProfileInlineLimitMax; +extern cl::opt SortProfiledSCC; + +cl::opt EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); + +static cl::opt SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +CSPreInliner::CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, + uint64_t HotThreshold, uint64_t ColdThreshold) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Profiles, nullptr), ProfileMap(Profiles), Binary(Binary), + HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. + if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; +} + +std::vector CSPreInliner::buildTopDownOrder() { + std::vector Order; + ProfiledCallGraph ProfiledCG(ContextTracker); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. 
+ scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextFor(CallerSamples->getContext()); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. 
+ HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(*CalleeSamples); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const FunctionSamples &FSamples) { + if (UseContextCost) { + return Binary.getFuncSizeForContext(FSamples.getContext()); + } + + return FSamples.getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return Candidate.CalleeSamples->getContext().hasAttribute( + ContextWasInlined); + + // Adjust threshold based on call site hotness, only do this for callsite + // prioritized inliner because otherwise cost-benefit check is done earlier. + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + if (Candidate.CallsiteCount > HotCountThreshold) + SampleThreshold = SampleHotCallSiteThreshold; + + // TODO: for small cold functions, we may inlined them and we need to keep + // context profile accordingly. 
+ if (Candidate.CallsiteCount < ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const StringRef Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = getFuncSize(*FSamples); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG(dbgs() << (ShouldInline ? 
" Inlined" : " Outlined") + << " context profile for: " + << Candidate.CalleeSamples->getContext().toString() + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " << Candidate.CalleeSamples->getContext().toString() + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? "inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleProfileMap &Profiles, bool IsInput) { + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Profiles.size() << " total):\n"; + for (auto &It : Profiles) { + const FunctionSamples &Samples = It.second; + dbgs() << " [" << Samples.getContext().toString() << "] " + << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() + << "\n"; + } + }; +#endif + + LLVM_DEBUG(printProfileNames(ProfileMap, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. 
+ // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (StringRef FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + std::vector ProfilesToBeRemoved; + for (auto &It : ProfileMap) { + SampleContext &Context = It.second.getContext(); + if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { + assert(Context.hasState(MergedContext) && + "Not inlined context profile should be merged already"); + ProfilesToBeRemoved.push_back(It.first); + } + } + + for (auto &ContextName : ProfilesToBeRemoved) { + ProfileMap.erase(ContextName); + } + + // Make sure ProfileMap's key is consistent with FunctionSamples' name. + SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); + + LLVM_DEBUG(printProfileNames(ProfileMap, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.h new file mode 100644 index 00000000000..9f63f7ef7be --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/CSPreInliner.h @@ -0,0 +1,95 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
+ uint64_t SizeCost; +}; + +// Inline candidate comparer using call site weight +struct ProfiledCandidateComparer { + bool operator()(const ProfiledInlineCandidate &LHS, + const ProfiledInlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + if (LHS.SizeCost != RHS.SizeCost) + return LHS.SizeCost > RHS.SizeCost; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using ProfiledCandidateQueue = + PriorityQueue, + ProfiledCandidateComparer>; + +// Pre-compilation inliner based on context-sensitive profile. +// The PreInliner estimates inline decision using hotness from profile +// and cost estimation from machine code size. It helps merges context +// profile globally and achieves better post-inine profile quality, which +// otherwise won't be possible for ThinLTO. It also reduce context profile +// size by only keep context that is estimated to be inlined. +class CSPreInliner { +public: + CSPreInliner(SampleProfileMap &Profiles, ProfiledBinary &Binary, + uint64_t HotThreshold, uint64_t ColdThreshold); + void run(); + +private: + bool getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *FCallerContextSamples); + std::vector buildTopDownOrder(); + void processFunction(StringRef Name); + bool shouldInline(ProfiledInlineCandidate &Candidate); + uint32_t getFuncSize(const FunctionSamples &FSamples); + bool UseContextCost; + SampleContextTracker ContextTracker; + SampleProfileMap &ProfileMap; + ProfiledBinary &Binary; + + // Count thresholds to answer isHotCount and isColdCount queries. + // Mirrors the threshold in ProfileSummaryInfo. 
+ uint64_t HotCountThreshold; + uint64_t ColdCountThreshold; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/CallContext.h b/tools/ldc-profgen/ldc-profgen-14.0/CallContext.h new file mode 100644 index 00000000000..5e552130d03 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/CallContext.h @@ -0,0 +1,59 @@ +//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H +#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H + +#include "llvm/ProfileData/SampleProf.h" +#include +#include +#include + +namespace llvm { +namespace sampleprof { + +inline std::string getCallSite(const SampleContextFrame &Callsite) { + std::string CallsiteStr = Callsite.FuncName.str(); + CallsiteStr += ":"; + CallsiteStr += Twine(Callsite.Location.LineOffset).str(); + if (Callsite.Location.Discriminator > 0) { + CallsiteStr += "."; + CallsiteStr += Twine(Callsite.Location.Discriminator).str(); + } + return CallsiteStr; +} + +// TODO: This operation is expansive. If it ever gets called multiple times we +// may think of making a class wrapper with internal states for it. +inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. 
+inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-14.0/ErrorHandling.h new file mode 100644 index 00000000000..b797add8a89 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... 
Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.cpp new file mode 100644 index 00000000000..98b4c7cdf16 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.cpp @@ -0,0 +1,1222 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt SkipSymbolization("skip-symbolization", cl::init(false), + cl::ZeroOrMore, + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt ShowMmapEvents("show-mmap-events", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print binary load events.")); + +static cl::opt + UseOffset("use-offset", cl::init(true), cl::ZeroOrMore, + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", cl::init(false), cl::ZeroOrMore, + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. 
By default " + "first executable segment address is used")); + +static cl::opt + IgnoreStackSamples("ignore-stack-samples", cl::init(false), cl::ZeroOrMore, + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt ShowDetailedWarning("show-detailed-warning", cl::init(false), + cl::ZeroOrMore, + cl::desc("Show detailed warning message.")); + +extern cl::opt PerfTraceFilename; +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + // An artificial return should push an external frame and an artificial call + // will match it and pop the external frame so that the context before and + // after the external call will be the same. + if (State.getCurrentLBR().IsArtificial) { + NumExtCallBranch++; + // A return is matched and pop the external frame. + if (State.getParentFrame()->isExternalFrame()) { + State.popFrame(); + } else { + // An artificial return is missing, it happens that the sample is just hit + // in the middle of the external code. In this case, the leading branch is + // a call to external, we just keep unwinding use a context-less stack. + if (State.getParentFrame() != State.getDummyRootPtr()) + NumMissingExternalFrame++; + State.clearCallStack(); + State.pushFrame(Source); + State.InstPtr.update(Source); + return; + } + } + + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. 
+ if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (State.getCurrentLBR().IsArtificial) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... 
main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + // Push an external frame for the case of returning to external + // address(callback); later, if an artificial call is matched, it will be + // popped. This is to 1) avoid context being interrupted by callback, + // context before or after the callback should be the same. 2) the call stack + // of function called by callback should be truncated which is done during + // recording the context on trie. For example: + // main (call)--> foo (call)--> callback (call)--> bar (return)--> callback + // (return)--> foo (return)--> main + // Context for bar should not include main and foo. + // For the code of foo, the context of before and after callback should both + // be [foo, main]. + if (LBR.IsArtificial) + State.pushFrame(ExternalAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. 
+ // This is only for intra function branches, excluding tail calls. + uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + if (KeyStr->Context.empty()) + return nullptr; + return KeyStr; +} + +std::shared_ptr ProbeStack::getContextKey() { + std::shared_ptr ProbeBasedKey = + std::make_shared(); + for (auto CallProbe : Stack) { + ProbeBasedKey->Probes.emplace_back(CallProbe); + } + CSProfileGenerator::compressRecursionContext( + ProbeBasedKey->Probes); + CSProfileGenerator::trimContext( + ProbeBasedKey->Probes); + return ProbeBasedKey; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &Item : Cur->RangeSamples) { + uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item)); + } + + for (auto &Item : Cur->BranchSamples) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item)); + } +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. 
+ if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. + if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + ProbeStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.IsArtificial || Branch.Target == ExternalAddr) + return; + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. 
+ if (!State.validateInitialState()) + return false; + + // Now process the LBR samples in parallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf and the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return instruction + unwindReturn(State); + } else { + // Unwind branches + // For regular intra function branches, we only need to record branch with + // context. For an artificial branch cross function boundaries, we got an + // issue with returning to external code. Take the two LBR entries for + // example: [foo:8(RETURN), ext:1] [ext:3(CALL), bar:1] After perf reader, + // we only get [foo:8(RETURN), bar:1], unwinder will be confused like foo + // return to bar. Here we detect and treat this case as BRANCH instead of + // RETURN which only update the source address. + unwindBranch(State); + } + State.advanceLBR(); + // Record `branch` with calling context after unwinding. 
+ recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput) { + std::unique_ptr PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. + if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset(new HybridPerfReader(Binary, PerfInput.InputFile)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. 
+ auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + std::string PerfTraceFile = PerfData.str() + ".script.tmp"; + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + Optional Redirects[] = {llvm::None, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(PerfTraceFile)}; // Stderr + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, llvm::None, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::string PIDs; + std::unordered_set PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMap2Event(TraceIt.getCurrentLine()) && + extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + + // Run perf script again to retrieve events for PIDs collected above + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "ip,brstack", "--pid", + PIDs, "-i", PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, llvm::None, Redirects); + + return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent}; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath); + if (Binary->getName() != BinaryName) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at 
different + // place, so update binary load address. + // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = std::lower_bound(Offsets.begin(), Offsets.end(), Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. + auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. + --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast(K)) { + SampleContextFrameVector ContextStack; + for (const auto *Probe : CtxKey->Probes) { + Binary->getInlineContextForProbe(Probe, ContextStack, true); + } + // Probe context key at this point does not have leaf probe, so do not + // include the leaf inline location. 
+ return SampleContext::getContextString(ContextStack, true); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. + if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and separated by whitespace. 
+ SmallVector Records; + TraceIt.getCurrentLine().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + uint64_t LeadingAddr; + if (!Records.empty() && !Records[0].contains('/')) { + if (Records[0].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index = 1; + } + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + uint64_t PrevTrDst = 0; + + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + bool IsExternal = !SrcIsInternal && !DstIsInternal; + bool IsIncoming = !SrcIsInternal && DstIsInternal; + bool IsOutgoing = SrcIsInternal && !DstIsInternal; + bool IsArtificial = false; + + // Ignore branches outside the current binary. + if (IsExternal) { + if (!PrevTrDst && !LBRStack.empty()) { + WithColor::warning() + << "Invalid transfer to external code in LBR record at line " + << TraceIt.getLineNumber() << ": " << TraceIt.getCurrentLine() + << "\n"; + } + // Do not ignore the entire samples, the remaining LBR can still be + // unwound using a context-less stack. + continue; + } + + if (IsOutgoing) { + if (!PrevTrDst) { + // This is a leading outgoing LBR, we should keep processing the LBRs. 
+ if (LBRStack.empty()) { + NumLeadingOutgoingLBR++; + // Record this LBR since current source and next LBR's target is still + // a valid range. + LBRStack.emplace_back(LBREntry(Src, ExternalAddr, false)); + continue; + } + // This is middle unpaired outgoing jump which is likely due to + // interrupt or incomplete LBR trace. Ignore current and subsequent + // entries since they are likely in different contexts. + break; + } + + // For transition to external code, group the Source with the next + // available transition target. + Dst = PrevTrDst; + PrevTrDst = 0; + IsArtificial = true; + } else { + if (PrevTrDst) { + // If we have seen an incoming transition from external code to internal + // code, but not a following outgoing transition, the incoming + // transition is likely due to interrupt which is usually unpaired. + // Ignore current and subsequent entries since they are likely in + // different contexts. + break; + } + + if (IsIncoming) { + // For transition from external code (such as dynamic libraries) to + // the current binary, keep track of the branch target which will be + // grouped with the Source of the last transition from the current + // binary. + PrevTrDst = Dst; + continue; + } + } + + // TODO: filter out buggy duplicate branches on Skylake + + LBRStack.emplace_back(LBREntry(Src, Dst, IsArtificial)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. 
+ + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. + if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. 
We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. + Binary->setMissingMMapWarned(true); + } +} + +void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + // The raw hybrid sample started with call stack in FILO order and followed + // immediately by LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + std::shared_ptr Sample = std::make_shared(); + + // Parsing call stack and populate into PerfSample.CallStack + if (!extractCallstack(TraceIt, Sample->CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + + warnIfMissingMMap(); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + if (IgnoreStackSamples) { + Sample->CallStack.clear(); + } else { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample->CallStack.front() = Sample->LBRStack[0].Target; + } + // Record samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + +void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) { + std::error_code EC; + raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF); + if (EC) + exitWithError(EC, Filename); + writeUnsymbolizedProfile(OS); +} + +// Use ordered map to make the output deterministic +using OrderedCounterForPrint = std::map; + +void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) { + OrderedCounterForPrint OrderedCounters; + for (auto &CI : SampleCounters) { + OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second; + } + + auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator, + uint32_t Indent) { + OS.indent(Indent); + OS << Counter.size() << "\n"; + for (auto &I : Counter) { + uint64_t Start = I.first.first; + uint64_t End = I.first.second; + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + Start = Binary->offsetToVirtualAddr(Start); + End = 
Binary->offsetToVirtualAddr(End); + } + + if (UseOffset && UseLoadableSegmentAsBase) { + Start -= Binary->getFirstLoadableAddress(); + End -= Binary->getFirstLoadableAddress(); + } + + OS.indent(Indent); + OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":" + << I.second << "\n"; + } + }; + + for (auto &CI : OrderedCounters) { + uint32_t Indent = 0; + if (ProfileIsCSFlat) { + // Context string key + OS << "[" << CI.first << "]\n"; + Indent = 2; + } + + SampleCounter &Counter = *CI.second; + SCounterPrinter(Counter.RangeCounter, "-", Indent); + SCounterPrinter(Counter.BranchCounter, "->", Indent); + } +} + +// Format of input: +// number of entries in RangeCounter +// from_1-to_1:count_1 +// from_2-to_2:count_2 +// ...... +// from_n-to_n:count_n +// number of entries in BranchCounter +// src_1->dst_1:count_1 +// src_2->dst_2:count_2 +// ...... +// src_n->dst_n:count_n +void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt, + SampleCounter &SCounters) { + auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) { + std::string Msg = TraceIt.isAtEoF() + ? "Invalid raw profile!" 
+ : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + uint64_t BaseAddr = 0; + if (UseOffset && UseLoadableSegmentAsBase) + BaseAddr = Binary->getFirstLoadableAddress(); + + Source = Binary->virtualAddrToOffset(Source + BaseAddr); + Target = Binary->virtualAddrToOffset(Target + BaseAddr); + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr Key = + std::make_shared(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. 
+ if (Line.startswith("[")) { + ProfileIsCSFlat = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + assert(LBR.Source != ExternalAddr && + "Branch' source should not be an external address, it should be " + "converted to aritificial branch."); + uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); + uint64_t TargetOffset = LBR.Target == static_cast(ExternalAddr) + ? static_cast(ExternalAddr) + : Binary->virtualAddrToOffset(LBR.Target); + + if (!LBR.IsArtificial && TargetOffset != ExternalAddr) { + Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. + uint64_t StartOffset = TargetOffset; + if (EndOffeset != 0) + Counter.recordRangeCount(StartOffset, EndOffeset, Repeat); + EndOffeset = SourceOffset; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. 
+ assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const Pattern = + "PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + Regex RegMmap2(Pattern); + SmallVector Fields; + bool R = RegMmap2.match(Line, &Fields); + if (!R) { + std::string ErrorMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + exitWithError(ErrorMsg); + } + Fields[PID].getAsInteger(10, MMap.PID); + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + 
Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath); + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMap2Event(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMap2Event(TraceIt.getCurrentLine())) + parseMMap2Event(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].startswith("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMap2Event(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 does not appear at the beginning of the line + // for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP2"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +// Determine the perfscript contains hybrid samples(call stack + LBRs) by +// checking whether there is a non-empty call stack immediately followed by +// a LBR sample +PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) { + TraceStream TraceIt(FileName); + uint64_t FrameAddr = 0; + while (!TraceIt.isAtEoF()) { + // Skip the aggregated count + if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr)) + TraceIt.advance(); + + // Detect sample with call stack + int32_t Count = 0; + while (!TraceIt.isAtEoF() && + !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) { + Count++; + TraceIt.advance(); + } + if (!TraceIt.isAtEoF()) { + if (isLBRSample(TraceIt.getCurrentLine())) { + if (Count > 0) + return PerfContent::LBRStack; + else + return PerfContent::LBR; + } + TraceIt.advance(); + } + } + + exitWithError("Invalid perf script input!"); + return PerfContent::UnknownContent; +} + +void HybridPerfReader::generateUnsymbolizedProfile() { + ProfileIsCSFlat = !IgnoreStackSamples; + if (ProfileIsCSFlat) + unwindSamples(); + else + PerfScriptReader::generateUnsymbolizedProfile(); +} + +void PerfScriptReader::warnTruncatedStack() { + if (ShowDetailedWarning) { + for (auto Address : InvalidReturnAddresses) { + WithColor::warning() + << "Truncated stack sample due to invalid return address at " + << format("0x%" PRIx64, Address) + << ", likely caused by frame pointer omission\n"; + } + } + emitWarningSummary( + InvalidReturnAddresses.size(), AggregatedSamples.size(), + "of truncated stack samples due to invalid return address, " + "likely caused by frame pointer omission."); +} + +void PerfScriptReader::warnInvalidRange() { + std::unordered_map, uint64_t, + pair_hash> + Ranges; + + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + uint64_t Count = Item.second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceOffset = 
Binary->virtualAddrToOffset(LBR.Source); + uint64_t StartOffset = Binary->virtualAddrToOffset(LBR.Target); + if (EndOffeset != 0) + Ranges[{StartOffset, EndOffeset}] += Count; + EndOffeset = SourceOffset; + } + } + + if (Ranges.empty()) { + WithColor::warning() << "No samples in perf script!\n"; + return; + } + + auto WarnInvalidRange = + [&](uint64_t StartOffset, uint64_t EndOffset, StringRef Msg) { + if (!ShowDetailedWarning) + return; + WithColor::warning() + << "[" + << format("%8" PRIx64, Binary->offsetToVirtualAddr(StartOffset)) + << "," + << format("%8" PRIx64, Binary->offsetToVirtualAddr(EndOffset)) + << "]: " << Msg << "\n"; + }; + + const char *EndNotBoundaryMsg = "Range is not on instruction boundary, " + "likely due to profile and binary mismatch."; + const char *DanglingRangeMsg = "Range does not belong to any functions, " + "likely from PLT, .init or .fini section."; + const char *RangeCrossFuncMsg = + "Fall through range should not cross function boundaries, likely due to " + "profile and binary mismatch."; + + uint64_t InstNotBoundary = 0; + uint64_t UnmatchedRange = 0; + uint64_t RangeCrossFunc = 0; + + for (auto &I : Ranges) { + uint64_t StartOffset = I.first.first; + uint64_t EndOffset = I.first.second; + + if (!Binary->offsetIsCode(StartOffset) || + !Binary->offsetIsTransfer(EndOffset)) { + InstNotBoundary++; + WarnInvalidRange(StartOffset, EndOffset, EndNotBoundaryMsg); + } + + auto *FRange = Binary->findFuncRangeForOffset(StartOffset); + if (!FRange) { + UnmatchedRange++; + WarnInvalidRange(StartOffset, EndOffset, DanglingRangeMsg); + continue; + } + + if (EndOffset >= FRange->EndOffset) { + RangeCrossFunc++; + WarnInvalidRange(StartOffset, EndOffset, RangeCrossFuncMsg); + } + } + + uint64_t TotalRangeNum = Ranges.size(); + emitWarningSummary(InstNotBoundary, TotalRangeNum, + "of profiled ranges are not on instruction boundary."); + emitWarningSummary(UnmatchedRange, TotalRangeNum, + "of profiled ranges do not belong to any functions."); + 
emitWarningSummary(RangeCrossFunc, TotalRangeNum, + "of profiled ranges do cross function boundaries."); +} + +void PerfScriptReader::parsePerfTraces() { + // Parse perf traces and do aggregation. + parseAndAggregateTrace(); + + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + + // Generate unsymbolized profile. + warnTruncatedStack(); + warnInvalidRange(); + generateUnsymbolizedProfile(); + AggregatedSamples.clear(); + + if (SkipSymbolization) + writeUnsymbolizedProfile(OutputFilename); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.h b/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.h new file mode 100644 index 00000000000..9d84ad34bb3 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/PerfReader.h @@ -0,0 +1,728 @@ +//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. +enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. 
+struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + // An artificial branch stands for a series of consecutive branches starting + // from the current binary with a transition through external code and + // eventually landing back in the current binary. + bool IsArtificial = false; + LBREntry(uint64_t S, uint64_t T, bool I) + : Source(S), Target(T), IsArtificial(I) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + if (IsArtificial) + dbgs() << " Artificial"; + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. 
In the future, if we redesign a better hash +// function, then we can just skip this or switch to non-virtual function(like +// just ignore comparision if hash conflicts probabilities is low) +template class Hashable { +public: + std::shared_ptr Data; + Hashable(const std::shared_ptr &D) : Data(D) {} + + // Hash code generation + struct Hash { + uint64_t operator()(const Hashable &Key) const { + // Don't make it virtual for getHashCode + uint64_t Hash = Key.Data->getHashCode(); + assert(Hash && "Should generate HashCode for it!"); + return Hash; + } + }; + + // Hash equal + struct Equal { + bool operator()(const Hashable &LHS, const Hashable &RHS) const { + // Precisely compare the data, vtable will have overhead. + return LHS.Data->isEqual(RHS.Data.get()); + } + }; + + T *getPtr() const { return Data.get(); } +}; + +struct PerfSample { + // LBR stack recorded in FIFO order. + SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return 
false; + } + return true; + } + +#ifndef NDEBUG + void print() const { + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const 
SmallVector &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (LeafAddr < LBRLeaf || LeafAddr >= LBRLeaf + 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; 
} + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_ProbeBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Probe based context key as the intermediate key of context +// String based context key will introduce redundant string handling +// since the callee context is inferred from the context string which +// need to be splitted by '@' to get the last location frame, so we +// can just use probe instead and generate the string in the end. 
+struct ProbeBasedCtxKey : public ContextKey { + SmallVector Probes; + + ProbeBasedCtxKey() : ContextKey(CK_ProbeBased) {} + static bool classof(const ContextKey *K) { + return K->getKind() == CK_ProbeBased; + } + + bool isEqual(const ContextKey *K) const override { + const ProbeBasedCtxKey *O = dyn_cast(K); + assert(O != nullptr && "Probe based key shouldn't be null in isEqual"); + return std::equal(Probes.begin(), Probes.end(), O->Probes.begin(), + O->Probes.end()); + } + + void genHashCode() override { + for (const auto *P : Probes) { + HashCode = hash_combine(HashCode, P); + } + if (HashCode == 0) { + // Avoid zero value of HashCode when it's an empty list + HashCode = 1; + } + } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. 
+using RangeSample = std::map, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map, SampleCounter, + Hashable::Hash, Hashable::Equal>; + +struct FrameStack { + SmallVector Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct ProbeStack { + SmallVector Stack; + ProfiledBinary *Binary; + ProbeStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(Cur->Address); + // We may not find a probe for a merged or external callsite. + // Callsite merging may cause the loss of original probe IDs. + // Cutting off the context from here since the inliner will + // not know how to consume a context with unknown callsites. 
+ if (!CallProbe) + return false; + Stack.push_back(CallProbe); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + // Use pseudo probe based context key to get the sample counter + // A context stands for a call path from 'main' to an uninlined + // callee with all inline frames recovered on that path. The probes + // belonging to that call path is the probes either originated from + // the callee or from any functions inlined into the callee. Since + // pseudo probes are organized in a tri-tree style after decoded, + // the tree path from the tri-tree root (which is the uninlined + // callee) to the probe node forms an inline context. + // Here we use a list of probe(pointer) as the context key to speed up + // aggregation and the final context string will be generate in + // ProfileGenerator + std::shared_ptr getContextKey(); +}; + +/* +As in hybrid sample we have a group of LBRs and the most recent sampling call +stack, we can walk through those LBRs to infer more call stacks which would be +used as context for profile. VirtualUnwinder is the class to do the call stack +unwinding based on LBR state. Two types of unwinding are processd here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry(can be classified into call, return, regular +branch), LBR unwinding will replay the operation by pushing, popping or +switching leaf frame towards the call stack and since the initial call stack +is most recently sampled, the replay should be in anti-execution order, i.e. for +the regular case, pop the call stack when LBR is call, push frame on call stack +when LBR is return. After each LBR processed, it also needs to align with the +next LBR by going through instructions from previous LBR's target to current +LBR's source, which is the linear unwinding. 
As instruction from linear range +can come from different function by inlining, linear unwinding will do the range +splitting and record counters by the range with same inline context. Over those +unwinding process we will record each call stack as context id and LBR/linear +range as sample counter for further CS profile generation. +*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const PerfSample *Sample, uint64_t Repeat); + std::set &getUntrackedCallsites() { return UntrackedCallsites; } + + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + +private: + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in stack sample, we will + // use a specific tail call tracker to infer it. + return Binary->addressIsCall(State.getCurrentLBRSource()); + } + + bool isReturnState(UnwindState &State) const { + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call. + if (!Binary->addressIsReturn(State.getCurrentLBRSource())) + return false; + + // In a callback case, a return from internal code, say A, to external + // runtime can happen. The external runtime can then call back to + // another internal routine, say B. Making an artificial branch that + // looks like a return from A to B can confuse the unwinder to treat + // the instruction before B as the call instruction. Here we detect this + // case if the return target is not the next inst of call inst, then we just + // do not treat it as a return. 
+ uint64_t CallAddr = + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()); + return (CallAddr != 0); + } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. + Binary->setBaseAddress(Binary->getPreferredBaseAddress()); + }; + virtual ~PerfReaderBase() = default; + static std::unique_ptr create(ProfiledBinary *Binary, + PerfInputFile &PerfInput); + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() = 0; + const ContextSampleCounterMap &getSampleCounters() const { + return SampleCounters; + } + bool profileIsCSFlat() { return ProfileIsCSFlat; } + +protected: + ProfiledBinary *Binary = nullptr; + StringRef PerfTraceFile; + + ContextSampleCounterMap SampleCounters; + bool ProfileIsCSFlat = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; +}; + +// Read perf script to parse the events and samples. 
+class PerfScriptReader : public PerfReaderBase { +public: + PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace) + : PerfReaderBase(B, PerfTrace){}; + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() override; + // Generate perf script from perf data + static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File); + // Extract perf script type by peeking at the input + static PerfContent checkPerfScriptType(StringRef FileName); + +protected: + // The parsed MMap event + struct MMapEvent { + uint64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef BinaryPath; + }; + + // Check whether a given line is an LBR sample + static bool isLBRSample(StringRef Line); + // Check whether a given line is an MMAP event + static bool isMMap2Event(StringRef Line); + // Parse a single line of a PERF_RECORD_MMAP2 event looking for a + // mapping between the binary name and its memory layout. + static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Update base address based on mmap events + void updateBinaryAddress(const MMapEvent &Event); + // Parse mmap event and update binary address + void parseMMap2Event(TraceStream &TraceIt); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Warn if the relevant mmap event is missing. + void warnIfMissingMMap(); + // Emit accumulated warnings. + void warnTruncatedStack(); + // Warn if range is invalid. 
+ void warnInvalidRange(); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack); + uint64_t parseAggregatedCount(TraceStream &TraceIt); + // Parse one sample from multiple perf lines, override this for different + // sample type + void parseSample(TraceStream &TraceIt); + // An aggregated count is given to indicate how many times the sample is + // repeated. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; + void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + // Post process the profile after trace aggregation, we will do simple range + // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). + virtual void generateUnsymbolizedProfile(); + void writeUnsymbolizedProfile(StringRef Filename); + void writeUnsymbolizedProfile(raw_fd_ostream &OS); + + // Samples with their repeat counts generated by the perf reader + AggregatedCounter AggregatedSamples; + // Keep track of all invalid return addresses + std::set InvalidReturnAddresses; +}; + +/* + The reader of LBR only perf script. + A typical LBR sample looks like: + 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 +*/ +class LBRPerfReader : public PerfScriptReader { +public: + LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfScriptReader(Binary, PerfTrace){}; + // Parse the LBR only sample. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count) override; +}; + +/* + Hybrid perf script includes a group of hybrid samples(LBRs + call stack), + which is used to generate CS profile. An example of hybrid sample: + 4005dc # call stack leaf + 400634 + 400684 # call stack root + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +*/ +class HybridPerfReader : public PerfScriptReader { +public: + HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfScriptReader(Binary, PerfTrace){}; + // Parse the hybrid sample including the call and LBR line + void parseSample(TraceStream &TraceIt, uint64_t Count) override; + void generateUnsymbolizedProfile() override; + +private: + // Unwind the hybrid samples after aggregation + void unwindSamples(); +}; + +/* + Format of unsymbolized profile: + + [frame1 @ frame2 @ ...] # If it's a CS profile + number of entries in RangeCounter + from_1-to_1:count_1 + from_2-to_2:count_2 + ...... + from_n-to_n:count_n + number of entries in BranchCounter + src_1->dst_1:count_1 + src_2->dst_2:count_2 + ...... + src_n->dst_n:count_n + [frame1 @ frame2 @ ...] # Next context + ...... + +Note that non-CS profile doesn't have the empty `[]` context. +*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.cpp new file mode 100644 index 00000000000..1248e37dc50 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.cpp @@ -0,0 +1,979 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include +#include + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding (default)"), + clEnumValN(SPF_Text, "text", "Text encoding"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +cl::opt UseMD5( + "use-md5", cl::init(false), cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. 
-1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt + TrimColdProfile("trim-cold-profile", cl::init(false), cl::ZeroOrMore, + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), cl::ZeroOrMore, + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore, + cl::desc("Keep the last K contexts while merging cold profile. 1 means the " + "context-less base profile")); + +static cl::opt CSProfMaxContextDepth( + "csprof-max-context-depth", cl::ZeroOrMore, + cl::desc("Keep the last K contexts while merging profile. -1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +static cl::opt UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +extern cl::opt ProfileSummaryCutoffHot; + +static cl::opt GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(false), + cl::desc("Generate nested function profiles for CSSPGO")); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + 
+int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap &SampleCounters, + bool ProfileIsCSFlat) { + std::unique_ptr Generator; + if (ProfileIsCSFlat) { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. 
Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "AutoFDO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Minimum profile density for hot functions with top " + << format("%.2f", + static_cast(ProfileSummaryCutoffHot.getValue()) / + 10000) + << "% total samples: " << format("%.1f", Density) << "\n"; +} + +double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold) { + double Density = DBL_MAX; + std::vector HotFuncs; + for (auto &I : Profiles) { + auto &FuncSamples = I.second; + if (FuncSamples.getTotalSamples() < HotCntThreshold) + continue; + HotFuncs.emplace_back(&FuncSamples); + } + + for (auto *FuncSamples : HotFuncs) { + auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); + if (!Func) + continue; + uint64_t FuncSize = Func->getFuncSize(); + if (FuncSize == 0) + continue; + Density = + std::min(Density, static_cast(FuncSamples->getTotalSamples()) / + FuncSize); + } + + return Density == DBL_MAX ? 0.0 : Density; +} + +void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + multiple samples begin/end at this points. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 beings at A, ends at C, its value is 200. + For A, BeginCount is the sum of sample begins at A, which is 300 and no + samples ends at A, so EndCount is 0. 
+ Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount = UINT64_MAX; + // Sum of sample counts ending at this point + uint64_t EndCount = UINT64_MAX; + // Is the begin point of a zero range. + bool IsZeroRangeBegin = false; + // Is the end point of a zero range. + bool IsZeroRangeEnd = false; + + void addBeginCount(uint64_t Count) { + if (BeginCount == UINT64_MAX) + BeginCount = 0; + BeginCount += Count; + } + + void addEndCount(uint64_t Count) { + if (EndCount == UINT64_MAX) + EndCount = 0; + EndCount += Count; + } + }; + + /* + For the above example. With boundary points, follwing logic finds two + disjoint region of + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that both begin and end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. 
+ + Example for zero value range: + + |<--- 100 --->| + |<--- 200 --->| + |<--------------- 0 ----------------->| + A B C D E F + + [A, B-1] : 0 + [B, C] : 100 + [C+1, D-1]: 0 + [D, E] : 200 + [E+1, F] : 0 + */ + std::map Boundaries; + + for (const auto &Item : Ranges) { + assert(Item.first.first <= Item.first.second && + "Invalid instruction range"); + auto &BeginPoint = Boundaries[Item.first.first]; + auto &EndPoint = Boundaries[Item.first.second]; + uint64_t Count = Item.second; + + BeginPoint.addBeginCount(Count); + EndPoint.addEndCount(Count); + if (Count == 0) { + BeginPoint.IsZeroRangeBegin = true; + EndPoint.IsZeroRangeEnd = true; + } + } + + // Use UINT64_MAX to indicate there is no existing range between BeginAddress + // and the next valid address + uint64_t BeginAddress = UINT64_MAX; + int ZeroRangeDepth = 0; + uint64_t Count = 0; + for (const auto &Item : Boundaries) { + uint64_t Address = Item.first; + const BoundaryPoint &Point = Item.second; + if (Point.BeginCount != UINT64_MAX) { + if (BeginAddress != UINT64_MAX) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; + BeginAddress = Address; + ZeroRangeDepth += Point.IsZeroRangeBegin; + } + if (Point.EndCount != UINT64_MAX) { + assert((BeginAddress != UINT64_MAX) && + "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + assert(Count >= Point.EndCount && "Mismatched live ranges"); + Count -= Point.EndCount; + BeginAddress = Address + 1; + ZeroRangeDepth -= Point.IsZeroRangeEnd; + // If the remaining count is zero and it's no longer in a zero range, this + // means we consume all the ranges before, thus mark BeginAddress as + // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: + // [<---- 10 ---->] + // [<---- 20 ---->] + // A B C D + // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't + // have the [B+1, C-1] zero range. 
+ if (Count == 0 && ZeroRangeDepth == 0) + BeginAddress = UINT64_MAX; + } + } +} + +void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, + uint64_t Count) { + // Use the maximum count of samples with same line location + uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); + + // Use duplication factor to compensated for loop unroll/vectorization. + // Note that this is only needed when we're taking MAX of the counts at + // the location instead of SUM. + Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); + + ErrorOr R = + FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); + + uint64_t PreviousCount = R ? R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + if (!UpdateTotalSamples) + return; + + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { + SampleContext Context(FuncName); + auto Ret = ProfileMap.emplace(Context, FunctionSamples()); + if (Ret.second) { + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(Context); + } + return Ret.first->second; +} + +void ProfileGenerator::generateProfile() { + if (Binary->usePseudoProbes()) { + // TODO: Support probe based profile generation + exitWithError("Probe based profile generation not supported for AutoFDO, " + "consider dropping `--ignore-stack-samples` or adding `--use-dwarf-correlation`."); + } else { + generateLineNumBasedProfile(); + } + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(); + trimColdProfiles(ProfileMap, ColdCountThreshold); + calculateAndShowDensity(ProfileMap); 
+} + +void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold) { + if (!TrimColdProfile) + return; + + // Move cold profiles into a tmp container. + std::vector ColdProfiles; + for (const auto &I : ProfileMap) { + if (I.second.getTotalSamples() < ColdCntThreshold) + ColdProfiles.emplace_back(I.first); + } + + // Remove the cold profile from ProfileMap. + for (const auto &I : ColdProfiles) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters.size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters.begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateTotalSamples(); +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].FuncName); + FunctionProfile->addTotalSamples(Count); + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].FuncName); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI 
: Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartOffset = I.first.first; + for (const auto &Range : Binary->getRangesForOffset(StartOffset)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(Offset); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. 
+ FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, Count * Binary->getInstSize(Offset)); + updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), + Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) { + // Get the function range by branch target if it's a call branch. + auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset); + + // We won't accumulate sample count for a range whose start is not the real + // function entry such as outlined function or inner labels. + if (!FRange || !FRange->IsFuncEntry) + return StringRef(); + + return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); +} + +void ProfileGenerator::populateBoundarySamplesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + // Record called target sample and its count. + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(SourceOffset); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + CalleeName, Count); + } + // Add head samples for callee. 
+ FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); + CalleeProfile.addHeadSamples(Count); + } +} + +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForContext( + const SampleContextFrameVector &Context, bool WasLeafInlined) { + auto I = ProfileMap.find(SampleContext(Context)); + if (I == ProfileMap.end()) { + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + SampleContext FContext(NewContext, RawContext); + auto Ret = ProfileMap.emplace(FContext, FunctionSamples()); + if (WasLeafInlined) + FContext.setAttribute(ContextWasInlined); + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(FContext); + return Ret.first->second; + } + return I->second; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCSFlat = true; + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + postProcessProfiles(); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + // Hash map to deduplicate the function range and the item is a pair of + // function start and end offset. + std::unordered_map AggregatedRanges; + // Go through all the ranges in the CS counters, use the start of the range to + // look up the function it belongs and record the function range. + for (const auto &CI : SampleCounters) { + for (const auto &Item : CI.second.RangeCounter) { + // FIXME: Filter the bogus crossing function range. 
+ uint64_t StartOffset = Item.first.first; + // Note that a function can be spilt into multiple ranges, so get all + // ranges of the function. + for (const auto &Range : Binary->getRangesForOffset(StartOffset)) + AggregatedRanges[Range.first] = Range.second; + } + } + + for (const auto &I : AggregatedRanges) { + uint64_t StartOffset = I.first; + uint64_t EndOffset = I.second; + Binary->computeInlinedContextSizeForRange(StartOffset, EndOffset); + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + // Get or create function profile for the range + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(CtxKey->Context, CtxKey->WasLeafInlined); + + // Fill in function body samples + populateBodySamplesForFunction(FunctionProfile, CI.second.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(CtxKey->Context, FunctionProfile, + CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(); + + updateTotalSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. + RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. 
+ if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset); + if (LeafLoc.hasValue()) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + SampleContextFrames ContextId, FunctionSamples &FunctionProfile, + const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset); + if (!LeafLoc.hasValue()) + continue; + FunctionProfile.addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, + Count); + + // Record head sample for called target(callee) + SampleContextFrameVector CalleeCtx(ContextId.begin(), ContextId.end()); + assert(CalleeCtx.back().FuncName == LeafLoc->FuncName && + "Leaf function name doesn't match"); + CalleeCtx.back() = *LeafLoc; + CalleeCtx.emplace_back(CalleeName, LineLocation(0, 0)); + FunctionSamples &CalleeProfile = getFunctionProfileForContext(CalleeCtx); + CalleeProfile.addHeadSamples(Count); + } +} + +static SampleContextFrame 
+getCallerContext(SampleContextFrames CalleeContext, + SampleContextFrameVector &CallerContext) { + assert(CalleeContext.size() > 1 && "Unexpected empty context"); + CalleeContext = CalleeContext.drop_back(); + CallerContext.assign(CalleeContext.begin(), CalleeContext.end()); + SampleContextFrame CallerFrame = CallerContext.back(); + CallerContext.back().Location = LineLocation(0, 0); + return CallerFrame; +} + +void CSProfileGenerator::populateInferredFunctionSamples() { + for (const auto &Item : ProfileMap) { + const auto &CalleeContext = Item.first; + const FunctionSamples &CalleeProfile = Item.second; + + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile.getHeadSamples()) + continue; + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CalleeContext.isBaseContext()) + continue; + + // Infer Caller's frame loc and context ID through string splitting + SampleContextFrameVector CallerContextId; + SampleContextFrame &&CallerLeafFrameLoc = + getCallerContext(CalleeContext.getContextFrames(), CallerContextId); + SampleContextFrames CallerContext(CallerContextId); + + // It's possible that we haven't seen any sample directly in the caller, + // in which case CallerProfile will not exist. But we can't modify + // ProfileMap while iterating it. + // TODO: created function profile for those callers too + if (ProfileMap.find(CallerContext) == ProfileMap.end()) + continue; + FunctionSamples &CallerProfile = ProfileMap[CallerContext]; + + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile.getEntrySamples(); + // If we don't have samples with location, use 1 to indicate live. 
+ if (!EstimatedCallCount && !CalleeProfile.getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples( + CallerLeafFrameLoc.Location.LineOffset, + CallerLeafFrameLoc.Location.Discriminator, + CalleeProfile.getContext().getName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.Location.LineOffset, + CallerLeafFrameLoc.Location.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); + } +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. + if (EnableCSPreInliner) { + CSPreInliner(ProfileMap, *Binary, HotCountThreshold, ColdCountThreshold) + .run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + // Merge function samples of CS profile to calculate profile density. 
+ sampleprof::SampleProfileMap ContextLessProfiles; + for (const auto &I : ProfileMap) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + + calculateAndShowDensity(ContextLessProfiles); + if (GenCSNestedProfile) { + CSProfileConverter CSConverter(ProfileMap); + CSConverter.convertProfiles(); + FunctionSamples::ProfileIsCSFlat = false; + FunctionSamples::ProfileIsCSNested = EnableCSPreInliner; + } +} + +void ProfileGeneratorBase::computeSummaryAndThreshold() { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + auto Summary = Builder.computeSummaryForProfiles(ProfileMap); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +// Helper function to extract context prefix string stack +// Extract context stack for reusing, leaf context stack will +// be added compressed while looking up function profile +static void extractPrefixContextStack( + SampleContextFrameVector &ContextStack, + const SmallVectorImpl &Probes, + ProfiledBinary *Binary) { + for (const auto *P : Probes) { + Binary->getInlineContextForProbe(P, ContextStack, true); + } +} + +void CSProfileGenerator::generateProbeBasedProfile() { + for (const auto &CI : SampleCounters) { + const ProbeBasedCtxKey *CtxKey = + dyn_cast(CI.first.getPtr()); + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, CtxKey->Probes, Binary); + // Fill in function body samples from probes, also infer caller's samples + // from callee's probe + populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack); + // Fill in boundary samples for a call probe + populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack); + } +} + +void CSProfileGenerator::extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter) { + RangeSample Ranges; + 
findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges may introduce zero-filled gaps that + // don't belong to the current context; filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // A disjoint range may fall between two instructions, + // e.g. if Instr1 is at Addr1 and Instr2 at Addr2, a disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such ranges. + if (IP.Address > RangeEnd) + continue; + + do { + const AddressProbesMap &Address2ProbesMap = + Binary->getAddress2ProbesMap(); + auto It = Address2ProbesMap.find(IP.Address); + if (It != Address2ProbesMap.end()) { + for (const auto &Probe : It->second) { + if (!Probe.isBlock()) + continue; + ProbeCounter[&Probe] += Count; + } + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBodySamplesWithProbes( + const RangeSample &RangeCounter, SampleContextFrames ContextStack) { + ProbeCounterMap ProbeCounter; + // Extract the top frame probes by looking up each address among the range in + // the Address2ProbeMap + extractProbesFromRange(RangeCounter, ProbeCounter); + std::unordered_map> + FrameSamples; + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(ContextStack, Probe); + // Record the current frame and FunctionProfile whenever samples are + // collected for non-dangling probes. This is for reporting all of the + // zero count probes of the frame later. 
+ FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); + FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); + FunctionProfile.addTotalSamples(Count); + if (Probe->isEntry()) { + FunctionProfile.addHeadSamples(Count); + // Look up for the caller's function profile + const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); + SampleContextFrames CalleeContextId = + FunctionProfile.getContext().getContextFrames(); + if (InlinerDesc != nullptr && CalleeContextId.size() > 1) { + // Since the context id will be compressed, we have to use callee's + // context id to infer caller's context id to ensure they share the + // same context prefix. + SampleContextFrameVector CallerContextId; + SampleContextFrame &&CallerLeafFrameLoc = + getCallerContext(CalleeContextId, CallerContextId); + uint64_t CallerIndex = CallerLeafFrameLoc.Location.LineOffset; + assert(CallerIndex && + "Inferred caller's location index shouldn't be zero!"); + FunctionSamples &CallerProfile = + getFunctionProfileForContext(CallerContextId); + CallerProfile.setFunctionHash(InlinerDesc->FuncHash); + CallerProfile.addBodySamples(CallerIndex, 0, Count); + CallerProfile.addTotalSamples(Count); + CallerProfile.addCalledTargetSamples( + CallerIndex, 0, FunctionProfile.getContext().getName(), Count); + } + } + } + + // Assign zero count for remaining probes without sample hits to + // differentiate from probes optimized away, of which the counts are unknown + // and will be inferred by the compiler. 
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (auto *Probe : I.first->getProbes()) { + FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, SampleContextFrames ContextStack) { + for (const auto &BI : BranchCounter) { + uint64_t SourceOffset = BI.first.first; + uint64_t TargetOffset = BI.first.second; + uint64_t Count = BI.second; + uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(ContextStack, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName, + Count); + } +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) { + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto 
*FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + FunctionSamples &FunctionProile = + getFunctionProfileForContext(NewContextStack, WasLeafInlined); + FunctionProile.setFunctionHash(FuncDesc->FuncHash); + return FunctionProile; +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.h new file mode 100644 index 00000000000..af349ac9911 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/ProfileGenerator.h @@ -0,0 +1,312 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "CSPreInliner.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ProfileData/SampleProfWriter.h" +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// This base class for profile generation of sample-based PGO. We reuse all +// structures relating to function profiles and profile writers as seen in +// /ProfileData/SampleProf.h. 
+class ProfileGeneratorBase { + +public: + ProfileGeneratorBase(ProfiledBinary *Binary, + const ContextSampleCounterMap &Counters) + : Binary(Binary), SampleCounters(Counters){}; + virtual ~ProfileGeneratorBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, const ContextSampleCounterMap &SampleCounters, + bool ProfileIsCSFlat); + virtual void generateProfile() = 0; + void write(); + + static uint32_t + getDuplicationFactor(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 1 + : llvm::DILocation::getDuplicationFactorFromDiscriminator( + Discriminator); + } + + static uint32_t + getBaseDiscriminator(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? Discriminator + : DILocation::getBaseDiscriminatorFromDiscriminator( + Discriminator, /* IsFSDiscriminator */ false); + } + + static bool UseFSDiscriminator; + +protected: + // Use SampleProfileWriter to serialize profile map + void write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap); + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. 
+ */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + // Helper function for updating body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const SampleContextFrame &LeafLoc, + uint64_t Count); + void updateTotalSamples(); + + StringRef getCalleeNameForOffset(uint64_t TargetOffset); + + void computeSummaryAndThreshold(); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + double calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold); + + void showDensitySuggestion(double Density); + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. + uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + + // Used by SampleProfileWriter + SampleProfileMap ProfileMap; + + ProfiledBinary *Binary = nullptr; + + const ContextSampleCounterMap &SampleCounters; +}; + +class ProfileGenerator : public ProfileGeneratorBase { + +public: + ProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap &Counters) + : ProfileGeneratorBase(Binary, Counters){}; + void generateProfile() override; + +private: + void generateLineNumBasedProfile(); + RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); + FunctionSamples &getTopLevelFunctionProfile(StringRef FuncName); + // Helper function to get the leaf frame's FunctionProfile by traversing the + // inline stack and meanwhile it adds the total samples for each frame's + // function profile. 
+ FunctionSamples & + getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, + uint64_t Count); + void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); + void + populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); + void postProcessProfiles(); + void trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold); +}; + +using ProbeCounterMap = + std::unordered_map; + +class CSProfileGenerator : public ProfileGeneratorBase { +public: + CSProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap &Counters) + : ProfileGeneratorBase(Binary, Counters){}; + + void generateProfile() override; + + // Trim the context stack at a given depth. + template + static void trimContext(SmallVectorImpl &S, int Depth = MaxContextDepth) { + if (Depth < 0 || static_cast(Depth) >= S.size()) + return; + std::copy(S.begin() + S.size() - static_cast(Depth), S.end(), + S.begin()); + S.resize(Depth); + } + + // Remove adjacent repeated context sequences up to a given sequence length, + // -1 means no size limit. Note that repeated sequences are identified based + // on the exact call site, this is finer granularity than function recursion. + template + static void compressRecursionContext(SmallVectorImpl &Context, + int32_t CSize = MaxCompressionSize) { + uint32_t I = 1; + uint32_t HS = static_cast(Context.size() / 2); + uint32_t MaxDedupSize = + CSize == -1 ? HS : std::min(static_cast(CSize), HS); + auto BeginIter = Context.begin(); + // Use an in-place algorithm to save memory copy + // End indicates the end location of current iteration's data + uint32_t End = 0; + // Deduplicate from length 1 to the max possible size of a repeated + // sequence. + while (I <= MaxDedupSize) { + // This is a linear algorithm that deduplicates adjacent repeated + // sequences of size I. 
The deduplication detection runs on a sliding + // window whose size is 2*I and it keeps sliding the window to deduplicate + // the data inside. Once duplication is detected, deduplicate it by + // skipping the right half part of the window, otherwise just copy back + // the new one by appending them at the back of End pointer(for the next + // iteration). + // + // For example: + // Input: [a1, a2, b1, b2] + // (Added index to distinguish the same char, the origin is [a, a, b, + // b], the size of the dedup window is 2(I = 1) at the beginning) + // + // 1) The initial status is a dummy window[null, a1], then just copy the + // right half of the window(End = 0), then slide the window. + // Result: [a1], a2, b1, b2 (End points to the element right before ], + // after ] is the data of the previous iteration) + // + // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of + // the window, i.e. a duplication happens. Only slide the window. + // Result: [a1], a2, b1, b2 + // + // 3) Next window is [a2, b1], copy the right half of the window(b1 is + // new) to the End and slide the window. + // Result: [a1, b1], b1, b2 + // + // 4) Next window is [b1, b2], same as 2), skip b2. + // Result: [a1, b1], b1, b2 + // After resize, it will be [a, b] + + // Use pointers like below to do comparison inside the window + // [a b c a b c] + // | | | | | + // LeftBoundary Left Right Left+I Right+I + // A duplication is found if Left < LeftBoundary. + + int32_t Right = I - 1; + End = I; + int32_t LeftBoundary = 0; + while (Right + I < Context.size()) { + // To avoid scanning a part of a sequence repeatedly, it finds out + // the common suffix of the two halves in the window. The common suffix will + // serve as the common prefix of next possible pair of duplicate + // sequences. The non-common part will be ignored and never scanned + // again. + + // For example. 
+ // Input: [a, b1], c1, b2, c2 + // I = 2 + // + // 1) For the window [a, b1, c1, b2], non-common-suffix for the right + // part is 'c1', copy it and only slide the window 1 step. + // Result: [a, b1, c1], b2, c2 + // + // 2) Next window is [b1, c1, b2, c2], so a duplication happens. + // Result after resize: [a, b, c] + + int32_t Left = Right; + while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { + // Find the longest suffix inside the window. When it stops, Left points + // at the diverging point in the current sequence. + Left--; + } + + bool DuplicationFound = (Left < LeftBoundary); + // Don't need to recheck the data before Right + LeftBoundary = Right + 1; + if (DuplicationFound) { + // Duplication found, skip right half of the window. + Right += I; + } else { + // Copy the non-common-suffix part of the adjacent sequence. + std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, + BeginIter + End); + End += Left + I - Right; + // Only slide the window by the size of non-common-suffix + Right = Left + I; + } + } + // Don't forget the remaining part that's not scanned. + std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); + End += Context.size() - Right - 1; + I++; + Context.resize(End); + MaxDedupSize = std::min(static_cast(End / 2), MaxDedupSize); + } + } + +private: + void generateLineNumBasedProfile(); + // Lookup or create FunctionSamples for the context + FunctionSamples & + getFunctionProfileForContext(const SampleContextFrameVector &Context, + bool WasLeafInlined = false); + // For profiled only functions, on-demand compute their inline context + // function byte size which is used by the pre-inliner. + void computeSizeForProfiledFunctions(); + // Post processing for profiles before writing out, such as merging + // and trimming cold profiles, running preinliner on profiles. 
+ void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + void populateBoundarySamplesForFunction(SampleContextFrames ContextId, + FunctionSamples &FunctionProfile, + const BranchSample &BranchCounters); + void populateInferredFunctionSamples(); + + void generateProbeBasedProfile(); + // Go through each address from range to extract the top frame probe by + // looking up in the Address2ProbeMap + void extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter); + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + SampleContextFrames ContextStack); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + SampleContextFrames ContextStack); + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(SampleContextFrames ContextStack, + const MCDecodedPseudoProbe *LeafProbe); + + // Underlying context table serves for sample profile writer. + std::unordered_set Contexts; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.cpp new file mode 100644 index 00000000000..a773a3c98d4 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.cpp @@ -0,0 +1,790 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "ProfileGenerator.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt ShowDisassemblyOnly("show-disassembly-only", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print disassembled code.")); + +cl::opt ShowSourceLocations("show-source-locations", cl::init(false), + cl::ZeroOrMore, + cl::desc("Print source locations.")); + +static cl::opt + ShowCanonicalFnName("show-canonical-fname", cl::init(false), cl::ZeroOrMore, + cl::desc("Print canonical function name.")); + +static cl::opt ShowPseudoProbe( + "show-pseudo-probe", cl::init(false), cl::ZeroOrMore, + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt UseDwarfCorrelation( + "use-dwarf-correlation", cl::init(false), cl::ZeroOrMore, + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::list DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. 
Only work with show-disassembly-only")); + +extern cl::opt ShowDetailedWarning; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + StringRef CallerName = Callsite.FuncName; + LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const SampleContext &Context) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + SampleContextFrames Frames = Context.getContextFrames(); + int32_t I = Frames.size() - 1; + Optional Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. + + while (CurrNode && I >= 0) { + // Process from leaf function to callers (added to context). + const auto &ChildFrame = Frames[I--]; + PrevNode = CurrNode; + CurrNode = + CurrNode->getChildContext(ChildFrame.Location, ChildFrame.FuncName); + if (CurrNode && CurrNode->getFunctionSize().hasValue()) + Size = CurrNode->getFunctionSize().getValue(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. 
+ if (!Size.hasValue()) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size.hasValue() && CurrNode && + !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize().hasValue()) + Size = CurrNode->getFunctionSize().getValue(); + } + } + + assert(Size.hasValue() && "We should at least find one context size."); + return Size.getValue(); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second.get(), ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, CallerName); + } + // Add 0 size to make known. 
+ SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second.get(), ProbeContext); + } + + ProbeContext.pop_back(); +} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartOffset(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + auto *Obj = dyn_cast(&ExeBinary); + if (!Obj) + exitWithError("not a valid Elf image", Path); + + TheTriple = Obj->makeTriple(); + // Current only support X86 + if (!TheTriple.isX86()) + exitWithError("unsupported target", TheTriple.getTriple()); + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Decode pseudo probe related section + decodePseudoProbe(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. 
+ if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*dyn_cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*dyn_cast(&ExeBinary)); + } + + // Disassemble the text sections. + disassemble(Obj); + + // Track size for optimized inlinees when probe is available + if (UsePseudoProbes && TrackFuncContextSize) + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologOffsets(StartOffset2FuncRangeMap); + ProEpilogTracker.inferEpilogOffsets(RetOffsets); + + warnNoFuncEntry(); + + // TODO: decode other sections. +} + +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) { + uint64_t Offset1 = virtualAddrToOffset(Address1); + uint64_t Offset2 = virtualAddrToOffset(Address2); + const SampleContextFrameVector &Context1 = getFrameLocationStack(Offset1); + const SampleContextFrameVector &Context2 = getFrameLocationStack(Offset2); + if (Context1.size() != Context2.size()) + return false; + if (Context1.empty()) + return false; + // The leaf frame contains a location within the leaf, which + // needs to be excluded as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +SampleContextFrameVector +ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined) { + SampleContextFrameVector ContextVec; + // Process from frame root to leaf + for (auto Address : Stack) { + uint64_t Offset = virtualAddrToOffset(Address); + const SampleContextFrameVector &ExpandedContext = + getFrameLocationStack(Offset); + // An instruction without a valid debug line will be ignored by sample + // processing + if (ExpandedContext.empty()) + return SampleContextFrameVector(); + // Set WasLeafInlined to the size of 
inlined frame count for the last + // address which is leaf + WasLeafInlined = (ExpandedContext.size() > 1); + ContextVec.append(ExpandedContext); + } + + // Replace with decoded base discriminator + for (auto &Frame : ContextVec) { + Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator( + Frame.Location.Discriminator, UseFSDiscriminator); + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + // Compress the context string except for the leaf frame + auto LeafFrame = ContextVec.back(); + LeafFrame.Location = LineLocation(0, 0); + ContextVec.pop_back(); + CSProfileGenerator::compressRecursionContext(ContextVec); + CSProfileGenerator::trimContext(ContextVec); + ContextVec.push_back(LeafFrame); + return ContextVec; +} + +template +void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, StringRef FileName) { + const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName); + // FIXME: This should be the page size of the system running profiling. + // However such info isn't available at post-processing time, assuming + // 4K page now. Note that we don't use EXEC_PAGESIZE from + // because we may build the tools on non-linux. + uint32_t PageSize = 0x1000; + for (const typename ELFT::Phdr &Phdr : PhdrRange) { + if (Phdr.p_type == ELF::PT_LOAD) { + if (!FirstLoadableAddress) + FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U); + if (Phdr.p_flags & ELF::PF_X) { + // Segments will always be loaded at a page boundary. 
+ PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFObjectFileBase *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else + llvm_unreachable("invalid ELF object format"); +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == ".pseudo_probe_desc") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildGUID2FuncDescMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe_desc section"); + } else if (SectionName == ".pseudo_probe") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildAddress2ProbeMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe section"); + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = true; + } + } + + if (ShowPseudoProbe) + ProbeDecoder.printGUID2FuncDescMap(outs()); 
+} + +void ProfiledBinary::setIsFuncEntry(uint64_t Offset, StringRef RangeSymName) { + // Note that the start offset of each ELF section can be a non-function + // symbol, we need to binary search for the start of a real function range. + auto *FuncRange = findFuncRangeForOffset(Offset); + // Skip external function symbol. + if (!FuncRange) + return; + + // Set IsFuncEntry to ture if there is only one range in the function or the + // RangeSymName from ELF is equal to its DWARF-based function name. + if (FuncRange->Func->Ranges.size() == 1 || + (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)) + FuncRange->IsFuncEntry = true; +} + +bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, + const SectionRef &Section) { + std::size_t SE = Symbols.size(); + uint64_t SectionOffset = Section.getAddress() - getPreferredBaseAddress(); + uint64_t SectSize = Section.getSize(); + uint64_t StartOffset = Symbols[SI].Addr - getPreferredBaseAddress(); + uint64_t NextStartOffset = + (SI + 1 < SE) ? Symbols[SI + 1].Addr - getPreferredBaseAddress() + : SectionOffset + SectSize; + setIsFuncEntry(StartOffset, + FunctionSamples::getCanonicalFnName(Symbols[SI].Name)); + + StringRef SymbolName = + ShowCanonicalFnName + ? FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + auto WarnInvalidInsts = [](uint64_t Start, uint64_t End) { + WithColor::warning() << "Invalid instructions at " + << format("%8" PRIx64, Start) << " - " + << format("%8" PRIx64, End) << "\n"; + }; + + uint64_t Offset = StartOffset; + // Size of a consecutive invalid instruction range starting from Offset -1 + // backwards. 
+ uint64_t InvalidInstLength = 0; + while (Offset < NextStartOffset) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. + bool Disassembled = + DisAsm->getInstruction(Inst, Size, Bytes.slice(Offset - SectionOffset), + Offset + getPreferredBaseAddress(), nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), + Offset + getPreferredBaseAddress()); + } + outs() << format("%8" PRIx64 ":", Offset + getPreferredBaseAddress()); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Offset); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + Offset2InstSizeMap[Offset] = Size; + + // Populate address maps. 
+ CodeAddrOffsets.push_back(Offset); + if (MCDesc.isCall()) + CallOffsets.insert(Offset); + else if (MCDesc.isReturn()) + RetOffsets.insert(Offset); + else if (MCDesc.isBranch()) + BranchOffsets.insert(Offset); + + if (InvalidInstLength) { + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Offset += Size; + } + + if (InvalidInstLength) + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + SubtargetFeatures Features = Obj->getFeatures(); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features.getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), 
AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + DisassembleFunctionSet.insert(DisassembleFunctions.begin(), + DisassembleFunctions.end()); + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionOffset = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. 
+ TextSections.insert({SectionOffset, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + // Dissassemble rodata section to check if FS discriminator symbol exists. + checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ELFObjectFileBase *Obj, + std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create(Obj); + if (!DebugContext) + exitWithError("Misssing debug info.", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) { + for (const auto &DieInfo : CompilationUnit->dies()) { + llvm::DWARFDie Die(CompilationUnit.get(), &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = 
Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t FuncStart = Range.LowPC; + uint64_t FuncSize = Range.HighPC - FuncStart; + + if (FuncSize == 0 || FuncStart < getPreferredBaseAddress()) + continue; + + uint64_t StartOffset = FuncStart - getPreferredBaseAddress(); + uint64_t EndOffset = Range.HighPC - getPreferredBaseAddress(); + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. 
+ Func.Ranges.emplace_back(StartOffset, EndOffset); + + auto R = StartOffset2FuncRangeMap.emplace(StartOffset, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartOffset = StartOffset; + FRange.EndOffset = EndOffset; + } else { + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartOffset + getPreferredBaseAddress()) + << " " << R.first->second.getFuncName() << " and " << Name + << "\n"; + } + } + } + } + assert(!StartOffset2FuncRangeMap.empty() && "Misssing debug info."); +} + +void ProfiledBinary::populateSymbolListFromDWARF( + ProfileSymbolList &SymbolList) { + for (auto &I : StartOffset2FuncRangeMap) + SymbolList.add(I.second.getFuncName()); +} + +void ProfiledBinary::setupSymbolizer() { + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = false; + SymbolizerOpts.DefaultArch = TheTriple.getArchName().str(); + SymbolizerOpts.UseSymbolTable = false; + SymbolizerOpts.RelativeAddresses = false; + Symbolizer = std::make_unique(SymbolizerOpts); +} + +SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName, + bool UseProbeDiscriminator) { + assert(this == IP.Binary && + "Binary should only symbolize its own instruction"); + auto Addr = object::SectionedAddress{IP.Offset + getPreferredBaseAddress(), + object::SectionedAddress::UndefSection}; + DIInliningInfo InlineStack = unwrapOrError( + Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), + SymbolizerPath); + + SampleContextFrameVector CallStack; + for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { + const auto &CallerFrame = InlineStack.getFrame(I); + if (CallerFrame.FunctionName == "") + break; + + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = 
FunctionSamples::getCanonicalFnName(FunctionName); + + uint32_t Discriminator = CallerFrame.Discriminator; + uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff; + if (UseProbeDiscriminator) { + LineOffset = + PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator); + Discriminator = 0; + } + + LineLocation Line(LineOffset, Discriminator); + auto It = NameStrings.insert(FunctionName.str()); + CallStack.emplace_back(*It.first, Line); + } + + return CallStack; +} + +void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t StartOffset, + uint64_t EndOffset) { + uint64_t RangeBegin = offsetToVirtualAddr(StartOffset); + uint64_t RangeEnd = offsetToVirtualAddr(EndOffset); + InstructionPointer IP(this, RangeBegin, true); + + if (IP.Address != RangeBegin) + WithColor::warning() << "Invalid start instruction at " + << format("%8" PRIx64, RangeBegin) << "\n"; + + if (IP.Address >= RangeEnd) + return; + + do { + uint64_t Offset = virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &SymbolizedCallStack = + getFrameLocationStack(Offset, UsePseudoProbes); + uint64_t Size = Offset2InstSizeMap[Offset]; + + // Record instruction size for the corresponding context + FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size); + + } while (IP.advance() && IP.Address < RangeEnd); +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeOffsetsSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeOffsetsSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; 
+} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.h new file mode 100644 index 00000000000..d3d1c6f1fd2 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/ProfiledBinary.h @@ -0,0 +1,541 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include +#include +#include +#include 
+#include +#include +#include +#include + +extern cl::opt EnableCSPreInliner; +extern cl::opt UseContextCostForPreInliner; + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; + +struct InstructionPointer { + const ProfiledBinary *Binary; + union { + // Offset of the executable segment of the binary. + uint64_t Offset = 0; + // Also used as address in unwinder + uint64_t Address; + }; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. +enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartOffset; + // EndOffset is an exclusive bound. + uint64_t EndOffset; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start offset is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog offset tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. 
In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog offsets. Used by virtual unwinding. + std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void inferPrologOffsets(std::map &FuncStartOffsetMap) { + for (auto I : FuncStartOffsetMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + if (!IP.advance()) + break; + PrologEpilogSet.insert(IP.Offset); + } + } + + // Take the last two addresses before the return address as epilog + void inferEpilogOffsets(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + if (!IP.backward()) + break; + PrologEpilogSet.insert(IP.Offset); + } + } +}; + +// Track function byte size under different context (outlined version as well as +// various inlined versions). It also provides query support to get function +// size with the best matching context, which is used to help pre-inliner use +// accurate post-optimization size to make decisions. +// TODO: If an inlinee is completely optimized away, ideally we should have zero +// for its context size, currently we would miss such context since it doesn't +// have instructions. To fix this, we need to mark all inlinee with entry probe +// but without instructions as having zero size. +class BinarySizeContextTracker { +public: + // Add instruction with given size to a context + void addInstructionForContext(const SampleContextFrameVector &Context, + uint32_t InstrSize); + + // Get function size with a specific context. When there's no exact match + // for the given context, try to retrieve the size of that function from + // closest matching context. 
+ uint32_t getFuncSizeForContext(const SampleContext &Context); + + // For inlinees that are full optimized away, we can establish zero size using + // their remaining probes. + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); + + void dump() { RootContext.dumpTree(); } + +private: + using ProbeFrameStack = SmallVector>; + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); + + // Root node for context trie tree, note that this is a reverse context trie + // with callee as parent and caller as child. This way we can traverse from + // root to find the best/longest matching context if an exact match does not + // exist. It gives us the best possible estimate for function's post-inline, + // post-optimization byte size. + ContextTrieNode RootContext; +}; + +using OffsetRange = std::pair; + +class ProfiledBinary { + // Absolute path of the executable binary. + std::string Path; + // Path of the debug info binary. + std::string DebugBinaryPath; + // Path of symbolizer path which should be pointed to binary with debug info. + StringRef SymbolizerPath; + // The target triple. + Triple TheTriple; + // The runtime base address that the first executable segment is loaded at. + uint64_t BaseAddress = 0; + // The runtime base address that the first loadable segment is loaded at. + uint64_t FirstLoadableAddress = 0; + // The preferred load address of each executable segment. + std::vector PreferredTextSegmentAddresses; + // The file offset of each executable segment. + std::vector TextSegmentOffsets; + + // Multiple MC component info + std::unique_ptr MRI; + std::unique_ptr AsmInfo; + std::unique_ptr STI; + std::unique_ptr MII; + std::unique_ptr DisAsm; + std::unique_ptr MIA; + std::unique_ptr IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. 
+ std::set> TextSections; + + // A map of mapping function name to BinaryFunction info. + std::unordered_map BinaryFunctions; + + // An ordered map of mapping function's start offset to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map StartOffset2FuncRangeMap; + + // Offset to context location map. Used to expand the context. + std::unordered_map Offset2LocStackMap; + + // Offset to instruction size map. Also used for quick offset lookup. + std::unordered_map Offset2InstSizeMap; + + // An array of offsets of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector CodeAddrOffsets; + // A set of call instruction offsets. Used by virtual unwinding. + std::unordered_set CallOffsets; + // A set of return instruction offsets. Used by virtual unwinding. + std::unordered_set RetOffsets; + // A set of branch instruction offsets. + std::unordered_set BranchOffsets; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. + std::unique_ptr Symbolizer; + + // String table owning function name strings created from the symbolizer. + std::unordered_set NameStrings; + + // A collection of functions to print disassembly for. + StringSet<> DisassembleFunctionSet; + + // Pseudo probe decoder + MCPseudoProbeDecoder ProbeDecoder; + + bool UsePseudoProbes = false; + + bool UseFSDiscriminator = false; + + // Whether we need to symbolize all instructions to get function context size. 
+ bool TrackFuncContextSize = false; + + // Indicate if the base loading address is parsed from the mmap event or uses + // the preferred address + bool IsLoadedByMMap = false; + // Use to avoid redundant warning. + bool MissingMMapWarned = false; + + void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O); + + template + void setPreferredTextSegmentAddresses(const ELFFile &Obj, StringRef FileName); + + void decodePseudoProbe(const ELFObjectFileBase *Obj); + + void + checkUseFSDiscriminator(const ELFObjectFileBase *Obj, + std::map &AllSymbols); + + // Set up disassembler and related components. + void setUpDisassembler(const ELFObjectFileBase *Obj); + void setupSymbolizer(); + + // Load debug info of subprograms from DWARF section. + void loadSymbolsFromDWARF(ObjectFile &Obj); + + // A function may be split into multiple non-continuous address ranges. We use + // this to set whether start offset of a function is the real entry of the + // function and also set false to the non-function label. + void setIsFuncEntry(uint64_t Offset, StringRef RangeSymName); + + // Warn if no entry range exists in the function. + void warnNoFuncEntry(); + + /// Disassemble the text section and build various address maps. + void disassemble(const ELFObjectFileBase *O); + + /// Helper function to disassemble the symbol and extract info for unwinding + bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, const SectionRef &Section); + /// Symbolize a given instruction pointer and return a full call context. + SampleContextFrameVector symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false, + bool UseProbeDiscriminator = false); + /// Decode the interesting parts of the binary and build internal data + /// structures. On high level, the parts of interest are: + /// 1. Text sections, including the main code section and the PLT + /// entries that will be used to handle cross-module call transitions. + /// 2. 
The .debug_line section, used by Dwarf-based profile generation. + /// 3. Pseudo probe related sections, used by probe-based profile + /// generation. + void load(); + +public: + ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), ProEpilogTracker(this), + TrackFuncContextSize(EnableCSPreInliner && + UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? ExeBinPath : DebugBinPath; + setupSymbolizer(); + load(); + } + uint64_t virtualAddrToOffset(uint64_t VirtualAddress) const { + return VirtualAddress - BaseAddress; + } + uint64_t offsetToVirtualAddr(uint64_t Offset) const { + return Offset + BaseAddress; + } + StringRef getPath() const { return Path; } + StringRef getName() const { return llvm::sys::path::filename(Path); } + uint64_t getBaseAddress() const { return BaseAddress; } + void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + + // Return the preferred load address for the first executable segment. + uint64_t getPreferredBaseAddress() const { return PreferredTextSegmentAddresses[0]; } + // Return the preferred load address for the first loadable segment. + uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; } + // Return the file offset for the first executable segment. 
+ uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; } + const std::vector &getPreferredTextSegmentAddresses() const { + return PreferredTextSegmentAddresses; + } + const std::vector &getTextSegmentOffsets() const { + return TextSegmentOffsets; + } + + uint64_t getInstSize(uint64_t Offset) const { + auto I = Offset2InstSizeMap.find(Offset); + if (I == Offset2InstSizeMap.end()) + return 0; + return I->second; + } + + bool offsetIsCode(uint64_t Offset) const { + return Offset2InstSizeMap.find(Offset) != Offset2InstSizeMap.end(); + } + bool addressIsCode(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return offsetIsCode(Offset); + } + bool addressIsCall(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return CallOffsets.count(Offset); + } + bool addressIsReturn(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return RetOffsets.count(Offset); + } + bool addressInPrologEpilog(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return ProEpilogTracker.PrologEpilogSet.count(Offset); + } + + bool offsetIsTransfer(uint64_t Offset) { + return BranchOffsets.count(Offset) || RetOffsets.count(Offset) || + CallOffsets.count(Offset); + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return offsetToVirtualAddr(CodeAddrOffsets[Index]); + } + + size_t getCodeOffsetsSize() const { return CodeAddrOffsets.size(); } + + bool usePseudoProbes() const { return UsePseudoProbes; } + bool useFSDiscriminator() const { return UseFSDiscriminator; } + // Get the index in CodeAddrOffsets for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForOffset(uint64_t Offset) const { + auto Low = llvm::lower_bound(CodeAddrOffsets, Offset); + return Low - CodeAddrOffsets.begin(); + } + uint32_t getIndexForAddr(uint64_t Address) const { + 
uint64_t Offset = virtualAddrToOffset(Address); + return getIndexForOffset(Offset); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; + auto I = getIndexForAddr(FrameAddr); + FrameAddr = I ? getAddressforIndex(I - 1) : 0; + if (FrameAddr && addressIsCall(FrameAddr)) + return FrameAddr; + return 0; + } + + FuncRange *findFuncRangeForStartOffset(uint64_t Offset) { + auto I = StartOffset2FuncRangeMap.find(Offset); + if (I == StartOffset2FuncRangeMap.end()) + return nullptr; + return &I->second; + } + + // Binary search the function range which includes the input offset. + FuncRange *findFuncRangeForOffset(uint64_t Offset) { + auto I = StartOffset2FuncRangeMap.upper_bound(Offset); + if (I == StartOffset2FuncRangeMap.begin()) + return nullptr; + I--; + + if (Offset >= I->second.EndOffset) + return nullptr; + + return &I->second; + } + + // Get all ranges of one function. + RangesTy getRangesForOffset(uint64_t Offset) { + auto *FRange = findFuncRangeForOffset(Offset); + // Ignore the range which falls into plt section or system lib. + if (!FRange) + return RangesTy(); + + return FRange->Func->Ranges; + } + + const std::unordered_map & + getAllBinaryFunctions() { + return BinaryFunctions; + } + + BinaryFunction *getBinaryFunction(StringRef FName) { + auto I = BinaryFunctions.find(FName.str()); + if (I == BinaryFunctions.end()) + return nullptr; + return &I->second; + } + + uint32_t getFuncSizeForContext(SampleContext &Context) { + return FuncSizeTracker.getFuncSizeForContext(Context); + } + + // Load the symbols from debug table and populate into symbol list. 
+ void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList); + + const SampleContextFrameVector & + getFrameLocationStack(uint64_t Offset, bool UseProbeDiscriminator = false) { + auto I = Offset2LocStackMap.emplace(Offset, SampleContextFrameVector()); + if (I.second) { + InstructionPointer IP(this, Offset); + I.first->second = symbolize(IP, true, UseProbeDiscriminator); + } + return I.first->second; + } + + Optional getInlineLeafFrameLoc(uint64_t Offset) { + const auto &Stack = getFrameLocationStack(Offset); + if (Stack.empty()) + return {}; + return Stack.back(); + } + + // Compare two addresses' inline context + bool inlineContextEqual(uint64_t Add1, uint64_t Add2); + + // Get the full context of the current stack with inline context filled in. + // It will search the disassembling info stored in Offset2LocStackMap. This is + // used as the key of function sample map + SampleContextFrameVector + getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined); + // Go through instructions among the given range and record its size for the + // inline context. + void computeInlinedContextSizeForRange(uint64_t StartOffset, + uint64_t EndOffset); + + const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const { + return ProbeDecoder.getCallProbeForAddr(Address); + } + + void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe, + SampleContextFrameVector &InlineContextStack, + bool IncludeLeaf = false) const { + SmallVector ProbeInlineContext; + ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext, + IncludeLeaf); + for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) { + auto &Callsite = ProbeInlineContext[I]; + // Clear the current context for an unknown probe. 
+ if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) { + InlineContextStack.clear(); + continue; + } + InlineContextStack.emplace_back(Callsite.first, + LineLocation(Callsite.second, 0)); + } + } + const AddressProbesMap &getAddress2ProbesMap() const { + return ProbeDecoder.getAddress2ProbesMap(); + } + const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) { + return ProbeDecoder.getFuncDescForGUID(GUID); + } + + const MCPseudoProbeFuncDesc * + getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) { + return ProbeDecoder.getInlinerDescForProbe(Probe); + } + + bool getTrackFuncContextSize() { return TrackFuncContextSize; } + + bool getIsLoadedByMMap() { return IsLoadedByMMap; } + + void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; } + + bool getMissingMMapWarned() { return MissingMMapWarned; } + + void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; } +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-14.0/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-14.0/llvm-profgen.cpp new file mode 100644 index 00000000000..f092df04d52 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-14.0/llvm-profgen.cpp @@ -0,0 +1,164 @@ +//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// llvm-profgen generates SPGO profiles from perf script output. 
+// +//===----------------------------------------------------------------------===// + +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "ProfiledBinary.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" + +static cl::OptionCategory ProfGenCategory("ProfGen Options"); + +static cl::opt PerfScriptFilename( + "perfscript", cl::value_desc("perfscript"), cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated, + cl::desc("Path of perf-script trace created by Linux perf tool with " + "`script` command(the raw perf.data should be profiled with -b)"), + cl::cat(ProfGenCategory)); +static cl::alias PSA("ps", cl::desc("Alias for --perfscript"), + cl::aliasopt(PerfScriptFilename)); + +static cl::opt PerfDataFilename( + "perfdata", cl::value_desc("perfdata"), cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated, + cl::desc("Path of raw perf data created by Linux perf tool (it should be " + "profiled with -b)"), + cl::cat(ProfGenCategory)); +static cl::alias PDA("pd", cl::desc("Alias for --perfdata"), + cl::aliasopt(PerfDataFilename)); + +static cl::opt UnsymbolizedProfFilename( + "unsymbolized-profile", cl::value_desc("unsymbolized profile"), + cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, + cl::desc("Path of the unsymbolized profile created by " + "`llvm-profgen` with `--skip-symbolization`"), + cl::cat(ProfGenCategory)); +static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"), + cl::aliasopt(UnsymbolizedProfFilename)); + +static cl::opt + BinaryPath("binary", cl::value_desc("binary"), cl::Required, + cl::desc("Path of profiled executable binary."), + cl::cat(ProfGenCategory)); + +static cl::opt DebugBinPath( + "debug-binary", cl::value_desc("debug-binary"), cl::ZeroOrMore, + cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info " + "from it instead of the executable binary."), + 
cl::cat(ProfGenCategory)); + +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt SkipSymbolization; + +using namespace llvm; +using namespace sampleprof; + +// Validate the command line input. +static void validateCommandLine() { + // Allow the missing perfscript if we only use to show binary disassembly. + if (!ShowDisassemblyOnly) { + // Validate input profile is provided only once + uint16_t HasPerfData = PerfDataFilename.getNumOccurrences(); + uint16_t HasPerfScript = PerfScriptFilename.getNumOccurrences(); + uint16_t HasUnsymbolizedProfile = + UnsymbolizedProfFilename.getNumOccurrences(); + uint16_t S = HasPerfData + HasPerfScript + HasUnsymbolizedProfile; + if (S != 1) { + std::string Msg = + S > 1 + ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` " + "cannot be used together." + : "Perf input file is missing, please use one of `--perfscript`, " + "`--perfdata` and `--unsymbolized-profile` for the input."; + exitWithError(Msg); + } + + auto CheckFileExists = [](bool H, StringRef File) { + if (H && !llvm::sys::fs::exists(File)) { + std::string Msg = "Input perf file(" + File.str() + ") doesn't exist."; + exitWithError(Msg); + } + }; + + CheckFileExists(HasPerfData, PerfDataFilename); + CheckFileExists(HasPerfScript, PerfScriptFilename); + CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename); + } + + if (!llvm::sys::fs::exists(BinaryPath)) { + std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist."; + exitWithError(Msg); + } + + if (CSProfileGenerator::MaxCompressionSize < -1) { + exitWithError("Value of --compress-recursion should >= -1"); + } + if (ShowSourceLocations && !ShowDisassemblyOnly) { + exitWithError("--show-source-locations should work together with " + "--show-disassembly-only!"); + } +} + +static PerfInputFile getPerfInputFile() { + PerfInputFile File; + if (PerfDataFilename.getNumOccurrences()) { + File.InputFile = PerfDataFilename; + File.Format = PerfFormat::PerfData; 
+ } else if (PerfScriptFilename.getNumOccurrences()) { + File.InputFile = PerfScriptFilename; + File.Format = PerfFormat::PerfScript; + } else if (UnsymbolizedProfFilename.getNumOccurrences()) { + File.InputFile = UnsymbolizedProfFilename; + File.Format = PerfFormat::UnsymbolizedProfile; + } + return File; +} + +int main(int argc, const char *argv[]) { + InitLLVM X(argc, argv); + + // Initialize targets and assembly printers/parsers. + InitializeAllTargetInfos(); + InitializeAllTargetMCs(); + InitializeAllDisassemblers(); + + cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()}); + cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n"); + validateCommandLine(); + + // Load symbols and disassemble the code of a given binary. + std::unique_ptr Binary = + std::make_unique(BinaryPath, DebugBinPath); + if (ShowDisassemblyOnly) + return EXIT_SUCCESS; + + PerfInputFile PerfFile = getPerfInputFile(); + std::unique_ptr Reader = + PerfReaderBase::create(Binary.get(), PerfFile); + // Parse perf events and samples + Reader->parsePerfTraces(); + + if (SkipSymbolization) + return EXIT_SUCCESS; + + std::unique_ptr Generator = + ProfileGeneratorBase::create(Binary.get(), Reader->getSampleCounters(), + Reader->profileIsCSFlat()); + Generator->generateProfile(); + Generator->write(); + + return EXIT_SUCCESS; +} diff --git a/tools/ldc-profgen/ldc-profgen-15.0/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-15.0/CMakeLists.txt new file mode 100644 index 00000000000..b3e05a94856 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/CMakeLists.txt @@ -0,0 +1,23 @@ + +set(LLVM_LINK_COMPONENTS + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + DebugInfoDWARF + Core + MC + IPO + MCDisassembler + Object + ProfileData + Support + Symbolize + ) + +add_llvm_tool(llvm-profgen + llvm-profgen.cpp + PerfReader.cpp + CSPreInliner.cpp + ProfiledBinary.cpp + ProfileGenerator.cpp + ) diff --git a/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.cpp 
b/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.cpp new file mode 100644 index 00000000000..dbc5bc7327d --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.cpp @@ -0,0 +1,303 @@ +//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include +#include + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. 
+extern cl::opt SampleHotCallSiteThreshold; +extern cl::opt SampleColdCallSiteThreshold; +extern cl::opt ProfileInlineGrowthLimit; +extern cl::opt ProfileInlineLimitMin; +extern cl::opt ProfileInlineLimitMax; +extern cl::opt SortProfiledSCC; + +cl::opt EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); + +static cl::opt SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, + ProfiledBinary &Binary, ProfileSummary *Summary) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Tracker), Binary(Binary), Summary(Summary) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. + if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; + if (!ProfileInlineLimitMax.getNumOccurrences()) + ProfileInlineLimitMax = 3000; +} + +std::vector CSPreInliner::buildTopDownOrder() { + std::vector Order; + ProfiledCallGraph ProfiledCG(ContextTracker); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. 
+ scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie are relevant. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextNodeForProfile(CallerSamples); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that.
+ HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(CalleeNode); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); + + return ContextNode->getFunctionSamples()->getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return Candidate.CalleeSamples->getContext().hasAttribute( + ContextWasInlined); + + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + + if (Candidate.CallsiteCount <= ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + else { + // Linearly adjust threshold based on normalized hotness, i.e., a value in + // [0,1]. Use 10% cutoff instead of the max count as the normalization + // upperbound for stability. + double NormalizationUpperBound = + ProfileSummaryBuilder::getEntryForPercentile( + Summary->getDetailedSummary(), 100000 /* 10% */) + .MinCount; + double NormalizationLowerBound = ColdCountThreshold; + double NormalizedHotness = + (Candidate.CallsiteCount - NormalizationLowerBound) / + (NormalizationUpperBound - NormalizationLowerBound); + if (NormalizedHotness > 1.0) + NormalizedHotness = 1.0; + // Add 1 to ensure hot callsites get a non-zero threshold, which could + // happen when SampleColdCallSiteThreshold is 0. This is when we do not + // want any inlining for cold callsites.
+ SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + + SampleColdCallSiteThreshold + 1; + } + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const StringRef Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG( + dbgs() << (ShouldInline ? 
" Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? "inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } + } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; + }; +#endif + + LLVM_DEBUG(printProfileNames(ContextTracker, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. 
This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. + // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (StringRef FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); + } + } + FunctionSamples::ProfileIsPreInlined = true; + + LLVM_DEBUG(printProfileNames(ContextTracker, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.h new file mode 100644 index 00000000000..09dd2dec114 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/CSPreInliner.h @@ -0,0 +1,90 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
+ uint64_t SizeCost; +}; + +// Inline candidate comparer using call site weight +struct ProfiledCandidateComparer { + bool operator()(const ProfiledInlineCandidate &LHS, + const ProfiledInlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + if (LHS.SizeCost != RHS.SizeCost) + return LHS.SizeCost > RHS.SizeCost; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using ProfiledCandidateQueue = + PriorityQueue, + ProfiledCandidateComparer>; + +// Pre-compilation inliner based on context-sensitive profile. +// The PreInliner estimates inline decision using hotness from profile +// and cost estimation from machine code size. It helps merge context +// profile globally and achieves better post-inline profile quality, which +// otherwise won't be possible for ThinLTO. It also reduces context profile +// size by only keeping context that is estimated to be inlined.
+class CSPreInliner { +public: + CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary, + ProfileSummary *Summary); + void run(); + +private: + bool getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *FCallerContextSamples); + std::vector buildTopDownOrder(); + void processFunction(StringRef Name); + bool shouldInline(ProfiledInlineCandidate &Candidate); + uint32_t getFuncSize(const ContextTrieNode *ContextNode); + bool UseContextCost; + SampleContextTracker &ContextTracker; + ProfiledBinary &Binary; + ProfileSummary *Summary; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-15.0/CallContext.h b/tools/ldc-profgen/ldc-profgen-15.0/CallContext.h new file mode 100644 index 00000000000..5e552130d03 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/CallContext.h @@ -0,0 +1,59 @@ +//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H +#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H + +#include "llvm/ProfileData/SampleProf.h" +#include +#include +#include + +namespace llvm { +namespace sampleprof { + +inline std::string getCallSite(const SampleContextFrame &Callsite) { + std::string CallsiteStr = Callsite.FuncName.str(); + CallsiteStr += ":"; + CallsiteStr += Twine(Callsite.Location.LineOffset).str(); + if (Callsite.Location.Discriminator > 0) { + CallsiteStr += "."; + CallsiteStr += Twine(Callsite.Location.Discriminator).str(); + } + return CallsiteStr; +} + +// TODO: This operation is expensive.
If it ever gets called multiple times we +// may think of making a class wrapper with internal states for it. +inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. +inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-15.0/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-15.0/ErrorHandling.h new file mode 100644 index 00000000000..b797add8a89 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.cpp new file mode 100644 index 00000000000..f28a852fad6 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.cpp @@ -0,0 +1,1196 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt SkipSymbolization("skip-symbolization", + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt ShowMmapEvents("show-mmap-events", + cl::desc("Print binary load events.")); + +static cl::opt + UseOffset("use-offset", cl::init(true), + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. By default " + "first executable segment address is used")); + +static cl::opt + IgnoreStackSamples("ignore-stack-samples", + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt ShowDetailedWarning("show-detailed-warning", + cl::desc("Show detailed warning message.")); + +extern cl::opt PerfTraceFilename; +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. 
+ // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. + if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (Source == ExternalAddr) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + + if (End == ExternalAddr && Target == ExternalAddr) { + // Filter out the case when leaf external frame matches the external LBR + // target, this is a valid state, it happens that the code run into external + // address then return back. The call frame under the external frame + // remains valid and can be unwound later, just skip recording this range. + NumPairedExtAddr++; + return; + } + + if (End == ExternalAddr || Target == ExternalAddr) { + // Range is invalid if only one point is external address. This means LBR + // traces contains a standalone external address failing to pair another + // one, likely due to interrupt jmp or broken perf script. Set the + // state to invalid. + NumUnpairedExtAddr++; + State.setInvalid(); + return; + } + + if (!isValidFallThroughRange(Binary->virtualAddrToOffset(Target), + Binary->virtualAddrToOffset(End), Binary)) { + // Skip unwinding the rest of LBR trace when a bogus range is seen. + State.setInvalid(); + return; + } + + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. 
+ // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + return KeyStr; +} + +std::shared_ptr AddressStack::getContextKey() { + std::shared_ptr KeyStr = std::make_shared(); + KeyStr->Context = Stack; + CSProfileGenerator::compressRecursionContext(KeyStr->Context); + CSProfileGenerator::trimContext(KeyStr->Context); + return KeyStr; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &Item : Cur->RangeSamples) { + uint64_t StartOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t EndOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordRangeCount(StartOffset, EndOffset, std::get<2>(Item)); + } + + for (auto &Item : Cur->BranchSamples) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(std::get<0>(Item)); + uint64_t TargetOffset = Binary->virtualAddrToOffset(std::get<1>(Item)); + SCounter.recordBranchCount(SourceOffset, TargetOffset, std::get<2>(Item)); + } +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. 
+ if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. + if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + AddressStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.Target == ExternalAddr) + return; + + // Record external-to-internal pattern on the trie root, it later can be + // used for generating head samples. + if (Branch.Source == ExternalAddr) { + State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + return; + } + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. 
+ UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + NumTotalBranches += State.LBRStack.size(); + // Now process the LBR samples in parrallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. + while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return + // instruction + unwindReturn(State); + } else if (isValidState(State)) { + // Unwind branches + unwindBranch(State); + } else { + // Skip unwinding the rest of LBR trace. Reset the stack and update the + // state so that the rest of the trace can still be processed as if they + // do not have stack samples. + State.clearCallStack(); + State.InstPtr.update(State.getCurrentLBRSource()); + State.pushFrame(State.InstPtr.Address); + } + + State.advanceLBR(); + // Record `branch` with calling context after unwinding. 
+ recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + Optional PIDFilter) { + std::unique_ptr PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. + if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = + PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput, PIDFilter); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset( + new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile PerfScriptReader::convertPerfDataToTrace( + ProfiledBinary *Binary, PerfInputFile &File, Optional PIDFilter) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. 
+ auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + std::string PerfTraceFile = PerfData.str() + ".script.tmp"; + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + Optional Redirects[] = {llvm::None, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(PerfTraceFile)}; // Stderr + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, llvm::None, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::string PIDs; + std::unordered_set PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMap2Event(TraceIt.getCurrentLine()) && + extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second && (!PIDFilter || MMap.PID == *PIDFilter)) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + + // Run perf script again to retrieve events for PIDs collected above + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "ip,brstack", "--pid", + PIDs, "-i", PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, llvm::None, Redirects); + + return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent}; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath); + if (Binary->getName() != BinaryName) + return; + + // Drop the event if process does not match pid filter + if (PIDFilter && Event.PID != *PIDFilter) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + 
Binary->setIsLoadedByMMap(true); + return; + } + + if (Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. + // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = std::lower_bound(Offsets.begin(), Offsets.end(), Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. + auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. 
+ --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast(K)) { + std::ostringstream OContextStr; + for (uint32_t I = 0; I < CtxKey->Context.size(); I++) { + if (OContextStr.str().size()) + OContextStr << " @ "; + OContextStr << "0x" + << utohexstr( + Binary->virtualAddrToOffset(CtxKey->Context[I]), + /*LowerCase=*/true); + } + return OContextStr.str(); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. 
+ if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary(Unwinder.NumPairedExtAddr * 2, Unwinder.NumTotalBranches, + "of branches containing paired external address."); + + emitWarningSummary(Unwinder.NumUnpairedExtAddr, Unwinder.NumTotalBranches, + "of branches containing external address but doesn't have " + "another external address to pair, likely due to " + "interrupt jmp or broken perf script."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and seperated by whitespace. 
+ SmallVector Records; + TraceIt.getCurrentLine().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + uint64_t LeadingAddr; + if (!Records.empty() && !Records[0].contains('/')) { + if (Records[0].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index = 1; + } + + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + if (!SrcIsInternal) + Src = ExternalAddr; + if (!DstIsInternal) + Dst = ExternalAddr; + // Filter external-to-external case to reduce LBR trace size. + if (!SrcIsInternal && !DstIsInternal) + continue; + + LBRStack.emplace_back(LBREntry(Src, Dst)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. 
+ + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. + if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. 
We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. + Binary->setMissingMMapWarned(true); + } +} + +void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + // The raw hybird sample started with call stack in FILO order and followed + // intermediately by LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + std::shared_ptr Sample = std::make_shared(); +#ifndef NDEBUG + Sample->Linenum = TraceIt.getLineNumber(); +#endif + // Parsing call stack and populate into PerfSample.CallStack + if (!extractCallstack(TraceIt, Sample->CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + + warnIfMissingMMap(); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + if (IgnoreStackSamples) { + Sample->CallStack.clear(); + } else { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample->CallStack.front() = Sample->LBRStack[0].Target; + } + // Record samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + +void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) { + std::error_code EC; + raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF); + if (EC) + exitWithError(EC, Filename); + writeUnsymbolizedProfile(OS); +} + +// Use ordered map to make the output deterministic +using OrderedCounterForPrint = std::map; + +void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) { + OrderedCounterForPrint OrderedCounters; + for (auto &CI : SampleCounters) { + OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second; + } + + auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator, + uint32_t Indent) { + OS.indent(Indent); + OS << Counter.size() << "\n"; + for (auto &I : Counter) { + uint64_t Start = I.first.first; + uint64_t End = I.first.second; + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + Start = 
Binary->offsetToVirtualAddr(Start); + End = Binary->offsetToVirtualAddr(End); + } + + if (UseOffset && UseLoadableSegmentAsBase) { + Start -= Binary->getFirstLoadableAddress(); + End -= Binary->getFirstLoadableAddress(); + } + + OS.indent(Indent); + OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":" + << I.second << "\n"; + } + }; + + for (auto &CI : OrderedCounters) { + uint32_t Indent = 0; + if (ProfileIsCS) { + // Context string key + OS << "[" << CI.first << "]\n"; + Indent = 2; + } + + SampleCounter &Counter = *CI.second; + SCounterPrinter(Counter.RangeCounter, "-", Indent); + SCounterPrinter(Counter.BranchCounter, "->", Indent); + } +} + +// Format of input: +// number of entries in RangeCounter +// from_1-to_1:count_1 +// from_2-to_2:count_2 +// ...... +// from_n-to_n:count_n +// number of entries in BranchCounter +// src_1->dst_1:count_1 +// src_2->dst_2:count_2 +// ...... +// src_n->dst_n:count_n +void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt, + SampleCounter &SCounters) { + auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) { + std::string Msg = TraceIt.isAtEoF() + ? "Invalid raw profile!" 
+ : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (!UseOffset || (UseOffset && UseLoadableSegmentAsBase)) { + uint64_t BaseAddr = 0; + if (UseOffset && UseLoadableSegmentAsBase) + BaseAddr = Binary->getFirstLoadableAddress(); + + Source = Binary->virtualAddrToOffset(Source + BaseAddr); + Target = Binary->virtualAddrToOffset(Target + BaseAddr); + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr Key = + std::make_shared(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. 
+ if (Line.startswith("[")) { + ProfileIsCS = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); + uint64_t TargetOffset = Binary->virtualAddrToOffset(LBR.Target); + + // Record the branch if its sourceOffset is external. It can be the case an + // external source call an internal function, later this branch will be used + // to generate the function's head sample. + if (Binary->offsetIsCode(TargetOffset)) { + Counter.recordBranchCount(SourceOffset, TargetOffset, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. 
+ uint64_t StartOffset = TargetOffset; + if (Binary->offsetIsCode(StartOffset) && Binary->offsetIsCode(EndOffeset) && + isValidFallThroughRange(StartOffset, EndOffeset, Binary)) + Counter.recordRangeCount(StartOffset, EndOffeset, Repeat); + EndOffeset = SourceOffset; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. + assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const Pattern = + 
"PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + Regex RegMmap2(Pattern); + SmallVector Fields; + bool R = RegMmap2.match(Line, &Fields); + if (!R) { + std::string ErrorMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + exitWithError(ErrorMsg); + } + Fields[PID].getAsInteger(10, MMap.PID); + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath); + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMap2Event(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMap2Event(TraceIt.getCurrentLine())) + parseMMap2Event(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. 
+bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].startswith("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMap2Event(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 does not appear at the beginning of the line + // for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP2"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +// Determine the perfscript contains hybrid samples(call stack + LBRs) by +// checking whether there is a non-empty call stack immediately followed by +// a LBR sample +PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) { + TraceStream TraceIt(FileName); + uint64_t FrameAddr = 0; + while (!TraceIt.isAtEoF()) { + // Skip the aggregated count + if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr)) + TraceIt.advance(); + + // Detect sample with call stack + int32_t Count = 0; + while (!TraceIt.isAtEoF() && + !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) { + Count++; + TraceIt.advance(); + } + if (!TraceIt.isAtEoF()) { + if (isLBRSample(TraceIt.getCurrentLine())) { + if (Count > 0) + return PerfContent::LBRStack; + else + return PerfContent::LBR; + } + TraceIt.advance(); + } + } + + exitWithError("Invalid perf script input!"); + return PerfContent::UnknownContent; +} + +void HybridPerfReader::generateUnsymbolizedProfile() { + ProfileIsCS = !IgnoreStackSamples; + if (ProfileIsCS) + unwindSamples(); + else + 
PerfScriptReader::generateUnsymbolizedProfile(); +} + +void PerfScriptReader::warnTruncatedStack() { + if (ShowDetailedWarning) { + for (auto Address : InvalidReturnAddresses) { + WithColor::warning() + << "Truncated stack sample due to invalid return address at " + << format("0x%" PRIx64, Address) + << ", likely caused by frame pointer omission\n"; + } + } + emitWarningSummary( + InvalidReturnAddresses.size(), AggregatedSamples.size(), + "of truncated stack samples due to invalid return address, " + "likely caused by frame pointer omission."); +} + +void PerfScriptReader::warnInvalidRange() { + std::unordered_map, uint64_t, + pair_hash> + Ranges; + + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + uint64_t Count = Item.second; + uint64_t EndOffeset = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceOffset = Binary->virtualAddrToOffset(LBR.Source); + uint64_t StartOffset = Binary->virtualAddrToOffset(LBR.Target); + if (EndOffeset != 0) + Ranges[{StartOffset, EndOffeset}] += Count; + EndOffeset = SourceOffset; + } + } + + if (Ranges.empty()) { + WithColor::warning() << "No samples in perf script!\n"; + return; + } + + auto WarnInvalidRange = + [&](uint64_t StartOffset, uint64_t EndOffset, StringRef Msg) { + if (!ShowDetailedWarning) + return; + WithColor::warning() + << "[" + << format("%8" PRIx64, Binary->offsetToVirtualAddr(StartOffset)) + << "," + << format("%8" PRIx64, Binary->offsetToVirtualAddr(EndOffset)) + << "]: " << Msg << "\n"; + }; + + const char *EndNotBoundaryMsg = "Range is not on instruction boundary, " + "likely due to profile and binary mismatch."; + const char *DanglingRangeMsg = "Range does not belong to any functions, " + "likely from PLT, .init or .fini section."; + const char *RangeCrossFuncMsg = + "Fall through range should not cross function boundaries, likely due to " + "profile and binary mismatch."; + const char *BogusRangeMsg = "Range start is after or too far from 
range end."; + + uint64_t TotalRangeNum = 0; + uint64_t InstNotBoundary = 0; + uint64_t UnmatchedRange = 0; + uint64_t RangeCrossFunc = 0; + uint64_t BogusRange = 0; + + for (auto &I : Ranges) { + uint64_t StartOffset = I.first.first; + uint64_t EndOffset = I.first.second; + TotalRangeNum += I.second; + + if (!Binary->offsetIsCode(StartOffset) || + !Binary->offsetIsTransfer(EndOffset)) { + InstNotBoundary += I.second; + WarnInvalidRange(StartOffset, EndOffset, EndNotBoundaryMsg); + } + + auto *FRange = Binary->findFuncRangeForOffset(StartOffset); + if (!FRange) { + UnmatchedRange += I.second; + WarnInvalidRange(StartOffset, EndOffset, DanglingRangeMsg); + continue; + } + + if (EndOffset >= FRange->EndOffset) { + RangeCrossFunc += I.second; + WarnInvalidRange(StartOffset, EndOffset, RangeCrossFuncMsg); + } + + if (!isValidFallThroughRange(StartOffset, EndOffset, Binary)) { + BogusRange += I.second; + WarnInvalidRange(StartOffset, EndOffset, BogusRangeMsg); + } + } + + emitWarningSummary( + InstNotBoundary, TotalRangeNum, + "of samples are from ranges that are not on instruction boundary."); + emitWarningSummary( + UnmatchedRange, TotalRangeNum, + "of samples are from ranges that do not belong to any functions."); + emitWarningSummary( + RangeCrossFunc, TotalRangeNum, + "of samples are from ranges that do cross function boundaries."); + emitWarningSummary( + BogusRange, TotalRangeNum, + "of samples are from ranges that have range start after or too far from " + "range end acrossing the unconditinal jmp."); +} + +void PerfScriptReader::parsePerfTraces() { + // Parse perf traces and do aggregation. + parseAndAggregateTrace(); + + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + + // Generate unsymbolized profile. 
+ warnTruncatedStack(); + warnInvalidRange(); + generateUnsymbolizedProfile(); + AggregatedSamples.clear(); + + if (SkipSymbolization) + writeUnsymbolizedProfile(OutputFilename); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.h b/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.h new file mode 100644 index 00000000000..3ffed991873 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/PerfReader.h @@ -0,0 +1,742 @@ +//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. 
+enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. +struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. 
In the future, if we redesign a better hash +// function, then we can just skip this or switch to non-virtual function(like +// just ignore comparison if the hash conflict probability is low) +template class Hashable { +public: + std::shared_ptr Data; + Hashable(const std::shared_ptr &D) : Data(D) {} + + // Hash code generation + struct Hash { + uint64_t operator()(const Hashable &Key) const { + // Don't make it virtual for getHashCode + uint64_t Hash = Key.Data->getHashCode(); + assert(Hash && "Should generate HashCode for it!"); + return Hash; + } + }; + + // Hash equal + struct Equal { + bool operator()(const Hashable &LHS, const Hashable &RHS) const { + // Precisely compare the data, vtable will have overhead. + return LHS.Data->isEqual(RHS.Data.get()); + } + }; + + T *getPtr() const { return Data.get(); } +}; + +struct PerfSample { + // LBR stack recorded in FIFO order. + SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return
false; + } + return true; + } + +#ifndef NDEBUG + uint64_t Linenum = 0; + + void print() const { + dbgs() << "Line " << Linenum << "\n"; + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +inline bool isValidFallThroughRange(uint64_t Start, uint64_t End, + ProfiledBinary *Binary) { + // Start bigger than End is considered invalid. + // LBR ranges cross the unconditional jmp are also assumed invalid. + // It's found that perf data may contain duplicate LBR entries that could form + // a range that does not reflect real execution flow on some Intel targets, + // e.g. Skylake. Such ranges are ususally very long. Exclude them since there + // cannot be a linear execution range that spans over unconditional jmp. 
+ return Start <= End && !Binary->rangeCrossUncondBranch(Start, End); +} + +// The state for the unwinder. It doesn't hold the data but only keeps the +// pointer/index of the data. While unwinding, the CallStack is changed +// dynamically and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the address range + InstructionPointer InstPtr; + // Indicate whether unwinding is currently in a bad state which requires to + // skip all subsequent unwinding.
+ bool Invalid = false; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + void setInvalid() { Invalid = true; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + size_t getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame = CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack()
{ CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_AddrBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Other && Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Address-based context id +struct AddrBasedCtxKey : public ContextKey { + SmallVector Context; + + bool WasLeafInlined; + AddrBasedCtxKey() : ContextKey(CK_AddrBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_AddrBased; + } + + bool isEqual(const ContextKey *K) const override { + const AddrBasedCtxKey *Other = dyn_cast(K); + return Other && Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_combine_range(Context.begin(), Context.end()); + } +}; + +// The counter of branch samples
for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. +using RangeSample = std::map, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map, SampleCounter, + Hashable::Hash, Hashable::Equal>; + +struct FrameStack { + SmallVector Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct AddressStack { + SmallVector Stack; + ProfiledBinary *Binary; + AddressStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +/* +As in hybrid sample we have a group of LBRs and the most recent sampling call +stack, we can walk through those LBRs to infer more call stacks which would be +used as context for profile. 
VirtualUnwinder is the class to do the call stack +unwinding based on LBR state. Two types of unwinding are processed here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry(can be classified into call, return, regular +branch), LBR unwinding will replay the operation by pushing, popping or +switching leaf frame towards the call stack and since the initial call stack +is most recently sampled, the replay should be in anti-execution order, i.e. for +the regular case, pop the call stack when LBR is call, push frame on call stack +when LBR is return. After each LBR is processed, it also needs to align with the +next LBR by going through instructions from previous LBR's target to current +LBR's source, which is the linear unwinding. As instructions from a linear range +can come from different functions by inlining, linear unwinding will do the range +splitting and record counters by the range with same inline context. Over those +unwinding processes we will record each call stack as context id and LBR/linear +range as sample counter for further CS profile generation.
+*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const PerfSample *Sample, uint64_t Repeat); + std::set &getUntrackedCallsites() { return UntrackedCallsites; } + + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + uint64_t NumUnpairedExtAddr = 0; + uint64_t NumPairedExtAddr = 0; + +private: + bool isSourceExternal(UnwindState &State) const { + return State.getCurrentLBRSource() == ExternalAddr; + } + + bool isTargetExternal(UnwindState &State) const { + return State.getCurrentLBRTarget() == ExternalAddr; + } + + // Determine whether the return source is from external code by checking if + // the target's the next inst is a call inst. + bool isReturnFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + (Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) != 0); + } + + // If the source is external address but it's not the `return` case, treat it + // as a call from external. + bool isCallFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) == 0; + } + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in stack sample, we will + // use a specific tail call tracker to infer it. + if (!isValidState(State)) + return false; + + if (Binary->addressIsCall(State.getCurrentLBRSource())) + return true; + + return isCallFromExternal(State); + } + + bool isReturnState(UnwindState &State) const { + if (!isValidState(State)) + return false; + + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call. 
+ if (Binary->addressIsReturn(State.getCurrentLBRSource())) + return true; + + return isReturnFromExternal(State); + } + + bool isValidState(UnwindState &State) const { return !State.Invalid; } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. 
+ Binary->setBaseAddress(Binary->getPreferredBaseAddress()); + }; + virtual ~PerfReaderBase() = default; + static std::unique_ptr create(ProfiledBinary *Binary, + PerfInputFile &PerfInput, + Optional PIDFilter); + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() = 0; + const ContextSampleCounterMap &getSampleCounters() const { + return SampleCounters; + } + bool profileIsCS() { return ProfileIsCS; } + +protected: + ProfiledBinary *Binary = nullptr; + StringRef PerfTraceFile; + + ContextSampleCounterMap SampleCounters; + bool ProfileIsCS = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; +}; + +// Read perf script to parse the events and samples. +class PerfScriptReader : public PerfReaderBase { +public: + PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace, + Optional PID) + : PerfReaderBase(B, PerfTrace), PIDFilter(PID){}; + + // Entry of the reader to parse multiple perf traces + void parsePerfTraces() override; + // Generate perf script from perf data + static PerfInputFile convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File, + Optional PIDFilter); + // Extract perf script type by peeking at the input + static PerfContent checkPerfScriptType(StringRef FileName); + +protected: + // The parsed MMap event + struct MMapEvent { + uint64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef BinaryPath; + }; + + // Check whether a given line is LBR sample + static bool isLBRSample(StringRef Line); + // Check whether a given line is MMAP event + static bool isMMap2Event(StringRef Line); + // Parse a single line of a PERF_RECORD_MMAP2 event looking for a + // mapping between the binary name and its memory layout.
+ static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Update base address based on mmap events + void updateBinaryAddress(const MMapEvent &Event); + // Parse mmap event and update binary address + void parseMMap2Event(TraceStream &TraceIt); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Warn if the relevant mmap event is missing. + void warnIfMissingMMap(); + // Emit accumulate warnings. + void warnTruncatedStack(); + // Warn if range is invalid. + void warnInvalidRange(); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack); + uint64_t parseAggregatedCount(TraceStream &TraceIt); + // Parse one sample from multiple perf lines, override this for different + // sample type + void parseSample(TraceStream &TraceIt); + // An aggregated count is given to indicate how many times the sample is + // repeated. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; + void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + // Post process the profile after trace aggregation, we will do simple range + // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). + virtual void generateUnsymbolizedProfile(); + void writeUnsymbolizedProfile(StringRef Filename); + void writeUnsymbolizedProfile(raw_fd_ostream &OS); + + // Samples with the repeating time generated by the perf reader + AggregatedCounter AggregatedSamples; + // Keep track of all invalid return addresses + std::set InvalidReturnAddresses; + // PID for the process of interest + Optional PIDFilter; +}; + +/* + The reader of LBR only perf script. 
+ A typical LBR sample is like: + 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 +*/ +class LBRPerfReader : public PerfScriptReader { +public: + LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + Optional PID) + : PerfScriptReader(Binary, PerfTrace, PID){}; + // Parse the LBR only sample. + void parseSample(TraceStream &TraceIt, uint64_t Count) override; +}; + +/* + Hybrid perf script includes a group of hybrid samples(LBRs + call stack), + which is used to generate CS profile. An example of hybrid sample: + 4005dc # call stack leaf + 400634 + 400684 # call stack root + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +*/ +class HybridPerfReader : public PerfScriptReader { +public: + HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + Optional PID) + : PerfScriptReader(Binary, PerfTrace, PID){}; + // Parse the hybrid sample including the call and LBR line + void parseSample(TraceStream &TraceIt, uint64_t Count) override; + void generateUnsymbolizedProfile() override; + +private: + // Unwind the hybrid samples after aggregation + void unwindSamples(); +}; + +/* + Format of unsymbolized profile: + + [frame1 @ frame2 @ ...] # If it's a CS profile + number of entries in RangeCounter + from_1-to_1:count_1 + from_2-to_2:count_2 + ...... + from_n-to_n:count_n + number of entries in BranchCounter + src_1->dst_1:count_1 + src_2->dst_2:count_2 + ...... + src_n->dst_n:count_n + [frame1 @ frame2 @ ...] # Next context + ...... + +Note that non-CS profile doesn't have the empty `[]` context.
+*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.cpp new file mode 100644 index 00000000000..6acbb140103 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.cpp @@ -0,0 +1,1243 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include +#include +#include +#include + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding (default)"), + clEnumValN(SPF_Text, "text", "Text encoding"), + clEnumValN(SPF_GCC, "gcc", +
"GCC encoding (only meaningful for -sample)"))); + +cl::opt UseMD5( + "use-md5", cl::init(false), cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. -1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt + TrimColdProfile("trim-cold-profile", + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), + cl::desc("Keep the last K contexts while merging cold profile. 1 means the " + "context-less base profile")); + +static cl::opt CSProfMaxContextDepth( + "csprof-max-context-depth", + cl::desc("Keep the last K contexts while merging profile. 
-1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +static cl::opt UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +extern cl::opt ProfileSummaryCutoffHot; +extern cl::opt UseContextLessSummary; + +static cl::opt GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(true), + cl::desc("Generate nested function profiles for CSSPGO")); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + +int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap *SampleCounters, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { 
+ if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, Profiles)); + } else { + Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "AutoFDO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. 
Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Minimum profile density for hot functions with top " + << format("%.2f", + static_cast(ProfileSummaryCutoffHot.getValue()) / + 10000) + << "% total samples: " << format("%.1f", Density) << "\n"; +} + +double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold) { + double Density = DBL_MAX; + std::vector HotFuncs; + for (auto &I : Profiles) { + auto &FuncSamples = I.second; + if (FuncSamples.getTotalSamples() < HotCntThreshold) + continue; + HotFuncs.emplace_back(&FuncSamples); + } + + for (auto *FuncSamples : HotFuncs) { + auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); + if (!Func) + continue; + uint64_t FuncSize = Func->getFuncSize(); + if (FuncSize == 0) + continue; + Density = + std::min(Density, static_cast(FuncSamples->getTotalSamples()) / + FuncSize); + } + + return Density == DBL_MAX ? 0.0 : Density; +} + +void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + multiple samples begin/end at this points. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 beings at A, ends at C, its value is 200. + For A, BeginCount is the sum of sample begins at A, which is 300 and no + samples ends at A, so EndCount is 0. + Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount = UINT64_MAX; + // Sum of sample counts ending at this point + uint64_t EndCount = UINT64_MAX; + // Is the begin point of a zero range. 
+ bool IsZeroRangeBegin = false; + // Is the end point of a zero range. + bool IsZeroRangeEnd = false; + + void addBeginCount(uint64_t Count) { + if (BeginCount == UINT64_MAX) + BeginCount = 0; + BeginCount += Count; + } + + void addEndCount(uint64_t Count) { + if (EndCount == UINT64_MAX) + EndCount = 0; + EndCount += Count; + } + }; + + /* + For the above example, with boundary points, the following logic finds two + disjoint regions of + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that is both a begin and an end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. + + Example for zero value range: + + |<--- 100 --->| + |<--- 200 --->| + |<--------------- 0 ----------------->| + A B C D E F + + [A, B-1] : 0 + [B, C] : 100 + [C+1, D-1]: 0 + [D, E] : 200 + [E+1, F] : 0 + */ + std::map Boundaries; + + for (const auto &Item : Ranges) { + assert(Item.first.first <= Item.first.second && + "Invalid instruction range"); + auto &BeginPoint = Boundaries[Item.first.first]; + auto &EndPoint = Boundaries[Item.first.second]; + uint64_t Count = Item.second; + + BeginPoint.addBeginCount(Count); + EndPoint.addEndCount(Count); + if (Count == 0) { + BeginPoint.IsZeroRangeBegin = true; + EndPoint.IsZeroRangeEnd = true; + } + } + + // Use UINT64_MAX to indicate there is no existing range between BeginAddress + // and the next valid address + uint64_t BeginAddress = UINT64_MAX; + int ZeroRangeDepth = 0; + uint64_t Count = 0; + for (const auto &Item : Boundaries) { + uint64_t Address = Item.first; + const BoundaryPoint &Point = Item.second; + if (Point.BeginCount != UINT64_MAX) { + if (BeginAddress != UINT64_MAX) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; +
BeginAddress = Address; + ZeroRangeDepth += Point.IsZeroRangeBegin; + } + if (Point.EndCount != UINT64_MAX) { + assert((BeginAddress != UINT64_MAX) && + "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + assert(Count >= Point.EndCount && "Mismatched live ranges"); + Count -= Point.EndCount; + BeginAddress = Address + 1; + ZeroRangeDepth -= Point.IsZeroRangeEnd; + // If the remaining count is zero and it's no longer in a zero range, this + // means we consume all the ranges before, thus mark BeginAddress as + // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: + // [<---- 10 ---->] + // [<---- 20 ---->] + // A B C D + // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't + // have the [B+1, C-1] zero range. + if (Count == 0 && ZeroRangeDepth == 0) + BeginAddress = UINT64_MAX; + } + } +} + +void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, + uint64_t Count) { + // Use the maximum count of samples with same line location + uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); + + // Use duplication factor to compensated for loop unroll/vectorization. + // Note that this is only needed when we're taking MAX of the counts at + // the location instead of SUM. + Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); + + ErrorOr R = + FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); + + uint64_t PreviousCount = R ? 
R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +void ProfileGeneratorBase::updateCallsiteSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateCallsiteSamples(); + } +} + +void ProfileGeneratorBase::updateFunctionSamples() { + updateCallsiteSamples(); + + if (UpdateTotalSamples) + updateTotalSamples(); +} + +void ProfileGeneratorBase::collectProfiledFunctions() { + std::unordered_set ProfiledFunctions; + if (collectFunctionsFromRawProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else + llvm_unreachable("Unsupported input profile"); +} + +bool ProfileGeneratorBase::collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions) { + if (!SampleCounters) + return false; + // Go through all the stacks, ranges and branches in sample counters, use + // the start of the range to look up the function it belongs and record the + // function. 
+ for (const auto &CI : *SampleCounters) { + if (const auto *CtxKey = dyn_cast(CI.first.getPtr())) { + for (auto Addr : CtxKey->Context) { + if (FuncRange *FRange = Binary->findFuncRangeForOffset( + Binary->virtualAddrToOffset(Addr))) + ProfiledFunctions.insert(FRange->Func); + } + } + + for (auto Item : CI.second.RangeCounter) { + uint64_t StartOffset = Item.first.first; + if (FuncRange *FRange = Binary->findFuncRangeForOffset(StartOffset)) + ProfiledFunctions.insert(FRange->Func); + } + + for (auto Item : CI.second.BranchCounter) { + uint64_t SourceOffset = Item.first.first; + uint64_t TargetOffset = Item.first.second; /* key = (source, target) */ + if (FuncRange *FRange = Binary->findFuncRangeForOffset(SourceOffset)) + ProfiledFunctions.insert(FRange->Func); + if (FuncRange *FRange = Binary->findFuncRangeForOffset(TargetOffset)) + ProfiledFunctions.insert(FRange->Func); + } + } + return true; +} + +bool ProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (const auto &FS : ProfileMap) { + if (auto *Func = Binary->getBinaryFunction(FS.first.getName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +bool CSProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (auto *Node : ContextTracker) { + if (!Node->getFuncName().empty()) + if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { + SampleContext Context(FuncName); + auto Ret = ProfileMap.emplace(Context, FunctionSamples()); + if (Ret.second) { + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(Context); + } + return Ret.first->second; +} + +void ProfileGenerator::generateProfile() { + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) + Binary->decodePseudoProbe(); + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile();
+ } else { + generateLineNumBasedProfile(); + } + } + + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(ProfileMap); + trimColdProfiles(ProfileMap, ColdCountThreshold); + calculateAndShowDensity(ProfileMap); +} + +void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold) { + if (!TrimColdProfile) + return; + + // Move cold profiles into a tmp container. + std::vector ColdProfiles; + for (const auto &I : ProfileMap) { + if (I.second.getTotalSamples() < ColdCntThreshold) + ColdProfiles.emplace_back(I.first); + } + + // Remove the cold profile from ProfileMap. + for (const auto &I : ColdProfiles) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::generateProbeBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( + const RangeSample &RangeCounter) { + ProbeCounterMap ProbeCounter; + // preprocessRangeCounter returns disjoint ranges, so no longer to redo it 
+ // inside extractProbesFromRange. + extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, + false); + + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(Probe, FrameVec, true); + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, Count); + FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); + if (Probe->isEntry()) + FunctionProfile.addHeadSamples(Count); + } +} + +void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + + uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + + // Record called target sample and its count. 
+ SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(CallProbe, FrameVec, true); + + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, 0, CalleeName, Count); + } + } +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].FuncName); + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].FuncName); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function 
it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartOffset = I.first.first; + for (const auto &Range : Binary->getRangesForOffset(StartOffset)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(Offset); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. + FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, Count * Binary->getInstSize(Offset)); + updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), + Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) { + // Get the function range by branch target if it's a call branch. 
+ auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset); + + // We won't accumulate sample count for a range whose start is not the real + // function entry such as outlined function or inner labels. + if (!FRange || !FRange->IsFuncEntry) + return StringRef(); + + return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); +} + +void ProfileGenerator::populateBoundarySamplesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + // Record called target sample and its count. + const SampleContextFrameVector &FrameVec = + Binary->getFrameLocationStack(SourceOffset); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + CalleeName, Count); + } + // Add head samples for callee. + FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); + CalleeProfile.addHeadSamples(Count); + } +} + +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + +FunctionSamples * +CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined) { + FunctionSamples *FProfile = ContextNode->getFunctionSamples(); + if (!FProfile) { + FSamplesList.emplace_back(); + FProfile = &FSamplesList.back(); + FProfile->setName(ContextNode->getFuncName()); + ContextNode->setFunctionSamples(FProfile); + } + // Update ContextWasInlined attribute for existing contexts. 
+ // The current function can be called in two ways: + // - when processing a probe of the current frame + // - when processing the entry probe of an inlinee's frame, which + // is then used to update the callsite count of the current frame. + // The two can happen in any order, hence here we are making sure + // `ContextWasInlined` is always set as expected. + // TODO: Note that the former does not always happen if no probes of the + // current frame has samples, and if the latter happens, we could lose the + // attribute. This should be fixed. + if (WasLeafInlined) + FProfile->getContext().setAttribute(ContextWasInlined); + return FProfile; +} + +ContextTrieNode * +CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined) { + ContextTrieNode *ContextNode = + ContextTracker.getOrCreateContextPath(Context, true); + getOrCreateFunctionSamples(ContextNode, WasLeafInlined); + return ContextNode; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) + Binary->decodePseudoProbe(); + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + postProcessProfiles(); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + std::unordered_set ProfiledFunctions; + for (auto *Func : Binary->getProfiledFunctions()) + Binary->computeInlinedContextSizeForFunc(Func); + + // Flush the symbolizer to save memory. 
+ Binary->flushSymbolizer(); +} + +void CSProfileGenerator::updateFunctionSamples() { + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + if (UpdateTotalSamples) + FSamples->updateTotalSamples(); + FSamples->updateCallsiteSamples(); + } + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : *SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + ContextTrieNode *ContextNode = &getRootContext(); + // Sample context will be empty if the jump is an external-to-internal call + // pattern, the head samples should be added for the internal function. + if (!CtxKey->Context.empty()) { + // Get or create function profile for the range + ContextNode = + getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); + // Fill in function body samples + populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), + CI.second.RangeCounter); + } + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(getRootContext()); + + updateFunctionSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. 
+ RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + uint64_t Offset = Binary->virtualAddrToOffset(IP.Address); + auto LeafLoc = Binary->getInlineLeafFrameLoc(Offset); + if (LeafLoc) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + ContextTrieNode *Node, const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceOffset = Entry.first.first; + uint64_t TargetOffset = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + + ContextTrieNode *CallerNode = Node; + LineLocation CalleeCallSite(0, 0); + if (CallerNode != &getRootContext()) { + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset); + if (LeafLoc) { + CallerNode->getFunctionSamples()->addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, + Count); + // Record head sample for called 
target(callee) + CalleeCallSite = LeafLoc->Location; + } + } + + ContextTrieNode *CalleeNode = + CallerNode->getOrCreateChildContext(CalleeCallSite, CalleeName); + FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); + CalleeProfile->addHeadSamples(Count); + } +} + +void CSProfileGenerator::populateInferredFunctionSamples( + ContextTrieNode &Node) { + // There is no call jmp sample between the inliner and inlinee, we need to use + // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s + // sample depends on child(inlinee)'s sample, so traverse the tree in + // post-order. + for (auto &It : Node.getAllChildContext()) + populateInferredFunctionSamples(It.second); + + FunctionSamples *CalleeProfile = Node.getFunctionSamples(); + if (!CalleeProfile) + return; + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile->getHeadSamples()) + return; + ContextTrieNode *CallerNode = Node.getParentContext(); + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CallerNode == &getRootContext()) + return; + + LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); + FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); + // If we don't have samples with location, use 1 to indicate live. 
+ if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + Node.getFuncName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); +} + +void CSProfileGenerator::convertToProfileMap( + ContextTrieNode &Node, SampleContextFrameVector &Context) { + FunctionSamples *FProfile = Node.getFunctionSamples(); + if (FProfile) { + Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); + FunctionSamples &NewProfile = Ret.first->second; + NewProfile.getContext().setContext(NewContext); + Context.pop_back(); + } + + for (auto &It : Node.getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); + convertToProfileMap(ChildNode, Context); + Context.pop_back(); + } +} + +void CSProfileGenerator::convertToProfileMap() { + assert(ProfileMap.empty() && + "ProfileMap should be empty before converting from the trie"); + assert(IsProfileValidOnTrie && + "Do not convert the trie twice, it's already destroyed"); + + SampleContextFrameVector Context; + for (auto &It : getRootContext().getAllChildContext()) + convertToProfileMap(It.second, Context); + + IsProfileValidOnTrie = false; +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. 
+ if (EnableCSPreInliner) { + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + convertToProfileMap(); + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + // Merge function samples of CS profile to calculate profile density. + sampleprof::SampleProfileMap ContextLessProfiles; + for (const auto &I : ProfileMap) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + + calculateAndShowDensity(ContextLessProfiles); + if (GenCSNestedProfile) { + CSProfileConverter CSConverter(ProfileMap); + CSConverter.convertProfiles(); + FunctionSamples::ProfileIsCS = false; + } +} + +void ProfileGeneratorBase::computeSummaryAndThreshold( + SampleProfileMap &Profiles) { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + Summary = Builder.computeSummaryForProfiles(Profiles); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +void CSProfileGenerator::computeSummaryAndThreshold() { + // Always merge and use context-less profile map to compute summary. 
+ SampleProfileMap ContextLessProfiles; + ContextTracker.createContextLessProfileMap(ContextLessProfiles); + + // Set the flag below to avoid merging the profile again in + // computeSummaryAndThreshold + FunctionSamples::ProfileIsCS = false; + assert( + (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && + "Don't set --profile-summary-contextless to false for profile " + "generation"); + ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles); + // Recover the old value. + FunctionSamples::ProfileIsCS = true; +} + +void ProfileGeneratorBase::extractProbesFromRange( + const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges) { + const RangeSample *PRanges = &RangeCounter; + RangeSample Ranges; + if (FindDisjointRanges) { + findDisjointRanges(Ranges, RangeCounter); + PRanges = &Ranges; + } + + for (const auto &Range : *PRanges) { + uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first); + uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second); + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. 
+ if (IP.Address > RangeEnd) + continue; + + do { + const AddressProbesMap &Address2ProbesMap = + Binary->getAddress2ProbesMap(); + auto It = Address2ProbesMap.find(IP.Address); + if (It != Address2ProbesMap.end()) { + for (const auto &Probe : It->second) { + ProbeCounter[&Probe] += Count; + } + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +static void +extractPrefixContextStack(SampleContextFrameVector &ContextStack, + const SmallVectorImpl &Addresses, + ProfiledBinary *Binary) { + SmallVector Probes; + for (auto Addr : reverse(Addresses)) { + const MCDecodedPseudoProbe *CallProbe = Binary->getCallProbeForAddr(Addr); + // These could be the cases when a probe is not found at a calliste. Cutting + // off the context from here since the inliner will not know how to consume + // a context with unknown callsites. + // 1. for functions that are not sampled when + // --decode-probe-for-profiled-functions-only is on. + // 2. for a merged callsite. Callsite merging may cause the loss of original + // probe IDs. + // 3. for an external callsite. + if (!CallProbe) + break; + Probes.push_back(CallProbe); + } + + std::reverse(Probes.begin(), Probes.end()); + + // Extract context stack for reusing, leaf context stack will be added + // compressed while looking up function profile. 
+ for (const auto *P : Probes) { + Binary->getInlineContextForProbe(P, ContextStack, true); + } +} + +void CSProfileGenerator::generateProbeBasedProfile() { + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + for (const auto &CI : *SampleCounters) { + const AddrBasedCtxKey *CtxKey = + cast(CI.first.getPtr()); // key is dereferenced below, so it must be address-based + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, CtxKey->Context, Binary); + // Fill in function body samples from probes, also infer caller's samples + // from callee's probe + populateBodySamplesWithProbes(CI.second.RangeCounter, ContextStack); + // Fill in boundary samples for a call probe + populateBoundarySamplesWithProbes(CI.second.BranchCounter, ContextStack); + } +} + +void CSProfileGenerator::populateBodySamplesWithProbes( + const RangeSample &RangeCounter, SampleContextFrames ContextStack) { + ProbeCounterMap ProbeCounter; + // Extract the top frame probes by looking up each address among the range in + // the Address2ProbeMap + extractProbesFromRange(RangeCounter, ProbeCounter); + std::unordered_map> + FrameSamples; + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + // Disjoint ranges may introduce zero-filled gaps that + // don't belong to current context, filter them out. + if (!Probe->isBlock() || Count == 0) + continue; + + ContextTrieNode *ContextNode = + getContextNodeForLeafProbe(ContextStack, Probe); + FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples(); + // Record the current frame and FunctionProfile whenever samples are + // collected for non-dangling probes. This is for reporting all of the + // zero count probes of the frame later.
+ FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); + FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); + FunctionProfile.addTotalSamples(Count); + if (Probe->isEntry()) { + FunctionProfile.addHeadSamples(Count); + // Look up for the caller's function profile + const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); + ContextTrieNode *CallerNode = ContextNode->getParentContext(); + if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { + // Since the context id will be compressed, we have to use callee's + // context id to infer caller's context id to ensure they share the + // same context prefix. + uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; + assert(CallerIndex && + "Inferred caller's location index shouldn't be zero!"); + FunctionSamples &CallerProfile = + *getOrCreateFunctionSamples(CallerNode); + CallerProfile.setFunctionHash(InlinerDesc->FuncHash); + CallerProfile.addBodySamples(CallerIndex, 0, Count); + CallerProfile.addTotalSamples(Count); + CallerProfile.addCalledTargetSamples(CallerIndex, 0, + ContextNode->getFuncName(), Count); + } + } + } + + // Assign zero count for remaining probes without sample hits to + // differentiate from probes optimized away, of which the counts are unknown + // and will be inferred by the compiler. 
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (auto *Probe : I.first->getProbes()) { + FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, SampleContextFrames ContextStack) { + for (const auto &BI : BranchCounter) { + uint64_t SourceOffset = BI.first.first; + uint64_t TargetOffset = BI.first.second; + uint64_t Count = BI.second; + uint64_t SourceAddress = Binary->offsetToVirtualAddr(SourceOffset); + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(ContextStack, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForOffset(TargetOffset); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName, + Count); + } +} + +ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( + SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) { + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto 
*FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + ContextTrieNode *ContextNode = + getOrCreateContextNode(NewContextStack, WasLeafInlined); + ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); + return ContextNode; +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + SampleContextFrames ContextStack, const MCDecodedPseudoProbe *LeafProbe) { + return *getContextNodeForLeafProbe(ContextStack, LeafProbe) + ->getFunctionSamples(); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.h new file mode 100644 index 00000000000..0ce464506be --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/ProfileGenerator.h @@ -0,0 +1,383 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "CSPreInliner.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfWriter.h" +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +using ProbeCounterMap = + std::unordered_map; + +// This base class for profile generation of sample-based PGO. We reuse all +// structures relating to function profiles and profile writers as seen in +// /ProfileData/SampleProf.h. 
+class ProfileGeneratorBase { + +public: + ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : Binary(Binary), SampleCounters(Counters){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const SampleProfileMap &&Profiles) + : Binary(Binary), ProfileMap(std::move(Profiles)){}; + + virtual ~ProfileGeneratorBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, + bool profileIsCS); + static std::unique_ptr + create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, + bool profileIsCS); + virtual void generateProfile() = 0; + void write(); + + static uint32_t + getDuplicationFactor(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 1 + : llvm::DILocation::getDuplicationFactorFromDiscriminator( + Discriminator); + } + + static uint32_t + getBaseDiscriminator(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? Discriminator + : DILocation::getBaseDiscriminatorFromDiscriminator( + Discriminator, /* IsFSDiscriminator */ false); + } + + static bool UseFSDiscriminator; + +protected: + // Use SampleProfileWriter to serialize profile map + void write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap); + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. 
+ */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Go through each address from range to extract the top frame probe by + // looking up in the Address2ProbeMap + void extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges = true); + + // Helper function for updating body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const SampleContextFrame &LeafLoc, + uint64_t Count); + + void updateFunctionSamples(); + + void updateTotalSamples(); + + void updateCallsiteSamples(); + + StringRef getCalleeNameForOffset(uint64_t TargetOffset); + + void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + double calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold); + + void showDensitySuggestion(double Density); + + void collectProfiledFunctions(); + + bool collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions); + + // Collect profiled Functions for llvm sample profile input. + virtual bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) = 0; + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. 
+ uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + + ProfiledBinary *Binary = nullptr; + + std::unique_ptr Summary; + + // Used by SampleProfileWriter + SampleProfileMap ProfileMap; + + const ContextSampleCounterMap *SampleCounters = nullptr; +}; + +class ProfileGenerator : public ProfileGeneratorBase { + +public: + ProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) + : ProfileGeneratorBase(Binary, std::move(Profiles)){}; + void generateProfile() override; + +private: + void generateLineNumBasedProfile(); + void generateProbeBasedProfile(); + RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); + FunctionSamples &getTopLevelFunctionProfile(StringRef FuncName); + // Helper function to get the leaf frame's FunctionProfile by traversing the + // inline stack and meanwhile it adds the total samples for each frame's + // function profile. 
+ FunctionSamples & + getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, + uint64_t Count); + void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); + void + populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); + void + populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); + void populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters); + void postProcessProfiles(); + void trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold); + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; +}; + +class CSProfileGenerator : public ProfileGeneratorBase { +public: + CSProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) + : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; + void generateProfile() override; + + // Trim the context stack at a given depth. + template + static void trimContext(SmallVectorImpl &S, int Depth = MaxContextDepth) { + if (Depth < 0 || static_cast(Depth) >= S.size()) + return; + std::copy(S.begin() + S.size() - static_cast(Depth), S.end(), + S.begin()); + S.resize(Depth); + } + + // Remove adjacent repeated context sequences up to a given sequence length, + // -1 means no size limit. Note that repeated sequences are identified based + // on the exact call site, this is finer granularity than function recursion. + template + static void compressRecursionContext(SmallVectorImpl &Context, + int32_t CSize = MaxCompressionSize) { + uint32_t I = 1; + uint32_t HS = static_cast(Context.size() / 2); + uint32_t MaxDedupSize = + CSize == -1 ? 
HS : std::min(static_cast(CSize), HS); + auto BeginIter = Context.begin(); + // Use an in-place algorithm to save memory copy + // End indicates the end location of current iteration's data + uint32_t End = 0; + // Deduplicate from length 1 to the max possible size of a repeated + // sequence. + while (I <= MaxDedupSize) { + // This is a linear algorithm that deduplicates adjacent repeated + // sequences of size I. The deduplication detection runs on a sliding + // window whose size is 2*I and it keeps sliding the window to deduplicate + // the data inside. Once duplication is detected, deduplicate it by + // skipping the right half part of the window, otherwise just copy back + // the new one by appending them at the back of End pointer(for the next + // iteration). + // + // For example: + // Input: [a1, a2, b1, b2] + // (Added index to distinguish the same char, the origin is [a, a, b, + // b], the size of the dedup window is 2(I = 1) at the beginning) + // + // 1) The initial status is a dummy window[null, a1], then just copy the + // right half of the window(End = 0), then slide the window. + // Result: [a1], a2, b1, b2 (End points to the element right before ], + // after ] is the data of the previous iteration) + // + // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of + // the window i.e the duplication happen. Only slide the window. + // Result: [a1], a2, b1, b2 + // + // 3) Next window is [a2, b1], copy the right half of the window(b1 is + // new) to the End and slide the window. + // Result: [a1, b1], b1, b2 + // + // 4) Next window is [b1, b2], same to 2), skip b2. + // Result: [a1, b1], b1, b2 + // After resize, it will be [a, b] + + // Use pointers like below to do comparison inside the window + // [a b c a b c] + // | | | | | + // LeftBoundary Left Right Left+I Right+I + // A duplication found if Left < LeftBoundry. 
+ + int32_t Right = I - 1; + End = I; + int32_t LeftBoundary = 0; + while (Right + I < Context.size()) { + // To avoids scanning a part of a sequence repeatedly, it finds out + // the common suffix of two hald in the window. The common suffix will + // serve as the common prefix of next possible pair of duplicate + // sequences. The non-common part will be ignored and never scanned + // again. + + // For example. + // Input: [a, b1], c1, b2, c2 + // I = 2 + // + // 1) For the window [a, b1, c1, b2], non-common-suffix for the right + // part is 'c1', copy it and only slide the window 1 step. + // Result: [a, b1, c1], b2, c2 + // + // 2) Next window is [b1, c1, b2, c2], so duplication happen. + // Result after resize: [a, b, c] + + int32_t Left = Right; + while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { + // Find the longest suffix inside the window. When stops, Left points + // at the diverging point in the current sequence. + Left--; + } + + bool DuplicationFound = (Left < LeftBoundary); + // Don't need to recheck the data before Right + LeftBoundary = Right + 1; + if (DuplicationFound) { + // Duplication found, skip right half of the window. + Right += I; + } else { + // Copy the non-common-suffix part of the adjacent sequence. + std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, + BeginIter + End); + End += Left + I - Right; + // Only slide the window by the size of non-common-suffix + Right = Left + I; + } + } + // Don't forget the remaining part that's not scanned. + std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); + End += Context.size() - Right - 1; + I++; + Context.resize(End); + MaxDedupSize = std::min(static_cast(End / 2), MaxDedupSize); + } + } + +private: + void generateLineNumBasedProfile(); + + FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined = false); + + // Lookup or create ContextTrieNode for the context, FunctionSamples is + // created inside this function. 
+ ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined = false); + + // For profiled only functions, on-demand compute their inline context + // function byte size which is used by the pre-inliner. + void computeSizeForProfiledFunctions(); + // Post processing for profiles before writing out, such as merging + // and trimming cold profiles, running preinliner on profiles. + void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + + void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, + const BranchSample &BranchCounters); + + void populateInferredFunctionSamples(ContextTrieNode &Node); + + void updateFunctionSamples(); + + void generateProbeBasedProfile(); + + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + SampleContextFrames ContextStack); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + SampleContextFrames ContextStack); + + ContextTrieNode * + getContextNodeForLeafProbe(SampleContextFrames ContextStack, + const MCDecodedPseudoProbe *LeafProbe); + + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(SampleContextFrames ContextStack, + const MCDecodedPseudoProbe *LeafProbe); + + void convertToProfileMap(ContextTrieNode &Node, + SampleContextFrameVector &Context); + + void convertToProfileMap(); + + void computeSummaryAndThreshold(); + + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; + + ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; + + // The container for holding the FunctionSamples used by context trie. + std::list FSamplesList; + + // Underlying context table serves for sample profile writer. 
+ std::unordered_set Contexts; + + SampleContextTracker ContextTracker; + + bool IsProfileValidOnTrie = true; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.cpp new file mode 100644 index 00000000000..eef5b8eb8a0 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.cpp @@ -0,0 +1,889 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "ProfileGenerator.h" +#include "llvm/ADT/Triple.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt ShowDisassemblyOnly("show-disassembly-only", + cl::desc("Print disassembled code.")); + +cl::opt ShowSourceLocations("show-source-locations", + cl::desc("Print source locations.")); + +static cl::opt + ShowCanonicalFnName("show-canonical-fname", + cl::desc("Print canonical function name.")); + +static cl::opt ShowPseudoProbe( + "show-pseudo-probe", + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt UseDwarfCorrelation( + 
"use-dwarf-correlation", + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::opt + DWPPath("dwp", cl::init(""), + cl::desc("Path of .dwp file. When not specified, it will be " + ".dwp in the same directory as the main binary.")); + +static cl::list DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. Only work with show-disassembly-only")); + +extern cl::opt ShowDetailedWarning; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + StringRef CallerName = Callsite.FuncName; + LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + + Optional Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. 
+ LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { + PrevNode = CurrNode; + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); + if (CurrNode && CurrNode->getFunctionSize()) + Size = CurrNode->getFunctionSize().value(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. + if (!Size) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size && CurrNode && !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize()) + Size = CurrNode->getFunctionSize().value(); + } + } + + assert(Size && "We should at least find one context size."); + return Size.value(); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second.get(), ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, CallerName); + } + // Add 0 size to make known. 
+ SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second.get(), + ProbeContext); + } + + ProbeContext.pop_back(); +} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartOffset(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + auto *Obj = dyn_cast(&ExeBinary); + if (!Obj) + exitWithError("not a valid Elf image", Path); + + TheTriple = Obj->makeTriple(); + // Current only support X86 + if (!TheTriple.isX86()) + exitWithError("unsupported target", TheTriple.getTriple()); + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + checkPseudoProbe(Obj); + + if (ShowDisassemblyOnly) + decodePseudoProbe(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. 
+ if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*cast(&ExeBinary)); + } + + // Disassemble the text sections. + disassemble(Obj); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologOffsets(StartOffset2FuncRangeMap); + ProEpilogTracker.inferEpilogOffsets(RetOffsets); + + warnNoFuncEntry(); + + // TODO: decode other sections. +} + +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) { + uint64_t Offset1 = virtualAddrToOffset(Address1); + uint64_t Offset2 = virtualAddrToOffset(Address2); + const SampleContextFrameVector &Context1 = getFrameLocationStack(Offset1); + const SampleContextFrameVector &Context2 = getFrameLocationStack(Offset2); + if (Context1.size() != Context2.size()) + return false; + if (Context1.empty()) + return false; + // The leaf frame contains location within the leaf, and it + // needs to be remove that as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +SampleContextFrameVector +ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined) { + SampleContextFrameVector ContextVec; + if (Stack.empty()) + return ContextVec; + // Process from frame root to leaf + for (auto Address : Stack) { + uint64_t Offset = virtualAddrToOffset(Address); + const SampleContextFrameVector &ExpandedContext = + getFrameLocationStack(Offset); + // An instruction without a valid debug line will be ignored by sample + // processing + if (ExpandedContext.empty()) + return SampleContextFrameVector(); + // Set WasLeafInlined to the size of inlined frame count for the last + // address which is leaf + WasLeafInlined = (ExpandedContext.size() > 1); + 
ContextVec.append(ExpandedContext); + } + + // Replace with decoded base discriminator + for (auto &Frame : ContextVec) { + Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator( + Frame.Location.Discriminator, UseFSDiscriminator); + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + // Compress the context string except for the leaf frame + auto LeafFrame = ContextVec.back(); + LeafFrame.Location = LineLocation(0, 0); + ContextVec.pop_back(); + CSProfileGenerator::compressRecursionContext(ContextVec); + CSProfileGenerator::trimContext(ContextVec); + ContextVec.push_back(LeafFrame); + return ContextVec; +} + +template +void ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName) { + const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName); + // FIXME: This should be the page size of the system running profiling. + // However such info isn't available at post-processing time, assuming + // 4K page now. Note that we don't use EXEC_PAGESIZE from + // because we may build the tools on non-linux. + uint32_t PageSize = 0x1000; + for (const typename ELFT::Phdr &Phdr : PhdrRange) { + if (Phdr.p_type == ELF::PT_LOAD) { + if (!FirstLoadableAddress) + FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U); + if (Phdr.p_flags & ELF::PF_X) { + // Segments will always be loaded at a page boundary. 
+ PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses( + const ELFObjectFileBase *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else + llvm_unreachable("invalid ELF object format"); +} + +void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + bool HasProbeDescSection = false; + bool HasPseudoProbeSection = false; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + if (SectionName == ".pseudo_probe_desc") { + HasProbeDescSection = true; + } else if (SectionName == ".pseudo_probe") { + HasPseudoProbeSection = true; + } + } + + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = HasProbeDescSection && HasPseudoProbeSection; +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (!UsePseudoProbes) + return; + + std::unordered_set ProfiledGuids; + if (!ShowDisassemblyOnly) + for (auto *F : ProfiledFunctions) + ProfiledGuids.insert(Function::getGUID(F->FuncName)); + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = 
Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == ".pseudo_probe_desc") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildGUID2FuncDescMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError( + "Pseudo Probe decoder fail in .pseudo_probe_desc section"); + } else if (SectionName == ".pseudo_probe") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildAddress2ProbeMap( + reinterpret_cast(Contents.data()), + Contents.size(), ProfiledGuids)) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe section"); + } + } + + // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe + // is available + if (TrackFuncContextSize) { + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = Child.second.get(); + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; + TopLevelProbeFrameMap[FuncName] = Frame; + } + } + + if (ShowPseudoProbe) + ProbeDecoder.printGUID2FuncDescMap(outs()); +} + +void ProfiledBinary::decodePseudoProbe() { + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + auto *Obj = dyn_cast(&ExeBinary); + decodePseudoProbe(Obj); +} + +void ProfiledBinary::setIsFuncEntry(uint64_t Offset, StringRef RangeSymName) { + // Note that the start offset of each ELF section can be a non-function + // symbol, we need to binary search for the start of a real function range. + auto *FuncRange = findFuncRangeForOffset(Offset); + // Skip external function symbol. + if (!FuncRange) + return; + + // Set IsFuncEntry to ture if there is only one range in the function or the + // RangeSymName from ELF is equal to its DWARF-based function name. 
+ if (FuncRange->Func->Ranges.size() == 1 || + (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)) + FuncRange->IsFuncEntry = true; +} + +bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, + const SectionRef &Section) { + std::size_t SE = Symbols.size(); + uint64_t SectionOffset = Section.getAddress() - getPreferredBaseAddress(); + uint64_t SectSize = Section.getSize(); + uint64_t StartOffset = Symbols[SI].Addr - getPreferredBaseAddress(); + uint64_t NextStartOffset = + (SI + 1 < SE) ? Symbols[SI + 1].Addr - getPreferredBaseAddress() + : SectionOffset + SectSize; + setIsFuncEntry(StartOffset, + FunctionSamples::getCanonicalFnName(Symbols[SI].Name)); + + StringRef SymbolName = + ShowCanonicalFnName + ? FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + auto WarnInvalidInsts = [](uint64_t Start, uint64_t End) { + WithColor::warning() << "Invalid instructions at " + << format("%8" PRIx64, Start) << " - " + << format("%8" PRIx64, End) << "\n"; + }; + + uint64_t Offset = StartOffset; + // Size of a consecutive invalid instruction range starting from Offset -1 + // backwards. + uint64_t InvalidInstLength = 0; + while (Offset < NextStartOffset) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. 
+ bool Disassembled = + DisAsm->getInstruction(Inst, Size, Bytes.slice(Offset - SectionOffset), + Offset + getPreferredBaseAddress(), nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), + Offset + getPreferredBaseAddress()); + } + outs() << format("%8" PRIx64 ":", Offset + getPreferredBaseAddress()); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Offset); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + Offset2InstSizeMap[Offset] = Size; + + // Populate address maps. + CodeAddrOffsets.push_back(Offset); + if (MCDesc.isCall()) { + CallOffsets.insert(Offset); + UncondBranchOffsets.insert(Offset); + } else if (MCDesc.isReturn()) { + RetOffsets.insert(Offset); + UncondBranchOffsets.insert(Offset); + } else if (MCDesc.isBranch()) { + if (MCDesc.isUnconditionalBranch()) + UncondBranchOffsets.insert(Offset); + BranchOffsets.insert(Offset); + } + + if (InvalidInstLength) { + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Offset += Size; + } + + if (InvalidInstLength) + WarnInvalidInsts(Offset - InvalidInstLength, Offset - 1); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no 
register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + SubtargetFeatures Features = Obj->getFeatures(); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features.getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. 
Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + DisassembleFunctionSet.insert(DisassembleFunctions.begin(), + DisassembleFunctions.end()); + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionOffset = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. + TextSections.insert({SectionOffset, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + // Dissassemble rodata section to check if FS discriminator symbol exists. 
+ checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ELFObjectFileBase *Obj, + std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) { + for (const auto &DieInfo : CompilationUnit.dies()) { + llvm::DWARFDie Die(&CompilationUnit, &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t FuncStart = Range.LowPC; + uint64_t FuncSize = Range.HighPC - FuncStart; + + if (FuncSize == 0 || FuncStart < getPreferredBaseAddress()) + continue; + + uint64_t StartOffset = FuncStart - getPreferredBaseAddress(); + uint64_t EndOffset = Range.HighPC - getPreferredBaseAddress(); + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. 
+ Func.Ranges.emplace_back(StartOffset, EndOffset); + + auto R = StartOffset2FuncRangeMap.emplace(StartOffset, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartOffset = StartOffset; + FRange.EndOffset = EndOffset; + } else { + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartOffset + getPreferredBaseAddress()) + << " " << R.first->second.getFuncName() << " and " << Name << "\n"; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, DWPPath); + if (!DebugContext) + exitWithError("Error creating the debug info context", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) + loadSymbolsFromDWARFUnit(*CompilationUnit.get()); + + // Handles DWO sections that can either be in .o, .dwo or .dwp files. + for (const auto &CompilationUnit : DebugContext->compile_units()) { + DWARFUnit *const DwarfUnit = CompilationUnit.get(); + if (llvm::Optional DWOId = DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + WithColor::warning() + << "DWO debug information for " << DWOName + << " was not loaded. 
Please check the .o, .dwo or .dwp path.\n"; + continue; + } + loadSymbolsFromDWARFUnit(*DWOCU); + } + } + + if (BinaryFunctions.empty()) + WithColor::warning() << "Loading of DWARF info completed, but no binary " + "functions have been retrieved.\n"; +} + +void ProfiledBinary::populateSymbolListFromDWARF( + ProfileSymbolList &SymbolList) { + for (auto &I : StartOffset2FuncRangeMap) + SymbolList.add(I.second.getFuncName()); +} + +void ProfiledBinary::setupSymbolizer() { + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = false; + SymbolizerOpts.DefaultArch = TheTriple.getArchName().str(); + SymbolizerOpts.UseSymbolTable = false; + SymbolizerOpts.RelativeAddresses = false; + SymbolizerOpts.DWPName = DWPPath; + Symbolizer = std::make_unique(SymbolizerOpts); +} + +SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName, + bool UseProbeDiscriminator) { + assert(this == IP.Binary && + "Binary should only symbolize its own instruction"); + auto Addr = object::SectionedAddress{IP.Offset + getPreferredBaseAddress(), + object::SectionedAddress::UndefSection}; + DIInliningInfo InlineStack = unwrapOrError( + Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), + SymbolizerPath); + + SampleContextFrameVector CallStack; + for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { + const auto &CallerFrame = InlineStack.getFrame(I); + if (CallerFrame.FunctionName == "") + break; + + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); + + uint32_t Discriminator = CallerFrame.Discriminator; + uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff; + if (UseProbeDiscriminator) { + LineOffset = + PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator); + Discriminator = 0; + } + + 
+ LineLocation Line(LineOffset, Discriminator); + auto It = NameStrings.insert(FunctionName.str()); + CallStack.emplace_back(*It.first, Line); + } + + return CallStack; +} + +void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t StartOffset, + uint64_t EndOffset) { + uint64_t RangeBegin = offsetToVirtualAddr(StartOffset); + uint64_t RangeEnd = offsetToVirtualAddr(EndOffset); + InstructionPointer IP(this, RangeBegin, true); + + if (IP.Address != RangeBegin) + WithColor::warning() << "Invalid start instruction at " + << format("%8" PRIx64, RangeBegin) << "\n"; + + if (IP.Address >= RangeEnd) + return; + + do { + uint64_t Offset = virtualAddrToOffset(IP.Address); + const SampleContextFrameVector &SymbolizedCallStack = + getFrameLocationStack(Offset, UsePseudoProbes); + uint64_t Size = Offset2InstSizeMap[Offset]; + + // Record instruction size for the corresponding context + FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size); + + } while (IP.advance() && IP.Address < RangeEnd); +} + +void ProfiledBinary::computeInlinedContextSizeForFunc( + const BinaryFunction *Func) { + // Note that a function can be split into multiple ranges, so compute for all + // ranges of the function. + for (const auto &Range : Func->Ranges) + computeInlinedContextSizeForRange(Range.first, Range.second); + + // Track optimized-away inlinee for probed binary. A function inlined and then + // optimized away should still have their probes left over in places.
+ if (usePseudoProbes()) { + auto I = TopLevelProbeFrameMap.find(Func->FuncName); + if (I != TopLevelProbeFrameMap.end()) { + BinarySizeContextTracker::ProbeFrameStack ProbeContext; + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second, + ProbeContext); + } + } +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeOffsetsSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeOffsetsSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; +} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.h new file mode 100644 index 00000000000..c09931656d9 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-15.0/ProfiledBinary.h @@ -0,0 +1,577 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include +#include +#include +#include +#include +#include +#include +#include + +extern cl::opt EnableCSPreInliner; +extern cl::opt UseContextCostForPreInliner; + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; + +struct InstructionPointer { + const ProfiledBinary *Binary; + union { + // Offset of the executable segment of the binary. + uint64_t Offset = 0; + // Also used as address in unwinder + uint64_t Address; + }; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. 
+enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartOffset; + // EndOffset is an exclusive bound. + uint64_t EndOffset; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start offset is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog offset tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog offsets. Used by virtual unwinding. 
+ std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void inferPrologOffsets(std::map &FuncStartOffsetMap) { + for (auto I : FuncStartOffsetMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + if (!IP.advance()) + break; + PrologEpilogSet.insert(IP.Offset); + } + } + + // Take the last two addresses before the return address as epilog + void inferEpilogOffsets(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + if (!IP.backward()) + break; + PrologEpilogSet.insert(IP.Offset); + } + } +}; + +// Track function byte size under different context (outlined version as well as +// various inlined versions). It also provides query support to get function +// size with the best matching context, which is used to help pre-inliner use +// accurate post-optimization size to make decisions. +// TODO: If an inlinee is completely optimized away, ideally we should have zero +// for its context size, currently we would miss such context since it doesn't +// have instructions. To fix this, we need to mark all inlinee with entry probe +// but without instructions as having zero size. +class BinarySizeContextTracker { +public: + // Add instruction with given size to a context + void addInstructionForContext(const SampleContextFrameVector &Context, + uint32_t InstrSize); + + // Get function size with a specific context. When there's no exact match + // for the given context, try to retrieve the size of that function from + // closest matching context. + uint32_t getFuncSizeForContext(const ContextTrieNode *Context); + + // For inlinees that are fully optimized away, we can establish zero size using + // their remaining probes.
+ void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); + + using ProbeFrameStack = SmallVector>; + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); + + void dump() { RootContext.dumpTree(); } + +private: + // Root node for context trie tree, note that this is a reverse context trie + // with callee as parent and caller as child. This way we can traverse from + // root to find the best/longest matching context if an exact match does not + // exist. It gives us the best possible estimate for function's post-inline, + // post-optimization byte size. + ContextTrieNode RootContext; +}; + +using OffsetRange = std::pair; + +class ProfiledBinary { + // Absolute path of the executable binary. + std::string Path; + // Path of the debug info binary. + std::string DebugBinaryPath; + // Path of symbolizer path which should be pointed to binary with debug info. + StringRef SymbolizerPath; + // The target triple. + Triple TheTriple; + // The runtime base address that the first executable segment is loaded at. + uint64_t BaseAddress = 0; + // The runtime base address that the first loadable segment is loaded at. + uint64_t FirstLoadableAddress = 0; + // The preferred load address of each executable segment. + std::vector PreferredTextSegmentAddresses; + // The file offset of each executable segment. + std::vector TextSegmentOffsets; + + // Multiple MC component info + std::unique_ptr MRI; + std::unique_ptr AsmInfo; + std::unique_ptr STI; + std::unique_ptr MII; + std::unique_ptr DisAsm; + std::unique_ptr MIA; + std::unique_ptr IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. + std::set> TextSections; + + // A map of mapping function name to BinaryFunction info. + std::unordered_map BinaryFunctions; + + // A list of binary functions that have samples.
+ std::unordered_set ProfiledFunctions; + + // An ordered map of mapping function's start offset to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map StartOffset2FuncRangeMap; + + // Offset to context location map. Used to expand the context. + std::unordered_map Offset2LocStackMap; + + // Offset to instruction size map. Also used for quick offset lookup. + std::unordered_map Offset2InstSizeMap; + + // An array of offsets of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector CodeAddrOffsets; + // A set of call instruction offsets. Used by virtual unwinding. + std::unordered_set CallOffsets; + // A set of return instruction offsets. Used by virtual unwinding. + std::unordered_set RetOffsets; + // An ordered set of unconditional branch instruction offsets. + std::set UncondBranchOffsets; + // A set of branch instruction offsets. + std::unordered_set BranchOffsets; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. + std::unique_ptr Symbolizer; + + // String table owning function name strings created from the symbolizer. + std::unordered_set NameStrings; + + // A collection of functions to print disassembly for. + StringSet<> DisassembleFunctionSet; + + // Pseudo probe decoder + MCPseudoProbeDecoder ProbeDecoder; + + // Function name to probe frame map for top-level outlined functions. + StringMap TopLevelProbeFrameMap; + + bool UsePseudoProbes = false; + + bool UseFSDiscriminator = false; + + // Whether we need to symbolize all instructions to get function context size. 
+ bool TrackFuncContextSize = false; + + // Indicate if the base loading address is parsed from the mmap event or uses + // the preferred address + bool IsLoadedByMMap = false; + // Used to avoid redundant warning. + bool MissingMMapWarned = false; + + void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O); + + template + void setPreferredTextSegmentAddresses(const ELFFile &Obj, StringRef FileName); + + void checkPseudoProbe(const ELFObjectFileBase *Obj); + + void decodePseudoProbe(const ELFObjectFileBase *Obj); + + void + checkUseFSDiscriminator(const ELFObjectFileBase *Obj, + std::map &AllSymbols); + + // Set up disassembler and related components. + void setUpDisassembler(const ELFObjectFileBase *Obj); + void setupSymbolizer(); + + // Load debug info of subprograms from DWARF section. + void loadSymbolsFromDWARF(ObjectFile &Obj); + + // Load debug info from DWARF unit. + void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit); + + // A function may be split into multiple non-continuous address ranges. We use + // this to set whether start offset of a function is the real entry of the + // function and also set false to the non-function label. + void setIsFuncEntry(uint64_t Offset, StringRef RangeSymName); + + // Warn if no entry range exists in the function. + void warnNoFuncEntry(); + + /// Disassemble the text section and build various address maps. + void disassemble(const ELFObjectFileBase *O); + + /// Helper function to disassemble the symbol and extract info for unwinding + bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, const SectionRef &Section); + /// Symbolize a given instruction pointer and return a full call context. + SampleContextFrameVector symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false, + bool UseProbeDiscriminator = false); + /// Decode the interesting parts of the binary and build internal data + /// structures. On high level, the parts of interest are: + /// 1.
Text sections, including the main code section and the PLT + /// entries that will be used to handle cross-module call transitions. + /// 2. The .debug_line section, used by Dwarf-based profile generation. + /// 3. Pseudo probe related sections, used by probe-based profile + /// generation. + void load(); + +public: + ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), ProEpilogTracker(this), + TrackFuncContextSize(EnableCSPreInliner && + UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? ExeBinPath : DebugBinPath; + setupSymbolizer(); + load(); + } + + void decodePseudoProbe(); + + uint64_t virtualAddrToOffset(uint64_t VirtualAddress) const { + return VirtualAddress - BaseAddress; + } + uint64_t offsetToVirtualAddr(uint64_t Offset) const { + return Offset + BaseAddress; + } + StringRef getPath() const { return Path; } + StringRef getName() const { return llvm::sys::path::filename(Path); } + uint64_t getBaseAddress() const { return BaseAddress; } + void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + + // Return the preferred load address for the first executable segment. + uint64_t getPreferredBaseAddress() const { return PreferredTextSegmentAddresses[0]; } + // Return the preferred load address for the first loadable segment. + uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; } + // Return the file offset for the first executable segment. 
+ uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; } + const std::vector &getPreferredTextSegmentAddresses() const { + return PreferredTextSegmentAddresses; + } + const std::vector &getTextSegmentOffsets() const { + return TextSegmentOffsets; + } + + uint64_t getInstSize(uint64_t Offset) const { + auto I = Offset2InstSizeMap.find(Offset); + if (I == Offset2InstSizeMap.end()) + return 0; + return I->second; + } + + bool offsetIsCode(uint64_t Offset) const { + return Offset2InstSizeMap.find(Offset) != Offset2InstSizeMap.end(); + } + bool addressIsCode(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return offsetIsCode(Offset); + } + bool addressIsCall(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return CallOffsets.count(Offset); + } + bool addressIsReturn(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return RetOffsets.count(Offset); + } + bool addressInPrologEpilog(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return ProEpilogTracker.PrologEpilogSet.count(Offset); + } + + bool offsetIsTransfer(uint64_t Offset) { + return BranchOffsets.count(Offset) || RetOffsets.count(Offset) || + CallOffsets.count(Offset); + } + + bool rangeCrossUncondBranch(uint64_t Start, uint64_t End) { + if (Start >= End) + return false; + auto R = UncondBranchOffsets.lower_bound(Start); + return R != UncondBranchOffsets.end() && *R < End; + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return offsetToVirtualAddr(CodeAddrOffsets[Index]); + } + + size_t getCodeOffsetsSize() const { return CodeAddrOffsets.size(); } + + bool usePseudoProbes() const { return UsePseudoProbes; } + bool useFSDiscriminator() const { return UseFSDiscriminator; } + // Get the index in CodeAddrOffsets for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + 
uint32_t getIndexForOffset(uint64_t Offset) const { + auto Low = llvm::lower_bound(CodeAddrOffsets, Offset); + return Low - CodeAddrOffsets.begin(); + } + uint32_t getIndexForAddr(uint64_t Address) const { + uint64_t Offset = virtualAddrToOffset(Address); + return getIndexForOffset(Offset); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return ExternalAddr; + auto I = getIndexForAddr(FrameAddr); + FrameAddr = I ? getAddressforIndex(I - 1) : 0; + if (FrameAddr && addressIsCall(FrameAddr)) + return FrameAddr; + return 0; + } + + FuncRange *findFuncRangeForStartOffset(uint64_t Offset) { + auto I = StartOffset2FuncRangeMap.find(Offset); + if (I == StartOffset2FuncRangeMap.end()) + return nullptr; + return &I->second; + } + + // Binary search the function range which includes the input offset. + FuncRange *findFuncRangeForOffset(uint64_t Offset) { + auto I = StartOffset2FuncRangeMap.upper_bound(Offset); + if (I == StartOffset2FuncRangeMap.begin()) + return nullptr; + I--; + + if (Offset >= I->second.EndOffset) + return nullptr; + + return &I->second; + } + + // Get all ranges of one function. + RangesTy getRangesForOffset(uint64_t Offset) { + auto *FRange = findFuncRangeForOffset(Offset); + // Ignore the range which falls into plt section or system lib. 
+ if (!FRange) + return RangesTy(); + + return FRange->Func->Ranges; + } + + const std::unordered_map & + getAllBinaryFunctions() { + return BinaryFunctions; + } + + std::unordered_set &getProfiledFunctions() { + return ProfiledFunctions; + } + + void setProfiledFunctions(std::unordered_set &Funcs) { + ProfiledFunctions = Funcs; + } + + BinaryFunction *getBinaryFunction(StringRef FName) { + auto I = BinaryFunctions.find(FName.str()); + if (I == BinaryFunctions.end()) + return nullptr; + return &I->second; + } + + uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) { + return FuncSizeTracker.getFuncSizeForContext(ContextNode); + } + + // Load the symbols from debug table and populate into symbol list. + void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList); + + const SampleContextFrameVector & + getFrameLocationStack(uint64_t Offset, bool UseProbeDiscriminator = false) { + auto I = Offset2LocStackMap.emplace(Offset, SampleContextFrameVector()); + if (I.second) { + InstructionPointer IP(this, Offset); + I.first->second = symbolize(IP, true, UseProbeDiscriminator); + } + return I.first->second; + } + + Optional getInlineLeafFrameLoc(uint64_t Offset) { + const auto &Stack = getFrameLocationStack(Offset); + if (Stack.empty()) + return {}; + return Stack.back(); + } + + void flushSymbolizer() { Symbolizer.reset(); } + + // Compare two addresses' inline context + bool inlineContextEqual(uint64_t Add1, uint64_t Add2); + + // Get the full context of the current stack with inline context filled in. + // It will search the disassembling info stored in Offset2LocStackMap. This is + // used as the key of function sample map + SampleContextFrameVector + getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined); + // Go through instructions among the given range and record its size for the + // inline context. 
+  void computeInlinedContextSizeForRange(uint64_t StartOffset,
+                                         uint64_t EndOffset);
+
+  void computeInlinedContextSizeForFunc(const BinaryFunction *Func);
+
+  const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const {
+    return ProbeDecoder.getCallProbeForAddr(Address);
+  }
+
+  void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe,
+                                SampleContextFrameVector &InlineContextStack,
+                                bool IncludeLeaf = false) const {
+    SmallVector ProbeInlineContext;
+    ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext,
+                                          IncludeLeaf);
+    for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) {
+      auto &Callsite = ProbeInlineContext[I];
+      // Clear the current context for an unknown probe.
+      if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) {
+        InlineContextStack.clear();
+        continue;
+      }
+      InlineContextStack.emplace_back(Callsite.first,
+                                      LineLocation(Callsite.second, 0));
+    }
+  }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+
+  const MCPseudoProbeFuncDesc *
+  getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
+
+  bool getTrackFuncContextSize() { return TrackFuncContextSize; }
+
+  bool getIsLoadedByMMap() { return IsLoadedByMMap; }
+
+  void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; }
+
+  bool getMissingMMapWarned() { return MissingMMapWarned; }
+
+  void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; }
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-15.0/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-15.0/llvm-profgen.cpp
new file mode 100644
index 00000000000..8b12c2fe46c
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-15.0/llvm-profgen.cpp
@@ -0,0 +1,203 @@
+//===- llvm-profgen.cpp - LLVM SPGO profile generation tool -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profgen generates SPGO profiles from perf script output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "ProfiledBinary.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+
+static cl::OptionCategory ProfGenCategory("ProfGen Options");
+
+static cl::opt PerfScriptFilename(
+    "perfscript", cl::value_desc("perfscript"),
+    llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of perf-script trace created by Linux perf tool with "
+             "`script` command(the raw perf.data should be profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PSA("ps", cl::desc("Alias for --perfscript"),
+                     cl::aliasopt(PerfScriptFilename));
+
+static cl::opt PerfDataFilename(
+    "perfdata", cl::value_desc("perfdata"), llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of raw perf data created by Linux perf tool (it should be "
+             "profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PDA("pd", cl::desc("Alias for --perfdata"),
+                     cl::aliasopt(PerfDataFilename));
+
+static cl::opt UnsymbolizedProfFilename(
+    "unsymbolized-profile", cl::value_desc("unsymbolized profile"),
+    llvm::cl::MiscFlags::CommaSeparated,
+    cl::desc("Path of the unsymbolized profile created by "
+             "`llvm-profgen` with `--skip-symbolization`"),
+    cl::cat(ProfGenCategory));
+static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"),
+                     cl::aliasopt(UnsymbolizedProfFilename));
+
+static cl::opt SampleProfFilename(
+    "llvm-sample-profile", cl::value_desc("llvm sample profile"),
+    cl::desc("Path of the LLVM sample profile"), cl::cat(ProfGenCategory));
+
+static cl::opt
+    BinaryPath("binary", cl::value_desc("binary"), cl::Required,
+               cl::desc("Path of profiled executable binary."),
+               cl::cat(ProfGenCategory));
+
+static cl::opt
+    ProcessId("pid", cl::value_desc("process Id"), cl::init(0),
+              cl::desc("Process Id for the profiled executable binary."),
+              cl::cat(ProfGenCategory));
+
+static cl::opt DebugBinPath(
+    "debug-binary", cl::value_desc("debug-binary"),
+    cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info "
+             "from it instead of the executable binary."),
+    cl::cat(ProfGenCategory));
+
+extern cl::opt ShowDisassemblyOnly;
+extern cl::opt ShowSourceLocations;
+extern cl::opt SkipSymbolization;
+
+using namespace llvm;
+using namespace sampleprof;
+
+// Validate the command line input; every violation exits via exitWithError.
+static void validateCommandLine() {
+  // Allow the missing perfscript if we only use to show binary disassembly.
+  if (!ShowDisassemblyOnly) {
+    // Validate input profile is provided only once
+    uint16_t HasPerfData = PerfDataFilename.getNumOccurrences();
+    uint16_t HasPerfScript = PerfScriptFilename.getNumOccurrences();
+    uint16_t HasUnsymbolizedProfile =
+        UnsymbolizedProfFilename.getNumOccurrences();
+    uint16_t HasSampleProfile = SampleProfFilename.getNumOccurrences();
+    uint16_t S =
+        HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile;
+    if (S != 1) {
+      std::string Msg =
+          S > 1
+              ? "`--perfscript`, `--perfdata`, `--unsymbolized-profile` and "
+                "`--llvm-sample-profile` cannot be used together."
+              : "Perf input file is missing, please use one of `--perfscript`, "
+                "`--perfdata`, `--unsymbolized-profile` and "
+                "`--llvm-sample-profile` for the input.";
+      exitWithError(Msg);
+    }
+
+    auto CheckFileExists = [](bool H, StringRef File) {
+      if (H && !llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf file(" + File.str() + ") doesn't exist.";
+        exitWithError(Msg);
+      }
+    };
+
+    CheckFileExists(HasPerfData, PerfDataFilename);
+    CheckFileExists(HasPerfScript, PerfScriptFilename);
+    CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename);
+    CheckFileExists(HasSampleProfile, SampleProfFilename);
+  }
+
+  if (!llvm::sys::fs::exists(BinaryPath)) {
+    std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist.";
+    exitWithError(Msg);
+  }
+
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+// Select the perf input from whichever of --perfdata, --perfscript or
+// --unsymbolized-profile was given; otherwise File stays default-constructed.
+static PerfInputFile getPerfInputFile() {
+  PerfInputFile File;
+  if (PerfDataFilename.getNumOccurrences()) {
+    File.InputFile = PerfDataFilename;
+    File.Format = PerfFormat::PerfData;
+  } else if (PerfScriptFilename.getNumOccurrences()) {
+    File.InputFile = PerfScriptFilename;
+    File.Format = PerfFormat::PerfScript;
+  } else if (UnsymbolizedProfFilename.getNumOccurrences()) {
+    File.InputFile = UnsymbolizedProfFilename;
+    File.Format = PerfFormat::UnsymbolizedProfile;
+  }
+  return File;
+}
+
+// Entry point: parse and validate the options, load the binary, then generate
+// a profile either from an existing sample profile or from perf input.
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly printers/parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllDisassemblers();
+
+  cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()});
+  cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
+  validateCommandLine();
+
+  // Load symbols and disassemble the code of a given binary.
+  std::unique_ptr Binary =
+      std::make_unique(BinaryPath, DebugBinPath);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
+
+  if (SampleProfFilename.getNumOccurrences()) {
+    LLVMContext Context;
+    auto ReaderOrErr = SampleProfileReader::create(SampleProfFilename, Context);
+    // ErrorOr::get() is only valid on success, so report a reader creation
+    // failure instead of dereferencing the error state.
+    if (std::error_code EC = ReaderOrErr.getError())
+      exitWithError(EC, SampleProfFilename);
+    std::unique_ptr Reader =
+        std::move(ReaderOrErr.get());
+    // Do not generate output from a profile that failed to load.
+    if (std::error_code EC = Reader->read())
+      exitWithError(EC, SampleProfFilename);
+    std::unique_ptr Generator =
+        ProfileGeneratorBase::create(Binary.get(), Reader->getProfiles(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  } else {
+    Optional PIDFilter;
+    if (ProcessId.getNumOccurrences())
+      PIDFilter = ProcessId;
+    PerfInputFile PerfFile = getPerfInputFile();
+    std::unique_ptr Reader =
+        PerfReaderBase::create(Binary.get(), PerfFile, PIDFilter);
+    // Parse perf events and samples
+    Reader->parsePerfTraces();
+
+    if (SkipSymbolization)
+      return EXIT_SUCCESS;
+
+    std::unique_ptr Generator =
+        ProfileGeneratorBase::create(Binary.get(), &Reader->getSampleCounters(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/tools/ldc-profgen/ldc-profgen-16.0/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-16.0/CMakeLists.txt
new file mode 100644
index 00000000000..354c63f409f
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-16.0/CMakeLists.txt
@@ -0,0 +1,25 @@
+
+set(LLVM_LINK_COMPONENTS
+  AllTargetsDescs
+  AllTargetsDisassemblers
+  AllTargetsInfos
+  DebugInfoDWARF
+  Core
+  MC
+  IPO
+  MCDisassembler
+  Object
+  ProfileData
+  Support
+  Symbolize
+  TargetParser
+  )
+
+add_llvm_tool(llvm-profgen
+  llvm-profgen.cpp
+  PerfReader.cpp
+  CSPreInliner.cpp
+  ProfiledBinary.cpp
+  ProfileGenerator.cpp
+  MissingFrameInferrer.cpp
+  )
diff --git a/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.cpp b/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.cpp
new file mode 100644
index 00000000000..dbc5bc7327d
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.cpp
@@ -0,0 +1,303 @@
+//===-- CSPreInliner.cpp
- Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include +#include + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. 
+extern cl::opt SampleHotCallSiteThreshold; +extern cl::opt SampleColdCallSiteThreshold; +extern cl::opt ProfileInlineGrowthLimit; +extern cl::opt ProfileInlineLimitMin; +extern cl::opt ProfileInlineLimitMax; +extern cl::opt SortProfiledSCC; + +cl::opt EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); + +static cl::opt SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, + ProfiledBinary &Binary, ProfileSummary *Summary) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Tracker), Binary(Binary), Summary(Summary) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. + if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; + if (!ProfileInlineLimitMax.getNumOccurrences()) + ProfileInlineLimitMax = 3000; +} + +std::vector CSPreInliner::buildTopDownOrder() { + std::vector Order; + ProfiledCallGraph ProfiledCG(ContextTracker); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. 
+ scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextNodeForProfile(CallerSamples); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. + uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. 
+ HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(CalleeNode); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); + + return ContextNode->getFunctionSamples()->getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return Candidate.CalleeSamples->getContext().hasAttribute( + ContextWasInlined); + + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + + if (Candidate.CallsiteCount <= ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + else { + // Linearly adjust threshold based on normalized hotness, i.e, a value in + // [0,1]. Use 10% cutoff instead of the max count as the normalization + // upperbound for stability. + double NormalizationUpperBound = + ProfileSummaryBuilder::getEntryForPercentile( + Summary->getDetailedSummary(), 100000 /* 10% */) + .MinCount; + double NormalizationLowerBound = ColdCountThreshold; + double NormalizedHotness = + (Candidate.CallsiteCount - NormalizationLowerBound) / + (NormalizationUpperBound - NormalizationLowerBound); + if (NormalizedHotness > 1.0) + NormalizedHotness = 1.0; + // Add 1 to to ensure hot callsites get a non-zero threshold, which could + // happen when SampleColdCallSiteThreshold is 0. This is when we do not + // want any inlining for cold callsites. 
+ SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + + SampleColdCallSiteThreshold + 1; + } + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const StringRef Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) { + // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG( + dbgs() << (ShouldInline ? 
" Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? "inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } + } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; + }; +#endif + + LLVM_DEBUG(printProfileNames(ContextTracker, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. 
This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. + // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (StringRef FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. + for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); + } + } + FunctionSamples::ProfileIsPreInlined = true; + + LLVM_DEBUG(printProfileNames(ContextTracker, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.h new file mode 100644 index 00000000000..09dd2dec114 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/CSPreInliner.h @@ -0,0 +1,90 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. + uint64_t CallsiteCount; + // Size proxy for function under particular call context. 
+ uint64_t SizeCost; +}; + +// Inline candidate comparer using call site weight +struct ProfiledCandidateComparer { + bool operator()(const ProfiledInlineCandidate &LHS, + const ProfiledInlineCandidate &RHS) { + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + if (LHS.SizeCost != RHS.SizeCost) + return LHS.SizeCost > RHS.SizeCost; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using ProfiledCandidateQueue = + PriorityQueue, + ProfiledCandidateComparer>; + +// Pre-compilation inliner based on context-sensitive profile. +// The PreInliner estimates inline decision using hotness from profile +// and cost estimation from machine code size. It helps merges context +// profile globally and achieves better post-inine profile quality, which +// otherwise won't be possible for ThinLTO. It also reduce context profile +// size by only keep context that is estimated to be inlined. 
+class CSPreInliner {
+public:
+  CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary,
+               ProfileSummary *Summary);
+  void run();
+
+private:
+  bool getInlineCandidates(ProfiledCandidateQueue &CQueue,
+                           const FunctionSamples *FCallerContextSamples);
+  std::vector buildTopDownOrder();
+  void processFunction(StringRef Name);
+  bool shouldInline(ProfiledInlineCandidate &Candidate);
+  uint32_t getFuncSize(const ContextTrieNode *ContextNode);
+  bool UseContextCost;
+  SampleContextTracker &ContextTracker;
+  ProfiledBinary &Binary;
+  ProfileSummary *Summary;
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-16.0/CallContext.h b/tools/ldc-profgen/ldc-profgen-16.0/CallContext.h
new file mode 100644
index 00000000000..5e552130d03
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-16.0/CallContext.h
@@ -0,0 +1,62 @@
+//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H
+
+#include "llvm/ProfileData/SampleProf.h"
+#include
+#include
+#include
+
+namespace llvm {
+namespace sampleprof {
+
+// Format one frame as "FuncName:LineOffset", followed by ".Discriminator" when
+// the discriminator is greater than zero.
+inline std::string getCallSite(const SampleContextFrame &Callsite) {
+  std::string CallsiteStr = Callsite.FuncName.str();
+  CallsiteStr += ":";
+  CallsiteStr += Twine(Callsite.Location.LineOffset).str();
+  if (Callsite.Location.Discriminator > 0) {
+    CallsiteStr += ".";
+    CallsiteStr += Twine(Callsite.Location.Discriminator).str();
+  }
+  return CallsiteStr;
+}
+
+// Render the frames of Context in stored order, joined by " @ ".
+// TODO: This operation is expensive. If it ever gets called multiple times we
+// may think of making a class wrapper with internal states for it.
+inline std::string getLocWithContext(const SampleContextFrameVector &Context) {
+  std::ostringstream OContextStr;
+  // Empty before the first frame; avoids copying the buffer to test for it.
+  const char *Separator = "";
+  for (const auto &Callsite : Context) {
+    OContextStr << Separator << getCallSite(Callsite);
+    Separator = " @ ";
+  }
+  return OContextStr.str();
+}
+
+// Reverse call context, i.e., in the order of callee frames to caller frames,
+// is useful during instruction printing or pseudo probe printing.
+inline std::string
+getReversedLocWithContext(const SampleContextFrameVector &Context) {
+  std::ostringstream OContextStr;
+  const char *Separator = "";
+  for (const auto &Callsite : reverse(Context)) {
+    OContextStr << Separator << getCallSite(Callsite);
+    Separator = " @ ";
+  }
+  return OContextStr.str();
+}
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-16.0/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-16.0/ErrorHandling.h
new file mode 100644
index 00000000000..b797add8a89
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-16.0/ErrorHandling.h
@@ -0,0 +1,56 @@
+//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.cpp b/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.cpp new file mode 100644 index 00000000000..4127fdc54bd --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.cpp @@ -0,0 +1,318 @@ +//===-- MissingFrameInferrer.cpp - Missing frame inferrer --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MissingFrameInferrer.h"
+#include "PerfReader.h"
+#include "ProfiledBinary.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include
+#include
+#include
+#include
+#include
+
+#define DEBUG_TYPE "missing-frame-inferrer"
+
+using namespace llvm;
+using namespace sampleprof;
+
+STATISTIC(TailCallUniReachable,
+          "Number of frame pairs reachable via a unique tail call path");
+STATISTIC(TailCallMultiReachable,
+          "Number of frame pairs reachable via a multiple tail call paths");
+STATISTIC(TailCallUnreachable,
+          "Number of frame pairs unreachable via any tail call path");
+STATISTIC(TailCallFuncSingleTailCalls,
+          "Number of functions with single tail call site");
+STATISTIC(TailCallFuncMultipleTailCalls,
+          "Number of functions with multiple tail call sites");
+STATISTIC(TailCallMaxTailCallPath, "Length of the longest tail call path");
+
+static cl::opt
+    MaximumSearchDepth("max-search-depth", cl::init(UINT32_MAX - 1),
+                       cl::desc("The maximum levels the DFS-based missing "
+                                "frame search should go with"));
+
+// Set up the call and tail call edge maps, refined by SampleCounters if given.
+void MissingFrameInferrer::initialize(
+    const ContextSampleCounterMap *SampleCounters) {
+  // Refine call edges based on LBR samples.
+  if (SampleCounters) {
+    std::unordered_map> SampledCalls;
+    std::unordered_map> SampledTailCalls;
+
+    // Populate SampledCalls based on static call sites. Similarly to
+    // SampledTailCalls.
+    for (const auto &CI : *SampleCounters) {
+      for (const auto &Item : CI.second.BranchCounter) {
+        auto From = Item.first.first;
+        auto To = Item.first.second;
+        if (CallEdges.count(From)) {
+          assert(CallEdges[From].size() == 1 &&
+                 "A callsite should only appear once with either a known or a "
+                 "zero (unknown) target value at this point");
+          SampledCalls[From].insert(To);
+        }
+        if (TailCallEdges.count(From)) {
+          assert(TailCallEdges[From].size() == 1 &&
+                 "A callsite should only appear once with either a known or a "
+                 "zero (unknown) target value at this point");
+          FuncRange *FromFRange = Binary->findFuncRange(From);
+          FuncRange *ToFRange = Binary->findFuncRange(To);
+          if (FromFRange != ToFRange)
+            SampledTailCalls[From].insert(To);
+        }
+      }
+    }
+
+    // Replace static edges with dynamic edges.
+    CallEdges = SampledCalls;
+    TailCallEdges = SampledTailCalls;
+  }
+
+  // Populate function-based edges. This is to speed up address to function
+  // translation.
+  for (const auto &Call : CallEdges)
+    for (auto Target : Call.second)
+      if (FuncRange *ToFRange = Binary->findFuncRange(Target))
+        CallEdgesF[Call.first].insert(ToFRange->Func);
+
+  for (const auto &Call : TailCallEdges) {
+    for (auto Target : Call.second) {
+      if (FuncRange *ToFRange = Binary->findFuncRange(Target)) {
+        TailCallEdgesF[Call.first].insert(ToFRange->Func);
+        TailCallTargetFuncs.insert(ToFRange->Func);
+      }
+    }
+    if (FuncRange *FromFRange = Binary->findFuncRange(Call.first))
+      FuncToTailCallMap[FromFRange->Func].push_back(Call.first);
+  }
+
+#if LLVM_ENABLE_STATS
+  for (const auto &F : FuncToTailCallMap) {
+    assert(F.second.size() > 0 && "");
+    if (F.second.size() > 1)
+      TailCallFuncMultipleTailCalls++;
+    else
+      TailCallFuncSingleTailCalls++;
+  }
+#endif
+
+#ifndef NDEBUG
+  auto PrintCallTargets =
+      [&](const std::unordered_map>
+              &CallTargets,
+          bool IsTailCall) {
+        for (const auto &Targets : CallTargets) {
+          for (const auto &Target : Targets.second) {
+            dbgs() << (IsTailCall ? "TailCall" : "Call");
+            dbgs() << " From " << format("%8" PRIx64, Targets.first) << " to "
+                   << format("%8" PRIx64, Target) << "\n";
+          }
+        }
+      };
+
+  LLVM_DEBUG(dbgs() << "============================\n ";
+             dbgs() << "Call targets:\n";
+             PrintCallTargets(CallEdges, false);
+             dbgs() << "\nTail call targets:\n";
+             PrintCallTargets(TailCallEdges, true);
+             dbgs() << "============================\n";);
+#endif
+}
+
+uint64_t MissingFrameInferrer::computeUniqueTailCallPath(
+    BinaryFunction *From, BinaryFunction *To, SmallVectorImpl &Path) {
+  // Search for a unique path comprised of only tail call edges for a given
+  // source and target frame address on a tail call graph that consists of
+  // only tail call edges. Note that only a unique path counts. Multiple paths
+  // are treated unreachable.
+  if (From == To)
+    return 1;
+
+  // Ignore cyclic paths. Since we are doing a recursive DFS walk, if the source
+  // frame being visited is already in the stack, it means we are seeing a
+  // cycle. This is done before querying the cached result because the cached
+  // result may be computed based on the same path. Consider the following case:
+  //     A -> B, B -> A, A -> D
+  // When computing unique reachability from A to D, the cached result for (B,D)
+  // should not be counted since the unique path B->A->D is basically the same
+  // path as A->D. Counting that will invalidate the uniqueness from A to D.
+  if (Visiting.contains(From))
+    return 0;
+
+  // If already computed, return the cached result.
+  auto I = UniquePaths.find({From, To});
+  if (I != UniquePaths.end()) {
+    Path.append(I->second.begin(), I->second.end());
+    return 1;
+  }
+
+  auto J = NonUniquePaths.find({From, To});
+  if (J != NonUniquePaths.end()) {
+    return J->second;
+  }
+
+  uint64_t Pos = Path.size();
+
+  // DFS walk each outgoing tail call edges.
+  // Bail out if we are already at the maximum searching depth.
+  if (CurSearchingDepth == MaximumSearchDepth)
+    return 0;
+
+  if (!FuncToTailCallMap.count(From))
+    return 0;
+
+  CurSearchingDepth++;
+  Visiting.insert(From);
+  uint64_t NumPaths = 0;
+  for (auto TailCall : FuncToTailCallMap[From]) {
+    NumPaths += computeUniqueTailCallPath(TailCall, To, Path);
+    // Stop analyzing the remaining if we are already seeing more than one
+    // reachable paths.
+    if (NumPaths > 1)
+      break;
+  }
+  CurSearchingDepth--;
+  Visiting.erase(From);
+
+  // Undo already-computed path if it is not unique.
+  if (NumPaths != 1) {
+    Path.pop_back_n(Path.size() - Pos);
+  }
+
+  // Cache the result.
+  if (NumPaths == 1) {
+    UniquePaths[{From, To}].assign(Path.begin() + Pos, Path.end());
+#if LLVM_ENABLE_STATS
+    auto &LocalPath = UniquePaths[{From, To}];
+    assert((LocalPath.size() <= MaximumSearchDepth + 1) &&
+           "Path should not be longer than the maximum searching depth");
+    TailCallMaxTailCallPath = std::max(uint64_t(LocalPath.size()),
+                                       TailCallMaxTailCallPath.getValue());
+#endif
+  } else {
+    NonUniquePaths[{From, To}] = NumPaths;
+  }
+
+  return NumPaths;
+}
+
+uint64_t MissingFrameInferrer::computeUniqueTailCallPath(
+    uint64_t From, BinaryFunction *To, SmallVectorImpl &Path) {
+  if (!TailCallEdgesF.count(From))
+    return 0;
+  Path.push_back(From);
+  uint64_t NumPaths = 0;
+  for (auto Target : TailCallEdgesF[From]) {
+    NumPaths += computeUniqueTailCallPath(Target, To, Path);
+    // Stop analyzing the remaining if we are already seeing more than one
+    // reachable paths.
+    if (NumPaths > 1)
+      break;
+  }
+
+  // Undo already-computed path if it is not unique.
+  if (NumPaths != 1)
+    Path.pop_back();
+  return NumPaths;
+}
+
+bool MissingFrameInferrer::inferMissingFrames(
+    uint64_t From, uint64_t To, SmallVectorImpl &UniquePath) {
+  assert(!TailCallEdgesF.count(From) &&
+         "transition between From and To cannot be via a tailcall otherwise "
+         "they would not show up at the same time");
+  UniquePath.push_back(From);
+  uint64_t Pos = UniquePath.size();
+
+  FuncRange *ToFRange = Binary->findFuncRange(To);
+  if (!ToFRange)
+    return false;
+
+  // Bail out if caller has no known outgoing call edges.
+  if (!CallEdgesF.count(From))
+    return false;
+
+  // Done with the inference if the calle is reachable via a single callsite.
+  // This may not be accurate but it improves the search throughput.
+  for (auto Target : CallEdgesF[From]) {
+    if (Target == ToFRange->Func)
+      return true;
+  }
+
+  // Bail out if callee is not tailcall reachable at all.
+  if (!TailCallTargetFuncs.contains(ToFRange->Func))
+    return false;
+
+  Visiting.clear();
+  CurSearchingDepth = 0;
+  uint64_t NumPaths = 0;
+  for (auto Target : CallEdgesF[From]) {
+    NumPaths +=
+        computeUniqueTailCallPath(Target, ToFRange->Func, UniquePath);
+    // Stop analyzing the remaining if we are already seeing more than one
+    // reachable paths.
+    if (NumPaths > 1)
+      break;
+  }
+
+  // Undo already-computed path if it is not unique.
+ if (NumPaths != 1) { + UniquePath.pop_back_n(UniquePath.size() - Pos); + assert(UniquePath.back() == From && "broken path"); + } + +#if LLVM_ENABLE_STATS + if (NumPaths == 1) { + if (ReachableViaUniquePaths.insert({From, ToFRange->StartAddress}).second) + TailCallUniReachable++; + } else if (NumPaths == 0) { + if (Unreachables.insert({From, ToFRange->StartAddress}).second) { + TailCallUnreachable++; + LLVM_DEBUG(dbgs() << "No path found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } else if (NumPaths > 1) { + if (ReachableViaMultiPaths.insert({From, ToFRange->StartAddress}) + .second) { + TailCallMultiReachable++; + LLVM_DEBUG(dbgs() << "Multiple paths found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } +#endif + + return NumPaths == 1; +} + +void MissingFrameInferrer::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + if (Context.size() == 1) { + NewContext = Context; + return; + } + + NewContext.clear(); + for (uint64_t I = 1; I < Context.size(); I++) { + inferMissingFrames(Context[I - 1], Context[I], NewContext); + } + NewContext.push_back(Context.back()); + + assert((NewContext.size() >= Context.size()) && + "Inferred context should include all frames in the original context"); + assert((NewContext.size() > Context.size() || NewContext == Context) && + "Inferred context should be exactly the same " + "with the original context"); +} diff --git a/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.h b/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.h new file mode 100644 index 00000000000..4680a9a979f --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/MissingFrameInferrer.h @@ -0,0 +1,116 @@ +//===-- MissingFrameInferrer.h - Missing frame inferrer ---------- C++/-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H +#define LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H + +#include "PerfReader.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include +#include + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +struct BinaryFunction; + +class MissingFrameInferrer { +public: + MissingFrameInferrer(ProfiledBinary *Binary) : Binary(Binary) {} + + // Defines a frame transition from a caller function to the callee function. + using CallerCalleePair = std::pair; + + void initialize(const ContextSampleCounterMap *SampleCounters); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames. + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + +private: + friend class ProfiledBinary; + + // Compute a unique tail call path for a pair of source frame address and + // target frame address. Append the unique path prefix (not including `To`) to + // `UniquePath` if it exists. Return whether this is a unique tail call + // path. The source/dest frame will typically be a pair of adjacent frame + // entries of call stack samples. + bool inferMissingFrames(uint64_t From, uint64_t To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source frame address to the target + // function. Output the unique path prefix (not including `To`) in + // `UniquePath` if exists. Return the number of possibly available tail call + // paths. + uint64_t computeUniqueTailCallPath(uint64_t From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source function to the target + // function.
Output the unique path prefix (not including `To`) in + // `UniquePath` if exists. Return the number of possibly available tail call + // paths. + uint64_t computeUniqueTailCallPath(BinaryFunction *From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + ProfiledBinary *Binary; + + // A map of call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> CallEdges; + + // A map of tail call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> TailCallEdges; + + // Dynamic call targets in terms of BinaryFunction for any calls. + std::unordered_map> CallEdgesF; + + // Dynamic call targets in terms of BinaryFunction for tail calls. + std::unordered_map> + TailCallEdgesF; + + // Addresses of the dynamic tail call instructions in each caller function. + std::unordered_map> FuncToTailCallMap; + + // Functions that are reachable via tail calls. + DenseSet TailCallTargetFuncs; + + struct PairHash { + std::size_t operator()( + const std::pair &Pair) const { + return std::hash()(Pair.first) ^ + std::hash()(Pair.second); + } + }; + + // Cached results from a CallerCalleePair to a unique call path between them. + std::unordered_map, PairHash> + UniquePaths; + // Cached results from CallerCalleePair to the number of available call paths.
+ std::unordered_map NonUniquePaths; + + DenseSet Visiting; + + uint32_t CurSearchingDepth = 0; + +#if LLVM_ENABLE_STATS + DenseSet> ReachableViaUniquePaths; + DenseSet> Unreachables; + DenseSet> ReachableViaMultiPaths; +#endif +}; +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.cpp new file mode 100644 index 00000000000..86c0131e101 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.cpp @@ -0,0 +1,1208 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt SkipSymbolization("skip-symbolization", + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt ShowMmapEvents("show-mmap-events", + cl::desc("Print binary load events.")); + +static cl::opt + UseOffset("use-offset", cl::init(true), + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. 
By default " + "first executable segment address is used")); + +static cl::opt + IgnoreStackSamples("ignore-stack-samples", + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt ShowDetailedWarning("show-detailed-warning", + cl::desc("Show detailed warning message.")); + +extern cl::opt PerfTraceFilename; +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. + if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (Source == ExternalAddr) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + + if (End == ExternalAddr && Target == ExternalAddr) { + // Filter out the case when leaf external frame matches the external LBR + // target, this is a valid state, it happens that the code run into external + // address then return back. The call frame under the external frame + // remains valid and can be unwound later, just skip recording this range. 
+ NumPairedExtAddr++; + return; + } + + if (End == ExternalAddr || Target == ExternalAddr) { + // Range is invalid if only one point is external address. This means LBR + // traces contains a standalone external address failing to pair another + // one, likely due to interrupt jmp or broken perf script. Set the + // state to invalid. + NumUnpairedExtAddr++; + State.setInvalid(); + return; + } + + if (!isValidFallThroughRange(Target, End, Binary)) { + // Skip unwinding the rest of LBR trace when a bogus range is seen. + State.setInvalid(); + return; + } + + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... 
main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + return KeyStr; +} + +std::shared_ptr AddressStack::getContextKey() { + std::shared_ptr KeyStr = std::make_shared(); + KeyStr->Context = Stack; + CSProfileGenerator::compressRecursionContext(KeyStr->Context); + CSProfileGenerator::trimContext(KeyStr->Context); + return KeyStr; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &I : Cur->RangeSamples) + SCounter.recordRangeCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); + + for (auto &I : Cur->BranchSamples) + SCounter.recordBranchCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. 
+ if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + AddressStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.Target == ExternalAddr) + return; + + // Record external-to-internal pattern on the trie root, it later can be + // used for generating head samples. + if (Branch.Source == ExternalAddr) { + State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + return; + } + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + NumTotalBranches += State.LBRStack.size(); + // Now process the LBR samples in parrallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries. 
+ while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return + // instruction + unwindReturn(State); + } else if (isValidState(State)) { + // Unwind branches + unwindBranch(State); + } else { + // Skip unwinding the rest of LBR trace. Reset the stack and update the + // state so that the rest of the trace can still be processed as if they + // do not have stack samples. + State.clearCallStack(); + State.InstPtr.update(State.getCurrentLBRSource()); + State.pushFrame(State.InstPtr.Address); + } + + State.advanceLBR(); + // Record `branch` with calling context after unwinding. + recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + std::optional PIDFilter) { + std::unique_ptr PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. 
+ if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = + PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput, PIDFilter); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset( + new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile +PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File, + std::optional PIDFilter) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. + auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + std::string PerfTraceFile = PerfData.str() + ".script.tmp"; + std::string ErrorFile = PerfData.str() + ".script.err.tmp"; + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + std::optional Redirects[] = {std::nullopt, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(ErrorFile)}; // Stderr + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, std::nullopt, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::string PIDs; + std::unordered_set PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMap2Event(TraceIt.getCurrentLine()) && + extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second && (!PIDFilter || MMap.PID == *PIDFilter)) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + 
TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + + // Run perf script again to retrieve events for PIDs collected above + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "ip,brstack", "--pid", + PIDs, "-i", PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects); + + return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent}; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath); + if (Binary->getName() != BinaryName) + return; + + // Drop the event if process does not match pid filter + if (PIDFilter && Event.PID != *PIDFilter) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. + // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = llvm::lower_bound(Offsets, Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. 
+ auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. + --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast(K)) { + std::ostringstream OContextStr; + for (uint32_t I = 0; I < CtxKey->Context.size(); I++) { + if (OContextStr.str().size()) + OContextStr << " @ "; + uint64_t Address = CtxKey->Context[I]; + if (UseOffset) { + if (UseLoadableSegmentAsBase) + Address -= Binary->getFirstLoadableAddress(); + else + Address -= Binary->getPreferredBaseAddress(); + } + OContextStr << "0x" + << utohexstr(Address, + /*LowerCase=*/true); + } + return OContextStr.str(); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. 
+ if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary(Unwinder.NumPairedExtAddr * 2, Unwinder.NumTotalBranches, + "of branches containing paired external address."); + + emitWarningSummary(Unwinder.NumUnpairedExtAddr, Unwinder.NumTotalBranches, + "of branches containing external address but doesn't have " + "another external address to pair, likely due to " + "interrupt jmp or broken perf script."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and seperated by whitespace. 
+ SmallVector Records; + TraceIt.getCurrentLine().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + uint64_t LeadingAddr; + if (!Records.empty() && !Records[0].contains('/')) { + if (Records[0].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index = 1; + } + + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + // Canonicalize to use preferred load address as base address. + Src = Binary->canonicalizeVirtualAddress(Src); + Dst = Binary->canonicalizeVirtualAddress(Dst); + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + if (!SrcIsInternal) + Src = ExternalAddr; + if (!DstIsInternal) + Dst = ExternalAddr; + // Filter external-to-external case to reduce LBR trace size. + if (!SrcIsInternal && !DstIsInternal) + continue; + + LBRStack.emplace_back(LBREntry(Src, Dst)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. 
+ + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + + FrameAddr = Binary->canonicalizeVirtualAddress(FrameAddr); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. 
+ if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. + Binary->setMissingMMapWarned(true); + } +} + +void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + // The raw hybird sample started with call stack in FILO order and followed + // intermediately by LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + std::shared_ptr Sample = std::make_shared(); +#ifndef NDEBUG + Sample->Linenum = TraceIt.getLineNumber(); +#endif + // Parsing call stack and populate into PerfSample.CallStack + if (!extractCallstack(TraceIt, Sample->CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + + warnIfMissingMMap(); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + if (IgnoreStackSamples) { + Sample->CallStack.clear(); + } else { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample->CallStack.front() = Sample->LBRStack[0].Target; + } + // Record samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + +void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) { + std::error_code EC; + raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF); + if (EC) + exitWithError(EC, Filename); + writeUnsymbolizedProfile(OS); +} + +// Use ordered map to make the output deterministic +using OrderedCounterForPrint = std::map; + +void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) { + OrderedCounterForPrint OrderedCounters; + for (auto &CI : SampleCounters) { + OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second; + } + + auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator, + uint32_t Indent) { + OS.indent(Indent); + OS << Counter.size() << "\n"; + for (auto &I : Counter) { + uint64_t Start = I.first.first; + uint64_t End = I.first.second; + + if (UseOffset) { + if (UseLoadableSegmentAsBase) { + Start -= 
Binary->getFirstLoadableAddress(); + End -= Binary->getFirstLoadableAddress(); + } else { + Start -= Binary->getPreferredBaseAddress(); + End -= Binary->getPreferredBaseAddress(); + } + } + + OS.indent(Indent); + OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":" + << I.second << "\n"; + } + }; + + for (auto &CI : OrderedCounters) { + uint32_t Indent = 0; + if (ProfileIsCS) { + // Context string key + OS << "[" << CI.first << "]\n"; + Indent = 2; + } + + SampleCounter &Counter = *CI.second; + SCounterPrinter(Counter.RangeCounter, "-", Indent); + SCounterPrinter(Counter.BranchCounter, "->", Indent); + } +} + +// Format of input: +// number of entries in RangeCounter +// from_1-to_1:count_1 +// from_2-to_2:count_2 +// ...... +// from_n-to_n:count_n +// number of entries in BranchCounter +// src_1->dst_1:count_1 +// src_2->dst_2:count_2 +// ...... +// src_n->dst_n:count_n +void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt, + SampleCounter &SCounters) { + auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) { + std::string Msg = TraceIt.isAtEoF() + ? "Invalid raw profile!" 
+ : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (UseOffset) { + if (UseLoadableSegmentAsBase) { + Source += Binary->getFirstLoadableAddress(); + Target += Binary->getFirstLoadableAddress(); + } else { + Source += Binary->getPreferredBaseAddress(); + Target += Binary->getPreferredBaseAddress(); + } + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr Key = + std::make_shared(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. 
+ if (Line.startswith("[")) { + ProfileIsCS = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndAddress = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceAddress = LBR.Source; + uint64_t TargetAddress = LBR.Target; + + // Record the branch if its SourceAddress is external. It can be the case an + // external source call an internal function, later this branch will be used + // to generate the function's head sample. + if (Binary->addressIsCode(TargetAddress)) { + Counter.recordBranchCount(SourceAddress, TargetAddress, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. + uint64_t StartAddress = TargetAddress; + if (Binary->addressIsCode(StartAddress) && + Binary->addressIsCode(EndAddress) && + isValidFallThroughRange(StartAddress, EndAddress, Binary)) + Counter.recordRangeCount(StartAddress, EndAddress, Repeat); + EndAddress = SourceAddress; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. 
+ assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const Pattern = + "PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + Regex RegMmap2(Pattern); + SmallVector Fields; + bool R = RegMmap2.match(Line, &Fields); + if (!R) { + std::string WarningMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + WithColor::warning() << WarningMsg; + } + Fields[PID].getAsInteger(10, MMap.PID); + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); + 
Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath); + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMap2Event(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMap2Event(TraceIt.getCurrentLine())) + parseMMap2Event(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].startswith("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMap2Event(StringRef Line) { + // Short cut to avoid string find is possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 does not appear at the beginning of the line + // for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP2"); +} + +// The raw hybird sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ... 
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +// Determine the perfscript contains hybrid samples(call stack + LBRs) by +// checking whether there is a non-empty call stack immediately followed by +// a LBR sample +PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) { + TraceStream TraceIt(FileName); + uint64_t FrameAddr = 0; + while (!TraceIt.isAtEoF()) { + // Skip the aggregated count + if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr)) + TraceIt.advance(); + + // Detect sample with call stack + int32_t Count = 0; + while (!TraceIt.isAtEoF() && + !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) { + Count++; + TraceIt.advance(); + } + if (!TraceIt.isAtEoF()) { + if (isLBRSample(TraceIt.getCurrentLine())) { + if (Count > 0) + return PerfContent::LBRStack; + else + return PerfContent::LBR; + } + TraceIt.advance(); + } + } + + exitWithError("Invalid perf script input!"); + return PerfContent::UnknownContent; +} + +void HybridPerfReader::generateUnsymbolizedProfile() { + ProfileIsCS = !IgnoreStackSamples; + if (ProfileIsCS) + unwindSamples(); + else + PerfScriptReader::generateUnsymbolizedProfile(); +} + +void PerfScriptReader::warnTruncatedStack() { + if (ShowDetailedWarning) { + for (auto Address : InvalidReturnAddresses) { + WithColor::warning() + << "Truncated stack sample due to invalid return address at " + << format("0x%" PRIx64, Address) + << ", likely caused by frame pointer omission\n"; + } + } + emitWarningSummary( + InvalidReturnAddresses.size(), AggregatedSamples.size(), + "of truncated stack samples due to invalid return address, " + "likely caused by frame pointer omission."); +} + +void PerfScriptReader::warnInvalidRange() { + std::unordered_map, uint64_t, + pair_hash> + Ranges; + + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + uint64_t Count = Item.second; + uint64_t EndAddress = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceAddress = 
LBR.Source; + uint64_t StartAddress = LBR.Target; + if (EndAddress != 0) + Ranges[{StartAddress, EndAddress}] += Count; + EndAddress = SourceAddress; + } + } + + if (Ranges.empty()) { + WithColor::warning() << "No samples in perf script!\n"; + return; + } + + auto WarnInvalidRange = [&](uint64_t StartAddress, uint64_t EndAddress, + StringRef Msg) { + if (!ShowDetailedWarning) + return; + WithColor::warning() << "[" << format("%8" PRIx64, StartAddress) << "," + << format("%8" PRIx64, EndAddress) << "]: " << Msg + << "\n"; + }; + + const char *EndNotBoundaryMsg = "Range is not on instruction boundary, " + "likely due to profile and binary mismatch."; + const char *DanglingRangeMsg = "Range does not belong to any functions, " + "likely from PLT, .init or .fini section."; + const char *RangeCrossFuncMsg = + "Fall through range should not cross function boundaries, likely due to " + "profile and binary mismatch."; + const char *BogusRangeMsg = "Range start is after or too far from range end."; + + uint64_t TotalRangeNum = 0; + uint64_t InstNotBoundary = 0; + uint64_t UnmatchedRange = 0; + uint64_t RangeCrossFunc = 0; + uint64_t BogusRange = 0; + + for (auto &I : Ranges) { + uint64_t StartAddress = I.first.first; + uint64_t EndAddress = I.first.second; + TotalRangeNum += I.second; + + if (!Binary->addressIsCode(StartAddress) && + !Binary->addressIsCode(EndAddress)) + continue; + + if (!Binary->addressIsCode(StartAddress) || + !Binary->addressIsTransfer(EndAddress)) { + InstNotBoundary += I.second; + WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg); + } + + auto *FRange = Binary->findFuncRange(StartAddress); + if (!FRange) { + UnmatchedRange += I.second; + WarnInvalidRange(StartAddress, EndAddress, DanglingRangeMsg); + continue; + } + + if (EndAddress >= FRange->EndAddress) { + RangeCrossFunc += I.second; + WarnInvalidRange(StartAddress, EndAddress, RangeCrossFuncMsg); + } + + if (Binary->addressIsCode(StartAddress) && + Binary->addressIsCode(EndAddress) && + 
!isValidFallThroughRange(StartAddress, EndAddress, Binary)) { + BogusRange += I.second; + WarnInvalidRange(StartAddress, EndAddress, BogusRangeMsg); + } + } + + emitWarningSummary( + InstNotBoundary, TotalRangeNum, + "of samples are from ranges that are not on instruction boundary."); + emitWarningSummary( + UnmatchedRange, TotalRangeNum, + "of samples are from ranges that do not belong to any functions."); + emitWarningSummary( + RangeCrossFunc, TotalRangeNum, + "of samples are from ranges that do cross function boundaries."); + emitWarningSummary( + BogusRange, TotalRangeNum, + "of samples are from ranges that have range start after or too far from " + "range end acrossing the unconditinal jmp."); +} + +void PerfScriptReader::parsePerfTraces() { + // Parse perf traces and do aggregation. + parseAndAggregateTrace(); + + emitWarningSummary(NumLeafExternalFrame, NumTotalSample, + "of samples have leaf external frame in call stack."); + emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample, + "of samples have leading external LBR."); + + // Generate unsymbolized profile. + warnTruncatedStack(); + warnInvalidRange(); + generateUnsymbolizedProfile(); + AggregatedSamples.clear(); + + if (SkipSymbolization) + writeUnsymbolizedProfile(OutputFilename); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.h b/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.h new file mode 100644 index 00000000000..14137e82572 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/PerfReader.h @@ -0,0 +1,742 @@ +//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H +#include "ErrorHandling.h" +#include "ProfiledBinary.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" +#include +#include +#include +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Stream based trace line iterator +class TraceStream { + std::string CurrentLine; + std::ifstream Fin; + bool IsAtEoF = false; + uint64_t LineNumber = 0; + +public: + TraceStream(StringRef Filename) : Fin(Filename.str()) { + if (!Fin.good()) + exitWithError("Error read input perf script file", Filename); + advance(); + } + + StringRef getCurrentLine() { + assert(!IsAtEoF && "Line iterator reaches the End-of-File!"); + return CurrentLine; + } + + uint64_t getLineNumber() { return LineNumber; } + + bool isAtEoF() { return IsAtEoF; } + + // Read the next line + void advance() { + if (!std::getline(Fin, CurrentLine)) { + IsAtEoF = true; + return; + } + LineNumber++; + } +}; + +// The type of input format. +enum PerfFormat { + UnknownFormat = 0, + PerfData = 1, // Raw linux perf.data. + PerfScript = 2, // Perf script create by `perf script` command. + UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. + +}; + +// The type of perfscript content. +enum PerfContent { + UnknownContent = 0, + LBR = 1, // Only LBR sample. + LBRStack = 2, // Hybrid sample including call stack and LBR stack. +}; + +struct PerfInputFile { + std::string InputFile; + PerfFormat Format = PerfFormat::UnknownFormat; + PerfContent Content = PerfContent::UnknownContent; +}; + +// The parsed LBR sample entry. 
+struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. In the future, if we redesign a better hash +// function, then we can just skip this or switch to non-virtual function(like +// just ignore comparison if hash conflicts probabilities is low) +template class Hashable { +public: + std::shared_ptr Data; + Hashable(const std::shared_ptr &D) : Data(D) {} + + // Hash code generation + struct Hash { + uint64_t operator()(const Hashable &Key) const { + // Don't make it virtual for getHashCode + uint64_t Hash = Key.Data->getHashCode(); + assert(Hash && "Should generate HashCode for it!"); + return Hash; + } + }; + + // Hash equal + struct Equal { + bool operator()(const Hashable &LHS, const Hashable &RHS) const { + // Precisely compare the data, vtable will have overhead. 
+ return LHS.Data->isEqual(RHS.Data.get()); + } + }; + + T *getPtr() const { return Data.get(); } +}; + +struct PerfSample { + // LBR stack recorded in FIFO order. + SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } + +#ifndef NDEBUG + uint64_t Linenum = 0; + + void print() const { + dbgs() << "Line " << Linenum << "\n"; + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +inline bool isValidFallThroughRange(uint64_t Start, uint64_t End, + ProfiledBinary *Binary) { + // Start bigger than End is considered invalid. 
+ // LBR ranges cross the unconditional jmp are also assumed invalid. + // It's found that perf data may contain duplicate LBR entries that could form + // a range that does not reflect real execution flow on some Intel targets, + // e.g. Skylake. Such ranges are ususally very long. Exclude them since there + // cannot be a linear execution range that spans over unconditional jmp. + return Start <= End && !Binary->rangeCrossUncondBranch(Start, End); +} + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the 
address range + InstructionPointer InstPtr; + // Indicate whether unwinding is currently in a bad state which requires to + // skip all subsequent unwinding. + bool Invalid = false; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + void setInvalid() { Invalid = true; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame 
= CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_AddrBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Address-based context id +struct AddrBasedCtxKey : public ContextKey { + SmallVector Context; + + bool WasLeafInlined; + AddrBasedCtxKey() : ContextKey(CK_AddrBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_AddrBased; + } + + bool isEqual(const ContextKey *K) const override { + const AddrBasedCtxKey *Other = dyn_cast(K); + return Context == 
Other->Context; + } + + void genHashCode() override { + HashCode = hash_combine_range(Context.begin(), Context.end()); + } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. +using RangeSample = std::map, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map, SampleCounter, + Hashable::Hash, Hashable::Equal>; + +struct FrameStack { + SmallVector Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct AddressStack { + SmallVector Stack; + ProfiledBinary *Binary; + AddressStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +/* +As in hybrid sample we have a 
group of LBRs and the most recent sampling call +stack, we can walk through those LBRs to infer more call stacks which would be +used as context for profile. VirtualUnwinder is the class to do the call stack +unwinding based on LBR state. Two types of unwinding are processd here: +1) LBR unwinding and 2) linear range unwinding. +Specifically, for each LBR entry(can be classified into call, return, regular +branch), LBR unwinding will replay the operation by pushing, popping or +switching leaf frame towards the call stack and since the initial call stack +is most recently sampled, the replay should be in anti-execution order, i.e. for +the regular case, pop the call stack when LBR is call, push frame on call stack +when LBR is return. After each LBR processed, it also needs to align with the +next LBR by going through instructions from previous LBR's target to current +LBR's source, which is the linear unwinding. As instruction from linear range +can come from different function by inlining, linear unwinding will do the range +splitting and record counters by the range with same inline context. Over those +unwinding process we will record each call stack as context id and LBR/linear +range as sample counter for further CS profile generation. 
+*/ +class VirtualUnwinder { +public: + VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B) + : CtxCounterMap(Counter), Binary(B) {} + bool unwind(const PerfSample *Sample, uint64_t Repeat); + std::set &getUntrackedCallsites() { return UntrackedCallsites; } + + uint64_t NumTotalBranches = 0; + uint64_t NumExtCallBranch = 0; + uint64_t NumMissingExternalFrame = 0; + uint64_t NumMismatchedProEpiBranch = 0; + uint64_t NumMismatchedExtCallBranch = 0; + uint64_t NumUnpairedExtAddr = 0; + uint64_t NumPairedExtAddr = 0; + +private: + bool isSourceExternal(UnwindState &State) const { + return State.getCurrentLBRSource() == ExternalAddr; + } + + bool isTargetExternal(UnwindState &State) const { + return State.getCurrentLBRTarget() == ExternalAddr; + } + + // Determine whether the return source is from external code by checking if + // the target's the next inst is a call inst. + bool isReturnFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + (Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) != 0); + } + + // If the source is external address but it's not the `return` case, treat it + // as a call from external. + bool isCallFromExternal(UnwindState &State) const { + return isSourceExternal(State) && + Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) == 0; + } + + bool isCallState(UnwindState &State) const { + // The tail call frame is always missing here in stack sample, we will + // use a specific tail call tracker to infer it. + if (!isValidState(State)) + return false; + + if (Binary->addressIsCall(State.getCurrentLBRSource())) + return true; + + return isCallFromExternal(State); + } + + bool isReturnState(UnwindState &State) const { + if (!isValidState(State)) + return false; + + // Simply check addressIsReturn, as ret is always reliable, both for + // regular call and tail call. 
+ if (Binary->addressIsReturn(State.getCurrentLBRSource())) + return true; + + return isReturnFromExternal(State); + } + + bool isValidState(UnwindState &State) const { return !State.Invalid; } + + void unwindCall(UnwindState &State); + void unwindLinear(UnwindState &State, uint64_t Repeat); + void unwindReturn(UnwindState &State); + void unwindBranch(UnwindState &State); + + template + void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack); + // Collect each samples on trie node by DFS traversal + template + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack); + void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur); + + void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State, + uint64_t Repeat); + void recordBranchCount(const LBREntry &Branch, UnwindState &State, + uint64_t Repeat); + + ContextSampleCounterMap *CtxCounterMap; + // Profiled binary that current frame address belongs to + ProfiledBinary *Binary; + // Keep track of all untracked callsites + std::set UntrackedCallsites; +}; + +// Read perf trace to parse the events and samples. +class PerfReaderBase { +public: + PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace) + : Binary(B), PerfTraceFile(PerfTrace) { + // Initialize the base address to preferred address. 
Binary->setBaseAddress(Binary->getPreferredBaseAddress()); + }; + virtual ~PerfReaderBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + std::optional PIDFilter); + + // Entry of the reader to parse multiple perf traces + virtual void parsePerfTraces() = 0; + const ContextSampleCounterMap &getSampleCounters() const { + return SampleCounters; + } + bool profileIsCS() { return ProfileIsCS; } + +protected: + ProfiledBinary *Binary = nullptr; + StringRef PerfTraceFile; + + ContextSampleCounterMap SampleCounters; + bool ProfileIsCS = false; + + uint64_t NumTotalSample = 0; + uint64_t NumLeafExternalFrame = 0; + uint64_t NumLeadingOutgoingLBR = 0; +}; + +// Read perf script to parse the events and samples. +class PerfScriptReader : public PerfReaderBase { +public: + PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace, + std::optional PID) + : PerfReaderBase(B, PerfTrace), PIDFilter(PID){}; + + // Entry of the reader to parse multiple perf traces + void parsePerfTraces() override; + // Generate perf script from perf data + static PerfInputFile + convertPerfDataToTrace(ProfiledBinary *Binary, PerfInputFile &File, + std::optional PIDFilter); + // Extract perf script type by peeking at the input + static PerfContent checkPerfScriptType(StringRef FileName); + +protected: + // The parsed MMap event + struct MMapEvent { + uint64_t PID = 0; + uint64_t Address = 0; + uint64_t Size = 0; + uint64_t Offset = 0; + StringRef BinaryPath; + }; + + // Check whether a given line is LBR sample + static bool isLBRSample(StringRef Line); + // Check whether a given line is MMAP event + static bool isMMap2Event(StringRef Line); + // Parse a single line of a PERF_RECORD_MMAP2 event looking for a + // mapping between the binary name and its memory layout. 
+ static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line, + MMapEvent &MMap); + // Update base address based on mmap events + void updateBinaryAddress(const MMapEvent &Event); + // Parse mmap event and update binary address + void parseMMap2Event(TraceStream &TraceIt); + // Parse perf events/samples and do aggregation + void parseAndAggregateTrace(); + // Parse either an MMAP event or a perf sample + void parseEventOrSample(TraceStream &TraceIt); + // Warn if the relevant mmap event is missing. + void warnIfMissingMMap(); + // Emit accumulate warnings. + void warnTruncatedStack(); + // Warn if range is invalid. + void warnInvalidRange(); + // Extract call stack from the perf trace lines + bool extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack); + // Extract LBR stack from one perf trace line + bool extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack); + uint64_t parseAggregatedCount(TraceStream &TraceIt); + // Parse one sample from multiple perf lines, override this for different + // sample type + void parseSample(TraceStream &TraceIt); + // An aggregated count is given to indicate how many times the sample is + // repeated. + virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; + void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + // Post process the profile after trace aggregation, we will do simple range + // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). + virtual void generateUnsymbolizedProfile(); + void writeUnsymbolizedProfile(StringRef Filename); + void writeUnsymbolizedProfile(raw_fd_ostream &OS); + + // Samples with the repeating time generated by the perf reader + AggregatedCounter AggregatedSamples; + // Keep track of all invalid return addresses + std::set InvalidReturnAddresses; + // PID for the process of interest + std::optional PIDFilter; +}; + +/* + The reader of LBR only perf script. 
A typical LBR sample is like: + 40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 +*/ +class LBRPerfReader : public PerfScriptReader { +public: + LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + std::optional PID) + : PerfScriptReader(Binary, PerfTrace, PID){}; + // Parse the LBR only sample. + void parseSample(TraceStream &TraceIt, uint64_t Count) override; +}; + +/* + Hybrid perf script includes a group of hybrid samples(LBRs + call stack), + which is used to generate CS profile. An example of hybrid sample: + 4005dc # call stack leaf + 400634 + 400684 # call stack root + 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries +*/ +class HybridPerfReader : public PerfScriptReader { +public: + HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + std::optional PID) + : PerfScriptReader(Binary, PerfTrace, PID){}; + // Parse the hybrid sample including the call and LBR line + void parseSample(TraceStream &TraceIt, uint64_t Count) override; + void generateUnsymbolizedProfile() override; + +private: + // Unwind the hybrid samples after aggregation + void unwindSamples(); +}; + +/* + Format of unsymbolized profile: + + [frame1 @ frame2 @ ...] # If it's a CS profile + number of entries in RangeCounter + from_1-to_1:count_1 + from_2-to_2:count_2 + ...... + from_n-to_n:count_n + number of entries in BranchCounter + src_1->dst_1:count_1 + src_2->dst_2:count_2 + ...... + src_n->dst_n:count_n + [frame1 @ frame2 @ ...] # Next context + ...... + +Note that non-CS profile doesn't have the empty `[]` context. 
+*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.cpp new file mode 100644 index 00000000000..dfc42a5f4e0 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.cpp @@ -0,0 +1,1270 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include +#include +#include +#include + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), + clEnumValN(SPF_Compact_Binary, "compbinary", "Compact binary encoding"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text 
encoding"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for -sample)"))); + +static cl::opt UseMD5( + "use-md5", cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. -1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt + TrimColdProfile("trim-cold-profile", + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), + cl::desc("Keep the last K contexts while merging cold profile. 1 means the " + "context-less base profile")); + +static cl::opt CSProfMaxContextDepth( + "csprof-max-context-depth", + cl::desc("Keep the last K contexts while merging profile. 
-1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +static cl::opt UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +static cl::opt GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(true), + cl::desc("Generate nested function profiles for CSSPGO")); + +cl::opt InferMissingFrames( + "infer-missing-frames", llvm::cl::init(true), + llvm::cl::desc( + "Infer missing call frames due to compiler tail call elimination."), + llvm::cl::Optional); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +extern cl::opt ProfileSummaryCutoffHot; +extern cl::opt UseContextLessSummary; + +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + +int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap *SampleCounters, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + 
return Generator; +} + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + if (Binary->useFSDiscriminator()) + exitWithError("FS discriminator is not supported in CS profile."); + Generator.reset(new CSProfileGenerator(Binary, Profiles)); + } else { + Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "AutoFDO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. 
Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + + if (ShowDensity) + outs() << "Minimum profile density for hot functions with top " + << format("%.2f", + static_cast(ProfileSummaryCutoffHot.getValue()) / + 10000) + << "% total samples: " << format("%.1f", Density) << "\n"; +} + +double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold) { + double Density = DBL_MAX; + std::vector HotFuncs; + for (auto &I : Profiles) { + auto &FuncSamples = I.second; + if (FuncSamples.getTotalSamples() < HotCntThreshold) + continue; + HotFuncs.emplace_back(&FuncSamples); + } + + for (auto *FuncSamples : HotFuncs) { + auto *Func = Binary->getBinaryFunction(FuncSamples->getName()); + if (!Func) + continue; + uint64_t FuncSize = Func->getFuncSize(); + if (FuncSize == 0) + continue; + Density = + std::min(Density, static_cast(FuncSamples->getTotalSamples()) / + FuncSize); + } + + return Density == DBL_MAX ? 0.0 : Density; +} + +void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges) { + + /* + Regions may overlap with each other. Using the boundary info, find all + disjoint ranges and their sample count. BoundaryPoint contains the count + multiple samples begin/end at this points. + + |<--100-->| Sample1 + |<------200------>| Sample2 + A B C + + In the example above, + Sample1 begins at A, ends at B, its value is 100. + Sample2 beings at A, ends at C, its value is 200. + For A, BeginCount is the sum of sample begins at A, which is 300 and no + samples ends at A, so EndCount is 0. + Then boundary points A, B, and C with begin/end counts are: + A: (300, 0) + B: (0, 100) + C: (0, 200) + */ + struct BoundaryPoint { + // Sum of sample counts beginning at this point + uint64_t BeginCount = UINT64_MAX; + // Sum of sample counts ending at this point + uint64_t EndCount = UINT64_MAX; + // Is the begin point of a zero range. 
bool IsZeroRangeBegin = false; + // Is the end point of a zero range. + bool IsZeroRangeEnd = false; + + void addBeginCount(uint64_t Count) { + if (BeginCount == UINT64_MAX) + BeginCount = 0; + BeginCount += Count; + } + + void addEndCount(uint64_t Count) { + if (EndCount == UINT64_MAX) + EndCount = 0; + EndCount += Count; + } + }; + + /* + For the above example. With boundary points, following logic finds two + disjoint regions of + + [A,B]: 300 + [B+1,C]: 200 + + If there is a boundary point that both begin and end, the point itself + becomes a separate disjoint region. For example, if we have original + ranges of + + |<--- 100 --->| + |<--- 200 --->| + A B C + + there are three boundary points with their begin/end counts of + + A: (100, 0) + B: (200, 100) + C: (0, 200) + + the disjoint ranges would be + + [A, B-1]: 100 + [B, B]: 300 + [B+1, C]: 200. + + Example for zero value range: + + |<--- 100 --->| + |<--- 200 --->| + |<--------------- 0 ----------------->| + A B C D E F + + [A, B-1] : 0 + [B, C] : 100 + [C+1, D-1]: 0 + [D, E] : 200 + [E+1, F] : 0 + */ + std::map Boundaries; + + for (const auto &Item : Ranges) { + assert(Item.first.first <= Item.first.second && + "Invalid instruction range"); + auto &BeginPoint = Boundaries[Item.first.first]; + auto &EndPoint = Boundaries[Item.first.second]; + uint64_t Count = Item.second; + + BeginPoint.addBeginCount(Count); + EndPoint.addEndCount(Count); + if (Count == 0) { + BeginPoint.IsZeroRangeBegin = true; + EndPoint.IsZeroRangeEnd = true; + } + } + + // Use UINT64_MAX to indicate there is no existing range between BeginAddress + // and the next valid address + uint64_t BeginAddress = UINT64_MAX; + int ZeroRangeDepth = 0; + uint64_t Count = 0; + for (const auto &Item : Boundaries) { + uint64_t Address = Item.first; + const BoundaryPoint &Point = Item.second; + if (Point.BeginCount != UINT64_MAX) { + if (BeginAddress != UINT64_MAX) + DisjointRanges[{BeginAddress, Address - 1}] = Count; + Count += Point.BeginCount; + 
BeginAddress = Address; + ZeroRangeDepth += Point.IsZeroRangeBegin; + } + if (Point.EndCount != UINT64_MAX) { + assert((BeginAddress != UINT64_MAX) && + "First boundary point cannot be 'end' point"); + DisjointRanges[{BeginAddress, Address}] = Count; + assert(Count >= Point.EndCount && "Mismatched live ranges"); + Count -= Point.EndCount; + BeginAddress = Address + 1; + ZeroRangeDepth -= Point.IsZeroRangeEnd; + // If the remaining count is zero and it's no longer in a zero range, this + // means we consume all the ranges before, thus mark BeginAddress as + // UINT64_MAX. e.g. supposing we have two non-overlapping ranges: + // [<---- 10 ---->] + // [<---- 20 ---->] + // A B C D + // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't + // have the [B+1, C-1] zero range. + if (Count == 0 && ZeroRangeDepth == 0) + BeginAddress = UINT64_MAX; + } + } +} + +void ProfileGeneratorBase::updateBodySamplesforFunctionProfile( + FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc, + uint64_t Count) { + // Use the maximum count of samples with same line location + uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator); + + // Use duplication factor to compensate for loop unroll/vectorization. + // Note that this is only needed when we're taking MAX of the counts at + // the location instead of SUM. + Count *= getDuplicationFactor(LeafLoc.Location.Discriminator); + + ErrorOr R = + FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator); + + uint64_t PreviousCount = R ? 
R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +void ProfileGeneratorBase::updateCallsiteSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateCallsiteSamples(); + } +} + +void ProfileGeneratorBase::updateFunctionSamples() { + updateCallsiteSamples(); + + if (UpdateTotalSamples) + updateTotalSamples(); +} + +void ProfileGeneratorBase::collectProfiledFunctions() { + std::unordered_set ProfiledFunctions; + if (collectFunctionsFromRawProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else + llvm_unreachable("Unsupported input profile"); +} + +bool ProfileGeneratorBase::collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions) { + if (!SampleCounters) + return false; + // Go through all the stacks, ranges and branches in sample counters, use + // the start of the range to look up the function it belongs and record the + // function. 
+ for (const auto &CI : *SampleCounters) { + if (const auto *CtxKey = dyn_cast(CI.first.getPtr())) { + for (auto StackAddr : CtxKey->Context) { + if (FuncRange *FRange = Binary->findFuncRange(StackAddr)) + ProfiledFunctions.insert(FRange->Func); + } + } + + for (auto Item : CI.second.RangeCounter) { + uint64_t StartAddress = Item.first.first; + if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) + ProfiledFunctions.insert(FRange->Func); + } + + for (auto Item : CI.second.BranchCounter) { + uint64_t SourceAddress = Item.first.first; + uint64_t TargetAddress = Item.first.second; + if (FuncRange *FRange = Binary->findFuncRange(SourceAddress)) + ProfiledFunctions.insert(FRange->Func); + if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) + ProfiledFunctions.insert(FRange->Func); + } + } + return true; +} + +bool ProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (const auto &FS : ProfileMap) { + if (auto *Func = Binary->getBinaryFunction(FS.first.getName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +bool CSProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (auto *Node : ContextTracker) { + if (!Node->getFuncName().empty()) + if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { + SampleContext Context(FuncName); + auto Ret = ProfileMap.emplace(Context, FunctionSamples()); + if (Ret.second) { + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(Context); + } + return Ret.first->second; +} + +void ProfileGenerator::generateProfile() { + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) + Binary->decodePseudoProbe(); + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + 
} + + postProcessProfiles(); +} + +void ProfileGenerator::postProcessProfiles() { + computeSummaryAndThreshold(ProfileMap); + trimColdProfiles(ProfileMap, ColdCountThreshold); + calculateAndShowDensity(ProfileMap); +} + +void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold) { + if (!TrimColdProfile) + return; + + // Move cold profiles into a tmp container. + std::vector ColdProfiles; + for (const auto &I : ProfileMap) { + if (I.second.getTotalSamples() < ColdCntThreshold) + ColdProfiles.emplace_back(I.first); + } + + // Remove the cold profile from ProfileMap. + for (const auto &I : ColdProfiles) + ProfileMap.erase(I); +} + +void ProfileGenerator::generateLineNumBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::generateProbeBasedProfile() { + assert(SampleCounters->size() == 1 && + "Must have one entry for profile generation."); + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + const SampleCounter &SC = SampleCounters->begin()->second; + // Fill in function body samples + populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter); + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter); + + updateFunctionSamples(); +} + +void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions( + const RangeSample &RangeCounter) { + ProbeCounterMap ProbeCounter; + // preprocessRangeCounter returns disjoint ranges, so no longer to redo it + // inside extractProbesFromRange. 
+ extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, + false); + + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(Probe, FrameVec, true); + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, Count); + FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); + if (Probe->isEntry()) + FunctionProfile.addHeadSamples(Count); + } +} + +void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + + // Record called target sample and its count. 
+ SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(CallProbe, FrameVec, true); + + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, 0, CalleeName, Count); + } + } +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].FuncName); + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].FuncName); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, we search for all ranges of the function 
it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartAddress = I.first.first; + for (const auto &Range : Binary->getRanges(StartAddress)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const SampleContextFrameVector FrameVec = + Binary->getFrameLocationStack(IP.Address); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. + FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, Count * Binary->getInstSize(IP.Address)); + updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), + Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +StringRef +ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) { + // Get the function range by branch target if it's a call branch. 
+ auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress); + + // We won't accumulate sample count for a range whose start is not the real + // function entry such as outlined function or inner labels. + if (!FRange || !FRange->IsFuncEntry) + return StringRef(); + + return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); +} + +void ProfileGenerator::populateBoundarySamplesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + // Record called target sample and its count. + const SampleContextFrameVector &FrameVec = + Binary->getCachedFrameLocationStack(SourceAddress); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + CalleeName, Count); + } + // Add head samples for callee. 
+ FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); + CalleeProfile.addHeadSamples(Count); + } +} + +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + +FunctionSamples * +CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined) { + FunctionSamples *FProfile = ContextNode->getFunctionSamples(); + if (!FProfile) { + FSamplesList.emplace_back(); + FProfile = &FSamplesList.back(); + FProfile->setName(ContextNode->getFuncName()); + ContextNode->setFunctionSamples(FProfile); + } + // Update ContextWasInlined attribute for existing contexts. + // The current function can be called in two ways: + // - when processing a probe of the current frame + // - when processing the entry probe of an inlinee's frame, which + // is then used to update the callsite count of the current frame. + // The two can happen in any order, hence here we are making sure + // `ContextWasInlined` is always set as expected. + // TODO: Note that the former does not always happen if no probes of the + // current frame has samples, and if the latter happens, we could lose the + // attribute. This should be fixed. 
+ if (WasLeafInlined) + FProfile->getContext().setAttribute(ContextWasInlined); + return FProfile; +} + +ContextTrieNode * +CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined) { + ContextTrieNode *ContextNode = + ContextTracker.getOrCreateContextPath(Context, true); + getOrCreateFunctionSamples(ContextNode, WasLeafInlined); + return ContextNode; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + Binary->decodePseudoProbe(); + if (InferMissingFrames) + initializeMissingFrameInferrer(); + } + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + postProcessProfiles(); +} + +void CSProfileGenerator::initializeMissingFrameInferrer() { + Binary->getMissingContextInferrer()->initialize(SampleCounters); +} + +void CSProfileGenerator::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + Binary->inferMissingFrames(Context, NewContext); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + for (auto *Func : Binary->getProfiledFunctions()) + Binary->computeInlinedContextSizeForFunc(Func); + + // Flush the symbolizer to save memory. 
+ Binary->flushSymbolizer(); +} + +void CSProfileGenerator::updateFunctionSamples() { + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + if (UpdateTotalSamples) + FSamples->updateTotalSamples(); + FSamples->updateCallsiteSamples(); + } + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : *SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + ContextTrieNode *ContextNode = &getRootContext(); + // Sample context will be empty if the jump is an external-to-internal call + // pattern, the head samples should be added for the internal function. + if (!CtxKey->Context.empty()) { + // Get or create function profile for the range + ContextNode = + getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); + // Fill in function body samples + populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), + CI.second.RangeCounter); + } + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(getRootContext()); + + updateFunctionSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. 
+ RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); + if (LeafLoc) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + ContextTrieNode *Node, const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + ContextTrieNode *CallerNode = Node; + LineLocation CalleeCallSite(0, 0); + if (CallerNode != &getRootContext()) { + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); + if (LeafLoc) { + CallerNode->getFunctionSamples()->addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, + Count); + // Record head sample for called target(callee) + CalleeCallSite = LeafLoc->Location; + } + } + + ContextTrieNode *CalleeNode = + 
CallerNode->getOrCreateChildContext(CalleeCallSite, CalleeName); + FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); + CalleeProfile->addHeadSamples(Count); + } +} + +void CSProfileGenerator::populateInferredFunctionSamples( + ContextTrieNode &Node) { + // There is no call jmp sample between the inliner and inlinee, we need to use + // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s + // sample depends on child(inlinee)'s sample, so traverse the tree in + // post-order. + for (auto &It : Node.getAllChildContext()) + populateInferredFunctionSamples(It.second); + + FunctionSamples *CalleeProfile = Node.getFunctionSamples(); + if (!CalleeProfile) + return; + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile->getHeadSamples()) + return; + ContextTrieNode *CallerNode = Node.getParentContext(); + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CallerNode == &getRootContext()) + return; + + LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); + FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); + // If we don't have samples with location, use 1 to indicate live. 
+ if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + Node.getFuncName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); +} + +void CSProfileGenerator::convertToProfileMap( + ContextTrieNode &Node, SampleContextFrameVector &Context) { + FunctionSamples *FProfile = Node.getFunctionSamples(); + if (FProfile) { + Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); + FunctionSamples &NewProfile = Ret.first->second; + NewProfile.getContext().setContext(NewContext); + Context.pop_back(); + } + + for (auto &It : Node.getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); + convertToProfileMap(ChildNode, Context); + Context.pop_back(); + } +} + +void CSProfileGenerator::convertToProfileMap() { + assert(ProfileMap.empty() && + "ProfileMap should be empty before converting from the trie"); + assert(IsProfileValidOnTrie && + "Do not convert the trie twice, it's already destroyed"); + + SampleContextFrameVector Context; + for (auto &It : getRootContext().getAllChildContext()) + convertToProfileMap(It.second, Context); + + IsProfileValidOnTrie = false; +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. 
+ if (EnableCSPreInliner) { + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + convertToProfileMap(); + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + // Merge function samples of CS profile to calculate profile density. + sampleprof::SampleProfileMap ContextLessProfiles; + for (const auto &I : ProfileMap) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + + calculateAndShowDensity(ContextLessProfiles); + if (GenCSNestedProfile) { + CSProfileConverter CSConverter(ProfileMap); + CSConverter.convertProfiles(); + FunctionSamples::ProfileIsCS = false; + } +} + +void ProfileGeneratorBase::computeSummaryAndThreshold( + SampleProfileMap &Profiles) { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + Summary = Builder.computeSummaryForProfiles(Profiles); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +void CSProfileGenerator::computeSummaryAndThreshold() { + // Always merge and use context-less profile map to compute summary. 
+ SampleProfileMap ContextLessProfiles; + ContextTracker.createContextLessProfileMap(ContextLessProfiles); + + // Set the flag below to avoid merging the profile again in + // computeSummaryAndThreshold + FunctionSamples::ProfileIsCS = false; + assert( + (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && + "Don't set --profile-summary-contextless to false for profile " + "generation"); + ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles); + // Recover the old value. + FunctionSamples::ProfileIsCS = true; +} + +void ProfileGeneratorBase::extractProbesFromRange( + const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges) { + const RangeSample *PRanges = &RangeCounter; + RangeSample Ranges; + if (FindDisjointRanges) { + findDisjointRanges(Ranges, RangeCounter); + PRanges = &Ranges; + } + + for (const auto &Range : *PRanges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const AddressProbesMap &Address2ProbesMap = + Binary->getAddress2ProbesMap(); + auto It = Address2ProbesMap.find(IP.Address); + if (It != Address2ProbesMap.end()) { + for (const auto &Probe : It->second) { + ProbeCounter[&Probe] += Count; + } + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +static void extractPrefixContextStack(SampleContextFrameVector &ContextStack, + const SmallVectorImpl &AddrVec, + ProfiledBinary *Binary) { + SmallVector Probes; + for (auto Address : reverse(AddrVec)) { + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(Address); + // These could be the cases when a probe is not found at a callsite.
Cutting + // off the context from here since the inliner will not know how to consume + // a context with unknown callsites. + // 1. for functions that are not sampled when + // --decode-probe-for-profiled-functions-only is on. + // 2. for a merged callsite. Callsite merging may cause the loss of original + // probe IDs. + // 3. for an external callsite. + if (!CallProbe) + break; + Probes.push_back(CallProbe); + } + + std::reverse(Probes.begin(), Probes.end()); + + // Extract context stack for reusing, leaf context stack will be added + // compressed while looking up function profile. + for (const auto *P : Probes) { + Binary->getInlineContextForProbe(P, ContextStack, true); + } +} + +void CSProfileGenerator::generateProbeBasedProfile() { + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + for (const auto &CI : *SampleCounters) { + const AddrBasedCtxKey *CtxKey = + dyn_cast(CI.first.getPtr()); + // Fill in function body samples from probes, also infer caller's samples + // from callee's probe + populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey); + // Fill in boundary samples for a call probe + populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey); + } +} + +void CSProfileGenerator::populateBodySamplesWithProbes( + const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) { + ProbeCounterMap ProbeCounter; + // Extract the top frame probes by looking up each address among the range in + // the Address2ProbeMap + extractProbesFromRange(RangeCounter, ProbeCounter); + std::unordered_map> + FrameSamples; + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. 
+ if (!Probe->isBlock() || Count == 0) + continue; + + ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe); + FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples(); + // Record the current frame and FunctionProfile whenever samples are + // collected for non-dangling probes. This is for reporting all of the + // zero count probes of the frame later. + FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); + FunctionProfile.addBodySamplesForProbe(Probe->getIndex(), Count); + FunctionProfile.addTotalSamples(Count); + if (Probe->isEntry()) { + FunctionProfile.addHeadSamples(Count); + // Look up for the caller's function profile + const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); + ContextTrieNode *CallerNode = ContextNode->getParentContext(); + if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { + // Since the context id will be compressed, we have to use callee's + // context id to infer caller's context id to ensure they share the + // same context prefix. + uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; + assert(CallerIndex && + "Inferred caller's location index shouldn't be zero!"); + FunctionSamples &CallerProfile = + *getOrCreateFunctionSamples(CallerNode); + CallerProfile.setFunctionHash(InlinerDesc->FuncHash); + CallerProfile.addBodySamples(CallerIndex, 0, Count); + CallerProfile.addTotalSamples(Count); + CallerProfile.addCalledTargetSamples(CallerIndex, 0, + ContextNode->getFuncName(), Count); + } + } + } + + // Assign zero count for remaining probes without sample hits to + // differentiate from probes optimized away, of which the counts are unknown + // and will be inferred by the compiler.
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (auto *Probe : I.first->getProbes()) { + FunctionProfile->addBodySamplesForProbe(Probe->getIndex(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { + for (const auto &BI : BranchCounter) { + uint64_t SourceAddress = BI.first.first; + uint64_t TargetAddress = BI.first.second; + uint64_t Count = BI.second; + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(CtxKey, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), 0, CalleeName, + Count); + } +} + +ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + + const SmallVectorImpl *PContext = &CtxKey->Context; + SmallVector NewContext; + + if (InferMissingFrames) { + SmallVector Context = CtxKey->Context; + // Append leaf frame for a complete inference. + Context.push_back(LeafProbe->getAddress()); + inferMissingFrames(Context, NewContext); + // Pop out the leaf probe that was pushed in above. 
+ NewContext.pop_back(); + PContext = &NewContext; + } + + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, *PContext, Binary); + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + ContextTrieNode *ContextNode = + getOrCreateContextNode(NewContextStack, WasLeafInlined); + ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); + return ContextNode; +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.h new file mode 100644 index 00000000000..471792ec713 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/ProfileGenerator.h @@ -0,0 +1,390 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "CSPreInliner.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfWriter.h" +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +using ProbeCounterMap = + std::unordered_map; + +// This base class for profile generation of sample-based PGO. We reuse all +// structures relating to function profiles and profile writers as seen in +// /ProfileData/SampleProf.h. +class ProfileGeneratorBase { + +public: + ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : Binary(Binary), SampleCounters(Counters){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const SampleProfileMap &&Profiles) + : Binary(Binary), ProfileMap(std::move(Profiles)){}; + + virtual ~ProfileGeneratorBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, + bool profileIsCS); + static std::unique_ptr + create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, + bool profileIsCS); + virtual void generateProfile() = 0; + void write(); + + static uint32_t + getDuplicationFactor(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 1 + : llvm::DILocation::getDuplicationFactorFromDiscriminator( + Discriminator); + } + + static uint32_t + getBaseDiscriminator(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 
Discriminator + : DILocation::getBaseDiscriminatorFromDiscriminator( + Discriminator, /* IsFSDiscriminator */ false); + } + + static bool UseFSDiscriminator; + +protected: + // Use SampleProfileWriter to serialize profile map + void write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap); + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. + */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Go through each address from range to extract the top frame probe by + // looking up in the Address2ProbeMap + void extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges = true); + + // Helper function for updating body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const SampleContextFrame &LeafLoc, + uint64_t Count); + + void updateFunctionSamples(); + + void updateTotalSamples(); + + void updateCallsiteSamples(); + + StringRef getCalleeNameForAddress(uint64_t TargetAddress); + + void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + double calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold); + + void showDensitySuggestion(double Density); + + void collectProfiledFunctions(); + + bool collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions); + + // Collect profiled 
Functions for llvm sample profile input. + virtual bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) = 0; + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. + uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + + ProfiledBinary *Binary = nullptr; + + std::unique_ptr Summary; + + // Used by SampleProfileWriter + SampleProfileMap ProfileMap; + + const ContextSampleCounterMap *SampleCounters = nullptr; +}; + +class ProfileGenerator : public ProfileGeneratorBase { + +public: + ProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) + : ProfileGeneratorBase(Binary, std::move(Profiles)){}; + void generateProfile() override; + +private: + void generateLineNumBasedProfile(); + void generateProbeBasedProfile(); + RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); + FunctionSamples &getTopLevelFunctionProfile(StringRef FuncName); + // Helper function to get the leaf frame's FunctionProfile by traversing the + // inline stack and meanwhile it adds the total samples for each frame's + // function profile. 
+ FunctionSamples & + getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, + uint64_t Count); + void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); + void + populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); + void + populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); + void populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters); + void postProcessProfiles(); + void trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold); + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; +}; + +class CSProfileGenerator : public ProfileGeneratorBase { +public: + CSProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) + : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; + void generateProfile() override; + + // Trim the context stack at a given depth. + template + static void trimContext(SmallVectorImpl &S, int Depth = MaxContextDepth) { + if (Depth < 0 || static_cast(Depth) >= S.size()) + return; + std::copy(S.begin() + S.size() - static_cast(Depth), S.end(), + S.begin()); + S.resize(Depth); + } + + // Remove adjacent repeated context sequences up to a given sequence length, + // -1 means no size limit. Note that repeated sequences are identified based + // on the exact call site, this is finer granularity than function recursion. + template + static void compressRecursionContext(SmallVectorImpl &Context, + int32_t CSize = MaxCompressionSize) { + uint32_t I = 1; + uint32_t HS = static_cast(Context.size() / 2); + uint32_t MaxDedupSize = + CSize == -1 ? 
HS : std::min(static_cast(CSize), HS); + auto BeginIter = Context.begin(); + // Use an in-place algorithm to save memory copy + // End indicates the end location of current iteration's data + uint32_t End = 0; + // Deduplicate from length 1 to the max possible size of a repeated + // sequence. + while (I <= MaxDedupSize) { + // This is a linear algorithm that deduplicates adjacent repeated + // sequences of size I. The deduplication detection runs on a sliding + // window whose size is 2*I and it keeps sliding the window to deduplicate + // the data inside. Once duplication is detected, deduplicate it by + // skipping the right half part of the window, otherwise just copy back + // the new one by appending them at the back of End pointer(for the next + // iteration). + // + // For example: + // Input: [a1, a2, b1, b2] + // (Added index to distinguish the same char, the origin is [a, a, b, + // b], the size of the dedup window is 2(I = 1) at the beginning) + // + // 1) The initial status is a dummy window[null, a1], then just copy the + // right half of the window(End = 0), then slide the window. + // Result: [a1], a2, b1, b2 (End points to the element right before ], + // after ] is the data of the previous iteration) + // + // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of + // the window i.e. the duplication happens. Only slide the window. + // Result: [a1], a2, b1, b2 + // + // 3) Next window is [a2, b1], copy the right half of the window(b1 is + // new) to the End and slide the window. + // Result: [a1, b1], b1, b2 + // + // 4) Next window is [b1, b2], same to 2), skip b2. + // Result: [a1, b1], b1, b2 + // After resize, it will be [a, b] + + // Use pointers like below to do comparison inside the window + // [a b c a b c] + // | | | | | + // LeftBoundary Left Right Left+I Right+I + // A duplication found if Left < LeftBoundary.
+ + int32_t Right = I - 1; + End = I; + int32_t LeftBoundary = 0; + while (Right + I < Context.size()) { + // To avoid scanning a part of a sequence repeatedly, it finds out + // the common suffix of two halves in the window. The common suffix will + // serve as the common prefix of next possible pair of duplicate + // sequences. The non-common part will be ignored and never scanned + // again. + + // For example. + // Input: [a, b1], c1, b2, c2 + // I = 2 + // + // 1) For the window [a, b1, c1, b2], non-common-suffix for the right + // part is 'c1', copy it and only slide the window 1 step. + // Result: [a, b1, c1], b2, c2 + // + // 2) Next window is [b1, c1, b2, c2], so duplication happens. + // Result after resize: [a, b, c] + + int32_t Left = Right; + while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { + // Find the longest suffix inside the window. When stops, Left points + // at the diverging point in the current sequence. + Left--; + } + + bool DuplicationFound = (Left < LeftBoundary); + // Don't need to recheck the data before Right + LeftBoundary = Right + 1; + if (DuplicationFound) { + // Duplication found, skip right half of the window. + Right += I; + } else { + // Copy the non-common-suffix part of the adjacent sequence. + std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, + BeginIter + End); + End += Left + I - Right; + // Only slide the window by the size of non-common-suffix + Right = Left + I; + } + } + // Don't forget the remaining part that's not scanned. + std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); + End += Context.size() - Right - 1; + I++; + Context.resize(End); + MaxDedupSize = std::min(static_cast(End / 2), MaxDedupSize); + } + } + +private: + void generateLineNumBasedProfile(); + + FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined = false); + + // Lookup or create ContextTrieNode for the context, FunctionSamples is + // created inside this function.
+ ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined = false); + + // For profiled only functions, on-demand compute their inline context + // function byte size which is used by the pre-inliner. + void computeSizeForProfiledFunctions(); + // Post processing for profiles before writing out, such as merging + // and trimming cold profiles, running preinliner on profiles. + void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + + void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, + const BranchSample &BranchCounters); + + void populateInferredFunctionSamples(ContextTrieNode &Node); + + void updateFunctionSamples(); + + void generateProbeBasedProfile(); + + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + const AddrBasedCtxKey *CtxKey); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + const AddrBasedCtxKey *CtxKey); + + ContextTrieNode * + getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + void convertToProfileMap(ContextTrieNode &Node, + SampleContextFrameVector &Context); + + void convertToProfileMap(); + + void computeSummaryAndThreshold(); + + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; + + void initializeMissingFrameInferrer(); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames.
+ void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; + + // The container for holding the FunctionSamples used by context trie. + std::list FSamplesList; + + // Underlying context table serves for sample profile writer. + std::unordered_set Contexts; + + SampleContextTracker ContextTracker; + + bool IsProfileValidOnTrie = true; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.cpp new file mode 100644 index 00000000000..00e9d502587 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.cpp @@ -0,0 +1,978 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "ProfileGenerator.h" +#include "llvm/ADT/Triple.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" +#include + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt ShowDisassemblyOnly("show-disassembly-only", + cl::desc("Print disassembled code.")); + +cl::opt ShowSourceLocations("show-source-locations", + cl::desc("Print source locations.")); + +static cl::opt + ShowCanonicalFnName("show-canonical-fname", + cl::desc("Print canonical function name.")); + +static cl::opt ShowPseudoProbe( + "show-pseudo-probe", + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt UseDwarfCorrelation( + "use-dwarf-correlation", + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::opt + DWPPath("dwp", cl::init(""), + cl::desc("Path of .dwp file. When not specified, it will be " + ".dwp in the same directory as the main binary.")); + +static cl::list DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. 
Only work with show-disassembly-only")); + +extern cl::opt ShowDetailedWarning; +extern cl::opt InferMissingFrames; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + StringRef CallerName = Callsite.FuncName; + LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + + std::optional Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. + LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { + PrevNode = CurrNode; + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); + if (CurrNode && CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. 
+ if (!Size) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size && CurrNode && !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + } + } + + assert(Size && "We should at least find one context size."); + return *Size; +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second.get(), ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, CallerName); + } + // Add 0 size to make known. 
+ SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second.get(), + ProbeContext); + } + + ProbeContext.pop_back(); +} + +ProfiledBinary::ProfiledBinary(const StringRef ExeBinPath, + const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), ProEpilogTracker(this), + TrackFuncContextSize(EnableCSPreInliner && UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? ExeBinPath : DebugBinPath; + setupSymbolizer(); + if (InferMissingFrames) + MissingContextInferrer = std::make_unique(this); + load(); +} + +ProfiledBinary::~ProfiledBinary() {} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartAddr(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. 
+ OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + auto *Obj = dyn_cast(&ExeBinary); + if (!Obj) + exitWithError("not a valid Elf image", Path); + + TheTriple = Obj->makeTriple(); + // Current only support X86 + if (!TheTriple.isX86()) + exitWithError("unsupported target", TheTriple.getTriple()); + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. + if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*cast(&ExeBinary)); + } + + DisassembleFunctionSet.insert(DisassembleFunctions.begin(), + DisassembleFunctions.end()); + + checkPseudoProbe(Obj); + + if (UsePseudoProbes) + populateElfSymbolAddressList(Obj); + + if (ShowDisassemblyOnly) + decodePseudoProbe(Obj); + + // Disassemble the text sections. + disassemble(Obj); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologAddresses(StartAddrToFuncRangeMap); + ProEpilogTracker.inferEpilogAddresses(RetAddressSet); + + warnNoFuncEntry(); + + // TODO: decode other sections. 
+} + +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) { + const SampleContextFrameVector &Context1 = + getCachedFrameLocationStack(Address1); + const SampleContextFrameVector &Context2 = + getCachedFrameLocationStack(Address2); + if (Context1.size() != Context2.size()) + return false; + if (Context1.empty()) + return false; + // The leaf frame contains the location within the leaf, so it is left + // out of the comparison as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +SampleContextFrameVector +ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined) { + SampleContextFrameVector ContextVec; + if (Stack.empty()) + return ContextVec; + // Process from frame root to leaf + for (auto Address : Stack) { + const SampleContextFrameVector &ExpandedContext = + getCachedFrameLocationStack(Address); + // An instruction without a valid debug line will be ignored by sample + // processing + if (ExpandedContext.empty()) + return SampleContextFrameVector(); + // Set WasLeafInlined to whether this address expanded to more than one + // frame; the value left after the loop is that of the last address (the leaf) + WasLeafInlined = (ExpandedContext.size() > 1); + ContextVec.append(ExpandedContext); + } + + // Replace with decoded base discriminator + for (auto &Frame : ContextVec) { + Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator( + Frame.Location.Discriminator, UseFSDiscriminator); + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + // Compress the context string except for the leaf frame + auto LeafFrame = ContextVec.back(); + LeafFrame.Location = LineLocation(0, 0); + ContextVec.pop_back(); + CSProfileGenerator::compressRecursionContext(ContextVec); + CSProfileGenerator::trimContext(ContextVec); + ContextVec.push_back(LeafFrame); + return ContextVec; +} + +template +void
ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName) { + const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName); + // FIXME: This should be the page size of the system running profiling. + // However such info isn't available at post-processing time, assuming + // 4K page now. Note that we don't use EXEC_PAGESIZE from + // because we may build the tools on non-linux. + uint32_t PageSize = 0x1000; + for (const typename ELFT::Phdr &Phdr : PhdrRange) { + if (Phdr.p_type == ELF::PT_LOAD) { + if (!FirstLoadableAddress) + FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U); + if (Phdr.p_flags & ELF::PF_X) { + // Segments will always be loaded at a page boundary. + PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses( + const ELFObjectFileBase *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else + llvm_unreachable("invalid ELF object format"); +} + +void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + bool HasProbeDescSection = false; + bool HasPseudoProbeSection = false; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + 
StringRef SectionName = unwrapOrError(Section.getName(), FileName); + if (SectionName == ".pseudo_probe_desc") { + HasProbeDescSection = true; + } else if (SectionName == ".pseudo_probe") { + HasPseudoProbeSection = true; + } + } + + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = HasProbeDescSection && HasPseudoProbeSection; +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (!UsePseudoProbes) + return; + + MCPseudoProbeDecoder::Uint64Set GuidFilter; + MCPseudoProbeDecoder::Uint64Map FuncStartAddresses; + if (ShowDisassemblyOnly) { + if (DisassembleFunctionSet.empty()) { + FuncStartAddresses = SymbolStartAddrs; + } else { + for (auto &F : DisassembleFunctionSet) { + auto GUID = Function::getGUID(F.first()); + if (auto StartAddr = SymbolStartAddrs.lookup(GUID)) { + FuncStartAddresses[GUID] = StartAddr; + FuncRange &Range = StartAddrToFuncRangeMap[StartAddr]; + GuidFilter.insert(Function::getGUID(Range.getFuncName())); + } + } + } + } else { + for (auto *F : ProfiledFunctions) { + GuidFilter.insert(Function::getGUID(F->FuncName)); + for (auto &Range : F->Ranges) { + auto GUIDs = StartAddrToSymMap.equal_range(Range.first); + for (auto I = GUIDs.first; I != GUIDs.second; ++I) + FuncStartAddresses[I->second] = I->first; + } + } + } + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == ".pseudo_probe_desc") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildGUID2FuncDescMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError( + "Pseudo Probe decoder fail in .pseudo_probe_desc section"); + } else if (SectionName == ".pseudo_probe") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if 
(!ProbeDecoder.buildAddress2ProbeMap( + reinterpret_cast(Contents.data()), + Contents.size(), GuidFilter, FuncStartAddresses)) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe section"); + } + } + + // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe + // is available + if (TrackFuncContextSize) { + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = Child.second.get(); + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; + TopLevelProbeFrameMap[FuncName] = Frame; + } + } + + if (ShowPseudoProbe) + ProbeDecoder.printGUID2FuncDescMap(outs()); +} + +void ProfiledBinary::decodePseudoProbe() { + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + auto *Obj = dyn_cast(&ExeBinary); + decodePseudoProbe(Obj); +} + +void ProfiledBinary::setIsFuncEntry(FuncRange *FuncRange, + StringRef RangeSymName) { + // Skip external function symbol. + if (!FuncRange) + return; + + // Set IsFuncEntry to true if there is only one range in the function or the + // RangeSymName from ELF is equal to its DWARF-based function name. + if (FuncRange->Func->Ranges.size() == 1 || + (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)) + FuncRange->IsFuncEntry = true; +} + +bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, + const SectionRef &Section) { + std::size_t SE = Symbols.size(); + uint64_t SectionAddress = Section.getAddress(); + uint64_t SectSize = Section.getSize(); + uint64_t StartAddress = Symbols[SI].Addr; + uint64_t NextStartAddress = + (SI + 1 < SE) ? Symbols[SI + 1].Addr : SectionAddress + SectSize; + FuncRange *FRange = findFuncRange(StartAddress); + setIsFuncEntry(FRange, FunctionSamples::getCanonicalFnName(Symbols[SI].Name)); + StringRef SymbolName = + ShowCanonicalFnName + ?
FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + auto WarnInvalidInsts = [](uint64_t Start, uint64_t End) { + WithColor::warning() << "Invalid instructions at " + << format("%8" PRIx64, Start) << " - " + << format("%8" PRIx64, End) << "\n"; + }; + + uint64_t Address = StartAddress; + // Size of a consecutive invalid instruction range starting from Address -1 + // backwards. + uint64_t InvalidInstLength = 0; + while (Address < NextStartAddress) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. + bool Disassembled = DisAsm->getInstruction( + Inst, Size, Bytes.slice(Address - SectionAddress), Address, nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), Address); + } + outs() << format("%8" PRIx64 ":", Address); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Address); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + AddressToInstSizeMap[Address] = Size; + + // Populate address maps. 
+ CodeAddressVec.push_back(Address); + if (MCDesc.isCall()) { + CallAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isReturn()) { + RetAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isBranch()) { + if (MCDesc.isUnconditionalBranch()) + UncondBranchAddrSet.insert(Address); + BranchAddressSet.insert(Address); + } + + // Record potential call targets for tail frame inference later-on. + if (InferMissingFrames && FRange) { + uint64_t Target = 0; + MIA->evaluateBranch(Inst, Address, Size, Target); + if (MCDesc.isCall()) { + // Indirect call targets are unknown at this point. Recording the + // unknown target (zero) for further LBR-based refinement. + MissingContextInferrer->CallEdges[Address].insert(Target); + } else if (MCDesc.isUnconditionalBranch()) { + assert(Target && + "target should be known for unconditional direct branch"); + // Any inter-function unconditional jump is considered tail call at + // this point. This is not 100% accurate and could further be + // optimized based on some source annotation. + FuncRange *ToFRange = findFuncRange(Target); + if (ToFRange && ToFRange->Func != FRange->Func) + MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Direct Tail call: " << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } else if (MCDesc.isIndirectBranch() && MCDesc.isBarrier()) { + // This is an indirect branch but not necessarily an indirect tail + // call. The isBarrier check is to filter out conditional branch. + // Similar with indirect call targets, recording the unknown target + // (zero) for further LBR-based refinement. 
+ MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Indirect Tail call: " + << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } + } + + if (InvalidInstLength) { + WarnInvalidInsts(Address - InvalidInstLength, Address - 1); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Address += Size; + } + + if (InvalidInstLength) + WarnInvalidInsts(Address - InvalidInstLength, Address - 1); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + Expected Features = Obj->getFeatures(); + if (!Features) + exitWithError(Features.takeError(), FileName); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features->getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = 
AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionAddress = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. 
+ TextSections.insert({SectionAddress, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + // Scan the symbols of data sections to check if FS discriminator symbol exists. + checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ELFObjectFileBase *Obj, + std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::populateElfSymbolAddressList( + const ELFObjectFileBase *Obj) { + // Create a mapping from virtual address to symbol GUID and the other way + // around.
+ StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + uint64_t GUID = Function::getGUID(Name); + SymbolStartAddrs[GUID] = Addr; + StartAddrToSymMap.emplace(Addr, GUID); + } +} + +void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) { + for (const auto &DieInfo : CompilationUnit.dies()) { + llvm::DWARFDie Die(&CompilationUnit, &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t StartAddress = Range.LowPC; + uint64_t EndAddress = Range.HighPC; + + if (EndAddress <= StartAddress || + StartAddress < getPreferredBaseAddress()) + continue; + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. 
+ Func.Ranges.emplace_back(StartAddress, EndAddress); + + auto R = StartAddrToFuncRangeMap.emplace(StartAddress, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartAddress = StartAddress; + FRange.EndAddress = EndAddress; + } else { + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartAddress) << " " + << R.first->second.getFuncName() << " and " << Name << "\n"; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, DWPPath); + if (!DebugContext) + exitWithError("Error creating the debug info context", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) + loadSymbolsFromDWARFUnit(*CompilationUnit.get()); + + // Handles DWO sections that can either be in .o, .dwo or .dwp files. + for (const auto &CompilationUnit : DebugContext->compile_units()) { + DWARFUnit *const DwarfUnit = CompilationUnit.get(); + if (std::optional DWOId = DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + WithColor::warning() + << "DWO debug information for " << DWOName + << " was not loaded. 
Please check the .o, .dwo or .dwp path.\n"; + continue; + } + loadSymbolsFromDWARFUnit(*DWOCU); + } + } + + if (BinaryFunctions.empty()) + WithColor::warning() << "Loading of DWARF info completed, but no binary " + "functions have been retrieved.\n"; +} + +void ProfiledBinary::populateSymbolListFromDWARF( + ProfileSymbolList &SymbolList) { + for (auto &I : StartAddrToFuncRangeMap) + SymbolList.add(I.second.getFuncName()); +} + +void ProfiledBinary::setupSymbolizer() { + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = false; + SymbolizerOpts.DefaultArch = TheTriple.getArchName().str(); + SymbolizerOpts.UseSymbolTable = false; + SymbolizerOpts.RelativeAddresses = false; + SymbolizerOpts.DWPName = DWPPath; + Symbolizer = std::make_unique(SymbolizerOpts); +} + +SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName, + bool UseProbeDiscriminator) { + assert(this == IP.Binary && + "Binary should only symbolize its own instruction"); + auto Addr = object::SectionedAddress{IP.Address, + object::SectionedAddress::UndefSection}; + DIInliningInfo InlineStack = unwrapOrError( + Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), + SymbolizerPath); + + SampleContextFrameVector CallStack; + for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { + const auto &CallerFrame = InlineStack.getFrame(I); + if (CallerFrame.FunctionName == "") + break; + + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); + + uint32_t Discriminator = CallerFrame.Discriminator; + uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff; + if (UseProbeDiscriminator) { + LineOffset = + PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator); + Discriminator = 0; + } + + LineLocation Line(LineOffset, 
Discriminator); + auto It = NameStrings.insert(FunctionName.str()); + CallStack.emplace_back(*It.first, Line); + } + + return CallStack; +} + +void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t RangeBegin, + uint64_t RangeEnd) { + InstructionPointer IP(this, RangeBegin, true); + + if (IP.Address != RangeBegin) + WithColor::warning() << "Invalid start instruction at " + << format("%8" PRIx64, RangeBegin) << "\n"; + + if (IP.Address >= RangeEnd) + return; + + do { + const SampleContextFrameVector SymbolizedCallStack = + getFrameLocationStack(IP.Address, UsePseudoProbes); + uint64_t Size = AddressToInstSizeMap[IP.Address]; + // Record instruction size for the corresponding context + FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size); + + } while (IP.advance() && IP.Address < RangeEnd); +} + +void ProfiledBinary::computeInlinedContextSizeForFunc( + const BinaryFunction *Func) { + // Note that a function can be split into multiple ranges, so compute for all + // ranges of the function. + for (const auto &Range : Func->Ranges) + computeInlinedContextSizeForRange(Range.first, Range.second); + + // Track optimized-away inlinee for probed binary. A function inlined and then + // optimized away should still have its probes left over in place.
+ if (usePseudoProbes()) { + auto I = TopLevelProbeFrameMap.find(Func->FuncName); + if (I != TopLevelProbeFrameMap.end()) { + BinarySizeContextTracker::ProbeFrameStack ProbeContext; + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second, + ProbeContext); + } + } +} + +void ProfiledBinary::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + MissingContextInferrer->inferMissingFrames(Context, NewContext); +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeAddrVecSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeAddrVecSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; +} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.h new file mode 100644 index 00000000000..cdbaec740b4 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/ProfiledBinary.h @@ -0,0 +1,585 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
// Cursor over the instructions of a ProfiledBinary. It pairs an address with
// the position of that address in the binary's sorted code address array so
// that callers can step to the neighbouring instruction in either direction.
struct InstructionPointer {
  // Binary whose code address array this pointer walks (not owned).
  const ProfiledBinary *Binary;
  // Address of the executable segment of the binary.
  uint64_t Address;
  // Index to the sorted code address array of the binary.
  uint64_t Index = 0;
  // Index is set to the lower-bound position of Address in the code address
  // array. When RoundToNext is true, Address is also replaced by the code
  // address stored at that position, or by UINT64_MAX if the position is past
  // the end of the array.
  InstructionPointer(const ProfiledBinary *Binary, uint64_t Address,
                     bool RoundToNext = false);
  // Move to the next code address. Returns false and sets Address to
  // UINT64_MAX when stepping past the last entry.
  bool advance();
  // Move to the previous code address. Returns false and sets Address to 0
  // when already at the first entry.
  bool backward();
  // Point at Addr and recompute Index for it.
  void update(uint64_t Addr);
};
+ DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartAddress; + // EndAddress is an exclusive bound. + uint64_t EndAddress; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start address is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog address tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog addresses. Used by virtual unwinding. 
+ std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void + inferPrologAddresses(std::map &FuncStartAddressMap) { + for (auto I : FuncStartAddressMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + if (!IP.advance()) + break; + PrologEpilogSet.insert(IP.Address); + } + } + + // Take the last two addresses before the return address as epilog + void inferEpilogAddresses(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + if (!IP.backward()) + break; + PrologEpilogSet.insert(IP.Address); + } + } +}; + +// Track function byte size under different context (outlined version as well as +// various inlined versions). It also provides query support to get function +// size with the best matching context, which is used to help pre-inliner use +// accurate post-optimization size to make decisions. +// TODO: If an inlinee is completely optimized away, ideally we should have zero +// for its context size, currently we would misss such context since it doesn't +// have instructions. To fix this, we need to mark all inlinee with entry probe +// but without instructions as having zero size. +class BinarySizeContextTracker { +public: + // Add instruction with given size to a context + void addInstructionForContext(const SampleContextFrameVector &Context, + uint32_t InstrSize); + + // Get function size with a specific context. When there's no exact match + // for the given context, try to retrieve the size of that function from + // closest matching context. + uint32_t getFuncSizeForContext(const ContextTrieNode *Context); + + // For inlinees that are full optimized away, we can establish zero size using + // their remaining probes. 
+ void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); + + using ProbeFrameStack = SmallVector>; + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); + + void dump() { RootContext.dumpTree(); } + +private: + // Root node for context trie tree, node that this is a reverse context trie + // with callee as parent and caller as child. This way we can traverse from + // root to find the best/longest matching context if an exact match does not + // exist. It gives us the best possible estimate for function's post-inline, + // post-optimization byte size. + ContextTrieNode RootContext; +}; + +using AddressRange = std::pair; + +class ProfiledBinary { + // Absolute path of the executable binary. + std::string Path; + // Path of the debug info binary. + std::string DebugBinaryPath; + // Path of symbolizer path which should be pointed to binary with debug info. + StringRef SymbolizerPath; + // The target triple. + Triple TheTriple; + // The runtime base address that the first executable segment is loaded at. + uint64_t BaseAddress = 0; + // The runtime base address that the first loadabe segment is loaded at. + uint64_t FirstLoadableAddress = 0; + // The preferred load address of each executable segment. + std::vector PreferredTextSegmentAddresses; + // The file offset of each executable segment. + std::vector TextSegmentOffsets; + + // Mutiple MC component info + std::unique_ptr MRI; + std::unique_ptr AsmInfo; + std::unique_ptr STI; + std::unique_ptr MII; + std::unique_ptr DisAsm; + std::unique_ptr MIA; + std::unique_ptr IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. + std::set> TextSections; + + // A map of mapping function name to BinaryFunction info. + std::unordered_map BinaryFunctions; + + // A list of binary functions that have samples. 
+ std::unordered_set ProfiledFunctions; + + // GUID to Elf symbol start address map + DenseMap SymbolStartAddrs; + + // Start address to Elf symbol GUID map + std::unordered_multimap StartAddrToSymMap; + + // An ordered map of mapping function's start address to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map StartAddrToFuncRangeMap; + + // Address to context location map. Used to expand the context. + std::unordered_map AddressToLocStackMap; + + // Address to instruction size map. Also used for quick Address lookup. + std::unordered_map AddressToInstSizeMap; + + // An array of Addresses of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector CodeAddressVec; + // A set of call instruction addresses. Used by virtual unwinding. + std::unordered_set CallAddressSet; + // A set of return instruction addresses. Used by virtual unwinding. + std::unordered_set RetAddressSet; + // An ordered set of unconditional branch instruction addresses. + std::set UncondBranchAddrSet; + // A set of branch instruction addresses. + std::unordered_set BranchAddressSet; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Infer missing frames due to compiler optimizations such as tail call + // elimination. + std::unique_ptr MissingContextInferrer; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. + std::unique_ptr Symbolizer; + + // String table owning function name strings created from the symbolizer. + std::unordered_set NameStrings; + + // A collection of functions to print disassembly for. 
+ StringSet<> DisassembleFunctionSet; + + // Pseudo probe decoder + MCPseudoProbeDecoder ProbeDecoder; + + // Function name to probe frame map for top-level outlined functions. + StringMap TopLevelProbeFrameMap; + + bool UsePseudoProbes = false; + + bool UseFSDiscriminator = false; + + // Whether we need to symbolize all instructions to get function context size. + bool TrackFuncContextSize = false; + + // Indicate if the base loading address is parsed from the mmap event or uses + // the preferred address + bool IsLoadedByMMap = false; + // Use to avoid redundant warning. + bool MissingMMapWarned = false; + + void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O); + + template + void setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName); + + void checkPseudoProbe(const ELFObjectFileBase *Obj); + + void decodePseudoProbe(const ELFObjectFileBase *Obj); + + void + checkUseFSDiscriminator(const ELFObjectFileBase *Obj, + std::map &AllSymbols); + + // Set up disassembler and related components. + void setUpDisassembler(const ELFObjectFileBase *Obj); + void setupSymbolizer(); + + // Load debug info of subprograms from DWARF section. + void loadSymbolsFromDWARF(ObjectFile &Obj); + + // Load debug info from DWARF unit. + void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit); + + // Create elf symbol to its start address mapping. + void populateElfSymbolAddressList(const ELFObjectFileBase *O); + + // A function may be spilt into multiple non-continuous address ranges. We use + // this to set whether start a function range is the real entry of the + // function and also set false to the non-function label. + void setIsFuncEntry(FuncRange *FRange, StringRef RangeSymName); + + // Warn if no entry range exists in the function. + void warnNoFuncEntry(); + + /// Dissassemble the text section and build various address maps. 
+ void disassemble(const ELFObjectFileBase *O); + + /// Helper function to dissassemble the symbol and extract info for unwinding + bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, const SectionRef &Section); + /// Symbolize a given instruction pointer and return a full call context. + SampleContextFrameVector symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName = false, + bool UseProbeDiscriminator = false); + /// Decode the interesting parts of the binary and build internal data + /// structures. On high level, the parts of interest are: + /// 1. Text sections, including the main code section and the PLT + /// entries that will be used to handle cross-module call transitions. + /// 2. The .debug_line section, used by Dwarf-based profile generation. + /// 3. Pseudo probe related sections, used by probe-based profile + /// generation. + void load(); + +public: + ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath); + ~ProfiledBinary(); + + void decodePseudoProbe(); + + StringRef getPath() const { return Path; } + StringRef getName() const { return llvm::sys::path::filename(Path); } + uint64_t getBaseAddress() const { return BaseAddress; } + void setBaseAddress(uint64_t Address) { BaseAddress = Address; } + + // Canonicalize to use preferred load address as base address. + uint64_t canonicalizeVirtualAddress(uint64_t Address) { + return Address - BaseAddress + getPreferredBaseAddress(); + } + // Return the preferred load address for the first executable segment. + uint64_t getPreferredBaseAddress() const { + return PreferredTextSegmentAddresses[0]; + } + // Return the preferred load address for the first loadable segment. + uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; } + // Return the file offset for the first executable segment. 
+ uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; } + const std::vector &getPreferredTextSegmentAddresses() const { + return PreferredTextSegmentAddresses; + } + const std::vector &getTextSegmentOffsets() const { + return TextSegmentOffsets; + } + + uint64_t getInstSize(uint64_t Address) const { + auto I = AddressToInstSizeMap.find(Address); + if (I == AddressToInstSizeMap.end()) + return 0; + return I->second; + } + + bool addressIsCode(uint64_t Address) const { + return AddressToInstSizeMap.find(Address) != AddressToInstSizeMap.end(); + } + + bool addressIsCall(uint64_t Address) const { + return CallAddressSet.count(Address); + } + bool addressIsReturn(uint64_t Address) const { + return RetAddressSet.count(Address); + } + bool addressInPrologEpilog(uint64_t Address) const { + return ProEpilogTracker.PrologEpilogSet.count(Address); + } + + bool addressIsTransfer(uint64_t Address) { + return BranchAddressSet.count(Address) || RetAddressSet.count(Address) || + CallAddressSet.count(Address); + } + + bool rangeCrossUncondBranch(uint64_t Start, uint64_t End) { + if (Start >= End) + return false; + auto R = UncondBranchAddrSet.lower_bound(Start); + return R != UncondBranchAddrSet.end() && *R < End; + } + + uint64_t getAddressforIndex(uint64_t Index) const { + return CodeAddressVec[Index]; + } + + size_t getCodeAddrVecSize() const { return CodeAddressVec.size(); } + + bool usePseudoProbes() const { return UsePseudoProbes; } + bool useFSDiscriminator() const { return UseFSDiscriminator; } + // Get the index in CodeAddressVec for the address + // As we might get an address which is not the code + // here it would round to the next valid code address by + // using lower bound operation + uint32_t getIndexForAddr(uint64_t Address) const { + auto Low = llvm::lower_bound(CodeAddressVec, Address); + return Low - CodeAddressVec.begin(); + } + + uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const { + if (FrameAddr == ExternalAddr) + return 
ExternalAddr; + auto I = getIndexForAddr(FrameAddr); + FrameAddr = I ? getAddressforIndex(I - 1) : 0; + if (FrameAddr && addressIsCall(FrameAddr)) + return FrameAddr; + return 0; + } + + FuncRange *findFuncRangeForStartAddr(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.find(Address); + if (I == StartAddrToFuncRangeMap.end()) + return nullptr; + return &I->second; + } + + // Binary search the function range which includes the input address. + FuncRange *findFuncRange(uint64_t Address) { + auto I = StartAddrToFuncRangeMap.upper_bound(Address); + if (I == StartAddrToFuncRangeMap.begin()) + return nullptr; + I--; + + if (Address >= I->second.EndAddress) + return nullptr; + + return &I->second; + } + + // Get all ranges of one function. + RangesTy getRanges(uint64_t Address) { + auto *FRange = findFuncRange(Address); + // Ignore the range which falls into plt section or system lib. + if (!FRange) + return RangesTy(); + + return FRange->Func->Ranges; + } + + const std::unordered_map & + getAllBinaryFunctions() { + return BinaryFunctions; + } + + std::unordered_set &getProfiledFunctions() { + return ProfiledFunctions; + } + + void setProfiledFunctions(std::unordered_set &Funcs) { + ProfiledFunctions = Funcs; + } + + BinaryFunction *getBinaryFunction(StringRef FName) { + auto I = BinaryFunctions.find(FName.str()); + if (I == BinaryFunctions.end()) + return nullptr; + return &I->second; + } + + uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) { + return FuncSizeTracker.getFuncSizeForContext(ContextNode); + } + + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + // Load the symbols from debug table and populate into symbol list. 
// Memoizing wrapper around getFrameLocationStack(): the symbolized frame
// stack for Address is computed on the first request and stored in
// AddressToLocStackMap; later requests return the stored entry.
// The returned reference points at the map element itself.
// NOTE(review): the cache is keyed by Address only, so UseProbeDiscriminator
// takes effect only on the call that first populates the entry; confirm that
// callers never mix both settings for the same address.
const SampleContextFrameVector &
getCachedFrameLocationStack(uint64_t Address,
                            bool UseProbeDiscriminator = false) {
  // emplace() leaves an existing entry untouched; I.second tells whether a
  // fresh (still empty) entry was created and has to be filled in.
  auto I = AddressToLocStackMap.emplace(Address, SampleContextFrameVector());
  if (I.second) {
    I.first->second = getFrameLocationStack(Address, UseProbeDiscriminator);
  }
  return I.first->second;
}
+ void computeInlinedContextSizeForRange(uint64_t StartAddress, + uint64_t EndAddress); + + void computeInlinedContextSizeForFunc(const BinaryFunction *Func); + + const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const { + return ProbeDecoder.getCallProbeForAddr(Address); + } + + void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe, + SampleContextFrameVector &InlineContextStack, + bool IncludeLeaf = false) const { + SmallVector ProbeInlineContext; + ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext, + IncludeLeaf); + for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) { + auto &Callsite = ProbeInlineContext[I]; + // Clear the current context for an unknown probe. + if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) { + InlineContextStack.clear(); + continue; + } + InlineContextStack.emplace_back(Callsite.first, + LineLocation(Callsite.second, 0)); + } + } + const AddressProbesMap &getAddress2ProbesMap() const { + return ProbeDecoder.getAddress2ProbesMap(); + } + const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) { + return ProbeDecoder.getFuncDescForGUID(GUID); + } + + const MCPseudoProbeFuncDesc * + getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) { + return ProbeDecoder.getInlinerDescForProbe(Probe); + } + + bool getTrackFuncContextSize() { return TrackFuncContextSize; } + + bool getIsLoadedByMMap() { return IsLoadedByMMap; } + + void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; } + + bool getMissingMMapWarned() { return MissingMMapWarned; } + + void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; } +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-16.0/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-16.0/llvm-profgen.cpp new file mode 100644 index 00000000000..596882c1b93 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-16.0/llvm-profgen.cpp @@ -0,0 +1,190 @@ +//===- llvm-profgen.cpp - LLVM 
SPGO profile generation tool -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// llvm-profgen generates SPGO profiles from perf script ouput. +// +//===----------------------------------------------------------------------===// + +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "ProfiledBinary.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" + +static cl::OptionCategory ProfGenCategory("ProfGen Options"); + +static cl::opt PerfScriptFilename( + "perfscript", cl::value_desc("perfscript"), + cl::desc("Path of perf-script trace created by Linux perf tool with " + "`script` command(the raw perf.data should be profiled with -b)"), + cl::cat(ProfGenCategory)); +static cl::alias PSA("ps", cl::desc("Alias for --perfscript"), + cl::aliasopt(PerfScriptFilename)); + +static cl::opt PerfDataFilename( + "perfdata", cl::value_desc("perfdata"), + cl::desc("Path of raw perf data created by Linux perf tool (it should be " + "profiled with -b)"), + cl::cat(ProfGenCategory)); +static cl::alias PDA("pd", cl::desc("Alias for --perfdata"), + cl::aliasopt(PerfDataFilename)); + +static cl::opt UnsymbolizedProfFilename( + "unsymbolized-profile", cl::value_desc("unsymbolized profile"), + cl::desc("Path of the unsymbolized profile created by " + "`llvm-profgen` with `--skip-symbolization`"), + cl::cat(ProfGenCategory)); +static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"), + cl::aliasopt(UnsymbolizedProfFilename)); + +static cl::opt SampleProfFilename( + "llvm-sample-profile", cl::value_desc("llvm 
sample profile"), + cl::desc("Path of the LLVM sample profile"), cl::cat(ProfGenCategory)); + +static cl::opt + BinaryPath("binary", cl::value_desc("binary"), cl::Required, + cl::desc("Path of profiled executable binary."), + cl::cat(ProfGenCategory)); + +static cl::opt + ProcessId("pid", cl::value_desc("process Id"), cl::init(0), + cl::desc("Process Id for the profiled executable binary."), + cl::cat(ProfGenCategory)); + +static cl::opt DebugBinPath( + "debug-binary", cl::value_desc("debug-binary"), + cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info " + "from it instead of the executable binary."), + cl::cat(ProfGenCategory)); + +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt SkipSymbolization; + +using namespace llvm; +using namespace sampleprof; + +// Validate the command line input. +static void validateCommandLine() { + // Allow the missing perfscript if we only use to show binary disassembly. + if (!ShowDisassemblyOnly) { + // Validate input profile is provided only once + bool HasPerfData = PerfDataFilename.getNumOccurrences() > 0; + bool HasPerfScript = PerfScriptFilename.getNumOccurrences() > 0; + bool HasUnsymbolizedProfile = + UnsymbolizedProfFilename.getNumOccurrences() > 0; + bool HasSampleProfile = SampleProfFilename.getNumOccurrences() > 0; + uint16_t S = + HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile; + if (S != 1) { + std::string Msg = + S > 1 + ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` " + "cannot be used together." 
+ : "Perf input file is missing, please use one of `--perfscript`, " + "`--perfdata` and `--unsymbolized-profile` for the input."; + exitWithError(Msg); + } + + auto CheckFileExists = [](bool H, StringRef File) { + if (H && !llvm::sys::fs::exists(File)) { + std::string Msg = "Input perf file(" + File.str() + ") doesn't exist."; + exitWithError(Msg); + } + }; + + CheckFileExists(HasPerfData, PerfDataFilename); + CheckFileExists(HasPerfScript, PerfScriptFilename); + CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename); + CheckFileExists(HasSampleProfile, SampleProfFilename); + } + + if (!llvm::sys::fs::exists(BinaryPath)) { + std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist."; + exitWithError(Msg); + } + + if (CSProfileGenerator::MaxCompressionSize < -1) { + exitWithError("Value of --compress-recursion should >= -1"); + } + if (ShowSourceLocations && !ShowDisassemblyOnly) { + exitWithError("--show-source-locations should work together with " + "--show-disassembly-only!"); + } +} + +static PerfInputFile getPerfInputFile() { + PerfInputFile File; + if (PerfDataFilename.getNumOccurrences()) { + File.InputFile = PerfDataFilename; + File.Format = PerfFormat::PerfData; + } else if (PerfScriptFilename.getNumOccurrences()) { + File.InputFile = PerfScriptFilename; + File.Format = PerfFormat::PerfScript; + } else if (UnsymbolizedProfFilename.getNumOccurrences()) { + File.InputFile = UnsymbolizedProfFilename; + File.Format = PerfFormat::UnsymbolizedProfile; + } + return File; +} + +int main(int argc, const char *argv[]) { + InitLLVM X(argc, argv); + + // Initialize targets and assembly printers/parsers. + InitializeAllTargetInfos(); + InitializeAllTargetMCs(); + InitializeAllDisassemblers(); + + cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()}); + cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n"); + validateCommandLine(); + + // Load symbols and disassemble the code of a given binary. 
+ std::unique_ptr Binary = + std::make_unique(BinaryPath, DebugBinPath); + if (ShowDisassemblyOnly) + return EXIT_SUCCESS; + + if (SampleProfFilename.getNumOccurrences()) { + LLVMContext Context; + auto ReaderOrErr = SampleProfileReader::create(SampleProfFilename, Context); + std::unique_ptr Reader = + std::move(ReaderOrErr.get()); + Reader->read(); + std::unique_ptr Generator = + ProfileGeneratorBase::create(Binary.get(), Reader->getProfiles(), + Reader->profileIsCS()); + Generator->generateProfile(); + Generator->write(); + } else { + std::optional PIDFilter; + if (ProcessId.getNumOccurrences()) + PIDFilter = ProcessId; + PerfInputFile PerfFile = getPerfInputFile(); + std::unique_ptr Reader = + PerfReaderBase::create(Binary.get(), PerfFile, PIDFilter); + // Parse perf events and samples + Reader->parsePerfTraces(); + + if (SkipSymbolization) + return EXIT_SUCCESS; + + std::unique_ptr Generator = + ProfileGeneratorBase::create(Binary.get(), &Reader->getSampleCounters(), + Reader->profileIsCS()); + Generator->generateProfile(); + Generator->write(); + } + + return EXIT_SUCCESS; +} diff --git a/tools/ldc-profgen/ldc-profgen-17.0/CMakeLists.txt b/tools/ldc-profgen/ldc-profgen-17.0/CMakeLists.txt new file mode 100644 index 00000000000..354c63f409f --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/CMakeLists.txt @@ -0,0 +1,25 @@ + +set(LLVM_LINK_COMPONENTS + AllTargetsDescs + AllTargetsDisassemblers + AllTargetsInfos + DebugInfoDWARF + Core + MC + IPO + MCDisassembler + Object + ProfileData + Support + Symbolize + TargetParser + ) + +add_llvm_tool(llvm-profgen + llvm-profgen.cpp + PerfReader.cpp + CSPreInliner.cpp + ProfiledBinary.cpp + ProfileGenerator.cpp + MissingFrameInferrer.cpp + ) diff --git a/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.cpp b/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.cpp new file mode 100644 index 00000000000..ae0fd6d0b06 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.cpp @@ -0,0 +1,323 @@ +//===-- 
CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CSPreInliner.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include +#include + +#define DEBUG_TYPE "cs-preinliner" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(PreInlNumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(PreInlNumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); +STATISTIC(PreInlNumCSInlinedHitMinLimit, + "Number of functions with FDO inline stopped due to min size limit"); +STATISTIC(PreInlNumCSInlinedHitMaxLimit, + "Number of functions with FDO inline stopped due to max size limit"); +STATISTIC( + PreInlNumCSInlinedHitGrowthLimit, + "Number of functions with FDO inline stopped due to growth size limit"); + +// The switches specify inline thresholds used in SampleProfileLoader inlining. +// TODO: the actual threshold to be tuned here because the size here is based +// on machine code not LLVM IR. 
+namespace llvm { +extern cl::opt SampleHotCallSiteThreshold; +extern cl::opt SampleColdCallSiteThreshold; +extern cl::opt ProfileInlineGrowthLimit; +extern cl::opt ProfileInlineLimitMin; +extern cl::opt ProfileInlineLimitMax; +extern cl::opt SortProfiledSCC; + +cl::opt EnableCSPreInliner( + "csspgo-preinliner", cl::Hidden, cl::init(true), + cl::desc("Run a global pre-inliner to merge context profile based on " + "estimated global top-down inline decisions")); + +cl::opt UseContextCostForPreInliner( + "use-context-cost-for-preinliner", cl::Hidden, cl::init(true), + cl::desc("Use context-sensitive byte size cost for preinliner decisions")); +} // namespace llvm + +static cl::opt SamplePreInlineReplay( + "csspgo-replay-preinline", cl::Hidden, cl::init(false), + cl::desc( + "Replay previous inlining and adjust context profile accordingly")); + +static cl::opt CSPreinlMultiplierForPrevInl( + "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden, + cl::init(100), + cl::desc( + "Multiplier to bump up callsite threshold for previous inlining.")); + +CSPreInliner::CSPreInliner(SampleContextTracker &Tracker, + ProfiledBinary &Binary, ProfileSummary *Summary) + : UseContextCost(UseContextCostForPreInliner), + // TODO: Pass in a guid-to-name map in order for + // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes + // as their profile context. + ContextTracker(Tracker), Binary(Binary), Summary(Summary) { + // Set default preinliner hot/cold call site threshold tuned with CSSPGO. + // for good performance with reasonable profile size. + if (!SampleHotCallSiteThreshold.getNumOccurrences()) + SampleHotCallSiteThreshold = 1500; + if (!SampleColdCallSiteThreshold.getNumOccurrences()) + SampleColdCallSiteThreshold = 0; + if (!ProfileInlineLimitMax.getNumOccurrences()) + ProfileInlineLimitMax = 50000; +} + +std::vector CSPreInliner::buildTopDownOrder() { + std::vector Order; + // Trim cold edges to get a more stable call graph. 
This allows for a more + // stable top-down order which in turns helps the stablity of the generated + // profile from run to run. + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold); + + // Now that we have a profiled call graph, construct top-down order + // by building up SCC and reversing SCC order. + scc_iterator I = scc_begin(&ProfiledCG); + while (!I.isAtEnd()) { + auto Range = *I; + if (SortProfiledSCC) { + // Sort nodes in one SCC based on callsite hotness. + scc_member_iterator SI(*I); + Range = *SI; + } + for (auto *Node : Range) { + if (Node != ProfiledCG.getEntryNode()) + Order.push_back(Node->Name); + } + ++I; + } + std::reverse(Order.begin(), Order.end()); + + return Order; +} + +bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *CallerSamples) { + assert(CallerSamples && "Expect non-null caller samples"); + + // Ideally we want to consider everything a function calls, but as far as + // context profile is concerned, only those frames that are children of + // current one in the trie is relavent. So we walk the trie instead of call + // targets from function profile. + ContextTrieNode *CallerNode = + ContextTracker.getContextNodeForProfile(CallerSamples); + + bool HasNewCandidate = false; + for (auto &Child : CallerNode->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); + if (!CalleeSamples) + continue; + + // Call site count is more reliable, so we look up the corresponding call + // target profile in caller's context profile to retrieve call site count. 
+ uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = CalleeNode->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + + // TODO: call site and callee entry count should be mostly consistent, add + // check for that. + HasNewCandidate = true; + uint32_t CalleeSize = getFuncSize(CalleeNode); + CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount), + CalleeSize); + } + + return HasNewCandidate; +} + +uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) { + if (UseContextCost) + return Binary.getFuncSizeForContext(ContextNode); + + return ContextNode->getFunctionSamples()->getBodySamples().size(); +} + +bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + // If replay inline is requested, simply follow the inline decision of the + // profiled binary. + if (SamplePreInlineReplay) + return WasInlined; + + unsigned int SampleThreshold = SampleColdCallSiteThreshold; + uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); + + if (Candidate.CallsiteCount <= ColdCountThreshold) + SampleThreshold = SampleColdCallSiteThreshold; + else { + // Linearly adjust threshold based on normalized hotness, i.e, a value in + // [0,1]. Use 10% cutoff instead of the max count as the normalization + // upperbound for stability. 
+ double NormalizationUpperBound = + ProfileSummaryBuilder::getEntryForPercentile( + Summary->getDetailedSummary(), 100000 /* 10% */) + .MinCount; + double NormalizationLowerBound = ColdCountThreshold; + double NormalizedHotness = + (Candidate.CallsiteCount - NormalizationLowerBound) / + (NormalizationUpperBound - NormalizationLowerBound); + if (NormalizedHotness > 1.0) + NormalizedHotness = 1.0; + // Add 1 to ensure hot callsites get a non-zero threshold, which could + // happen when SampleColdCallSiteThreshold is 0. This is when we do not + // want any inlining for cold callsites. + SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 + + SampleColdCallSiteThreshold + 1; + // Bump up the threshold to favor previous compiler inline decision. The + // compiler has more insight and knowledge about functions based on their IR + // and attributes and should be able to make a more reasonable inline + // decision. + if (WasInlined) + SampleThreshold *= CSPreinlMultiplierForPrevInl; + } + + return (Candidate.SizeCost < SampleThreshold); +} + +void CSPreInliner::processFunction(const StringRef Name) { + FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); + if (!FSamples) + return; + + unsigned FuncSize = + getFuncSize(ContextTracker.getContextNodeForProfile(FSamples)); + unsigned FuncFinalSize = FuncSize; + unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; + SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); + SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); + + LLVM_DEBUG(dbgs() << "Process " << Name + << " for context-sensitive pre-inlining (pre-inline size: " + << FuncSize << ", size limit: " << SizeLimit << ")\n"); + + ProfiledCandidateQueue CQueue; + getInlineCandidates(CQueue, FSamples); + + while (!CQueue.empty() && FuncFinalSize < SizeLimit) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool ShouldInline = false; + if ((ShouldInline = shouldInline(Candidate))) {
+ // We mark context as inlined as the corresponding context profile + // won't be merged into that function's base profile. + ++PreInlNumCSInlined; + ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); + Candidate.CalleeSamples->getContext().setAttribute( + ContextShouldBeInlined); + FuncFinalSize += Candidate.SizeCost; + getInlineCandidates(CQueue, Candidate.CalleeSamples); + } else { + ++PreInlNumCSNotInlined; + } + LLVM_DEBUG( + dbgs() << (ShouldInline ? " Inlined" : " Outlined") + << " context profile for: " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (callee size: " << Candidate.SizeCost + << ", call count:" << Candidate.CallsiteCount << ")\n"); + } + + if (!CQueue.empty()) { + if (SizeLimit == (unsigned)ProfileInlineLimitMax) + ++PreInlNumCSInlinedHitMaxLimit; + else if (SizeLimit == (unsigned)ProfileInlineLimitMin) + ++PreInlNumCSInlinedHitMinLimit; + else + ++PreInlNumCSInlinedHitGrowthLimit; + } + + LLVM_DEBUG({ + if (!CQueue.empty()) + dbgs() << " Inline candidates ignored due to size limit (inliner " + "original size: " + << FuncSize << ", inliner final size: " << FuncFinalSize + << ", size limit: " << SizeLimit << ")\n"; + + while (!CQueue.empty()) { + ProfiledInlineCandidate Candidate = CQueue.top(); + CQueue.pop(); + bool WasInlined = + Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); + dbgs() << " " + << ContextTracker.getContextString(*Candidate.CalleeSamples) + << " (candidate size:" << Candidate.SizeCost + << ", call count: " << Candidate.CallsiteCount << ", previously " + << (WasInlined ? 
"inlined)\n" : "not inlined)\n"); + } + }); +} + +void CSPreInliner::run() { +#ifndef NDEBUG + auto printProfileNames = [](SampleContextTracker &ContextTracker, + bool IsInput) { + uint32_t Size = 0; + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + Size++; + dbgs() << " [" << ContextTracker.getContextString(Node) << "] " + << FSamples->getTotalSamples() << ":" + << FSamples->getHeadSamples() << "\n"; + } + } + dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" + << Size << " total):\n"; + }; +#endif + + LLVM_DEBUG(printProfileNames(ContextTracker, true)); + + // Execute global pre-inliner to estimate a global top-down inline + // decision and merge profiles accordingly. This helps with profile + // merge for ThinLTO otherwise we won't be able to merge profiles back + // to base profile across module/thin-backend boundaries. + // It also helps better compress context profile to control profile + // size, as we now only need context profile for functions going to + // be inlined. + for (StringRef FuncName : buildTopDownOrder()) { + processFunction(FuncName); + } + + // Not inlined context profiles are merged into its base, so we can + // trim out such profiles from the output. 
+ for (auto *Node : ContextTracker) { + FunctionSamples *FProfile = Node->getFunctionSamples(); + if (FProfile && + (Node->getParentContext() != &ContextTracker.getRootContext() && + !FProfile->getContext().hasState(InlinedContext))) { + Node->setFunctionSamples(nullptr); + } + } + FunctionSamples::ProfileIsPreInlined = true; + + LLVM_DEBUG(printProfileNames(ContextTracker, false)); +} diff --git a/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.h b/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.h new file mode 100644 index 00000000000..4d848aafdab --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/CSPreInliner.h @@ -0,0 +1,97 @@ +//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H +#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H + +#include "ProfiledBinary.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +// Inline candidate seen from profile +struct ProfiledInlineCandidate { + ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count, + uint32_t Size) + : CalleeSamples(Samples), CallsiteCount(Count), SizeCost(Size) {} + // Context-sensitive function profile for inline candidate + const FunctionSamples *CalleeSamples; + // Call site count for an inline candidate + // TODO: make sure entry count for context profile and call site + // target count for corresponding call are consistent. 
+ uint64_t CallsiteCount; + // Size proxy for function under particular call context. + uint64_t SizeCost; +}; + +// Inline candidate comparer using call site weight +struct ProfiledCandidateComparer { + bool operator()(const ProfiledInlineCandidate &LHS, + const ProfiledInlineCandidate &RHS) { + // Always prioritize inlining zero-sized functions as they do not affect the + // size budget. This could happen when all of the callee's code is gone and + // only pseudo probes are left. + if ((LHS.SizeCost == 0 || RHS.SizeCost == 0) && + (LHS.SizeCost != RHS.SizeCost)) + return RHS.SizeCost == 0; + + if (LHS.CallsiteCount != RHS.CallsiteCount) + return LHS.CallsiteCount < RHS.CallsiteCount; + + if (LHS.SizeCost != RHS.SizeCost) + return LHS.SizeCost > RHS.SizeCost; + + // Tie breaker using GUID so we have stable/deterministic inlining order + assert(LHS.CalleeSamples && RHS.CalleeSamples && + "Expect non-null FunctionSamples"); + return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) < + RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName()); + } +}; + +using ProfiledCandidateQueue = + PriorityQueue, + ProfiledCandidateComparer>; + +// Pre-compilation inliner based on context-sensitive profile. +// The PreInliner estimates inline decision using hotness from profile +// and cost estimation from machine code size. It helps merge context +// profile globally and achieves better post-inline profile quality, which +// otherwise won't be possible for ThinLTO. It also reduces context profile +// size by only keeping contexts that are estimated to be inlined.
+class CSPreInliner { +public: + CSPreInliner(SampleContextTracker &Tracker, ProfiledBinary &Binary, + ProfileSummary *Summary); + void run(); + +private: + bool getInlineCandidates(ProfiledCandidateQueue &CQueue, + const FunctionSamples *FCallerContextSamples); + std::vector buildTopDownOrder(); + void processFunction(StringRef Name); + bool shouldInline(ProfiledInlineCandidate &Candidate); + uint32_t getFuncSize(const ContextTrieNode *ContextNode); + bool UseContextCost; + SampleContextTracker &ContextTracker; + ProfiledBinary &Binary; + ProfileSummary *Summary; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/CallContext.h b/tools/ldc-profgen/ldc-profgen-17.0/CallContext.h new file mode 100644 index 00000000000..5e552130d03 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/CallContext.h @@ -0,0 +1,59 @@ +//===-- CallContext.h - Call Context Handler ---------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H +#define LLVM_TOOLS_LLVM_PROFGEN_CALLCONTEXT_H + +#include "llvm/ProfileData/SampleProf.h" +#include +#include +#include + +namespace llvm { +namespace sampleprof { + +inline std::string getCallSite(const SampleContextFrame &Callsite) { + std::string CallsiteStr = Callsite.FuncName.str(); + CallsiteStr += ":"; + CallsiteStr += Twine(Callsite.Location.LineOffset).str(); + if (Callsite.Location.Discriminator > 0) { + CallsiteStr += "."; + CallsiteStr += Twine(Callsite.Location.Discriminator).str(); + } + return CallsiteStr; +} + +// TODO: This operation is expensive.
If it ever gets called multiple times we +// may think of making a class wrapper with internal states for it. +inline std::string getLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : Context) { + if (OContextStr.tellp() > 0) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +// Reverse call context, i.e., in the order of callee frames to caller frames, +// is useful during instruction printing or pseudo probe printing. +inline std::string +getReversedLocWithContext(const SampleContextFrameVector &Context) { + std::ostringstream OContextStr; + for (const auto &Callsite : reverse(Context)) { + if (OContextStr.tellp() > 0) + OContextStr << " @ "; + OContextStr << getCallSite(Callsite); + } + return OContextStr.str(); +} + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/ErrorHandling.h b/tools/ldc-profgen/ldc-profgen-17.0/ErrorHandling.h new file mode 100644 index 00000000000..b797add8a89 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/ErrorHandling.h @@ -0,0 +1,56 @@ +//===-- ErrorHandling.h - Error handler -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H +#define LLVM_TOOLS_LLVM_PROFGEN_ERRORHANDLING_H + +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/WithColor.h" +#include + +using namespace llvm; + +[[noreturn]] inline void exitWithError(const Twine &Message, + StringRef Whence = StringRef(), + StringRef Hint = StringRef()) { + WithColor::error(errs(), "llvm-profgen"); + if (!Whence.empty()) + errs() << Whence.str() << ": "; + errs() << Message << "\n"; + if (!Hint.empty()) + WithColor::note() << Hint.str() << "\n"; + ::exit(EXIT_FAILURE); +} + +[[noreturn]] inline void exitWithError(std::error_code EC, + StringRef Whence = StringRef()) { + exitWithError(EC.message(), Whence); +} + +[[noreturn]] inline void exitWithError(Error E, StringRef Whence) { + exitWithError(errorToErrorCode(std::move(E)), Whence); +} + +template +T unwrapOrError(Expected EO, Ts &&... Args) { + if (EO) + return std::move(*EO); + exitWithError(EO.takeError(), std::forward(Args)...); +} + +inline void emitWarningSummary(uint64_t Num, uint64_t Total, StringRef Msg) { + if (!Total || !Num) + return; + WithColor::warning() << format("%.2f", static_cast(Num) * 100 / Total) + << "%(" << Num << "/" << Total << ") " << Msg << "\n"; +} + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.cpp b/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.cpp new file mode 100644 index 00000000000..ee49950f39c --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.cpp @@ -0,0 +1,316 @@ +//===-- MissingFrameInferrer.cpp - Missing frame inferrer --------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/Statistic.h" +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "missing-frame-inferrer" + +using namespace llvm; +using namespace sampleprof; + +STATISTIC(TailCallUniReachable, + "Number of frame pairs reachable via a unique tail call path"); +STATISTIC(TailCallMultiReachable, + "Number of frame pairs reachable via a multiple tail call paths"); +STATISTIC(TailCallUnreachable, + "Number of frame pairs unreachable via any tail call path"); +STATISTIC(TailCallFuncSingleTailCalls, + "Number of functions with single tail call site"); +STATISTIC(TailCallFuncMultipleTailCalls, + "Number of functions with multiple tail call sites"); +STATISTIC(TailCallMaxTailCallPath, "Length of the longest tail call path"); + +static cl::opt + MaximumSearchDepth("max-search-depth", cl::init(UINT32_MAX - 1), + cl::desc("The maximum levels the DFS-based missing " + "frame search should go with")); + +void MissingFrameInferrer::initialize( + const ContextSampleCounterMap *SampleCounters) { + // Refine call edges based on LBR samples. + if (SampleCounters) { + std::unordered_map> SampledCalls; + std::unordered_map> SampledTailCalls; + + // Populate SampledCalls based on static call sites. Similarly to + // SampledTailCalls. 
+ for (const auto &CI : *SampleCounters) { + for (const auto &Item : CI.second.BranchCounter) { + auto From = Item.first.first; + auto To = Item.first.second; + if (CallEdges.count(From)) { + assert(CallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + SampledCalls[From].insert(To); + } + if (TailCallEdges.count(From)) { + assert(TailCallEdges[From].size() == 1 && + "A callsite should only appear once with either a known or a " + "zero (unknown) target value at this point"); + FuncRange *FromFRange = Binary->findFuncRange(From); + FuncRange *ToFRange = Binary->findFuncRange(To); + if (FromFRange != ToFRange) + SampledTailCalls[From].insert(To); + } + } + } + + // Replace static edges with dynamic edges. + CallEdges = SampledCalls; + TailCallEdges = SampledTailCalls; + } + + // Populate function-based edges. This is to speed up address to function + // translation. + for (const auto &Call : CallEdges) + for (auto Target : Call.second) + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) + CallEdgesF[Call.first].insert(ToFRange->Func); + + for (const auto &Call : TailCallEdges) { + for (auto Target : Call.second) { + if (FuncRange *ToFRange = Binary->findFuncRange(Target)) { + TailCallEdgesF[Call.first].insert(ToFRange->Func); + TailCallTargetFuncs.insert(ToFRange->Func); + } + } + if (FuncRange *FromFRange = Binary->findFuncRange(Call.first)) + FuncToTailCallMap[FromFRange->Func].push_back(Call.first); + } + +#if LLVM_ENABLE_STATS + for (const auto &F : FuncToTailCallMap) { + assert(F.second.size() > 0 && ""); + if (F.second.size() > 1) + TailCallFuncMultipleTailCalls++; + else + TailCallFuncSingleTailCalls++; + } +#endif + +#ifndef NDEBUG + auto PrintCallTargets = + [&](const std::unordered_map> + &CallTargets, + bool IsTailCall) { + for (const auto &Targets : CallTargets) { + for (const auto &Target : Targets.second) { + dbgs() << (IsTailCall ?
"TailCall" : "Call"); + dbgs() << " From " << format("%8" PRIx64, Targets.first) << " to " + << format("%8" PRIx64, Target) << "\n"; + } + } + }; + + LLVM_DEBUG(dbgs() << "============================\n "; + dbgs() << "Call targets:\n"; + PrintCallTargets(CallEdges, false); + dbgs() << "\nTail call targets:\n"; + PrintCallTargets(TailCallEdges, true); + dbgs() << "============================\n";); +#endif +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + BinaryFunction *From, BinaryFunction *To, SmallVectorImpl &Path) { + // Search for a unique path comprised of only tail call edges for a given + // source and target frame address on a tail call graph that consists of + // only tail call edges. Note that only a unique path counts. Multiple paths + // are treated unreachable. + if (From == To) + return 1; + + // Ignore cyclic paths. Since we are doing a recursive DFS walk, if the source + // frame being visited is already in the stack, it means we are seeing a + // cycle. This is done before querying the cached result because the cached + // result may be computed based on the same path. Consider the following case: + // A -> B, B -> A, A -> D + // When computing unique reachability from A to D, the cached result for (B,D) + // should not be counted since the unique path B->A->D is basically the same + // path as A->D. Counting that will invalidate the uniqueness from A to D. + if (Visiting.contains(From)) + return 0; + + // If already computed, return the cached result. + auto I = UniquePaths.find({From, To}); + if (I != UniquePaths.end()) { + Path.append(I->second.begin(), I->second.end()); + return 1; + } + + auto J = NonUniquePaths.find({From, To}); + if (J != NonUniquePaths.end()) { + return J->second; + } + + uint64_t Pos = Path.size(); + + // DFS walk each outgoing tail call edges. + // Bail out if we are already at the maximum searching depth.
+ if (CurSearchingDepth == MaximumSearchDepth) + return 0; + + + if (!FuncToTailCallMap.count(From)) + return 0; + + CurSearchingDepth++; + Visiting.insert(From); + uint64_t NumPaths = 0; + for (auto TailCall : FuncToTailCallMap[From]) { + NumPaths += computeUniqueTailCallPath(TailCall, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + CurSearchingDepth--; + Visiting.erase(From); + + // Undo already-computed path if it is not unique. + if (NumPaths != 1) { + Path.pop_back_n(Path.size() - Pos); + } + + // Cache the result. + if (NumPaths == 1) { + UniquePaths[{From, To}].assign(Path.begin() + Pos, Path.end()); +#if LLVM_ENABLE_STATS + auto &LocalPath = UniquePaths[{From, To}]; + assert((LocalPath.size() <= MaximumSearchDepth + 1) && + "Path should not be longer than the maximum searching depth"); + TailCallMaxTailCallPath = std::max(uint64_t(LocalPath.size()), + TailCallMaxTailCallPath.getValue()); +#endif + } else { + NonUniquePaths[{From, To}] = NumPaths; + } + + return NumPaths; +} + +uint64_t MissingFrameInferrer::computeUniqueTailCallPath( + uint64_t From, BinaryFunction *To, SmallVectorImpl &Path) { + if (!TailCallEdgesF.count(From)) + return 0; + Path.push_back(From); + uint64_t NumPaths = 0; + for (auto Target : TailCallEdgesF[From]) { + NumPaths += computeUniqueTailCallPath(Target, To, Path); + // Stop analyzing the remaining if we are already seeing more than one + // reachable paths. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique. 
+ if (NumPaths != 1) + Path.pop_back(); + return NumPaths; +} + +bool MissingFrameInferrer::inferMissingFrames( + uint64_t From, uint64_t To, SmallVectorImpl &UniquePath) { + assert(!TailCallEdgesF.count(From) && + "transition between From and To cannot be via a tailcall otherwise " + "they would not show up at the same time"); + UniquePath.push_back(From); + uint64_t Pos = UniquePath.size(); + + FuncRange *ToFRange = Binary->findFuncRange(To); + if (!ToFRange) + return false; + + // Bail out if caller has no known outgoing call edges. + if (!CallEdgesF.count(From)) + return false; + + // Done with the inference if the callee is reachable via a single callsite. + // This may not be accurate but it improves the search throughput. + if (llvm::is_contained(CallEdgesF[From], ToFRange->Func)) + return true; + + // Bail out if callee is not tailcall reachable at all. + if (!TailCallTargetFuncs.contains(ToFRange->Func)) + return false; + + Visiting.clear(); + CurSearchingDepth = 0; + uint64_t NumPaths = 0; + for (auto Target : CallEdgesF[From]) { + NumPaths += + computeUniqueTailCallPath(Target, ToFRange->Func, UniquePath); + // Stop analyzing the remaining if we are already seeing more than one + // reachable path. + if (NumPaths > 1) + break; + } + + // Undo already-computed path if it is not unique.
+ if (NumPaths != 1) { + UniquePath.pop_back_n(UniquePath.size() - Pos); + assert(UniquePath.back() == From && "broken path"); + } + +#if LLVM_ENABLE_STATS + if (NumPaths == 1) { + if (ReachableViaUniquePaths.insert({From, ToFRange->StartAddress}).second) + TailCallUniReachable++; + } else if (NumPaths == 0) { + if (Unreachables.insert({From, ToFRange->StartAddress}).second) { + TailCallUnreachable++; + LLVM_DEBUG(dbgs() << "No path found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } else if (NumPaths > 1) { + if (ReachableViaMultiPaths.insert({From, ToFRange->StartAddress}) + .second) { + TailCallMultiReachable++; + LLVM_DEBUG(dbgs() << "Multiple paths found from " + << format("%8" PRIx64 ":", From) << " to " + << format("%8" PRIx64 ":", ToFRange->StartAddress) + << "\n"); + } + } +#endif + + return NumPaths == 1; +} + +void MissingFrameInferrer::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + if (Context.size() <= 1) { + NewContext = Context; + return; + } + + NewContext.clear(); + for (uint64_t I = 1; I < Context.size(); I++) { + inferMissingFrames(Context[I - 1], Context[I], NewContext); + } + NewContext.push_back(Context.back()); + + assert((NewContext.size() >= Context.size()) && + "Inferred context should include all frames in the original context"); + assert((NewContext.size() > Context.size() || NewContext == Context) && + "Inferred context should be exactly the same " + "with the original context"); +} diff --git a/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.h b/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.h new file mode 100644 index 00000000000..4680a9a979f --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/MissingFrameInferrer.h @@ -0,0 +1,116 @@ +//===-- MissingFrameInferrer.h - Missing frame inferrer ---------- C++/-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H +#define LLVM_TOOLS_LLVM_PROFGEN_MISSINGFRAMEINFERRER_H + +#include "PerfReader.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include +#include + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +struct BinaryFunction; + +class MissingFrameInferrer { +public: + MissingFrameInferrer(ProfiledBinary *Binary) : Binary(Binary) {} + + // Defining a frame transition from a caller function to the callee function. + using CallerCalleePair = std::pair; + + void initialize(const ContextSampleCounterMap *SampleCounters); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames. + void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + +private: + friend class ProfiledBinary; + + // Compute a unique tail call path for a pair of source frame address and + // target frame address. Append the unique path prefix (not including `To`) to + // `UniquePath` if it exists. Return whether this is a unique tail call + // path. The source/dest frame will typically be a pair of adjacent frame + // entries of call stack samples. + bool inferMissingFrames(uint64_t From, uint64_t To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source frame address to the target + // function. Output the unique path prefix (not including `To`) in + // `UniquePath` if it exists. Return the number of possibly available tail call + // paths. + uint64_t computeUniqueTailCallPath(uint64_t From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + // Compute a unique tail call path from the source function to the target + // function.
Output the unique path prefix (not including `To`) in + // `UniquePath` if it exists. Return the number of possibly available tail call + // paths. + uint64_t computeUniqueTailCallPath(BinaryFunction *From, BinaryFunction *To, + SmallVectorImpl &UniquePath); + + ProfiledBinary *Binary; + + // A map of call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> CallEdges; + + // A map of tail call instructions to their target addresses. This is first + // populated with static call edges but then trimmed down to dynamic call + // edges based on LBR samples. + std::unordered_map> TailCallEdges; + + // Dynamic call targets in terms of BinaryFunction for any calls. + std::unordered_map> CallEdgesF; + + // Dynamic call targets in terms of BinaryFunction for tail calls. + std::unordered_map> + TailCallEdgesF; + + // Dynamic tail call sites (call instruction addresses) of caller functions. + std::unordered_map> FuncToTailCallMap; + + // Functions that are reachable via tail calls. + DenseSet TailCallTargetFuncs; + + struct PairHash { + std::size_t operator()( + const std::pair &Pair) const { + return std::hash()(Pair.first) ^ + std::hash()(Pair.second); + } + }; + + // Cached results from a CallerCalleePair to a unique call path between them. + std::unordered_map, PairHash> + UniquePaths; + // Cached results from CallerCalleePair to the number of available call paths.
+ std::unordered_map NonUniquePaths; + + DenseSet Visiting; + + uint32_t CurSearchingDepth = 0; + +#if LLVM_ENABLE_STATS + DenseSet> ReachableViaUniquePaths; + DenseSet> Unreachables; + DenseSet> ReachableViaMultiPaths; +#endif +}; +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.cpp b/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.cpp new file mode 100644 index 00000000000..9f451673069 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.cpp @@ -0,0 +1,1206 @@ +//===-- PerfReader.cpp - perfscript reader ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "PerfReader.h" +#include "ProfileGenerator.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Process.h" + +#define DEBUG_TYPE "perf-reader" + +cl::opt SkipSymbolization("skip-symbolization", + cl::desc("Dump the unsymbolized profile to the " + "output file. It will show unwinder " + "output for CS profile generation.")); + +static cl::opt ShowMmapEvents("show-mmap-events", + cl::desc("Print binary load events.")); + +static cl::opt + UseOffset("use-offset", cl::init(true), + cl::desc("Work with `--skip-symbolization` or " + "`--unsymbolized-profile` to write/read the " + "offset instead of virtual address.")); + +static cl::opt UseLoadableSegmentAsBase( + "use-first-loadable-segment-as-base", + cl::desc("Use first loadable segment address as base address " + "for offsets in unsymbolized profile. 
By default " + "first executable segment address is used")); + +static cl::opt + IgnoreStackSamples("ignore-stack-samples", + cl::desc("Ignore call stack samples for hybrid samples " + "and produce context-insensitive profile.")); +cl::opt ShowDetailedWarning("show-detailed-warning", + cl::desc("Show detailed warning message.")); + +extern cl::opt PerfTraceFilename; +extern cl::opt ShowDisassemblyOnly; +extern cl::opt ShowSourceLocations; +extern cl::opt OutputFilename; + +namespace llvm { +namespace sampleprof { + +void VirtualUnwinder::unwindCall(UnwindState &State) { + uint64_t Source = State.getCurrentLBRSource(); + auto *ParentFrame = State.getParentFrame(); + // The 2nd frame after leaf could be missing if stack sample is + // taken when IP is within prolog/epilog, as frame chain isn't + // setup yet. Fill in the missing frame in that case. + // TODO: Currently we just assume all the addr that can't match the + // 2nd frame is in prolog/epilog. In the future, we will switch to + // pro/epi tracker(Dwarf CFI) for the precise check. + if (ParentFrame == State.getDummyRootPtr() || + ParentFrame->Address != Source) { + State.switchToFrame(Source); + if (ParentFrame != State.getDummyRootPtr()) { + if (Source == ExternalAddr) + NumMismatchedExtCallBranch++; + else + NumMismatchedProEpiBranch++; + } + } else { + State.popFrame(); + } + State.InstPtr.update(Source); +} + +void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) { + InstructionPointer &IP = State.InstPtr; + uint64_t Target = State.getCurrentLBRTarget(); + uint64_t End = IP.Address; + + if (End == ExternalAddr && Target == ExternalAddr) { + // Filter out the case when leaf external frame matches the external LBR + // target, this is a valid state, it happens that the code run into external + // address then return back. The call frame under the external frame + // remains valid and can be unwound later, just skip recording this range. 
+ NumPairedExtAddr++; + return; + } + + if (End == ExternalAddr || Target == ExternalAddr) { + // Range is invalid if only one point is external address. This means LBR + // traces contains a standalone external address failing to pair another + // one, likely due to interrupt jmp or broken perf script. Set the + // state to invalid. + NumUnpairedExtAddr++; + State.setInvalid(); + return; + } + + if (!isValidFallThroughRange(Target, End, Binary)) { + // Skip unwinding the rest of LBR trace when a bogus range is seen. + State.setInvalid(); + return; + } + + if (Binary->usePseudoProbes()) { + // We don't need to top frame probe since it should be extracted + // from the range. + // The outcome of the virtual unwinding with pseudo probes is a + // map from a context key to the address range being unwound. + // This means basically linear unwinding is not needed for pseudo + // probes. The range will be simply recorded here and will be + // converted to a list of pseudo probes to report in ProfileGenerator. + State.getParentFrame()->recordRangeCount(Target, End, Repeat); + } else { + // Unwind linear execution part. + // Split and record the range by different inline context. For example: + // [0x01] ... main:1 # Target + // [0x02] ... main:2 + // [0x03] ... main:3 @ foo:1 + // [0x04] ... main:3 @ foo:2 + // [0x05] ... main:3 @ foo:3 + // [0x06] ... main:4 + // [0x07] ... 
main:5 # End + // It will be recorded: + // [main:*] : [0x06, 0x07], [0x01, 0x02] + // [main:3 @ foo:*] : [0x03, 0x05] + while (IP.Address > Target) { + uint64_t PrevIP = IP.Address; + IP.backward(); + // Break into segments for implicit call/return due to inlining + bool SameInlinee = Binary->inlineContextEqual(PrevIP, IP.Address); + if (!SameInlinee) { + State.switchToFrame(PrevIP); + State.CurrentLeafFrame->recordRangeCount(PrevIP, End, Repeat); + End = IP.Address; + } + } + assert(IP.Address == Target && "The last one must be the target address."); + // Record the remaining range, [0x01, 0x02] in the example + State.switchToFrame(IP.Address); + State.CurrentLeafFrame->recordRangeCount(IP.Address, End, Repeat); + } +} + +void VirtualUnwinder::unwindReturn(UnwindState &State) { + // Add extra frame as we unwind through the return + const LBREntry &LBR = State.getCurrentLBR(); + uint64_t CallAddr = Binary->getCallAddrFromFrameAddr(LBR.Target); + State.switchToFrame(CallAddr); + State.pushFrame(LBR.Source); + State.InstPtr.update(LBR.Source); +} + +void VirtualUnwinder::unwindBranch(UnwindState &State) { + // TODO: Tolerate tail call for now, as we may see tail call from libraries. + // This is only for intra function branches, excluding tail calls. 
+ uint64_t Source = State.getCurrentLBRSource(); + State.switchToFrame(Source); + State.InstPtr.update(Source); +} + +std::shared_ptr FrameStack::getContextKey() { + std::shared_ptr KeyStr = + std::make_shared(); + KeyStr->Context = Binary->getExpandedContext(Stack, KeyStr->WasLeafInlined); + return KeyStr; +} + +std::shared_ptr AddressStack::getContextKey() { + std::shared_ptr KeyStr = std::make_shared(); + KeyStr->Context = Stack; + CSProfileGenerator::compressRecursionContext(KeyStr->Context); + CSProfileGenerator::trimContext(KeyStr->Context); + return KeyStr; +} + +template +void VirtualUnwinder::collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, + T &Stack) { + if (Cur->RangeSamples.empty() && Cur->BranchSamples.empty()) + return; + + std::shared_ptr Key = Stack.getContextKey(); + if (Key == nullptr) + return; + auto Ret = CtxCounterMap->emplace(Hashable(Key), SampleCounter()); + SampleCounter &SCounter = Ret.first->second; + for (auto &I : Cur->RangeSamples) + SCounter.recordRangeCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); + + for (auto &I : Cur->BranchSamples) + SCounter.recordBranchCount(std::get<0>(I), std::get<1>(I), std::get<2>(I)); +} + +template +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur, T &Stack) { + if (!Cur->isDummyRoot()) { + // Truncate the context for external frame since this isn't a real call + // context the compiler will see. + if (Cur->isExternalFrame() || !Stack.pushFrame(Cur)) { + // Process truncated context + // Start a new traversal ignoring its bottom context + T EmptyStack(Binary); + collectSamplesFromFrame(Cur, EmptyStack); + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), EmptyStack); + } + + // Keep note of untracked call site and deduplicate them + // for warning later. 
+ if (!Cur->isLeafFrame()) + UntrackedCallsites.insert(Cur->Address); + + return; + } + } + + collectSamplesFromFrame(Cur, Stack); + // Process children frame + for (const auto &Item : Cur->Children) { + collectSamplesFromFrameTrie(Item.second.get(), Stack); + } + // Recover the call stack + Stack.popFrame(); +} + +void VirtualUnwinder::collectSamplesFromFrameTrie( + UnwindState::ProfiledFrame *Cur) { + if (Binary->usePseudoProbes()) { + AddressStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } else { + FrameStack Stack(Binary); + collectSamplesFromFrameTrie(Cur, Stack); + } +} + +void VirtualUnwinder::recordBranchCount(const LBREntry &Branch, + UnwindState &State, uint64_t Repeat) { + if (Branch.Target == ExternalAddr) + return; + + // Record external-to-internal pattern on the trie root, it later can be + // used for generating head samples. + if (Branch.Source == ExternalAddr) { + State.getDummyRootPtr()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + return; + } + + if (Binary->usePseudoProbes()) { + // Same as recordRangeCount, We don't need to top frame probe since we will + // extract it from branch's source address + State.getParentFrame()->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } else { + State.CurrentLeafFrame->recordBranchCount(Branch.Source, Branch.Target, + Repeat); + } +} + +bool VirtualUnwinder::unwind(const PerfSample *Sample, uint64_t Repeat) { + // Capture initial state as starting point for unwinding. + UnwindState State(Sample, Binary); + + // Sanity check - making sure leaf of LBR aligns with leaf of stack sample + // Stack sample sometimes can be unreliable, so filter out bogus ones. + if (!State.validateInitialState()) + return false; + + NumTotalBranches += State.LBRStack.size(); + // Now process the LBR samples in parallel with stack sample + // Note that we do not reverse the LBR entry order so we can + // unwind the sample stack as we walk through LBR entries.
+ while (State.hasNextLBR()) { + State.checkStateConsistency(); + + // Do not attempt linear unwind for the leaf range as it's incomplete. + if (!State.IsLastLBR()) { + // Unwind implicit calls/returns from inlining, along the linear path, + // break into smaller sub section each with its own calling context. + unwindLinear(State, Repeat); + } + + // Save the LBR branch before it gets unwound. + const LBREntry &Branch = State.getCurrentLBR(); + if (isCallState(State)) { + // Unwind calls - we know we encountered call if LBR overlaps with + // transition between leaf the 2nd frame. Note that for calls that + // were not in the original stack sample, we should have added the + // extra frame when processing the return paired with this call. + unwindCall(State); + } else if (isReturnState(State)) { + // Unwind returns - check whether the IP is indeed at a return + // instruction + unwindReturn(State); + } else if (isValidState(State)) { + // Unwind branches + unwindBranch(State); + } else { + // Skip unwinding the rest of LBR trace. Reset the stack and update the + // state so that the rest of the trace can still be processed as if they + // do not have stack samples. + State.clearCallStack(); + State.InstPtr.update(State.getCurrentLBRSource()); + State.pushFrame(State.InstPtr.Address); + } + + State.advanceLBR(); + // Record `branch` with calling context after unwinding. + recordBranchCount(Branch, State, Repeat); + } + // As samples are aggregated on trie, record them into counter map + collectSamplesFromFrameTrie(State.getDummyRootPtr()); + + return true; +} + +std::unique_ptr +PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, + std::optional PIDFilter) { + std::unique_ptr PerfReader; + + if (PerfInput.Format == PerfFormat::UnsymbolizedProfile) { + PerfReader.reset( + new UnsymbolizedProfileReader(Binary, PerfInput.InputFile)); + return PerfReader; + } + + // For perf data input, we need to convert them into perf script first. 
+ if (PerfInput.Format == PerfFormat::PerfData) + PerfInput = + PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput, PIDFilter); + + assert((PerfInput.Format == PerfFormat::PerfScript) && + "Should be a perfscript!"); + + PerfInput.Content = + PerfScriptReader::checkPerfScriptType(PerfInput.InputFile); + if (PerfInput.Content == PerfContent::LBRStack) { + PerfReader.reset( + new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else if (PerfInput.Content == PerfContent::LBR) { + PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else { + exitWithError("Unsupported perfscript!"); + } + + return PerfReader; +} + +PerfInputFile +PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, + PerfInputFile &File, + std::optional PIDFilter) { + StringRef PerfData = File.InputFile; + // Run perf script to retrieve PIDs matching binary we're interested in. + auto PerfExecutable = sys::Process::FindInEnvPath("PATH", "perf"); + if (!PerfExecutable) { + exitWithError("Perf not found."); + } + std::string PerfPath = *PerfExecutable; + std::string PerfTraceFile = PerfData.str() + ".script.tmp"; + std::string ErrorFile = PerfData.str() + ".script.err.tmp"; + StringRef ScriptMMapArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "comm,pid", "-i", + PerfData}; + std::optional Redirects[] = {std::nullopt, // Stdin + StringRef(PerfTraceFile), // Stdout + StringRef(ErrorFile)}; // Stderr + sys::ExecuteAndWait(PerfPath, ScriptMMapArgs, std::nullopt, Redirects); + + // Collect the PIDs + TraceStream TraceIt(PerfTraceFile); + std::string PIDs; + std::unordered_set PIDSet; + while (!TraceIt.isAtEoF()) { + MMapEvent MMap; + if (isMMap2Event(TraceIt.getCurrentLine()) && + extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) { + auto It = PIDSet.emplace(MMap.PID); + if (It.second && (!PIDFilter || MMap.PID == *PIDFilter)) { + if (!PIDs.empty()) { + PIDs.append(","); + } + PIDs.append(utostr(MMap.PID)); + } + } + 
TraceIt.advance(); + } + + if (PIDs.empty()) { + exitWithError("No relevant mmap event is found in perf data."); + } + + // Run perf script again to retrieve events for PIDs collected above + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "ip,brstack", "--pid", + PIDs, "-i", PerfData}; + sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects); + + return {PerfTraceFile, PerfFormat::PerfScript, PerfContent::UnknownContent}; +} + +void PerfScriptReader::updateBinaryAddress(const MMapEvent &Event) { + // Drop the event which doesn't belong to user-provided binary + StringRef BinaryName = llvm::sys::path::filename(Event.BinaryPath); + if (Binary->getName() != BinaryName) + return; + + // Drop the event if process does not match pid filter + if (PIDFilter && Event.PID != *PIDFilter) + return; + + // Drop the event if its image is loaded at the same address + if (Event.Address == Binary->getBaseAddress()) { + Binary->setIsLoadedByMMap(true); + return; + } + + if (Event.Offset == Binary->getTextSegmentOffset()) { + // A binary image could be unloaded and then reloaded at different + // place, so update binary load address. + // Only update for the first executable segment and assume all other + // segments are loaded at consecutive memory addresses, which is the case on + // X64. + Binary->setBaseAddress(Event.Address); + Binary->setIsLoadedByMMap(true); + } else { + // Verify segments are loaded consecutively. + const auto &Offsets = Binary->getTextSegmentOffsets(); + auto It = llvm::lower_bound(Offsets, Event.Offset); + if (It != Offsets.end() && *It == Event.Offset) { + // The event is for loading a separate executable segment. 
+ auto I = std::distance(Offsets.begin(), It); + const auto &PreferredAddrs = Binary->getPreferredTextSegmentAddresses(); + if (PreferredAddrs[I] - Binary->getPreferredBaseAddress() != + Event.Address - Binary->getBaseAddress()) + exitWithError("Executable segments not loaded consecutively"); + } else { + if (It == Offsets.begin()) + exitWithError("File offset not found"); + else { + // Find the segment the event falls in. A large segment could be loaded + // via multiple mmap calls with consecutive memory addresses. + --It; + assert(*It < Event.Offset); + if (Event.Offset - *It != Event.Address - Binary->getBaseAddress()) + exitWithError("Segment not loaded by consecutive mmaps"); + } + } + } +} + +static std::string getContextKeyStr(ContextKey *K, + const ProfiledBinary *Binary) { + if (const auto *CtxKey = dyn_cast(K)) { + return SampleContext::getContextString(CtxKey->Context); + } else if (const auto *CtxKey = dyn_cast(K)) { + std::ostringstream OContextStr; + for (uint32_t I = 0; I < CtxKey->Context.size(); I++) { + if (OContextStr.str().size()) + OContextStr << " @ "; + uint64_t Address = CtxKey->Context[I]; + if (UseOffset) { + if (UseLoadableSegmentAsBase) + Address -= Binary->getFirstLoadableAddress(); + else + Address -= Binary->getPreferredBaseAddress(); + } + OContextStr << "0x" + << utohexstr(Address, + /*LowerCase=*/true); + } + return OContextStr.str(); + } else { + llvm_unreachable("unexpected key type"); + } +} + +void HybridPerfReader::unwindSamples() { + VirtualUnwinder Unwinder(&SampleCounters, Binary); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + Unwinder.unwind(Sample, Item.second); + } + + // Warn about untracked frames due to missing probes. 
+ if (ShowDetailedWarning) { + for (auto Address : Unwinder.getUntrackedCallsites()) + WithColor::warning() << "Profile context truncated due to missing probe " + << "for call instruction at " + << format("0x%" PRIx64, Address) << "\n"; + } + + emitWarningSummary(Unwinder.getUntrackedCallsites().size(), + SampleCounters.size(), + "of profiled contexts are truncated due to missing probe " + "for call instruction."); + + emitWarningSummary( + Unwinder.NumMismatchedExtCallBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to unwinding error of external frame."); + + emitWarningSummary(Unwinder.NumPairedExtAddr * 2, Unwinder.NumTotalBranches, + "of branches containing paired external address."); + + emitWarningSummary(Unwinder.NumUnpairedExtAddr, Unwinder.NumTotalBranches, + "of branches containing external address but doesn't have " + "another external address to pair, likely due to " + "interrupt jmp or broken perf script."); + + emitWarningSummary( + Unwinder.NumMismatchedProEpiBranch, Unwinder.NumTotalBranches, + "of branches'source is a call instruction but doesn't match call frame " + "stack, likely due to frame in prolog/epilog."); + + emitWarningSummary(Unwinder.NumMissingExternalFrame, + Unwinder.NumExtCallBranch, + "of artificial call branches but doesn't have an external " + "frame to match."); +} + +bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt, + SmallVectorImpl &LBRStack) { + // The raw format of LBR stack is like: + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ... 0x4005c8/0x4005dc/P/-/-/0 + // It's in FIFO order and separated by whitespace.
+ SmallVector Records; + TraceIt.getCurrentLine().split(Records, " ", -1, false); + auto WarnInvalidLBR = [](TraceStream &TraceIt) { + WithColor::warning() << "Invalid address in LBR record at line " + << TraceIt.getLineNumber() << ": " + << TraceIt.getCurrentLine() << "\n"; + }; + + // Skip the leading instruction pointer. + size_t Index = 0; + uint64_t LeadingAddr; + if (!Records.empty() && !Records[0].contains('/')) { + if (Records[0].getAsInteger(16, LeadingAddr)) { + WarnInvalidLBR(TraceIt); + TraceIt.advance(); + return false; + } + Index = 1; + } + + // Now extract LBR samples - note that we do not reverse the + // LBR entry order so we can unwind the sample stack as we walk + // through LBR entries. + while (Index < Records.size()) { + auto &Token = Records[Index++]; + if (Token.size() == 0) + continue; + + SmallVector Addresses; + Token.split(Addresses, "/"); + uint64_t Src; + uint64_t Dst; + + // Stop at broken LBR records. + if (Addresses.size() < 2 || Addresses[0].substr(2).getAsInteger(16, Src) || + Addresses[1].substr(2).getAsInteger(16, Dst)) { + WarnInvalidLBR(TraceIt); + break; + } + + // Canonicalize to use preferred load address as base address. + Src = Binary->canonicalizeVirtualAddress(Src); + Dst = Binary->canonicalizeVirtualAddress(Dst); + bool SrcIsInternal = Binary->addressIsCode(Src); + bool DstIsInternal = Binary->addressIsCode(Dst); + if (!SrcIsInternal) + Src = ExternalAddr; + if (!DstIsInternal) + Dst = ExternalAddr; + // Filter external-to-external case to reduce LBR trace size. + if (!SrcIsInternal && !DstIsInternal) + continue; + + LBRStack.emplace_back(LBREntry(Src, Dst)); + } + TraceIt.advance(); + return !LBRStack.empty(); +} + +bool PerfScriptReader::extractCallstack(TraceStream &TraceIt, + SmallVectorImpl &CallStack) { + // The raw format of call stack is like: + // 4005dc # leaf frame + // 400634 + // 400684 # root frame + // It's in bottom-up order with each frame in one line. 
+ + // Extract stack frames from sample + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + StringRef FrameStr = TraceIt.getCurrentLine().ltrim(); + uint64_t FrameAddr = 0; + if (FrameStr.getAsInteger(16, FrameAddr)) { + // We might parse a non-perf sample line like empty line and comments, + // skip it + TraceIt.advance(); + return false; + } + TraceIt.advance(); + + FrameAddr = Binary->canonicalizeVirtualAddress(FrameAddr); + // Currently intermixed frame from different binaries is not supported. + if (!Binary->addressIsCode(FrameAddr)) { + if (CallStack.empty()) + NumLeafExternalFrame++; + // Push a special value(ExternalAddr) for the external frames so that + // unwinder can still work on this with artificial Call/Return branch. + // After unwinding, the context will be truncated for external frame. + // Also deduplicate the consecutive external addresses. + if (CallStack.empty() || CallStack.back() != ExternalAddr) + CallStack.emplace_back(ExternalAddr); + continue; + } + + // We need to translate return address to call address for non-leaf frames. + if (!CallStack.empty()) { + auto CallAddr = Binary->getCallAddrFromFrameAddr(FrameAddr); + if (!CallAddr) { + // Stop at an invalid return address caused by bad unwinding. This could + // happen to frame-pointer-based unwinding and the callee functions that + // do not have the frame pointer chain set up. + InvalidReturnAddresses.insert(FrameAddr); + break; + } + FrameAddr = CallAddr; + } + + CallStack.emplace_back(FrameAddr); + } + + // Strip out the bottom external addr. 
+ if (CallStack.size() > 1 && CallStack.back() == ExternalAddr) + CallStack.pop_back(); + + // Skip other unrelated line, find the next valid LBR line + // Note that even for empty call stack, we should skip the address at the + // bottom, otherwise the following pass may generate a truncated callstack + while (!TraceIt.isAtEoF() && !TraceIt.getCurrentLine().startswith(" 0x")) { + TraceIt.advance(); + } + // Filter out broken stack sample. We may not have complete frame info + // if sample end up in prolog/epilog, the result is dangling context not + // connected to entry point. This should be relatively rare thus not much + // impact on overall profile quality. However we do want to filter them + // out to reduce the number of different calling contexts. One instance + // of such case - when sample landed in prolog/epilog, somehow stack + // walking will be broken in an unexpected way that higher frames will be + // missing. + return !CallStack.empty() && + !Binary->addressInPrologEpilog(CallStack.front()); +} + +void PerfScriptReader::warnIfMissingMMap() { + if (!Binary->getMissingMMapWarned() && !Binary->getIsLoadedByMMap()) { + WithColor::warning() << "No relevant mmap event is matched for " + << Binary->getName() + << ", will use preferred address (" + << format("0x%" PRIx64, + Binary->getPreferredBaseAddress()) + << ") as the base loading address!\n"; + // Avoid redundant warning, only warn at the first unmatched sample. + Binary->setMissingMMapWarned(true); + } +} + +void HybridPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + // The raw hybrid sample started with call stack in FILO order and followed + // immediately by LBR sample + // e.g. + // 4005dc # call stack leaf + // 400634 + // 400684 # call stack root + // 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... + // ...
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries + // + std::shared_ptr Sample = std::make_shared(); +#ifndef NDEBUG + Sample->Linenum = TraceIt.getLineNumber(); +#endif + // Parsing call stack and populate into PerfSample.CallStack + if (!extractCallstack(TraceIt, Sample->CallStack)) { + // Skip the next LBR line matched current call stack + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) + TraceIt.advance(); + return; + } + + warnIfMissingMMap(); + + if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) { + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + if (IgnoreStackSamples) { + Sample->CallStack.clear(); + } else { + // Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR + // ranges + Sample->CallStack.front() = Sample->LBRStack[0].Target; + } + // Record samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } + } else { + // LBR sample is encoded in single line after stack sample + exitWithError("'Hybrid perf sample is corrupted, No LBR sample line"); + } +} + +void PerfScriptReader::writeUnsymbolizedProfile(StringRef Filename) { + std::error_code EC; + raw_fd_ostream OS(Filename, EC, llvm::sys::fs::OF_TextWithCRLF); + if (EC) + exitWithError(EC, Filename); + writeUnsymbolizedProfile(OS); +} + +// Use ordered map to make the output deterministic +using OrderedCounterForPrint = std::map; + +void PerfScriptReader::writeUnsymbolizedProfile(raw_fd_ostream &OS) { + OrderedCounterForPrint OrderedCounters; + for (auto &CI : SampleCounters) { + OrderedCounters[getContextKeyStr(CI.first.getPtr(), Binary)] = &CI.second; + } + + auto SCounterPrinter = [&](RangeSample &Counter, StringRef Separator, + uint32_t Indent) { + OS.indent(Indent); + OS << Counter.size() << "\n"; + for (auto &I : Counter) { + uint64_t Start = I.first.first; + uint64_t End = I.first.second; + + if (UseOffset) { + if (UseLoadableSegmentAsBase) { + Start -= 
Binary->getFirstLoadableAddress(); + End -= Binary->getFirstLoadableAddress(); + } else { + Start -= Binary->getPreferredBaseAddress(); + End -= Binary->getPreferredBaseAddress(); + } + } + + OS.indent(Indent); + OS << Twine::utohexstr(Start) << Separator << Twine::utohexstr(End) << ":" + << I.second << "\n"; + } + }; + + for (auto &CI : OrderedCounters) { + uint32_t Indent = 0; + if (ProfileIsCS) { + // Context string key + OS << "[" << CI.first << "]\n"; + Indent = 2; + } + + SampleCounter &Counter = *CI.second; + SCounterPrinter(Counter.RangeCounter, "-", Indent); + SCounterPrinter(Counter.BranchCounter, "->", Indent); + } +} + +// Format of input: +// number of entries in RangeCounter +// from_1-to_1:count_1 +// from_2-to_2:count_2 +// ...... +// from_n-to_n:count_n +// number of entries in BranchCounter +// src_1->dst_1:count_1 +// src_2->dst_2:count_2 +// ...... +// src_n->dst_n:count_n +void UnsymbolizedProfileReader::readSampleCounters(TraceStream &TraceIt, + SampleCounter &SCounters) { + auto exitWithErrorForTraceLine = [](TraceStream &TraceIt) { + std::string Msg = TraceIt.isAtEoF() + ? "Invalid raw profile!" 
+ : "Invalid raw profile at line " + + Twine(TraceIt.getLineNumber()).str() + ": " + + TraceIt.getCurrentLine().str(); + exitWithError(Msg); + }; + auto ReadNumber = [&](uint64_t &Num) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + if (TraceIt.getCurrentLine().ltrim().getAsInteger(10, Num)) + exitWithErrorForTraceLine(TraceIt); + TraceIt.advance(); + }; + + auto ReadCounter = [&](RangeSample &Counter, StringRef Separator) { + uint64_t Num = 0; + ReadNumber(Num); + while (Num--) { + if (TraceIt.isAtEoF()) + exitWithErrorForTraceLine(TraceIt); + StringRef Line = TraceIt.getCurrentLine().ltrim(); + + uint64_t Count = 0; + auto LineSplit = Line.split(":"); + if (LineSplit.second.empty() || LineSplit.second.getAsInteger(10, Count)) + exitWithErrorForTraceLine(TraceIt); + + uint64_t Source = 0; + uint64_t Target = 0; + auto Range = LineSplit.first.split(Separator); + if (Range.second.empty() || Range.first.getAsInteger(16, Source) || + Range.second.getAsInteger(16, Target)) + exitWithErrorForTraceLine(TraceIt); + + if (UseOffset) { + if (UseLoadableSegmentAsBase) { + Source += Binary->getFirstLoadableAddress(); + Target += Binary->getFirstLoadableAddress(); + } else { + Source += Binary->getPreferredBaseAddress(); + Target += Binary->getPreferredBaseAddress(); + } + } + + Counter[{Source, Target}] += Count; + TraceIt.advance(); + } + }; + + ReadCounter(SCounters.RangeCounter, "-"); + ReadCounter(SCounters.BranchCounter, "->"); +} + +void UnsymbolizedProfileReader::readUnsymbolizedProfile(StringRef FileName) { + TraceStream TraceIt(FileName); + while (!TraceIt.isAtEoF()) { + std::shared_ptr Key = + std::make_shared(); + StringRef Line = TraceIt.getCurrentLine(); + // Read context stack for CS profile. 
+ if (Line.startswith("[")) { + ProfileIsCS = true; + auto I = ContextStrSet.insert(Line.str()); + SampleContext::createCtxVectorFromStr(*I.first, Key->Context); + TraceIt.advance(); + } + auto Ret = + SampleCounters.emplace(Hashable(Key), SampleCounter()); + readSampleCounters(TraceIt, Ret.first->second); + } +} + +void UnsymbolizedProfileReader::parsePerfTraces() { + readUnsymbolizedProfile(PerfTraceFile); +} + +void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + uint64_t EndAddress = 0; + for (const LBREntry &LBR : Sample->LBRStack) { + uint64_t SourceAddress = LBR.Source; + uint64_t TargetAddress = LBR.Target; + + // Record the branch if its SourceAddress is external. It can be the case an + // external source call an internal function, later this branch will be used + // to generate the function's head sample. + if (Binary->addressIsCode(TargetAddress)) { + Counter.recordBranchCount(SourceAddress, TargetAddress, Repeat); + } + + // If this not the first LBR, update the range count between TO of current + // LBR and FROM of next LBR. + uint64_t StartAddress = TargetAddress; + if (Binary->addressIsCode(StartAddress) && + Binary->addressIsCode(EndAddress) && + isValidFallThroughRange(StartAddress, EndAddress, Binary)) + Counter.recordRangeCount(StartAddress, EndAddress, Repeat); + EndAddress = SourceAddress; + } +} + +void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + // Parsing LBR stack and populate into PerfSample.LBRStack + if (extractLBRStack(TraceIt, Sample->LBRStack)) { + warnIfMissingMMap(); + // Record LBR only samples by aggregation + AggregatedSamples[Hashable(Sample)] += Count; + } +} + +void PerfScriptReader::generateUnsymbolizedProfile() { + // There is no context for LBR only sample, so initialize one entry with + // fake "empty" context key. 
+ assert(SampleCounters.empty() && + "Sample counter map should be empty before raw profile generation"); + std::shared_ptr Key = + std::make_shared(); + SampleCounters.emplace(Hashable(Key), SampleCounter()); + for (const auto &Item : AggregatedSamples) { + const PerfSample *Sample = Item.first.getPtr(); + computeCounterFromLBR(Sample, Item.second); + } +} + +uint64_t PerfScriptReader::parseAggregatedCount(TraceStream &TraceIt) { + // The aggregated count is optional, so do not skip the line and return 1 if + // it's unmatched + uint64_t Count = 1; + if (!TraceIt.getCurrentLine().getAsInteger(10, Count)) + TraceIt.advance(); + return Count; +} + +void PerfScriptReader::parseSample(TraceStream &TraceIt) { + NumTotalSample++; + uint64_t Count = parseAggregatedCount(TraceIt); + assert(Count >= 1 && "Aggregated count should be >= 1!"); + parseSample(TraceIt, Count); +} + +// Parse one PERF_RECORD_MMAP2 line into MMap. Returns false when the line does +// not match the expected layout; otherwise returns whether the file name of +// the mapped path equals Binary->getName(). +bool PerfScriptReader::extractMMap2EventForBinary(ProfiledBinary *Binary, + StringRef Line, + MMapEvent &MMap) { + // Parse a line like: + // PERF_RECORD_MMAP2 2113428/2113428: [0x7fd4efb57000(0x204000) @ 0 + // 08:04 19532229 3585508847]: r-xp /usr/lib64/libdl-2.17.so + constexpr static const char *const Pattern = + "PERF_RECORD_MMAP2 ([0-9]+)/[0-9]+: " + "\\[(0x[a-f0-9]+)\\((0x[a-f0-9]+)\\) @ " + "(0x[a-f0-9]+|0) .*\\]: [-a-z]+ (.*)"; + // Field 0 - whole line + // Field 1 - PID + // Field 2 - base address + // Field 3 - mmapped size + // Field 4 - page offset + // Field 5 - binary path + enum EventIndex { + WHOLE_LINE = 0, + PID = 1, + MMAPPED_ADDRESS = 2, + MMAPPED_SIZE = 3, + PAGE_OFFSET = 4, + BINARY_PATH = 5 + }; + + Regex RegMmap2(Pattern); + SmallVector Fields; + bool R = RegMmap2.match(Line, &Fields); + if (!R) { + std::string WarningMsg = "Cannot parse mmap event: " + Line.str() + " \n"; + WithColor::warning() << WarningMsg; + // No capture groups are available after a failed match, so indexing + // Fields below would read out of bounds. Reject the line instead. + return false; + } + Fields[PID].getAsInteger(10, MMap.PID); + Fields[MMAPPED_ADDRESS].getAsInteger(0, MMap.Address); + Fields[MMAPPED_SIZE].getAsInteger(0, MMap.Size); +
Fields[PAGE_OFFSET].getAsInteger(0, MMap.Offset); + MMap.BinaryPath = Fields[BINARY_PATH]; + if (ShowMmapEvents) { + outs() << "Mmap: Binary " << MMap.BinaryPath << " loaded at " + << format("0x%" PRIx64 ":", MMap.Address) << " \n"; + } + + StringRef BinaryName = llvm::sys::path::filename(MMap.BinaryPath); + return Binary->getName() == BinaryName; +} + +void PerfScriptReader::parseMMap2Event(TraceStream &TraceIt) { + MMapEvent MMap; + if (extractMMap2EventForBinary(Binary, TraceIt.getCurrentLine(), MMap)) + updateBinaryAddress(MMap); + TraceIt.advance(); +} + +void PerfScriptReader::parseEventOrSample(TraceStream &TraceIt) { + if (isMMap2Event(TraceIt.getCurrentLine())) + parseMMap2Event(TraceIt); + else + parseSample(TraceIt); +} + +void PerfScriptReader::parseAndAggregateTrace() { + // Trace line iterator + TraceStream TraceIt(PerfTraceFile); + while (!TraceIt.isAtEoF()) + parseEventOrSample(TraceIt); +} + +// A LBR sample is like: +// 40062f 0x5c6313f/0x5c63170/P/-/-/0 0x5c630e7/0x5c63130/P/-/-/0 ... +// A heuristic for fast detection by checking whether a +// leading " 0x" and the '/' exist. +bool PerfScriptReader::isLBRSample(StringRef Line) { + // Skip the leading instruction pointer + SmallVector Records; + Line.trim().split(Records, " ", 2, false); + if (Records.size() < 2) + return false; + if (Records[1].startswith("0x") && Records[1].contains('/')) + return true; + return false; +} + +bool PerfScriptReader::isMMap2Event(StringRef Line) { + // Shortcut to avoid the string search if possible. + if (Line.empty() || Line.size() < 50) + return false; + + if (std::isdigit(Line[0])) + return false; + + // PERF_RECORD_MMAP2 does not appear at the beginning of the line + // for ` perf script --show-mmap-events -i ...` + return Line.contains("PERF_RECORD_MMAP2"); +} + +// The raw hybrid sample is like +// e.g. +// 4005dc # call stack leaf +// 400634 +// 400684 # call stack root +// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ... +// ...
0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
+// Determine the perfscript contains hybrid samples(call stack + LBRs) by
+// checking whether there is a non-empty call stack immediately followed by
+// a LBR sample
+// Returns LBRStack when at least one hex call-stack line precedes the first
+// LBR line found, LBR when none does; exits with an error if the file contains
+// no LBR line at all.
+PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) {
+  TraceStream TraceIt(FileName);
+  uint64_t FrameAddr = 0;
+  while (!TraceIt.isAtEoF()) {
+    // Skip the aggregated count
+    if (!TraceIt.getCurrentLine().getAsInteger(10, FrameAddr))
+      TraceIt.advance();
+
+    // Detect sample with call stack
+    int32_t Count = 0;
+    while (!TraceIt.isAtEoF() &&
+           !TraceIt.getCurrentLine().ltrim().getAsInteger(16, FrameAddr)) {
+      Count++;
+      TraceIt.advance();
+    }
+    if (!TraceIt.isAtEoF()) {
+      if (isLBRSample(TraceIt.getCurrentLine())) {
+        if (Count > 0)
+          return PerfContent::LBRStack;
+        else
+          return PerfContent::LBR;
+      }
+      TraceIt.advance();
+    }
+  }
+
+  exitWithError("Invalid perf script input!");
+  return PerfContent::UnknownContent;
+}
+
+// Produce a context-sensitive profile by unwinding the samples unless stack
+// samples are ignored, in which case the base-class generation is used.
+void HybridPerfReader::generateUnsymbolizedProfile() {
+  ProfileIsCS = !IgnoreStackSamples;
+  if (ProfileIsCS)
+    unwindSamples();
+  else
+    PerfScriptReader::generateUnsymbolizedProfile();
+}
+
+// Report stack samples truncated by an invalid return address: one warning per
+// recorded address when ShowDetailedWarning is set, then a summary.
+void PerfScriptReader::warnTruncatedStack() {
+  if (ShowDetailedWarning) {
+    for (auto Address : InvalidReturnAddresses) {
+      WithColor::warning()
+          << "Truncated stack sample due to invalid return address at "
+          << format("0x%" PRIx64, Address)
+          << ", likely caused by frame pointer omission\n";
+    }
+  }
+  emitWarningSummary(
+      InvalidReturnAddresses.size(), AggregatedSamples.size(),
+      "of truncated stack samples due to invalid return address, "
+      "likely caused by frame pointer omission.");
+}
+
+// Gather the ranges implied by consecutive LBR entries of every aggregated
+// sample, classify the suspicious ones and emit per-range warnings (only with
+// ShowDetailedWarning) plus one summary per category.
+void PerfScriptReader::warnInvalidRange() {
+  std::unordered_map, uint64_t,
+                     pair_hash>
+      Ranges;
+
+  // A range starts at the current LBR entry's target and ends at the source of
+  // the entry visited just before it (kept in EndAddress); the first entry of
+  // each sample therefore only seeds EndAddress.
+  for (const auto &Item : AggregatedSamples) {
+    const PerfSample *Sample = Item.first.getPtr();
+    uint64_t Count = Item.second;
+    uint64_t EndAddress = 0;
+    for (const LBREntry &LBR : Sample->LBRStack) {
+      uint64_t SourceAddress = LBR.Source;
+      uint64_t StartAddress = LBR.Target;
+      if (EndAddress != 0)
+        Ranges[{StartAddress, EndAddress}] += Count;
+      EndAddress = SourceAddress;
+    }
+  }
+
+  if (Ranges.empty()) {
+    WithColor::warning() << "No samples in perf script!\n";
+    return;
+  }
+
+  // Detailed per-range warning, silent unless ShowDetailedWarning is set.
+  auto WarnInvalidRange = [&](uint64_t StartAddress, uint64_t EndAddress,
+                              StringRef Msg) {
+    if (!ShowDetailedWarning)
+      return;
+    WithColor::warning() << "[" << format("%8" PRIx64, StartAddress) << ","
+                         << format("%8" PRIx64, EndAddress) << "]: " << Msg
+                         << "\n";
+  };
+
+  const char *EndNotBoundaryMsg = "Range is not on instruction boundary, "
+                                  "likely due to profile and binary mismatch.";
+  const char *DanglingRangeMsg = "Range does not belong to any functions, "
+                                 "likely from PLT, .init or .fini section.";
+  const char *RangeCrossFuncMsg =
+      "Fall through range should not cross function boundaries, likely due to "
+      "profile and binary mismatch.";
+  const char *BogusRangeMsg = "Range start is after or too far from range end.";
+
+  uint64_t TotalRangeNum = 0;
+  uint64_t InstNotBoundary = 0;
+  uint64_t UnmatchedRange = 0;
+  uint64_t RangeCrossFunc = 0;
+  uint64_t BogusRange = 0;
+
+  for (auto &I : Ranges) {
+    uint64_t StartAddress = I.first.first;
+    uint64_t EndAddress = I.first.second;
+    TotalRangeNum += I.second;
+
+    // Ranges with neither end in code are counted in the total but not
+    // classified any further.
+    if (!Binary->addressIsCode(StartAddress) &&
+        !Binary->addressIsCode(EndAddress))
+      continue;
+
+    if (!Binary->addressIsCode(StartAddress) ||
+        !Binary->addressIsTransfer(EndAddress)) {
+      InstNotBoundary += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
+    }
+
+    auto *FRange = Binary->findFuncRange(StartAddress);
+    if (!FRange) {
+      UnmatchedRange += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, DanglingRangeMsg);
+      continue;
+    }
+
+    if (EndAddress >= FRange->EndAddress) {
+      RangeCrossFunc += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, RangeCrossFuncMsg);
+    }
+
+    if (Binary->addressIsCode(StartAddress) &&
+        Binary->addressIsCode(EndAddress) &&
+        !isValidFallThroughRange(StartAddress, EndAddress, Binary)) {
+      BogusRange += I.second;
+      WarnInvalidRange(StartAddress, EndAddress, BogusRangeMsg);
+    }
+  }
+
+  emitWarningSummary(
+      InstNotBoundary, TotalRangeNum,
+      "of samples are from ranges that are not on instruction boundary.");
+  emitWarningSummary(
+      UnmatchedRange, TotalRangeNum,
+      "of samples are from ranges that do not belong to any functions.");
+  emitWarningSummary(
+      RangeCrossFunc, TotalRangeNum,
+      "of samples are from ranges that do cross function boundaries.");
+  emitWarningSummary(
+      BogusRange, TotalRangeNum,
+      "of samples are from ranges that have range start after or too far from "
+      "range end acrossing the unconditinal jmp.");
+}
+
+// Reader entry point: aggregate the trace, emit the warnings, generate the
+// unsymbolized profile and, when SkipSymbolization is set, write it out.
+void PerfScriptReader::parsePerfTraces() {
+  // Parse perf traces and do aggregation.
+  parseAndAggregateTrace();
+
+  emitWarningSummary(NumLeafExternalFrame, NumTotalSample,
+                     "of samples have leaf external frame in call stack.");
+  emitWarningSummary(NumLeadingOutgoingLBR, NumTotalSample,
+                     "of samples have leading external LBR.");
+
+  // Generate unsymbolized profile.
+  warnTruncatedStack();
+  warnInvalidRange();
+  generateUnsymbolizedProfile();
+  AggregatedSamples.clear();
+
+  if (SkipSymbolization)
+    writeUnsymbolizedProfile(OutputFilename);
+}
+
+} // end namespace sampleprof
+} // end namespace llvm
diff --git a/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.h b/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.h
new file mode 100644
index 00000000000..14137e82572
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-17.0/PerfReader.h
@@ -0,0 +1,742 @@
+//===-- PerfReader.h - perfscript reader -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
+#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
+#include "ErrorHandling.h"
+#include "ProfiledBinary.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Regex.h"
+// NOTE(review): the header names of the five #include directives below are
+// missing in this copy of the patch - restore them before applying.
+#include
+#include
+#include
+#include
+#include
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+namespace sampleprof {
+
+// Stream based trace line iterator
+// The constructor opens the file and loads the first line; advance() loads the
+// next one and sets the end-of-file flag once no further line can be read.
+class TraceStream {
+  std::string CurrentLine;
+  std::ifstream Fin;
+  bool IsAtEoF = false;
+  uint64_t LineNumber = 0;
+
+public:
+  // Exits with an error when the file cannot be opened for reading.
+  TraceStream(StringRef Filename) : Fin(Filename.str()) {
+    if (!Fin.good())
+      exitWithError("Error read input perf script file", Filename);
+    advance();
+  }
+
+  // Must not be called once isAtEoF() is true (asserted). The returned
+  // StringRef refers to CurrentLine, which the next advance() overwrites.
+  StringRef getCurrentLine() {
+    assert(!IsAtEoF && "Line iterator reaches the End-of-File!");
+    return CurrentLine;
+  }
+
+  // Number of lines successfully read so far.
+  uint64_t getLineNumber() { return LineNumber; }
+
+  bool isAtEoF() { return IsAtEoF; }
+
+  // Read the next line
+  void advance() {
+    if (!std::getline(Fin, CurrentLine)) {
+      IsAtEoF = true;
+      return;
+    }
+    LineNumber++;
+  }
+};
+
+// The type of input format.
+enum PerfFormat {
+  UnknownFormat = 0,
+  PerfData = 1,            // Raw linux perf.data.
+  PerfScript = 2,          // Perf script created by the `perf script` command.
+  UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen.
+
+};
+
+// The type of perfscript content.
+enum PerfContent {
+  UnknownContent = 0,
+  LBR = 1,      // Only LBR sample.
+  LBRStack = 2, // Hybrid sample including call stack and LBR stack.
+};
+
+// An input file together with its format and content kind.
+struct PerfInputFile {
+  std::string InputFile;
+  PerfFormat Format = PerfFormat::UnknownFormat;
+  PerfContent Content = PerfContent::UnknownContent;
+};
+
+// The parsed LBR sample entry.
+struct LBREntry { + uint64_t Source = 0; + uint64_t Target = 0; + LBREntry(uint64_t S, uint64_t T) : Source(S), Target(T) {} + +#ifndef NDEBUG + void print() const { + dbgs() << "from " << format("%#010x", Source) << " to " + << format("%#010x", Target); + } +#endif +}; + +#ifndef NDEBUG +static inline void printLBRStack(const SmallVectorImpl &LBRStack) { + for (size_t I = 0; I < LBRStack.size(); I++) { + dbgs() << "[" << I << "] "; + LBRStack[I].print(); + dbgs() << "\n"; + } +} + +static inline void printCallStack(const SmallVectorImpl &CallStack) { + for (size_t I = 0; I < CallStack.size(); I++) { + dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n"; + } +} +#endif + +// Hash interface for generic data of type T +// Data should implement a \fn getHashCode and a \fn isEqual +// Currently getHashCode is non-virtual to avoid the overhead of calling vtable, +// i.e we explicitly calculate hash of derived class, assign to base class's +// HashCode. This also provides the flexibility for calculating the hash code +// incrementally(like rolling hash) during frame stack unwinding since unwinding +// only changes the leaf of frame stack. \fn isEqual is a virtual function, +// which will have perf overhead. In the future, if we redesign a better hash +// function, then we can just skip this or switch to non-virtual function(like +// just ignore comparison if hash conflicts probabilities is low) +template class Hashable { +public: + std::shared_ptr Data; + Hashable(const std::shared_ptr &D) : Data(D) {} + + // Hash code generation + struct Hash { + uint64_t operator()(const Hashable &Key) const { + // Don't make it virtual for getHashCode + uint64_t Hash = Key.Data->getHashCode(); + assert(Hash && "Should generate HashCode for it!"); + return Hash; + } + }; + + // Hash equal + struct Equal { + bool operator()(const Hashable &LHS, const Hashable &RHS) const { + // Precisely compare the data, vtable will have overhead. 
+ return LHS.Data->isEqual(RHS.Data.get()); + } + }; + + T *getPtr() const { return Data.get(); } +}; + +struct PerfSample { + // LBR stack recorded in FIFO order. + SmallVector LBRStack; + // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile + // generation + SmallVector CallStack; + + virtual ~PerfSample() = default; + uint64_t getHashCode() const { + // Use simple DJB2 hash + auto HashCombine = [](uint64_t H, uint64_t V) { + return ((H << 5) + H) + V; + }; + uint64_t Hash = 5381; + for (const auto &Value : CallStack) { + Hash = HashCombine(Hash, Value); + } + for (const auto &Entry : LBRStack) { + Hash = HashCombine(Hash, Entry.Source); + Hash = HashCombine(Hash, Entry.Target); + } + return Hash; + } + + bool isEqual(const PerfSample *Other) const { + const SmallVector &OtherCallStack = Other->CallStack; + const SmallVector &OtherLBRStack = Other->LBRStack; + + if (CallStack.size() != OtherCallStack.size() || + LBRStack.size() != OtherLBRStack.size()) + return false; + + if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) + return false; + + for (size_t I = 0; I < OtherLBRStack.size(); I++) { + if (LBRStack[I].Source != OtherLBRStack[I].Source || + LBRStack[I].Target != OtherLBRStack[I].Target) + return false; + } + return true; + } + +#ifndef NDEBUG + uint64_t Linenum = 0; + + void print() const { + dbgs() << "Line " << Linenum << "\n"; + dbgs() << "LBR stack\n"; + printLBRStack(LBRStack); + dbgs() << "Call stack\n"; + printCallStack(CallStack); + } +#endif +}; +// After parsing the sample, we record the samples by aggregating them +// into this counter. The key stores the sample data and the value is +// the sample repeat times. +using AggregatedCounter = + std::unordered_map, uint64_t, + Hashable::Hash, Hashable::Equal>; + +using SampleVector = SmallVector, 16>; + +inline bool isValidFallThroughRange(uint64_t Start, uint64_t End, + ProfiledBinary *Binary) { + // Start bigger than End is considered invalid. 
+ // LBR ranges cross the unconditional jmp are also assumed invalid. + // It's found that perf data may contain duplicate LBR entries that could form + // a range that does not reflect real execution flow on some Intel targets, + // e.g. Skylake. Such ranges are ususally very long. Exclude them since there + // cannot be a linear execution range that spans over unconditional jmp. + return Start <= End && !Binary->rangeCrossUncondBranch(Start, End); +} + +// The state for the unwinder, it doesn't hold the data but only keep the +// pointer/index of the data, While unwinding, the CallStack is changed +// dynamicially and will be recorded as the context of the sample +struct UnwindState { + // Profiled binary that current frame address belongs to + const ProfiledBinary *Binary; + // Call stack trie node + struct ProfiledFrame { + const uint64_t Address = DummyRoot; + ProfiledFrame *Parent; + SampleVector RangeSamples; + SampleVector BranchSamples; + std::unordered_map> Children; + + ProfiledFrame(uint64_t Addr = 0, ProfiledFrame *P = nullptr) + : Address(Addr), Parent(P) {} + ProfiledFrame *getOrCreateChildFrame(uint64_t Address) { + assert(Address && "Address can't be zero!"); + auto Ret = Children.emplace( + Address, std::make_unique(Address, this)); + return Ret.first->second.get(); + } + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Count) { + RangeSamples.emplace_back(std::make_tuple(Start, End, Count)); + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Count) { + BranchSamples.emplace_back(std::make_tuple(Source, Target, Count)); + } + bool isDummyRoot() { return Address == DummyRoot; } + bool isExternalFrame() { return Address == ExternalAddr; } + bool isLeafFrame() { return Children.empty(); } + }; + + ProfiledFrame DummyTrieRoot; + ProfiledFrame *CurrentLeafFrame; + // Used to fall through the LBR stack + uint32_t LBRIndex = 0; + // Reference to PerfSample.LBRStack + const SmallVector &LBRStack; + // Used to iterate the 
address range + InstructionPointer InstPtr; + // Indicate whether unwinding is currently in a bad state which requires to + // skip all subsequent unwinding. + bool Invalid = false; + UnwindState(const PerfSample *Sample, const ProfiledBinary *Binary) + : Binary(Binary), LBRStack(Sample->LBRStack), + InstPtr(Binary, Sample->CallStack.front()) { + initFrameTrie(Sample->CallStack); + } + + bool validateInitialState() { + uint64_t LBRLeaf = LBRStack[LBRIndex].Target; + uint64_t LeafAddr = CurrentLeafFrame->Address; + assert((LBRLeaf != ExternalAddr || LBRLeaf == LeafAddr) && + "External leading LBR should match the leaf frame."); + + // When we take a stack sample, ideally the sampling distance between the + // leaf IP of stack and the last LBR target shouldn't be very large. + // Use a heuristic size (0x100) to filter out broken records. + if (LeafAddr < LBRLeaf || LeafAddr - LBRLeaf >= 0x100) { + WithColor::warning() << "Bogus trace: stack tip = " + << format("%#010x", LeafAddr) + << ", LBR tip = " << format("%#010x\n", LBRLeaf); + return false; + } + return true; + } + + void checkStateConsistency() { + assert(InstPtr.Address == CurrentLeafFrame->Address && + "IP should align with context leaf"); + } + + void setInvalid() { Invalid = true; } + bool hasNextLBR() const { return LBRIndex < LBRStack.size(); } + uint64_t getCurrentLBRSource() const { return LBRStack[LBRIndex].Source; } + uint64_t getCurrentLBRTarget() const { return LBRStack[LBRIndex].Target; } + const LBREntry &getCurrentLBR() const { return LBRStack[LBRIndex]; } + bool IsLastLBR() const { return LBRIndex == 0; } + bool getLBRStackSize() const { return LBRStack.size(); } + void advanceLBR() { LBRIndex++; } + ProfiledFrame *getParentFrame() { return CurrentLeafFrame->Parent; } + + void pushFrame(uint64_t Address) { + CurrentLeafFrame = CurrentLeafFrame->getOrCreateChildFrame(Address); + } + + void switchToFrame(uint64_t Address) { + if (CurrentLeafFrame->Address == Address) + return; + CurrentLeafFrame 
= CurrentLeafFrame->Parent->getOrCreateChildFrame(Address); + } + + void popFrame() { CurrentLeafFrame = CurrentLeafFrame->Parent; } + + void clearCallStack() { CurrentLeafFrame = &DummyTrieRoot; } + + void initFrameTrie(const SmallVectorImpl &CallStack) { + ProfiledFrame *Cur = &DummyTrieRoot; + for (auto Address : reverse(CallStack)) { + Cur = Cur->getOrCreateChildFrame(Address); + } + CurrentLeafFrame = Cur; + } + + ProfiledFrame *getDummyRootPtr() { return &DummyTrieRoot; } +}; + +// Base class for sample counter key with context +struct ContextKey { + uint64_t HashCode = 0; + virtual ~ContextKey() = default; + uint64_t getHashCode() { + if (HashCode == 0) + genHashCode(); + return HashCode; + } + virtual void genHashCode() = 0; + virtual bool isEqual(const ContextKey *K) const { + return HashCode == K->HashCode; + }; + + // Utilities for LLVM-style RTTI + enum ContextKind { CK_StringBased, CK_AddrBased }; + const ContextKind Kind; + ContextKind getKind() const { return Kind; } + ContextKey(ContextKind K) : Kind(K){}; +}; + +// String based context id +struct StringBasedCtxKey : public ContextKey { + SampleContextFrameVector Context; + + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_StringBased; + } + + bool isEqual(const ContextKey *K) const override { + const StringBasedCtxKey *Other = dyn_cast(K); + return Context == Other->Context; + } + + void genHashCode() override { + HashCode = hash_value(SampleContextFrames(Context)); + } +}; + +// Address-based context id +struct AddrBasedCtxKey : public ContextKey { + SmallVector Context; + + bool WasLeafInlined; + AddrBasedCtxKey() : ContextKey(CK_AddrBased), WasLeafInlined(false){}; + static bool classof(const ContextKey *K) { + return K->getKind() == CK_AddrBased; + } + + bool isEqual(const ContextKey *K) const override { + const AddrBasedCtxKey *Other = dyn_cast(K); + return Context == 
Other->Context; + } + + void genHashCode() override { + HashCode = hash_combine_range(Context.begin(), Context.end()); + } +}; + +// The counter of branch samples for one function indexed by the branch, +// which is represented as the source and target offset pair. +using BranchSample = std::map, uint64_t>; +// The counter of range samples for one function indexed by the range, +// which is represented as the start and end offset pair. +using RangeSample = std::map, uint64_t>; +// Wrapper for sample counters including range counter and branch counter +struct SampleCounter { + RangeSample RangeCounter; + BranchSample BranchCounter; + + void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { + assert(Start <= End && "Invalid instruction range"); + RangeCounter[{Start, End}] += Repeat; + } + void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { + BranchCounter[{Source, Target}] += Repeat; + } +}; + +// Sample counter with context to support context-sensitive profile +using ContextSampleCounterMap = + std::unordered_map, SampleCounter, + Hashable::Hash, Hashable::Equal>; + +struct FrameStack { + SmallVector Stack; + ProfiledBinary *Binary; + FrameStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +struct AddressStack { + SmallVector Stack; + ProfiledBinary *Binary; + AddressStack(ProfiledBinary *B) : Binary(B) {} + bool pushFrame(UnwindState::ProfiledFrame *Cur) { + assert(!Cur->isExternalFrame() && + "External frame's not expected for context stack."); + Stack.push_back(Cur->Address); + return true; + } + + void popFrame() { + if (!Stack.empty()) + Stack.pop_back(); + } + std::shared_ptr getContextKey(); +}; + +/* +As in hybrid sample we have a 
group of LBRs and the most recent sampling call
+stack, we can walk through those LBRs to infer more call stacks which would be
+used as context for profile. VirtualUnwinder is the class to do the call stack
+unwinding based on LBR state. Two types of unwinding are processed here:
+1) LBR unwinding and 2) linear range unwinding.
+Specifically, for each LBR entry(can be classified into call, return, regular
+branch), LBR unwinding will replay the operation by pushing, popping or
+switching leaf frame towards the call stack and since the initial call stack
+is most recently sampled, the replay should be in anti-execution order, i.e. for
+the regular case, pop the call stack when LBR is call, push frame on call stack
+when LBR is return. After each LBR is processed, it also needs to align with the
+next LBR by going through instructions from previous LBR's target to current
+LBR's source, which is the linear unwinding. As instructions from a linear range
+can come from different functions by inlining, linear unwinding will do the range
+splitting and record counters by the range with same inline context. Over those
+unwinding processes we will record each call stack as context id and LBR/linear
+range as sample counter for further CS profile generation.
+*/
+class VirtualUnwinder {
+public:
+  VirtualUnwinder(ContextSampleCounterMap *Counter, ProfiledBinary *B)
+      : CtxCounterMap(Counter), Binary(B) {}
+  bool unwind(const PerfSample *Sample, uint64_t Repeat);
+  std::set &getUntrackedCallsites() { return UntrackedCallsites; }
+
+  // Statistics counters, all starting at zero.
+  uint64_t NumTotalBranches = 0;
+  uint64_t NumExtCallBranch = 0;
+  uint64_t NumMissingExternalFrame = 0;
+  uint64_t NumMismatchedProEpiBranch = 0;
+  uint64_t NumMismatchedExtCallBranch = 0;
+  uint64_t NumUnpairedExtAddr = 0;
+  uint64_t NumPairedExtAddr = 0;
+
+private:
+  // True when the current LBR entry's source is the external-address marker.
+  bool isSourceExternal(UnwindState &State) const {
+    return State.getCurrentLBRSource() == ExternalAddr;
+  }
+
+  // True when the current LBR entry's target is the external-address marker.
+  bool isTargetExternal(UnwindState &State) const {
+    return State.getCurrentLBRTarget() == ExternalAddr;
+  }
+
+  // Determine whether the return source is from external code by checking if
+  // the target's the next inst is a call inst.
+  bool isReturnFromExternal(UnwindState &State) const {
+    return isSourceExternal(State) &&
+           (Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) != 0);
+  }
+
+  // If the source is external address but it's not the `return` case, treat it
+  // as a call from external.
+  bool isCallFromExternal(UnwindState &State) const {
+    return isSourceExternal(State) &&
+           Binary->getCallAddrFromFrameAddr(State.getCurrentLBRTarget()) == 0;
+  }
+
+  // The current LBR entry is a call when its source is a call instruction or
+  // when it is a call coming from external code; never for an invalid state.
+  bool isCallState(UnwindState &State) const {
+    // The tail call frame is always missing here in stack sample, we will
+    // use a specific tail call tracker to infer it.
+    if (!isValidState(State))
+      return false;
+
+    if (Binary->addressIsCall(State.getCurrentLBRSource()))
+      return true;
+
+    return isCallFromExternal(State);
+  }
+
+  // The current LBR entry is a return when its source is a return instruction
+  // or when it returns from external code; never for an invalid state.
+  bool isReturnState(UnwindState &State) const {
+    if (!isValidState(State))
+      return false;
+
+    // Simply check addressIsReturn, as ret is always reliable, both for
+    // regular call and tail call.
+    if (Binary->addressIsReturn(State.getCurrentLBRSource()))
+      return true;
+
+    return isReturnFromExternal(State);
+  }
+
+  bool isValidState(UnwindState &State) const { return !State.Invalid; }
+
+  void unwindCall(UnwindState &State);
+  void unwindLinear(UnwindState &State, uint64_t Repeat);
+  void unwindReturn(UnwindState &State);
+  void unwindBranch(UnwindState &State);
+
+  template
+  void collectSamplesFromFrame(UnwindState::ProfiledFrame *Cur, T &Stack);
+  // Collect the samples on each trie node by DFS traversal
+  template
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur, T &Stack);
+  void collectSamplesFromFrameTrie(UnwindState::ProfiledFrame *Cur);
+
+  void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
+                        uint64_t Repeat);
+  void recordBranchCount(const LBREntry &Branch, UnwindState &State,
+                         uint64_t Repeat);
+
+  ContextSampleCounterMap *CtxCounterMap;
+  // Profiled binary that current frame address belongs to
+  ProfiledBinary *Binary;
+  // Keep track of all untracked callsites
+  std::set UntrackedCallsites;
+};
+
+// Read perf trace to parse the events and samples.
+class PerfReaderBase {
+public:
+  // NOTE(review): PerfTraceFile is a StringRef, which does not own its
+  // characters; presumably the caller keeps the path string alive for the
+  // lifetime of the reader - confirm.
+  PerfReaderBase(ProfiledBinary *B, StringRef PerfTrace)
+      : Binary(B), PerfTraceFile(PerfTrace) {
+    // Initialize the base address to preferred address.
+    Binary->setBaseAddress(Binary->getPreferredBaseAddress());
+  };
+  virtual ~PerfReaderBase() = default;
+  static std::unique_ptr
+  create(ProfiledBinary *Binary, PerfInputFile &PerfInput,
+         std::optional PIDFilter);
+
+  // Entry of the reader to parse multiple perf traces
+  virtual void parsePerfTraces() = 0;
+  const ContextSampleCounterMap &getSampleCounters() const {
+    return SampleCounters;
+  }
+  bool profileIsCS() { return ProfileIsCS; }
+
+protected:
+  ProfiledBinary *Binary = nullptr;
+  StringRef PerfTraceFile;
+
+  ContextSampleCounterMap SampleCounters;
+  bool ProfileIsCS = false;
+
+  uint64_t NumTotalSample = 0;
+  uint64_t NumLeafExternalFrame = 0;
+  uint64_t NumLeadingOutgoingLBR = 0;
+};
+
+// Read perf script to parse the events and samples.
+class PerfScriptReader : public PerfReaderBase {
+public:
+  PerfScriptReader(ProfiledBinary *B, StringRef PerfTrace,
+                   std::optional PID)
+      : PerfReaderBase(B, PerfTrace), PIDFilter(PID){};
+
+  // Entry of the reader to parse multiple perf traces
+  void parsePerfTraces() override;
+  // Generate perf script from perf data
+  static PerfInputFile
+  convertPerfDataToTrace(ProfiledBinary *Binary, PerfInputFile &File,
+                         std::optional PIDFilter);
+  // Extract perf script type by peeking at the input
+  static PerfContent checkPerfScriptType(StringRef FileName);
+
+protected:
+  // The parsed MMap event
+  struct MMapEvent {
+    uint64_t PID = 0;
+    uint64_t Address = 0;
+    uint64_t Size = 0;
+    uint64_t Offset = 0;
+    StringRef BinaryPath;
+  };
+
+  // Check whether a given line is LBR sample
+  static bool isLBRSample(StringRef Line);
+  // Check whether a given line is MMAP event
+  static bool isMMap2Event(StringRef Line);
+  // Parse a single line of a PERF_RECORD_MMAP2 event looking for a
+  // mapping between the binary name and its memory layout.
+  static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line,
+                                         MMapEvent &MMap);
+  // Update base address based on mmap events
+  void updateBinaryAddress(const MMapEvent &Event);
+  // Parse mmap event and update binary address
+  void parseMMap2Event(TraceStream &TraceIt);
+  // Parse perf events/samples and do aggregation
+  void parseAndAggregateTrace();
+  // Parse either an MMAP event or a perf sample
+  void parseEventOrSample(TraceStream &TraceIt);
+  // Warn if the relevant mmap event is missing.
+  void warnIfMissingMMap();
+  // Emit the accumulated warnings about truncated stack samples.
+  void warnTruncatedStack();
+  // Warn if range is invalid.
+  void warnInvalidRange();
+  // Extract call stack from the perf trace lines
+  bool extractCallstack(TraceStream &TraceIt,
+                        SmallVectorImpl &CallStack);
+  // Extract LBR stack from one perf trace line
+  bool extractLBRStack(TraceStream &TraceIt,
+                       SmallVectorImpl &LBRStack);
+  uint64_t parseAggregatedCount(TraceStream &TraceIt);
+  // Parse one sample from multiple perf lines, override this for different
+  // sample type
+  void parseSample(TraceStream &TraceIt);
+  // An aggregated count is given to indicate how many times the sample is
+  // repeated.
+  virtual void parseSample(TraceStream &TraceIt, uint64_t Count){};
+  void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat);
+  // Post process the profile after trace aggregation, we will do simple range
+  // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample).
+  virtual void generateUnsymbolizedProfile();
+  void writeUnsymbolizedProfile(StringRef Filename);
+  void writeUnsymbolizedProfile(raw_fd_ostream &OS);
+
+  // Samples with the repeating time generated by the perf reader
+  AggregatedCounter AggregatedSamples;
+  // Keep track of all invalid return addresses
+  std::set InvalidReturnAddresses;
+  // PID for the process of interest
+  std::optional PIDFilter;
+};
+
+/*
+  The reader of LBR only perf script.
+  A typical LBR sample is like:
+    40062f 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
+          ... 0x4005c8/0x4005dc/P/-/-/0
+*/
+class LBRPerfReader : public PerfScriptReader {
+public:
+  LBRPerfReader(ProfiledBinary *Binary, StringRef PerfTrace,
+                std::optional PID)
+      : PerfScriptReader(Binary, PerfTrace, PID){};
+  // Parse the LBR only sample.
+  void parseSample(TraceStream &TraceIt, uint64_t Count) override;
+};
+
+/*
+  Hybrid perf script includes a group of hybrid samples(LBRs + call stack),
+  which is used to generate CS profile. An example of hybrid sample:
+    4005dc    # call stack leaf
+    400634
+    400684    # call stack root
+    0x4005c8/0x4005dc/P/-/-/0   0x40062f/0x4005b0/P/-/-/0 ...
+          ... 0x4005c8/0x4005dc/P/-/-/0    # LBR Entries
+*/
+class HybridPerfReader : public PerfScriptReader {
+public:
+  HybridPerfReader(ProfiledBinary *Binary, StringRef PerfTrace,
+                   std::optional PID)
+      : PerfScriptReader(Binary, PerfTrace, PID){};
+  // Parse the hybrid sample including the call and LBR line
+  void parseSample(TraceStream &TraceIt, uint64_t Count) override;
+  // Overrides the base-class profile generation for hybrid samples.
+  void generateUnsymbolizedProfile() override;
+
+private:
+  // Unwind the hybrid samples after aggregation
+  void unwindSamples();
+};
+
+/*
+  Format of unsymbolized profile:
+
+    [frame1 @ frame2 @ ...]  # If it's a CS profile
+      number of entries in RangeCounter
+      from_1-to_1:count_1
+      from_2-to_2:count_2
+      ......
+      from_n-to_n:count_n
+      number of entries in BranchCounter
+      src_1->dst_1:count_1
+      src_2->dst_2:count_2
+      ......
+      src_n->dst_n:count_n
+    [frame1 @ frame2 @ ...]  # Next context
+      ......
+
+Note that non-CS profile doesn't have the empty `[]` context.
+*/ +class UnsymbolizedProfileReader : public PerfReaderBase { +public: + UnsymbolizedProfileReader(ProfiledBinary *Binary, StringRef PerfTrace) + : PerfReaderBase(Binary, PerfTrace){}; + void parsePerfTraces() override; + +private: + void readSampleCounters(TraceStream &TraceIt, SampleCounter &SCounters); + void readUnsymbolizedProfile(StringRef Filename); + + std::unordered_set ContextStrSet; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.cpp b/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.cpp new file mode 100644 index 00000000000..97bc8d59b6c --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.cpp @@ -0,0 +1,1274 @@ +//===-- ProfileGenerator.cpp - Profile Generator ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "ProfileGenerator.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/ProfileData/ProfileCommon.h" +#include +#include +#include +#include + +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::Required, + cl::desc("Output profile file")); +static cl::alias OutputA("o", cl::desc("Alias for --output"), + cl::aliasopt(OutputFilename)); + +static cl::opt OutputFormat( + "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary), + cl::values( + clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"), + clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"), + clEnumValN(SPF_Text, "text", "Text encoding"), + clEnumValN(SPF_GCC, "gcc", + "GCC encoding (only meaningful for 
-sample)"))); + +static cl::opt UseMD5( + "use-md5", cl::Hidden, + cl::desc("Use md5 to represent function names in the output profile (only " + "meaningful for -extbinary)")); + +static cl::opt PopulateProfileSymbolList( + "populate-profile-symbol-list", cl::init(false), cl::Hidden, + cl::desc("Populate profile symbol list (only meaningful for -extbinary)")); + +static cl::opt FillZeroForAllFuncs( + "fill-zero-for-all-funcs", cl::init(false), cl::Hidden, + cl::desc("Attribute all functions' range with zero count " + "even it's not hit by any samples.")); + +static cl::opt RecursionCompression( + "compress-recursion", + cl::desc("Compressing recursion by deduplicating adjacent frame " + "sequences up to the specified size. -1 means no size limit."), + cl::Hidden, + cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize)); + +static cl::opt + TrimColdProfile("trim-cold-profile", + cl::desc("If the total count of the profile is smaller " + "than threshold, it will be trimmed.")); + +static cl::opt CSProfMergeColdContext( + "csprof-merge-cold-context", cl::init(true), + cl::desc("If the total count of context profile is smaller than " + "the threshold, it will be merged into context-less base " + "profile.")); + +static cl::opt CSProfMaxColdContextDepth( + "csprof-max-cold-context-depth", cl::init(1), + cl::desc("Keep the last K contexts while merging cold profile. 1 means the " + "context-less base profile")); + +static cl::opt CSProfMaxContextDepth( + "csprof-max-context-depth", + cl::desc("Keep the last K contexts while merging profile. 
-1 means no " + "depth limit."), + cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth)); + +static cl::opt HotFunctionDensityThreshold( + "hot-function-density-threshold", llvm::cl::init(1000), + llvm::cl::desc( + "specify density threshold for hot functions (default: 1000)"), + llvm::cl::Optional); +static cl::opt ShowDensity("show-density", llvm::cl::init(false), + llvm::cl::desc("show profile density details"), + llvm::cl::Optional); + +static cl::opt UpdateTotalSamples( + "update-total-samples", llvm::cl::init(false), + llvm::cl::desc( + "Update total samples by accumulating all its body samples."), + llvm::cl::Optional); + +static cl::opt GenCSNestedProfile( + "gen-cs-nested-profile", cl::Hidden, cl::init(true), + cl::desc("Generate nested function profiles for CSSPGO")); + +cl::opt InferMissingFrames( + "infer-missing-frames", llvm::cl::init(true), + llvm::cl::desc( + "Infer missing call frames due to compiler tail call elimination."), + llvm::cl::Optional); + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +extern cl::opt ProfileSummaryCutoffHot; +extern cl::opt UseContextLessSummary; + +namespace sampleprof { + +// Initialize the MaxCompressionSize to -1 which means no size limit +int32_t CSProfileGenerator::MaxCompressionSize = -1; + +int CSProfileGenerator::MaxContextDepth = -1; + +bool ProfileGeneratorBase::UseFSDiscriminator = false; + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, + const ContextSampleCounterMap *SampleCounters, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + Generator.reset(new CSProfileGenerator(Binary, SampleCounters)); + } else { + Generator.reset(new ProfileGenerator(Binary, SampleCounters)); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +std::unique_ptr +ProfileGeneratorBase::create(ProfiledBinary *Binary, 
SampleProfileMap &Profiles, + bool ProfileIsCS) { + std::unique_ptr Generator; + if (ProfileIsCS) { + Generator.reset(new CSProfileGenerator(Binary, Profiles)); + } else { + Generator.reset(new ProfileGenerator(Binary, std::move(Profiles))); + } + ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); + FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); + + return Generator; +} + +void ProfileGeneratorBase::write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap) { + // Populate profile symbol list if extended binary format is used. + ProfileSymbolList SymbolList; + + if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) { + Binary->populateSymbolListFromDWARF(SymbolList); + Writer->setProfileSymbolList(&SymbolList); + } + + if (std::error_code EC = Writer->write(ProfileMap)) + exitWithError(std::move(EC)); +} + +void ProfileGeneratorBase::write() { + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat); + if (std::error_code EC = WriterOrErr.getError()) + exitWithError(EC, OutputFilename); + + if (UseMD5) { + if (OutputFormat != SPF_Ext_Binary) + WithColor::warning() << "-use-md5 is ignored. Specify " + "--format=extbinary to enable it\n"; + else + WriterOrErr.get()->setUseMD5(); + } + + write(std::move(WriterOrErr.get()), ProfileMap); +} + +void ProfileGeneratorBase::showDensitySuggestion(double Density) { + if (Density == 0.0) + WithColor::warning() << "The --profile-summary-cutoff-hot option may be " + "set too low. Please check your command.\n"; + else if (Density < HotFunctionDensityThreshold) + WithColor::warning() + << "Sample PGO is estimated to optimize better with " + << format("%.1f", HotFunctionDensityThreshold / Density) + << "x more samples. 
/// Split a set of possibly overlapping sample ranges into disjoint ranges.
///
/// \param DisjointRanges  Output map; receives one entry per disjoint,
///                        inclusive [begin, end] address range together with
///                        the summed count of all input ranges covering it.
/// \param Ranges          Input ranges keyed by (begin, end) with their count.
///                        Each range must satisfy begin <= end (asserted).
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
                                              const RangeSample &Ranges) {

  /*
  Regions may overlap with each other. Using the boundary info, find all
  disjoint ranges and their sample count. A BoundaryPoint holds the summed
  counts of the samples that begin/end at this point.

  |<--100-->|           Sample1
  |<------200------>|   Sample2
  A         B       C

  In the example above,
  Sample1 begins at A, ends at B, its value is 100.
  Sample2 begins at A, ends at C, its value is 200.
  For A, BeginCount is the sum of the samples beginning at A, which is 300,
  and no sample ends at A, so EndCount is 0.
  Then boundary points A, B, and C with begin/end counts are:
  A: (300, 0)
  B: (0, 100)
  C: (0, 200)

  (In the struct below, "no sample begins/ends here" is represented by
  UINT64_MAX rather than 0, so that it can be told apart from a real
  zero-count range.)
  */
  struct BoundaryPoint {
    // Sum of sample counts beginning at this point
    uint64_t BeginCount = UINT64_MAX;
    // Sum of sample counts ending at this point
    uint64_t EndCount = UINT64_MAX;
    // Is the begin point of a zero range.
    bool IsZeroRangeBegin = false;
    // Is the end point of a zero range.
    bool IsZeroRangeEnd = false;

    // Accumulate a begin count, switching from "unset" to 0 on first use.
    void addBeginCount(uint64_t Count) {
      if (BeginCount == UINT64_MAX)
        BeginCount = 0;
      BeginCount += Count;
    }

    // Accumulate an end count, switching from "unset" to 0 on first use.
    void addEndCount(uint64_t Count) {
      if (EndCount == UINT64_MAX)
        EndCount = 0;
      EndCount += Count;
    }
  };

  /*
  For the above example, with boundary points, the following logic finds two
  disjoint regions of

  [A,B]:   300
  [B+1,C]: 200

  If there is a boundary point that is both a begin and an end, the point
  itself becomes a separate disjoint region. For example, if we have original
  ranges of

  |<--- 100 --->|
                |<--- 200 --->|
  A             B             C

  there are three boundary points with their begin/end counts of

  A: (100, 0)
  B: (200, 100)
  C: (0, 200)

  the disjoint ranges would be

  [A, B-1]: 100
  [B, B]:   300
  [B+1, C]: 200.

  Example for zero value range:

        |<--- 100 --->|
                            |<--- 200 --->|
  |<----------------- 0 ------------------------>|
  A     B             C     D             E      F

  [A, B-1]  : 0
  [B, C]    : 100
  [C+1, D-1]: 0
  [D, E]    : 200
  [E+1, F]  : 0
  */
  // NOTE(review): the template arguments of std::map are missing in the text
  // under review; the usage below implies a map from address (uint64_t) to
  // BoundaryPoint -- confirm against the real file.
  std::map Boundaries;

  // Pass 1: record, for every range, its begin and end boundary points.
  for (const auto &Item : Ranges) {
    assert(Item.first.first <= Item.first.second &&
           "Invalid instruction range");
    auto &BeginPoint = Boundaries[Item.first.first];
    auto &EndPoint = Boundaries[Item.first.second];
    uint64_t Count = Item.second;

    BeginPoint.addBeginCount(Count);
    EndPoint.addEndCount(Count);
    if (Count == 0) {
      BeginPoint.IsZeroRangeBegin = true;
      EndPoint.IsZeroRangeEnd = true;
    }
  }

  // Pass 2: walk the boundary points and emit the disjoint ranges. `Count` is
  // the running sum of all ranges currently open; ZeroRangeDepth is adjusted
  // by the IsZeroRangeBegin/IsZeroRangeEnd flags of the points visited.
  //
  // Use UINT64_MAX to indicate there is no existing range between BeginAddress
  // and the next valid address
  uint64_t BeginAddress = UINT64_MAX;
  int ZeroRangeDepth = 0;
  uint64_t Count = 0;
  for (const auto &Item : Boundaries) {
    uint64_t Address = Item.first;
    const BoundaryPoint &Point = Item.second;
    if (Point.BeginCount != UINT64_MAX) {
      // Close the range that was open up to just before this begin point.
      // NOTE(review): if the previous point was an end at Address - 1, then
      // BeginAddress == Address here and the key written is the inverted pair
      // {Address, Address - 1}; verify that consumers tolerate such entries.
      if (BeginAddress != UINT64_MAX)
        DisjointRanges[{BeginAddress, Address - 1}] = Count;
      Count += Point.BeginCount;
      BeginAddress = Address;
      ZeroRangeDepth += Point.IsZeroRangeBegin;
    }
    if (Point.EndCount != UINT64_MAX) {
      assert((BeginAddress != UINT64_MAX) &&
             "First boundary point cannot be 'end' point");
      // The end address is inclusive, so the next range starts at Address + 1.
      DisjointRanges[{BeginAddress, Address}] = Count;
      assert(Count >= Point.EndCount && "Mismatched live ranges");
      Count -= Point.EndCount;
      BeginAddress = Address + 1;
      ZeroRangeDepth -= Point.IsZeroRangeEnd;
      // If the remaining count is zero and it's no longer in a zero range, this
      // means we consume all the ranges before, thus mark BeginAddress as
      // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
      //  [<---- 10 ---->]
      //                       [<---- 20 ---->]
      //   A             B     C              D
      // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
      // have the [B+1, C-1] zero range.
      if (Count == 0 && ZeroRangeDepth == 0)
        BeginAddress = UINT64_MAX;
    }
  }
}
R.get() : 0; + if (PreviousCount <= Count) { + FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator, + Count - PreviousCount); + } +} + +void ProfileGeneratorBase::updateTotalSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateTotalSamples(); + } +} + +void ProfileGeneratorBase::updateCallsiteSamples() { + for (auto &Item : ProfileMap) { + FunctionSamples &FunctionProfile = Item.second; + FunctionProfile.updateCallsiteSamples(); + } +} + +void ProfileGeneratorBase::updateFunctionSamples() { + updateCallsiteSamples(); + + if (UpdateTotalSamples) + updateTotalSamples(); +} + +void ProfileGeneratorBase::collectProfiledFunctions() { + std::unordered_set ProfiledFunctions; + if (collectFunctionsFromRawProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else if (collectFunctionsFromLLVMProfile(ProfiledFunctions)) + Binary->setProfiledFunctions(ProfiledFunctions); + else + llvm_unreachable("Unsupported input profile"); +} + +bool ProfileGeneratorBase::collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions) { + if (!SampleCounters) + return false; + // Go through all the stacks, ranges and branches in sample counters, use + // the start of the range to look up the function it belongs and record the + // function. 
+ for (const auto &CI : *SampleCounters) { + if (const auto *CtxKey = dyn_cast(CI.first.getPtr())) { + for (auto StackAddr : CtxKey->Context) { + if (FuncRange *FRange = Binary->findFuncRange(StackAddr)) + ProfiledFunctions.insert(FRange->Func); + } + } + + for (auto Item : CI.second.RangeCounter) { + uint64_t StartAddress = Item.first.first; + if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) + ProfiledFunctions.insert(FRange->Func); + } + + for (auto Item : CI.second.BranchCounter) { + uint64_t SourceAddress = Item.first.first; + uint64_t TargetAddress = Item.first.second; + if (FuncRange *FRange = Binary->findFuncRange(SourceAddress)) + ProfiledFunctions.insert(FRange->Func); + if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) + ProfiledFunctions.insert(FRange->Func); + } + } + return true; +} + +bool ProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (const auto &FS : ProfileMap) { + if (auto *Func = Binary->getBinaryFunction(FS.first.getName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +bool CSProfileGenerator::collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) { + for (auto *Node : ContextTracker) { + if (!Node->getFuncName().empty()) + if (auto *Func = Binary->getBinaryFunction(Node->getFuncName())) + ProfiledFunctions.insert(Func); + } + return true; +} + +FunctionSamples & +ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { + SampleContext Context(FuncName); + auto Ret = ProfileMap.emplace(Context, FunctionSamples()); + if (Ret.second) { + FunctionSamples &FProfile = Ret.first->second; + FProfile.setContext(Context); + } + return Ret.first->second; +} + +void ProfileGenerator::generateProfile() { + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) + Binary->decodePseudoProbe(); + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + 
/// Remove cold function profiles from the generator's ProfileMap.
///
/// Does nothing unless the -trim-cold-profile option (TrimColdProfile) is set.
/// Otherwise every entry of ProfileMap whose total sample count is strictly
/// below \p ColdCntThreshold is erased.
///
/// \param Profiles          NOTE(review): this parameter is never read in the
///                          body; the member ProfileMap is scanned and
///                          modified instead. Confirm whether callers always
///                          pass ProfileMap here.
/// \param ColdCntThreshold  Profiles with fewer total samples than this are
///                          dropped.
void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
                                        uint64_t ColdCntThreshold) {
  if (!TrimColdProfile)
    return;

  // Collect the keys of the cold profiles first; erasing happens in a second
  // pass so that ProfileMap is not modified while it is being iterated.
  // NOTE(review): the element type of std::vector is missing in the text under
  // review; it holds copies of ProfileMap keys -- confirm against the real
  // file.
  std::vector ColdProfiles;
  for (const auto &I : ProfileMap) {
    if (I.second.getTotalSamples() < ColdCntThreshold)
      ColdProfiles.emplace_back(I.first);
  }

  // Remove the cold profile from ProfileMap.
  for (const auto &I : ColdProfiles)
    ProfileMap.erase(I);
}
+ extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter, + false); + + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(Probe, FrameVec, true); + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, Count); + FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), + Count); + if (Probe->isEntry()) + FunctionProfile.addHeadSamples(Count); + } +} + +void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + + // Record called target sample and its count. 
+ SampleContextFrameVector FrameVec; + Binary->getInlineContextForProbe(CallProbe, FrameVec, true); + + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + FrameVec.back().Location.Discriminator, + CalleeName, Count); + } + } +} + +FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( + const SampleContextFrameVector &FrameVec, uint64_t Count) { + // Get top level profile + FunctionSamples *FunctionProfile = + &getTopLevelFunctionProfile(FrameVec[0].FuncName); + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + + for (size_t I = 1; I < FrameVec.size(); I++) { + LineLocation Callsite( + FrameVec[I - 1].Location.LineOffset, + getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator)); + FunctionSamplesMap &SamplesMap = + FunctionProfile->functionSamplesAt(Callsite); + auto Ret = + SamplesMap.emplace(FrameVec[I].FuncName.str(), FunctionSamples()); + if (Ret.second) { + SampleContext Context(FrameVec[I].FuncName); + Ret.first->second.setContext(Context); + } + FunctionProfile = &Ret.first->second; + FunctionProfile->addTotalSamples(Count); + if (Binary->usePseudoProbes()) { + const auto *FuncDesc = Binary->getFuncDescForGUID( + Function::getGUID(FunctionProfile->getName())); + FunctionProfile->setFunctionHash(FuncDesc->FuncHash); + } + } + + return *FunctionProfile; +} + +RangeSample +ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) { + RangeSample Ranges(RangeCounter.begin(), RangeCounter.end()); + if (FillZeroForAllFuncs) { + for (auto &FuncI : Binary->getAllBinaryFunctions()) { + for (auto &R : FuncI.second.Ranges) { + Ranges[{R.first, R.second - 1}] += 0; + } + } + } else { + // For each range, 
we search for all ranges of the function it belongs to + // and initialize it with zero count, so it remains zero if doesn't hit any + // samples. This is to be consistent with compiler that interpret zero count + // as unexecuted(cold). + for (const auto &I : RangeCounter) { + uint64_t StartAddress = I.first.first; + for (const auto &Range : Binary->getRanges(StartAddress)) + Ranges[{Range.first, Range.second - 1}] += 0; + } + } + RangeSample DisjointRanges; + findDisjointRanges(DisjointRanges, Ranges); + return DisjointRanges; +} + +void ProfileGenerator::populateBodySamplesForAllFunctions( + const RangeSample &RangeCounter) { + for (const auto &Range : preprocessRangeCounter(RangeCounter)) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const SampleContextFrameVector FrameVec = + Binary->getFrameLocationStack(IP.Address); + if (!FrameVec.empty()) { + // FIXME: As accumulating total count per instruction caused some + // regression, we changed to accumulate total count per byte as a + // workaround. Tuning hotness threshold on the compiler side might be + // necessary in the future. + FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, Count * Binary->getInstSize(IP.Address)); + updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(), + Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +StringRef +ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) { + // Get the function range by branch target if it's a call branch. 
+ auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress); + + // We won't accumulate sample count for a range whose start is not the real + // function entry such as outlined function or inner labels. + if (!FRange || !FRange->IsFuncEntry) + return StringRef(); + + return FunctionSamples::getCanonicalFnName(FRange->getFuncName()); +} + +void ProfileGenerator::populateBoundarySamplesForAllFunctions( + const BranchSample &BranchCounters) { + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + // Record called target sample and its count. + const SampleContextFrameVector &FrameVec = + Binary->getCachedFrameLocationStack(SourceAddress); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = + getLeafProfileAndAddTotalSamples(FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + CalleeName, Count); + } + // Add head samples for callee. 
+ FunctionSamples &CalleeProfile = getTopLevelFunctionProfile(CalleeName); + CalleeProfile.addHeadSamples(Count); + } +} + +void ProfileGeneratorBase::calculateAndShowDensity( + const SampleProfileMap &Profiles) { + double Density = calculateDensity(Profiles, HotCountThreshold); + showDensitySuggestion(Density); +} + +FunctionSamples * +CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined) { + FunctionSamples *FProfile = ContextNode->getFunctionSamples(); + if (!FProfile) { + FSamplesList.emplace_back(); + FProfile = &FSamplesList.back(); + FProfile->setName(ContextNode->getFuncName()); + ContextNode->setFunctionSamples(FProfile); + } + // Update ContextWasInlined attribute for existing contexts. + // The current function can be called in two ways: + // - when processing a probe of the current frame + // - when processing the entry probe of an inlinee's frame, which + // is then used to update the callsite count of the current frame. + // The two can happen in any order, hence here we are making sure + // `ContextWasInlined` is always set as expected. + // TODO: Note that the former does not always happen if no probes of the + // current frame has samples, and if the latter happens, we could lose the + // attribute. This should be fixed. 
+ if (WasLeafInlined) + FProfile->getContext().setAttribute(ContextWasInlined); + return FProfile; +} + +ContextTrieNode * +CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined) { + ContextTrieNode *ContextNode = + ContextTracker.getOrCreateContextPath(Context, true); + getOrCreateFunctionSamples(ContextNode, WasLeafInlined); + return ContextNode; +} + +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + + collectProfiledFunctions(); + + if (Binary->usePseudoProbes()) { + Binary->decodePseudoProbe(); + if (InferMissingFrames) + initializeMissingFrameInferrer(); + } + + if (SampleCounters) { + if (Binary->usePseudoProbes()) { + generateProbeBasedProfile(); + } else { + generateLineNumBasedProfile(); + } + } + + if (Binary->getTrackFuncContextSize()) + computeSizeForProfiledFunctions(); + + postProcessProfiles(); +} + +void CSProfileGenerator::initializeMissingFrameInferrer() { + Binary->getMissingContextInferrer()->initialize(SampleCounters); +} + +void CSProfileGenerator::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + Binary->inferMissingFrames(Context, NewContext); +} + +void CSProfileGenerator::computeSizeForProfiledFunctions() { + for (auto *Func : Binary->getProfiledFunctions()) + Binary->computeInlinedContextSizeForFunc(Func); + + // Flush the symbolizer to save memory. 
+ Binary->flushSymbolizer(); +} + +void CSProfileGenerator::updateFunctionSamples() { + for (auto *Node : ContextTracker) { + FunctionSamples *FSamples = Node->getFunctionSamples(); + if (FSamples) { + if (UpdateTotalSamples) + FSamples->updateTotalSamples(); + FSamples->updateCallsiteSamples(); + } + } +} + +void CSProfileGenerator::generateLineNumBasedProfile() { + for (const auto &CI : *SampleCounters) { + const auto *CtxKey = cast(CI.first.getPtr()); + + ContextTrieNode *ContextNode = &getRootContext(); + // Sample context will be empty if the jump is an external-to-internal call + // pattern, the head samples should be added for the internal function. + if (!CtxKey->Context.empty()) { + // Get or create function profile for the range + ContextNode = + getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined); + // Fill in function body samples + populateBodySamplesForFunction(*ContextNode->getFunctionSamples(), + CI.second.RangeCounter); + } + // Fill in boundary sample counts as well as call site samples for calls + populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter); + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. + populateInferredFunctionSamples(getRootContext()); + + updateFunctionSamples(); +} + +void CSProfileGenerator::populateBodySamplesForFunction( + FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) { + // Compute disjoint ranges first, so we can use MAX + // for calculating count for each location. 
+ RangeSample Ranges; + findDisjointRanges(Ranges, RangeCounter); + for (const auto &Range : Ranges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + // Disjoint ranges have introduce zero-filled gap that + // doesn't belong to current context, filter them out. + if (Count == 0) + continue; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address); + if (LeafLoc) { + // Recording body sample for this specific context + updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count); + FunctionProfile.addTotalSamples(Count); + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +void CSProfileGenerator::populateBoundarySamplesForFunction( + ContextTrieNode *Node, const BranchSample &BranchCounters) { + + for (const auto &Entry : BranchCounters) { + uint64_t SourceAddress = Entry.first.first; + uint64_t TargetAddress = Entry.first.second; + uint64_t Count = Entry.second; + assert(Count != 0 && "Unexpected zero weight branch"); + + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + + ContextTrieNode *CallerNode = Node; + LineLocation CalleeCallSite(0, 0); + if (CallerNode != &getRootContext()) { + // Record called target sample and its count + auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress); + if (LeafLoc) { + CallerNode->getFunctionSamples()->addCalledTargetSamples( + LeafLoc->Location.LineOffset, + getBaseDiscriminator(LeafLoc->Location.Discriminator), CalleeName, + Count); + // Record head sample for called target(callee) + CalleeCallSite = LeafLoc->Location; + } + } + + ContextTrieNode *CalleeNode = + 
CallerNode->getOrCreateChildContext(CalleeCallSite, CalleeName); + FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode); + CalleeProfile->addHeadSamples(Count); + } +} + +void CSProfileGenerator::populateInferredFunctionSamples( + ContextTrieNode &Node) { + // There is no call jmp sample between the inliner and inlinee, we need to use + // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s + // sample depends on child(inlinee)'s sample, so traverse the tree in + // post-order. + for (auto &It : Node.getAllChildContext()) + populateInferredFunctionSamples(It.second); + + FunctionSamples *CalleeProfile = Node.getFunctionSamples(); + if (!CalleeProfile) + return; + // If we already have head sample counts, we must have value profile + // for call sites added already. Skip to avoid double counting. + if (CalleeProfile->getHeadSamples()) + return; + ContextTrieNode *CallerNode = Node.getParentContext(); + // If we don't have context, nothing to do for caller's call site. + // This could happen for entry point function. + if (CallerNode == &getRootContext()) + return; + + LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc(); + FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode); + // Since we don't have call count for inlined functions, we + // estimate it from inlinee's profile using entry body sample. + uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate(); + // If we don't have samples with location, use 1 to indicate live. 
+ if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size()) + EstimatedCallCount = 1; + CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + Node.getFuncName(), EstimatedCallCount); + CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset, + CallerLeafFrameLoc.Discriminator, + EstimatedCallCount); + CallerProfile.addTotalSamples(EstimatedCallCount); +} + +void CSProfileGenerator::convertToProfileMap( + ContextTrieNode &Node, SampleContextFrameVector &Context) { + FunctionSamples *FProfile = Node.getFunctionSamples(); + if (FProfile) { + Context.emplace_back(Node.getFuncName(), LineLocation(0, 0)); + // Save the new context for future references. + SampleContextFrames NewContext = *Contexts.insert(Context).first; + auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile)); + FunctionSamples &NewProfile = Ret.first->second; + NewProfile.getContext().setContext(NewContext); + Context.pop_back(); + } + + for (auto &It : Node.getAllChildContext()) { + ContextTrieNode &ChildNode = It.second; + Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc()); + convertToProfileMap(ChildNode, Context); + Context.pop_back(); + } +} + +void CSProfileGenerator::convertToProfileMap() { + assert(ProfileMap.empty() && + "ProfileMap should be empty before converting from the trie"); + assert(IsProfileValidOnTrie && + "Do not convert the trie twice, it's already destroyed"); + + SampleContextFrameVector Context; + for (auto &It : getRootContext().getAllChildContext()) + convertToProfileMap(It.second, Context); + + IsProfileValidOnTrie = false; +} + +void CSProfileGenerator::postProcessProfiles() { + // Compute hot/cold threshold based on profile. This will be used for cold + // context profile merging/trimming. + computeSummaryAndThreshold(); + + // Run global pre-inliner to adjust/merge context profile based on estimated + // inline decisions. 
+ if (EnableCSPreInliner) { + ContextTracker.populateFuncToCtxtMap(); + CSPreInliner(ContextTracker, *Binary, Summary.get()).run(); + // Turn off the profile merger by default unless it is explicitly enabled. + if (!CSProfMergeColdContext.getNumOccurrences()) + CSProfMergeColdContext = false; + } + + convertToProfileMap(); + + // Trim and merge cold context profile using cold threshold above. + if (TrimColdProfile || CSProfMergeColdContext) { + SampleContextTrimmer(ProfileMap) + .trimAndMergeColdContextProfiles( + HotCountThreshold, TrimColdProfile, CSProfMergeColdContext, + CSProfMaxColdContextDepth, EnableCSPreInliner); + } + + // Merge function samples of CS profile to calculate profile density. + sampleprof::SampleProfileMap ContextLessProfiles; + for (const auto &I : ProfileMap) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + + calculateAndShowDensity(ContextLessProfiles); + if (GenCSNestedProfile) { + ProfileConverter CSConverter(ProfileMap); + CSConverter.convertCSProfiles(); + FunctionSamples::ProfileIsCS = false; + } +} + +void ProfileGeneratorBase::computeSummaryAndThreshold( + SampleProfileMap &Profiles) { + SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); + Summary = Builder.computeSummaryForProfiles(Profiles); + HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold( + (Summary->getDetailedSummary())); + ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold( + (Summary->getDetailedSummary())); +} + +void CSProfileGenerator::computeSummaryAndThreshold() { + // Always merge and use context-less profile map to compute summary. 
+ SampleProfileMap ContextLessProfiles; + ContextTracker.createContextLessProfileMap(ContextLessProfiles); + + // Set the flag below to avoid merging the profile again in + // computeSummaryAndThreshold + FunctionSamples::ProfileIsCS = false; + assert( + (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) && + "Don't set --profile-summary-contextless to false for profile " + "generation"); + ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles); + // Recover the old value. + FunctionSamples::ProfileIsCS = true; +} + +void ProfileGeneratorBase::extractProbesFromRange( + const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges) { + const RangeSample *PRanges = &RangeCounter; + RangeSample Ranges; + if (FindDisjointRanges) { + findDisjointRanges(Ranges, RangeCounter); + PRanges = &Ranges; + } + + for (const auto &Range : *PRanges) { + uint64_t RangeBegin = Range.first.first; + uint64_t RangeEnd = Range.first.second; + uint64_t Count = Range.second; + + InstructionPointer IP(Binary, RangeBegin, true); + // Disjoint ranges may have range in the middle of two instr, + // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range + // can be Addr1+1 to Addr2-1. We should ignore such range. + if (IP.Address > RangeEnd) + continue; + + do { + const AddressProbesMap &Address2ProbesMap = + Binary->getAddress2ProbesMap(); + auto It = Address2ProbesMap.find(IP.Address); + if (It != Address2ProbesMap.end()) { + for (const auto &Probe : It->second) { + ProbeCounter[&Probe] += Count; + } + } + } while (IP.advance() && IP.Address <= RangeEnd); + } +} + +static void extractPrefixContextStack(SampleContextFrameVector &ContextStack, + const SmallVectorImpl &AddrVec, + ProfiledBinary *Binary) { + SmallVector Probes; + for (auto Address : reverse(AddrVec)) { + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(Address); + // These could be the cases when a probe is not found at a calliste. 
Cutting + // off the context from here since the inliner will not know how to consume + // a context with unknown callsites. + // 1. for functions that are not sampled when + // --decode-probe-for-profiled-functions-only is on. + // 2. for a merged callsite. Callsite merging may cause the loss of original + // probe IDs. + // 3. for an external callsite. + if (!CallProbe) + break; + Probes.push_back(CallProbe); + } + + std::reverse(Probes.begin(), Probes.end()); + + // Extract context stack for reusing, leaf context stack will be added + // compressed while looking up function profile. + for (const auto *P : Probes) { + Binary->getInlineContextForProbe(P, ContextStack, true); + } +} + +void CSProfileGenerator::generateProbeBasedProfile() { + // Enable pseudo probe functionalities in SampleProf + FunctionSamples::ProfileIsProbeBased = true; + for (const auto &CI : *SampleCounters) { + const AddrBasedCtxKey *CtxKey = + dyn_cast(CI.first.getPtr()); + // Fill in function body samples from probes, also infer caller's samples + // from callee's probe + populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey); + // Fill in boundary samples for a call probe + populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey); + } +} + +void CSProfileGenerator::populateBodySamplesWithProbes( + const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) { + ProbeCounterMap ProbeCounter; + // Extract the top frame probes by looking up each address among the range in + // the Address2ProbeMap + extractProbesFromRange(RangeCounter, ProbeCounter); + std::unordered_map> + FrameSamples; + for (const auto &PI : ProbeCounter) { + const MCDecodedPseudoProbe *Probe = PI.first; + uint64_t Count = PI.second; + // Disjoint ranges have introduced zero-filled gaps that + // don't belong to current context, filter them out.
+ if (!Probe->isBlock() || Count == 0) + continue; + + ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe); + FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples(); + // Record the current frame and FunctionProfile whenever samples are + // collected for non-dangling probes. This is for reporting all of the + // zero count probes of the frame later. + FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile); + FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(), + Count); + FunctionProfile.addTotalSamples(Count); + if (Probe->isEntry()) { + FunctionProfile.addHeadSamples(Count); + // Look up for the caller's function profile + const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe); + ContextTrieNode *CallerNode = ContextNode->getParentContext(); + if (InlinerDesc != nullptr && CallerNode != &getRootContext()) { + // Since the context id will be compressed, we have to use callee's + // context id to infer caller's context id to ensure they share the + // same context prefix. + uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset; + uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator; + assert(CallerIndex && + "Inferred caller's location index shouldn't be zero!"); + assert(!CallerDiscriminator && + "Callsite probe should not have a discriminator!"); + FunctionSamples &CallerProfile = + *getOrCreateFunctionSamples(CallerNode); + CallerProfile.setFunctionHash(InlinerDesc->FuncHash); + CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count); + CallerProfile.addTotalSamples(Count); + CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator, + ContextNode->getFuncName(), Count); + } + } + } + + // Assign zero count for remaining probes without sample hits to + // differentiate from probes optimized away, of which the counts are unknown + // and will be inferred by the compiler.
+ for (auto &I : FrameSamples) { + for (auto *FunctionProfile : I.second) { + for (auto *Probe : I.first->getProbes()) { + FunctionProfile->addBodySamples(Probe->getIndex(), + Probe->getDiscriminator(), 0); + } + } + } +} + +void CSProfileGenerator::populateBoundarySamplesWithProbes( + const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) { + for (const auto &BI : BranchCounter) { + uint64_t SourceAddress = BI.first.first; + uint64_t TargetAddress = BI.first.second; + uint64_t Count = BI.second; + const MCDecodedPseudoProbe *CallProbe = + Binary->getCallProbeForAddr(SourceAddress); + if (CallProbe == nullptr) + continue; + FunctionSamples &FunctionProfile = + getFunctionProfileForLeafProbe(CtxKey, CallProbe); + FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count); + FunctionProfile.addTotalSamples(Count); + StringRef CalleeName = getCalleeNameForAddress(TargetAddress); + if (CalleeName.size() == 0) + continue; + FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(), + CallProbe->getDiscriminator(), + CalleeName, Count); + } +} + +ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + + const SmallVectorImpl *PContext = &CtxKey->Context; + SmallVector NewContext; + + if (InferMissingFrames) { + SmallVector Context = CtxKey->Context; + // Append leaf frame for a complete inference. + Context.push_back(LeafProbe->getAddress()); + inferMissingFrames(Context, NewContext); + // Pop out the leaf probe that was pushed in above. 
+ NewContext.pop_back(); + PContext = &NewContext; + } + + SampleContextFrameVector ContextStack; + extractPrefixContextStack(ContextStack, *PContext, Binary); + + // Explicitly copy the context for appending the leaf context + SampleContextFrameVector NewContextStack(ContextStack.begin(), + ContextStack.end()); + Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true); + // For leaf inlined context with the top frame, we should strip off the top + // frame's probe id, like: + // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar" + auto LeafFrame = NewContextStack.back(); + LeafFrame.Location = LineLocation(0, 0); + NewContextStack.pop_back(); + // Compress the context string except for the leaf frame + CSProfileGenerator::compressRecursionContext(NewContextStack); + CSProfileGenerator::trimContext(NewContextStack); + NewContextStack.push_back(LeafFrame); + + const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid()); + bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite(); + ContextTrieNode *ContextNode = + getOrCreateContextNode(NewContextStack, WasLeafInlined); + ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash); + return ContextNode; +} + +FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe( + const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) { + return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples(); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.h b/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.h new file mode 100644 index 00000000000..471792ec713 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/ProfileGenerator.h @@ -0,0 +1,390 @@ +//===-- ProfileGenerator.h - Profile Generator -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H +#include "CSPreInliner.h" +#include "ErrorHandling.h" +#include "PerfReader.h" +#include "ProfiledBinary.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfWriter.h" +#include +#include + +using namespace llvm; +using namespace sampleprof; + +namespace llvm { +namespace sampleprof { + +using ProbeCounterMap = + std::unordered_map; + +// This base class for profile generation of sample-based PGO. We reuse all +// structures relating to function profiles and profile writers as seen in +// /ProfileData/SampleProf.h. +class ProfileGeneratorBase { + +public: + ProfileGeneratorBase(ProfiledBinary *Binary) : Binary(Binary){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : Binary(Binary), SampleCounters(Counters){}; + ProfileGeneratorBase(ProfiledBinary *Binary, + const SampleProfileMap &&Profiles) + : Binary(Binary), ProfileMap(std::move(Profiles)){}; + + virtual ~ProfileGeneratorBase() = default; + static std::unique_ptr + create(ProfiledBinary *Binary, const ContextSampleCounterMap *Counters, + bool profileIsCS); + static std::unique_ptr + create(ProfiledBinary *Binary, SampleProfileMap &ProfileMap, + bool profileIsCS); + virtual void generateProfile() = 0; + void write(); + + static uint32_t + getDuplicationFactor(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 1 + : llvm::DILocation::getDuplicationFactorFromDiscriminator( + Discriminator); + } + + static uint32_t + getBaseDiscriminator(unsigned Discriminator, + bool UseFSD = ProfileGeneratorBase::UseFSDiscriminator) { + return UseFSD ? 
Discriminator + : DILocation::getBaseDiscriminatorFromDiscriminator( + Discriminator, /* IsFSDiscriminator */ false); + } + + static bool UseFSDiscriminator; + +protected: + // Use SampleProfileWriter to serialize profile map + void write(std::unique_ptr Writer, + SampleProfileMap &ProfileMap); + /* + For each region boundary point, mark if it is begin or end (or both) of + the region. Boundary points are inclusive. Log the sample count as well + so we can use it when we compute the sample count of each disjoint region + later. Note that there might be multiple ranges with different sample + count that share same begin/end point. We need to accumulate the sample + count for the boundary point for such case, because for the example + below, + + |<--100-->| + |<------200------>| + A B C + + sample count for disjoint region [A,B] would be 300. + */ + void findDisjointRanges(RangeSample &DisjointRanges, + const RangeSample &Ranges); + + // Go through each address from range to extract the top frame probe by + // looking up in the Address2ProbeMap + void extractProbesFromRange(const RangeSample &RangeCounter, + ProbeCounterMap &ProbeCounter, + bool FindDisjointRanges = true); + + // Helper function for updating body sample for a leaf location in + // FunctionProfile + void updateBodySamplesforFunctionProfile(FunctionSamples &FunctionProfile, + const SampleContextFrame &LeafLoc, + uint64_t Count); + + void updateFunctionSamples(); + + void updateTotalSamples(); + + void updateCallsiteSamples(); + + StringRef getCalleeNameForAddress(uint64_t TargetAddress); + + void computeSummaryAndThreshold(SampleProfileMap &ProfileMap); + + void calculateAndShowDensity(const SampleProfileMap &Profiles); + + double calculateDensity(const SampleProfileMap &Profiles, + uint64_t HotCntThreshold); + + void showDensitySuggestion(double Density); + + void collectProfiledFunctions(); + + bool collectFunctionsFromRawProfile( + std::unordered_set &ProfiledFunctions); + + // Collect profiled 
Functions for llvm sample profile input. + virtual bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) = 0; + + // Thresholds from profile summary to answer isHotCount/isColdCount queries. + uint64_t HotCountThreshold; + + uint64_t ColdCountThreshold; + + ProfiledBinary *Binary = nullptr; + + std::unique_ptr Summary; + + // Used by SampleProfileWriter + SampleProfileMap ProfileMap; + + const ContextSampleCounterMap *SampleCounters = nullptr; +}; + +class ProfileGenerator : public ProfileGeneratorBase { + +public: + ProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + ProfileGenerator(ProfiledBinary *Binary, const SampleProfileMap &&Profiles) + : ProfileGeneratorBase(Binary, std::move(Profiles)){}; + void generateProfile() override; + +private: + void generateLineNumBasedProfile(); + void generateProbeBasedProfile(); + RangeSample preprocessRangeCounter(const RangeSample &RangeCounter); + FunctionSamples &getTopLevelFunctionProfile(StringRef FuncName); + // Helper function to get the leaf frame's FunctionProfile by traversing the + // inline stack and meanwhile it adds the total samples for each frame's + // function profile. 
+ FunctionSamples & + getLeafProfileAndAddTotalSamples(const SampleContextFrameVector &FrameVec, + uint64_t Count); + void populateBodySamplesForAllFunctions(const RangeSample &RangeCounter); + void + populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); + void + populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); + void populateBoundarySamplesWithProbesForAllFunctions( + const BranchSample &BranchCounters); + void postProcessProfiles(); + void trimColdProfiles(const SampleProfileMap &Profiles, + uint64_t ColdCntThreshold); + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; +}; + +class CSProfileGenerator : public ProfileGeneratorBase { +public: + CSProfileGenerator(ProfiledBinary *Binary, + const ContextSampleCounterMap *Counters) + : ProfileGeneratorBase(Binary, Counters){}; + CSProfileGenerator(ProfiledBinary *Binary, SampleProfileMap &Profiles) + : ProfileGeneratorBase(Binary), ContextTracker(Profiles, nullptr){}; + void generateProfile() override; + + // Trim the context stack at a given depth. + template + static void trimContext(SmallVectorImpl &S, int Depth = MaxContextDepth) { + if (Depth < 0 || static_cast(Depth) >= S.size()) + return; + std::copy(S.begin() + S.size() - static_cast(Depth), S.end(), + S.begin()); + S.resize(Depth); + } + + // Remove adjacent repeated context sequences up to a given sequence length, + // -1 means no size limit. Note that repeated sequences are identified based + // on the exact call site, this is finer granularity than function recursion. + template + static void compressRecursionContext(SmallVectorImpl &Context, + int32_t CSize = MaxCompressionSize) { + uint32_t I = 1; + uint32_t HS = static_cast(Context.size() / 2); + uint32_t MaxDedupSize = + CSize == -1 ? 
HS : std::min(static_cast(CSize), HS); + auto BeginIter = Context.begin(); + // Use an in-place algorithm to save memory copy + // End indicates the end location of current iteration's data + uint32_t End = 0; + // Deduplicate from length 1 to the max possible size of a repeated + // sequence. + while (I <= MaxDedupSize) { + // This is a linear algorithm that deduplicates adjacent repeated + // sequences of size I. The deduplication detection runs on a sliding + // window whose size is 2*I and it keeps sliding the window to deduplicate + // the data inside. Once duplication is detected, deduplicate it by + // skipping the right half part of the window, otherwise just copy back + // the new one by appending them at the back of End pointer(for the next + // iteration). + // + // For example: + // Input: [a1, a2, b1, b2] + // (Added index to distinguish the same char, the origin is [a, a, b, + // b], the size of the dedup window is 2(I = 1) at the beginning) + // + // 1) The initial status is a dummy window[null, a1], then just copy the + // right half of the window(End = 0), then slide the window. + // Result: [a1], a2, b1, b2 (End points to the element right before ], + // after ] is the data of the previous iteration) + // + // 2) Next window is [a1, a2]. Since a1 == a2, then skip the right half of + // the window i.e. the duplication happens. Only slide the window. + // Result: [a1], a2, b1, b2 + // + // 3) Next window is [a2, b1], copy the right half of the window(b1 is + // new) to the End and slide the window. + // Result: [a1, b1], b1, b2 + // + // 4) Next window is [b1, b2], same to 2), skip b2. + // Result: [a1, b1], b1, b2 + // After resize, it will be [a, b] + + // Use pointers like below to do comparison inside the window + // [a b c a b c] + // | | | | | + // LeftBoundary Left Right Left+I Right+I + // A duplication is found if Left < LeftBoundary.
+ + int32_t Right = I - 1; + End = I; + int32_t LeftBoundary = 0; + while (Right + I < Context.size()) { + // To avoid scanning a part of a sequence repeatedly, it finds out + // the common suffix of two halves in the window. The common suffix will + // serve as the common prefix of next possible pair of duplicate + // sequences. The non-common part will be ignored and never scanned + // again. + + // For example. + // Input: [a, b1], c1, b2, c2 + // I = 2 + // + // 1) For the window [a, b1, c1, b2], non-common-suffix for the right + // part is 'c1', copy it and only slide the window 1 step. + // Result: [a, b1, c1], b2, c2 + // + // 2) Next window is [b1, c1, b2, c2], so duplication happens. + // Result after resize: [a, b, c] + + int32_t Left = Right; + while (Left >= LeftBoundary && Context[Left] == Context[Left + I]) { + // Find the longest suffix inside the window. When stops, Left points + // at the diverging point in the current sequence. + Left--; + } + + bool DuplicationFound = (Left < LeftBoundary); + // Don't need to recheck the data before Right + LeftBoundary = Right + 1; + if (DuplicationFound) { + // Duplication found, skip right half of the window. + Right += I; + } else { + // Copy the non-common-suffix part of the adjacent sequence. + std::copy(BeginIter + Right + 1, BeginIter + Left + I + 1, + BeginIter + End); + End += Left + I - Right; + // Only slide the window by the size of non-common-suffix + Right = Left + I; + } + } + // Don't forget the remaining part that's not scanned. + std::copy(BeginIter + Right + 1, Context.end(), BeginIter + End); + End += Context.size() - Right - 1; + I++; + Context.resize(End); + MaxDedupSize = std::min(static_cast(End / 2), MaxDedupSize); + } + } + +private: + void generateLineNumBasedProfile(); + + FunctionSamples *getOrCreateFunctionSamples(ContextTrieNode *ContextNode, + bool WasLeafInlined = false); + + // Lookup or create ContextTrieNode for the context, FunctionSamples is + // created inside this function.
+ ContextTrieNode *getOrCreateContextNode(const SampleContextFrames Context, + bool WasLeafInlined = false); + + // For profiled only functions, on-demand compute their inline context + // function byte size which is used by the pre-inliner. + void computeSizeForProfiledFunctions(); + // Post processing for profiles before writing out, such as merging + // and trimming cold profiles, running preinliner on profiles. + void postProcessProfiles(); + + void populateBodySamplesForFunction(FunctionSamples &FunctionProfile, + const RangeSample &RangeCounters); + + void populateBoundarySamplesForFunction(ContextTrieNode *CallerNode, + const BranchSample &BranchCounters); + + void populateInferredFunctionSamples(ContextTrieNode &Node); + + void updateFunctionSamples(); + + void generateProbeBasedProfile(); + + // Fill in function body samples from probes + void populateBodySamplesWithProbes(const RangeSample &RangeCounter, + const AddrBasedCtxKey *CtxKey); + // Fill in boundary samples for a call probe + void populateBoundarySamplesWithProbes(const BranchSample &BranchCounter, + const AddrBasedCtxKey *CtxKey); + + ContextTrieNode * + getContextNodeForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + // Helper function to get FunctionSamples for the leaf probe + FunctionSamples & + getFunctionProfileForLeafProbe(const AddrBasedCtxKey *CtxKey, + const MCDecodedPseudoProbe *LeafProbe); + + void convertToProfileMap(ContextTrieNode &Node, + SampleContextFrameVector &Context); + + void convertToProfileMap(); + + void computeSummaryAndThreshold(); + + bool collectFunctionsFromLLVMProfile( + std::unordered_set &ProfiledFunctions) override; + + void initializeMissingFrameInferrer(); + + // Given an input `Context`, output `NewContext` with inferred missing tail + // call frames.
+ void inferMissingFrames(const SmallVectorImpl &Context, + SmallVectorImpl &NewContext); + + ContextTrieNode &getRootContext() { return ContextTracker.getRootContext(); }; + + // The container for holding the FunctionSamples used by context trie. + std::list FSamplesList; + + // Underlying context table serves for sample profile writer. + std::unordered_set Contexts; + + SampleContextTracker ContextTracker; + + bool IsProfileValidOnTrie = true; + +public: + // Deduplicate adjacent repeated context sequences up to a given sequence + // length. -1 means no size limit. + static int32_t MaxCompressionSize; + static int MaxContextDepth; +}; + +} // end namespace sampleprof +} // end namespace llvm + +#endif diff --git a/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.cpp b/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.cpp new file mode 100644 index 00000000000..4755f758a82 --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.cpp @@ -0,0 +1,977 @@ +//===-- ProfiledBinary.cpp - Binary decoder ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ProfiledBinary.h" +#include "ErrorHandling.h" +#include "MissingFrameInferrer.h" +#include "ProfileGenerator.h" +#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" +#include "llvm/Demangle/Demangle.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/TargetParser/Triple.h" +#include + +#define DEBUG_TYPE "load-binary" + +using namespace llvm; +using namespace sampleprof; + +cl::opt ShowDisassemblyOnly("show-disassembly-only", + cl::desc("Print disassembled code.")); + +cl::opt ShowSourceLocations("show-source-locations", + cl::desc("Print source locations.")); + +static cl::opt + ShowCanonicalFnName("show-canonical-fname", + cl::desc("Print canonical function name.")); + +static cl::opt ShowPseudoProbe( + "show-pseudo-probe", + cl::desc("Print pseudo probe section and disassembled info.")); + +static cl::opt UseDwarfCorrelation( + "use-dwarf-correlation", + cl::desc("Use dwarf for profile correlation even when binary contains " + "pseudo probe.")); + +static cl::opt + DWPPath("dwp", cl::init(""), + cl::desc("Path of .dwp file. When not specified, it will be " + ".dwp in the same directory as the main binary.")); + +static cl::list DisassembleFunctions( + "disassemble-functions", cl::CommaSeparated, + cl::desc("List of functions to print disassembly for. Accept demangled " + "names only. 
Only work with show-disassembly-only")); + +extern cl::opt ShowDetailedWarning; +extern cl::opt InferMissingFrames; + +namespace llvm { +namespace sampleprof { + +static const Target *getTarget(const ObjectFile *Obj) { + Triple TheTriple = Obj->makeTriple(); + std::string Error; + std::string ArchName; + const Target *TheTarget = + TargetRegistry::lookupTarget(ArchName, TheTriple, Error); + if (!TheTarget) + exitWithError(Error, Obj->getFileName()); + return TheTarget; +} + +void BinarySizeContextTracker::addInstructionForContext( + const SampleContextFrameVector &Context, uint32_t InstrSize) { + ContextTrieNode *CurNode = &RootContext; + bool IsLeaf = true; + for (const auto &Callsite : reverse(Context)) { + StringRef CallerName = Callsite.FuncName; + LineLocation CallsiteLoc = IsLeaf ? LineLocation(0, 0) : Callsite.Location; + CurNode = CurNode->getOrCreateChildContext(CallsiteLoc, CallerName); + IsLeaf = false; + } + + CurNode->addFunctionSize(InstrSize); +} + +uint32_t +BinarySizeContextTracker::getFuncSizeForContext(const ContextTrieNode *Node) { + ContextTrieNode *CurrNode = &RootContext; + ContextTrieNode *PrevNode = nullptr; + + std::optional Size; + + // Start from top-level context-less function, traverse down the reverse + // context trie to find the best/longest match for given context, then + // retrieve the size. + LineLocation CallSiteLoc(0, 0); + while (CurrNode && Node->getParentContext() != nullptr) { + PrevNode = CurrNode; + CurrNode = CurrNode->getChildContext(CallSiteLoc, Node->getFuncName()); + if (CurrNode && CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + CallSiteLoc = Node->getCallSiteLoc(); + Node = Node->getParentContext(); + } + + // If we traversed all nodes along the path of the context and haven't + // found a size yet, pivot to look for size from sibling nodes, i.e size + // of inlinee under different context. 
+ if (!Size) { + if (!CurrNode) + CurrNode = PrevNode; + while (!Size && CurrNode && !CurrNode->getAllChildContext().empty()) { + CurrNode = &CurrNode->getAllChildContext().begin()->second; + if (CurrNode->getFunctionSize()) + Size = *CurrNode->getFunctionSize(); + } + } + + assert(Size && "We should at least find one context size."); + return *Size; +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder) { + ProbeFrameStack ProbeContext; + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) + trackInlineesOptimizedAway(ProbeDecoder, *Child.second.get(), ProbeContext); +} + +void BinarySizeContextTracker::trackInlineesOptimizedAway( + MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, ProbeFrameStack &ProbeContext) { + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(ProbeNode.Guid)->FuncName; + ProbeContext.emplace_back(FuncName, 0); + + // This ProbeContext has a probe, so it has code before inlining and + // optimization. Make sure we mark its size as known. + if (!ProbeNode.getProbes().empty()) { + ContextTrieNode *SizeContext = &RootContext; + for (auto &ProbeFrame : reverse(ProbeContext)) { + StringRef CallerName = ProbeFrame.first; + LineLocation CallsiteLoc(ProbeFrame.second, 0); + SizeContext = + SizeContext->getOrCreateChildContext(CallsiteLoc, CallerName); + } + // Add 0 size to make known. 
+ SizeContext->addFunctionSize(0); + } + + // DFS down the probe inline tree + for (const auto &ChildNode : ProbeNode.getChildren()) { + InlineSite Location = ChildNode.first; + ProbeContext.back().second = std::get<1>(Location); + trackInlineesOptimizedAway(ProbeDecoder, *ChildNode.second.get(), + ProbeContext); + } + + ProbeContext.pop_back(); +} + +ProfiledBinary::ProfiledBinary(const StringRef ExeBinPath, + const StringRef DebugBinPath) + : Path(ExeBinPath), DebugBinaryPath(DebugBinPath), + SymbolizerOpts(getSymbolizerOpts()), ProEpilogTracker(this), + Symbolizer(std::make_unique(SymbolizerOpts)), + TrackFuncContextSize(EnableCSPreInliner && UseContextCostForPreInliner) { + // Point to executable binary if debug info binary is not specified. + SymbolizerPath = DebugBinPath.empty() ? ExeBinPath : DebugBinPath; + if (InferMissingFrames) + MissingContextInferrer = std::make_unique(this); + load(); +} + +ProfiledBinary::~ProfiledBinary() {} + +void ProfiledBinary::warnNoFuncEntry() { + uint64_t NoFuncEntryNum = 0; + for (auto &F : BinaryFunctions) { + if (F.second.Ranges.empty()) + continue; + bool hasFuncEntry = false; + for (auto &R : F.second.Ranges) { + if (FuncRange *FR = findFuncRangeForStartAddr(R.first)) { + if (FR->IsFuncEntry) { + hasFuncEntry = true; + break; + } + } + } + + if (!hasFuncEntry) { + NoFuncEntryNum++; + if (ShowDetailedWarning) + WithColor::warning() + << "Failed to determine function entry for " << F.first + << " due to inconsistent name from symbol table and dwarf info.\n"; + } + } + emitWarningSummary(NoFuncEntryNum, BinaryFunctions.size(), + "of functions failed to determine function entry due to " + "inconsistent name from symbol table and dwarf info."); +} + +void ProfiledBinary::load() { + // Attempt to open the binary. 
+ OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + + auto *Obj = dyn_cast(&ExeBinary); + if (!Obj) + exitWithError("not a valid Elf image", Path); + + TheTriple = Obj->makeTriple(); + + LLVM_DEBUG(dbgs() << "Loading " << Path << "\n"); + + // Find the preferred load address for text sections. + setPreferredTextSegmentAddresses(Obj); + + // Load debug info of subprograms from DWARF section. + // If path of debug info binary is specified, use the debug info from it, + // otherwise use the debug info from the executable binary. + if (!DebugBinaryPath.empty()) { + OwningBinary DebugPath = + unwrapOrError(createBinary(DebugBinaryPath), DebugBinaryPath); + loadSymbolsFromDWARF(*cast(DebugPath.getBinary())); + } else { + loadSymbolsFromDWARF(*cast(&ExeBinary)); + } + + DisassembleFunctionSet.insert(DisassembleFunctions.begin(), + DisassembleFunctions.end()); + + checkPseudoProbe(Obj); + + if (UsePseudoProbes) + populateElfSymbolAddressList(Obj); + + if (ShowDisassemblyOnly) + decodePseudoProbe(Obj); + + // Disassemble the text sections. + disassemble(Obj); + + // Use function start and return address to infer prolog and epilog + ProEpilogTracker.inferPrologAddresses(StartAddrToFuncRangeMap); + ProEpilogTracker.inferEpilogAddresses(RetAddressSet); + + warnNoFuncEntry(); + + // TODO: decode other sections. 
+} + +bool ProfiledBinary::inlineContextEqual(uint64_t Address1, uint64_t Address2) { + const SampleContextFrameVector &Context1 = + getCachedFrameLocationStack(Address1); + const SampleContextFrameVector &Context2 = + getCachedFrameLocationStack(Address2); + if (Context1.size() != Context2.size()) + return false; + if (Context1.empty()) + return false; + // The leaf frame contains the location within the leaf, which + // needs to be removed as it's not part of the calling context + return std::equal(Context1.begin(), Context1.begin() + Context1.size() - 1, + Context2.begin(), Context2.begin() + Context2.size() - 1); +} + +SampleContextFrameVector +ProfiledBinary::getExpandedContext(const SmallVectorImpl &Stack, + bool &WasLeafInlined) { + SampleContextFrameVector ContextVec; + if (Stack.empty()) + return ContextVec; + // Process from frame root to leaf + for (auto Address : Stack) { + const SampleContextFrameVector &ExpandedContext = + getCachedFrameLocationStack(Address); + // An instruction without a valid debug line will be ignored by sample + // processing + if (ExpandedContext.empty()) + return SampleContextFrameVector(); + // Set WasLeafInlined when the last address, which is the leaf, expands + // to more than one frame, i.e. it was inlined + WasLeafInlined = (ExpandedContext.size() > 1); + ContextVec.append(ExpandedContext); + } + + // Replace with decoded base discriminator + for (auto &Frame : ContextVec) { + Frame.Location.Discriminator = ProfileGeneratorBase::getBaseDiscriminator( + Frame.Location.Discriminator, UseFSDiscriminator); + } + + assert(ContextVec.size() && "Context length should be at least 1"); + + // Compress the context string except for the leaf frame + auto LeafFrame = ContextVec.back(); + LeafFrame.Location = LineLocation(0, 0); + ContextVec.pop_back(); + CSProfileGenerator::compressRecursionContext(ContextVec); + CSProfileGenerator::trimContext(ContextVec); + ContextVec.push_back(LeafFrame); + return ContextVec; +} + +template +void
ProfiledBinary::setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName) { + const auto &PhdrRange = unwrapOrError(Obj.program_headers(), FileName); + // FIXME: This should be the page size of the system running profiling. + // However such info isn't available at post-processing time, assuming + // 4K page now. Note that we don't use EXEC_PAGESIZE from + // because we may build the tools on non-linux. + uint32_t PageSize = 0x1000; + for (const typename ELFT::Phdr &Phdr : PhdrRange) { + if (Phdr.p_type == ELF::PT_LOAD) { + if (!FirstLoadableAddress) + FirstLoadableAddress = Phdr.p_vaddr & ~(PageSize - 1U); + if (Phdr.p_flags & ELF::PF_X) { + // Segments will always be loaded at a page boundary. + PreferredTextSegmentAddresses.push_back(Phdr.p_vaddr & + ~(PageSize - 1U)); + TextSegmentOffsets.push_back(Phdr.p_offset & ~(PageSize - 1U)); + } + } + } + + if (PreferredTextSegmentAddresses.empty()) + exitWithError("no executable segment found", FileName); +} + +void ProfiledBinary::setPreferredTextSegmentAddresses( + const ELFObjectFileBase *Obj) { + if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = dyn_cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else if (const auto *ELFObj = cast(Obj)) + setPreferredTextSegmentAddresses(ELFObj->getELFFile(), Obj->getFileName()); + else + llvm_unreachable("invalid ELF object format"); +} + +void ProfiledBinary::checkPseudoProbe(const ELFObjectFileBase *Obj) { + if (UseDwarfCorrelation) + return; + + bool HasProbeDescSection = false; + bool HasPseudoProbeSection = false; + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + 
StringRef SectionName = unwrapOrError(Section.getName(), FileName); + if (SectionName == ".pseudo_probe_desc") { + HasProbeDescSection = true; + } else if (SectionName == ".pseudo_probe") { + HasPseudoProbeSection = true; + } + } + + // set UsePseudoProbes flag, used for PerfReader + UsePseudoProbes = HasProbeDescSection && HasPseudoProbeSection; +} + +void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) { + if (!UsePseudoProbes) + return; + + MCPseudoProbeDecoder::Uint64Set GuidFilter; + MCPseudoProbeDecoder::Uint64Map FuncStartAddresses; + if (ShowDisassemblyOnly) { + if (DisassembleFunctionSet.empty()) { + FuncStartAddresses = SymbolStartAddrs; + } else { + for (auto &F : DisassembleFunctionSet) { + auto GUID = Function::getGUID(F.first()); + if (auto StartAddr = SymbolStartAddrs.lookup(GUID)) { + FuncStartAddresses[GUID] = StartAddr; + FuncRange &Range = StartAddrToFuncRangeMap[StartAddr]; + GuidFilter.insert(Function::getGUID(Range.getFuncName())); + } + } + } + } else { + for (auto *F : ProfiledFunctions) { + GuidFilter.insert(Function::getGUID(F->FuncName)); + for (auto &Range : F->Ranges) { + auto GUIDs = StartAddrToSymMap.equal_range(Range.first); + for (auto I = GUIDs.first; I != GUIDs.second; ++I) + FuncStartAddresses[I->second] = I->first; + } + } + } + + StringRef FileName = Obj->getFileName(); + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (SectionName == ".pseudo_probe_desc") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if (!ProbeDecoder.buildGUID2FuncDescMap( + reinterpret_cast(Contents.data()), + Contents.size())) + exitWithError( + "Pseudo Probe decoder fail in .pseudo_probe_desc section"); + } else if (SectionName == ".pseudo_probe") { + StringRef Contents = unwrapOrError(Section.getContents(), FileName); + if 
(!ProbeDecoder.buildAddress2ProbeMap( + reinterpret_cast(Contents.data()), + Contents.size(), GuidFilter, FuncStartAddresses)) + exitWithError("Pseudo Probe decoder fail in .pseudo_probe section"); + } + } + + // Build TopLevelProbeFrameMap to track size for optimized inlinees when probe + // is available + if (TrackFuncContextSize) { + for (const auto &Child : ProbeDecoder.getDummyInlineRoot().getChildren()) { + auto *Frame = Child.second.get(); + StringRef FuncName = + ProbeDecoder.getFuncDescForGUID(Frame->Guid)->FuncName; + TopLevelProbeFrameMap[FuncName] = Frame; + } + } + + if (ShowPseudoProbe) + ProbeDecoder.printGUID2FuncDescMap(outs()); +} + +void ProfiledBinary::decodePseudoProbe() { + OwningBinary OBinary = unwrapOrError(createBinary(Path), Path); + Binary &ExeBinary = *OBinary.getBinary(); + auto *Obj = dyn_cast(&ExeBinary); + decodePseudoProbe(Obj); +} + +void ProfiledBinary::setIsFuncEntry(FuncRange *FuncRange, + StringRef RangeSymName) { + // Skip external function symbol. + if (!FuncRange) + return; + + // Set IsFuncEntry to ture if there is only one range in the function or the + // RangeSymName from ELF is equal to its DWARF-based function name. + if (FuncRange->Func->Ranges.size() == 1 || + (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)) + FuncRange->IsFuncEntry = true; +} + +bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef Bytes, + SectionSymbolsTy &Symbols, + const SectionRef &Section) { + std::size_t SE = Symbols.size(); + uint64_t SectionAddress = Section.getAddress(); + uint64_t SectSize = Section.getSize(); + uint64_t StartAddress = Symbols[SI].Addr; + uint64_t NextStartAddress = + (SI + 1 < SE) ? Symbols[SI + 1].Addr : SectionAddress + SectSize; + FuncRange *FRange = findFuncRange(StartAddress); + setIsFuncEntry(FRange, FunctionSamples::getCanonicalFnName(Symbols[SI].Name)); + StringRef SymbolName = + ShowCanonicalFnName + ? 
FunctionSamples::getCanonicalFnName(Symbols[SI].Name) + : Symbols[SI].Name; + bool ShowDisassembly = + ShowDisassemblyOnly && (DisassembleFunctionSet.empty() || + DisassembleFunctionSet.count(SymbolName)); + if (ShowDisassembly) + outs() << '<' << SymbolName << ">:\n"; + + auto WarnInvalidInsts = [](uint64_t Start, uint64_t End) { + WithColor::warning() << "Invalid instructions at " + << format("%8" PRIx64, Start) << " - " + << format("%8" PRIx64, End) << "\n"; + }; + + uint64_t Address = StartAddress; + // Size of a consecutive invalid instruction range starting from Address -1 + // backwards. + uint64_t InvalidInstLength = 0; + while (Address < NextStartAddress) { + MCInst Inst; + uint64_t Size; + // Disassemble an instruction. + bool Disassembled = DisAsm->getInstruction( + Inst, Size, Bytes.slice(Address - SectionAddress), Address, nulls()); + if (Size == 0) + Size = 1; + + if (ShowDisassembly) { + if (ShowPseudoProbe) { + ProbeDecoder.printProbeForAddress(outs(), Address); + } + outs() << format("%8" PRIx64 ":", Address); + size_t Start = outs().tell(); + if (Disassembled) + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), outs()); + else + outs() << "\t"; + if (ShowSourceLocations) { + unsigned Cur = outs().tell() - Start; + if (Cur < 40) + outs().indent(40 - Cur); + InstructionPointer IP(this, Address); + outs() << getReversedLocWithContext( + symbolize(IP, ShowCanonicalFnName, ShowPseudoProbe)); + } + outs() << "\n"; + } + + if (Disassembled) { + const MCInstrDesc &MCDesc = MII->get(Inst.getOpcode()); + + // Record instruction size. + AddressToInstSizeMap[Address] = Size; + + // Populate address maps. 
+ CodeAddressVec.push_back(Address); + if (MCDesc.isCall()) { + CallAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isReturn()) { + RetAddressSet.insert(Address); + UncondBranchAddrSet.insert(Address); + } else if (MCDesc.isBranch()) { + if (MCDesc.isUnconditionalBranch()) + UncondBranchAddrSet.insert(Address); + BranchAddressSet.insert(Address); + } + + // Record potential call targets for tail frame inference later-on. + if (InferMissingFrames && FRange) { + uint64_t Target = 0; + MIA->evaluateBranch(Inst, Address, Size, Target); + if (MCDesc.isCall()) { + // Indirect call targets are unknown at this point. Recording the + // unknown target (zero) for further LBR-based refinement. + MissingContextInferrer->CallEdges[Address].insert(Target); + } else if (MCDesc.isUnconditionalBranch()) { + assert(Target && + "target should be known for unconditional direct branch"); + // Any inter-function unconditional jump is considered tail call at + // this point. This is not 100% accurate and could further be + // optimized based on some source annotation. + FuncRange *ToFRange = findFuncRange(Target); + if (ToFRange && ToFRange->Func != FRange->Func) + MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Direct Tail call: " << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } else if (MCDesc.isIndirectBranch() && MCDesc.isBarrier()) { + // This is an indirect branch but not necessarily an indirect tail + // call. The isBarrier check is to filter out conditional branch. + // Similar with indirect call targets, recording the unknown target + // (zero) for further LBR-based refinement. 
+ MissingContextInferrer->TailCallEdges[Address].insert(Target); + LLVM_DEBUG({ + dbgs() << "Indirect Tail call: " + << format("%8" PRIx64 ":", Address); + IPrinter->printInst(&Inst, Address + Size, "", *STI.get(), dbgs()); + dbgs() << "\n"; + }); + } + } + + if (InvalidInstLength) { + WarnInvalidInsts(Address - InvalidInstLength, Address - 1); + InvalidInstLength = 0; + } + } else { + InvalidInstLength += Size; + } + + Address += Size; + } + + if (InvalidInstLength) + WarnInvalidInsts(Address - InvalidInstLength, Address - 1); + + if (ShowDisassembly) + outs() << "\n"; + + return true; +} + +void ProfiledBinary::setUpDisassembler(const ELFObjectFileBase *Obj) { + const Target *TheTarget = getTarget(Obj); + std::string TripleName = TheTriple.getTriple(); + StringRef FileName = Obj->getFileName(); + + MRI.reset(TheTarget->createMCRegInfo(TripleName)); + if (!MRI) + exitWithError("no register info for target " + TripleName, FileName); + + MCTargetOptions MCOptions; + AsmInfo.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions)); + if (!AsmInfo) + exitWithError("no assembly info for target " + TripleName, FileName); + + Expected Features = Obj->getFeatures(); + if (!Features) + exitWithError(Features.takeError(), FileName); + STI.reset( + TheTarget->createMCSubtargetInfo(TripleName, "", Features->getString())); + if (!STI) + exitWithError("no subtarget info for target " + TripleName, FileName); + + MII.reset(TheTarget->createMCInstrInfo()); + if (!MII) + exitWithError("no instruction info for target " + TripleName, FileName); + + MCContext Ctx(Triple(TripleName), AsmInfo.get(), MRI.get(), STI.get()); + std::unique_ptr MOFI( + TheTarget->createMCObjectFileInfo(Ctx, /*PIC=*/false)); + Ctx.setObjectFileInfo(MOFI.get()); + DisAsm.reset(TheTarget->createMCDisassembler(*STI, Ctx)); + if (!DisAsm) + exitWithError("no disassembler for target " + TripleName, FileName); + + MIA.reset(TheTarget->createMCInstrAnalysis(MII.get())); + + int AsmPrinterVariant = 
AsmInfo->getAssemblerDialect(); + IPrinter.reset(TheTarget->createMCInstPrinter( + Triple(TripleName), AsmPrinterVariant, *AsmInfo, *MII, *MRI)); + IPrinter->setPrintBranchImmAsAddress(true); +} + +void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) { + // Set up disassembler and related components. + setUpDisassembler(Obj); + + // Create a mapping from virtual address to symbol name. The symbols in text + // sections are the candidates to dissassemble. + std::map AllSymbols; + StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + section_iterator SecI = unwrapOrError(Symbol.getSection(), FileName); + if (SecI != Obj->section_end()) + AllSymbols[*SecI].push_back(SymbolInfoTy(Addr, Name, ELF::STT_NOTYPE)); + } + + // Sort all the symbols. Use a stable sort to stabilize the output. + for (std::pair &SecSyms : AllSymbols) + stable_sort(SecSyms.second); + + assert((DisassembleFunctionSet.empty() || ShowDisassemblyOnly) && + "Functions to disassemble should be only specified together with " + "--show-disassembly-only"); + + if (ShowDisassemblyOnly) + outs() << "\nDisassembly of " << FileName << ":\n"; + + // Dissassemble a text section. + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isText()) + continue; + + uint64_t ImageLoadAddr = getPreferredBaseAddress(); + uint64_t SectionAddress = Section.getAddress() - ImageLoadAddr; + uint64_t SectSize = Section.getSize(); + if (!SectSize) + continue; + + // Register the text section. 
+ TextSections.insert({SectionAddress, SectSize}); + + StringRef SectionName = unwrapOrError(Section.getName(), FileName); + + if (ShowDisassemblyOnly) { + outs() << "\nDisassembly of section " << SectionName; + outs() << " [" << format("0x%" PRIx64, Section.getAddress()) << ", " + << format("0x%" PRIx64, Section.getAddress() + SectSize) + << "]:\n\n"; + } + + if (SectionName == ".plt") + continue; + + // Get the section data. + ArrayRef Bytes = + arrayRefFromStringRef(unwrapOrError(Section.getContents(), FileName)); + + // Get the list of all the symbols in this section. + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + // Disassemble symbol by symbol. + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (!dissassembleSymbol(SI, Bytes, Symbols, Section)) + exitWithError("disassembling error", FileName); + } + } + + // Dissassemble rodata section to check if FS discriminator symbol exists. + checkUseFSDiscriminator(Obj, AllSymbols); +} + +void ProfiledBinary::checkUseFSDiscriminator( + const ELFObjectFileBase *Obj, + std::map &AllSymbols) { + const char *FSDiscriminatorVar = "__llvm_fs_discriminator__"; + for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end(); + SI != SE; ++SI) { + const SectionRef &Section = *SI; + if (!Section.isData() || Section.getSize() == 0) + continue; + SectionSymbolsTy &Symbols = AllSymbols[Section]; + + for (std::size_t SI = 0, SE = Symbols.size(); SI != SE; ++SI) { + if (Symbols[SI].Name == FSDiscriminatorVar) { + UseFSDiscriminator = true; + return; + } + } + } +} + +void ProfiledBinary::populateElfSymbolAddressList( + const ELFObjectFileBase *Obj) { + // Create a mapping from virtual address to symbol GUID and the other way + // around. 
+ StringRef FileName = Obj->getFileName(); + for (const SymbolRef &Symbol : Obj->symbols()) { + const uint64_t Addr = unwrapOrError(Symbol.getAddress(), FileName); + const StringRef Name = unwrapOrError(Symbol.getName(), FileName); + uint64_t GUID = Function::getGUID(Name); + SymbolStartAddrs[GUID] = Addr; + StartAddrToSymMap.emplace(Addr, GUID); + } +} + +void ProfiledBinary::loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit) { + for (const auto &DieInfo : CompilationUnit.dies()) { + llvm::DWARFDie Die(&CompilationUnit, &DieInfo); + + if (!Die.isSubprogramDIE()) + continue; + auto Name = Die.getName(llvm::DINameKind::LinkageName); + if (!Name) + Name = Die.getName(llvm::DINameKind::ShortName); + if (!Name) + continue; + + auto RangesOrError = Die.getAddressRanges(); + if (!RangesOrError) + continue; + const DWARFAddressRangesVector &Ranges = RangesOrError.get(); + + if (Ranges.empty()) + continue; + + // Different DWARF symbols can have same function name, search or create + // BinaryFunction indexed by the name. + auto Ret = BinaryFunctions.emplace(Name, BinaryFunction()); + auto &Func = Ret.first->second; + if (Ret.second) + Func.FuncName = Ret.first->first; + + for (const auto &Range : Ranges) { + uint64_t StartAddress = Range.LowPC; + uint64_t EndAddress = Range.HighPC; + + if (EndAddress <= StartAddress || + StartAddress < getPreferredBaseAddress()) + continue; + + // We may want to know all ranges for one function. Here group the + // ranges and store them into BinaryFunction. 
+ Func.Ranges.emplace_back(StartAddress, EndAddress); + + auto R = StartAddrToFuncRangeMap.emplace(StartAddress, FuncRange()); + if (R.second) { + FuncRange &FRange = R.first->second; + FRange.Func = &Func; + FRange.StartAddress = StartAddress; + FRange.EndAddress = EndAddress; + } else { + WithColor::warning() + << "Duplicated symbol start address at " + << format("%8" PRIx64, StartAddress) << " " + << R.first->second.getFuncName() << " and " << Name << "\n"; + } + } + } +} + +void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) { + auto DebugContext = llvm::DWARFContext::create( + Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, DWPPath); + if (!DebugContext) + exitWithError("Error creating the debug info context", Path); + + for (const auto &CompilationUnit : DebugContext->compile_units()) + loadSymbolsFromDWARFUnit(*CompilationUnit.get()); + + // Handles DWO sections that can either be in .o, .dwo or .dwp files. + for (const auto &CompilationUnit : DebugContext->compile_units()) { + DWARFUnit *const DwarfUnit = CompilationUnit.get(); + if (std::optional DWOId = DwarfUnit->getDWOId()) { + DWARFUnit *DWOCU = DwarfUnit->getNonSkeletonUnitDIE(false).getDwarfUnit(); + if (!DWOCU->isDWOUnit()) { + std::string DWOName = dwarf::toString( + DwarfUnit->getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), + ""); + WithColor::warning() + << "DWO debug information for " << DWOName + << " was not loaded. 
Please check the .o, .dwo or .dwp path.\n"; + continue; + } + loadSymbolsFromDWARFUnit(*DWOCU); + } + } + + if (BinaryFunctions.empty()) + WithColor::warning() << "Loading of DWARF info completed, but no binary " + "functions have been retrieved.\n"; +} + +void ProfiledBinary::populateSymbolListFromDWARF( + ProfileSymbolList &SymbolList) { + for (auto &I : StartAddrToFuncRangeMap) + SymbolList.add(I.second.getFuncName()); +} + +symbolize::LLVMSymbolizer::Options ProfiledBinary::getSymbolizerOpts() const { + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + SymbolizerOpts.PrintFunctions = + DILineInfoSpecifier::FunctionNameKind::LinkageName; + SymbolizerOpts.Demangle = false; + SymbolizerOpts.DefaultArch = TheTriple.getArchName().str(); + SymbolizerOpts.UseSymbolTable = false; + SymbolizerOpts.RelativeAddresses = false; + SymbolizerOpts.DWPName = DWPPath; + return SymbolizerOpts; +} + +SampleContextFrameVector ProfiledBinary::symbolize(const InstructionPointer &IP, + bool UseCanonicalFnName, + bool UseProbeDiscriminator) { + assert(this == IP.Binary && + "Binary should only symbolize its own instruction"); + auto Addr = object::SectionedAddress{IP.Address, + object::SectionedAddress::UndefSection}; + DIInliningInfo InlineStack = unwrapOrError( + Symbolizer->symbolizeInlinedCode(SymbolizerPath.str(), Addr), + SymbolizerPath); + + SampleContextFrameVector CallStack; + for (int32_t I = InlineStack.getNumberOfFrames() - 1; I >= 0; I--) { + const auto &CallerFrame = InlineStack.getFrame(I); + if (CallerFrame.FunctionName == "") + break; + + StringRef FunctionName(CallerFrame.FunctionName); + if (UseCanonicalFnName) + FunctionName = FunctionSamples::getCanonicalFnName(FunctionName); + + uint32_t Discriminator = CallerFrame.Discriminator; + uint32_t LineOffset = (CallerFrame.Line - CallerFrame.StartLine) & 0xffff; + if (UseProbeDiscriminator) { + LineOffset = + PseudoProbeDwarfDiscriminator::extractProbeIndex(Discriminator); + Discriminator = 0; + } + + LineLocation 
Line(LineOffset, Discriminator); + auto It = NameStrings.insert(FunctionName.str()); + CallStack.emplace_back(*It.first, Line); + } + + return CallStack; +} + +void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t RangeBegin, + uint64_t RangeEnd) { + InstructionPointer IP(this, RangeBegin, true); + + if (IP.Address != RangeBegin) + WithColor::warning() << "Invalid start instruction at " + << format("%8" PRIx64, RangeBegin) << "\n"; + + if (IP.Address >= RangeEnd) + return; + + do { + const SampleContextFrameVector SymbolizedCallStack = + getFrameLocationStack(IP.Address, UsePseudoProbes); + uint64_t Size = AddressToInstSizeMap[IP.Address]; + // Record instruction size for the corresponding context + FuncSizeTracker.addInstructionForContext(SymbolizedCallStack, Size); + + } while (IP.advance() && IP.Address < RangeEnd); +} + +void ProfiledBinary::computeInlinedContextSizeForFunc( + const BinaryFunction *Func) { + // Note that a function can be spilt into multiple ranges, so compute for all + // ranges of the function. + for (const auto &Range : Func->Ranges) + computeInlinedContextSizeForRange(Range.first, Range.second); + + // Track optimized-away inlinee for probed binary. A function inlined and then + // optimized away should still have their probes left over in places. 
+ if (usePseudoProbes()) { + auto I = TopLevelProbeFrameMap.find(Func->FuncName); + if (I != TopLevelProbeFrameMap.end()) { + BinarySizeContextTracker::ProbeFrameStack ProbeContext; + FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder, *I->second, + ProbeContext); + } + } +} + +void ProfiledBinary::inferMissingFrames( + const SmallVectorImpl &Context, + SmallVectorImpl &NewContext) { + MissingContextInferrer->inferMissingFrames(Context, NewContext); +} + +InstructionPointer::InstructionPointer(const ProfiledBinary *Binary, + uint64_t Address, bool RoundToNext) + : Binary(Binary), Address(Address) { + Index = Binary->getIndexForAddr(Address); + if (RoundToNext) { + // we might get address which is not the code + // it should round to the next valid address + if (Index >= Binary->getCodeAddrVecSize()) + this->Address = UINT64_MAX; + else + this->Address = Binary->getAddressforIndex(Index); + } +} + +bool InstructionPointer::advance() { + Index++; + if (Index >= Binary->getCodeAddrVecSize()) { + Address = UINT64_MAX; + return false; + } + Address = Binary->getAddressforIndex(Index); + return true; +} + +bool InstructionPointer::backward() { + if (Index == 0) { + Address = 0; + return false; + } + Index--; + Address = Binary->getAddressforIndex(Index); + return true; +} + +void InstructionPointer::update(uint64_t Addr) { + Address = Addr; + Index = Binary->getIndexForAddr(Address); +} + +} // end namespace sampleprof +} // end namespace llvm diff --git a/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.h b/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.h new file mode 100644 index 00000000000..a6d78c661cc --- /dev/null +++ b/tools/ldc-profgen/ldc-profgen-17.0/ProfiledBinary.h @@ -0,0 +1,589 @@ +//===-- ProfiledBinary.h - Binary decoder -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H +#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H + +#include "CallContext.h" +#include "ErrorHandling.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/DebugInfo/DWARF/DWARFContext.h" +#include "llvm/DebugInfo/Symbolize/Symbolize.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCInstrAnalysis.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCPseudoProbe.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" +#include "llvm/Transforms/IPO/SampleContextTracker.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +extern cl::opt EnableCSPreInliner; +extern cl::opt UseContextCostForPreInliner; +} // namespace llvm + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::object; + +namespace llvm { +namespace sampleprof { + +class ProfiledBinary; +class MissingFrameInferrer; + +struct InstructionPointer { + const ProfiledBinary *Binary; + // Address of the executable segment of the binary. + uint64_t Address; + // Index to the sorted code address array of the binary. + uint64_t Index = 0; + InstructionPointer(const ProfiledBinary *Binary, uint64_t Address, + bool RoundToNext = false); + bool advance(); + bool backward(); + void update(uint64_t Addr); +}; + +// The special frame addresses. 
+enum SpecialFrameAddr { + // Dummy root of frame trie. + DummyRoot = 0, + // Represent all the addresses outside of current binary. + // This's also used to indicate the call stack should be truncated since this + // isn't a real call context the compiler will see. + ExternalAddr = 1, +}; + +using RangesTy = std::vector>; + +struct BinaryFunction { + StringRef FuncName; + // End of range is an exclusive bound. + RangesTy Ranges; + + uint64_t getFuncSize() { + uint64_t Sum = 0; + for (auto &R : Ranges) { + Sum += R.second - R.first; + } + return Sum; + } +}; + +// Info about function range. A function can be split into multiple +// non-continuous ranges, each range corresponds to one FuncRange. +struct FuncRange { + uint64_t StartAddress; + // EndAddress is an exclusive bound. + uint64_t EndAddress; + // Function the range belongs to + BinaryFunction *Func; + // Whether the start address is the real entry of the function. + bool IsFuncEntry = false; + + StringRef getFuncName() { return Func->FuncName; } +}; + +// PrologEpilog address tracker, used to filter out broken stack samples +// Currently we use a heuristic size (two) to infer prolog and epilog +// based on the start address and return address. In the future, +// we will switch to Dwarf CFI based tracker +struct PrologEpilogTracker { + // A set of prolog and epilog addresses. Used by virtual unwinding. 
+ std::unordered_set PrologEpilogSet; + ProfiledBinary *Binary; + PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){}; + + // Take the two addresses from the start of function as prolog + void + inferPrologAddresses(std::map &FuncStartAddressMap) { + for (auto I : FuncStartAddressMap) { + PrologEpilogSet.insert(I.first); + InstructionPointer IP(Binary, I.first); + if (!IP.advance()) + break; + PrologEpilogSet.insert(IP.Address); + } + } + + // Take the last two addresses before the return address as epilog + void inferEpilogAddresses(std::unordered_set &RetAddrs) { + for (auto Addr : RetAddrs) { + PrologEpilogSet.insert(Addr); + InstructionPointer IP(Binary, Addr); + if (!IP.backward()) + break; + PrologEpilogSet.insert(IP.Address); + } + } +}; + +// Track function byte size under different context (outlined version as well as +// various inlined versions). It also provides query support to get function +// size with the best matching context, which is used to help pre-inliner use +// accurate post-optimization size to make decisions. +// TODO: If an inlinee is completely optimized away, ideally we should have zero +// for its context size, currently we would misss such context since it doesn't +// have instructions. To fix this, we need to mark all inlinee with entry probe +// but without instructions as having zero size. +class BinarySizeContextTracker { +public: + // Add instruction with given size to a context + void addInstructionForContext(const SampleContextFrameVector &Context, + uint32_t InstrSize); + + // Get function size with a specific context. When there's no exact match + // for the given context, try to retrieve the size of that function from + // closest matching context. + uint32_t getFuncSizeForContext(const ContextTrieNode *Context); + + // For inlinees that are full optimized away, we can establish zero size using + // their remaining probes. 
+ void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder); + + using ProbeFrameStack = SmallVector>; + void trackInlineesOptimizedAway(MCPseudoProbeDecoder &ProbeDecoder, + MCDecodedPseudoProbeInlineTree &ProbeNode, + ProbeFrameStack &Context); + + void dump() { RootContext.dumpTree(); } + +private: + // Root node for context trie tree, node that this is a reverse context trie + // with callee as parent and caller as child. This way we can traverse from + // root to find the best/longest matching context if an exact match does not + // exist. It gives us the best possible estimate for function's post-inline, + // post-optimization byte size. + ContextTrieNode RootContext; +}; + +using AddressRange = std::pair; + +class ProfiledBinary { + // Absolute path of the executable binary. + std::string Path; + // Path of the debug info binary. + std::string DebugBinaryPath; + // The target triple. + Triple TheTriple; + // Path of symbolizer path which should be pointed to binary with debug info. + StringRef SymbolizerPath; + // Options used to configure the symbolizer + symbolize::LLVMSymbolizer::Options SymbolizerOpts; + // The runtime base address that the first executable segment is loaded at. + uint64_t BaseAddress = 0; + // The runtime base address that the first loadabe segment is loaded at. + uint64_t FirstLoadableAddress = 0; + // The preferred load address of each executable segment. + std::vector PreferredTextSegmentAddresses; + // The file offset of each executable segment. + std::vector TextSegmentOffsets; + + // Mutiple MC component info + std::unique_ptr MRI; + std::unique_ptr AsmInfo; + std::unique_ptr STI; + std::unique_ptr MII; + std::unique_ptr DisAsm; + std::unique_ptr MIA; + std::unique_ptr IPrinter; + // A list of text sections sorted by start RVA and size. Used to check + // if a given RVA is a valid code address. + std::set> TextSections; + + // A map of mapping function name to BinaryFunction info. 
+ std::unordered_map BinaryFunctions; + + // A list of binary functions that have samples. + std::unordered_set ProfiledFunctions; + + // GUID to Elf symbol start address map + DenseMap SymbolStartAddrs; + + // Start address to Elf symbol GUID map + std::unordered_multimap StartAddrToSymMap; + + // An ordered map of mapping function's start address to function range + // relevant info. Currently to determine if the offset of ELF is the start of + // a real function, we leverage the function range info from DWARF. + std::map StartAddrToFuncRangeMap; + + // Address to context location map. Used to expand the context. + std::unordered_map AddressToLocStackMap; + + // Address to instruction size map. Also used for quick Address lookup. + std::unordered_map AddressToInstSizeMap; + + // An array of Addresses of all instructions sorted in increasing order. The + // sorting is needed to fast advance to the next forward/backward instruction. + std::vector CodeAddressVec; + // A set of call instruction addresses. Used by virtual unwinding. + std::unordered_set CallAddressSet; + // A set of return instruction addresses. Used by virtual unwinding. + std::unordered_set RetAddressSet; + // An ordered set of unconditional branch instruction addresses. + std::set UncondBranchAddrSet; + // A set of branch instruction addresses. + std::unordered_set BranchAddressSet; + + // Estimate and track function prolog and epilog ranges. + PrologEpilogTracker ProEpilogTracker; + + // Infer missing frames due to compiler optimizations such as tail call + // elimination. + std::unique_ptr MissingContextInferrer; + + // Track function sizes under different context + BinarySizeContextTracker FuncSizeTracker; + + // The symbolizer used to get inline context for an instruction. + std::unique_ptr Symbolizer; + + // String table owning function name strings created from the symbolizer. + std::unordered_set NameStrings; + + // A collection of functions to print disassembly for. 
+ StringSet<> DisassembleFunctionSet; + + // Pseudo probe decoder + MCPseudoProbeDecoder ProbeDecoder; + + // Function name to probe frame map for top-level outlined functions. + StringMap TopLevelProbeFrameMap; + + bool UsePseudoProbes = false; + + bool UseFSDiscriminator = false; + + // Whether we need to symbolize all instructions to get function context size. + bool TrackFuncContextSize = false; + + // Indicate if the base loading address is parsed from the mmap event or uses + // the preferred address + bool IsLoadedByMMap = false; + // Use to avoid redundant warning. + bool MissingMMapWarned = false; + + void setPreferredTextSegmentAddresses(const ELFObjectFileBase *O); + + template + void setPreferredTextSegmentAddresses(const ELFFile &Obj, + StringRef FileName); + + void checkPseudoProbe(const ELFObjectFileBase *Obj); + + void decodePseudoProbe(const ELFObjectFileBase *Obj); + + void + checkUseFSDiscriminator(const ELFObjectFileBase *Obj, + std::map &AllSymbols); + + // Set up disassembler and related components. + void setUpDisassembler(const ELFObjectFileBase *Obj); + symbolize::LLVMSymbolizer::Options getSymbolizerOpts() const; + + // Load debug info of subprograms from DWARF section. + void loadSymbolsFromDWARF(ObjectFile &Obj); + + // Load debug info from DWARF unit. + void loadSymbolsFromDWARFUnit(DWARFUnit &CompilationUnit); + + // Create elf symbol to its start address mapping. + void populateElfSymbolAddressList(const ELFObjectFileBase *O); + + // A function may be spilt into multiple non-continuous address ranges. We use + // this to set whether start a function range is the real entry of the + // function and also set false to the non-function label. + void setIsFuncEntry(FuncRange *FRange, StringRef RangeSymName); + + // Warn if no entry range exists in the function. + void warnNoFuncEntry(); + + /// Dissassemble the text section and build various address maps. 
+  void disassemble(const ELFObjectFileBase *O);
+
+  /// Helper function to disassemble the symbol and extract info for unwinding.
+  bool dissassembleSymbol(std::size_t SI, ArrayRef Bytes,
+                          SectionSymbolsTy &Symbols, const SectionRef &Section);
+  /// Symbolize a given instruction pointer and return a full call context.
+  SampleContextFrameVector symbolize(const InstructionPointer &IP,
+                                     bool UseCanonicalFnName = false,
+                                     bool UseProbeDiscriminator = false);
+  /// Decode the interesting parts of the binary and build internal data
+  /// structures. On high level, the parts of interest are:
+  ///   1. Text sections, including the main code section and the PLT
+  ///   entries that will be used to handle cross-module call transitions.
+  ///   2. The .debug_line section, used by Dwarf-based profile generation.
+  ///   3. Pseudo probe related sections, used by probe-based profile
+  ///   generation.
+  void load();
+
+public:
+  ProfiledBinary(const StringRef ExeBinPath, const StringRef DebugBinPath);
+  ~ProfiledBinary();
+
+  void decodePseudoProbe();
+
+  // Path of the profiled binary, and its file-name component.
+  StringRef getPath() const { return Path; }
+  StringRef getName() const { return llvm::sys::path::filename(Path); }
+  // Current base address; setBaseAddress() overrides it.
+  uint64_t getBaseAddress() const { return BaseAddress; }
+  void setBaseAddress(uint64_t Address) { BaseAddress = Address; }
+
+  // Canonicalize to use preferred load address as base address: rebase
+  // Address from the current BaseAddress onto getPreferredBaseAddress().
+  // Read-only, hence const like the neighbouring getters.
+  uint64_t canonicalizeVirtualAddress(uint64_t Address) const {
+    return Address - BaseAddress + getPreferredBaseAddress();
+  }
+  // Return the preferred load address for the first executable segment.
+  // Indexes element 0 unchecked, so PreferredTextSegmentAddresses must be
+  // non-empty when this is called.
+  uint64_t getPreferredBaseAddress() const {
+    return PreferredTextSegmentAddresses[0];
+  }
+  // Return the preferred load address for the first loadable segment.
+  uint64_t getFirstLoadableAddress() const { return FirstLoadableAddress; }
+  // Return the file offset for the first executable segment.
+  uint64_t getTextSegmentOffset() const { return TextSegmentOffsets[0]; }
+  const std::vector &getPreferredTextSegmentAddresses() const {
+    return PreferredTextSegmentAddresses;
+  }
+  const std::vector &getTextSegmentOffsets() const {
+    return TextSegmentOffsets;
+  }
+
+  // Return the size of the instruction at Address, or 0 if Address is not a
+  // known instruction address.
+  uint64_t getInstSize(uint64_t Address) const {
+    auto I = AddressToInstSizeMap.find(Address);
+    if (I == AddressToInstSizeMap.end())
+      return 0;
+    return I->second;
+  }
+
+  // True if Address is the address of a disassembled instruction.
+  bool addressIsCode(uint64_t Address) const {
+    return AddressToInstSizeMap.find(Address) != AddressToInstSizeMap.end();
+  }
+
+  bool addressIsCall(uint64_t Address) const {
+    return CallAddressSet.count(Address);
+  }
+  bool addressIsReturn(uint64_t Address) const {
+    return RetAddressSet.count(Address);
+  }
+  bool addressInPrologEpilog(uint64_t Address) const {
+    return ProEpilogTracker.PrologEpilogSet.count(Address);
+  }
+
+  // True if Address is a branch, return or call instruction. Only reads the
+  // address sets, so it is const like the other address predicates.
+  bool addressIsTransfer(uint64_t Address) const {
+    return BranchAddressSet.count(Address) || RetAddressSet.count(Address) ||
+           CallAddressSet.count(Address);
+  }
+
+  // True if an unconditional branch instruction lies in [Start, End). An empty
+  // or inverted range never crosses one.
+  bool rangeCrossUncondBranch(uint64_t Start, uint64_t End) const {
+    if (Start >= End)
+      return false;
+    // First unconditional branch at or after Start; it is inside the range
+    // only if it is also below End.
+    auto R = UncondBranchAddrSet.lower_bound(Start);
+    return R != UncondBranchAddrSet.end() && *R < End;
+  }
+
+  // Index is not bounds-checked; it must be below getCodeAddrVecSize().
+  uint64_t getAddressforIndex(uint64_t Index) const {
+    return CodeAddressVec[Index];
+  }
+
+  size_t getCodeAddrVecSize() const { return CodeAddressVec.size(); }
+
+  bool usePseudoProbes() const { return UsePseudoProbes; }
+  bool useFSDiscriminator() const { return UseFSDiscriminator; }
+  // Get the index in CodeAddressVec for the address. As we might get an
+  // address which is not code, this rounds up to the next valid code address
+  // by using a lower-bound search. If Address is past the last instruction the
+  // result equals getCodeAddrVecSize().
+  uint32_t getIndexForAddr(uint64_t Address) const {
+    auto Low = llvm::lower_bound(CodeAddressVec, Address);
+    return Low - CodeAddressVec.begin();
+  }
+
+  // Map a frame address to the instruction immediately preceding it in
+  // CodeAddressVec, provided that instruction is a call. Returns ExternalAddr
+  // unchanged, and 0 when there is no preceding instruction or it is not a
+  // call.
+  uint64_t getCallAddrFromFrameAddr(uint64_t FrameAddr) const {
+    if (FrameAddr == ExternalAddr)
+      return ExternalAddr;
+    auto I = getIndexForAddr(FrameAddr);
+    // Index 0 has no predecessor; use 0 as the "no address" marker.
+    FrameAddr = I ? getAddressforIndex(I - 1) : 0;
+    if (FrameAddr && addressIsCall(FrameAddr))
+      return FrameAddr;
+    return 0;
+  }
+
+  // Return the function range that starts exactly at Address, or nullptr.
+  FuncRange *findFuncRangeForStartAddr(uint64_t Address) {
+    auto I = StartAddrToFuncRangeMap.find(Address);
+    if (I == StartAddrToFuncRangeMap.end())
+      return nullptr;
+    return &I->second;
+  }
+
+  // Binary search the function range which includes the input address.
+  // Returns nullptr if no range contains Address.
+  FuncRange *findFuncRange(uint64_t Address) {
+    // upper_bound yields the first range starting after Address, so the
+    // candidate is its predecessor, if there is one.
+    auto I = StartAddrToFuncRangeMap.upper_bound(Address);
+    if (I == StartAddrToFuncRangeMap.begin())
+      return nullptr;
+    --I;
+
+    // EndAddress is exclusive.
+    if (Address >= I->second.EndAddress)
+      return nullptr;
+
+    return &I->second;
+  }
+
+  // Get all ranges of one function.
+  RangesTy getRanges(uint64_t Address) {
+    auto *FRange = findFuncRange(Address);
+    // Ignore the range which falls into plt section or system lib.
+    if (!FRange)
+      return RangesTy();
+
+    return FRange->Func->Ranges;
+  }
+
+  // Read-only view of every binary function; const since it hands out a const
+  // reference to the member.
+  const std::unordered_map &
+  getAllBinaryFunctions() const {
+    return BinaryFunctions;
+  }
+
+  std::unordered_set &getProfiledFunctions() {
+    return ProfiledFunctions;
+  }
+
+  // Replace the profiled function set with a copy of Funcs. Funcs is only read
+  // from, so it is taken by const reference.
+  void setProfiledFunctions(const std::unordered_set &Funcs) {
+    ProfiledFunctions = Funcs;
+  }
+
+  // Look up a binary function by name; returns nullptr if it is unknown.
+  BinaryFunction *getBinaryFunction(StringRef FName) {
+    auto I = BinaryFunctions.find(FName.str());
+    if (I == BinaryFunctions.end())
+      return nullptr;
+    return &I->second;
+  }
+
+  uint32_t getFuncSizeForContext(const ContextTrieNode *ContextNode) {
+    return FuncSizeTracker.getFuncSizeForContext(ContextNode);
+  }
+
+  void inferMissingFrames(const SmallVectorImpl &Context,
+                          SmallVectorImpl &NewContext);
+
+  // Load the symbols from debug table and populate into symbol list.
+  void populateSymbolListFromDWARF(ProfileSymbolList &SymbolList);
+
+  // Symbolize Address from scratch (no caching) and return its frame stack.
+  // SymbolizerOpts.UseSymbolTable is forwarded as symbolize()'s
+  // UseCanonicalFnName argument.
+  SampleContextFrameVector
+  getFrameLocationStack(uint64_t Address, bool UseProbeDiscriminator = false) {
+    return symbolize(InstructionPointer(this, Address),
+                     SymbolizerOpts.UseSymbolTable, UseProbeDiscriminator);
+  }
+
+  // Cached variant of getFrameLocationStack(): the stack for an address is
+  // computed on first request and stored in AddressToLocStackMap.
+  // NOTE(review): the cache key is the address alone, so UseProbeDiscriminator
+  // only has an effect on the call that populates the entry.
+  const SampleContextFrameVector &
+  getCachedFrameLocationStack(uint64_t Address,
+                              bool UseProbeDiscriminator = false) {
+    auto [Slot, Inserted] =
+        AddressToLocStackMap.emplace(Address, SampleContextFrameVector());
+    if (Inserted)
+      Slot->second = getFrameLocationStack(Address, UseProbeDiscriminator);
+    return Slot->second;
+  }
+
+  // Return the last frame of the cached location stack for Address, or an
+  // empty optional when that stack is empty.
+  std::optional getInlineLeafFrameLoc(uint64_t Address) {
+    const auto &Frames = getCachedFrameLocationStack(Address);
+    if (!Frames.empty())
+      return Frames.back();
+    return std::nullopt;
+  }
+
+  // Release the symbolizer.
+  void flushSymbolizer() { Symbolizer.reset(); }
+
+  // Non-owning access to the missing-frame inferrer; may be null.
+  MissingFrameInferrer *getMissingContextInferrer() {
+    return MissingContextInferrer.get();
+  }
+
+  // Compare the inline contexts of two addresses.
+  bool inlineContextEqual(uint64_t Add1, uint64_t Add2);
+
+  // Get the full context of the current stack with inline context filled in.
+  // It will search the disassembling info stored in AddressToLocStackMap. This
+  // is used as the key of the function sample map.
+  SampleContextFrameVector
+  getExpandedContext(const SmallVectorImpl &Stack,
+                     bool &WasLeafInlined);
+  // Go through instructions among the given range and record its size for the
+  // inline context.
+  void computeInlinedContextSizeForRange(uint64_t StartAddress,
+                                         uint64_t EndAddress);
+
+  void computeInlinedContextSizeForFunc(const BinaryFunction *Func);
+
+  const MCDecodedPseudoProbe *getCallProbeForAddr(uint64_t Address) const {
+    return ProbeDecoder.getCallProbeForAddr(Address);
+  }
+
+  // Append the inline context of Probe to InlineContextStack, as obtained
+  // from the probe decoder. Every decoded call site becomes a frame with line
+  // offset `Callsite.second` and discriminator 0.
+  void getInlineContextForProbe(const MCDecodedPseudoProbe *Probe,
+                                SampleContextFrameVector &InlineContextStack,
+                                bool IncludeLeaf = false) const {
+    SmallVector ProbeInlineContext;
+    ProbeDecoder.getInlineContextForProbe(Probe, ProbeInlineContext,
+                                          IncludeLeaf);
+    for (uint32_t I = 0; I < ProbeInlineContext.size(); I++) {
+      auto &Callsite = ProbeInlineContext[I];
+      // Clear the current context for an unknown probe. A zero id on any frame
+      // but the last one discards everything collected so far (including what
+      // the caller passed in) and skips that frame.
+      if (Callsite.second == 0 && I != ProbeInlineContext.size() - 1) {
+        InlineContextStack.clear();
+        continue;
+      }
+      InlineContextStack.emplace_back(Callsite.first,
+                                      LineLocation(Callsite.second, 0));
+    }
+  }
+  const AddressProbesMap &getAddress2ProbesMap() const {
+    return ProbeDecoder.getAddress2ProbesMap();
+  }
+  const MCPseudoProbeFuncDesc *getFuncDescForGUID(uint64_t GUID) {
+    return ProbeDecoder.getFuncDescForGUID(GUID);
+  }
+
+  const MCPseudoProbeFuncDesc *
+  getInlinerDescForProbe(const MCDecodedPseudoProbe *Probe) {
+    return ProbeDecoder.getInlinerDescForProbe(Probe);
+  }
+
+  // Plain flag accessors. The getters only read a bool member, so they are
+  // const and usable through a const ProfiledBinary.
+  bool getTrackFuncContextSize() const { return TrackFuncContextSize; }
+
+  bool getIsLoadedByMMap() const { return IsLoadedByMMap; }
+
+  void setIsLoadedByMMap(bool Value) { IsLoadedByMMap = Value; }
+
+  bool getMissingMMapWarned() const { return MissingMMapWarned; }
+
+  void setMissingMMapWarned(bool Value) { MissingMMapWarned = Value; }
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/ldc-profgen/ldc-profgen-17.0/llvm-profgen.cpp b/tools/ldc-profgen/ldc-profgen-17.0/llvm-profgen.cpp
new file mode 100644
index 00000000000..3b974e25103
--- /dev/null
+++ b/tools/ldc-profgen/ldc-profgen-17.0/llvm-profgen.cpp
@@ -0,0 +1,193 @@
+//===- llvm-profgen.cpp - LLVM
SPGO profile generation tool -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-profgen generates SPGO profiles from perf script output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorHandling.h"
+#include "PerfReader.h"
+#include "ProfileGenerator.h"
+#include "ProfiledBinary.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+static cl::OptionCategory ProfGenCategory("ProfGen Options");
+
+// Input selection. Exactly one of --perfscript, --perfdata,
+// --unsymbolized-profile and --llvm-sample-profile must be given (unless only
+// disassembly is requested); validateCommandLine() enforces this.
+static cl::opt PerfScriptFilename(
+    "perfscript", cl::value_desc("perfscript"),
+    cl::desc("Path of perf-script trace created by Linux perf tool with "
+             "`script` command(the raw perf.data should be profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PSA("ps", cl::desc("Alias for --perfscript"),
+                     cl::aliasopt(PerfScriptFilename));
+
+static cl::opt PerfDataFilename(
+    "perfdata", cl::value_desc("perfdata"),
+    cl::desc("Path of raw perf data created by Linux perf tool (it should be "
+             "profiled with -b)"),
+    cl::cat(ProfGenCategory));
+static cl::alias PDA("pd", cl::desc("Alias for --perfdata"),
+                     cl::aliasopt(PerfDataFilename));
+
+static cl::opt UnsymbolizedProfFilename(
+    "unsymbolized-profile", cl::value_desc("unsymbolized profile"),
+    cl::desc("Path of the unsymbolized profile created by "
+             "`llvm-profgen` with `--skip-symbolization`"),
+    cl::cat(ProfGenCategory));
+static cl::alias UPA("up", cl::desc("Alias for --unsymbolized-profile"),
+                     cl::aliasopt(UnsymbolizedProfFilename));
+
+static cl::opt SampleProfFilename(
+    "llvm-sample-profile", cl::value_desc("llvm sample profile"),
+    cl::desc("Path of the LLVM sample profile"), cl::cat(ProfGenCategory));
+
+static cl::opt
+    BinaryPath("binary", cl::value_desc("binary"), cl::Required,
+               cl::desc("Path of profiled executable binary."),
+               cl::cat(ProfGenCategory));
+
+static cl::opt
+    ProcessId("pid", cl::value_desc("process Id"), cl::init(0),
+              cl::desc("Process Id for the profiled executable binary."),
+              cl::cat(ProfGenCategory));
+
+static cl::opt DebugBinPath(
+    "debug-binary", cl::value_desc("debug-binary"),
+    cl::desc("Path of debug info binary, llvm-profgen will load the DWARF info "
+             "from it instead of the executable binary."),
+    cl::cat(ProfGenCategory));
+
+// Options defined in other translation units of the tool.
+extern cl::opt ShowDisassemblyOnly;
+extern cl::opt ShowSourceLocations;
+extern cl::opt SkipSymbolization;
+
+using namespace llvm;
+using namespace sampleprof;
+
+// Validate the command line input. Every violation is reported through
+// exitWithError().
+static void validateCommandLine() {
+  // Allow the missing perfscript if we only use to show binary disassembly.
+  if (!ShowDisassemblyOnly) {
+    // Validate input profile is provided only once
+    bool HasPerfData = PerfDataFilename.getNumOccurrences() > 0;
+    bool HasPerfScript = PerfScriptFilename.getNumOccurrences() > 0;
+    bool HasUnsymbolizedProfile =
+        UnsymbolizedProfFilename.getNumOccurrences() > 0;
+    bool HasSampleProfile = SampleProfFilename.getNumOccurrences() > 0;
+    uint16_t S =
+        HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile;
+    if (S != 1) {
+      // `--llvm-sample-profile` is counted in S as well, so both messages
+      // name it alongside the perf-based inputs.
+      std::string Msg =
+          S > 1
+              ? "`--perfscript`, `--perfdata`, `--unsymbolized-profile` and "
+                "`--llvm-sample-profile` cannot be used together."
+              : "Perf input file is missing, please use one of `--perfscript`, "
+                "`--perfdata`, `--unsymbolized-profile` and "
+                "`--llvm-sample-profile` for the input.";
+      exitWithError(Msg);
+    }
+
+    // Only inputs that were actually given on the command line are checked.
+    auto CheckFileExists = [](bool H, StringRef File) {
+      if (H && !llvm::sys::fs::exists(File)) {
+        std::string Msg = "Input perf file(" + File.str() + ") doesn't exist.";
+        exitWithError(Msg);
+      }
+    };
+
+    CheckFileExists(HasPerfData, PerfDataFilename);
+    CheckFileExists(HasPerfScript, PerfScriptFilename);
+    CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename);
+    CheckFileExists(HasSampleProfile, SampleProfFilename);
+  }
+
+  if (!llvm::sys::fs::exists(BinaryPath)) {
+    std::string Msg = "Input binary(" + BinaryPath + ") doesn't exist.";
+    exitWithError(Msg);
+  }
+
+  if (CSProfileGenerator::MaxCompressionSize < -1) {
+    exitWithError("Value of --compress-recursion should >= -1");
+  }
+  if (ShowSourceLocations && !ShowDisassemblyOnly) {
+    exitWithError("--show-source-locations should work together with "
+                  "--show-disassembly-only!");
+  }
+}
+
+// Build the perf input descriptor from whichever perf-based input option was
+// given. Precedence: --perfdata, then --perfscript, then
+// --unsymbolized-profile. If none was given the returned object is left as
+// default-constructed.
+static PerfInputFile getPerfInputFile() {
+  PerfInputFile File;
+  if (PerfDataFilename.getNumOccurrences()) {
+    File.InputFile = PerfDataFilename;
+    File.Format = PerfFormat::PerfData;
+  } else if (PerfScriptFilename.getNumOccurrences()) {
+    File.InputFile = PerfScriptFilename;
+    File.Format = PerfFormat::PerfScript;
+  } else if (UnsymbolizedProfFilename.getNumOccurrences()) {
+    File.InputFile = UnsymbolizedProfFilename;
+    File.Format = PerfFormat::UnsymbolizedProfile;
+  }
+  return File;
+}
+
+// Tool entry point: parse and validate options, load the binary, then either
+// regenerate a profile from an existing LLVM sample profile or generate one
+// from perf input.
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  // Initialize targets and assembly printers/parsers.
+  InitializeAllTargetInfos();
+  InitializeAllTargetMCs();
+  InitializeAllDisassemblers();
+
+  cl::HideUnrelatedOptions({&ProfGenCategory, &getColorCategory()});
+  cl::ParseCommandLineOptions(argc, argv, "llvm SPGO profile generator\n");
+  validateCommandLine();
+
+  // Load symbols and disassemble the code of a given binary.
+  std::unique_ptr Binary =
+      std::make_unique(BinaryPath, DebugBinPath);
+  if (ShowDisassemblyOnly)
+    return EXIT_SUCCESS;
+
+  if (SampleProfFilename.getNumOccurrences()) {
+    LLVMContext Context;
+    auto FS = vfs::getRealFileSystem();
+    auto ReaderOrErr =
+        SampleProfileReader::create(SampleProfFilename, Context, *FS);
+    // The file is known to exist, but it may still be unreadable or not a
+    // sample profile; do not touch the reader unless creation succeeded.
+    if (auto EC = ReaderOrErr.getError()) {
+      std::string Msg = "Failed to open sample profile(" + SampleProfFilename +
+                        "): " + EC.message();
+      exitWithError(Msg);
+    }
+    std::unique_ptr Reader =
+        std::move(ReaderOrErr.get());
+    // A failed read would otherwise produce a profile from partial data.
+    if (auto EC = Reader->read()) {
+      std::string Msg = "Failed to read sample profile(" + SampleProfFilename +
+                        "): " + EC.message();
+      exitWithError(Msg);
+    }
+    std::unique_ptr Generator =
+        ProfileGeneratorBase::create(Binary.get(), Reader->getProfiles(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  } else {
+    // Only filter by process id when --pid was given explicitly.
+    std::optional PIDFilter;
+    if (ProcessId.getNumOccurrences())
+      PIDFilter = ProcessId;
+    PerfInputFile PerfFile = getPerfInputFile();
+    std::unique_ptr Reader =
+        PerfReaderBase::create(Binary.get(), PerfFile, PIDFilter);
+    // Parse perf events and samples
+    Reader->parsePerfTraces();
+
+    if (SkipSymbolization)
+      return EXIT_SUCCESS;
+
+    std::unique_ptr Generator =
+        ProfileGeneratorBase::create(Binary.get(), &Reader->getSampleCounters(),
+                                     Reader->profileIsCS());
+    Generator->generateProfile();
+    Generator->write();
+  }
+
+  return EXIT_SUCCESS;
+}