Skip to content

Commit

Permalink
Merge pull request #48 from ucb-bar/lbr
Browse files Browse the repository at this point in the history
Last Branch Record Performance Monitoring
  • Loading branch information
T-K-233 authored Dec 26, 2024
2 parents 2bc02cf + 2e2fa04 commit a382f71
Show file tree
Hide file tree
Showing 20 changed files with 532 additions and 30 deletions.
16 changes: 15 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ option(BAREMETAL_BUILD_X86 "Build for x86 platform" OF
# Platform selection.
option(CHIP "Build for a specific platform" OFF)

# Instrumentation / optimisation switches; every one defaults to OFF.
option(PROF_COV "Build with profiling and coverage" OFF)
option(GCNO_ONLY "Only build gcno files" OFF)
option(USE_PGO "Build with profile guided optimization" OFF)

#################################
Expand All @@ -49,7 +50,7 @@ set(CMAKE_EXECUTABLE_SUFFIX ".elf")
set(ARCH "rv64imafd")
set(ABI "lp64d")
set(CMODEL "medany")
set(ARCH_FLAGS --verbose -march=${ARCH} -mabi=${ABI} -mcmodel=${CMODEL})
set(ARCH_FLAGS -march=${ARCH} -mabi=${ABI} -mcmodel=${CMODEL})

# spec
set(SPECS "nosys.specs")
Expand Down Expand Up @@ -81,9 +82,15 @@ if (PROF_COV)
add_compile_options(-fprofile-arcs -ftest-coverage)
endif()

# GCNO_ONLY: pass only -ftest-coverage, without the -fprofile-arcs
# instrumentation that PROF_COV adds.
if(GCNO_ONLY)
  message(STATUS "Building only gcno files")
  add_compile_options(-ftest-coverage)
endif()

# USE_PGO: compile with -fprofile-use.
if(USE_PGO)
  add_compile_options(-fprofile-use)
endif()

# Always applied, independent of the options above.
add_compile_options(-fno-builtin)

# add_compile_options(-ffunction-sections -fdata-sections -fno-common -fno-builtin-printf -fno-pie)
# add_compile_options(-Wall -Wextra -Warray-bounds -Wno-unused-parameter -Wcast-qual)
Expand All @@ -97,12 +104,15 @@ add_link_options(-nostartfiles)
add_link_options(${ARCH_FLAGS})
add_link_options(${SPEC_FLAGS})
add_link_options(-T ${LINKER_SCRIPT})
# add_link_options(-rtlib=compiler-rt)
# add_link_options(-Wl,--rtlib-path=/scratch/iansseijelly/riscv-llvm-install/lib/linux/libclang_rt.builtins-riscv64.a)
# Coverage builds need the gcov runtime at link time. It is registered as a
# link *library* rather than a link *option*: options are placed with the
# linker flags ahead of the object files, where a static libgcov would be
# scanned before any object references it. link_libraries() puts it in the
# library section of the link line and, like add_link_options(), applies to
# targets created after this point in this directory and below.
if(PROF_COV)
  link_libraries(gcov)
endif()




#################################
# Build
#################################
Expand All @@ -113,6 +123,10 @@ add_executable(app

target_include_directories(app PUBLIC app/include)

# add std lib path
# The toolchain prefix is configurable instead of being a developer-specific
# absolute path. It defaults to the RISCV environment variable and can be
# overridden with -DRISCV_TOOLCHAIN_ROOT=<prefix>.
# NOTE(review): assumes the environment exports RISCV as the prefix that
# contains riscv64-unknown-elf/ (as the Chipyard conda env does) - confirm.
set(RISCV_TOOLCHAIN_ROOT "$ENV{RISCV}" CACHE PATH
    "RISC-V toolchain install prefix (contains riscv64-unknown-elf/)")

if(RISCV_TOOLCHAIN_ROOT)
  include_directories("${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-elf/include")
  # link_directories() only affects targets created after this call.
  link_directories("${RISCV_TOOLCHAIN_ROOT}/riscv64-unknown-elf/lib")
else()
  message(STATUS "RISCV_TOOLCHAIN_ROOT is not set; using the compiler's default search paths")
endif()


#################################
# Dependencies
Expand Down
22 changes: 22 additions & 0 deletions driver/rocket-chip/l_trace_encoder/l_trace_encoder.c
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
#include "l_trace_encoder.h"

/**
 * Write `dma_addr` into the sink DMA's TR_SK_DMA_ADDR register.
 *
 * @param sink_dma  Register block of the sink DMA to configure.
 * @param dma_addr  Address value to program (presumably the start of the
 *                  buffer the DMA fills - confirm against the hardware spec).
 */
void l_trace_sink_dma_configure_addr(LTraceSinkDmaType *sink_dma, uint64_t dma_addr) {
  sink_dma->TR_SK_DMA_ADDR = dma_addr;
}

/**
 * Flush the trace sink DMA and hex-dump the bytes it reports.
 *
 * Writes 1 to TR_SK_DMA_FLUSH, busy-waits until TR_SK_DMA_FLUSH_DONE reads
 * non-zero, then reads TR_SK_DMA_COUNT and prints that many bytes of `buffer`
 * as two-digit hex values followed by a newline.
 *
 * @param sink_dma  Register block of the sink DMA to flush.
 * @param buffer    Memory to dump; must be readable for TR_SK_DMA_COUNT bytes.
 *                  Presumably the region previously programmed with
 *                  l_trace_sink_dma_configure_addr() - confirm with callers.
 */
void l_trace_sink_dma_read(LTraceSinkDmaType *sink_dma, uint8_t *buffer) {
  /* Go through a volatile view so every register access is really performed;
     otherwise the compiler may read FLUSH_DONE once and spin forever. */
  volatile LTraceSinkDmaType *regs = sink_dma;

  regs->TR_SK_DMA_FLUSH = 1;
  while (regs->TR_SK_DMA_FLUSH_DONE == 0) {
    /* spin until the hardware signals completion */
  }

  uint64_t count = regs->TR_SK_DMA_COUNT;
  /* count is unsigned: print it with an unsigned conversion. */
  printf("[l_trace_sink_dma_read] count: %llu\n", (unsigned long long)count);

  /* The index must be as wide as count: a uint8_t index wraps at 256 and the
     loop would never terminate for larger transfers. */
  for (uint64_t i = 0; i < count; i++) {
    printf("%02x ", buffer[i]);
  }
  printf("\n");
}

/*
 * Start the given trace encoder by applying SET_BITS to its TR_TE_CTRL
 * register with the mask 0x1 << 1 (bit 1).
 * NOTE(review): bit 1 is presumably the encoder-enable bit - confirm against
 * the hardware register description.
 */
void l_trace_encoder_start(LTraceEncoderType *encoder) {
SET_BITS(encoder->TR_TE_CTRL, 0x1 << 1);
}

/**
 * Write `target` into the encoder's TR_TE_TARGET register.
 *
 * @param encoder  Register block of the trace encoder to configure.
 * @param target   Value to write, e.g. TARGET_PRINT or TARGET_DMA. The register
 *                 is 32 bits wide, so only the low 32 bits are stored.
 */
void l_trace_encoder_configure_target(LTraceEncoderType *encoder, uint64_t target) {
  /* Spell out the narrowing that the assignment performs anyway. */
  uint32_t target_reg = (uint32_t)target;
  encoder->TR_TE_TARGET = target_reg;
}

/*
 * Stop the given trace encoder by applying CLEAR_BITS to its TR_TE_CTRL
 * register with the mask 0x1 << 1 (bit 1), the same bit that
 * l_trace_encoder_start() sets.
 */
void l_trace_encoder_stop(LTraceEncoderType *encoder) {
CLEAR_BITS(encoder->TR_TE_CTRL, 0x1 << 1);
}
Expand Down
29 changes: 26 additions & 3 deletions driver/rocket-chip/l_trace_encoder/l_trace_encoder.h
Original file line number Diff line number Diff line change
@@ -1,26 +1,49 @@
#ifndef __L_TRACE_ENCODER_H
#define __L_TRACE_ENCODER_H

#include <stdio.h>

#include "metal.h"
#include "rocketcore.h"

/**
 * Register block of one trace encoder, accessed through a pointer to its
 * memory-mapped base address.
 *
 * The fields are volatile so the compiler performs every read and write and
 * does not cache, merge or drop accesses to the hardware registers.
 */
typedef struct {
  volatile uint32_t TR_TE_CTRL;   /* control register; bit 1 is set/cleared by start/stop */
  volatile uint32_t TR_TE_TARGET; /* trace target selector (TARGET_PRINT / TARGET_DMA) */
} LTraceEncoderType;

/**
 * Register block of one trace sink DMA, accessed through a pointer to its
 * memory-mapped base address.
 *
 * The fields are volatile so the compiler performs every read and write; this
 * matters in particular for TR_SK_DMA_FLUSH_DONE, which is polled in a loop.
 */
typedef struct {
  volatile uint32_t TR_SK_DMA_FLUSH;      /* written with 1 to request a flush */
  volatile uint32_t TR_SK_DMA_FLUSH_DONE; /* polled until non-zero after a flush request */
  volatile uint64_t TR_SK_DMA_ADDR;       /* address programmed by l_trace_sink_dma_configure_addr() */
  volatile uint64_t TR_SK_DMA_COUNT;      /* number of bytes dumped by l_trace_sink_dma_read() */
} LTraceSinkDmaType;

/* Values accepted by l_trace_encoder_configure_target(). */
#define TARGET_PRINT 0x0
#define TARGET_DMA 0x1
/* Base address of the trace encoder register blocks. */
#define L_TRACE_ENCODER_BASE_ADDRESS 0x3000000

/* Encoder instances, spaced 0x1000 bytes apart (same stride as l_trace_encoder_get()). */
#define L_TRACE_ENCODER0 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x0000))
#define L_TRACE_ENCODER1 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x1000))
#define L_TRACE_ENCODER2 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x2000))
#define L_TRACE_ENCODER3 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x3000))

/* Base address of the trace sink DMA register blocks; instances are spaced 0x1000 bytes apart. */
#define L_TRACE_SINK_DMA_BASE_ADDRESS 0x3010000
#define L_TRACE_SINK_DMA0 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x0000))
#define L_TRACE_SINK_DMA1 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x1000))
#define L_TRACE_SINK_DMA2 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x2000))
#define L_TRACE_SINK_DMA3 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x3000))

/**
 * Return the register block of the trace encoder that belongs to `hart_id`.
 *
 * Encoder blocks start at L_TRACE_ENCODER_BASE_ADDRESS and are 0x1000 bytes
 * apart, one per hart.
 */
static inline LTraceEncoderType *l_trace_encoder_get(uint32_t hart_id) {
  uint32_t block_offset = hart_id * 0x1000;
  return (LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + block_offset);
}

void l_trace_encoder_start(LTraceEncoderType *encoder);
/**
 * Return the register block of the trace sink DMA that belongs to `hart_id`.
 *
 * Sink DMA blocks start at L_TRACE_SINK_DMA_BASE_ADDRESS and are 0x1000 bytes
 * apart, one per hart.
 */
static inline LTraceSinkDmaType *l_trace_sink_dma_get(uint32_t hart_id) {
  uint32_t block_offset = hart_id * 0x1000;
  return (LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + block_offset);
}

void l_trace_encoder_start(LTraceEncoderType *encoder);
void l_trace_encoder_stop(LTraceEncoderType *encoder);

#endif /* __L_TRACE_ENCODER_H */
void l_trace_encoder_configure_target(LTraceEncoderType *encoder, uint64_t target);
void l_trace_sink_dma_configure_addr(LTraceSinkDmaType *sink_dma, uint64_t dma_addr);
void l_trace_sink_dma_read(LTraceSinkDmaType *sink_dma, uint8_t *buffer);
#endif /* __L_TRACE_ENCODER_H */
38 changes: 38 additions & 0 deletions driver/rocket-chip/rocketcore/riscv_encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,25 @@
#define CSR_MHPMCOUNTER29H 0xb9d
#define CSR_MHPMCOUNTER30H 0xb9e
#define CSR_MHPMCOUNTER31H 0xb9f
// Custom CSRs for the Last Branch Record (LBR) feature: one control register,
// one "num" register (presumably the number of valid records - confirm), and
// eight source / eight destination registers (SRC0..7 at 0x403-0x40a,
// DST0..7 at 0x40b-0x412).
#define CSR_LBR_CTRL 0x401
#define CSR_LBR_NUM 0x402
#define CSR_LBR_SRC0 0x403
#define CSR_LBR_SRC1 0x404
#define CSR_LBR_SRC2 0x405
#define CSR_LBR_SRC3 0x406
#define CSR_LBR_SRC4 0x407
#define CSR_LBR_SRC5 0x408
#define CSR_LBR_SRC6 0x409
#define CSR_LBR_SRC7 0x40a
#define CSR_LBR_DST0 0x40b
#define CSR_LBR_DST1 0x40c
#define CSR_LBR_DST2 0x40d
#define CSR_LBR_DST3 0x40e
#define CSR_LBR_DST4 0x40f
#define CSR_LBR_DST5 0x410
#define CSR_LBR_DST6 0x411
#define CSR_LBR_DST7 0x412
#define CAUSE_MISALIGNED_FETCH 0x0
#define CAUSE_FAULT_FETCH 0x1
#define CAUSE_ILLEGAL_INSTRUCTION 0x2
Expand All @@ -1024,6 +1043,7 @@
#define CAUSE_SUPERVISOR_ECALL 0x9
#define CAUSE_HYPERVISOR_ECALL 0xa
#define CAUSE_MACHINE_ECALL 0xb

#endif /* __RV_ENCODING_H */

#ifdef DECLARE_INSN
Expand Down Expand Up @@ -1454,6 +1474,24 @@ DECLARE_CSR(mhpmcounter28h, CSR_MHPMCOUNTER28H)
DECLARE_CSR(mhpmcounter29h, CSR_MHPMCOUNTER29H)
DECLARE_CSR(mhpmcounter30h, CSR_MHPMCOUNTER30H)
DECLARE_CSR(mhpmcounter31h, CSR_MHPMCOUNTER31H)
DECLARE_CSR(lbr_ctrl, CSR_LBR_CTRL)
DECLARE_CSR(lbr_num, CSR_LBR_NUM)
DECLARE_CSR(lbr_src0, CSR_LBR_SRC0)
DECLARE_CSR(lbr_src1, CSR_LBR_SRC1)
DECLARE_CSR(lbr_src2, CSR_LBR_SRC2)
DECLARE_CSR(lbr_src3, CSR_LBR_SRC3)
DECLARE_CSR(lbr_src4, CSR_LBR_SRC4)
DECLARE_CSR(lbr_src5, CSR_LBR_SRC5)
DECLARE_CSR(lbr_src6, CSR_LBR_SRC6)
DECLARE_CSR(lbr_src7, CSR_LBR_SRC7)
DECLARE_CSR(lbr_dst0, CSR_LBR_DST0)
DECLARE_CSR(lbr_dst1, CSR_LBR_DST1)
DECLARE_CSR(lbr_dst2, CSR_LBR_DST2)
DECLARE_CSR(lbr_dst3, CSR_LBR_DST3)
DECLARE_CSR(lbr_dst4, CSR_LBR_DST4)
DECLARE_CSR(lbr_dst5, CSR_LBR_DST5)
DECLARE_CSR(lbr_dst6, CSR_LBR_DST6)
DECLARE_CSR(lbr_dst7, CSR_LBR_DST7)
#endif

#ifdef DECLARE_CAUSE
Expand Down
14 changes: 14 additions & 0 deletions examples/embench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@
# Define the list of benchmarks
# Trace-encoder instrumentation for the benchmarks. Either option defines
# USE_L_TRACE; the second macro selects the trace target (RTL print or DMA).
option(EMBENCH_ENABLE_TRACE_PRINT "Enable trace print for benchmarks" OFF)
option(EMBENCH_ENABLE_TRACE_DMA "Enable trace dma for benchmarks" OFF)

# The encoder target is configured for DMA first and print second, so with
# both options ON the print target takes effect while the DMA buffer is still
# dumped at the end. Warn instead of failing so existing setups keep
# configuring.
if(EMBENCH_ENABLE_TRACE_PRINT AND EMBENCH_ENABLE_TRACE_DMA)
  message(WARNING
    "EMBENCH_ENABLE_TRACE_PRINT and EMBENCH_ENABLE_TRACE_DMA are both ON; "
    "the trace encoder has a single target, enable only one of them")
endif()

if(EMBENCH_ENABLE_TRACE_PRINT)
  add_compile_definitions(USE_L_TRACE USE_L_TRACE_PRINT)
endif()

if(EMBENCH_ENABLE_TRACE_DMA)
  add_compile_definitions(USE_L_TRACE USE_L_TRACE_DMA)
endif()

set(BENCHMARKS
dummy
wikisort
Expand Down Expand Up @@ -29,4 +42,5 @@ foreach(benchmark ${BENCHMARKS})
if (PROF_COV)
target_link_libraries(${benchmark} PRIVATE gcov)
endif()
target_link_libraries(${benchmark} PRIVATE lbr)
endforeach()
80 changes: 62 additions & 18 deletions examples/embench/common/inc/trigger.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,62 @@

// use timer interrupt to trigger the profiler
// #define TIMER_INTERRUPT
// use PMU reading to report the profile
// #define USE_PMU_READING
// use LBR to report the trace
// #define USE_LBR
// report the total time of the benchmark
#define REPORT_TOTAL_TIME
// use trace encoder to report the trace
// use trace encoder RTL Print to report the trace
// #define USE_L_TRACE
// use trace encoder DMA to report the trace
// #define USE_L_TRACE_DMA
// use trace encoder RTL Print to report the trace
// #define USE_L_TRACE_PRINT

/* timer interrupt interval in milliseconds
only used when TIMER_INTERRUPT is defined
*/
#define TIMER_INTERRUPT_INTERVAL 1000
#define TIMER_INTERRUPT_INTERVAL 100

static inline void start_trigger(void) {
#ifdef USE_L_TRACE
LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id());
l_trace_encoder_start(encoder);
#endif
#ifdef USE_L_TRACE_DMA
static uint8_t dma_buffer[512 * 1024];
#endif

static inline void start_trigger(void) {
#ifdef REPORT_TOTAL_TIME
uint64_t curr_time = clint_get_time(CLINT);
printf("start trigger at %lld\n", curr_time);
#endif

#ifdef TIMER_INTERRUPT
LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id());
LTraceSinkDmaType *sink_dma = l_trace_sink_dma_get(get_hart_id());

#ifdef USE_L_TRACE_DMA
l_trace_sink_dma_configure_addr(sink_dma, (uint64_t)dma_buffer);
l_trace_encoder_configure_target(encoder, TARGET_DMA);
#endif

#ifdef USE_L_TRACE_PRINT
l_trace_encoder_configure_target(encoder, TARGET_PRINT);
#endif

#ifdef USE_L_TRACE
l_trace_encoder_start(encoder);
#endif

#ifdef USE_PMU_READING
PMU_EVENT_ENABLE(PMU_EVENT(1, LOAD_USE_INTERLOCK), 3);
PMU_COUNTER_RESET(3);
PMU_EVENT_ENABLE(PMU_EVENT(1, BRANCH_MISPREDICTION), 4);
PMU_COUNTER_RESET(4);
#endif

#ifdef USE_LBR
lbr_init();
#endif

#ifdef TIMER_INTERRUPT
enable_global_interrupt();
enable_timer_interrupt();
clint_set_timer_interrupt_target(CLINT, get_hart_id(), curr_time + TIMER_INTERRUPT_INTERVAL);
Expand All @@ -44,22 +73,32 @@ static inline void start_trigger(void) {
#ifdef TIMER_INTERRUPT
// override weak implementation
void machine_timer_interrupt_callback() {
PMU_INHIBIT_ENABLE(3);
PMU_INHIBIT_ENABLE(4);
#ifdef USE_PMU_READING
PMU_INHIBIT_ENABLE(3);
PMU_INHIBIT_ENABLE(4);
uint64_t curr_cycle = get_cycles();
printf("curr_cycle: %lld\n", curr_cycle);
uint64_t load_use_interlock_count = PMU_COUNTER_READ_CLEAR(3);
printf("load use interlock count: %lld\n", load_use_interlock_count);
uint64_t branch_misprediction_count = PMU_COUNTER_READ_CLEAR(4);
printf("branch misprediction count: %lld\n", branch_misprediction_count);
PMU_INHIBIT_DISABLE(3);
PMU_INHIBIT_DISABLE(4);
#endif

#ifdef USE_LBR
lbr_fetch_records();
#endif

uint64_t curr_time = clint_get_time(CLINT);
uint64_t curr_cycle = get_cycles();
printf("curr_cycle: %lld\n", curr_cycle);
uint64_t load_use_interlock_count = PMU_COUNTER_READ_CLEAR(3);
printf("load use interlock count: %lld\n", load_use_interlock_count);
uint64_t branch_misprediction_count = PMU_COUNTER_READ_CLEAR(4);
printf("branch misprediction count: %lld\n", branch_misprediction_count);
clint_set_timer_interrupt_target(CLINT, get_hart_id(), curr_time + TIMER_INTERRUPT_INTERVAL);
PMU_INHIBIT_DISABLE(3);
PMU_INHIBIT_DISABLE(4);
}
#endif

static inline void stop_trigger(void) {
#ifdef USE_LBR
lbr_dump_records();
#endif
#ifdef USE_L_TRACE
LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id());
l_trace_encoder_stop(encoder);
Expand All @@ -69,6 +108,11 @@ static inline void stop_trigger(void) {
int64_t curr_time = clint_get_time(CLINT);
printf("stop trigger at %lld\n", curr_time);
#endif

#ifdef USE_L_TRACE_DMA
LTraceSinkDmaType *sink_dma = l_trace_sink_dma_get(get_hart_id());
l_trace_sink_dma_read(sink_dma, dma_buffer);
#endif
}

#endif /* __TRIGGER_H */
10 changes: 7 additions & 3 deletions examples/pmu-tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ set (TESTS
pmu-test-load
pmu-test-store
pmu-test-inhibit
lbr-test
ltrace-dma-test
sort
)

foreach(test ${TESTS})
add_executable(${test} src/${test}.c)
target_link_libraries(${test} PRIVATE
-L${CMAKE_BINARY_DIR}/glossy -Wl,--whole-archive glossy -Wl,--no-whole-archive
target_link_libraries(${test}
PUBLIC l_trace_encoder
PRIVATE -L${CMAKE_BINARY_DIR}/glossy -Wl,--whole-archive glossy -Wl,--no-whole-archive
)
endforeach()
endforeach()
Loading

0 comments on commit a382f71

Please sign in to comment.