From c5fe13a39840a6489d1417b9fd195b6157e515ef Mon Sep 17 00:00:00 2001 From: Lux Date: Thu, 21 Nov 2024 18:53:30 -0800 Subject: [PATCH 1/9] ADD: remove verbose compiler flag --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c3d7fa..c092e2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ set(CMAKE_EXECUTABLE_SUFFIX ".elf") set(ARCH "rv64imafd") set(ABI "lp64d") set(CMODEL "medany") -set(ARCH_FLAGS --verbose -march=${ARCH} -mabi=${ABI} -mcmodel=${CMODEL}) +set(ARCH_FLAGS -march=${ARCH} -mabi=${ABI} -mcmodel=${CMODEL}) # spec set(SPECS "nosys.specs") From 963e09fa2505df0a094a7c678c6e6bf5b7a09674 Mon Sep 17 00:00:00 2001 From: Lux Date: Thu, 21 Nov 2024 18:53:56 -0800 Subject: [PATCH 2/9] ADD: test code for using and reading the lbr csrs --- .../rocket-chip/rocketcore/riscv_encoding.h | 38 +++++++++++++ examples/pmu-tests/CMakeLists.txt | 1 + examples/pmu-tests/src/lbr-test.c | 57 +++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 examples/pmu-tests/src/lbr-test.c diff --git a/driver/rocket-chip/rocketcore/riscv_encoding.h b/driver/rocket-chip/rocketcore/riscv_encoding.h index 1403a78..2418734 100644 --- a/driver/rocket-chip/rocketcore/riscv_encoding.h +++ b/driver/rocket-chip/rocketcore/riscv_encoding.h @@ -1012,6 +1012,25 @@ #define CSR_MHPMCOUNTER29H 0xb9d #define CSR_MHPMCOUNTER30H 0xb9e #define CSR_MHPMCOUNTER31H 0xb9f +// custom csr for LBR functionalities +#define CSR_LBR_CTRL 0x401 +#define CSR_LBR_NUM 0x402 +#define CSR_LBR_SRC0 0x403 +#define CSR_LBR_SRC1 0x404 +#define CSR_LBR_SRC2 0x405 +#define CSR_LBR_SRC3 0x406 +#define CSR_LBR_SRC4 0x407 +#define CSR_LBR_SRC5 0x408 +#define CSR_LBR_SRC6 0x409 +#define CSR_LBR_SRC7 0x40a +#define CSR_LBR_DST0 0x40b +#define CSR_LBR_DST1 0x40c +#define CSR_LBR_DST2 0x40d +#define CSR_LBR_DST3 0x40e +#define CSR_LBR_DST4 0x40f +#define CSR_LBR_DST5 0x410 +#define CSR_LBR_DST6 0x411 +#define CSR_LBR_DST7 0x412 
#define CAUSE_MISALIGNED_FETCH 0x0 #define CAUSE_FAULT_FETCH 0x1 #define CAUSE_ILLEGAL_INSTRUCTION 0x2 @@ -1024,6 +1043,7 @@ #define CAUSE_SUPERVISOR_ECALL 0x9 #define CAUSE_HYPERVISOR_ECALL 0xa #define CAUSE_MACHINE_ECALL 0xb + #endif /* __RV_ENCODING_H */ #ifdef DECLARE_INSN @@ -1454,6 +1474,24 @@ DECLARE_CSR(mhpmcounter28h, CSR_MHPMCOUNTER28H) DECLARE_CSR(mhpmcounter29h, CSR_MHPMCOUNTER29H) DECLARE_CSR(mhpmcounter30h, CSR_MHPMCOUNTER30H) DECLARE_CSR(mhpmcounter31h, CSR_MHPMCOUNTER31H) +DECLARE_CSR(lbr_ctrl, CSR_LBR_CTRL) +DECLARE_CSR(lbr_num, CSR_LBR_NUM) +DECLARE_CSR(lbr_src0, CSR_LBR_SRC0) +DECLARE_CSR(lbr_src1, CSR_LBR_SRC1) +DECLARE_CSR(lbr_src2, CSR_LBR_SRC2) +DECLARE_CSR(lbr_src3, CSR_LBR_SRC3) +DECLARE_CSR(lbr_src4, CSR_LBR_SRC4) +DECLARE_CSR(lbr_src5, CSR_LBR_SRC5) +DECLARE_CSR(lbr_src6, CSR_LBR_SRC6) +DECLARE_CSR(lbr_src7, CSR_LBR_SRC7) +DECLARE_CSR(lbr_dst0, CSR_LBR_DST0) +DECLARE_CSR(lbr_dst1, CSR_LBR_DST1) +DECLARE_CSR(lbr_dst2, CSR_LBR_DST2) +DECLARE_CSR(lbr_dst3, CSR_LBR_DST3) +DECLARE_CSR(lbr_dst4, CSR_LBR_DST4) +DECLARE_CSR(lbr_dst5, CSR_LBR_DST5) +DECLARE_CSR(lbr_dst6, CSR_LBR_DST6) +DECLARE_CSR(lbr_dst7, CSR_LBR_DST7) #endif #ifdef DECLARE_CAUSE diff --git a/examples/pmu-tests/CMakeLists.txt b/examples/pmu-tests/CMakeLists.txt index d4b0c6c..2c8aa6a 100644 --- a/examples/pmu-tests/CMakeLists.txt +++ b/examples/pmu-tests/CMakeLists.txt @@ -2,6 +2,7 @@ set (TESTS pmu-test-load pmu-test-store pmu-test-inhibit + lbr-test ) foreach(test ${TESTS}) diff --git a/examples/pmu-tests/src/lbr-test.c b/examples/pmu-tests/src/lbr-test.c new file mode 100644 index 0000000..47132cf --- /dev/null +++ b/examples/pmu-tests/src/lbr-test.c @@ -0,0 +1,57 @@ +#include +#include "riscv.h" +#include "riscv_encoding.h" +#include "pmu.h" +#define NUM_ITERS 4 +#define LBR_NUM 8 + +int main(int argc, char **argv) { + // enable LBR + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(1)); + volatile int c; + // do some dummy loops + for (int i = 0; i < NUM_ITERS; i++) { 
+ c += i; + } + // do some different loops + for (int i = 0; i < NUM_ITERS; i++) { + c -= i; + } + // do a jump calling a dummy function + volatile int d = dummy_function(c); + // disable LBR + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(0)); + // read LBR0 + uint64_t src, dst; + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC0)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST0)); + printf("LBR[0]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC1)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST1)); + printf("LBR[1]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC2)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST2)); + printf("LBR[2]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC3)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST3)); + printf("LBR[3]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC4)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST4)); + printf("LBR[4]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC5)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST5)); + printf("LBR[5]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC6)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST6)); + printf("LBR[6]: src = %lx, dst = %lx\n", src, dst); + asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC7)); + asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST7)); + printf("LBR[7]: src = %lx, dst = %lx\n", src, dst); + return 0; +} + +__attribute__((noinline)) +int dummy_function(int i) { + int j = i + 1; + return j; +} \ No newline at end of file From 2ab198e360e9213694e34a0c654a853afccae691 Mon Sep 17 00:00:00 2001 From: Lux Date: Mon, 25 Nov 2024 10:43:11 -0800 
Subject: [PATCH 3/9] ADD: baremetal perf LBR library support --- examples/embench/CMakeLists.txt | 1 + examples/embench/common/inc/trigger.h | 46 +++++++++++----- examples/pmu-tests/src/lbr-test.c | 5 +- glossy/src/trap/trap.S | 6 +++ lib/CMakeLists.txt | 3 +- lib/perf/CMakeLists.txt | 5 ++ lib/perf/lbr.c | 76 +++++++++++++++++++++++++++ lib/perf/lbr.h | 19 +++++++ lib/perf/perf.h | 2 + 9 files changed, 146 insertions(+), 17 deletions(-) create mode 100644 lib/perf/CMakeLists.txt create mode 100644 lib/perf/lbr.c create mode 100644 lib/perf/lbr.h create mode 100644 lib/perf/perf.h diff --git a/examples/embench/CMakeLists.txt b/examples/embench/CMakeLists.txt index 20c6172..e176624 100644 --- a/examples/embench/CMakeLists.txt +++ b/examples/embench/CMakeLists.txt @@ -29,4 +29,5 @@ foreach(benchmark ${BENCHMARKS}) if (PROF_COV) target_link_libraries(${benchmark} PRIVATE gcov) endif() + target_link_libraries(${benchmark} PRIVATE lbr) endforeach() diff --git a/examples/embench/common/inc/trigger.h b/examples/embench/common/inc/trigger.h index 0eb05bc..9829959 100644 --- a/examples/embench/common/inc/trigger.h +++ b/examples/embench/common/inc/trigger.h @@ -7,7 +7,11 @@ #include "pmu.h" // use timer interrupt to trigger the profiler -// #define TIMER_INTERRUPT +#define TIMER_INTERRUPT +// use PMU reading to report the profile +// #define USE_PMU_READING +// use LBR to report the trace +#define USE_LBR // report the total time of the benchmark #define REPORT_TOTAL_TIME // use trace encoder to report the trace @@ -16,7 +20,7 @@ /* timer interrupt interval in milliseconds only used when TIMER_INTERRUPT is defined */ -#define TIMER_INTERRUPT_INTERVAL 1000 +#define TIMER_INTERRUPT_INTERVAL 100 static inline void start_trigger(void) { #ifdef USE_L_TRACE @@ -29,12 +33,18 @@ static inline void start_trigger(void) { printf("start trigger at %lld\n", curr_time); #endif - #ifdef TIMER_INTERRUPT + #ifdef USE_PMU_READING PMU_EVENT_ENABLE(PMU_EVENT(1, LOAD_USE_INTERLOCK), 3); 
PMU_COUNTER_RESET(3); PMU_EVENT_ENABLE(PMU_EVENT(1, BRANCH_MISPREDICTION), 4); PMU_COUNTER_RESET(4); + #endif + #ifdef USE_LBR + lbr_init(); + #endif + + #ifdef TIMER_INTERRUPT enable_global_interrupt(); enable_timer_interrupt(); clint_set_timer_interrupt_target(CLINT, get_hart_id(), curr_time + TIMER_INTERRUPT_INTERVAL); @@ -44,22 +54,32 @@ static inline void start_trigger(void) { #ifdef TIMER_INTERRUPT // override weak implementation void machine_timer_interrupt_callback() { - PMU_INHIBIT_ENABLE(3); - PMU_INHIBIT_ENABLE(4); + #ifdef USE_PMU_READING + PMU_INHIBIT_ENABLE(3); + PMU_INHIBIT_ENABLE(4); + uint64_t curr_cycle = get_cycles(); + printf("curr_cycle: %lld\n", curr_cycle); + uint64_t load_use_interlock_count = PMU_COUNTER_READ_CLEAR(3); + printf("load use interlock count: %lld\n", load_use_interlock_count); + uint64_t branch_misprediction_count = PMU_COUNTER_READ_CLEAR(4); + printf("branch misprediction count: %lld\n", branch_misprediction_count); + PMU_INHIBIT_DISABLE(3); + PMU_INHIBIT_DISABLE(4); + #endif + + #ifdef USE_LBR + lbr_fetch_records(); + #endif + uint64_t curr_time = clint_get_time(CLINT); - uint64_t curr_cycle = get_cycles(); - printf("curr_cycle: %lld\n", curr_cycle); - uint64_t load_use_interlock_count = PMU_COUNTER_READ_CLEAR(3); - printf("load use interlock count: %lld\n", load_use_interlock_count); - uint64_t branch_misprediction_count = PMU_COUNTER_READ_CLEAR(4); - printf("branch misprediction count: %lld\n", branch_misprediction_count); clint_set_timer_interrupt_target(CLINT, get_hart_id(), curr_time + TIMER_INTERRUPT_INTERVAL); - PMU_INHIBIT_DISABLE(3); - PMU_INHIBIT_DISABLE(4); } #endif static inline void stop_trigger(void) { + #ifdef USE_LBR + lbr_dump_records(); + #endif #ifdef USE_L_TRACE LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id()); l_trace_encoder_stop(encoder); diff --git a/examples/pmu-tests/src/lbr-test.c b/examples/pmu-tests/src/lbr-test.c index 47132cf..84b5be8 100644 --- a/examples/pmu-tests/src/lbr-test.c 
+++ b/examples/pmu-tests/src/lbr-test.c @@ -7,7 +7,7 @@ int main(int argc, char **argv) { // enable LBR - asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(1)); + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(3)); volatile int c; // do some dummy loops for (int i = 0; i < NUM_ITERS; i++) { @@ -20,8 +20,7 @@ int main(int argc, char **argv) { // do a jump calling a dummy function volatile int d = dummy_function(c); // disable LBR - asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(0)); - // read LBR0 + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(1)); uint64_t src, dst; asm volatile ("csrr %0, %1" : "=r"(src) : "n"(CSR_LBR_SRC0)); asm volatile ("csrr %0, %1" : "=r"(dst) : "n"(CSR_LBR_DST0)); diff --git a/glossy/src/trap/trap.S b/glossy/src/trap/trap.S index ec5a3ca..9fe28b1 100644 --- a/glossy/src/trap/trap.S +++ b/glossy/src/trap/trap.S @@ -43,6 +43,9 @@ trap_vector: SREG x30, 30*REGBYTES(sp) SREG x31, 31*REGBYTES(sp) + /* disable LBR during the trap */ + csrrci a0, 0x401, 0x02 + /* Invoke higher-level trap handler */ csrr a0, mepc csrr a1, mcause @@ -51,6 +54,9 @@ trap_vector: call trap_handler csrw mepc, a0 + /* restore LBR after the trap */ + csrrsi a0, 0x401, 0x02 + /* Remain in M-mode after return */ li t0, MSTATUS_MPP csrs mstatus, t0 diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index fe266da..f92cf1b 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1 +1,2 @@ -add_subdirectory(gcov) \ No newline at end of file +add_subdirectory(gcov) +add_subdirectory(perf) diff --git a/lib/perf/CMakeLists.txt b/lib/perf/CMakeLists.txt new file mode 100644 index 0000000..154226a --- /dev/null +++ b/lib/perf/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(lbr STATIC lbr.c) + +target_include_directories(lbr PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries(lbr PUBLIC rocketcore) \ No newline at end of file diff --git a/lib/perf/lbr.c b/lib/perf/lbr.c new file mode 100644 index 0000000..e9afb40 --- /dev/null +++ b/lib/perf/lbr.c 
@@ -0,0 +1,76 @@ +#include "lbr.h" + +void lbr_init() { + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(3)); + lbr_buffer_counter = 0; +} + +void lbr_fetch_records() { + uint64_t src0, dst0; + asm volatile ("csrr %0, %1" : "=r"(src0) : "n"(CSR_LBR_SRC0)); + asm volatile ("csrr %0, %1" : "=r"(dst0) : "n"(CSR_LBR_DST0)); + lbr_buffer[lbr_buffer_counter + 0] = src0; + lbr_buffer[lbr_buffer_counter + 1] = dst0; + lbr_buffer_counter += 2; + + uint64_t src1, dst1; + asm volatile ("csrr %0, %1" : "=r"(src1) : "n"(CSR_LBR_SRC1)); + asm volatile ("csrr %0, %1" : "=r"(dst1) : "n"(CSR_LBR_DST1)); + lbr_buffer[lbr_buffer_counter + 0] = src1; + lbr_buffer[lbr_buffer_counter + 1] = dst1; + lbr_buffer_counter += 2; + + uint64_t src2, dst2; + asm volatile ("csrr %0, %1" : "=r"(src2) : "n"(CSR_LBR_SRC2)); + asm volatile ("csrr %0, %1" : "=r"(dst2) : "n"(CSR_LBR_DST2)); + lbr_buffer[lbr_buffer_counter + 0] = src2; + lbr_buffer[lbr_buffer_counter + 1] = dst2; + lbr_buffer_counter += 2; + + uint64_t src3, dst3; + asm volatile ("csrr %0, %1" : "=r"(src3) : "n"(CSR_LBR_SRC3)); + asm volatile ("csrr %0, %1" : "=r"(dst3) : "n"(CSR_LBR_DST3)); + lbr_buffer[lbr_buffer_counter + 0] = src3; + lbr_buffer[lbr_buffer_counter + 1] = dst3; + lbr_buffer_counter += 2; + + uint64_t src4, dst4; + asm volatile ("csrr %0, %1" : "=r"(src4) : "n"(CSR_LBR_SRC4)); + asm volatile ("csrr %0, %1" : "=r"(dst4) : "n"(CSR_LBR_DST4)); + lbr_buffer[lbr_buffer_counter + 0] = src4; + lbr_buffer[lbr_buffer_counter + 1] = dst4; + lbr_buffer_counter += 2; + + uint64_t src5, dst5; + asm volatile ("csrr %0, %1" : "=r"(src5) : "n"(CSR_LBR_SRC5)); + asm volatile ("csrr %0, %1" : "=r"(dst5) : "n"(CSR_LBR_DST5)); + lbr_buffer[lbr_buffer_counter + 0] = src5; + lbr_buffer[lbr_buffer_counter + 1] = dst5; + lbr_buffer_counter += 2; + + uint64_t src6, dst6; + asm volatile ("csrr %0, %1" : "=r"(src6) : "n"(CSR_LBR_SRC6)); + asm volatile ("csrr %0, %1" : "=r"(dst6) : "n"(CSR_LBR_DST6)); + lbr_buffer[lbr_buffer_counter + 0] = 
src6; + lbr_buffer[lbr_buffer_counter + 1] = dst6; + lbr_buffer_counter += 2; + + uint64_t src7, dst7; + asm volatile ("csrr %0, %1" : "=r"(src7) : "n"(CSR_LBR_SRC7)); + asm volatile ("csrr %0, %1" : "=r"(dst7) : "n"(CSR_LBR_DST7)); + lbr_buffer[lbr_buffer_counter + 0] = src7; + lbr_buffer[lbr_buffer_counter + 1] = dst7; + lbr_buffer_counter += 2; +} + +void lbr_dump_records() { + lbr_stop(); + printf("LBR dump: %zu records\n", lbr_buffer_counter / 2); + for (size_t i = 0; i < lbr_buffer_counter; i += 2) { + printf("%lx,%lx\n", lbr_buffer[i], lbr_buffer[i + 1]); + } +} + +inline void lbr_stop() { + asm volatile ("csrw %0, %1" :: "n"(CSR_LBR_CTRL), "i"(0)); +} diff --git a/lib/perf/lbr.h b/lib/perf/lbr.h new file mode 100644 index 0000000..9110dab --- /dev/null +++ b/lib/perf/lbr.h @@ -0,0 +1,19 @@ +#ifndef __PERF_LBR_H +#define __PERF_LBR_H + +#include +#include +#include "riscv_encoding.h" +#include "pmu.h" + +#define LBR_BUFFER_SIZE 8192 +#define LBR_RECORD_NUM 8 + +static uint64_t lbr_buffer[LBR_BUFFER_SIZE]; +static size_t lbr_buffer_counter; + +void lbr_init(); +void lbr_fetch_records(); +void lbr_dump_records(); +void lbr_stop(); +#endif diff --git a/lib/perf/perf.h b/lib/perf/perf.h new file mode 100644 index 0000000..eb6c947 --- /dev/null +++ b/lib/perf/perf.h @@ -0,0 +1,2 @@ +#include "lbr.h" + From 2de3b1f72d1a94a2e05c974d67d78a44bdc867ee Mon Sep 17 00:00:00 2001 From: Lux Date: Mon, 25 Nov 2024 13:13:43 -0800 Subject: [PATCH 4/9] ADD: fix ordering bugs, now generated dumpable afdo --- scripts/gcov/dump_gcda.py | 1 - scripts/perf/dump_lbr.py | 53 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 scripts/perf/dump_lbr.py diff --git a/scripts/gcov/dump_gcda.py b/scripts/gcov/dump_gcda.py index 8117def..44d1b78 100755 --- a/scripts/gcov/dump_gcda.py +++ b/scripts/gcov/dump_gcda.py @@ -1,6 +1,5 @@ import re import os - import argparse # Regular expressions to match file path and hex data lines diff 
import re
import os
import argparse
from collections import OrderedDict

magic_line = re.compile(r"LBR dump: (\d+) records")
NUM_SAMPLES = 8  # every 8 lines form a sample


def parse_log(log_file_path):
    """Extract LBR branch and range counts from a simulation log.

    The log is scanned for a header line matching ``magic_line``
    ("LBR dump: <N> records"). The N lines that follow must each be
    "src,dst" in hex; 0x80000000 is subtracted from every value. Any other
    line (before, between or after dumps) is ignored.

    Records are grouped into samples of ``NUM_SAMPLES`` consecutive lines.
    Within a sample every (src, dst) pair is counted in the branch map, and
    every adjacent pair of records contributes one range
    ``(next_record.dst, this_record.src)`` to the range map.

    Args:
        log_file_path: path of the text log to read.

    Returns:
        ``(range_map, branch_map)``: two ``OrderedDict`` objects mapping a
        tuple to its occurrence count, in first-seen order.

    Raises:
        AssertionError: the announced record count is not a multiple of
            ``NUM_SAMPLES``.
        ValueError: the log ends before all announced records were read, or
            a record line is not valid hex.
    """
    range_map = OrderedDict()
    branch_map = OrderedDict()
    with open(log_file_path, "r") as log_file:
        for line in log_file:
            magic_line_match = magic_line.match(line)
            if not magic_line_match:
                # Not a dump header. Skipping it is what lets trailing log
                # output follow the dump without being read as records.
                continue

            num_records = int(magic_line_match.group(1))
            assert num_records % NUM_SAMPLES == 0

            # iteration assumes continuous records
            records = []
            for _ in range(num_records):
                record_line = next(log_file, None)
                if record_line is None:
                    raise ValueError(
                        f"log ended after {len(records)} of {num_records} LBR records"
                    )
                records.append(
                    tuple(int(field, 16) - 0x80000000
                          for field in record_line.strip().split(','))
                )

            # process the dump one sample at a time
            for start in range(0, num_records, NUM_SAMPLES):
                sample = records[start:start + NUM_SAMPLES]
                for pos, branch in enumerate(sample):
                    branch_map[branch] = branch_map.get(branch, 0) + 1
                    if pos != NUM_SAMPLES - 1:
                        # range is from the last branch dst to the current branch src
                        # NOTE(review): relies on entry k+1 being the branch
                        # taken just before entry k -- confirm against the RTL.
                        range_tuple = (sample[pos + 1][1], branch[0])
                        range_map[range_tuple] = range_map.get(range_tuple, 0) + 1
    return range_map, branch_map


def write_results(range_map, branch_map):
    """Write both maps to ``lbr_branch.txt`` in the current directory.

    Layout: the number of ranges, one "begin-end:count" line per range, a
    literal "0" line, the number of branches, then one "src->dst:count" line
    per branch. All numbers are written in decimal.
    """
    # NOTE(review): the fixed "0" line presumably stands for an empty middle
    # section of the consumer's format -- confirm with the tool reading this.
    with open("lbr_branch.txt", "w") as branch_file:
        branch_file.write(f"{len(range_map)}\n")
        for (begin, end), count in range_map.items():
            branch_file.write(f"{begin}-{end}:{count}\n")
        branch_file.write("0\n")
        branch_file.write(f"{len(branch_map)}\n")
        for (src, dst), count in branch_map.items():
            branch_file.write(f"{src}->{dst}:{count}\n")
parser.parse_args() + range_map, branch_map = parse_log(args.log_file) + write_results(range_map, branch_map) From 8d48c672a17db4638568c9b68c747db170231212 Mon Sep 17 00:00:00 2001 From: Lux Date: Mon, 9 Dec 2024 11:06:01 -0800 Subject: [PATCH 5/9] ADD: ltrace dma functionalities --- .../l_trace_encoder/l_trace_encoder.c | 22 ++++++++++ .../l_trace_encoder/l_trace_encoder.h | 27 ++++++++++-- examples/embench/common/inc/trigger.h | 44 ++++++++++++++----- examples/pmu-tests/CMakeLists.txt | 8 ++-- examples/pmu-tests/src/ltrace-dma-test.c | 36 +++++++++++++++ scripts/trace/dump_ltrace.py | 33 ++++++++++++++ 6 files changed, 154 insertions(+), 16 deletions(-) create mode 100644 examples/pmu-tests/src/ltrace-dma-test.c create mode 100644 scripts/trace/dump_ltrace.py diff --git a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.c b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.c index 6d7a34e..6791cd3 100644 --- a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.c +++ b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.c @@ -1,9 +1,31 @@ #include "l_trace_encoder.h" +void l_trace_sink_dma_configure_addr(LTraceSinkDmaType *sink_dma, uint64_t dma_addr) { + sink_dma->TR_SK_DMA_ADDR = dma_addr; +} + +void l_trace_sink_dma_read(LTraceSinkDmaType *sink_dma, uint8_t *buffer) { + sink_dma->TR_SK_DMA_FLUSH = 1; + while (sink_dma->TR_SK_DMA_FLUSH_DONE == 0) { + ; + } + // printf("[l_trace_sink_dma_read] flush done\n"); + uint64_t count = sink_dma->TR_SK_DMA_COUNT; + printf("[l_trace_sink_dma_read] count: %lld\n", count); + for (uint8_t i = 0; i < count; i++) { + printf("%02x ", buffer[i]); + } + printf("\n"); +} + void l_trace_encoder_start(LTraceEncoderType *encoder) { SET_BITS(encoder->TR_TE_CTRL, 0x1 << 1); } +void l_trace_encoder_configure_target(LTraceEncoderType *encoder, uint64_t target) { + encoder->TR_TE_TARGET = target; +} + void l_trace_encoder_stop(LTraceEncoderType *encoder) { CLEAR_BITS(encoder->TR_TE_CTRL, 0x1 << 1); } diff --git 
a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h index 37817cc..9ca8ecc 100644 --- a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h +++ b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h @@ -6,8 +6,18 @@ typedef struct { uint32_t TR_TE_CTRL; + uint32_t TR_TE_TARGET; } LTraceEncoderType; +typedef struct { + uint32_t TR_SK_DMA_FLUSH; + uint32_t TR_SK_DMA_FLUSH_DONE; + uint64_t TR_SK_DMA_ADDR; + uint64_t TR_SK_DMA_COUNT; +} LTraceSinkDmaType; + +#define TARGET_PRINT 0x0 +#define TARGET_DMA 0x1 #define L_TRACE_ENCODER_BASE_ADDRESS 0x3000000 #define L_TRACE_ENCODER0 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x0000)) @@ -15,12 +25,23 @@ typedef struct { #define L_TRACE_ENCODER2 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x2000)) #define L_TRACE_ENCODER3 ((LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + 0x3000)) +#define L_TRACE_SINK_DMA_BASE_ADDRESS 0x3010000 +#define L_TRACE_SINK_DMA0 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x0000)) +#define L_TRACE_SINK_DMA1 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x1000)) +#define L_TRACE_SINK_DMA2 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x2000)) +#define L_TRACE_SINK_DMA3 ((LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + 0x3000)) + static inline LTraceEncoderType *l_trace_encoder_get(uint32_t hart_id) { return (LTraceEncoderType *)(L_TRACE_ENCODER_BASE_ADDRESS + hart_id * 0x1000); } -void l_trace_encoder_start(LTraceEncoderType *encoder); +static inline LTraceSinkDmaType *l_trace_sink_dma_get(uint32_t hart_id) { + return (LTraceSinkDmaType *)(L_TRACE_SINK_DMA_BASE_ADDRESS + hart_id * 0x1000); +} +void l_trace_encoder_start(LTraceEncoderType *encoder); void l_trace_encoder_stop(LTraceEncoderType *encoder); - -#endif /* __L_TRACE_ENCODER_H */ \ No newline at end of file +void l_trace_encoder_configure_target(LTraceEncoderType *encoder, uint64_t target); +void 
l_trace_sink_dma_configure_addr(LTraceSinkDmaType *sink_dma, uint64_t dma_addr); +void l_trace_sink_dma_read(LTraceSinkDmaType *sink_dma, uint8_t *buffer); +#endif /* __L_TRACE_ENCODER_H */ diff --git a/examples/embench/common/inc/trigger.h b/examples/embench/common/inc/trigger.h index 9829959..976327b 100644 --- a/examples/embench/common/inc/trigger.h +++ b/examples/embench/common/inc/trigger.h @@ -7,32 +7,51 @@ #include "pmu.h" // use timer interrupt to trigger the profiler -#define TIMER_INTERRUPT +// #define TIMER_INTERRUPT // use PMU reading to report the profile -// #define USE_PMU_READING +#define USE_PMU_READING // use LBR to report the trace -#define USE_LBR +// #define USE_LBR // report the total time of the benchmark #define REPORT_TOTAL_TIME -// use trace encoder to report the trace -// #define USE_L_TRACE +// use trace encoder RTL Print to report the trace +#define USE_L_TRACE +// use trace encoder DMA to report the trace +#define USE_L_TRACE_DMA +// use trace encoder RTL Print to report the trace +// #define USE_L_TRACE_PRINT /* timer interrupt interval in milliseconds only used when TIMER_INTERRUPT is defined */ #define TIMER_INTERRUPT_INTERVAL 100 -static inline void start_trigger(void) { - #ifdef USE_L_TRACE - LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id()); - l_trace_encoder_start(encoder); - #endif +#ifdef USE_L_TRACE_DMA + static uint8_t dma_buffer[512 * 1024]; +#endif +static inline void start_trigger(void) { #ifdef REPORT_TOTAL_TIME uint64_t curr_time = clint_get_time(CLINT); printf("start trigger at %lld\n", curr_time); #endif + LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id()); + LTraceSinkDmaType *sink_dma = l_trace_sink_dma_get(get_hart_id()); + + #ifdef USE_L_TRACE_DMA + l_trace_sink_dma_configure_addr(sink_dma, (uint64_t)dma_buffer); + l_trace_encoder_configure_target(encoder, TARGET_DMA); + #endif + + #ifdef USE_L_TRACE_PRINT + l_trace_encoder_configure_target(encoder, TARGET_PRINT); + #endif + + #ifdef 
USE_L_TRACE + l_trace_encoder_start(encoder); + #endif + #ifdef USE_PMU_READING PMU_EVENT_ENABLE(PMU_EVENT(1, LOAD_USE_INTERLOCK), 3); PMU_COUNTER_RESET(3); @@ -89,6 +108,11 @@ static inline void stop_trigger(void) { int64_t curr_time = clint_get_time(CLINT); printf("stop trigger at %lld\n", curr_time); #endif + + #ifdef USE_L_TRACE_DMA + LTraceSinkDmaType *sink_dma = l_trace_sink_dma_get(get_hart_id()); + l_trace_sink_dma_read(sink_dma, dma_buffer); + #endif } #endif /* __TRIGGER_H */ \ No newline at end of file diff --git a/examples/pmu-tests/CMakeLists.txt b/examples/pmu-tests/CMakeLists.txt index 2c8aa6a..ea1665c 100644 --- a/examples/pmu-tests/CMakeLists.txt +++ b/examples/pmu-tests/CMakeLists.txt @@ -3,11 +3,13 @@ set (TESTS pmu-test-store pmu-test-inhibit lbr-test + ltrace-dma-test ) foreach(test ${TESTS}) add_executable(${test} src/${test}.c) - target_link_libraries(${test} PRIVATE - -L${CMAKE_BINARY_DIR}/glossy -Wl,--whole-archive glossy -Wl,--no-whole-archive + target_link_libraries(${test} + PUBLIC l_trace_encoder + PRIVATE -L${CMAKE_BINARY_DIR}/glossy -Wl,--whole-archive glossy -Wl,--no-whole-archive ) -endforeach() \ No newline at end of file +endforeach() diff --git a/examples/pmu-tests/src/ltrace-dma-test.c b/examples/pmu-tests/src/ltrace-dma-test.c new file mode 100644 index 0000000..e5cd4ad --- /dev/null +++ b/examples/pmu-tests/src/ltrace-dma-test.c @@ -0,0 +1,36 @@ +#include +#include "riscv.h" +#include "riscv_encoding.h" +#include "l_trace_encoder.h" + +#define NUM_ITERS 100 + +#define USE_L_TRACE_DMA + +__attribute__((aligned(64), section(".noinit"))) static volatile uint8_t dma_buffer[512 * 1024]; + +int main(int argc, char **argv) { + // 64 aligned + LTraceEncoderType *encoder = l_trace_encoder_get(get_hart_id()); + #ifdef USE_L_TRACE_DMA + LTraceSinkDmaType *sink_dma = l_trace_sink_dma_get(get_hart_id()); + l_trace_sink_dma_configure_addr(sink_dma, (uint64_t)dma_buffer); + l_trace_encoder_configure_target(encoder, TARGET_DMA); + #else + 
import re
import os
import argparse

# example log:
# [l_trace_sink_dma_read] count: 8
# 1e 04 00 00 84 1c 9d 0a


def parse_log(log_path):
    """Return the trace bytes printed by l_trace_sink_dma_read.

    Looks for the first line starting with
    "[l_trace_sink_dma_read] count: <N>" and splits the line after it on
    whitespace.

    Args:
        log_path: path of the text log to read.

    Returns:
        A list of N two-character hex strings, or ``None`` when the log
        contains no count line.

    Raises:
        AssertionError: the number of values on the data line differs from
            the announced count (including a log that ends right after the
            count line with a non-zero count).
    """
    with open(log_path, "r") as log_file:
        lines = log_file.readlines()

    for idx, line in enumerate(lines):
        count_match = re.match(r"\[l_trace_sink_dma_read\] count: (\d+)", line)
        if not count_match:
            continue
        count = int(count_match.group(1))
        # A truncated log may end on the count line; treat the missing data
        # line as empty so it is reported as a count mismatch below.
        target_line = lines[idx + 1].strip() if idx + 1 < len(lines) else ""
        split_line = target_line.split()
        assert len(split_line) == count, (
            f"expected {count} trace bytes, found {len(split_line)}"
        )
        return split_line
    return None


# write as raw hex dump
def write_results(split_line):
    """Write the parsed bytes, as raw binary, to ``ltrace_dram_dump.txt``.

    Args:
        split_line: list of hex strings as returned by ``parse_log``.

    Raises:
        ValueError: ``split_line`` is ``None`` (no dump was found in the
            log), or a value is not a hex number in the range 0..255.
    """
    if split_line is None:
        raise ValueError("no [l_trace_sink_dma_read] dump found in log")
    # Convert everything first so a bad value cannot leave a partial file.
    payload = bytes(int(hex_str, 16) for hex_str in split_line)
    with open("ltrace_dram_dump.txt", "wb") as dump_file:
        dump_file.write(payload)
parser.parse_args() + log_path = args.log_path + split_line = parse_log(log_path) + write_results(split_line) From 095c43d36317f724afa84da2a4ff0540f4f565d3 Mon Sep 17 00:00:00 2001 From: Lux Date: Fri, 13 Dec 2024 23:34:55 -0800 Subject: [PATCH 6/9] ADD: llvm cmakelist --- riscv-llvm.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/riscv-llvm.cmake b/riscv-llvm.cmake index 3555d7f..a64e48a 100644 --- a/riscv-llvm.cmake +++ b/riscv-llvm.cmake @@ -2,19 +2,19 @@ # RISCV Toolchain ################################# option(RISCV "Build for RISC-V" ON) -option(USE_LLVM "Use LLVM toolchain" ON) + set(CMAKE_SYSTEM_NAME "Generic" CACHE STRING "") set(CMAKE_SYSTEM_PROCESSOR "riscv" CACHE STRING "") set(TOOLCHAIN_PREFIX "riscv64-unknown-elf-") -set(MYRISCV "/scratch/iansseijelly/riscv-toolchain-build") +set(MYRISCV "/scratch/iansseijelly/riscv-llvm-install") set(CMAKE_AR "llvm-ar") set(CMAKE_ASM_COMPILER "${MYRISCV}/bin/clang") set(CMAKE_C_COMPILER "${MYRISCV}/bin/clang") set(CMAKE_CXX_COMPILER "${MYRISCV}/bin/clang++") -set(CMAKE_LINKER "lld") +set(CMAKE_LINKER "${MYRISCV}/bin/lld") set(CMAKE_OBJCOPY "llvm-objcopy") set(CMAKE_OBJDUMP "llvm-objdump") set(CMAKE_SIZE "llvm-size") From de22aba05fb617ee2d3f8b656135d3b25514594e Mon Sep 17 00:00:00 2001 From: Lux Date: Fri, 13 Dec 2024 23:35:22 -0800 Subject: [PATCH 7/9] ADD: new cmake options --- CMakeLists.txt | 14 ++++++++++++++ examples/embench/CMakeLists.txt | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c092e2b..3ed5422 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(BAREMETAL_BUILD_X86 "Build for x86 platform" OF option(CHIP "Build for a specific platform" OFF ) option(PROF_COV "Build with profiling and coverage" OFF ) +option(GCNO_ONLY "Only build gcno files" OFF ) option(USE_PGO "Build with profile guided optimization" OFF ) ################################# @@ -81,9 +82,15 @@ if (PROF_COV) 
add_compile_options(-fprofile-arcs -ftest-coverage) endif() +if (GCNO_ONLY) + message(STATUS "Building only gcno files") + add_compile_options(-ftest-coverage) +endif() + if (USE_PGO) add_compile_options(-fprofile-use) endif() +add_compile_options(-fno-builtin) # add_compile_options(-ffunction-sections -fdata-sections -fno-common -fno-builtin-printf -fno-pie) # add_compile_options(-Wall -Wextra -Warray-bounds -Wno-unused-parameter -Wcast-qual) @@ -97,12 +104,15 @@ add_link_options(-nostartfiles) add_link_options(${ARCH_FLAGS}) add_link_options(${SPEC_FLAGS}) add_link_options(-T ${LINKER_SCRIPT}) +# add_link_options(-rtlib=compiler-rt) +# add_link_options(-Wl,--rtlib-path=/scratch/iansseijelly/riscv-llvm-install/lib/linux/libclang_rt.builtins-riscv64.a) if (PROF_COV) add_link_options(-lgcov) endif() + ################################# # Build ################################# @@ -113,6 +123,10 @@ add_executable(app target_include_directories(app PUBLIC app/include) +# add std lib path +include_directories(/scratch/iansseijelly/chipyard/.conda-env/riscv-tools/riscv64-unknown-elf/include) +link_directories(/scratch/iansseijelly/chipyard/.conda-env/riscv-tools/riscv64-unknown-elf/lib) + ################################# # Dependencies diff --git a/examples/embench/CMakeLists.txt b/examples/embench/CMakeLists.txt index e176624..ebc37ad 100644 --- a/examples/embench/CMakeLists.txt +++ b/examples/embench/CMakeLists.txt @@ -1,4 +1,17 @@ # Define the list of benchmarks +option (EMBENCH_ENABLE_TRACE_PRINT "Enable trace print for benchmarks" OFF) +option (EMBENCH_ENABLE_TRACE_DMA "Enable trace dma for benchmarks" OFF) + +if (EMBENCH_ENABLE_TRACE_PRINT) + add_definitions(-DUSE_L_TRACE) + add_definitions(-DUSE_L_TRACE_PRINT) +endif() + +if (EMBENCH_ENABLE_TRACE_DMA) + add_definitions(-DUSE_L_TRACE) + add_definitions(-DUSE_L_TRACE_DMA) +endif() + set(BENCHMARKS dummy wikisort From 0636804b4a96f8340f2e0316bbdd8763d3bf2fe5 Mon Sep 17 00:00:00 2001 From: Lux Date: Fri, 13 Dec 2024 
23:35:44 -0800 Subject: [PATCH 8/9] FIX: minor fixes --- driver/rocket-chip/l_trace_encoder/l_trace_encoder.h | 2 ++ examples/embench/common/inc/trigger.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h index 9ca8ecc..98d0783 100644 --- a/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h +++ b/driver/rocket-chip/l_trace_encoder/l_trace_encoder.h @@ -1,6 +1,8 @@ #ifndef __L_TRACE_ENCODER_H #define __L_TRACE_ENCODER_H +#include + #include "metal.h" #include "rocketcore.h" diff --git a/examples/embench/common/inc/trigger.h b/examples/embench/common/inc/trigger.h index 976327b..ee58ff9 100644 --- a/examples/embench/common/inc/trigger.h +++ b/examples/embench/common/inc/trigger.h @@ -9,15 +9,15 @@ // use timer interrupt to trigger the profiler // #define TIMER_INTERRUPT // use PMU reading to report the profile -#define USE_PMU_READING +// #define USE_PMU_READING // use LBR to report the trace // #define USE_LBR // report the total time of the benchmark #define REPORT_TOTAL_TIME // use trace encoder RTL Print to report the trace -#define USE_L_TRACE +// #define USE_L_TRACE // use trace encoder DMA to report the trace -#define USE_L_TRACE_DMA +// #define USE_L_TRACE_DMA // use trace encoder RTL Print to report the trace // #define USE_L_TRACE_PRINT From 2e2fa049c13d2e487bb2c2852e14652ec444feee Mon Sep 17 00:00:00 2001 From: Lux Date: Fri, 13 Dec 2024 23:35:54 -0800 Subject: [PATCH 9/9] ADD: sort test case --- examples/pmu-tests/CMakeLists.txt | 3 +- examples/pmu-tests/src/sort.c | 57 +++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 examples/pmu-tests/src/sort.c diff --git a/examples/pmu-tests/CMakeLists.txt b/examples/pmu-tests/CMakeLists.txt index ea1665c..2404d0d 100644 --- a/examples/pmu-tests/CMakeLists.txt +++ b/examples/pmu-tests/CMakeLists.txt @@ -4,12 +4,13 @@ set (TESTS 
/* Hook run before the measured region; prints a greeting. */
static inline void start() {
  printf("hello world\n");
}

/* Hook run after the measured region; does nothing. */
static inline void stop() {
}

/*
 * Sort the first n elements of a into ascending order, in place.
 *
 * Plain bubble sort: sweep the array swapping adjacent out-of-order
 * elements, and repeat until one full sweep makes no swap. With n of 0 or 1
 * the single sweep has nothing to compare and the array is left untouched.
 */
void bubble_sort (uint32_t *a, uint32_t n) {
  uint32_t swapped;
  do {
    swapped = 0;
    for (uint32_t idx = 1; idx < n; idx++) {
      uint32_t *lo = &a[idx - 1];
      uint32_t *hi = &a[idx];
      if (*hi < *lo) {
        uint32_t tmp = *hi;
        *hi = *lo;
        *lo = tmp;
        swapped = 1;
      }
    }
  } while (swapped);
}