From ed3826e3f5adaba222b413b108956b06825294dd Mon Sep 17 00:00:00 2001
From: Subhankar Pal
Date: Sun, 4 Oct 2020 23:39:23 -0400
Subject: [PATCH] Add new demo example - parallel vector addition

---
 example/app/src/vector_add.cpp | 194 +++++++++++++++++++++++++++++++++
 example/model/params.h         |  18 ++--
 scripts/build-gem5.sh          |  24 ++---
 3 files changed, 218 insertions(+), 18 deletions(-)
 create mode 100644 example/app/src/vector_add.cpp

diff --git a/example/app/src/vector_add.cpp b/example/app/src/vector_add.cpp
new file mode 100644
index 0000000..13f6c42
--- /dev/null
+++ b/example/app/src/vector_add.cpp
@@ -0,0 +1,194 @@
+#include "params.h" // import parameters of target hardware as macros
+#include "util.h" // import primitive definitions
+#include <pthread.h>
+#include <stdio.h>
+#include <sys/mman.h>
+
+#if defined(AUTO_TRACING) || defined(MANUAL_TRACING)
+#include "hetsim_default_rt.h"
+#endif
+
+#define N 100000
+
+/**
+ * Worker thread entry point; the manager "spawns" one per worker PE with
+ * core-ids 1, 2, 3, ...
+ *
+ * Pops its work packet (a, b, c, start index, end index, barrier) from the
+ * work queue in the order main() pushes it, waits for the start signal, adds
+ * a[i] + b[i] into c[i] over the inclusive range [start_idx, end_idx], and
+ * then waits on the barrier shared with the manager.
+ *
+ * @param arg pointer to an unsigned holding this worker's core-id
+ * @return always NULL
+ */
+void *work(void *arg) {
+  unsigned tid = *(unsigned *)(arg);
+  __register_core_id(tid);
+#if defined(AUTO_TRACING) || defined(MANUAL_TRACING)
+  __open_trace_log(tid);
+#endif // AUTO_TRACING || MANUAL_TRACING
+
+  // retrieve variables from work queue (same order as pushed by the manager)
+  volatile float *a = (volatile float *)__pop(0);
+  volatile float *b = (volatile float *)__pop(0);
+  volatile float *c = (volatile float *)__pop(0);
+  int start_idx = (int)__pop(0);
+  int end_idx = (int)__pop(0);
+  pthread_barrier_t *bar = (pthread_barrier_t *)__pop(0);
+
+  // receive start signal; the popped value itself is ignored
+  __pop(0);
+
+  // perform actual computation; c[] is zeroed by the manager, so the
+  // accumulation leaves c[i] == a[i] + b[i]
+  for (int i = start_idx; i <= end_idx; ++i) {
+    c[i] += a[i] + b[i];
+  }
+
+  // synchronize with manager
+  __barrier_wait(bar);
+
+#if defined(AUTO_TRACING) || defined(MANUAL_TRACING)
+  __close_trace_log(tid);
+#endif // AUTO_TRACING || MANUAL_TRACING
+  return NULL;
+} // end of work()
+
+/**
+ * Manager (core-id 0): allocates the three N-element vectors, spawns
+ * NUM_WORKER worker threads, hands each a contiguous slice of the index
+ * range, times the parallel region, and verifies c[i] == a[i] + b[i].
+ *
+ * @return 1 if memory mapping or thread creation fails; 0 otherwise (the
+ *         pass/fail verdict of the check is reported on stdout only)
+ */
+int main() {
+  printf("== Vector Add Test with N = %d, NUM_WORKER = %d\n", N, NUM_WORKER);
+  __init_queues(WQ_DEPTH);
+  __register_core_id(0); // manager is assigned core-id 0
+
+  // main memory allocation
+  // in this example, we are working with 3 float arrays each of size N
+  // (anonymous mappings have no backing file, so fd is -1 for portability)
+  size_t RAM_SIZE_BYTES = 3 * N * sizeof(float);
+  char *ram = (char *)mmap((void *)(RAM_BASE_ADDR), RAM_SIZE_BYTES,
+                           PROT_READ | PROT_WRITE | PROT_EXEC,
+                           MAP_ANON | MAP_PRIVATE, -1, 0);
+  if (ram == (char *)MAP_FAILED) {
+    perror("mmap (main memory)");
+    __teardown_queues();
+    return 1;
+  }
+
+  // scratchpad memory allocation
+#ifdef EMULATION
+  // for emulation
+  char *dspm = (char *)mmap((void *)(SPM_BASE_ADDR), SPM_SIZE_BYTES,
+                            PROT_READ | PROT_WRITE | PROT_EXEC,
+                            MAP_ANON | MAP_PRIVATE, -1, 0);
+  if (dspm == (char *)MAP_FAILED) {
+    perror("mmap (scratchpad)");
+    munmap(ram, RAM_SIZE_BYTES);
+    __teardown_queues();
+    return 1;
+  }
+#else // !EMULATION
+  // the model uses physically-addressed scratchpad that does not need explicit allocation
+  char *dspm = (char *)SPM_BASE_ADDR;
+#endif // EMULATION
+
+  // allocate the vectors and populate them
+  float *a = (float *)(ram);
+  float *b = (float *)(ram + N * sizeof(float));
+  float *c = (float *)(ram + 2 * N * sizeof(float));
+  for (int i = 0; i < N; ++i) {
+    a[i] = float(i + 1);
+    b[i] = float(i + 1);
+    c[i] = 0.0;
+  }
+#if defined(AUTO_TRACING) || defined(MANUAL_TRACING)
+  __open_trace_log(0); // use core-id as argument
+#endif
+
+  // allocate barrier object for synchronization
+  pthread_barrier_t *bar = (pthread_barrier_t *)(dspm);
+  // initialize barrier with participants = 1 manager + NUM_WORKER workers
+  __barrier_init(bar, NUM_WORKER + 1);
+
+  // allocate thread objects for each "worker" PE
+  pthread_t *workers = new pthread_t[NUM_WORKER];
+
+  // create vector of core IDs to send to each thread
+  unsigned *tids = new unsigned[NUM_WORKER];
+  for (int i = 0; i < NUM_WORKER; ++i) {
+    tids[i] = i + 1;
+    // spawn worker thread; a missing worker would leave the barrier waiting
+    // forever, so bail out (returning from main ends the whole process)
+    if (pthread_create(workers + i, NULL, work, &tids[i]) != 0) {
+      fprintf(stderr, "failed to spawn worker %d\n", i + 1);
+      return 1;
+    }
+  }
+
+  // partition the work and push work "packets"
+  for (int i = 0; i < NUM_WORKER; ++i) {
+    // each worker is assigned floor(N / NUM_WORKER) elements
+    int n = N / NUM_WORKER;
+    int start_idx = i * n;
+    int end_idx = (i + 1) * n - 1;
+
+    // handle trailing elements by assigning to final worker
+    if (i == NUM_WORKER - 1) {
+      end_idx = N - 1;
+    }
+    // push through work queues
+    __push(i + 1, (uintptr_t)(a));
+    __push(i + 1, (uintptr_t)(b));
+    __push(i + 1, (uintptr_t)(c));
+    __push(i + 1, (unsigned)(start_idx));
+    __push(i + 1, (unsigned)(end_idx));
+    __push(i + 1, (uintptr_t)(bar));
+  }
+// ----- ROI begin -----
+  __reset_stats(); // begin recording time here
+  for (int i = 0; i < NUM_WORKER; ++i) {
+    __push(i + 1, 0); // start signal, value is ignored
+  }
+  __barrier_wait(bar); // synchronize with worker threads
+
+  __dump_reset_stats(); // end recording time here
+// ----- ROI end -----
+#if defined(AUTO_TRACING) || defined(MANUAL_TRACING)
+  __close_trace_log(0);
+#endif // AUTO_TRACING || MANUAL_TRACING
+
+  // join with all threads
+  for (int tid = 0; tid < NUM_WORKER; ++tid) {
+    pthread_join(workers[tid], NULL);
+  }
+
+  // result checking
+  bool pass = true;
+  for (int i = 0; i < N; ++i) {
+    if (c[i] != a[i] + b[i]) {
+      pass = false;
+      printf("[FAILED] C[%d] = %f (exp = %f)\n", i, c[i], a[i] + b[i]);
+      break;
+    }
+  }
+
+  // clean up
+#ifdef EMULATION
+  munmap(dspm, SPM_SIZE_BYTES);
+#endif // EMULATION
+  munmap(ram, RAM_SIZE_BYTES);
+  delete[] workers;
+  delete[] tids;
+  __teardown_queues();
+
+  if (pass)
+    printf("== Test Passed ==\n");
+  else
+    printf("== Test Failed ==\n");
+  return 0;
+} // end of main()
diff --git a/example/model/params.h b/example/model/params.h
index 51b69ca..f1b7a22 100644
--- a/example/model/params.h
+++ b/example/model/params.h
@@ -31,12 +31,14 @@
  * HetSim primitives
  * @author Subhankar Pal
  */
-#define NUM_WORKER 8
-#define WQ_DEPTH 4
-#define MAX_OUTSTANDING_REQS 1
-#define CLOCK_SPEED_GHZ 1
-#define SPM_SIZE_BYTES 4096
+#define NUM_WORKER 8
+#define WQ_DEPTH 4
+#define MAX_OUTSTANDING_REQS 1
+#define CLOCK_SPEED_GHZ 1
+#define SPM_SIZE_BYTES 4096
+#define PAGE_SIZE_BYTES 4096
 
-#define WQ_POP_ADDR 0xE0100000
-#define WQ_PUSH_BASE_ADDR 0xE0100004
-#define SPM_BASE_ADDR 0xE0101000
+#define WQ_POP_ADDR 0xE0100000
+#define WQ_PUSH_BASE_ADDR 0xE0100004
+#define SPM_BASE_ADDR 0xE0101000
+#define RAM_BASE_ADDR 0x40000000
diff --git a/scripts/build-gem5.sh b/scripts/build-gem5.sh
index ac362c9..589934a 100755
--- a/scripts/build-gem5.sh
+++ b/scripts/build-gem5.sh
@@ -8,28 +8,32 @@
 FLAVOR=opt
 ROOTDIR=`pwd`/..
 CPUTYPE=TimingSimpleCPU
+VERBOSE=${VERBOSE:-1}
 
 source $ROOTDIR/scripts/init.sh
 
 cd $ROOTDIR/gem5
-info "Starting gem5 build for $CPUTYPE at `pwd` with ${CPUTYPE}..."
-scons build/ARM/gem5.$FLAVOR CPU_MODELS=$CPUTYPE -j`nproc`
+info "Starting gem5 build for $CPUTYPE"
+scons_cmd="scons build/ARM/gem5.$FLAVOR CPU_MODELS=$CPUTYPE -j`nproc`"
+if [ "$VERBOSE" -eq 0 ]; then
+  eval "$scons_cmd" > /dev/null 2>&1
+else
+  eval "$scons_cmd"
+fi
 if [ $? -eq 0 ]; then
-  info "gem5 build succeeded!"
+  info "gem5 build succeeded"
 else
-  warn "gem5 build failed!"
+  warn "gem5 build failed"
   exit 1
 fi
 
-info "Compiling m5op for Arm..."
-cd util/m5
-make -f Makefile.thumb
-
-info "Compiling m5threads library..."
-cd ../../../m5threads/tests
+info "Compiling m5threads library"
+cd $ROOTDIR/m5threads/tests
 make ../pthread.o
 if [ $? -gt 0 ] ; then
   warn "m5threads compilation failed"
   exit
 fi
+
+info "build-gem5.sh successfully exiting"
 exit 0