diff --git a/Examples/DMA.cpp b/Examples/DMA.cpp new file mode 100644 index 0000000..1ba39d5 --- /dev/null +++ b/Examples/DMA.cpp @@ -0,0 +1,51 @@ +#include "QPULib.h" + +using namespace QPULib; + +void dma(Ptr p) +{ + // Setup load of 16 vectors into VPM, starting at word address 0 + dmaSetReadPitch(64); + dmaSetupRead(HORIZ, 16, 0); + // Start loading from memory at address 'p' + dmaStartRead(p); + // Wait until load complete + dmaWaitRead(); + + // Setup load of 16 vectors from VPM, starting at vector address 0 + vpmSetupRead(HORIZ, 16, 0); + // Setup store to VPM, starting at vector address 16 + vpmSetupWrite(HORIZ, 16); + + // Read each vector, increment it, and write it back + for (int i = 0; i < 16; i++) + vpmPut(vpmGetInt() + 1); + + // Setup DMA store of 16 vectors from VPM, starting at VPM word address 256 + dmaSetupWrite(HORIZ, 16, 256); + // Start writing to memory at address 'p' + dmaStartWrite(p); + // Wait until store complete + dmaWaitWrite(); +} + +int main() +{ + // Construct kernel + auto k = compile(dma); + + // Allocate and initialise array shared between ARM and GPU + SharedArray array(256); + for (int i = 0; i < 256; i++) + array[i] = i; + + // Invoke the kernel and display the result + k(&array); + for (int i = 0; i < 16; i++) { + for (int j = 0; j < 16; j++) + printf("%i ", array[16*i + j]); + printf("\n"); + } + + return 0; +} diff --git a/Examples/OET.cpp b/Examples/OET.cpp index 14c9269..08d402c 100644 --- a/Examples/OET.cpp +++ b/Examples/OET.cpp @@ -6,11 +6,8 @@ using namespace QPULib; void oet(Ptr p) { - setReadStride(1); - setWriteStride(1); - Int evens = *p; - Int odds = *(p+1); + Int odds = *(p+16); For (Int count = 0, count < 16, count++) Int evens2 = min(evens, odds); @@ -31,8 +28,8 @@ void oet(Ptr p) odds = odds2; End - *p = evens; - *(p+1) = odds; + *p = evens; + *(p+16) = odds; } int main() @@ -48,7 +45,7 @@ int main() // Invoke the kernel and display the result k.call(&a); for (int i = 0; i < 32; i++) - printf("%i: %i\n", i, a[i]); + 
printf("%i: %i\n", i, (i & 1) ? a[16+(i>>1)] : a[i>>1]); return 0; } diff --git a/Lib/Common/Queue.h b/Lib/Common/Queue.h new file mode 100644 index 0000000..63312e4 --- /dev/null +++ b/Lib/Common/Queue.h @@ -0,0 +1,21 @@ +#ifndef _QUEUE_H_ +#define _QUEUE_H_ + +namespace QPULib { + + // Very simple queue containing N elements of type T + template struct Queue { + T elems[N+1]; + int front; + int back; + Queue() { front = back = 0; } + bool isEmpty() { return front == back; } + bool isFull() { return ((back+1)%(N+1)) == front; } + void enq(T elem) { elems[back] = elem; back = (back+1)%(N+1); } + T* first() { return &elems[front]; } + void deq() { front = (front+1)%(N+1); } + }; + +} + +#endif diff --git a/Lib/Kernel.cpp b/Lib/Kernel.cpp index d39e211..de7f568 100644 --- a/Lib/Kernel.cpp +++ b/Lib/Kernel.cpp @@ -9,6 +9,7 @@ #include "Target/Satisfy.h" #include "Target/LoadStore.h" #include "Target/Encode.h" +#include "Target/Pretty.h" namespace QPULib { diff --git a/Lib/Kernel.h b/Lib/Kernel.h index 08d7504..b87bb07 100644 --- a/Lib/Kernel.h +++ b/Lib/Kernel.h @@ -195,7 +195,7 @@ template struct Kernel { resetFreshLabelGen(); // Reserved general-purpose variables - Int qpuId, qpuCount, readStride, writeStride; + Int qpuId, qpuCount; qpuId = getUniformInt(); qpuCount = getUniformInt(); diff --git a/Lib/Source/Float.cpp b/Lib/Source/Float.cpp index a2db882..97fbe05 100644 --- a/Lib/Source/Float.cpp +++ b/Lib/Source/Float.cpp @@ -89,6 +89,15 @@ FloatExpr getUniformFloat() return mkFloatExpr(e); } +// Read vector from VPM +FloatExpr vpmGetFloat() +{ + Expr* e = mkExpr(); + e->tag = VAR; + e->var.tag = VPM_READ; + return mkFloatExpr(e); +} + // Add FloatExpr operator+(FloatExpr a, FloatExpr b) { return mkFloatApply(a, mkOp(ADD, FLOAT), b); } diff --git a/Lib/Source/Float.h b/Lib/Source/Float.h index b1c10e6..bae399a 100644 --- a/Lib/Source/Float.h +++ b/Lib/Source/Float.h @@ -52,6 +52,7 @@ struct Float { // 
============================================================================ FloatExpr getUniformFloat(); +FloatExpr vpmGetFloat(); FloatExpr operator+(FloatExpr a, FloatExpr b); FloatExpr operator-(FloatExpr a, FloatExpr b); diff --git a/Lib/Source/Int.cpp b/Lib/Source/Int.cpp index ff3db7a..27e3ac4 100644 --- a/Lib/Source/Int.cpp +++ b/Lib/Source/Int.cpp @@ -122,6 +122,15 @@ IntExpr numQPUs() return mkIntExpr(e); } +// Read vector from VPM +IntExpr vpmGetInt() +{ + Expr* e = mkExpr(); + e->tag = VAR; + e->var.tag = VPM_READ; + return mkIntExpr(e); +} + // Vector rotation IntExpr rotate(IntExpr a, IntExpr b) { return mkIntApply(a, mkOp(ROTATE, INT32), b); } diff --git a/Lib/Source/Int.h b/Lib/Source/Int.h index d6e16ae..d78e613 100644 --- a/Lib/Source/Int.h +++ b/Lib/Source/Int.h @@ -59,6 +59,7 @@ IntExpr getUniformInt(); IntExpr index(); IntExpr me(); IntExpr numQPUs(); +IntExpr vpmGetInt(); IntExpr rotate(IntExpr a, IntExpr b); FloatExpr rotate(FloatExpr a, IntExpr b); diff --git a/Lib/Source/Interpreter.cpp b/Lib/Source/Interpreter.cpp index 700e439..7f0b31b 100644 --- a/Lib/Source/Interpreter.cpp +++ b/Lib/Source/Interpreter.cpp @@ -46,6 +46,11 @@ Vec evalVar(CoreState* s, Var v) return x; } + // VPM read + case VPM_READ: + printf("QPULib: vpmGet() not supported by interpreter\n"); + break; + default: printf("QPULib: reading from write-only variable\n"); } @@ -145,7 +150,7 @@ Vec eval(CoreState* s, Expr* e) Vec v; for (int i = 0; i < NUM_LANES; i++) { v.elems[i].intVal = emuHeap[hp>>2]; - hp += 4*(s->readStride+1); + hp += s->readStride; } return v; } @@ -291,6 +296,11 @@ void assignToVar(CoreState* s, Vec cond, Var v, Vec x) return; } + // VPM write + case VPM_WRITE: + printf("QPULib: vpmPut() not supported by interpreter\n"); + break; + // Others are read-only case UNIFORM: case QPU_NUM: @@ -322,7 +332,7 @@ void execAssign(CoreState* s, Vec cond, Expr* lhs, Expr* rhs) int hp = index.elems[0].intVal; for (int i = 0; i < NUM_LANES; i++) { emuHeap[hp>>2] = 
val.elems[i].intVal; - hp += 4*(s->writeStride+1); + hp += 4 + s->writeStride; } return; } @@ -464,7 +474,7 @@ void execStoreRequest(CoreState* s, Expr* data, Expr* addr) { int hp = index.elems[0].intVal; for (int i = 0; i < NUM_LANES; i++) { emuHeap[hp>>2] = val.elems[i].intVal; - hp += 4*(s->writeStride+1); + hp += 4 + s->writeStride; } } @@ -565,8 +575,19 @@ void exec(InterpreterState* state, CoreState* s) else state->sema[stmt->semaId]--; return; - // Flush outstanding stores - case FLUSH: return; + case DMA_READ_WAIT: + case DMA_WRITE_WAIT: + case SETUP_VPM_READ: + case SETUP_VPM_WRITE: + case SETUP_DMA_READ: + case SETUP_DMA_WRITE: + // Interpreter ignores these + return; + + case DMA_START_READ: + case DMA_START_WRITE: + printf("QPULib: DMA access not supported by interpreter\n"); + break; } // Unreachable diff --git a/Lib/Source/Pretty.cpp b/Lib/Source/Pretty.cpp index 8f1c4ae..3d89fd0 100644 --- a/Lib/Source/Pretty.cpp +++ b/Lib/Source/Pretty.cpp @@ -79,6 +79,10 @@ void pretty(FILE *f, Expr* e) fprintf(f, "QPU_NUM"); else if (e->var.tag == ELEM_NUM) fprintf(f, "ELEM_NUM"); + else if (e->var.tag == VPM_READ) + fprintf(f, "VPM_READ"); + else if (e->var.tag == VPM_WRITE) + fprintf(f, "VPM_WRITE"); else if (e->var.tag == TMU0_ADDR) fprintf(f, "TMU0_ADDR"); break; @@ -264,7 +268,7 @@ void pretty(FILE *f, int indent, Stmt* s) // Set read stride case SET_READ_STRIDE: indentBy(f, indent); - fprintf(f, "setReadStride("); + fprintf(f, "dmaSetReadPitch("); pretty(f, s->stride); fprintf(f, ")\n"); break; @@ -272,7 +276,7 @@ void pretty(FILE *f, int indent, Stmt* s) // Set write stride case SET_WRITE_STRIDE: indentBy(f, indent); - fprintf(f, "setWriteStride("); + fprintf(f, "dmaSetWriteStride("); pretty(f, s->stride); fprintf(f, ")\n"); break; @@ -295,12 +299,6 @@ void pretty(FILE *f, int indent, Stmt* s) fprintf(f, ")\n"); break; - // Flush outstanding stores - case FLUSH: - indentBy(f, indent); - fprintf(f, "flush()\n"); - break; - // Increment semaphore case 
SEMA_INC: indentBy(f, indent); @@ -319,6 +317,78 @@ void pretty(FILE *f, int indent, Stmt* s) fprintf(f, "hostIRQ()\n"); break; + // Setup VPM Read + case SETUP_VPM_READ: + indentBy(f, indent); + fprintf(f, "vpmSetupRead("); + fprintf(f, "numVecs=%i, ", s->setupVPMRead.numVecs); + fprintf(f, "dir=%s,", s->setupVPMRead.hor ? "HORIZ" : "VERT"); + fprintf(f, "stride=%i,", s->setupVPMRead.stride); + pretty(f, s->setupVPMRead.addr); + fprintf(f, ");\n"); + break; + + // Setup VPM Write + case SETUP_VPM_WRITE: + indentBy(f, indent); + fprintf(f, "vpmSetupWrite("); + fprintf(f, "dir=%s,", s->setupVPMWrite.hor ? "HORIZ" : "VERT"); + fprintf(f, "stride=%i,", s->setupVPMWrite.stride); + pretty(f, s->setupVPMWrite.addr); + fprintf(f, ");\n"); + break; + + // DMA read wait + case DMA_READ_WAIT: + indentBy(f, indent); + fprintf(f, "dmaReadWait();\n"); + break; + + // DMA write wait + case DMA_WRITE_WAIT: + indentBy(f, indent); + fprintf(f, "dmaWriteWait();\n"); + break; + + // DMA start read + case DMA_START_READ: + indentBy(f, indent); + fprintf(f, "dmaStartRead("); + pretty(f, s->startDMARead); + fprintf(f, ");\n"); + break; + + // DMA start write + case DMA_START_WRITE: + indentBy(f, indent); + fprintf(f, "dmaStartWrite("); + pretty(f, s->startDMAWrite); + fprintf(f, ");\n"); + break; + + // DMA read setup + case SETUP_DMA_READ: + indentBy(f, indent); + fprintf(f, "dmaSetupRead("); + fprintf(f, "numRows=%i,", s->setupDMARead.numRows); + fprintf(f, "rowLen=%i,", s->setupDMARead.rowLen); + fprintf(f, "dir=%s,", s->setupDMARead.hor ? "HORIZ" : "VERT"); + fprintf(f, "vpitch=%i,", s->setupDMARead.vpitch); + pretty(f, s->setupDMARead.vpmAddr); + fprintf(f, ");\n"); + break; + + // DMA write setup + case SETUP_DMA_WRITE: + indentBy(f, indent); + fprintf(f, "dmaSetupWrite("); + fprintf(f, "numRows=%i,", s->setupDMAWrite.numRows); + fprintf(f, "rowLen=%i,", s->setupDMAWrite.rowLen); + fprintf(f, "dir=%s,", s->setupDMAWrite.hor ? 
"HORIZ" : "VERT"); + pretty(f, s->setupDMAWrite.vpmAddr); + fprintf(f, ");\n"); + break; + // Not reachable default: assert(false); diff --git a/Lib/Source/Stmt.cpp b/Lib/Source/Stmt.cpp index 50099f3..b793435 100644 --- a/Lib/Source/Stmt.cpp +++ b/Lib/Source/Stmt.cpp @@ -186,10 +186,45 @@ void Print(IntExpr x) } //============================================================================= -// Set stride +// VPM Setup //============================================================================= -void setReadStride(IntExpr stride) +static void vpmSetupReadCore(int n, IntExpr addr, bool hor, int stride) +{ + Stmt* s = mkStmt(); + s->tag = SETUP_VPM_READ; + s->setupVPMRead.numVecs = n; + s->setupVPMRead.stride = stride; + s->setupVPMRead.hor = hor; + s->setupVPMRead.addr = addr.expr; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +static void vpmSetupWriteCore(IntExpr addr, bool hor, int stride) +{ + Stmt* s = mkStmt(); + s->tag = SETUP_VPM_WRITE; + s->setupVPMWrite.stride = stride; + s->setupVPMWrite.hor = hor; + s->setupVPMWrite.addr = addr.expr; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +void vpmSetupRead(Dir d, int n, IntExpr addr, int stride) +{ + vpmSetupReadCore(n, addr, d == HORIZ ? 1 : 0, stride); +} + +void vpmSetupWrite(Dir d, IntExpr addr, int stride) +{ + vpmSetupWriteCore(addr, d == HORIZ ? 
1 : 0, stride); +} + +// ============================================================================ +// DMA +// ============================================================================ + +void dmaSetReadPitch(IntExpr stride) { Stmt* s = mkStmt(); s->tag = SET_READ_STRIDE; @@ -197,7 +232,7 @@ void setReadStride(IntExpr stride) stmtStack.replace(mkSeq(stmtStack.top(), s)); } -void setWriteStride(IntExpr stride) +void dmaSetWriteStride(IntExpr stride) { Stmt* s = mkStmt(); s->tag = SET_WRITE_STRIDE; @@ -205,14 +240,53 @@ void setWriteStride(IntExpr stride) stmtStack.replace(mkSeq(stmtStack.top(), s)); } +void dmaSetupRead(Dir dir, int numRows, IntExpr vpmAddr, + int rowLen, int vpitch) +{ + Stmt* s = mkStmt(); + s->tag = SETUP_DMA_READ; + s->setupDMARead.hor = dir == HORIZ ? 1 : 0; + s->setupDMARead.numRows = numRows; + s->setupDMARead.rowLen = rowLen; + s->setupDMARead.vpitch = vpitch; + s->setupDMARead.vpmAddr = vpmAddr.expr; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +void dmaSetupWrite(Dir dir, int numRows, IntExpr vpmAddr, int rowLen) +{ + Stmt* s = mkStmt(); + s->tag = SETUP_DMA_WRITE; + s->setupDMAWrite.hor = dir == HORIZ ? 
1 : 0; + s->setupDMAWrite.numRows = numRows; + s->setupDMAWrite.rowLen = rowLen; + s->setupDMAWrite.vpmAddr = vpmAddr.expr; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +void dmaWaitRead() +{ + Stmt* s = mkStmt(); + s->tag = DMA_READ_WAIT; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +void dmaWaitWrite() +{ + Stmt* s = mkStmt(); + s->tag = DMA_WRITE_WAIT; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + // ============================================================================ // QPU code for clean exit // ============================================================================ void kernelFinish() { - // Ensure outstanding stores have completed - flush(); + // Ensure outstanding DMAs have completed + dmaWaitRead(); + dmaWaitWrite(); // QPU 0 waits until all other QPUs have finished // before sending a host IRQ. diff --git a/Lib/Source/Stmt.h b/Lib/Source/Stmt.h index b11c519..19c1b39 100644 --- a/Lib/Source/Stmt.h +++ b/Lib/Source/Stmt.h @@ -41,8 +41,15 @@ void For_(BoolExpr b); void ForBody_(); void Print(const char *); void Print(IntExpr x); -void setReadStride(IntExpr n); -void setWriteStride(IntExpr n); +void dmaSetReadPitch(IntExpr n); +void dmaSetWriteStride(IntExpr n); +void dmaSetupRead(Dir dir, int numRows, IntExpr vpmAddr, + int rowLen = 16, int vpitch = 1); +void dmaSetupWrite(Dir dir, int numRows, IntExpr vpmAddr, int rowLen = 16); +void dmaWaitRead(); +void dmaWaitWrite(); +void vpmSetupRead(Dir dir, int n, IntExpr addr, int stride = 1); +void vpmSetupWrite(Dir dir, IntExpr addr, int stride = 1); void kernelFinish(); } // namespace QPULib diff --git a/Lib/Source/StmtExtra.h b/Lib/Source/StmtExtra.h index f23f405..076d62c 100644 --- a/Lib/Source/StmtExtra.h +++ b/Lib/Source/StmtExtra.h @@ -34,6 +34,57 @@ inline void semaDec(int semaId) stmtStack.replace(mkSeq(stmtStack.top(), s)); } +//============================================================================= +// VPM operations 
+//============================================================================= + +inline void vpmPutExpr(Expr* e) +{ + Var v; v.tag = VPM_WRITE; + Stmt* s = mkAssign(mkVar(v), e); + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +inline void vpmPut(IntExpr data) + { vpmPutExpr(data.expr); } + +inline void vpmPut(FloatExpr data) + { vpmPutExpr(data.expr); } + +template inline void vpmPut(PtrExpr data) + { vpmPutExpr(data.expr); } + +template inline void vpmPut(Ptr &data) + { vpmPutExpr(data.expr); } + +inline void dmaStartReadExpr(Expr* e) +{ + Stmt* s = mkStmt(); + s->tag = DMA_START_READ; + s->startDMARead = e; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +template inline void dmaStartRead(PtrExpr memAddr) + { dmaStartReadExpr(memAddr.expr); } + +template inline void dmaStartRead(Ptr &memAddr) + { dmaStartReadExpr(memAddr.expr); } + +inline void dmaStartWriteExpr(Expr* e) +{ + Stmt* s = mkStmt(); + s->tag = DMA_START_WRITE; + s->startDMAWrite = e; + stmtStack.replace(mkSeq(stmtStack.top(), s)); +} + +template inline void dmaStartWrite(PtrExpr memAddr) + { dmaStartWriteExpr(memAddr.expr); } + +template inline void dmaStartWrite(Ptr &memAddr) + { dmaStartWriteExpr(memAddr.expr); } + //============================================================================= // Receive, request, store operations //============================================================================= @@ -89,13 +140,6 @@ inline void store(IntExpr data, Ptr &addr) inline void store(FloatExpr data, Ptr &addr) { storeExpr(data.expr, addr.expr); } -inline void flush() -{ - Stmt* s = mkStmt(); - s->tag = FLUSH; - stmtStack.replace(mkSeq(stmtStack.top(), s)); -} - } // namespace QPULib #endif // _QPULIB_SOURCE_STMTEXTRA_H_ diff --git a/Lib/Source/Syntax.h b/Lib/Source/Syntax.h index 8e16212..f65e324 100644 --- a/Lib/Source/Syntax.h +++ b/Lib/Source/Syntax.h @@ -40,6 +40,9 @@ bool isUnary(Op op); // Is operator commutative? 
bool isCommutative(Op op); +// Direction for VPM/DMA loads and stores +enum Dir { HORIZ, VERT }; + // ============================================================================ // Variables // ============================================================================ @@ -55,6 +58,8 @@ enum VarTag { // QPU's unique id (replicated 16 times). , ELEM_NUM // (Read-only.) Reading this variable will yield a vector // containing the integers from 0 to 15. + , VPM_READ // (Read-only.) Read a vector from the VPM. + , VPM_WRITE // (Write-only.) Write a vector to the VPM. , TMU0_ADDR // (Write-only.) Initiate load via TMU }; @@ -69,10 +74,8 @@ struct Var { // Reserved general-purpose vars enum ReservedVarId { - RSV_QPU_ID = 0, - RSV_NUM_QPUS = 1, - RSV_READ_STRIDE = 2, - RSV_WRITE_STRIDE = 3 + RSV_QPU_ID = 0, + RSV_NUM_QPUS = 1 }; // ============================================================================ @@ -206,8 +209,12 @@ enum StmtTag { SKIP, ASSIGN, SEQ, WHERE, IF, WHILE, PRINT, FOR, SET_READ_STRIDE, SET_WRITE_STRIDE, - LOAD_RECEIVE, STORE_REQUEST, FLUSH, - SEND_IRQ_TO_HOST, SEMA_INC, SEMA_DEC }; + LOAD_RECEIVE, STORE_REQUEST, + SEND_IRQ_TO_HOST, SEMA_INC, SEMA_DEC, + SETUP_VPM_READ, SETUP_VPM_WRITE, + SETUP_DMA_READ, SETUP_DMA_WRITE, + DMA_READ_WAIT, DMA_WRITE_WAIT, + DMA_START_READ, DMA_START_WRITE }; struct Stmt { // What kind of statement is it? 
@@ -246,6 +253,25 @@ struct Stmt { // Semaphore id for increment / decrement int semaId; + + // VPM read setup + struct { int numVecs; Expr* addr; int hor; int stride; } setupVPMRead; + + // VPM write setup + struct { Expr* addr; int hor; int stride; } setupVPMWrite; + + // DMA read setup + struct { Expr* vpmAddr; int numRows; int rowLen; + int hor; int vpitch; } setupDMARead; + + // DMA write setup + struct { Expr* vpmAddr; int numRows; int rowLen; int hor; } setupDMAWrite; + + // DMA start read + Expr* startDMARead; + + // DMA start write + Expr* startDMAWrite; }; }; diff --git a/Lib/Source/Translate.cpp b/Lib/Source/Translate.cpp index 9004137..7bc6eec 100644 --- a/Lib/Source/Translate.cpp +++ b/Lib/Source/Translate.cpp @@ -64,10 +64,18 @@ Reg srcReg(Var v) r.tag = SPECIAL; r.regId = SPECIAL_ELEM_NUM; return r; + case VPM_READ: + r.tag = SPECIAL; + r.regId = SPECIAL_VPM_READ; + return r; case STANDARD: r.tag = REG_A; r.regId = v.id; return r; + case VPM_WRITE: + case TMU0_ADDR: + printf("QPULib: Reading from write-only special register is forbidden\n"); + assert(false); } // Not reachable @@ -82,12 +90,17 @@ Reg dstReg(Var v) case UNIFORM: case QPU_NUM: case ELEM_NUM: + case VPM_READ: printf("QPULib: writing to read-only special register is forbidden\n"); assert(false); case STANDARD: r.tag = REG_A; r.regId = v.id; return r; + case VPM_WRITE: + r.tag = SPECIAL; + r.regId = SPECIAL_VPM_WRITE; + return r; case TMU0_ADDR: r.tag = SPECIAL; r.regId = SPECIAL_TMU0_S; @@ -261,19 +274,27 @@ void varAssign( Seq* seq // Target instruction sequence to extend printf("QPULib: dereferencing not yet supported inside 'where'\n"); assert(false); } - Instr instr; - instr.tag = LD1; - instr.LD1.addr = srcReg(e.deref.ptr->var); - instr.LD1.buffer = A; - seq->append(instr); - instr.tag = LD2; - seq->append(instr); - instr.tag = LD3; - instr.LD3.buffer = A; - seq->append(instr); - instr.tag = LD4; - instr.LD4.dest = dstReg(v); - seq->append(instr); + // Load address + Reg loadAddr; + 
loadAddr.tag = SPECIAL; + loadAddr.regId = SPECIAL_QPU_NUM; + // Setup DMA + genSetReadPitch(seq, 4); + genSetupDMALoad(seq, 16, 1, 1, 1, loadAddr); + // Start DMA load + genStartDMALoad(seq, srcReg(e.deref.ptr->var)); + // Wait for DMA + genWaitDMALoad(seq); + // Setup VPM + Reg addr; + addr.tag = SPECIAL; + addr.regId = SPECIAL_QPU_NUM; + genSetupVPMLoad(seq, 1, addr, 0, 1); + // Get from VPM + Reg data; + data.tag = SPECIAL; + data.regId = SPECIAL_VPM_READ; + seq->append(genLShift(dstReg(v), data, 0)); return; } @@ -349,17 +370,31 @@ void assign( Seq* seq // Target instruction sequence to extend // Case: *v := rhs where v is a var and rhs is a var // ------------------------------------------------- if (lhs.tag == DEREF) { - Instr instr; - instr.tag = ST1; - instr.ST1.data = srcReg(rhs->var); - instr.ST1.buffer = A; - seq->append(instr); - instr.tag = ST2; - instr.ST2.addr = srcReg(lhs.deref.ptr->var); - instr.ST2.buffer = A; - seq->append(instr); - instr.tag = ST3; - seq->append(instr); + // QPU id + Reg qpuId; + qpuId.tag = SPECIAL; + qpuId.regId = SPECIAL_QPU_NUM; + // Setup VPM + Reg addr = freshReg(); + seq->append(genLI(addr, 16)); + seq->append(genADD(addr, addr, qpuId)); + genSetupVPMStore(seq, addr, 0, 1); + // Store address + Reg storeAddr = freshReg(); + seq->append(genLI(storeAddr, 256)); + seq->append(genADD(storeAddr, storeAddr, qpuId)); + // Setup DMA + genSetWriteStride(seq, 0); + genSetupDMAStore(seq, 16, 1, 1, storeAddr); + // Put to VPM + Reg data; + data.tag = SPECIAL; + data.regId = SPECIAL_VPM_WRITE; + seq->append(genLShift(data, srcReg(rhs->var), 0)); + // Start DMA + genStartDMAStore(seq, srcReg(lhs.deref.ptr->var)); + // Wait for store to complete + genWaitDMAStore(seq); return; } @@ -881,13 +916,13 @@ void setStrideStmt(Seq* seq, StmtTag tag, Expr* e) { if (e->tag == INT_LIT) { if (tag == SET_READ_STRIDE) - genSetReadStride(seq, e->intLit); + genSetReadPitch(seq, e->intLit); else genSetWriteStride(seq, e->intLit); } else if (e->tag == 
VAR) { if (tag == SET_READ_STRIDE) - genSetReadStride(seq, srcReg(e->var)); + genSetReadPitch(seq, srcReg(e->var)); else genSetWriteStride(seq, srcReg(e->var)); } @@ -897,12 +932,108 @@ void setStrideStmt(Seq* seq, StmtTag tag, Expr* e) Var v = freshVar(); varAssign(seq, always, v, e); if (tag == SET_READ_STRIDE) - genSetReadStride(seq, srcReg(v)); + genSetReadPitch(seq, srcReg(v)); else genSetWriteStride(seq, srcReg(v)); } } +// ============================================================================ +// VPM setup statements +// ============================================================================ + +void setupVPMReadStmt(Seq* seq, int n, Expr* e, int hor, int stride) +{ + if (e->tag == INT_LIT) + genSetupVPMLoad(seq, n, e->intLit, hor, stride); + else if (e->tag == VAR) + genSetupVPMLoad(seq, n, srcReg(e->var), hor, stride); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genSetupVPMLoad(seq, n, srcReg(v), hor, stride); + } +} + +void setupVPMWriteStmt(Seq* seq, Expr* e, int hor, int stride) +{ + if (e->tag == INT_LIT) + genSetupVPMStore(seq, e->intLit, hor, stride); + else if (e->tag == VAR) + genSetupVPMStore(seq, srcReg(e->var), hor, stride); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genSetupVPMStore(seq, srcReg(v), hor, stride); + } +} + +// ============================================================================ +// DMA statements +// ============================================================================ + +void setupDMAReadStmt(Seq* seq, int numRows, int rowLen, + int hor, Expr* e, int vpitch) +{ + if (e->tag == INT_LIT) + genSetupDMALoad(seq, numRows, rowLen, hor, vpitch, e->intLit); + else if (e->tag == VAR) + genSetupDMALoad(seq, numRows, rowLen, hor, vpitch, srcReg(e->var)); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genSetupDMALoad(seq, 
numRows, rowLen, hor, vpitch, srcReg(v)); + } +} + +void setupDMAWriteStmt(Seq* seq, int numRows, int rowLen, + int hor, Expr* e) +{ + if (e->tag == INT_LIT) + genSetupDMAStore(seq, numRows, rowLen, hor, e->intLit); + else if (e->tag == VAR) + genSetupDMAStore(seq, numRows, rowLen, hor, srcReg(e->var)); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genSetupDMAStore(seq, numRows, rowLen, hor, srcReg(v)); + } +} + +void startDMAReadStmt(Seq* seq, Expr* e) +{ + if (e->tag == VAR) + genStartDMALoad(seq, srcReg(e->var)); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genStartDMALoad(seq, srcReg(v)); /* e is not a VAR here: use the temporary that now holds its value */ + } +} + +void startDMAWriteStmt(Seq* seq, Expr* e) +{ + if (e->tag == VAR) + genStartDMAStore(seq, srcReg(e->var)); + else { + AssignCond always; + always.tag = ALWAYS; + Var v = freshVar(); + varAssign(seq, always, v, e); + genStartDMAStore(seq, srcReg(v)); /* e is not a VAR here: use the temporary that now holds its value */ + } +} + // ============================================================================ // Load receive statements // ============================================================================ @@ -933,17 +1064,31 @@ void storeRequest(Seq* seq, Expr* data, Expr* addr) addr = putInVar(seq, addr); } - Instr instr; - instr.tag = ST3; - seq->append(instr); - instr.tag = ST1; - instr.ST1.data = srcReg(data->var); - instr.ST1.buffer = A; - seq->append(instr); - instr.tag = ST2; - instr.ST2.addr = srcReg(addr->var); - instr.ST2.buffer = A; - seq->append(instr); + // QPU id + Reg qpuId; + qpuId.tag = SPECIAL; + qpuId.regId = SPECIAL_QPU_NUM; + // Setup VPM + Reg addrReg = freshReg(); + seq->append(genLI(addrReg, 16)); + seq->append(genADD(addrReg, addrReg, qpuId)); + genSetupVPMStore(seq, addrReg, 0, 1); + // Store address + Reg storeAddr = freshReg(); + seq->append(genLI(storeAddr, 256)); + seq->append(genADD(storeAddr, storeAddr, qpuId)); + // Wait for any outstanding store to complete + 
genWaitDMAStore(seq); + // Setup DMA + genSetWriteStride(seq, 0); + genSetupDMAStore(seq, 16, 1, 1, storeAddr); + // Put to VPM + Reg dataReg; + dataReg.tag = SPECIAL; + dataReg.regId = SPECIAL_VPM_WRITE; + seq->append(genLShift(dataReg, srcReg(data->var), 0)); + // Start DMA + genStartDMAStore(seq, srcReg(addr->var)); } // ============================================================================ @@ -1127,16 +1272,6 @@ void stmt(Seq* seq, Stmt* s) return; } - // ------------- - // Case: flush() - // ------------- - if (s->tag == FLUSH) { - // Flush outstanding stores - Instr instr; instr.tag = ST3; - seq->append(instr); - return; - } - // --------------------------------------------------------------- // Case: semaInc(n) or semaDec(n) where n is an int (semaphore id) // --------------------------------------------------------------- @@ -1153,6 +1288,86 @@ void stmt(Seq* seq, Stmt* s) return; } + // ---------------------------------------- + // Case: vpmSetupRead(dir, n, addr, stride) + // ---------------------------------------- + if (s->tag == SETUP_VPM_READ) { + setupVPMReadStmt(seq, + s->setupVPMRead.numVecs, + s->setupVPMRead.addr, + s->setupVPMRead.hor, + s->setupVPMRead.stride); + return; + } + + // -------------------------------------- + // Case: vpmSetupWrite(dir, addr, stride) + // -------------------------------------- + if (s->tag == SETUP_VPM_WRITE) { + setupVPMWriteStmt(seq, + s->setupVPMWrite.addr, + s->setupVPMWrite.hor, + s->setupVPMWrite.stride); + return; + } + + // ------------------------------------------------------ + // Case: dmaSetupRead(dir, numRows, addr, rowLen, vpitch) + // ------------------------------------------------------ + if (s->tag == SETUP_DMA_READ) { + setupDMAReadStmt(seq, + s->setupDMARead.numRows, + s->setupDMARead.rowLen, + s->setupDMARead.hor, + s->setupDMARead.vpmAddr, + s->setupDMARead.vpitch); + return; + } + + // ----------------------------------------------- + // Case: dmaSetupWrite(dir, numRows, addr, rowLen) + 
// ----------------------------------------------- + if (s->tag == SETUP_DMA_WRITE) { + setupDMAWriteStmt(seq, + s->setupDMAWrite.numRows, + s->setupDMAWrite.rowLen, + s->setupDMAWrite.hor, + s->setupDMAWrite.vpmAddr); + return; + } + + // ------------------- + // Case: dmaReadWait() + // ------------------- + if (s->tag == DMA_READ_WAIT) { + genWaitDMALoad(seq); + return; + } + + // -------------------- + // Case: dmaWriteWait() + // -------------------- + if (s->tag == DMA_WRITE_WAIT) { + genWaitDMAStore(seq); + return; + } + + // ------------------------ + // Case: dmaStartRead(addr) + // ------------------------ + if (s->tag == DMA_START_READ) { + startDMAReadStmt(seq, s->startDMARead); + return; + } + + // ------------------------- + // Case: dmaStartWrite(addr) + // ------------------------- + if (s->tag == DMA_START_WRITE) { + startDMAWriteStmt(seq, s->startDMAWrite); + return; + } + // Not reachable assert(false); } diff --git a/Lib/Target/Emulator.cpp b/Lib/Target/Emulator.cpp index c9568d2..ae3f41e 100644 --- a/Lib/Target/Emulator.cpp +++ b/Lib/Target/Emulator.cpp @@ -21,7 +21,7 @@ int32_t* emuHeap = NULL; // Read a vector register // ============================================================================ -Vec readReg(QPUState* s, Seq* uniforms, Reg reg) +Vec readReg(QPUState* s, State* g, Reg reg) { Vec v; int r = reg.regId; @@ -42,14 +42,14 @@ Vec readReg(QPUState* s, Seq* uniforms, Reg reg) return v; } else if (reg.regId == SPECIAL_UNIFORM) { - assert(s->nextUniform < uniforms->numElems); + assert(s->nextUniform < g->uniforms->numElems); for (int i = 0; i < NUM_LANES; i++) if (s->nextUniform == -2) v.elems[i].intVal = s->id; else if (s->nextUniform == -1) v.elems[i].intVal = s->numQPUs; else - v.elems[i].intVal = uniforms->elems[s->nextUniform]; + v.elems[i].intVal = g->uniforms->elems[s->nextUniform]; s->nextUniform++; return v; } @@ -58,6 +58,103 @@ Vec readReg(QPUState* s, Seq* uniforms, Reg reg) v.elems[i].intVal = s->id; return v; } + else if 
(reg.regId == SPECIAL_VPM_READ) { + // Make sure there's a VPM load request waiting + assert(! s->vpmLoadQueue.isEmpty()); + VPMLoadReq* req = s->vpmLoadQueue.first(); + assert(req->numVecs > 0); + if (req->hor) { + // Horizontal load + for (int i = 0; i < NUM_LANES; i++) { + int index = (16*req->addr+i); + assert(index < VPM_SIZE); + v.elems[i] = g->vpm[index]; + } + } + else { + // Vertical load + for (int i = 0; i < NUM_LANES; i++) { + uint32_t x = req->addr & 0xf; + uint32_t y = req->addr >> 4; + int index = (y*16*16 + x + i*16); + assert(index < VPM_SIZE); + v.elems[i] = g->vpm[index]; + } + } + req->numVecs--; + req->addr = req->addr + req->stride; + if (req->numVecs == 0) s->vpmLoadQueue.deq(); + return v; + } + else if (reg.regId == SPECIAL_DMA_LD_WAIT) { + // Perform DMA load to completion + if (s->dmaLoad.active == false) return v; + DMALoadReq* req = &s->dmaLoadSetup; + if (req->hor) { + // Horizontal access + uint32_t y = (req->vpmAddr >> 4) & 0x3f; + for (int r = 0; r < req->numRows; r++) { + uint32_t x = req->vpmAddr & 0xf; + for (int i = 0; i < req->rowLen; i++) { + int addr = s->dmaLoad.addr.intVal + (r * s->readPitch) + i*4; + g->vpm[y*16 + x].intVal = emuHeap[addr >> 2]; + x = (x+1) % 16; + } + y = (y+1) % 64; + } + } + else { + // Vertical access + uint32_t x = req->vpmAddr & 0xf; + for (int r = 0; r < req->numRows; r++) { + uint32_t y = ((req->vpmAddr >> 4) + r*req->vpitch) & 0x3f; + for (int i = 0; i < req->rowLen; i++) { + int addr = s->dmaLoad.addr.intVal + (r * s->readPitch) + i*4; + g->vpm[y*16 + x].intVal = emuHeap[addr >> 2]; + y = (y+1) % 64; + } + x = (x+1) % 16; + } + } + s->dmaLoad.active = false; + return v; // Return value unspecified + } + else if (reg.regId == SPECIAL_DMA_ST_WAIT) { + // Perform DMA store to completion + if (s->dmaStore.active == false) return v; + DMAStoreReq* req = &s->dmaStoreSetup; + uint32_t memAddr = s->dmaStore.addr.intVal; + if (req->hor) { + // Horizontal access + uint32_t y = (req->vpmAddr >> 4) & 0x3f; 
+ for (int r = 0; r < req->numRows; r++) { + uint32_t x = req->vpmAddr & 0xf; + for (int i = 0; i < req->rowLen; i++) { + emuHeap[memAddr >> 2] = g->vpm[y*16 + x].intVal; + x = (x+1) % 16; + memAddr = memAddr + 4; + } + y = (y+1) % 64; + memAddr += s->writeStride; + } + } + else { + // Vertical access + uint32_t x = req->vpmAddr & 0xf; + for (int r = 0; r < req->numRows; r++) { + uint32_t y = (req->vpmAddr >> 4) & 0x3f; + for (int i = 0; i < req->rowLen; i++) { + emuHeap[memAddr >> 2] = g->vpm[y*16 + x].intVal; + y = (y+1) % 64; + memAddr = memAddr + 4; + } + x = (x+1) % 16; + memAddr += s->writeStride; + } + } + s->dmaStore.active = false; + return v; // Return value unspecified + } printf("QPULib: can't read special register\n"); abort(); case NONE: @@ -131,7 +228,8 @@ inline bool checkBranchCond(QPUState* s, BranchCond cond) // Write a vector to a register // ============================================================================ -void writeReg(QPUState* s, bool setFlags, AssignCond cond, Reg dest, Vec v) +void writeReg(QPUState* s, State* g, bool setFlags, + AssignCond cond, Reg dest, Vec v) { switch (dest.tag) { case REG_A: @@ -169,22 +267,37 @@ void writeReg(QPUState* s, bool setFlags, AssignCond cond, Reg dest, Vec v) case SPECIAL_RD_SETUP: { int setup = v.elems[0].intVal; if ((setup & 0xf0000000) == 0x90000000) { - // Set read stride - int stride = ((setup & 0x1fff) >> 2) - 1; - s->readStride = stride; + // Set read pitch + int pitch = (setup & 0x1fff); + s->readPitch = pitch; return; } else if ((setup & 0xc0000000) == 0) { - // Initiate VPM load - VPMLoadQueue* q = &s->vpmLoadQueue; - assert((q->back+1)%3 != q->front); // Assert not full - BufferAorB buffer = A; - q->addrs[q->back] = NUM_LANES*(4*s->id + (buffer == A ? 0 : 1)); - q->back = (q->back+1)%3; + // QPU only allows two VPM loads queued at a time + assert(! 
s->vpmLoadQueue.isFull()); + // Create VPM load request + VPMLoadReq req; + req.numVecs = (setup >> 20) & 0xf; + if (req.numVecs == 0) req.numVecs = 16; + req.hor = ((setup >> 11) & 1); + req.addr = setup & 0xff; + req.stride = (setup >> 12) & 0x3f; + if (req.stride == 0) req.stride = 64; + // Add VPM load request to queue + s->vpmLoadQueue.enq(req); return; } else if (setup & 0x80000000) { - // DMA read setup + // DMA load setup + DMALoadReq* req = &s->dmaLoadSetup; + req->rowLen = (setup >> 20) & 0xf; + if (req->rowLen == 0) req->rowLen = 16; + req->numRows = (setup >> 16) & 0xf; + if (req->numRows == 0) req->numRows = 16; + req->vpitch = (setup >> 12) & 0xf; + if (req->vpitch == 0) req->vpitch = 16; + req->hor = (setup & 0x800) ? false : true; + req->vpmAddr = (setup & 0x7ff); return; } break; @@ -193,41 +306,74 @@ void writeReg(QPUState* s, bool setFlags, AssignCond cond, Reg dest, Vec v) int setup = v.elems[0].intVal; if ((setup & 0xc0000000) == 0xc0000000) { // Set write stride - int stride = (setup & 0x1fff) >> 2; + int stride = setup & 0x1fff; s->writeStride = stride; return; } else if ((setup & 0xc0000000) == 0x80000000) { // DMA write setup + DMAStoreReq* req = &s->dmaStoreSetup; + req->rowLen = (setup >> 16) & 0x7f; + if (req->rowLen == 0) req->rowLen = 128; + req->numRows = (setup >> 23) & 0x7f; + if (req->numRows == 0) req->numRows = 128; + req->hor = (setup & 0x4000); + req->vpmAddr = (setup >> 3) & 0x7ff; return; } else if ((setup & 0xc0000000) == 0) { - // Setup VPM store + VPMStoreReq req; + req.hor = (setup >> 11) & 1; + req.addr = setup & 0xff; + req.stride = (setup >> 12) & 0x3f; + if (req.stride == 0) req.stride = 64; + s->vpmStoreSetup = req; return; } break; } + case SPECIAL_VPM_WRITE: { + VPMStoreReq* req = &s->vpmStoreSetup; + if (req->hor) { + // Horizontal store + for (int i = 0; i < NUM_LANES; i++) { + int index = (16*req->addr+i); + assert(index < VPM_SIZE); + g->vpm[index] = v.elems[i]; + } + } + else { + // Vertical store + uint32_t x 
= req->addr & 0xf; + uint32_t y = req->addr >> 4; + for (int i = 0; i < NUM_LANES; i++) { + int index = (y*16*16 + x + i*16); + assert(index < VPM_SIZE); + g->vpm[index] = v.elems[i]; + } + } + req->addr = req->addr + req->stride; + return; + } case SPECIAL_DMA_LD_ADDR: { // Initiate DMA load assert(!s->dmaLoad.active); s->dmaLoad.active = true; s->dmaLoad.addr = v.elems[0]; - s->dmaLoad.buffer = A; return; } case SPECIAL_DMA_ST_ADDR: { // Initiate DMA store assert(!s->dmaStore.active); - s->dmaStore.addr = v.elems[0]; - s->dmaStore.buffer = A; s->dmaStore.active = true; + s->dmaStore.addr = v.elems[0]; return; } case SPECIAL_HOST_INT: { return; } case SPECIAL_TMU0_S: { - assert(s->loadBuffer->numElems < 8); + assert(s->loadBuffer->numElems < 4); Vec val; for (int i = 0; i < NUM_LANES; i++) { uint32_t a = (uint32_t) v.elems[i].intVal; @@ -312,10 +458,10 @@ Vec evalSmallImm(QPUState* s, SmallImm imm) assert(false); } -Vec readRegOrImm(QPUState* s, Seq* uniforms, RegOrImm src) +Vec readRegOrImm(QPUState* s, State* g, RegOrImm src) { switch (src.tag) { - case REG: return readReg(s, uniforms, src.reg); + case REG: return readReg(s, g, src.reg); case IMM: return evalSmallImm(s, src.smallImm); } @@ -351,16 +497,16 @@ inline int32_t clz(int32_t x) // ALU // ============================================================================ -Vec alu(QPUState* s, Seq* uniforms, +Vec alu(QPUState* s, State* g, RegOrImm srcA, ALUOp op, RegOrImm srcB) { // First, obtain vector operands Vec x, y, z; - x = readRegOrImm(s, uniforms, srcA); + x = readRegOrImm(s, g, srcA); if (srcA.tag == REG && srcB.tag == REG && srcA.reg == srcB.reg) y = x; else - y = readRegOrImm(s, uniforms, srcB); + y = readRegOrImm(s, g, srcB); Word* a = x.elems; Word* b = y.elems; Word* c = z.elems; @@ -509,15 +655,6 @@ Vec alu(QPUState* s, Seq* uniforms, return z; } -// ============================================================================ -// In-flight memory requests -// 
============================================================================ - -struct InFlightMemReq { - Word addr; - BufferAorB buffer; -}; - // ============================================================================ // Printing routines // ============================================================================ @@ -576,10 +713,12 @@ void emulate { State state; state.output = output; + state.uniforms = uniforms; // Initialise state for (int i = 0; i < numQPUs; i++) { QPUState q; + memset(&q, 0, sizeof(QPUState)); q.id = i; q.numQPUs = numQPUs; q.pc = 0; @@ -591,9 +730,7 @@ void emulate q.nextUniform = -2; q.dmaLoad.active = false; q.dmaStore.active = false; - q.vpmLoadQueue.back = 0; - q.vpmLoadQueue.front = 0; - q.readStride = 0; + q.readPitch = 0; q.writeStride = 0; q.loadBuffer = new SmallSeq; state.qpu[i] = q; @@ -616,15 +753,16 @@ void emulate // Load immediate case LI: { Vec imm = evalImm(instr.LI.imm); - writeReg(s, instr.LI.setFlags, instr.LI.cond, instr.LI.dest, imm); + writeReg(s, &state, instr.LI.setFlags, + instr.LI.cond, instr.LI.dest, imm); break; } // ALU operation case ALU: { - Vec result = alu(s, uniforms, instr.ALU.srcA, + Vec result = alu(s, &state, instr.ALU.srcA, instr.ALU.op, instr.ALU.srcB); if (instr.ALU.op != NOP) - writeReg(s, instr.ALU.setFlags, instr.ALU.cond, + writeReg(s, &state, instr.ALU.setFlags, instr.ALU.cond, instr.ALU.dest, result); break; } @@ -656,83 +794,6 @@ void emulate // No-op case NO_OP: break; - // LD1: DMA vector in DRAM into VPM (local) memory - case LD1: { - assert(!s->dmaLoad.active); - Vec addr = readReg(s, uniforms, instr.LD1.addr); - s->dmaLoad.active = true; - s->dmaLoad.addr = addr.elems[0]; - s->dmaLoad.buffer = instr.LD1.buffer; - break; - } - // LD2: wait for DMA completion - case LD2: { - assert(s->dmaLoad.active); - uint32_t hp = (uint32_t) s->dmaLoad.addr.intVal; - int vpmAddr = NUM_LANES * - (4*s->id + (s->dmaLoad.buffer == A ? 
0 : 1)); - for (int i = 0; i < NUM_LANES; i++) { - state.vpm[vpmAddr+i].intVal = emuHeap[hp>>2]; - hp += 4*(s->readStride+1); - } - s->dmaLoad.active = false; - break; - } - // LD3: setup a read from VPM memory - case LD3: { - VPMLoadQueue* q = &s->vpmLoadQueue; - assert((q->back+1)%3 != q->front); // Assert not full - q->addrs[q->back] = NUM_LANES * - (4*s->id + (instr.LD3.buffer == A ? 0 : 1)); - q->back = (q->back+1)%3; - break; - } - // LD4: transfer from VPM into given register - case LD4: { - VPMLoadQueue* q = &s->vpmLoadQueue; - assert(q->back != q->front); // Assert not empty - int vpmAddr = q->addrs[q->front]; - q->front = (q->front+1)%3; - Vec v; - for (int i = 0; i < NUM_LANES; i++) - v.elems[i] = state.vpm[vpmAddr+i]; - AssignCond always; - always.tag = ALWAYS; - writeReg(s, false, always, instr.LD4.dest, v); - break; - } - // ST1: write the vector to VPM (local) memory - case ST1: { - Vec v = readReg(s, uniforms, instr.ST1.data); - int vpmAddr = NUM_LANES * - (4*s->id + (instr.ST1.buffer == A ? 2 : 3)); - for (int i = 0; i < NUM_LANES; i++) - state.vpm[vpmAddr+i] = v.elems[i]; - break; - } - // ST2: DMA from the VPM out to DRAM - case ST2: { - assert(!s->dmaStore.active); - Vec addr = readReg(s, uniforms, instr.ST2.addr); - s->dmaStore.addr = addr.elems[0]; - s->dmaStore.buffer = instr.ST2.buffer; - s->dmaStore.active = true; - break; - } - // ST3: wait for DMA to complete - case ST3: { - if (s->dmaStore.active) { - uint32_t hp = (uint32_t) s->dmaStore.addr.intVal; - int vpmAddr = NUM_LANES * - (4*s->id + (s->dmaStore.buffer == A ? 
2 : 3)); - for (int i = 0; i < NUM_LANES; i++) { - emuHeap[hp>>2] = state.vpm[vpmAddr+i].intVal; - hp += 4*(s->writeStride+1); - } - s->dmaStore.active = false; - } - break; - } // PRS: print string case PRS: { emitStr(state.output, instr.PRS); @@ -740,13 +801,13 @@ void emulate } // PRI: print integer case PRI: { - Vec x = readReg(s, uniforms, instr.PRI); + Vec x = readReg(s, &state, instr.PRI); printIntVec(state.output, x); break; } // PRF: print integer case PRF: { - Vec x = readReg(s, uniforms, instr.PRF); + Vec x = readReg(s, &state, instr.PRF); printFloatVec(state.output, x); break; } @@ -756,7 +817,7 @@ void emulate Vec val = s->loadBuffer->remove(0); AssignCond always; always.tag = ALWAYS; - writeReg(s, false, always, instr.RECV.dest, val); + writeReg(s, &state, false, always, instr.RECV.dest, val); break; } // Read from TMU0 into accumulator 4 @@ -768,7 +829,7 @@ void emulate Reg dest; dest.tag = ACC; dest.regId = 4; - writeReg(s, false, always, dest, val); + writeReg(s, &state, false, always, dest, val); break; } // Host IRQ diff --git a/Lib/Target/Emulator.h b/Lib/Target/Emulator.h index 6a51574..9f27c62 100644 --- a/Lib/Target/Emulator.h +++ b/Lib/Target/Emulator.h @@ -3,12 +3,13 @@ #include #include "Common/Seq.h" +#include "Common/Queue.h" #include "Target/Syntax.h" -#define VPM_SIZE 2048 #define NUM_LANES 16 #define MAX_QPUS 12 #define EMULATOR_HEAP_SIZE 3*65536 +#define VPM_SIZE 1024 namespace QPULib { @@ -23,47 +24,76 @@ struct Vec { Word elems[NUM_LANES]; }; -// In-flight DMA requests -struct DMAReq { +// In-flight DMA request +struct DMAAddr { bool active; Word addr; - BufferAorB buffer; }; -// VPM load queue (max 2 elements) -struct VPMLoadQueue { - int addrs[3]; - int front, back; +// VPM load request +struct VPMLoadReq { + int numVecs; // Number of vectors to load + bool hor; // Horizontal or vertical access? 
+ int addr; // Address in VPM to load from + int stride; // Added to address after every vector read +}; + +// VPM store request +struct VPMStoreReq { + bool hor; // Horizontal or vertical access? + int addr; // Address in VPM to store to + int stride; // Added to address after every vector written +}; + +// DMA load request +struct DMALoadReq { + bool hor; // Horizontal or vertical access? + int numRows; // Number of rows in memory + int rowLen; // Length of each row in memory + int vpmAddr; // VPM address to write to + int vpitch; // Added to vpmAddr after each vector loaded +}; + +// DMA store request +struct DMAStoreReq { + bool hor; // Horizontal or vertical access? + int numRows; // Number of rows in memory + int rowLen; // Length of each row in memory + int vpmAddr; // VPM address to load from +}; // State of a single QPU. struct QPUState { - int id; // QPU id - int numQPUs; // QPU count - bool running; // Is QPU active, or has it halted? - int pc; // Program counter - Vec* regFileA; // Register file A - int sizeRegFileA; // (and size) - Vec* regFileB; // Register file B - int sizeRegFileB; // (and size) - Vec accum[6]; // Accumulator registers - bool negFlags[NUM_LANES]; // Negative flags - bool zeroFlags[NUM_LANES]; // Zero flags - int nextUniform; // Pointer to next uniform to read - DMAReq dmaLoad; // In-flight DMA load - DMAReq dmaStore; // In-flight DMA store - VPMLoadQueue vpmLoadQueue; // VPM load queue - int readStride; // Read stride - int writeStride; // Write stride - SmallSeq* loadBuffer; // Load buffer for loads via TMU + int id; // QPU id + int numQPUs; // QPU count + bool running; // Is QPU active, or has it halted? 
+ int pc; // Program counter + Vec* regFileA; // Register file A + int sizeRegFileA; // (and size) + Vec* regFileB; // Register file B + int sizeRegFileB; // (and size) + Vec accum[6]; // Accumulator registers + bool negFlags[NUM_LANES]; // Negative flags + bool zeroFlags[NUM_LANES]; // Zero flags + int nextUniform; // Pointer to next uniform to read + DMAAddr dmaLoad; // DMA load address + DMAAddr dmaStore; // DMA store address + DMALoadReq dmaLoadSetup; // DMA load setup register + DMAStoreReq dmaStoreSetup; // DMA store setup register + Queue<2, VPMLoadReq> vpmLoadQueue; // VPM load queue + VPMStoreReq vpmStoreSetup; // VPM store setup + int readPitch; // Read pitch + int writeStride; // Write stride + SmallSeq* loadBuffer; // Load buffer for loads via TMU }; // State of the VideoCore. struct State { - QPUState qpu[MAX_QPUS]; // State of each QPU - Word vpm[VPM_SIZE]; // Shared VPM memory - Seq* output; // Output for print statements - int sema[16]; // Semaphores + QPUState qpu[MAX_QPUS]; // State of each QPU + Seq* uniforms; // Kernel parameters + Word vpm[VPM_SIZE]; // Shared VPM memory + Seq* output; // Output for print statements + int sema[16]; // Semaphores }; // Emulator diff --git a/Lib/Target/Encode.cpp b/Lib/Target/Encode.cpp index f5426a9..18dee09 100644 --- a/Lib/Target/Encode.cpp +++ b/Lib/Target/Encode.cpp @@ -1,5 +1,6 @@ #include "Target/Encode.h" #include "Target/Satisfy.h" +#include "Target/Pretty.h" #include #include @@ -194,9 +195,9 @@ void encodeInstr(Instr instr, uint32_t* high, uint32_t* low) instr.LI.imm.tag = IMM_INT32; instr.LI.imm.intVal = 1; break; - case LD2: - case ST3: { - RegId src = instr.tag == LD2 ? SPECIAL_DMA_LD_WAIT : + case DMA_LOAD_WAIT: + case DMA_STORE_WAIT: { + RegId src = instr.tag == DMA_LOAD_WAIT ? 
SPECIAL_DMA_LD_WAIT : SPECIAL_DMA_ST_WAIT; instr.tag = ALU; instr.ALU.setFlags = false; @@ -210,34 +211,6 @@ void encodeInstr(Instr instr, uint32_t* high, uint32_t* low) instr.ALU.srcB.reg = instr.ALU.srcA.reg; break; } - case LD4: { - Reg dest = instr.LD4.dest; - instr.tag = ALU; - instr.ALU.setFlags = false; - instr.ALU.cond.tag = ALWAYS; - instr.ALU.op = A_BOR; - instr.ALU.dest = dest; - instr.ALU.srcA.tag = REG; - instr.ALU.srcA.reg.tag = SPECIAL; - instr.ALU.srcA.reg.regId = SPECIAL_VPM_READ; - instr.ALU.srcB.tag = REG; - instr.ALU.srcB.reg = instr.ALU.srcA.reg; - break; - } - case ST1: { - Reg src = instr.ST1.data; - instr.tag = ALU; - instr.ALU.setFlags = false; - instr.ALU.cond.tag = ALWAYS; - instr.ALU.op = A_BOR; - instr.ALU.dest.tag = SPECIAL; - instr.ALU.dest.regId = SPECIAL_VPM_WRITE; - instr.ALU.srcA.tag = REG; - instr.ALU.srcA.reg = src; - instr.ALU.srcB.tag = REG; - instr.ALU.srcB.reg = src; - break; - } } // Encode core instrcution diff --git a/Lib/Target/Liveness.cpp b/Lib/Target/Liveness.cpp index b566d87..bbbf4c4 100644 --- a/Lib/Target/Liveness.cpp +++ b/Lib/Target/Liveness.cpp @@ -48,30 +48,6 @@ void useDefReg(Instr instr, UseDefReg* useDef) useDef->use.insert(instr.ALU.srcB.reg); return; - // LD1 instruction - case LD1: - // Add source reg to 'use' set - useDef->use.insert(instr.LD1.addr); - return; - - // LD4 instruction - case LD4: - // Add dest reg to 'def' set - useDef->def.insert(instr.LD4.dest); - return; - - // ST1 instruction - case ST1: - // Add source reg to 'use' set - useDef->use.insert(instr.ST1.data); - return; - - // ST2 instruction - case ST2: - // Add source reg to 'use' set - useDef->use.insert(instr.ST2.addr); - return; - // Print integer instruction case PRI: // Add source reg to 'use' set diff --git a/Lib/Target/LoadStore.cpp b/Lib/Target/LoadStore.cpp index 7041363..259d177 100644 --- a/Lib/Target/LoadStore.cpp +++ b/Lib/Target/LoadStore.cpp @@ -6,122 +6,295 @@ namespace QPULib { // 
============================================================================= -// Stride setup +// VPM setup // ============================================================================= -// Generate instructions to set the read stride. +static int vpmSetupReadCode(int n, int hor, int stride) +{ + assert(n >= 1 && n <= 16); // A max of 16 vectors can be read + assert(stride >= 1 && stride <= 64); // Valid stride + assert(hor == 0 || hor == 1); // Horizontal or vertical + + // Max values encoded as 0 + if (n == 16) n = 0; + if (stride == 64) stride = 0; + + // Setup code + int code = n << 20; + code |= stride << 12; + code |= hor << 11; + code |= 2 << 8; -void genSetReadStride(Seq* instrs, int stride) + return code; +} + +static int vpmSetupWriteCode(int hor, int stride) { - int pitch = (stride+1)*4; - assert(pitch < 8192); - int setup = 0x90000000 | pitch; - Reg dst; dst.tag = REG_A; dst.regId = RSV_READ_STRIDE; - Instr instr = genLI(dst, setup); + assert(stride >= 1 && stride <= 64); // Valid stride + assert(hor == 0 || hor == 1); // Horizontal or vertical + + // Max values encoded as 0 + if (stride == 64) stride = 0; + + // Setup code + int code = stride << 12; + code |= hor << 11; + code |= 2 << 8; + + return code; +} + +// Generate instructions to setup VPM load. 
+ +void genSetupVPMLoad(Seq* instrs, int n, int addr, int hor, int stride) +{ + assert(addr < 256); + + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_RD_SETUP; + + int setup = vpmSetupReadCode(n, hor, stride) | (addr & 0xff); + instrs->append(genLI(dst, setup)); + + Instr instr; + instr.tag = VPM_STALL; instrs->append(instr); } -void genSetReadStride(Seq* instrs, Reg stride) +void genSetupVPMLoad(Seq* instrs, int n, Reg addr, int hor, int stride) { - Reg pitch = freshReg(); + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_RD_SETUP; + Reg tmp = freshReg(); - instrs->append(genIncr(pitch, stride, 1)); - instrs->append(genLI(tmp, 0x90000000)); - instrs->append(genLShift(pitch, pitch, 2)); + int setup = vpmSetupReadCode(n, hor, stride); + instrs->append(genLI(tmp, setup)); + instrs->append(genOR(dst, addr, tmp)); - Reg dst; dst.tag = REG_A; dst.regId = RSV_READ_STRIDE; - instrs->append(genOR(dst, tmp, pitch)); + Instr instr; + instr.tag = VPM_STALL; + instrs->append(instr); } -// Generate instructions to set the write stride. +// Generate instructions to setup VPM store. 
-void genSetWriteStride(Seq* instrs, int stride) +void genSetupVPMStore(Seq* instrs, int addr, int hor, int stride) { - int strideBytes = stride*4; - assert(strideBytes < 8192); - int setup = 0xc0010000 | strideBytes; - Reg dst; dst.tag = REG_A; dst.regId = RSV_WRITE_STRIDE; - Instr instr = genLI(dst, setup); - instrs->append(instr); + assert(addr < 256); + + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_WR_SETUP; + + int setup = vpmSetupWriteCode(hor, stride) | (addr & 0xff); + instrs->append(genLI(dst, setup)); } -void genSetWriteStride(Seq* instrs, Reg stride) +void genSetupVPMStore(Seq* instrs, Reg addr, int hor, int stride) { - Reg tmp0 = freshReg(); - Reg tmp1 = freshReg(); - instrs->append(genLShift(tmp0, stride, 2)); - instrs->append(genLI(tmp1, 0xc0010000)); + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_WR_SETUP; - Reg dst; dst.tag = REG_A; dst.regId = RSV_WRITE_STRIDE; - instrs->append(genOR(dst, tmp0, tmp1)); + Reg tmp = freshReg(); + int setup = vpmSetupWriteCode(hor, stride); + instrs->append(genLI(tmp, setup)); + instrs->append(genOR(dst, addr, tmp)); } // ============================================================================= // DMA setup // ============================================================================= +// (rowLen in bytes) +static int dmaSetupStoreCode(int numRows, int rowLen, int hor) +{ + assert(numRows > 0 && numRows <= 128); + assert(rowLen > 0 && rowLen <= 128); + if (numRows == 128) numRows = 0; + if (rowLen == 128) rowLen = 0; + + int setup = 0x80000000; + setup |= numRows << 23; + setup |= rowLen << 16; + setup |= hor << 14; + return setup; +} + +// (rowLen in 32-bit words) +static int dmaSetupLoadCode(int numRows, int rowLen, int hor, int vpitch) +{ + assert(numRows > 0 && numRows <= 16); + assert(rowLen > 0 && rowLen <= 16); + assert(vpitch > 0 && vpitch <= 16); + if (numRows == 16) numRows = 0; + if (rowLen == 16) rowLen = 0; + if (vpitch == 16) vpitch = 0; + + int setup = 0x80000000; + setup |= rowLen 
<< 20; + setup |= numRows << 16; + setup |= vpitch << 12; + setup |= (hor == 0 ? 1 : 0) << 11; + return setup; +} + // Generate instructions to setup DMA load. -void assignDMALoadSetup(Seq* instrs, Reg dst, BufferAorB b, Reg qpuId) +void genSetupDMALoad( + Seq* instrs, int numRows, int rowLen, + int hor, int vpitch, int vpmAddr) { - int setup = 0x80101800; - int buffIdx = (16 * (b == A ? 0 : 1)) << 4; - setup |= buffIdx; + assert(vpmAddr < 2048); + int setup = dmaSetupLoadCode(numRows, rowLen, hor, vpitch); + setup |= vpmAddr; + + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_RD_SETUP; + instrs->append(genLI(dst, setup)); +} + +void genSetupDMALoad( + Seq* instrs, int numRows, int rowLen, + int hor, int vpitch, Reg vpmAddr) +{ + int setup = dmaSetupLoadCode(numRows, rowLen, hor, vpitch); Reg tmp = freshReg(); instrs->append(genLI(tmp, setup)); - instrs->append(genOR(dst, qpuId, tmp)); + + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_RD_SETUP; + instrs->append(genOR(dst, vpmAddr, tmp)); } -// Generate instructions to setup DMA store. +void genStartDMALoad(Seq* instrs, Reg memAddr) +{ + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_DMA_LD_ADDR; + instrs->append(genOR(dst, memAddr, memAddr)); +} + +void genWaitDMALoad(Seq* instrs) +{ + Instr instr; + instr.tag = ALU; + instr.ALU.setFlags = false; + instr.ALU.cond.tag = NEVER; + instr.ALU.op = A_BOR; + instr.ALU.dest.tag = NONE; + instr.ALU.srcA.tag = REG; + instr.ALU.srcA.reg.tag = SPECIAL; + instr.ALU.srcA.reg.regId = SPECIAL_DMA_LD_WAIT; + instr.ALU.srcB.tag = REG; + instr.ALU.srcB.reg = instr.ALU.srcA.reg; + instrs->append(instr); +} + +// Generate instructions to do DMA store. 
+ +void genSetupDMAStore( + Seq* instrs, int numRows, int rowLen, + int hor, int vpmAddr) +{ + assert(vpmAddr < 2048); + int setup = dmaSetupStoreCode(numRows, rowLen, hor); + setup |= vpmAddr << 3; + + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_WR_SETUP; + instrs->append(genLI(dst, setup)); +} -void assignDMAStoreSetup(Seq* instrs, Reg dst, BufferAorB b, Reg qpuId) +void genSetupDMAStore( + Seq* instrs, int numRows, int rowLen, + int hor, Reg vpmAddr) { - int setup = 0x88014000; - int buffIdx = (16 * (b == A ? 2 : 3)) << 7; - setup |= buffIdx; + int setup = dmaSetupStoreCode(numRows, rowLen, hor); Reg tmp0 = freshReg(); instrs->append(genLI(tmp0, setup)); Reg tmp1 = freshReg(); - instrs->append(genLShift(tmp1, qpuId, 3)); + instrs->append(genLShift(tmp1, vpmAddr, 3)); + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_WR_SETUP; instrs->append(genOR(dst, tmp0, tmp1)); } +void genStartDMAStore(Seq* instrs, Reg memAddr) +{ + Reg dst; + dst.tag = SPECIAL; + dst.regId = SPECIAL_DMA_ST_ADDR; + instrs->append(genOR(dst, memAddr, memAddr)); +} + +void genWaitDMAStore(Seq* instrs) +{ + Instr instr; + instr.tag = ALU; + instr.ALU.setFlags = false; + instr.ALU.cond.tag = NEVER; + instr.ALU.op = A_BOR; + instr.ALU.dest.tag = NONE; + instr.ALU.srcA.tag = REG; + instr.ALU.srcA.reg.tag = SPECIAL; + instr.ALU.srcA.reg.regId = SPECIAL_DMA_ST_WAIT; + instr.ALU.srcB.tag = REG; + instr.ALU.srcB.reg = instr.ALU.srcA.reg; + instrs->append(instr); +} + // ============================================================================= -// VPM setup +// DMA stride setup // ============================================================================= -// Generate instructions to setup VPM load. +// Generate instructions to set the DMA read pitch. -void assignVPMLoadSetup(Seq* instrs, Reg dst, BufferAorB b, Reg qpuId) +void genSetReadPitch(Seq* instrs, int pitch) { - int setup = 0x00100200; - int buffIdx = (b == A ? 
0 : 1) << 4; - setup |= buffIdx; + assert(pitch < 8192); + int setup = 0x90000000 | pitch; + Reg dst; dst.tag = SPECIAL; dst.regId = SPECIAL_RD_SETUP; + instrs->append(genLI(dst, setup)); +} +void genSetReadPitch(Seq* instrs, Reg pitch) +{ Reg tmp = freshReg(); - instrs->append(genLI(tmp, setup)); - instrs->append(genOR(dst, qpuId, tmp)); + instrs->append(genLI(tmp, 0x90000000)); + + Reg dst; dst.tag = SPECIAL; dst.regId = SPECIAL_RD_SETUP; + instrs->append(genOR(dst, tmp, pitch)); } -// Generate instructions to setup VPM store. +// Generate instructions to set the DMA write stride. -void genSetupVPMStore(Seq* instrs, BufferAorB b, Reg qpuId) +void genSetWriteStride(Seq* instrs, int stride) { - int setup = 0x00100200; - int buffIdx = (b == A ? 2 : 3) << 4; - setup |= buffIdx; + assert(stride < 8192); + int setup = 0xc0000000 | stride; + Reg dst; dst.tag = SPECIAL; dst.regId = SPECIAL_WR_SETUP; + Instr instr = genLI(dst, setup); + instrs->append(instr); +} +void genSetWriteStride(Seq* instrs, Reg stride) +{ Reg tmp = freshReg(); - instrs->append(genLI(tmp, setup)); + instrs->append(genLI(tmp, 0xc0000000)); - Reg dst; - dst.tag = SPECIAL; - dst.regId = SPECIAL_WR_SETUP; - instrs->append(genOR(dst, qpuId, tmp)); + Reg dst; dst.tag = SPECIAL; dst.regId = SPECIAL_WR_SETUP; + instrs->append(genOR(dst, tmp, stride)); } // ============================================================================ @@ -133,53 +306,13 @@ void loadStorePass(Seq* instrs) Seq newInstrs(instrs->numElems*2); // Put QPU number in a register - Reg qpuId = freshReg(); - Reg qpuNum; qpuNum.tag = SPECIAL; qpuNum.regId = SPECIAL_QPU_NUM; - newInstrs.append(genMove(qpuId, qpuNum)); - - // Initialise strides - genSetReadStride(&newInstrs, 0); - genSetWriteStride(&newInstrs, 0); + //Reg qpuId = freshReg(); + //Reg qpuNum; qpuNum.tag = SPECIAL; qpuNum.regId = SPECIAL_QPU_NUM; + //newInstrs.append(genMove(qpuId, qpuNum)); - // Initialise load/store setup registers - Reg vpmLoadSetup = freshReg(); - Reg 
dmaLoadSetup = freshReg(); - Reg dmaStoreSetup = freshReg(); - - assignDMALoadSetup(&newInstrs, dmaLoadSetup, A, qpuId); - assignDMAStoreSetup(&newInstrs, dmaStoreSetup, A, qpuId); - assignVPMLoadSetup(&newInstrs, vpmLoadSetup, A, qpuId); - - genSetupVPMStore(&newInstrs, A, qpuId); - - // Elaborate LD1, LD3 and ST2 intermediate instructions - Reg sp; sp.tag = SPECIAL; - Reg src; src.tag = REG_A; for (int i = 0; i < instrs->numElems; i++) { Instr instr = instrs->elems[i]; switch (instr.tag) { - case LD1: - sp.regId = SPECIAL_RD_SETUP; - src.regId = RSV_READ_STRIDE; - newInstrs.append(genMove(sp, src)); - newInstrs.append(genMove(sp, dmaLoadSetup)); - sp.regId = SPECIAL_DMA_LD_ADDR; - newInstrs.append(genMove(sp, instr.LD1.addr)); - break; - case LD3: - sp.regId = SPECIAL_RD_SETUP; - newInstrs.append(genMove(sp, vpmLoadSetup)); - for (int j = 0; j < 3; j++) - newInstrs.append(nop()); - break; - case ST2: - sp.regId = SPECIAL_WR_SETUP; - src.regId = RSV_WRITE_STRIDE; - newInstrs.append(genMove(sp, src)); - newInstrs.append(genMove(sp, dmaStoreSetup)); - sp.regId = SPECIAL_DMA_ST_ADDR; - newInstrs.append(genMove(sp, instr.ST2.addr)); - break; case RECV: { instr.tag = TMU0_TO_ACC4; newInstrs.append(instr); diff --git a/Lib/Target/LoadStore.h b/Lib/Target/LoadStore.h index f23308e..a0fb9e0 100644 --- a/Lib/Target/LoadStore.h +++ b/Lib/Target/LoadStore.h @@ -3,13 +3,40 @@ #include "Common/Seq.h" #include "Target/Syntax.h" +#include "Source/Syntax.h" namespace QPULib { -void genSetReadStride(Seq* instrs, int stride); -void genSetReadStride(Seq* instrs, Reg stride); +void genSetupVPMLoad(Seq* instrs, int n, + int addr, int hor, int stride); +void genSetupVPMLoad(Seq* instrs, int n, + Reg addr, int hor, int stride); + +void genSetupVPMStore(Seq* instrs, int addr, int hor, int stride); +void genSetupVPMStore(Seq* instrs, Reg addr, int hor, int stride); + +void genSetupDMALoad( + Seq* instrs, int numRows, int rowLen, + int hor, int vpitch, int vpmAddr); +void genSetupDMALoad( 
+ Seq* instrs, int numRows, int rowLen, + int hor, int vpitch, Reg vpmAddr); +void genStartDMALoad(Seq* instrs, Reg memAddr); +void genWaitDMALoad(Seq* instrs); + +void genSetupDMAStore( + Seq* instrs, int numRows, int rowLen, int hor, int vpmAddr); +void genSetupDMAStore( + Seq* instrs, int numRows, int rowLen, int hor, Reg vpmAddr); +void genStartDMAStore(Seq* instrs, Reg memAddr); +void genWaitDMAStore(Seq* instrs); + +void genSetReadPitch(Seq* instrs, int pitch); +void genSetReadPitch(Seq* instrs, Reg pitch); + void genSetWriteStride(Seq* instrs, int stride); void genSetWriteStride(Seq* instrs, Reg stride); + void loadStorePass(Seq* instrs); } // namespace QPULib diff --git a/Lib/Target/Pretty.cpp b/Lib/Target/Pretty.cpp index d35bb99..af2255c 100644 --- a/Lib/Target/Pretty.cpp +++ b/Lib/Target/Pretty.cpp @@ -3,8 +3,7 @@ namespace QPULib { -#ifdef NOT_USED -void pretty(SubWord sw) +const char* pretty(SubWord sw) { switch (sw) { case A8: return "[7:0]"; @@ -16,8 +15,6 @@ void pretty(SubWord sw) default: assert(false); return ""; } } -#endif // NOT_USED - const char* specialStr(RegId rid) { @@ -30,6 +27,8 @@ const char* specialStr(RegId rid) case SPECIAL_WR_SETUP: return "WR_SETUP"; case SPECIAL_DMA_ST_ADDR: return "DMA_ST_ADDR"; case SPECIAL_DMA_LD_ADDR: return "DMA_LD_ADDR"; + case SPECIAL_DMA_ST_WAIT: return "DMA_ST_WAIT"; + case SPECIAL_DMA_LD_WAIT: return "DMA_LD_WAIT"; case SPECIAL_VPM_READ: return "VPM_READ"; case SPECIAL_VPM_WRITE: return "VPM_WRITE"; case SPECIAL_HOST_INT: return "HOST_INT"; @@ -176,12 +175,6 @@ void pretty(FILE *f, BranchTarget target) fprintf(f, "%i", target.immOffset); } -void pretty(FILE *f, BufferAorB buffer) -{ - if (buffer == A) fprintf(f, "A"); - if (buffer == B) fprintf(f, "B"); -} - void pretty(FILE *f, Instr instr) { assert(f != nullptr); @@ -234,41 +227,6 @@ void pretty(FILE *f, Instr instr) case NO_OP: fprintf(f, "NOP\n"); return; - case LD1: - pretty(f, instr.LD1.buffer); - fprintf(f, " <- LD1("); - pretty(f, 
instr.LD1.addr); - fprintf(f, ")\n"); - return; - case LD2: - fprintf(f, "LD2\n"); - return; - case LD3: - fprintf(f, "LD3("); - pretty(f, instr.LD3.buffer); - fprintf(f, ")\n"); - return; - case LD4: - pretty(f, instr.LD4.dest); - fprintf(f, " <- LD4\n"); - return; - case ST1: - fprintf(f, "ST1("); - pretty(f, instr.ST1.buffer); - fprintf(f, ") <- "); - pretty(f, instr.ST1.data); - fprintf(f, "\n"); - return; - case ST2: - fprintf(f, "ST2("); - pretty(f, instr.ST2.buffer); - fprintf(f, ", "); - pretty(f, instr.ST2.addr); - fprintf(f, ")\n"); - return; - case ST3: - fprintf(f, "ST3\n"); - return; case PRS: fprintf(f, "PRS(\"%s\")", instr.PRS); return; @@ -299,6 +257,12 @@ void pretty(FILE *f, Instr instr) case IRQ: fprintf(f, "IRQ\n"); return; + case VPM_STALL: + fprintf(f, "VPM_STALL\n"); + return; + default: + fprintf(f, "<<UNKNOWN:%d>>\n", instr.tag); + return; } } diff --git a/Lib/Target/Pretty.h b/Lib/Target/Pretty.h index f6a7111..a159741 100644 --- a/Lib/Target/Pretty.h +++ b/Lib/Target/Pretty.h @@ -1,6 +1,7 @@ #ifndef _QPULIB_TARGET_PRETTY_H_ #define _QPULIB_TARGET_PRETTY_H_ +#include <stdio.h> #include "Target/Syntax.h" namespace QPULib { diff --git a/Lib/Target/ReachingDefs.cpp b/Lib/Target/ReachingDefs.cpp index 49ff34d..dfea6c8 100644 --- a/Lib/Target/ReachingDefs.cpp +++ b/Lib/Target/ReachingDefs.cpp @@ -103,14 +103,6 @@ void computeGenKill(InstrId id, Instr instr, DefsOf* defsOf, GenKill* genKill) } break; - // LD4 instruction - case LD4: - // Add dest reg to 'def' set - if (instr.LD4.dest.tag == REG_A) { - isDef = true; - defReg = instr.LD4.dest.regId; - } - break; } if (isDef) { diff --git a/Lib/Target/Satisfy.cpp b/Lib/Target/Satisfy.cpp index c35c94e..3ffd463 100644 --- a/Lib/Target/Satisfy.cpp +++ b/Lib/Target/Satisfy.cpp @@ -77,8 +77,12 @@ RegTag regFileOf(Reg r) if (r.tag == SPECIAL) { if (r.regId == SPECIAL_ELEM_NUM) return REG_A; if (r.regId == SPECIAL_QPU_NUM) return REG_B; + if (r.regId == SPECIAL_RD_SETUP) return REG_A; + if (r.regId == SPECIAL_WR_SETUP) return 
REG_B; if (r.regId == SPECIAL_DMA_LD_WAIT) return REG_A; if (r.regId == SPECIAL_DMA_ST_WAIT) return REG_B; + if (r.regId == SPECIAL_DMA_LD_ADDR) return REG_A; + if (r.regId == SPECIAL_DMA_ST_ADDR) return REG_B; } return NONE; } @@ -94,8 +98,8 @@ bool resolveRegFileConflict(Instr* instr, Instr* newInstr) int rfa = regFileOf(instr->ALU.srcA.reg); int rfb = regFileOf(instr->ALU.srcB.reg); if (rfa != NONE && rfb != NONE) { - bool conflict = rfa == rfb && instr->ALU.srcA.reg.regId != - instr->ALU.srcB.reg.regId; + bool conflict = rfa == rfb && + !(instr->ALU.srcA.reg == instr->ALU.srcB.reg); if (conflict) { *newInstr = remapAToAccum(instr, 0); return true; @@ -205,17 +209,55 @@ static void insertNops(Seq* instrs, Seq* newInstrs) } +// Return true for any instruction that doesn't read from the VPM +bool notVPMGet(Instr instr) +{ + // Use/def sets + UseDefReg useDef; + + useDefReg(instr, &useDef); + for (int i = 0; i < useDef.use.numElems; i++) { + Reg useReg = useDef.use.elems[i]; + if (useReg.tag == SPECIAL && useReg.regId == SPECIAL_VPM_READ) + return false; + } + return true; +} + +// Insert NOPs between VPM setup and VPM read, if needed +static void removeVPMStall(Seq* instrs, Seq* newInstrs) +{ + for (int i = 0; i < instrs->numElems; i++) { + Instr instr = instrs->elems[i]; + if (instr.tag != VPM_STALL) + newInstrs->append(instr); + else { + int numNops = 3; // Number of nops to insert + for (int j = 1; j <= 3; j++) { + if ((i+j) >= instrs->numElems) break; + Instr next = instrs->elems[i+j]; + if (next.tag == LAB) break; + if (notVPMGet(next)) numNops--; else break; + } + for (int j = 0; j < numNops; j++) + newInstrs->append(nop()); + } + } +} + // Combine passes void satisfy(Seq* instrs) { // New instruction sequence - Seq newInstrs(instrs->numElems * 2); + Seq newInstrs0(instrs->numElems * 2); + Seq newInstrs1(instrs->numElems * 2); // Apply passes - insertMoves(instrs, &newInstrs); + insertMoves(instrs, &newInstrs0); + 
insertNops(&newInstrs0, &newInstrs1); instrs->clear(); - insertNops(&newInstrs, instrs); + removeVPMStall(&newInstrs1, instrs); } } // namespace QPULib diff --git a/Lib/Target/Subst.cpp b/Lib/Target/Subst.cpp index 813143f..c4f6727 100644 --- a/Lib/Target/Subst.cpp +++ b/Lib/Target/Subst.cpp @@ -23,14 +23,6 @@ void renameDest(Instr* instr, RegTag vt, RegId v, } return; - // LD4 instruction - case LD4: - if (instr->LD4.dest.tag == vt && instr->LD4.dest.regId == v) { - instr->LD4.dest.tag = wt; - instr->LD4.dest.regId = w; - } - return; - // RECV instruction case RECV: if (instr->RECV.dest.tag == vt && instr->RECV.dest.regId == v) { @@ -61,30 +53,6 @@ void renameUses(Instr* instr, RegTag vt, RegId v, } return; - // LD1 instruction - case LD1: - if (instr->LD1.addr.tag == vt && instr->LD1.addr.regId == v) { - instr->LD1.addr.tag = wt; - instr->LD1.addr.regId = w; - } - return; - - // ST1 instruction - case ST1: - if (instr->ST1.data.tag == vt && instr->ST1.data.regId == v) { - instr->ST1.data.tag = wt; - instr->ST1.data.regId = w; - } - return; - - // ST2 instruction - case ST2: - if (instr->ST2.addr.tag == vt && instr->ST2.addr.regId == v) { - instr->ST2.addr.tag = wt; - instr->ST2.addr.regId = w; - } - return; - // Print integer instruction case PRI: if (instr->PRI.tag == vt && instr->PRI.regId == v) { @@ -123,30 +91,6 @@ void substRegTag(Instr* instr, RegTag vt, RegTag wt) instr->ALU.srcB.reg.tag = wt; return; - // LD1 instruction - case LD1: - if (instr->LD1.addr.tag == vt) - instr->LD1.addr.tag = wt; - return; - - // LD4 instruction - case LD4: - if (instr->LD4.dest.tag == vt) - instr->LD4.dest.tag = wt; - return; - - // ST1 instruction - case ST1: - if (instr->ST1.data.tag == vt) - instr->ST1.data.tag = wt; - return; - - // ST2 instruction - case ST2: - if (instr->ST2.addr.tag == vt) - instr->ST2.addr.tag = wt; - return; - // Print integer instruction case PRI: if (instr->PRI.tag == vt) diff --git a/Lib/Target/Syntax.cpp b/Lib/Target/Syntax.cpp index 
3f3f1a3..6531f51 100644 --- a/Lib/Target/Syntax.cpp +++ b/Lib/Target/Syntax.cpp @@ -85,6 +85,29 @@ Instr genOR(Reg dst, Reg srcA, Reg srcB) return instr; } +// Generate addition instruction. + +Instr genADD(Reg dst, Reg srcA, Reg srcB) +{ + AssignCond always; + always.tag = ALWAYS; + + Instr instr; + instr.tag = ALU; + instr.ALU.setFlags = false; + instr.ALU.cond = always; + instr.ALU.dest = dst; + instr.ALU.srcA.tag = REG; + instr.ALU.srcA.reg = srcA; + instr.ALU.op = A_ADD; + instr.ALU.srcB.tag = REG; + instr.ALU.srcB.reg = srcB; + + return instr; +} + + + // Generate left-shift instruction. Instr genLShift(Reg dst, Reg srcA, int n) diff --git a/Lib/Target/Syntax.h b/Lib/Target/Syntax.h index 7268d6b..81e89d0 100644 --- a/Lib/Target/Syntax.h +++ b/Lib/Target/Syntax.h @@ -69,14 +69,14 @@ enum Special { , SPECIAL_ELEM_NUM , SPECIAL_QPU_NUM , SPECIAL_VPM_READ + , SPECIAL_DMA_ST_WAIT + , SPECIAL_DMA_LD_WAIT // Write-only , SPECIAL_RD_SETUP , SPECIAL_WR_SETUP , SPECIAL_DMA_ST_ADDR - , SPECIAL_DMA_ST_WAIT , SPECIAL_DMA_LD_ADDR - , SPECIAL_DMA_LD_WAIT , SPECIAL_VPM_WRITE , SPECIAL_HOST_INT , SPECIAL_TMU0_S @@ -88,10 +88,11 @@ struct Reg { // Register identifier RegId regId; -}; -inline bool operator==(Reg ra, Reg rb) - { return ra.tag == rb.tag && ra.regId == rb.regId; } + bool operator==(const Reg &r) const { + return tag == r.tag && regId == r.regId; + } +}; // ============================================================================ // Conditions @@ -261,16 +262,6 @@ struct BranchTarget { typedef int Label; -// ============================================================================ -// Loads/store buffering -// ============================================================================ - -// We reserve two load buffers and two store buffers for each QPU in the VPM -// (shared local) memory. The reason for two of each is to allow double -// buffering. We refer to a double buffer as A and B buffers. 
- -enum BufferAorB { A, B }; - // ============================================================================ // Instructions // ============================================================================ @@ -290,32 +281,11 @@ enum InstrTag { , LAB // Label , NO_OP // No-op - // Load instructions - // ----------------- - // - // Four instructions are used to implement a memory load. - - , LD1 // First, DMA vector in DRAM into VPM (local) memory - , LD2 // Second, wait for DMA completion - , LD3 // Third, setup a read from VPM memory - , LD4 // Fourth, transfer from VPM into given register - - // Rules for loads: - // * An LD1 must be followed (eventually) by a corresponding LD2 - // * Ditto for LD3 and LD4 - // * There must be at least 3 instructions between an LD3 and an LD4 - // * An LD1/LD2 need not be followed by a corresponding LD3/LD4, - // thus can be issued speculatively - // * A new LD1 can be issued after an LD2, allowing double buffering - - // Store instructions - // ------------------ - // - // Three instructions are required to perform a memory store. + // DMA + // --- - , ST1 // First, write the vector to VPM (local) memory. - , ST2 // Second, DMA from the VPM out to DRAM. - , ST3 // Third, wait for DMA to complete. + , DMA_LOAD_WAIT // Wait for DMA load to complete + , DMA_STORE_WAIT // Wait for DMA store to complete // Semaphores // ---------- @@ -340,6 +310,11 @@ enum InstrTag { , PRS // Print string , PRI // Print integer , PRF // Print float + + // VPM stall + // --------- + + , VPM_STALL // Marker for VPM read setup }; // QPU instructions @@ -368,35 +343,6 @@ struct Instr { // Labels, denoting branch targets Label label; - // Load instructions - // ----------------- - - // DMA vector at address specifed by register from DRAM into VPM - // (local) memory. To allow double buffering, i.e. 
the VPM to be - // filled by DMA while also being read by a QPU, a flag is used to - // indicate which one of two buffers in the VPM to use for the load - struct { Reg addr; BufferAorB buffer; } LD1; - - // LD2 (wait for DMA read completion) has no parameters - - // Setup a read from VPM memory. A flag indicates which one of - // two buffers in the VPM is being used for the load - struct { BufferAorB buffer; } LD3; - - // Transfer from VPM into given register - struct { Reg dest; } LD4; - - // Store instructions - // ------------------ - - // Write the vector to VPM (local) memory using specified buffer - struct { Reg data; BufferAorB buffer; } ST1; - - // DMA from the VPM out to DRAM at the address in given register. - struct { Reg addr; BufferAorB buffer; } ST2; - - // ST3 (wait for DMA write completion) has no parameters - // Semaphores // ---------- @@ -442,6 +388,7 @@ inline Instr nop() Instr genLI(Reg dst, int i); Instr genMove(Reg dst, Reg src); Instr genOR(Reg dst, Reg srcA, Reg srcB); +Instr genADD(Reg dst, Reg srcA, Reg srcB); Instr genLShift(Reg dst, Reg srcA, int n); Instr genIncr(Reg dst, Reg srcA, int n); diff --git a/Makefile b/Makefile index ee5b320..b841713 100644 --- a/Makefile +++ b/Makefile @@ -30,16 +30,6 @@ endif # QPU or emulation mode ifeq ($(QPU), 1) - -# Check platform before building. Can't be indented, otherwise make complains. 
-RET := $(shell Tools/detectPlatform.sh 1>/dev/null && echo "yes" || echo "no") -#$(info info: '$(RET)') -ifneq ($(RET), yes) -$(error QPU-mode specified on a non-Pi platform; aborting) -else -$(info Building on a Pi platform) -endif - CXX_FLAGS += -DQPU_MODE -I /opt/vc/include OBJ_DIR := $(OBJ_DIR)-qpu LIBS := -L /opt/vc/lib -l bcm_host @@ -72,7 +62,6 @@ OBJ = \ Target/LoadStore.o \ Target/Emulator.o \ Target/Encode.o \ - VideoCore/RegisterMap.o \ VideoCore/Mailbox.o \ VideoCore/Invoke.o \ VideoCore/VideoCore.o @@ -95,7 +84,8 @@ EXAMPLES = \ Rot3D \ Rot3DLib \ ID \ - HeatMap + HeatMap \ + DMA EXAMPLE_TARGETS = $(patsubst %,$(OBJ_DIR)/bin/%,$(EXAMPLES))