From 9a0b60a66f792ef3bd9a8692840214ee1d09880c Mon Sep 17 00:00:00 2001 From: andyfox-rushc Date: Thu, 4 Apr 2024 00:15:51 -0700 Subject: [PATCH] parallel reduction in booth.cc --- passes/techmap/booth.cc | 561 ++++++++++++++++++++++++++++------------ 1 file changed, 401 insertions(+), 160 deletions(-) diff --git a/passes/techmap/booth.cc b/passes/techmap/booth.cc index 09c20b50712..be5cbf51160 100644 --- a/passes/techmap/booth.cc +++ b/passes/techmap/booth.cc @@ -52,8 +52,8 @@ or in generic synthesis call with -booth argument: synth -top my_design -booth */ -//FIXME: These debug prints are broken now, should be fixed or removed. //#define DEBUG_CPA +//#define DEBUG_CSA 1 #include "kernel/sigtools.h" #include "kernel/yosys.h" @@ -104,8 +104,7 @@ struct BoothPassWorker { } // Booth unsigned radix 4 encoder - void BuildBur4e(std::string name, SigBit y0_i, SigBit y1_i, SigBit y2_i, - SigBit &one_o, SigBit &two_o, SigBit &s_o, SigBit &sb_o) + void BuildBur4e(std::string name, SigBit y0_i, SigBit y1_i, SigBit y2_i, SigBit &one_o, SigBit &two_o, SigBit &s_o, SigBit &sb_o) { one_o = module->XorGate(NEW_ID_SUFFIX(name), y0_i, y1_i); s_o = y2_i; @@ -116,8 +115,7 @@ struct BoothPassWorker { void BuildBr4e(std::string name, SigBit y2_m1_i, SigBit y2_i, // y2i - SigBit y2_p1_i, - SigBit &negi_o, SigBit &twoi_n_o, SigBit &onei_n_o, SigBit &cori_o) + SigBit y2_p1_i, SigBit &negi_o, SigBit &twoi_n_o, SigBit &onei_n_o, SigBit &cori_o) { auto y2_p1_n = module->NotGate(NEW_ID_SUFFIX(name), y2_p1_i); auto y2_n = module->NotGate(NEW_ID_SUFFIX(name), y2_i); @@ -130,9 +128,8 @@ struct BoothPassWorker { // (y2_p1 & y2_n & y2_m1_n) // ) twoi_n_o = module->NorGate(NEW_ID_SUFFIX(name), - module->AndGate(NEW_ID_SUFFIX(name), y2_p1_n, module->AndGate(NEW_ID_SUFFIX(name), y2_i, y2_m1_i)), - module->AndGate(NEW_ID_SUFFIX(name), y2_p1_i, module->AndGate(NEW_ID_SUFFIX(name), y2_n, y2_m1_n)) - ); + module->AndGate(NEW_ID_SUFFIX(name), y2_p1_n, module->AndGate(NEW_ID_SUFFIX(name), y2_i, y2_m1_i)), + module->AndGate(NEW_ID_SUFFIX(name), y2_p1_i, module->AndGate(NEW_ID_SUFFIX(name), y2_n, y2_m1_n))); // onei_n = ~(y2_m1_i ^ y2_i); onei_n_o = module->XnorGate(NEW_ID_SUFFIX(name), y2_m1_i, y2_i); @@ -143,17 +140,14 @@ struct BoothPassWorker { // // signed booth radix 4 decoder // - void BuildBr4d(std::string name, SigBit nxj_m1_i, SigBit twoi_n_i, SigBit xj_i, SigBit negi_i, SigBit onei_n_i, - SigBit &ppij_o, SigBit &nxj_o) + void BuildBr4d(std::string name, SigBit nxj_m1_i, SigBit twoi_n_i, SigBit xj_i, SigBit negi_i, SigBit onei_n_i, SigBit &ppij_o, SigBit &nxj_o) { // nxj_in = xnor(xj,negi) // nxj_o = xnj_in, // ppij = ~( (nxj_m1_i | twoi_n_i) & (nxj_int | onei_n_i)); nxj_o = module->XnorGate(NEW_ID_SUFFIX(name), xj_i, negi_i); - ppij_o = module->NandGate(NEW_ID_SUFFIX(name), - module->OrGate(NEW_ID_SUFFIX(name), nxj_m1_i, twoi_n_i), - module->OrGate(NEW_ID_SUFFIX(name), nxj_o, onei_n_i) - ); + ppij_o = module->NandGate(NEW_ID_SUFFIX(name), module->OrGate(NEW_ID_SUFFIX(name), nxj_m1_i, twoi_n_i), + module->OrGate(NEW_ID_SUFFIX(name), nxj_o, onei_n_i)); } /* @@ -161,9 +155,8 @@ struct BoothPassWorker { using non-booth encoded logic. We can save a booth encoder for the first couple of bits. */ - void BuildBoothQ1(std::string name, SigBit negi_i, SigBit cori_i, SigBit x0_i, SigBit x1_i, SigBit y0_i, - SigBit y1_i, - SigBit &nxj_o, SigBit &cor_o, SigBit &pp0_o, SigBit &pp1_o) + void BuildBoothQ1(std::string name, SigBit negi_i, SigBit cori_i, SigBit x0_i, SigBit x1_i, SigBit y0_i, SigBit y1_i, SigBit &nxj_o, + SigBit &cor_o, SigBit &pp0_o, SigBit &pp1_o) { /* assign NXJO = ~(X1 ^ NEGI); @@ -186,9 +179,8 @@ struct BoothPassWorker { cor_o = module->AndGate(NEW_ID_SUFFIX(name), pp1_nor_pp0, cori_i); } - void BuildBitwiseFa(Module *mod, std::string name, const SigSpec &sig_a, const SigSpec &sig_b, - const SigSpec &sig_c, const SigSpec &sig_x, const SigSpec &sig_y, - const std::string &src = "") + void BuildBitwiseFa(Module *mod, std::string name, const SigSpec &sig_a, const SigSpec &sig_b, const SigSpec &sig_c, const SigSpec &sig_x, + const SigSpec &sig_y, const std::string &src = "") { // We can't emit a single wide full-adder cell here since // there would typically be feedback loops involving the cells' @@ -200,8 +192,7 @@ struct BoothPassWorker { log_assert(sig_a.size() == sig_y.size()); for (int i = 0; i < sig_a.size(); i++) - mod->addFa(stringf("%s[%d]", name.c_str(), i), sig_a[i], sig_b[i], - sig_c[i], sig_x[i], sig_y[i], src); + mod->addFa(stringf("%s[%d]", name.c_str(), i), sig_a[i], sig_b[i], sig_c[i], sig_x[i], sig_y[i], src); } void run() @@ -216,8 +207,7 @@ struct BoothPassWorker { int x_sz = GetSize(A), y_sz = GetSize(B), z_sz = GetSize(Y); if (x_sz < 4 || y_sz < 4 || z_sz < 8) { - log_debug("Not mapping cell %s sized at %dx%x, %x: size below threshold\n", - log_id(cell), x_sz, y_sz, z_sz); + log_debug("Not mapping cell %s sized at %dx%x, %x: size below threshold\n", log_id(cell), x_sz, y_sz, z_sz); continue; } @@ -240,7 +230,7 @@ struct BoothPassWorker { y_sz_revised = y_sz + 1; } else { x_sz_revised = y_sz; - } + } } else { if (x_sz % 2 != 0) { y_sz_revised = x_sz + 1; @@ -258,7 +248,6 @@ struct BoothPassWorker { log_assert((x_sz_revised == y_sz_revised) && (x_sz_revised % 2 == 0) && (y_sz_revised % 2 == 0)); - A.extend_u0(x_sz_revised, is_signed); B.extend_u0(y_sz_revised, is_signed); @@ -280,18 +269,16 @@ struct BoothPassWorker { if (!lowpower) CreateBoothMult(module, - A, // multiplicand - B, // multiplier(scanned) - Y, // result - is_signed - ); + A, // multiplicand + B, // multiplier(scanned) + Y, // result + is_signed); else CreateBoothLowpowerMult(module, - A, // multiplicand - B, // multiplier(scanned) - Y, // result - is_signed - ); + A, // multiplicand + B, // multiplier(scanned) + Y, // result + is_signed); module->remove(cell); booth_counter++; @@ -306,11 +293,10 @@ struct BoothPassWorker { while (summands.size() > 2) { std::vector new_summands; int i; - for (i = 0; i < (int) summands.size() - 2; i += 3) { + for (i = 0; i < (int)summands.size() - 2; i += 3) { SigSpec x = module->addWire(NEW_ID, width); SigSpec y = module->addWire(NEW_ID, width); - BuildBitwiseFa(module, NEW_ID.str(), summands[i], summands[i + 1], - summands[i + 2], x, y); + BuildBitwiseFa(module, NEW_ID.str(), summands[i], summands[i + 1], summands[i + 2], x, y); new_summands.push_back(y); new_summands.push_back({x.extract(0, width - 1), State::S0}); } @@ -335,10 +321,9 @@ struct BoothPassWorker { */ void CreateBoothMult(RTLIL::Module *module, - SigSpec X, // multiplicand - SigSpec Y, // multiplier - SigSpec Z, - bool is_signed) + SigSpec X, // multiplicand + SigSpec Y, // multiplier + SigSpec Z, bool is_signed) { // result int z_sz = Z.size(); @@ -369,15 +354,12 @@ struct BoothPassWorker { SigSpec ppij_row_n; BuildBoothMultDecoderRowN(module, - X, // multiplicand - one_int[i], two_int[i], s_int[i], sb_int[i], ppij_row_n, i, - is_signed - ); + X, // multiplicand + one_int[i], two_int[i], s_int[i], sb_int[i], ppij_row_n, i, is_signed); // data, shift, sign ppij_int.push_back(std::make_tuple(ppij_row_n, i * 2, s_int[i])); } - // Debug dump out partial products // DebugDumpPP(ppij_int); @@ -394,18 +376,51 @@ struct BoothPassWorker { // Debug: dump out aligned partial products. // Later on yosys will clean up unused constants - // DebugDumpAlignPP(aligned_pp); - SigSig wtree_sum = WallaceSum(z_sz, aligned_pp); +#ifdef DEBUG_CSA + DebugDumpAlignPP(aligned_pp); +#endif + // SigSig wtree_sum = WallaceSum(z_sz, aligned_pp); + + SigSpec s_vec; + SigSpec c_vec; + std::vector> debug_csa_trees; + BuildCSATree(module, aligned_pp, s_vec, c_vec, debug_csa_trees); +#ifdef DEBUG_CSA + printf("Size of Sum Vec %d Size of Carry Vec %d\n", s_vec.size(), c_vec.size()); + printf("Sum Vec %s \n", s_vec.as_string().c_str()); + printf("Carry Vec %s \n", c_vec.as_string().c_str()); + printf("Size of Sum %d Size of Result %d\n", s_vec.size(), Z.size()); +#endif // Debug code: Dump out the csa trees // DumpCSATrees(debug_csa_trees); + // BuildCPA(module, s_vec, c_vec, Z); // Build the CPA to do the final accumulation. - log_assert(wtree_sum.second[0] == State::S0); - if (mapped_cpa) - BuildCPA(module, wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z); - else - module->addAdd(NEW_ID, wtree_sum.first, {wtree_sum.second.extract_end(1), State::S0}, Z); + // log_assert(wtree_sum.second[0] == State::S0); + // wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z); + + // if (mapped_cpa) + // BuildCPA(module, wtree_sum.first, {State::S0, wtree_sum.second.extract_end(1)}, Z); + // else + // printf("Svec size %d c_vec size %d\n", s_vec.size(), c_vec.size()); + // module->addAdd(NEW_ID, s_vec, + // {c_vec.extract(0,c_vec.size()-1),State::S0}, Z); + + // Brent Kung adder + SigSpec g = module->addWire(NEW_ID, s_vec.size()); + SigSpec p = module->addWire(NEW_ID, s_vec.size()); + SigSpec co = module->addWire(NEW_ID, s_vec.size()); + module->addAnd(NEW_ID, s_vec, {c_vec.extract(0, c_vec.size() - 1), State::S0}, g); + module->addXor(NEW_ID, s_vec, {c_vec.extract(0, c_vec.size() - 1), State::S0}, p); + auto lcu = module->addCell(NEW_ID, ID($lcu)); + auto lcu_int = module->addWire(NEW_ID, s_vec.size()); + lcu->setParam(ID::WIDTH, s_vec.size()); + lcu->setPort(ID::G, g); + lcu->setPort(ID::P, p); + lcu->setPort(ID::CI, State::S0); + lcu->setPort(ID::CO, lcu_int); + module->addXor(NEW_ID, p, {lcu_int, State::S0}, Z); } /* @@ -413,9 +428,8 @@ struct BoothPassWorker { */ void BuildBoothMultDecoderRow0(RTLIL::Module *module, - SigSpec X, // multiplicand - SigSpec s_int, SigSpec sb_int, SigSpec one_int, - SigSpec two_int, SigSpec &ppij_vec, bool is_signed) + SigSpec X, // multiplicand + SigSpec s_int, SigSpec sb_int, SigSpec one_int, SigSpec two_int, SigSpec &ppij_vec, bool is_signed) { (void)sb_int; (void)module; @@ -427,21 +441,19 @@ struct BoothPassWorker { // 1..xsize -1 for (int i = 1; i < x_sz; i++) - ppij_vec.append(Bur4d_n(stringf("row0_dec_%d", i), X[i], X[i - 1], - one_int[0], two_int[0], s_int[0])); - + ppij_vec.append(Bur4d_n(stringf("row0_dec_%d", i), X[i], X[i - 1], one_int[0], two_int[0], s_int[0])); // The redundant bit. Duplicate decoding of last bit. if (!is_signed) { ppij_vec.append(Bur4d_msb("row0_dec_msb", X.msb(), two_int[0], s_int[0])); } else { - ppij_vec.append(Bur4d_n("row0_dec_msb", X.msb(), X.msb(), - one_int[0], two_int[0], s_int[0])); + ppij_vec.append(Bur4d_n("row0_dec_msb", X.msb(), X.msb(), one_int[0], two_int[0], s_int[0])); } // append the sign bits if (is_signed) { - SigBit e = module->XorGate(NEW_ID, s_int[0], module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int[0], one_int[0]))); + SigBit e = + module->XorGate(NEW_ID, s_int[0], module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int[0], one_int[0]))); ppij_vec.append({module->NotGate(NEW_ID, e), e, e}); } else { // append the sign bits @@ -452,10 +464,8 @@ struct BoothPassWorker { // Build a generic row of decoders. void BuildBoothMultDecoderRowN(RTLIL::Module *module, - SigSpec X, // multiplicand - SigSpec one_int, SigSpec two_int, SigSpec s_int, SigSpec sb_int, - SigSpec &ppij_vec, int row_ix, - bool is_signed) + SigSpec X, // multiplicand + SigSpec one_int, SigSpec two_int, SigSpec s_int, SigSpec sb_int, SigSpec &ppij_vec, int row_ix, bool is_signed) { (void)module; int x_sz = GetSize(X); @@ -465,28 +475,124 @@ struct BoothPassWorker { // core bits for (int i = 1; i < x_sz; i++) - ppij_vec.append(Bur4d_n(stringf("row_%d_dec_%d", row_ix, i), X[i], X[i - 1], - one_int, two_int, s_int)); + ppij_vec.append(Bur4d_n(stringf("row_%d_dec_%d", row_ix, i), X[i], X[i - 1], one_int, two_int, s_int)); - if (!is_signed) { // redundant bit + if (!is_signed) { // redundant bit ppij_vec.append(Bur4d_msb("row_dec_red", X[x_sz - 1], two_int, s_int)); } else { - ppij_vec.append(Bur4d_n(stringf("row_%d_dec_msb", row_ix), X[x_sz - 1], X[x_sz - 1], - one_int, two_int, s_int)); + ppij_vec.append(Bur4d_n(stringf("row_%d_dec_msb", row_ix), X[x_sz - 1], X[x_sz - 1], one_int, two_int, s_int)); } - ppij_vec.append(!is_signed ? sb_int[0] : module->XorGate(NEW_ID, sb_int, module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int, one_int)))); + ppij_vec.append(!is_signed + ? sb_int[0] + : module->XorGate(NEW_ID, sb_int, module->AndGate(NEW_ID, X.msb(), module->OrGate(NEW_ID, two_int, one_int)))); ppij_vec.append(State::S1); } - void DebugDumpAlignPP(std::vector> &aligned_pp) + void DumpModule() + { + printf("Cells\n"); + for (auto cell : module->cells_) { + printf("+Cell %s type %s\n", cell.first.c_str(), cell.second->type.c_str()); + printf("Connections on cell\n"); + for (auto c : cell.second->connections()) { + printf("Connection %s\n", c.first.c_str()); + DumpSigSpec(1, c.second); + } + printf("-\n"); + } + printf("Wires\n"); + for (auto w : module->wires_) { + printf("Wire %s (%p) width %d port_id %d dir %s\n", w.first.c_str(), w.second, w.second->width, w.second->port_id, + w.second->port_input ? "input" : "output"); + } + printf("Connections\n"); + for (auto con : module->connections_) { + SigSpec from = con.first; + SigSpec to = con.second; + printf("\t From connection %s width %d bit lengtht %d", from.is_wire() ? "wire" : "?", from.size(), from.bits().size()); + printf("\tTo connection %s ", to.is_wire() ? "wire" : to.is_bit() ? "bit" : to.is_chunk() ? "chunk" : "?"); + if (to.is_bit()) { + RTLIL::SigBit sb = to.as_bit(); + if (to.as_bit().wire) { + printf("Sig bit wire offset %d dir %s\n", to.as_bit().offset, to.as_bit().wire->port_input ? "ip" : "op"); + } else + printf("Sig bit value is state\n"); + } + printf("\n"); + } + } + + std::string Indent(int indent) + { + std::string ret = ""; + for (int i = 0; i < indent; i++) + ret = ret + '\t'; + return ret; + } + + void DumpSigSpec(int ident, RTLIL::SigSpec sp, std::string hdr = "") + { + printf("%s Sig spec %s\n", Indent(ident).c_str(), hdr.c_str()); + printf("%s Width %d\n", Indent(ident + 1).c_str(), sp.size()); + printf("%s Type: %s\n", Indent(ident + 1).c_str(), + sp.is_wire() + ? "wire" + : sp.is_chunk() ? "chunk" + : sp.is_bit() ? "bit" + : sp.is_fully_const() + ? "fully const" + : sp.is_fully_def() + ? "fully def" + : sp.is_fully_undef() ? "fully undef" : sp.has_marked_bits() ? "marked bits" : "unk"); + + printf("%s Number of bits %d Number of chunks %d\n", Indent(ident + 1).c_str(), sp.bits().size(), sp.chunks().size()); + printf("%s Bits:\n", Indent(ident + 1).c_str()); + int count = 0; + // b is SigBit + for (auto b : sp.bits()) { + if (b.wire) { + printf("%s [%d] Bit wire %s (%p)\n", Indent(ident + 1).c_str(), count, b.wire->name.c_str(), b.wire); + } else { + if (b.data == State::S0) + printf("%s Bit constant 0\n", Indent(ident + 1).c_str()); + else if (b.data == State::S1) + printf("%s Bit constant 1\n", Indent(ident + 1).c_str()); + else + printf("%s Unknown constant\n", Indent(ident + 1).c_str()); + } + count++; + } + + /* + count=0; + printf("%s Chunks:\n",Indent(ident).c_str()); + if (sp.chunks().size() >0){ + for (auto sc: sp.chunks()){ + printf("%s [%d] Chunk wire %s (%p)width %d offset %d\n", + Indent(ident).c_str(), + count, + sc.wire ? sc.wire -> name.c_str(): "non-wire chunk: empty", + sc.wire, + sc.width, + sc.offset); + count++; + } + } + */ + } + + void DebugDumpAlignPP(std::vector &aligned_pp) { printf("Aligned & Padded Partial products\n"); int pp_ix = 0; for (auto pp_row : aligned_pp) { printf("PP_%d \t", pp_ix); - for (unsigned i = 0; i < pp_row.size(); i++) - printf("[%d] %s ", i, pp_row[i] == nullptr ? " 0 " : pp_row[i]->name.c_str()); + for (int i = 0; i < pp_row.size(); i++) { + RTLIL::SigSpec sb_el = pp_row.extract(i); + std::string col_ix = "PP_" + std::to_string(pp_ix) + " col" + std::to_string(i); + DumpSigSpec(1, sb_el, col_ix); + } printf("\n"); pp_ix++; } @@ -558,8 +664,145 @@ struct BoothPassWorker { } } - void BuildCSATree(RTLIL::Module *module, std::vector &bits_to_reduce, SigSpec &s_vec, - SigSpec &c_vec, std::vector> &debug_csa_trees) + /* + Decompose the bits for pp[x] 2**n to reduce into groups of 3. Each group + will ultimately feed a csa. Where we have odd bits + we put them in A, B. + */ + void makeCSAGroups(SigSpec bits_to_reduce, std::vector> &groups, SigBit &A, SigBit &B) + { + int bit_ix = 0; + A = State::S0; + B = State::S0; + while (bit_ix < bits_to_reduce.size()) { + if (bit_ix == (bits_to_reduce.size() - 1)) { + A = bits_to_reduce.extract(bit_ix); + bit_ix++; + } else if (bit_ix == bits_to_reduce.size() - 2) { + A = bits_to_reduce.extract(bit_ix); + B = bits_to_reduce.extract(bit_ix + 1); + bit_ix = bit_ix + 2; + } else { + groups.push_back(std::make_tuple(bits_to_reduce.extract(bit_ix), bits_to_reduce.extract(bit_ix + 1), + bits_to_reduce.extract(bit_ix + 2))); + bit_ix = bit_ix + 3; + } + } + } + + void makeCSARow(int row_ix, std::vector> &debug_csa_trees, RTLIL::Module *module, + std::vector> &groups, SigBit &A, SigBit &B, SigSpec &carry_bits_to_sum, + SigSpec &next_bits_to_reduce) + { + int col_ix = 0; + std::vector csa_row; + + for (auto g : groups) { + std::string csa_name = "csa_" + std::to_string(row_ix) + "_" + std::to_string(col_ix); + auto csa = module->addCell(new_id(csa_name, __LINE__, ""), ID($fa)); + csa_row.push_back(csa); + csa->setParam(ID::WIDTH, 1); + col_ix++; + csa->setPort(ID::A, get<0>(g)); + csa->setPort(ID::B, get<1>(g)); + csa->setPort(ID::C, get<2>(g)); + std::string s_name = "s_" + std::to_string(row_ix) + "_" + std::to_string(col_ix); + std::string c_name = "c_" + std::to_string(row_ix) + "_" + std::to_string(col_ix); + SigBit sum = module->addWire(new_id(s_name, __LINE__, ""), 1); + SigBit carry = module->addWire(new_id(c_name, __LINE__, ""), 1); + csa->setPort(ID::Y, sum); + csa->setPort(ID::X, carry); + carry_bits_to_sum.append(carry); + next_bits_to_reduce.append(sum); + +#ifdef DEBUG_CSA + printf("CSA Appending carry bit %s\n", carry.is_wire() ? carry.wire->name.c_str() : "unk"); + printf("New number of carry bits %d \n", carry_bits_to_sum.size()); + DumpSigSpec(1, carry_bits_to_sum); +#endif + } + + if (A != State::S0 && B != State::S0) { + SigBit sum; + SigBit carry; + std::string ha_name = "ha_" + std::to_string(row_ix) + "_" + std::to_string(col_ix); + BuildHa(ha_name, A, B, sum, carry); + carry_bits_to_sum.append(carry); + next_bits_to_reduce.append(sum); +#ifdef DEBUG_CSA + printf("HA Appending carry bit %s\n", carry.is_wire() ? carry.wire->name.c_str() : "unk"); + printf("New number of carry bits %d \n", carry_bits_to_sum.size()); + DumpSigSpec(1, carry_bits_to_sum); +#endif + + } else if (A != State::S0) { + next_bits_to_reduce.append(A); + col_ix++; + } + debug_csa_trees.push_back(csa_row); + } + + void ReduceBitsParallel(RTLIL::Module *module, int column_ix, SigSpec column_bits, SigBit &s_result, SigBit &c_result, + SigSpec &carry_bits_to_sum, std::vector> &debug_csa_trees) + { + (void)column_ix; + +#ifdef DEBUG_CSA + if (column_bits.size() > 0) + printf("Parallel Column %d reduce bits parallel given %d bits (%s) to reduce\n", column_ix, column_bits.size(), + column_bits.as_string().c_str()); + else + printf("Column %d given %d bits\n", column_ix, 0); + +#endif + SigSpec next_bits_to_reduce = column_bits; + std::vector> groups; + + SigBit A; + SigBit B; + int row_ix = 0; + + while (next_bits_to_reduce.size() > 1) { + +#ifdef DEBUG_CSA + printf("Row %d Column %d Reducing %d bits\n", row_ix, column_ix, next_bits_to_reduce.size()); +#endif + groups.clear(); + makeCSAGroups(next_bits_to_reduce, groups, A, B); +#ifdef DEBUG_CSA + printf("Row %d Got %ld groups to reduce + A %d + B %d\n", row_ix, groups.size(), A == State::S0 ? 0 : 1, + B == State::S0 ? 0 : 1); +#endif + + SigSpec csa_row_bits; + int csa_count_before = debug_csa_trees.size(); + makeCSARow(row_ix, debug_csa_trees, module, groups, A, B, carry_bits_to_sum, csa_row_bits); + int csa_count_after = debug_csa_trees.size(); + +#ifdef DEBUG_CSA + printf("Reduced %d bits to %d bits using %d csa components\n", next_bits_to_reduce.size(), csa_row_bits.size(), + csa_count_after - csa_count_before); +#endif + + next_bits_to_reduce = csa_row_bits; + row_ix++; + } + + s_result = State::S0; + c_result = State::S0; + if (next_bits_to_reduce.size() == 1) + s_result = next_bits_to_reduce.extract(0); + if (carry_bits_to_sum.size() > 0) { + c_result = carry_bits_to_sum.extract(carry_bits_to_sum.size() - 1); + carry_bits_to_sum.remove(carry_bits_to_sum.size() - 1); + } + } + + /* + Build a parallel CSA tree + */ + void BuildCSATree(RTLIL::Module *module, std::vector &bits_to_reduce, SigSpec &s_vec, SigSpec &c_vec, + std::vector> &debug_csa_trees) { if (!(bits_to_reduce.size() > 0)) @@ -569,6 +812,10 @@ struct BoothPassWorker { int row_size = bits_to_reduce.size(); SigSpec carry_bits_to_add_to_next_column; +#ifdef DEBUG_CSA + printf("Reducing %d partial products. Each with %d columns\n", row_size, column_size); +#endif + for (int column_ix = 0; column_ix < column_size; column_ix++) { // get the bits in this column. @@ -578,8 +825,9 @@ struct BoothPassWorker { column_bits.append(bits_to_reduce[row_ix][column_ix]); } for (auto c : carry_bits_to_add_to_next_column) { + #ifdef DEBUG_CSA - printf("\t Propagating column bit %s to column %d from column %d\n", c->name.c_str(), column_ix, column_ix - 1); + printf("\t Propagating column bit %s to column %d from column %d\n", c.wire->name.c_str(), column_ix, column_ix - 1); #endif column_bits.append(c); } @@ -589,25 +837,26 @@ struct BoothPassWorker { #ifdef DEBUG_CSA printf("Column %d Reducing %d bits\n", column_ix, column_bits.size()); for (auto b : column_bits) { - printf("\t %s\n", b->name.c_str()); + printf("\t %s\n", b.wire ? b.wire->name.c_str() + : b.data == State::S0 ? "Const 0" : (b.data == State::S1 ? "Const 1" : " Unk constant")); } printf("\n"); #endif - SigBit s, c; -#ifdef DEBUG_CSA - int csa_count_before = debug_csa_trees[column_ix].size(); -#endif - ReduceBits(module, column_ix, column_bits, s, c, carry_bits_to_add_to_next_column, debug_csa_trees); + ReduceBitsParallel(module, column_ix, column_bits, s, c, carry_bits_to_add_to_next_column, debug_csa_trees); s_vec.append(s); c_vec.append(c); #ifdef DEBUG_CSA - int csa_count_after = debug_csa_trees[column_ix].size(); - - printf("Column %d Created %d csa tree elements\n", column_ix, csa_count_after - csa_count_before); + printf("#Carry bits to next column: %d (%s)\n", carry_bits_to_add_to_next_column.size(), + carry_bits_to_add_to_next_column.as_string().c_str()); + for (auto b : carry_bits_to_add_to_next_column) { + printf("\t %s\n", b.wire ? b.wire->name.c_str() + : b.data == State::S0 ? "Const 0" : (b.data == State::S1 ? "Const 1" : " Unk constant")); + } + printf("\n"); #endif } } @@ -632,14 +881,12 @@ struct BoothPassWorker { Pad out rows with zeros and left the opt pass clean them up. */ - void AlignPP(int z_sz, std::vector> &ppij_int, - std::vector &aligned_pp) + void AlignPP(int z_sz, std::vector> &ppij_int, std::vector &aligned_pp) { unsigned aligned_pp_ix = aligned_pp.size() - 1; // default is zero for everything (so don't have to think to hard // about padding). - for (unsigned i = 0; i < aligned_pp.size(); i++) { for (int j = 0; j < z_sz; j++) { aligned_pp[i][j] = State::S0; @@ -730,12 +977,11 @@ struct BoothPassWorker { // Make the carry results.. Two extra bits after fa. SigBit carry_out = module->addWire(NEW_ID, 1); module->addFa(NEW_ID_SUFFIX(stringf("cpa_%d_fa_%d", cpa_id, n)), - /* A */ s_vec[n], - /* B */ c_vec[n - 1], - /* C */ carry, - /* X */ carry_out, - /* Y */ result[n] - ); + /* A */ s_vec[n], + /* B */ c_vec[n - 1], + /* C */ carry, + /* X */ carry_out, + /* Y */ result[n]); carry = carry_out; #ifdef DEBUG_CPA @@ -758,12 +1004,11 @@ struct BoothPassWorker { else { SigBit carry_out = module->addWire(NEW_ID_SUFFIX(stringf("cpa_%d_carry_%d", cpa_id, n)), 1); module->addFa(NEW_ID_SUFFIX(stringf("cpa_%d_fa_%d", cpa_id, n)), - /* A */ s_vec[n], - /* B */ c_vec[n - 1], - /* C */ carry, - /* X */ carry_out, - /* Y */ result[n] - ); + /* A */ s_vec[n], + /* B */ c_vec[n - 1], + /* C */ carry, + /* X */ carry_out, + /* Y */ result[n]); carry = carry_out; #ifdef DEBUG_CPA printf("CPA bit [%d] Cell %s IPs [%s] [%s] [%s]\n", n, fa_cell->name.c_str(), s_vec[n]->name.c_str(), @@ -777,10 +1022,18 @@ struct BoothPassWorker { // Pass the carry bits from each csa to the next // column for summation. - void ReduceBits(RTLIL::Module *module, int column_ix, SigSpec column_bits, SigBit &s_result, SigBit &c_result, - SigSpec &carry_bits_to_sum, std::vector> &debug_csa_trees) + void ReduceBits(RTLIL::Module *module, int column_ix, SigSpec column_bits, SigBit &s_result, SigBit &c_result, SigSpec &carry_bits_to_sum, + std::vector> &debug_csa_trees) { +#ifdef DEBUG_CSA + if (column_bits.size() > 0) + printf("Serial reduce Column %d reduce bits serial given %d bits (%s) to reduce\n", column_ix, column_bits.size(), + column_bits.as_string().c_str()); + else + printf("Serial reduce given 0 bits to reduce\n"); +#endif + int csa_ix = 0; int column_size = column_bits.size(); @@ -799,18 +1052,17 @@ struct BoothPassWorker { auto c_wire = module->addWire(NEW_ID_SUFFIX(stringf("csa_%d_%d_c", column_ix, csa_ix + 1)), 1); auto csa = module->addFa(NEW_ID_SUFFIX(stringf("csa_%d_%d", column_ix, csa_ix)), - /* A */ first_csa_ips[0], - /* B */ first_csa_ips.size() > 1 ? first_csa_ips[1] : State::S0, - /* C */ first_csa_ips.size() > 2 ? first_csa_ips[2] : State::S0, - /* X */ c_wire, - /* Y */ s_wire - ); + /* A */ first_csa_ips[0], + /* B */ first_csa_ips.size() > 1 ? first_csa_ips[1] : State::S0, + /* C */ first_csa_ips.size() > 2 ? first_csa_ips[2] : State::S0, + /* X */ c_wire, + /* Y */ s_wire); s_result = s_wire; c_result = c_wire; - debug_csa_trees[column_ix].push_back(csa); - csa_ix++; + // debug_csa_trees[column_ix].push_back(csa); + csa_ix++; if (var_ix <= column_bits.size() - 1) carry_bits_to_sum.append(c_wire); @@ -831,14 +1083,13 @@ struct BoothPassWorker { auto s_wire = module->addWire(NEW_ID_SUFFIX(stringf("csa_%d_%d_s", column_ix, csa_ix + 1)), 1); auto csa = module->addFa(NEW_ID_SUFFIX(stringf("csa_%d_%d", column_ix, csa_ix)), - /* A */ s_result, - /* B */ csa_ips[0], - /* C */ csa_ips.size() > 1 ? csa_ips[1] : State::S0, - /* X */ c_wire, - /* Y */ s_wire - ); - - debug_csa_trees[column_ix].push_back(csa); + /* A */ s_result, + /* B */ csa_ips[0], + /* C */ csa_ips.size() > 1 ? csa_ips[1] : State::S0, + /* X */ c_wire, + /* Y */ s_wire); + + // debug_csa_trees[column_ix].push_back(csa); csa_ix++; if (var_ix <= column_bits.size() - 1) @@ -852,8 +1103,8 @@ struct BoothPassWorker { } } - void BuildBoothMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int, - SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, int &encoder_ix, bool is_signed) + void BuildBoothMultEncoders(SigSpec Y, SigSpec &one_int, SigSpec &two_int, SigSpec &s_int, SigSpec &sb_int, RTLIL::Module *module, + int &encoder_ix, bool is_signed) { int y_sz = GetSize(Y); @@ -866,8 +1117,7 @@ struct BoothPassWorker { sb_int.append(module->addWire(NEW_ID_SUFFIX(stringf("sb_int_%d", encoder_ix)), 1)); if (y_ix == 0) { - BuildBur4e(enc_name, State::S0, Y[y_ix], - Y[y_ix + 1], one_int[encoder_ix], two_int[encoder_ix], s_int[encoder_ix], + BuildBur4e(enc_name, State::S0, Y[y_ix], Y[y_ix + 1], one_int[encoder_ix], two_int[encoder_ix], s_int[encoder_ix], sb_int[encoder_ix]); y_ix = y_ix + 1; @@ -926,8 +1176,7 @@ struct BoothPassWorker { sb_int.append(module->addWire(NEW_ID_SUFFIX(stringf("sb_int_%d", encoder_ix)), 1)); SigBit one_o_int, two_o_int, s_o_int, sb_o_int; - BuildBur4e(enc_name, Y[y_ix], State::S0, - State::S0, one_o_int, two_o_int, s_o_int, sb_o_int); + BuildBur4e(enc_name, Y[y_ix], State::S0, State::S0, one_o_int, two_o_int, s_o_int, sb_o_int); module->connect(one_int[encoder_ix], one_o_int); module->connect(two_int[encoder_ix], two_o_int); @@ -973,9 +1222,8 @@ struct BoothPassWorker { cori_n_int[encoder_ix - 1] = module->addWire(NEW_ID_SUFFIX(stringf("cori_n_int_%d", encoder_ix)), 1); if (encoder_ix == 1) { - BuildBr4e(enc_name, State::S0, Y[0], Y[1], - negi_n_int[encoder_ix - 1], twoi_n_int[encoder_ix - 1], onei_n_int[encoder_ix - 1], - cori_n_int[encoder_ix - 1]); + BuildBr4e(enc_name, State::S0, Y[0], Y[1], negi_n_int[encoder_ix - 1], twoi_n_int[encoder_ix - 1], + onei_n_int[encoder_ix - 1], cori_n_int[encoder_ix - 1]); } else { SigBit y1, y2, y3; @@ -992,8 +1240,7 @@ struct BoothPassWorker { else y3 = Y[(encoder_ix - 1) * 2 + 1]; //+1 - BuildBr4e(enc_name, y1, y2, y3, - negi_n_int[encoder_ix - 1], twoi_n_int[encoder_ix - 1], onei_n_int[encoder_ix - 1], + BuildBr4e(enc_name, y1, y2, y3, negi_n_int[encoder_ix - 1], twoi_n_int[encoder_ix - 1], onei_n_int[encoder_ix - 1], cori_n_int[encoder_ix - 1]); } } @@ -1005,11 +1252,10 @@ struct BoothPassWorker { for (int encoder_ix = 1; encoder_ix <= (int)enc_count; encoder_ix++) { for (int decoder_ix = 1; decoder_ix <= dec_count; decoder_ix++) { PPij[((encoder_ix - 1) * dec_count) + decoder_ix - 1] = - module->addWire(NEW_ID_SUFFIX(stringf("ppij_%d_%d", encoder_ix, decoder_ix)), 1); + module->addWire(NEW_ID_SUFFIX(stringf("ppij_%d_%d", encoder_ix, decoder_ix)), 1); - nxj[((encoder_ix - 1) * dec_count) + decoder_ix - 1] = - module->addWire(NEW_ID_SUFFIX(stringf("nxj_%s%d_%d", decoder_ix == 1 ? "pre_dec_" : "", - encoder_ix, decoder_ix)), 1); + nxj[((encoder_ix - 1) * dec_count) + decoder_ix - 1] = module->addWire( + NEW_ID_SUFFIX(stringf("nxj_%s%d_%d", decoder_ix == 1 ? "pre_dec_" : "", encoder_ix, decoder_ix)), 1); } } @@ -1023,10 +1269,8 @@ struct BoothPassWorker { if (encoder_ix == 1) { // quadrant 1 optimization } else { - module->addNotGate(NEW_ID_SUFFIX(stringf("pre_dec_%d", encoder_ix)), - negi_n_int[encoder_ix - 1], - nxj[(encoder_ix - 1) * dec_count] - ); + module->addNotGate(NEW_ID_SUFFIX(stringf("pre_dec_%d", encoder_ix)), negi_n_int[encoder_ix - 1], + nxj[(encoder_ix - 1) * dec_count]); } for (int decoder_ix = 1; decoder_ix < dec_count; decoder_ix++) { @@ -1046,9 +1290,9 @@ struct BoothPassWorker { // applies to 9th decoder (xsz+1 decoder). std::string dec_name = stringf("dec_%d_%d", encoder_ix, x_sz + 1); SigBit unused_op; - BuildBr4d(dec_name, nxj[((encoder_ix - 1) * dec_count) + dec_count - 1], twoi_n_int[encoder_ix - 1], - X[dec_count - 2], negi_n_int[encoder_ix - 1], onei_n_int[encoder_ix - 1], - PPij[((encoder_ix - 1) * dec_count) + dec_count - 1], unused_op); + BuildBr4d(dec_name, nxj[((encoder_ix - 1) * dec_count) + dec_count - 1], twoi_n_int[encoder_ix - 1], X[dec_count - 2], + negi_n_int[encoder_ix - 1], onei_n_int[encoder_ix - 1], PPij[((encoder_ix - 1) * dec_count) + dec_count - 1], + unused_op); } // @@ -1060,8 +1304,7 @@ struct BoothPassWorker { BuildBoothQ1("icb_booth_q1_", negi_n_int[0], // negi cori_n_int[0], // cori - X[0], X[1], Y[0], Y[1], - nxj_o_int, q1_carry_out, pp0_o_int, pp1_o_int); + X[0], X[1], Y[0], Y[1], nxj_o_int, q1_carry_out, pp0_o_int, pp1_o_int); module->connect(Z[0], pp0_o_int); module->connect(Z[1], pp1_o_int); @@ -1085,29 +1328,27 @@ struct BoothPassWorker { SigBit d08_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv1"), PPij[(0 * dec_count) + dec_count - 1]); SigBit d18_inv = module->NotGate(NEW_ID_SUFFIX("bfa_0_exc_inv2"), PPij[(1 * dec_count) + dec_count - 1]); BuildBitwiseFa(module, NEW_ID_SUFFIX("fa_row_0").str(), - /* A */ {State::S0, d08_inv, PPij[(0 * dec_count) + x_sz], PPij.extract((0 * dec_count) + 2, x_sz - 1)}, - /* B */ {State::S1, d18_inv, PPij.extract((1 * dec_count), x_sz)}, - /* C */ fa_carry[0].extract(1, x_sz + 2), - /* X */ fa_carry[0].extract(2, x_sz + 2), - /* Y */ fa_sum[0].extract(2, x_sz + 2) - ); + /* A */ {State::S0, d08_inv, PPij[(0 * dec_count) + x_sz], PPij.extract((0 * dec_count) + 2, x_sz - 1)}, + /* B */ {State::S1, d18_inv, PPij.extract((1 * dec_count), x_sz)}, + /* C */ fa_carry[0].extract(1, x_sz + 2), + /* X */ fa_carry[0].extract(2, x_sz + 2), + /* Y */ fa_sum[0].extract(2, x_sz + 2)); module->connect(fa_carry[0][1], q1_carry_out); // step case: 2nd and rest of rows. (fa_row_ix == 1...n) // special because these are driven by a decoder and prior fa. for (fa_row_ix = 1; fa_row_ix < fa_row_count; fa_row_ix++) { // end two bits: sign extension - SigBit d_inv = module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_L", fa_row_ix)), - PPij[((fa_row_ix + 1) * dec_count) + dec_count - 1]); + SigBit d_inv = + module->NotGate(NEW_ID_SUFFIX(stringf("bfa_se_inv_%d_L", fa_row_ix)), PPij[((fa_row_ix + 1) * dec_count) + dec_count - 1]); BuildBitwiseFa(module, NEW_ID_SUFFIX(stringf("fa_row_%d", fa_row_ix)).str(), - /* A */ {State::S0, fa_carry[fa_row_ix - 1][fa_count - 1], fa_sum[fa_row_ix - 1].extract(2, x_sz + 2)}, - /* B */ {State::S1, d_inv, PPij.extract((fa_row_ix + 1) * dec_count, x_sz), State::S0, State::S0}, + /* A */ {State::S0, fa_carry[fa_row_ix - 1][fa_count - 1], fa_sum[fa_row_ix - 1].extract(2, x_sz + 2)}, + /* B */ {State::S1, d_inv, PPij.extract((fa_row_ix + 1) * dec_count, x_sz), State::S0, State::S0}, - /* C */ {fa_carry[fa_row_ix].extract(0, x_sz + 3), cori_n_int[fa_row_ix]}, - /* X */ fa_carry[fa_row_ix], - /* Y */ fa_sum[fa_row_ix] - ); + /* C */ {fa_carry[fa_row_ix].extract(0, x_sz + 3), cori_n_int[fa_row_ix]}, + /* X */ fa_carry[fa_row_ix], + /* Y */ fa_sum[fa_row_ix]); } // instantiate the cpa