From 8c0a848c7b6cb23428d4090ce2751130c8a49b04 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Fri, 16 Aug 2024 19:05:33 +0800 Subject: [PATCH] [rocket&t1] connect vxsat. --- rocketv/src/csr/V.scala | 2 +- t1/src/Bundles.scala | 3 +++ t1/src/Lane.scala | 16 ++++++++++++++++ t1/src/OtherUnit.scala | 2 +- t1/src/T1.scala | 11 ++++++++++- t1/src/laneStage/Distributor.scala | 6 ++++++ t1/src/laneStage/LaneExecutionBridge.scala | 7 +++++++ 7 files changed, 44 insertions(+), 3 deletions(-) diff --git a/rocketv/src/csr/V.scala b/rocketv/src/csr/V.scala index 944448f13..a654ba4a8 100644 --- a/rocketv/src/csr/V.scala +++ b/rocketv/src/csr/V.scala @@ -50,7 +50,7 @@ class V(vlen: Int, hypervisor: Boolean) { case "vlenb" => UInt(vlenbWidth.W) case "vstart" => UInt(vlWidth.W) case "vxrm" => UInt(2.W) - case "vxsat" => UInt(2.W) + case "vxsat" => Bool() } // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#311-state-of-vector-extension-at-reset def reset(content: String): Option[UInt] = content match { diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index bb8a36f41..e873f946b 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -104,6 +104,8 @@ class InstructionControl(instIndexWidth: Int, laneSize: Int) extends Bundle { * TODO: move to `state`. */ val endTag: Vec[Bool] = Vec(laneSize + 1, Bool()) + + val vxsat: Bool = Bool() } class ExtendInstructionType extends Bundle { @@ -628,6 +630,7 @@ class ExecutionUnitRecord(parameter: LaneParameter)(isLastSlot: Boolean) extends val laneIndex: UInt = UInt(parameter.laneNumberBits.W) // pipe state val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) } class SlotRequestToVFU(parameter: LaneParameter) extends Bundle { diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index 59a7eb8a6..6b7dec0c2 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -267,6 +267,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ /** for each instruction in the slot, response to top when instruction is finished in this lane. */ @public val instructionFinished: UInt = IO(Output(UInt(parameter.chainingSize.W))) + @public + val vxsatReport: UInt = IO(Output(UInt(parameter.chainingSize.W))) /** V0 update in the lane should also update [[T1.v0]] */ @public @@ -496,6 +498,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ val instructionValid: UInt = Wire(UInt(parameter.chainingSize.W)) val instructionValidNext: UInt = RegNext(instructionValid, 0.U) + val vxsatResult: UInt = RegInit(0.U(parameter.chainingSize.W)) + vxsatReport := vxsatResult + + // Overflow occurs + val vxsatEnq: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W))) + // vxsatEnq and instructionFinished cannot happen at the same time + vxsatResult := (vxsatEnq.reduce(_ | _) | vxsatResult) & (~instructionFinished).asUInt + /** assert when a instruction will not use mask unit */ val instructionUnrelatedMaskUnitVec: Vec[UInt] = Wire(Vec(parameter.chainingSize, UInt(parameter.chainingSize.W))) @@ -744,6 +754,12 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ executionUnit.vfuRequest.ready := executeEnqueueFire(index) executionUnit.dataResponse := responseVec(index) + vxsatEnq(index) := Mux( + executionUnit.dataResponse.valid && + (executionUnit.dataResponse.bits.clipFail ## executionUnit.dataResponse.bits.vxsat).orR, + UIntToOH(executionUnit.responseIndex(parameter.instructionIndexBits - 2, 0)), + 0.U(parameter.chainingSize.W) + ) when(executionUnit.dequeue.valid)(assert(stage2.dequeue.valid)) stage3.enqueue.valid := executionUnit.dequeue.valid executionUnit.dequeue.ready := stage3.enqueue.ready diff --git a/t1/src/OtherUnit.scala b/t1/src/OtherUnit.scala index 0fa8712bd..66c83c1eb 100644 --- a/t1/src/OtherUnit.scala +++ b/t1/src/OtherUnit.scala @@ -130,5 +130,5 @@ class OtherUnit(val parameter: OtherUnitParam) extends VFUModule(parameter) with ) response.data := result response.ffoSuccess := ffo.resp.valid && isffo - response.clipFail := DontCare + response.clipFail := roundResultOverlap || differentSign } diff --git a/t1/src/T1.scala b/t1/src/T1.scala index 5a36bb040..afc7ebb68 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -496,6 +496,9 @@ class T1(val parameter: T1Parameter) */ val instructionFinished: Vec[Vec[Bool]] = Wire(Vec(parameter.laneNumber, Vec(parameter.chainingSize, Bool()))) + val vxsatReportVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) + val vxsatReport = vxsatReportVec.reduce(_ | _) + // todo: 把lsu也放decode里去 val maskUnitType: Bool = decodeResult(Decoder.maskUnit) && requestRegDequeue.bits.instruction(6) val maskDestination = decodeResult(Decoder.maskDestination) @@ -627,6 +630,7 @@ class T1(val parameter: T1Parameter) * this signal is used to update the `control.endTag`. */ val lsuFinished: Bool = ohCheck(lsu.lastReport, control.record.instructionIndex, parameter.chainingSize) + val vxsatUpdate = ohCheck(vxsatReport, control.record.instructionIndex, parameter.chainingSize) val dataInWritePipeCheck = ohCheck(dataInWritePipe, control.record.instructionIndex, parameter.chainingSize) // instruction is allocated to this slot. @@ -641,6 +645,7 @@ class T1(val parameter: T1Parameter) control.state.wLast := false.B control.state.sCommit := false.B control.state.wVRFWrite := !requestReg.bits.decodeResult(Decoder.maskUnit) + control.vxsat := false.B // two different initial states for endTag: // for load/store instruction, use the last bit to indicate whether it is the last instruction // for other instructions, use MSB to indicate whether it is the last instruction @@ -668,6 +673,9 @@ class T1(val parameter: T1Parameter) control.endTag.zip(instructionFinished.map(_(index)) :+ lsuFinished).foreach { case (d, c) => d := d || c } + when(vxsatUpdate) { + control.vxsat := true.B + } } // logic like mask&reduce will be put to last slot // TODO: review later @@ -1546,6 +1554,7 @@ class T1(val parameter: T1Parameter) instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => d := (UIntToOH(f(parameter.instructionIndexBits - 2, 0)) & lane.instructionFinished).orR } + vxsatReportVec(index) := lane.vxsatReport val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) val v0SelectBySew = Mux1H(UIntToOH(lane.maskSelectSew)(2, 0), v0ForThisLane) lane.maskInput := cutUInt(v0SelectBySew, parameter.datapathWidth)(lane.maskSelect) @@ -1685,7 +1694,7 @@ class T1(val parameter: T1Parameter) retire := slotCommit.asUInt.orR io.retire.rd.bits.rdData := Mux(ffoType, ffoIndexReg.bits, dataResult.bits) // TODO: csr retire. - io.retire.csr.bits.vxsat := DontCare + io.retire.csr.bits.vxsat := (slotCommit.asUInt & VecInit(slots.map(_.vxsat)).asUInt).orR io.retire.csr.bits.fflag := DontCare io.retire.csr.valid := false.B io.retire.mem.valid := (slotCommit.asUInt & VecInit(slots.map(_.record.isLoadStore)).asUInt).orR diff --git a/t1/src/laneStage/Distributor.scala b/t1/src/laneStage/Distributor.scala index befadb7ff..31e905a8d 100644 --- a/t1/src/laneStage/Distributor.scala +++ b/t1/src/laneStage/Distributor.scala @@ -28,6 +28,7 @@ class Distributor[T <: SlotRequestToVFU, B <: VFUResponseToSlot](enqueue: T, deq val requestReg: ValidIO[SlotRequestToVFU] = RegInit(0.U.asTypeOf(Valid(enqueue))) val sendRequestValid: Bool = RegInit(false.B) val ffoSuccess: Bool = RegInit(false.B) + val vxsatResult = RegInit(false.B) val responseData: UInt = RegInit(0.U(enqueue.src.head.getWidth.W)) val executeIndex = RegInit(0.U(2.W)) @@ -163,6 +164,10 @@ class Distributor[T <: SlotRequestToVFU, B <: VFUResponseToSlot](enqueue: T, deq when(responseFromVfu.fire || requestFromSlot.fire) { ffoSuccess := updateFFO && !requestFromSlot.fire } + val updateVxsat = (responseFromVfu.bits.vxsat ## responseFromVfu.bits.clipFail).orR || vxsatResult + when(responseFromVfu.fire || requestFromSlot.fire) { + vxsatResult := updateVxsat && !requestFromSlot.fire + } requestFromSlot.ready := !requestReg.valid || isLastResponse @@ -170,6 +175,7 @@ class Distributor[T <: SlotRequestToVFU, B <: VFUResponseToSlot](enqueue: T, deq responseWire.bits.data := resultUpdate responseWire.bits.ffoSuccess := updateFFO responseWire.bits.tag := requestReg.bits.tag + responseWire.bits.vxsat := updateVxsat val pipeResponse: ValidIO[VFUResponseToSlot] = RegNext(responseWire, 0.U.asTypeOf(responseToSlot)) responseToSlot <> pipeResponse diff --git a/t1/src/laneStage/LaneExecutionBridge.scala b/t1/src/laneStage/LaneExecutionBridge.scala index 3a58046f3..cf3cf1c9d 100644 --- a/t1/src/laneStage/LaneExecutionBridge.scala +++ b/t1/src/laneStage/LaneExecutionBridge.scala @@ -25,6 +25,7 @@ class LaneExecuteRequest(parameter: LaneParameter, isLastSlot: Boolean) extends val maskType: Bool = Bool() // Newly added in LaneExecutionBridge val laneIndex: UInt = UInt(parameter.laneNumberBits.W) + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) } class LaneExecuteResponse(parameter: LaneParameter, isLastSlot: Boolean) extends Bundle { @@ -45,6 +46,7 @@ class ExecutionBridgeRecordQueue(parameter: LaneParameter, isLastSlot: Boolean) // pipe state val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) val vSew1H: UInt = UInt(3.W) + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) } @instantiable @@ -71,6 +73,8 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd val executeDecode: DecodeBundle = IO(Output(Decoder.bundle(parameter.decoderParam))) @public val responseDecode: DecodeBundle = IO(Output(Decoder.bundle(parameter.decoderParam))) + @public + val responseIndex: UInt = IO(Output(UInt(parameter.instructionIndexBits.W))) val executionRecord: ExecutionUnitRecord = RegInit(0.U.asTypeOf(new ExecutionUnitRecord(parameter)(isLastSlot))) val executionRecordValid = RegInit(false.B) @@ -152,6 +156,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd executionRecord.csr := enqueue.bits.csr executionRecord.maskType := enqueue.bits.maskType executionRecord.laneIndex := enqueue.bits.laneIndex + executionRecord.instructionIndex := enqueue.bits.instructionIndex } /** collapse the dual SEW size operand for cross read. @@ -327,6 +332,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd } recordQueue.io.enq.bits.decodeResult := executionRecord.decodeResult recordQueue.io.enq.bits.vSew1H := executionRecord.vSew1H + recordQueue.io.enq.bits.instructionIndex := executionRecord.instructionIndex //--- vfu <-> write queue start --- /** same as [[doubleExecutionInRecord]] @@ -540,6 +546,7 @@ class LaneExecutionBridge(parameter: LaneParameter, isLastSlot: Boolean, slotInd queue.io.enq.bits.fpReduceValid.foreach(_ := !waitFirstValidFire.get) recordQueue.io.deq.ready := dataResponse.valid || (recordNotExecute && queue.io.enq.ready) responseDecode := recordQueue.io.deq.bits.decodeResult + responseIndex := recordQueue.io.deq.bits.instructionIndex queue.io.enq.valid := (recordQueue.io.deq.valid && ((dataResponse.valid && reduceReady &&