src/soc/experiment/compalu_multi.py

   1 from nmigen.compat.sim import run_simulation
   2 from nmigen.cli import verilog, rtlil
   3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
   4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
   5
   6 from nmutil.latch import SRLatch, latchregister
   7 from nmutil.iocontrol import RecordObject
   8
   9 from soc.decoder.power_decoder2 import Data
  10 from soc.decoder.power_enums import InternalOp
  11 from soc.fu.regspec import RegSpec, RegSpecALUAPI
  12
  13
  14 """ Computation Unit (aka "ALU Manager").
  15
  16     This module runs a "revolving door" set of three latches, based on
  17     * Issue
  18     * Go_Read
  19     * Go_Write
  20     where one of them cannot be set on any given cycle.
  21
  22     * When issue is first raised, a busy signal is sent out.
  23       The src1 and src2 registers and the operand can be latched in
  24       at this point
  25
  26     * Read request is set, which is acknowledged through the Scoreboard
  27       to the priority picker, which generates (one and only one) Go_Read
  28       at a time.  One of those will (eventually) be this Computation Unit.
  29
  30     * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
  31       src1/src2/operand in place), and the ALU is told to proceed.
  32
  33     * when the ALU pipeline is ready, this activates "write request release",
  34       and the ALU's output is captured into a temporary register.
  35
  36     * Write request release is *HELD UP* (prevented from proceeding) if shadowN
  37       is asserted LOW.  This is how all speculation, precise exceptions,
  38       predication - everything - is achieved.
  39
  40     * Write request release will go through a similar process as Read request,
  41       resulting (eventually) in Go_Write being asserted.
  42
  43     * When Go_Write is asserted, two things happen: (1) the data in the temp
  44       register is placed combinatorially onto the output, and (2) the
  45       req_l latch is cleared, busy is dropped, and the Comp Unit is back
  46       through its revolving door to do another task.
  47
  48     Note that the read and write latches are held synchronously for one cycle,
  49     i.e. that when Go_Read comes in, one cycle is given in which the incoming
  50     register (broadcast over a Regfile Read Port) may have time to be latched.
  51
  52     It is REQUIRED that Go_Read be held valid only for one cycle, and it is
  53     REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
  54     Go_Read is asserted HI.
  55
  56     Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
  57     likewise be dropped exactly one cycle after assertion of Go_Write.
  58
  59     When Go_Die is asserted then strictly speaking the entire FSM should be
  60     fully reset and that includes sending a cancellation request to the ALU.
  61     (XXX TODO: alu "go die" is not presently wired up)
  62 """
  63
  64 def go_record(n, name):
  65     r = Record([('go', n, DIR_FANIN),
  66                 ('rel', n, DIR_FANOUT)], name=name)
  67     r.go.reset_less = True
  68     r.rel.reset_less = True
  69     return r
  70
  71 # see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
  72
  73 class CompUnitRecord(RegSpec, RecordObject):
  74     """CompUnitRecord
  75
  76     base class for Computation Units, to provide a uniform API
  77     and allow "record.connect" etc. to be used, particularly when
  78     it comes to connecting multiple Computation Units up as a block
  79     (very laborious)
  80
  81     LDSTCompUnitRecord should derive from this class and add the
  82     additional signals it requires
  83
  84     :subkls:      the class (not an instance) needed to construct the opcode
  85     :rwid:        either an integer (specifies width of all regs) or a "regspec"
  86
  87     see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
  88     """
  89     def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
  90         RegSpec.__init__(self, rwid, n_src, n_dst)
  91         RecordObject.__init__(self, name)
  92         self._subkls = subkls
  93
  94         src = []
  95         for i in range(n_src):
  96             j = i + 1 # name numbering to match src1/src2
  97             name = "src%d_i" % j
  98             rw = self._get_srcwid(i)
  99             sreg = Signal(rw, name=name, reset_less=True)
 100             setattr(self, name, sreg)
 101             src.append(sreg)
 102         self._src_i = src
 103
 104         dst = []
 105         for i in range(n_dst):
 106             j = i + 1 # name numbering to match dest1/2...
 107             name = "dest%d_i" % j
 108             rw = self._get_dstwid(i)
 109             dreg = Signal(rw, name=name, reset_less=True)
 110             setattr(self, name, dreg)
 111             dst.append(dreg)
 112         self._dest = dst
 113
 114         self.rd = go_record(n_src, name="rd") # read in, req out
 115         self.wr = go_record(n_dst, name="wr") # write in, req out
 116         self.issue_i = Signal(reset_less=True) # fn issue in
 117         self.shadown_i = Signal(reset=1) # shadow function, defaults to ON
 118         self.go_die_i = Signal() # go die (reset)
 119
 120         # operation / data input
 121         self.oper_i = subkls() # operand
 122
 123         # output (busy/done)
 124         self.busy_o = Signal(reset_less=True) # fn busy out
 125         self.done_o = Signal(reset_less=True)
 126
 127
 128 class MultiCompUnit(RegSpecALUAPI, Elaboratable):
 129     def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
 130         """MultiCompUnit
 131
 132         * :rwid:        width of register latches (TODO: allocate per regspec)
 133         * :alu:         the ALU (pipeline, FSM) - must conform to nmutil Pipe API
 134         * :opsubsetkls: the subset of Decode2ExecuteType
 135         * :n_src:       number of src operands
 136         * :n_dst:       number of destination operands
 137         """
 138         RegSpecALUAPI.__init__(self, rwid, alu)
 139         self.n_src, self.n_dst = n_src, n_dst
 140         self.opsubsetkls = opsubsetkls
 141         self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)
 142
 143         for i in range(n_src):
 144             j = i + 1 # name numbering to match src1/src2
 145             name = "src%d_i" % j
 146             setattr(self, name, getattr(cu, name))
 147
 148         for i in range(n_dst):
 149             j = i + 1 # name numbering to match dest1/2...
 150             name = "dest%d_i" % j
 151             setattr(self, name, getattr(cu, name))
 152
 153         # convenience names
 154         self.rd = cu.rd
 155         self.wr = cu.wr
 156         self.go_rd_i = self.rd.go # temporary naming
 157         self.go_wr_i = self.wr.go # temporary naming
 158         self.rd_rel_o = self.rd.rel # temporary naming
 159         self.req_rel_o = self.wr.rel # temporary naming
 160         self.issue_i = cu.issue_i
 161         self.shadown_i = cu.shadown_i
 162         self.go_die_i = cu.go_die_i
 163
 164         # operation / data input
 165         self.oper_i = cu.oper_i
 166         self.src_i = cu._src_i
 167
 168         self.busy_o = cu.busy_o
 169         self.dest = cu._dest
 170         self.data_o = self.dest[0] # Dest out
 171         self.done_o = cu.done_o
 172
 173     def elaborate(self, platform):
 174         m = Module()
 175         m.submodules.alu = self.alu
 176         m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
 177         m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
 178         m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
 179         m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
 180         m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")
 181
 182         # ALU only proceeds when all src are ready.  rd_rel_o is delayed
 183         # so combine it with go_rd_i.  if all bits are set we're good
 184         all_rd = Signal(reset_less=True)
 185         m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
 186                     (((~self.rd.rel) | self.rd.go).all()))
 187
 188         # write_requests all done
 189         # req_done works because any one of the last of the writes
 190         # is enough, when combined with when read-phase is done (rst_l.q)
 191         wr_any = Signal(reset_less=True)
 192         req_done = Signal(reset_less=True)
 193         m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
 194         m.d.comb += wr_any.eq(self.wr.go.bool())
 195         m.d.comb += req_done.eq(rst_l.q & wr_any)
 196
 197         # shadow/go_die
 198         reset = Signal(reset_less=True)
 199         rst_r = Signal(reset_less=True) # reset latch off
 200         reset_w = Signal(self.n_dst, reset_less=True)
 201         reset_r = Signal(self.n_src, reset_less=True)
 202         m.d.comb += reset.eq(req_done | self.go_die_i)
 203         m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
 204         m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
 205         m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))
 206
 207         # read-done,wr-proceed latch
 208         m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
 209         m.d.comb += rok_l.r.eq(self.alu.p.ready_o) # off when ALU acknowledges
 210
 211         # wr-done, back-to-start latch
 212         m.d.comb += rst_l.s.eq(all_rd)     # set when read-phase is fully done
 213         m.d.comb += rst_l.r.eq(rst_r)        # *off* on issue
 214
 215         # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
 216         m.d.sync += opc_l.s.eq(self.issue_i)       # set on issue
 217         m.d.sync += opc_l.r.eq(self.alu.n.valid_o & req_done) # reset on ALU
 218
 219         # src operand latch (not using go_wr_i)
 220         m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
 221         m.d.sync += src_l.r.eq(reset_r)
 222
 223         # dest operand latch (not using issue_i)
 224         m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
 225         m.d.sync += req_l.r.eq(reset_w)
 226
 227         # create a latch/register for the operand
 228         oper_r = self.opsubsetkls()
 229         latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")
 230
 231         # and for each output from the ALU
 232         drl = []
 233         for i in range(self.n_dst):
 234             name = "data_r%d" % i
 235             data_r = Signal(self.cu._get_srcwid(i), name=name, reset_less=True)
 236             latchregister(m, self.get_out(i), data_r, req_l.q[i], name)
 237             drl.append(data_r)
 238
 239         # pass the operation to the ALU
 240         m.d.comb += self.get_op().eq(oper_r)
 241
 242         # create list of src/alu-src/src-latch.  override 1st and 2nd one below.
 243         # in the case, for ALU and Logical pipelines, we assume RB is the 2nd operand
 244         # in the input "regspec".  see for example soc.fu.alu.pipe_data.ALUInputData
 245         # TODO: assume RA is the 1st operand, zero_a detection is needed.
 246         sl = []
 247         for i in range(self.n_src):
 248             sl.append([self.src_i[i], self.get_in(i), src_l.q[i]])
 249
 250         # if the operand subset has "zero_a" we implicitly assume that means
 251         # src_i[0] is an INT register type where zero can be multiplexed in, instead.
 252         # see https://bugs.libre-soc.org/show_bug.cgi?id=336
 253         #if hasattr(oper_r, "zero_a"):
 254             # select zero immediate if opcode says so.  however also change the latch
 255             # to trigger *from* the opcode latch instead.
 256             # ...
 257             # ...
 258
 259         # if the operand subset has "imm_data" we implicitly assume that means
 260         # "this is an INT ALU/Logical FU jobbie, RB is multiplexed with the immediate"
 261         if hasattr(oper_r, "imm_data"):
 262             # select immediate if opcode says so.  however also change the latch
 263             # to trigger *from* the opcode latch instead.
 264             op_is_imm = oper_r.imm_data.imm_ok
 265             src2_or_imm = Signal(self.cu._get_srcwid(1), reset_less=True)
 266             src_sel = Signal(reset_less=True)
 267             m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
 268             m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
 269                                                       self.src2_i))
 270             # overwrite 2nd src-latch with immediate-muxed stuff
 271             sl[1][0] = src2_or_imm
 272             sl[1][2] = src_sel
 273
 274         # create a latch/register for src1/src2 (even if it is a copy of an immediate)
 275         for i in range(self.n_src):
 276             src, alusrc, latch = sl[i]
 277             latchregister(m, src, alusrc, latch, name="src_r%d" % i)
 278
 279         # -----
 280         # outputs
 281         # -----
 282
 283         # all request signals gated by busy_o.  prevents picker problems
 284         m.d.comb += self.busy_o.eq(opc_l.q) # busy out
 285         bro = Repl(self.busy_o, self.n_src)
 286         m.d.comb += self.rd.rel.eq(src_l.q & bro) # src1/src2 req rel
 287
 288         # on a go_read, tell the ALU we're accepting data.
 289         # NOTE: this spells TROUBLE if the ALU isn't ready!
 290         # go_read is only valid for one clock!
 291         with m.If(all_rd):                           # src operands ready, GO!
 292             with m.If(~self.alu.p.ready_o):          # no ACK yet
 293                 m.d.comb += self.alu.p.valid_i.eq(1) # so indicate valid
 294
 295         brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
 296         # only proceed if ALU says its output is valid
 297         with m.If(self.alu.n.valid_o):
 298             # when ALU ready, write req release out. waits for shadow
 299             m.d.comb += self.wr.rel.eq(req_l.q & brd)
 300             # when output latch is ready, and ALU says ready, accept ALU output
 301             with m.If(reset):
 302                 m.d.comb += self.alu.n.ready_i.eq(1) # tells ALU "thanks got it"
 303
 304         # output the data from the latch on go_write
 305         for i in range(self.n_dst):
 306             with m.If(self.wr.go[i]):
 307                 m.d.comb += self.dest[i].eq(drl[i])
 308
 309         return m
 310
 311     def __iter__(self):
 312         yield self.rd.go
 313         yield self.wr.go
 314         yield self.issue_i
 315         yield self.shadown_i
 316         yield self.go_die_i
 317         yield from self.oper_i.ports()
 318         yield self.src1_i
 319         yield self.src2_i
 320         yield self.busy_o
 321         yield self.rd.rel
 322         yield self.wr.rel
 323         yield self.data_o
 324
 325     def ports(self):
 326         return list(self)
 327
 328
 329 def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
 330     yield dut.issue_i.eq(0)
 331     yield
 332     yield dut.src_i[0].eq(a)
 333     yield dut.src_i[1].eq(b)
 334     yield dut.oper_i.insn_type.eq(op)
 335     yield dut.oper_i.invert_a.eq(inv_a)
 336     yield dut.oper_i.imm_data.imm.eq(imm)
 337     yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
 338     yield dut.issue_i.eq(1)
 339     yield
 340     yield dut.issue_i.eq(0)
 341     yield
 342     yield dut.rd.go.eq(0b11)
 343     while True:
 344         yield
 345         rd_rel_o = yield dut.rd.rel
 346         print ("rd_rel", rd_rel_o)
 347         if rd_rel_o:
 348             break
 349     yield
 350     yield dut.rd.go.eq(0)
 351     req_rel_o = yield dut.wr.rel
 352     result = yield dut.data_o
 353     print ("req_rel", req_rel_o, result)
 354     while True:
 355         req_rel_o = yield dut.wr.rel
 356         result = yield dut.data_o
 357         print ("req_rel", req_rel_o, result)
 358         if req_rel_o:
 359             break
 360         yield
 361     yield dut.wr.go[0].eq(1)
 362     yield
 363     result = yield dut.data_o
 364     print ("result", result)
 365     yield dut.wr.go[0].eq(0)
 366     yield
 367     return result
 368
 369
 370 def scoreboard_sim(dut):
 371     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=0,
 372                                     imm=8, imm_ok=1)
 373     assert result == 13
 374
 375     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD)
 376     assert result == 7
 377
 378     result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, inv_a=1)
 379     assert result == 65532
 380
 381
 382 def test_compunit():
 383     from alu_hier import ALU
 384     from soc.fu.alu.alu_input_record import CompALUOpSubset
 385
 386     m = Module()
 387     alu = ALU(16)
 388     dut = MultiCompUnit(16, alu, CompALUOpSubset)
 389     m.submodules.cu = dut
 390
 391     vl = rtlil.convert(dut, ports=dut.ports())
 392     with open("test_compunit1.il", "w") as f:
 393         f.write(vl)
 394
 395     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 396
 397
 398 def test_compunit_regspec1():
 399     from alu_hier import ALU
 400     from soc.fu.alu.alu_input_record import CompALUOpSubset
 401
 402     inspec = [('INT', 'a', '0:15'),
 403               ('INT', 'b', '0:15')]
 404     outspec = [('INT', 'o', '0:15'),
 405               ]
 406
 407     regspec = (inspec, outspec)
 408
 409     m = Module()
 410     alu = ALU(16)
 411     dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
 412     m.submodules.cu = dut
 413
 414     vl = rtlil.convert(dut, ports=dut.ports())
 415     with open("test_compunit_regspec1.il", "w") as f:
 416         f.write(vl)
 417
 418     run_simulation(m, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
 419
 420
 421 if __name__ == '__main__':
 422     test_compunit()
 423     test_compunit_regspec1()