src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
This ALU is *deliberately* designed to insert (unnecessary) delays into
different operations so as to be able to test the 6600-style matrices
and the CompUnits.  Countdown timers wait for (defined) periods before
indicating that the output is valid.

A "real" integer ALU would place the answers onto the output bus after
only one cycle (sync).
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmigen.back.pysim import Simulator, Settle
  18
  19 from soc.decoder.power_enums import MicrOp, Function, CryIn
  20
  21 from soc.fu.alu.alu_input_record import CompALUOpSubset
  22 from soc.fu.cr.cr_input_record import CompCROpSubset
  23
  24 import operator
  25
  26
  27
  28
  29 class Adder(Elaboratable):
  30     def __init__(self, width):
  31         self.invert_a = Signal()
  32         self.a   = Signal(width)
  33         self.b   = Signal(width)
  34         self.o   = Signal(width, name="add_o")
  35
  36     def elaborate(self, platform):
  37         m = Module()
  38         with m.If(self.invert_a):
  39             m.d.comb += self.o.eq((~self.a) + self.b)
  40         with m.Else():
  41             m.d.comb += self.o.eq(self.a + self.b)
  42         return m
  43
  44
  45 class Subtractor(Elaboratable):
  46     def __init__(self, width):
  47         self.a   = Signal(width)
  48         self.b   = Signal(width)
  49         self.o   = Signal(width, name="sub_o")
  50
  51     def elaborate(self, platform):
  52         m = Module()
  53         m.d.comb += self.o.eq(self.a - self.b)
  54         return m
  55
  56
  57 class Multiplier(Elaboratable):
  58     def __init__(self, width):
  59         self.a   = Signal(width)
  60         self.b   = Signal(width)
  61         self.o   = Signal(width, name="mul_o")
  62
  63     def elaborate(self, platform):
  64         m = Module()
  65         m.d.comb += self.o.eq(self.a * self.b)
  66         return m
  67
  68
  69 class Shifter(Elaboratable):
  70     def __init__(self, width):
  71         self.width = width
  72         self.a   = Signal(width)
  73         self.b   = Signal(width)
  74         self.o   = Signal(width, name="shf_o")
  75
  76     def elaborate(self, platform):
  77         m = Module()
  78         btrunc = Signal(self.width)
  79         m.d.comb += btrunc.eq(self.b & Const((1<<self.width)-1))
  80         m.d.comb += self.o.eq(self.a >> btrunc)
  81         return m
  82
  83 class Dummy:
  84     pass
  85
  86
  87 class DummyALU(Elaboratable):
  88     def __init__(self, width):
  89         self.p = Dummy() # make look like nmutil pipeline API
  90         self.p.data_i = Dummy()
  91         self.p.data_i.ctx = Dummy()
  92         self.n = Dummy() # make look like nmutil pipeline API
  93         self.n.data_o = Dummy()
  94         self.p.valid_i = Signal()
  95         self.p.ready_o = Signal()
  96         self.n.ready_i = Signal()
  97         self.n.valid_o = Signal()
  98         self.counter   = Signal(4)
  99         self.op  = CompCROpSubset()
 100         i = []
 101         i.append(Signal(width, name="i1"))
 102         i.append(Signal(width, name="i2"))
 103         i.append(Signal(width, name="i3"))
 104         self.i = Array(i)
 105         self.a, self.b, self.c = i[0], i[1], i[2]
 106         self.out = Array([Signal(width, name="alu_o")])
 107         self.o = self.out[0]
 108         self.width = width
 109         # more "look like nmutil pipeline API"
 110         self.p.data_i.ctx.op = self.op
 111         self.p.data_i.a = self.a
 112         self.p.data_i.b = self.b
 113         self.p.data_i.c = self.c
 114         self.n.data_o.o = self.o
 115
 116     def elaborate(self, platform):
 117         m = Module()
 118
 119         go_now = Signal(reset_less=True) # testing no-delay ALU
 120
 121         with m.If(self.p.valid_i):
 122             # input is valid. next check, if we already said "ready" or not
 123             with m.If(~self.p.ready_o):
 124                 # we didn't say "ready" yet, so say so and initialise
 125                 m.d.sync += self.p.ready_o.eq(1)
 126
 127                 m.d.sync += self.o.eq(self.a)
 128                 m.d.comb += go_now.eq(1)
 129                 m.d.sync += self.counter.eq(1)
 130
 131         with m.Else():
 132             # input says no longer valid, so drop ready as well.
 133             # a "proper" ALU would have had to sync in the opcode and a/b ops
 134             m.d.sync += self.p.ready_o.eq(0)
 135
 136         # ok so the counter's running: when it gets to 1, fire the output
 137         with m.If((self.counter == 1) | go_now):
 138             # set the output as valid if the recipient is ready for it
 139             m.d.sync += self.n.valid_o.eq(1)
 140         with m.If(self.n.ready_i & self.n.valid_o):
 141             m.d.sync += self.n.valid_o.eq(0)
 142             # recipient said it was ready: reset back to known-good.
 143             m.d.sync += self.counter.eq(0) # reset the counter
 144             m.d.sync += self.o.eq(0) # clear the output for tidiness sake
 145
 146         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 147         with m.If(self.counter > 1):
 148             m.d.sync += self.counter.eq(self.counter - 1)
 149
 150         return m
 151
 152     def __iter__(self):
 153         yield from self.op.ports()
 154         yield self.a
 155         yield self.b
 156         yield self.c
 157         yield self.o
 158
 159     def ports(self):
 160         return list(self)
 161
 162
 163 class ALU(Elaboratable):
 164     def __init__(self, width):
 165         self.p = Dummy() # make look like nmutil pipeline API
 166         self.p.data_i = Dummy()
 167         self.p.data_i.ctx = Dummy()
 168         self.n = Dummy() # make look like nmutil pipeline API
 169         self.n.data_o = Dummy()
 170         self.p.valid_i = Signal()
 171         self.p.ready_o = Signal()
 172         self.n.ready_i = Signal()
 173         self.n.valid_o = Signal()
 174         self.counter   = Signal(4)
 175         self.op = CompALUOpSubset(name="op")
 176         i = []
 177         i.append(Signal(width, name="i1"))
 178         i.append(Signal(width, name="i2"))
 179         self.i = Array(i)
 180         self.a, self.b = i[0], i[1]
 181         self.out = Array([Signal(width, name="alu_o")])
 182         self.o = self.out[0]
 183         self.width = width
 184         # more "look like nmutil pipeline API"
 185         self.p.data_i.ctx.op = self.op
 186         self.p.data_i.a = self.a
 187         self.p.data_i.b = self.b
 188         self.n.data_o.o = self.o
 189
 190     def elaborate(self, platform):
 191         m = Module()
 192         add = Adder(self.width)
 193         mul = Multiplier(self.width)
 194         shf = Shifter(self.width)
 195         sub = Subtractor(self.width)
 196
 197         m.submodules.add = add
 198         m.submodules.mul = mul
 199         m.submodules.shf = shf
 200         m.submodules.sub = sub
 201
 202         # really should not activate absolutely all ALU inputs like this
 203         for mod in [add, mul, shf, sub]:
 204             m.d.comb += [
 205                 mod.a.eq(self.a),
 206                 mod.b.eq(self.b),
 207             ]
 208
 209         # pass invert (and carry later)
 210         m.d.comb += add.invert_a.eq(self.op.invert_a)
 211
 212         go_now = Signal(reset_less=True) # testing no-delay ALU
 213
 214         # ALU sequencer is idle when the count is zero
 215         alu_idle = Signal(reset_less=True)
 216         m.d.comb += alu_idle.eq(self.counter == 0)
 217
 218         # ALU sequencer is done when the count is one
 219         alu_done = Signal(reset_less=True)
 220         m.d.comb += alu_done.eq(self.counter == 1)
 221
 222         # select handshake handling according to ALU type
 223         with m.If(go_now):
 224             # with a combinatorial, no-delay ALU, just pass through
 225             # the handshake signals to the other side
 226             m.d.comb += self.p.ready_o.eq(self.n.ready_i)
 227             m.d.comb += self.n.valid_o.eq(self.p.valid_i)
 228         with m.Else():
 229             # sequential ALU handshake:
 230             # ready_o responds to valid_i, but only if the ALU is idle
 231             m.d.comb += self.p.ready_o.eq(alu_idle)
 232             # select the internally generated valid_o, above
 233             m.d.comb += self.n.valid_o.eq(alu_done)
 234
 235         # hold the ALU result until ready_o is asserted
 236         alu_r = Signal(self.width)
 237
 238         with m.If(alu_idle):
 239             with m.If(self.p.valid_i):
 240
 241                 # as this is a "fake" pipeline, just grab the output right now
 242                 with m.If(self.op.insn_type == MicrOp.OP_ADD):
 243                     m.d.sync += alu_r.eq(add.o)
 244                 with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
 245                     m.d.sync += alu_r.eq(mul.o)
 246                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 247                     m.d.sync += alu_r.eq(shf.o)
 248                 # SUB is zero-delay, no need to register
 249
 250                 # NOTE: all of these are fake, just something to test
 251
 252                 # MUL, to take 5 instructions
 253                 with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
 254                     m.d.sync += self.counter.eq(5)
 255                 # SHIFT to take 1, straight away
 256                 with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
 257                     m.d.sync += self.counter.eq(1)
 258                 # ADD/SUB to take 3
 259                 with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
 260                     m.d.sync += self.counter.eq(3)
 261                 # others to take no delay
 262                 with m.Else():
 263                     m.d.comb += go_now.eq(1)
 264
 265         with m.Elif(~alu_done | self.n.ready_i):
 266             # decrement the counter while the ALU is neither idle nor finished
 267             m.d.sync += self.counter.eq(self.counter - 1)
 268
 269         # choose between zero-delay output, or registered
 270         with m.If(go_now):
 271             m.d.comb += self.o.eq(sub.o)
 272         # only present the result at the last computation cycle
 273         with m.Elif(alu_done):
 274             m.d.comb += self.o.eq(alu_r)
 275
 276         return m
 277
 278     def __iter__(self):
 279         yield from self.op.ports()
 280         yield self.a
 281         yield self.b
 282         yield self.o
 283         yield self.p.valid_i
 284         yield self.p.ready_o
 285         yield self.n.valid_o
 286         yield self.n.ready_i
 287
 288     def ports(self):
 289         return list(self)
 290
 291
 292 class BranchOp(Elaboratable):
 293     def __init__(self, width, op):
 294         self.a   = Signal(width)
 295         self.b   = Signal(width)
 296         self.o   = Signal(width)
 297         self.op = op
 298
 299     def elaborate(self, platform):
 300         m = Module()
 301         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 302         return m
 303
 304
 305 class BranchALU(Elaboratable):
 306     def __init__(self, width):
 307         self.p = Dummy() # make look like nmutil pipeline API
 308         self.p.data_i = Dummy()
 309         self.p.data_i.ctx = Dummy()
 310         self.n = Dummy() # make look like nmutil pipeline API
 311         self.n.data_o = Dummy()
 312         self.p.valid_i = Signal()
 313         self.p.ready_o = Signal()
 314         self.n.ready_i = Signal()
 315         self.n.valid_o = Signal()
 316         self.counter   = Signal(4)
 317         self.op  = Signal(2)
 318         i = []
 319         i.append(Signal(width, name="i1"))
 320         i.append(Signal(width, name="i2"))
 321         self.i = Array(i)
 322         self.a, self.b = i[0], i[1]
 323         self.out = Array([Signal(width)])
 324         self.o = self.out[0]
 325         self.width = width
 326
 327     def elaborate(self, platform):
 328         m = Module()
 329         bgt = BranchOp(self.width, operator.gt)
 330         blt = BranchOp(self.width, operator.lt)
 331         beq = BranchOp(self.width, operator.eq)
 332         bne = BranchOp(self.width, operator.ne)
 333
 334         m.submodules.bgt = bgt
 335         m.submodules.blt = blt
 336         m.submodules.beq = beq
 337         m.submodules.bne = bne
 338         for mod in [bgt, blt, beq, bne]:
 339             m.d.comb += [
 340                 mod.a.eq(self.a),
 341                 mod.b.eq(self.b),
 342             ]
 343
 344         go_now = Signal(reset_less=True) # testing no-delay ALU
 345         with m.If(self.p.valid_i):
 346             # input is valid. next check, if we already said "ready" or not
 347             with m.If(~self.p.ready_o):
 348                 # we didn't say "ready" yet, so say so and initialise
 349                 m.d.sync += self.p.ready_o.eq(1)
 350
 351                 # as this is a "fake" pipeline, just grab the output right now
 352                 with m.Switch(self.op):
 353                     for i, mod in enumerate([bgt, blt, beq, bne]):
 354                         with m.Case(i):
 355                             m.d.sync += self.o.eq(mod.o)
 356                 m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
 357                 #m.d.comb += go_now.eq(1)
 358         with m.Else():
 359             # input says no longer valid, so drop ready as well.
 360             # a "proper" ALU would have had to sync in the opcode and a/b ops
 361             m.d.sync += self.p.ready_o.eq(0)
 362
 363         # ok so the counter's running: when it gets to 1, fire the output
 364         with m.If((self.counter == 1) | go_now):
 365             # set the output as valid if the recipient is ready for it
 366             m.d.sync += self.n.valid_o.eq(1)
 367         with m.If(self.n.ready_i & self.n.valid_o):
 368             m.d.sync += self.n.valid_o.eq(0)
 369             # recipient said it was ready: reset back to known-good.
 370             m.d.sync += self.counter.eq(0) # reset the counter
 371             m.d.sync += self.o.eq(0) # clear the output for tidiness sake
 372
 373         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 374         with m.If(self.counter > 1):
 375             m.d.sync += self.counter.eq(self.counter - 1)
 376
 377         return m
 378
 379     def __iter__(self):
 380         yield self.op
 381         yield self.a
 382         yield self.b
 383         yield self.o
 384
 385     def ports(self):
 386         return list(self)
 387
 388 def run_op(dut, a, b, op, inv_a=0):
 389     yield dut.a.eq(a)
 390     yield dut.b.eq(b)
 391     yield dut.op.insn_type.eq(op)
 392     yield dut.op.invert_a.eq(inv_a)
 393     yield dut.n.ready_i.eq(0)
 394     yield dut.p.valid_i.eq(1)
 395     yield dut.n.ready_i.eq(1)
 396     yield
 397
 398     # wait for the ALU to accept our input data
 399     while not (yield dut.p.ready_o):
 400         yield
 401
 402     yield dut.p.valid_i.eq(0)
 403     yield dut.a.eq(0)
 404     yield dut.b.eq(0)
 405     yield dut.op.insn_type.eq(0)
 406     yield dut.op.invert_a.eq(0)
 407
 408     # wait for the ALU to present the output data
 409     while not (yield dut.n.valid_o):
 410         yield
 411
 412     # latch the result and lower read_i
 413     result = yield dut.o
 414     yield dut.n.ready_i.eq(0)
 415
 416     return result
 417
 418
 419 def alu_sim(dut):
 420     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 421     print ("alu_sim add", result)
 422     assert (result == 8)
 423
 424     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 425     print ("alu_sim mul", result)
 426     assert (result == 6)
 427
 428     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 429     print ("alu_sim add-inv", result)
 430     assert (result == 65533)
 431
 432     # test zero-delay ALU
 433     # don't have OP_SUB, so use any other
 434     result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
 435     print ("alu_sim sub", result)
 436     assert (result == 2)
 437
 438     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 439     print ("alu_sim shr", result)
 440     assert (result == 3)
 441
 442
 443 def test_alu():
 444     alu = ALU(width=16)
 445     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 446
 447     vl = rtlil.convert(alu, ports=alu.ports())
 448     with open("test_alu.il", "w") as f:
 449         f.write(vl)
 450
 451
 452 def test_alu_parallel():
 453     # Compare with the sequential test implementation, above.
 454     m = Module()
 455     m.submodules.alu = dut = ALU(width=16)
 456     sim = Simulator(m)
 457     sim.add_clock(1e-6)
 458
 459     def send(a, b, op, inv_a=0):
 460         # present input data and assert valid_i
 461         yield dut.a.eq(a)
 462         yield dut.b.eq(b)
 463         yield dut.op.insn_type.eq(op)
 464         yield dut.op.invert_a.eq(inv_a)
 465         yield dut.p.valid_i.eq(1)
 466         yield
 467         # wait for ready_o to be asserted
 468         while not (yield dut.p.ready_o):
 469             yield
 470         # clear input data and negate valid_i
 471         # if send is called again immediately afterwards, there will be no
 472         # visible transition (they will not be negated, after all)
 473         yield dut.p.valid_i.eq(0)
 474         yield dut.a.eq(0)
 475         yield dut.b.eq(0)
 476         yield dut.op.insn_type.eq(0)
 477         yield dut.op.invert_a.eq(0)
 478
 479     def receive():
 480         # signal readiness to receive data
 481         yield dut.n.ready_i.eq(1)
 482         yield
 483         # wait for valid_o to be asserted
 484         while not (yield dut.n.valid_o):
 485             yield
 486         # read result
 487         result = yield dut.o
 488         # negate ready_i
 489         # if receive is called again immediately afterwards, there will be no
 490         # visible transition (it will not be negated, after all)
 491         yield dut.n.ready_i.eq(0)
 492         return result
 493
 494     def producer():
 495         # send a few test cases, interspersed with wait states
 496         # note that, for this test, we do not wait for the result to be ready,
 497         # before presenting the next input
 498         # 5 + 3
 499         yield from send(5, 3, MicrOp.OP_ADD)
 500         yield
 501         yield
 502         # 2 * 3
 503         yield from send(2, 3, MicrOp.OP_MUL_L64)
 504         # (-5) + 3
 505         yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
 506         yield
 507         # 5 - 3
 508         # note that this is a zero-delay operation
 509         yield from send(5, 3, MicrOp.OP_NOP)
 510         yield
 511         yield
 512         # 13 >> 2
 513         yield from send(13, 2, MicrOp.OP_SHR)
 514
 515     def consumer():
 516         # receive and check results, interspersed with wait states
 517         # the consumer is not in step with the producer, but the
 518         # order of the results are preserved
 519         yield
 520         # 5 + 3 = 8
 521         result = yield from receive()
 522         assert (result == 8)
 523         # 2 * 3 = 6
 524         result = yield from receive()
 525         assert (result == 6)
 526         yield
 527         yield
 528         # (-5) + 3 = -2
 529         result = yield from receive()
 530         assert (result == 65533)  # unsigned equivalent to -2
 531         # 5 - 3 = 2
 532         # note that this is a zero-delay operation
 533         # this, and the previous result, will be received back-to-back
 534         # (check the output waveform to see this)
 535         result = yield from receive()
 536         assert (result == 2)
 537         yield
 538         yield
 539         # 13 >> 2 = 3
 540         result = yield from receive()
 541         assert (result == 3)
 542
 543     sim.add_sync_process(producer)
 544     sim.add_sync_process(consumer)
 545     sim_writer = sim.write_vcd(
 546         "test_alu_parallel.vcd",
 547         "test_alu_parallel.gtkw",
 548         traces=dut.ports()
 549     )
 550     with sim_writer:
 551         sim.run()
 552
 553
 554 if __name__ == "__main__":
 555     test_alu()
 556     test_alu_parallel()
 557
 558     # alu = BranchALU(width=16)
 559     # vl = rtlil.convert(alu, ports=alu.ports())
 560     # with open("test_branch_alu.il", "w") as f:
 561     #     f.write(vl)
 562