src/soc/experiment/alu_hier.py

   1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
   2
This ALU is *deliberately* designed to add (unnecessary) delays to
different operations, so as to be able to test the 6600-style matrices
and the CompUnits.  Countdown timers wait for (defined) periods before
indicating that the output is valid.
   7
   8 A "real" integer ALU would place the answers onto the output bus after
   9 only one cycle (sync)
  10 """
  11
  12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
  13 from nmigen.hdl.rec import Record, Layout
  14 from nmigen.cli import main
  15 from nmigen.cli import verilog, rtlil
  16 from nmigen.compat.sim import run_simulation
  17 from nmigen.back.pysim import Simulator, Settle
  18
  19 from soc.decoder.power_enums import MicrOp, Function, CryIn
  20
  21 from soc.fu.alu.alu_input_record import CompALUOpSubset
  22 from soc.fu.cr.cr_input_record import CompCROpSubset
  23
  24 import operator
  25
  26
  27 class Adder(Elaboratable):
  28     def __init__(self, width):
  29         self.invert_a = Signal()
  30         self.a = Signal(width)
  31         self.b = Signal(width)
  32         self.o = Signal(width, name="add_o")
  33
  34     def elaborate(self, platform):
  35         m = Module()
  36         with m.If(self.invert_a):
  37             m.d.comb += self.o.eq((~self.a) + self.b)
  38         with m.Else():
  39             m.d.comb += self.o.eq(self.a + self.b)
  40         return m
  41
  42
  43 class Subtractor(Elaboratable):
  44     def __init__(self, width):
  45         self.a = Signal(width)
  46         self.b = Signal(width)
  47         self.o = Signal(width, name="sub_o")
  48
  49     def elaborate(self, platform):
  50         m = Module()
  51         m.d.comb += self.o.eq(self.a - self.b)
  52         return m
  53
  54
  55 class Multiplier(Elaboratable):
  56     def __init__(self, width):
  57         self.a = Signal(width)
  58         self.b = Signal(width)
  59         self.o = Signal(width, name="mul_o")
  60
  61     def elaborate(self, platform):
  62         m = Module()
  63         m.d.comb += self.o.eq(self.a * self.b)
  64         return m
  65
  66
  67 class Shifter(Elaboratable):
  68     def __init__(self, width):
  69         self.width = width
  70         self.a = Signal(width)
  71         self.b = Signal(width)
  72         self.o = Signal(width, name="shf_o")
  73
  74     def elaborate(self, platform):
  75         m = Module()
  76         btrunc = Signal(self.width)
  77         m.d.comb += btrunc.eq(self.b & Const((1 << self.width)-1))
  78         m.d.comb += self.o.eq(self.a >> btrunc)
  79         return m
  80
  81
  82 class Dummy:
  83     pass
  84
  85
class DummyALU(Elaboratable):
    """Pass-through "ALU" with a registered valid/ready handshake.

    When valid_i is seen while ready_o is still low, operand ``a`` is
    registered into ``o``, ready_o is raised and valid_o is scheduled.
    valid_o then stays high until the recipient asserts ready_i, at which
    point valid_o, the counter and ``o`` are cleared again.

    Operands ``b`` and ``c`` and the ``op`` record are only exposed as
    ports: elaborate() does not read them.
    """

    def __init__(self, width):
        """Create the handshake signals, three ``width``-bit inputs
        (i1, i2, i3) and one ``width``-bit output (alu_o).
        """
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # "previous stage" handshake: valid_i in, ready_o out
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        # "next stage" handshake: ready_i in, valid_o out
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        # a, b, c are aliases of the three entries of self.i
        self.a, self.b, self.c = i[0], i[1], i[2]
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the handshake logic; see the class docstring."""
        m = Module()

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # the "result" is simply a copy of operand a
                m.d.sync += self.o.eq(self.a)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # mark the output as valid (held until acknowledged, below)
            m.d.sync += self.n.valid_o.eq(1)
        # these assignments come later, so they take priority over the
        # ones above whenever both conditions hold in the same cycle
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the op record's ports, then a, b, c and o."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return everything __iter__ yields, as a list."""
        return list(self)
 160
 161
class ALU(Elaboratable):
    """Test ALU with deliberately different latencies per operation.

    The operation is chosen by ``op.insn_type`` when valid_i is seen
    while the sequencer is idle (counter == 0):

    * MicrOp.OP_ADD: result of the Adder, counter starts at 3
    * MicrOp.OP_MUL_L64: result of the Multiplier, counter starts at 5
    * MicrOp.OP_SHR: result of the Shifter, counter starts at 1
    * anything else: zero-delay; the Subtractor output is passed
      combinatorially to ``o`` and the handshake signals are passed
      straight through

    For the delayed operations the result is registered on acceptance and
    presented on ``o`` (with valid_o) once the counter has reached 1.
    """

    def __init__(self, width):
        """Create the handshake signals, two ``width``-bit inputs (i1, i2)
        and one ``width``-bit output (alu_o).
        """
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # "previous stage" handshake: valid_i in, ready_o out
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        # "next stage" handshake: ready_i in, valid_o out
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        # a and b are aliases of the two entries of self.i
        self.a, self.b = i[0], i[1]
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Instantiate the four arithmetic submodules and the sequencer."""
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # pass invert (and carry later)
        m.d.comb += add.invert_a.eq(self.op.invert_a)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted when the sequencer is done (count of one)
            m.d.comb += self.n.valid_o.eq(alu_done)

        # register holding the result of a delayed operation, from
        # acceptance until it is presented on the output
        alu_r = Signal(self.width)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: start the countdown at 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: start at 1, i.e. done straight away
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: start the countdown at 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # others (treated as SUB) to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: count down every cycle until done; once done, the
            # final step from one to zero (idle) waits for ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        """Yield the op record's ports, then a, b, o and the four
        handshake signals.
        """
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.o
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return everything __iter__ yields, as a list."""
        return list(self)
 289
 290
 291 class BranchOp(Elaboratable):
 292     def __init__(self, width, op):
 293         self.a = Signal(width)
 294         self.b = Signal(width)
 295         self.o = Signal(width)
 296         self.op = op
 297
 298     def elaborate(self, platform):
 299         m = Module()
 300         m.d.comb += self.o.eq(Mux(self.op(self.a, self.b), 1, 0))
 301         return m
 302
 303
 304 class BranchALU(Elaboratable):
 305     def __init__(self, width):
 306         self.p = Dummy()  # make look like nmutil pipeline API
 307         self.p.data_i = Dummy()
 308         self.p.data_i.ctx = Dummy()
 309         self.n = Dummy()  # make look like nmutil pipeline API
 310         self.n.data_o = Dummy()
 311         self.p.valid_i = Signal()
 312         self.p.ready_o = Signal()
 313         self.n.ready_i = Signal()
 314         self.n.valid_o = Signal()
 315         self.counter = Signal(4)
 316         self.op = Signal(2)
 317         i = []
 318         i.append(Signal(width, name="i1"))
 319         i.append(Signal(width, name="i2"))
 320         self.i = Array(i)
 321         self.a, self.b = i[0], i[1]
 322         self.out = Array([Signal(width)])
 323         self.o = self.out[0]
 324         self.width = width
 325
 326     def elaborate(self, platform):
 327         m = Module()
 328         bgt = BranchOp(self.width, operator.gt)
 329         blt = BranchOp(self.width, operator.lt)
 330         beq = BranchOp(self.width, operator.eq)
 331         bne = BranchOp(self.width, operator.ne)
 332
 333         m.submodules.bgt = bgt
 334         m.submodules.blt = blt
 335         m.submodules.beq = beq
 336         m.submodules.bne = bne
 337         for mod in [bgt, blt, beq, bne]:
 338             m.d.comb += [
 339                 mod.a.eq(self.a),
 340                 mod.b.eq(self.b),
 341             ]
 342
 343         go_now = Signal(reset_less=True)  # testing no-delay ALU
 344         with m.If(self.p.valid_i):
 345             # input is valid. next check, if we already said "ready" or not
 346             with m.If(~self.p.ready_o):
 347                 # we didn't say "ready" yet, so say so and initialise
 348                 m.d.sync += self.p.ready_o.eq(1)
 349
 350                 # as this is a "fake" pipeline, just grab the output right now
 351                 with m.Switch(self.op):
 352                     for i, mod in enumerate([bgt, blt, beq, bne]):
 353                         with m.Case(i):
 354                             m.d.sync += self.o.eq(mod.o)
 355                 # branch to take 5 cycles (fake)
 356                 m.d.sync += self.counter.eq(5)
 357                 #m.d.comb += go_now.eq(1)
 358         with m.Else():
 359             # input says no longer valid, so drop ready as well.
 360             # a "proper" ALU would have had to sync in the opcode and a/b ops
 361             m.d.sync += self.p.ready_o.eq(0)
 362
 363         # ok so the counter's running: when it gets to 1, fire the output
 364         with m.If((self.counter == 1) | go_now):
 365             # set the output as valid if the recipient is ready for it
 366             m.d.sync += self.n.valid_o.eq(1)
 367         with m.If(self.n.ready_i & self.n.valid_o):
 368             m.d.sync += self.n.valid_o.eq(0)
 369             # recipient said it was ready: reset back to known-good.
 370             m.d.sync += self.counter.eq(0)  # reset the counter
 371             m.d.sync += self.o.eq(0)  # clear the output for tidiness sake
 372
 373         # countdown to 1 (transition from 1 to 0 only on acknowledgement)
 374         with m.If(self.counter > 1):
 375             m.d.sync += self.counter.eq(self.counter - 1)
 376
 377         return m
 378
 379     def __iter__(self):
 380         yield self.op
 381         yield self.a
 382         yield self.b
 383         yield self.o
 384
 385     def ports(self):
 386         return list(self)
 387
 388
 389 def run_op(dut, a, b, op, inv_a=0):
 390     yield dut.a.eq(a)
 391     yield dut.b.eq(b)
 392     yield dut.op.insn_type.eq(op)
 393     yield dut.op.invert_a.eq(inv_a)
 394     yield dut.n.ready_i.eq(0)
 395     yield dut.p.valid_i.eq(1)
 396     yield dut.n.ready_i.eq(1)
 397     yield
 398
 399     # wait for the ALU to accept our input data
 400     while not (yield dut.p.ready_o):
 401         yield
 402
 403     yield dut.p.valid_i.eq(0)
 404     yield dut.a.eq(0)
 405     yield dut.b.eq(0)
 406     yield dut.op.insn_type.eq(0)
 407     yield dut.op.invert_a.eq(0)
 408
 409     # wait for the ALU to present the output data
 410     while not (yield dut.n.valid_o):
 411         yield
 412
 413     # latch the result and lower read_i
 414     result = yield dut.o
 415     yield dut.n.ready_i.eq(0)
 416
 417     return result
 418
 419
 420 def alu_sim(dut):
 421     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD)
 422     print("alu_sim add", result)
 423     assert (result == 8)
 424
 425     result = yield from run_op(dut, 2, 3, MicrOp.OP_MUL_L64)
 426     print("alu_sim mul", result)
 427     assert (result == 6)
 428
 429     result = yield from run_op(dut, 5, 3, MicrOp.OP_ADD, inv_a=1)
 430     print("alu_sim add-inv", result)
 431     assert (result == 65533)
 432
 433     # test zero-delay ALU
 434     # don't have OP_SUB, so use any other
 435     result = yield from run_op(dut, 5, 3, MicrOp.OP_NOP)
 436     print("alu_sim sub", result)
 437     assert (result == 2)
 438
 439     result = yield from run_op(dut, 13, 2, MicrOp.OP_SHR)
 440     print("alu_sim shr", result)
 441     assert (result == 3)
 442
 443
 444 def test_alu():
 445     alu = ALU(width=16)
 446     run_simulation(alu, {"sync": alu_sim(alu)}, vcd_name='test_alusim.vcd')
 447
 448     vl = rtlil.convert(alu, ports=alu.ports())
 449     with open("test_alu.il", "w") as f:
 450         f.write(vl)
 451
 452
 453 def test_alu_parallel():
 454     # Compare with the sequential test implementation, above.
 455     m = Module()
 456     m.submodules.alu = dut = ALU(width=16)
 457     sim = Simulator(m)
 458     sim.add_clock(1e-6)
 459
 460     def send(a, b, op, inv_a=0):
 461         # present input data and assert valid_i
 462         yield dut.a.eq(a)
 463         yield dut.b.eq(b)
 464         yield dut.op.insn_type.eq(op)
 465         yield dut.op.invert_a.eq(inv_a)
 466         yield dut.p.valid_i.eq(1)
 467         yield
 468         # wait for ready_o to be asserted
 469         while not (yield dut.p.ready_o):
 470             yield
 471         # clear input data and negate valid_i
 472         # if send is called again immediately afterwards, there will be no
 473         # visible transition (they will not be negated, after all)
 474         yield dut.p.valid_i.eq(0)
 475         yield dut.a.eq(0)
 476         yield dut.b.eq(0)
 477         yield dut.op.insn_type.eq(0)
 478         yield dut.op.invert_a.eq(0)
 479
 480     def receive():
 481         # signal readiness to receive data
 482         yield dut.n.ready_i.eq(1)
 483         yield
 484         # wait for valid_o to be asserted
 485         while not (yield dut.n.valid_o):
 486             yield
 487         # read result
 488         result = yield dut.o
 489         # negate ready_i
 490         # if receive is called again immediately afterwards, there will be no
 491         # visible transition (it will not be negated, after all)
 492         yield dut.n.ready_i.eq(0)
 493         return result
 494
 495     def producer():
 496         # send a few test cases, interspersed with wait states
 497         # note that, for this test, we do not wait for the result to be ready,
 498         # before presenting the next input
 499         # 5 + 3
 500         yield from send(5, 3, MicrOp.OP_ADD)
 501         yield
 502         yield
 503         # 2 * 3
 504         yield from send(2, 3, MicrOp.OP_MUL_L64)
 505         # (-5) + 3
 506         yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
 507         yield
 508         # 5 - 3
 509         # note that this is a zero-delay operation
 510         yield from send(5, 3, MicrOp.OP_NOP)
 511         yield
 512         yield
 513         # 13 >> 2
 514         yield from send(13, 2, MicrOp.OP_SHR)
 515
 516     def consumer():
 517         # receive and check results, interspersed with wait states
 518         # the consumer is not in step with the producer, but the
 519         # order of the results are preserved
 520         yield
 521         # 5 + 3 = 8
 522         result = yield from receive()
 523         assert (result == 8)
 524         # 2 * 3 = 6
 525         result = yield from receive()
 526         assert (result == 6)
 527         yield
 528         yield
 529         # (-5) + 3 = -2
 530         result = yield from receive()
 531         assert (result == 65533)  # unsigned equivalent to -2
 532         # 5 - 3 = 2
 533         # note that this is a zero-delay operation
 534         # this, and the previous result, will be received back-to-back
 535         # (check the output waveform to see this)
 536         result = yield from receive()
 537         assert (result == 2)
 538         yield
 539         yield
 540         # 13 >> 2 = 3
 541         result = yield from receive()
 542         assert (result == 3)
 543
 544     sim.add_sync_process(producer)
 545     sim.add_sync_process(consumer)
 546     sim_writer = sim.write_vcd(
 547         "test_alu_parallel.vcd",
 548         "test_alu_parallel.gtkw",
 549         traces=dut.ports()
 550     )
 551     with sim_writer:
 552         sim.run()
 553
 554
 555 if __name__ == "__main__":
 556     test_alu()
 557     test_alu_parallel()
 558
 559     # alu = BranchALU(width=16)
 560     # vl = rtlil.convert(alu, ports=alu.ports())
 561     # with open("test_branch_alu.il", "w") as f:
 562     #     f.write(vl)