Simplify waiting loops
[soc.git] / src / soc / experiment / alu_hier.py
"""*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU

This ALU is *deliberately* designed to add (unnecessary) delays to
different operations so as to be able to test the 6600-style matrices
and the CompUnits.  Countdown timers wait for (defined) periods before
indicating that the output is valid.

A "real" integer ALU would place the answers onto the output bus after
only one cycle (sync).
"""
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmigen.back.pysim import Simulator, Settle
18
19 from soc.decoder.power_enums import InternalOp, Function, CryIn
20
21 from soc.fu.alu.alu_input_record import CompALUOpSubset
22 from soc.fu.cr.cr_input_record import CompCROpSubset
23
24 import operator
25
26
27
28
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of operand a.

    Ports:

    * invert_a: 1-bit control; when set, ``~a`` is added instead of ``a``
    * a, b:     ``width``-bit operands
    * o:        ``width``-bit result signal, named "add_o"
    """

    def __init__(self, width):
        self.invert_a = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        module = Module()
        # pick the (possibly inverted) first operand, then add just once
        first_operand = Mux(self.invert_a, ~self.a, self.a)
        module.d.comb += self.o.eq(first_operand + self.b)
        return module
43
44
class Subtractor(Elaboratable):
    """Combinatorial subtractor: drives o ("sub_o") with a - b."""

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        module = Module()
        difference = self.a - self.b
        module.d.comb += self.o.eq(difference)
        return module
55
56
class Multiplier(Elaboratable):
    """Combinatorial multiplier: drives o ("mul_o") with a * b."""

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        module = Module()
        product = self.a * self.b
        module.d.comb += self.o.eq(product)
        return module
67
68
class Shifter(Elaboratable):
    """Combinatorial right-shifter: drives o ("shf_o") with a >> b.

    The shift amount is first copied into an internal signal (btrunc)
    through an AND mask.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        module = Module()
        # NOTE(review): the mask has `width` one-bits, the same width as b,
        # so it passes every bit of b through unchanged.  If the intent was
        # to limit the shift amount, a narrower mask is presumably needed -
        # TODO confirm.
        mask = Const((1 << self.width) - 1)
        btrunc = Signal(self.width)
        module.d.comb += btrunc.eq(self.b & mask)
        module.d.comb += self.o.eq(self.a >> btrunc)
        return module
82
class Dummy:
    """Empty attribute holder, used to make the ALUs in this file look
    like the nmutil pipeline API (objects with .p / .n sub-attributes).
    """
85
86
class DummyALU(Elaboratable):
    """Placeholder ALU: performs no arithmetic, only copies operand a to o.

    Exposes the same handshake attributes as the other ALUs in this file:

    * p.valid_i / p.ready_o: input-side handshake
    * n.valid_o / n.ready_i: output-side handshake
    * a, b, c (also i[0..2]): ``width``-bit operands; only a is used
    * o (also out[0]):        ``width``-bit result, named "alu_o"
    * op:                     a CompCROpSubset record (not examined here)
    """

    def __init__(self, width):
        self.p = Dummy() # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy() # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operand inputs, reachable both by index and by name
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        self.a, self.b, self.c = i[0], i[1], i[2]
        # single result output, reachable as out[0] and as o
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the handshake logic; returns the nmigen Module."""
        m = Module()

        go_now = Signal(reset_less=True) # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # the "result" is simply a copy of operand a
                m.d.sync += self.o.eq(self.a)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (registered)
            m.d.sync += self.n.valid_o.eq(1)
        # NOTE(review): this block follows the ones above, so its
        # assignments presumably take priority when both fire in the same
        # cycle - confirm against nmigen's assignment-ordering rules
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0) # reset the counter
            m.d.sync += self.o.eq(0) # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        # (in this class the counter is only ever loaded with 0 or 1)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the op record's ports, then a, b, c and o."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return the signals produced by __iter__ as a list."""
        return list(self)
161
162
class ALU(Elaboratable):
    """Test ALU with deliberately different latencies per operation.

    Handshake attributes mimic the nmutil pipeline API:

    * p.valid_i / p.ready_o: input-side handshake
    * n.valid_o / n.ready_i: output-side handshake
    * a, b (also i[0..1]):   ``width``-bit operands
    * o (also out[0]):       ``width``-bit result, named "alu_o"
    * op:                    CompALUOpSubset record; insn_type and invert_a
                             are the fields read by elaborate()

    OP_ADD, OP_MUL_L64 and OP_SHR are registered and sequenced by a
    countdown counter; any other insn_type is treated as a zero-delay
    operation whose output is taken from the subtractor.
    """

    def __init__(self, width):
        self.p = Dummy() # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy() # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # two operand inputs, reachable both by index and by name
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single result output, reachable as out[0] and as o
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the arithmetic submodules and the sequencer; returns the
        nmigen Module."""
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # pass invert (and carry later)
        m.d.comb += add.invert_a.eq(self.op.invert_a)

        go_now = Signal(reset_less=True) # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted while the sequencer is done (count == 1)
            m.d.comb += self.n.valid_o.eq(alu_done)

        # register holding the result of a sequenced operation; it is
        # captured on acceptance and presented on o while alu_done
        alu_r = Signal(self.width)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == InternalOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == InternalOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the counter with 5
                with m.If(self.op.insn_type == InternalOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: load the counter with 1 (done straight away)
                with m.Elif(self.op.insn_type == InternalOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: load the counter with 3
                with m.Elif(self.op.insn_type == InternalOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: keep counting down while not yet done; once done
            # (count == 1), take the final step to zero only when the
            # recipient asserts ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        """Yield the op record's ports, the operands, the output and the
        four handshake signals."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.o
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return the signals produced by __iter__ as a list."""
        return list(self)
290
291
class BranchOp(Elaboratable):
    """Combinatorial compare unit: o is 1 when op(a, b) holds, else 0.

    ``op`` is a two-argument callable applied to the a and b signals;
    BranchALU passes comparison functions from the ``operator`` module.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        module = Module()
        condition = self.op(self.a, self.b)
        module.d.comb += self.o.eq(Mux(condition, 1, 0))
        return module
303
304
class BranchALU(Elaboratable):
    """Branch-compare ALU: compares a with b using one of four BranchOps.

    The 2-bit ``op`` selects the comparison, in this order:
    0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal.
    The selected result is registered into o and a countdown counter is
    loaded with 5 before the output is flagged valid.

    Handshake attributes (p.valid_i / p.ready_o, n.valid_o / n.ready_i)
    mimic the nmutil pipeline API.
    """

    def __init__(self, width):
        self.p = Dummy() # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy() # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        # two operand inputs, reachable both by index and by name
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single result output, reachable as out[0] and as o
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        """Build the four comparators and the handshake logic; returns the
        nmigen Module."""
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparator sees both operands all the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # never driven to 1 in this class (the assignment below is
        # commented out), so only the counter fires the output
        go_now = Signal(reset_less=True) # testing no-delay ALU
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                # (case index i matches the comparator's list position)
                with m.Switch(self.op):
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (registered)
            m.d.sync += self.n.valid_o.eq(1)
        # NOTE(review): this block follows the ones above, so its
        # assignments presumably take priority when both fire in the same
        # cycle - confirm against nmigen's assignment-ordering rules
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0) # reset the counter
            m.d.sync += self.o.eq(0) # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield op, a, b and o."""
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        """Return the signals produced by __iter__ as a list."""
        return list(self)
387
def run_op(dut, a, b, op, inv_a=0):
    """Simulation helper: push one operation through the ALU handshake.

    Generator, intended to be driven with ``yield from`` inside a
    simulation process.

    dut:   the ALU under test (needs a, b, o, op.insn_type, op.invert_a
           and the p/n handshake signals)
    a, b:  operand values
    op:    value assigned to dut.op.insn_type
    inv_a: value assigned to dut.op.invert_a

    Returns the value read from dut.o while the output was valid.
    """
    # present operands and opcode, raise valid_i, keep ready_i low for now
    yield dut.a.eq(a)
    yield dut.b.eq(b)
    yield dut.op.insn_type.eq(op)
    yield dut.op.invert_a.eq(inv_a)
    yield dut.n.ready_i.eq(0)
    yield dut.p.valid_i.eq(1)

    # if valid_o rose on the very first cycle, it is a
    # zero-delay ALU
    yield Settle()
    vld = yield dut.n.valid_o
    if vld:
        # special case for zero-delay ALU
        # we must raise ready_i first, since the combinatorial ALU doesn't
        # have any storage, and doesn't dare to assert ready_o back to us
        # until we accepted the output data
        yield dut.n.ready_i.eq(1)
        result = yield dut.o
        yield
        # drop both handshake signals again before returning
        yield dut.p.valid_i.eq(0)
        yield dut.n.ready_i.eq(0)
        yield
        return result

    yield

    # wait for the ALU to accept our input data
    while not (yield dut.p.ready_o):
        yield

    yield dut.p.valid_i.eq(0)

    # wait for the ALU to present the output data
    while not (yield dut.n.valid_o):
        yield

    # raise ready_i, latch the result, then lower ready_i again
    yield dut.n.ready_i.eq(1)
    result = yield dut.o
    yield
    yield dut.n.ready_i.eq(0)
    yield

    return result
433
434
def alu_sim(dut):
    """Simulation process: run a fixed series of operations through dut.

    Each operation is issued with run_op(); its result is printed and
    checked against the expected value.  The expected values assume a
    16-bit ALU, as created by test_alu().
    """
    # (label, a, b, insn_type, invert_a, expected result)
    test_cases = [
        ("add", 5, 3, InternalOp.OP_ADD, 0, 8),
        ("mul", 2, 3, InternalOp.OP_MUL_L64, 0, 6),
        # (~5) + 3 in 16 bits
        ("add-inv", 5, 3, InternalOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("sub", 5, 3, InternalOp.OP_NOP, 0, 2),
        ("shr", 13, 2, InternalOp.OP_SHR, 0, 3),
    ]
    for label, a, b, insn_type, invert, expected in test_cases:
        result = yield from run_op(dut, a, b, insn_type, inv_a=invert)
        print("alu_sim " + label, result)
        assert result == expected
457
458
def test_alu():
    """Simulate a 16-bit ALU with alu_sim, then dump its RTLIL to disk.

    Writes the waveform to 'test_alusim.vcd' and the RTLIL text to
    'test_alu.il' in the current directory.
    """
    alu = ALU(width=16)
    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    rtlil_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
466
467
def test_alu_parallel():
    """Test the ALU with independent producer and consumer processes.

    The producer feeds operations into the input handshake while the
    consumer, running as a separate simulation process, collects and
    checks the results from the output handshake.  Waveforms are written
    to 'test_alu_parallel.vcd' / 'test_alu_parallel.gtkw'.
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        """Present one operation and wait until ready_o is seen."""
        # present input data and assert valid_i
        yield dut.a.eq(a)
        yield dut.b.eq(b)
        yield dut.op.insn_type.eq(op)
        yield dut.op.invert_a.eq(inv_a)
        yield dut.p.valid_i.eq(1)
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        yield dut.p.valid_i.eq(0)
        yield dut.a.eq(0)
        yield dut.b.eq(0)
        yield dut.op.insn_type.eq(0)
        yield dut.op.invert_a.eq(0)

    def receive():
        """Wait for valid_o and return the value read from dut.o."""
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        """Send the test operations, with idle cycles in between."""
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, InternalOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, InternalOp.OP_MUL_L64)
        # (~5) + 3: operand a is bitwise-inverted, not negated
        yield from send(5, 3, InternalOp.OP_ADD, inv_a=1)
        yield
        # 5 - 3
        # note that this is a zero-delay operation
        yield from send(5, 3, InternalOp.OP_NOP)
        yield
        yield
        # 13 >> 2
        yield from send(13, 2, InternalOp.OP_SHR)

    def consumer():
        """Receive the results in order and check each one."""
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results is preserved
        yield
        # 5 + 3 = 8
        result = yield from receive()
        assert (result == 8)
        # 2 * 3 = 6
        result = yield from receive()
        assert (result == 6)
        yield
        yield
        # (~5) + 3 = -3
        result = yield from receive()
        assert (result == 65533) # 16-bit unsigned equivalent to -3
        yield
        # 5 - 3 = 2
        # note that this is a zero-delay operation
        result = yield from receive()
        assert (result == 2)
        yield
        yield
        # 13 >> 2 = 3
        result = yield from receive()
        assert (result == 3)

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd(
        "test_alu_parallel.vcd",
        "test_alu_parallel.gtkw",
        traces=dut.ports()
    )
    with sim_writer:
        sim.run()
567
568
if __name__ == "__main__":
    # run both simulation tests when executed as a script
    test_alu()
    test_alu_parallel()

    # disabled: RTLIL export of the branch ALU
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)
577