39a4be95c5d75e688045f268d64f91abc032eb1b
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
This ALU is *deliberately* designed to add (unnecessary) delays to
different operations, so as to be able to test the 6600-style matrices
and the CompUnits.  Countdown timers wait for (defined) periods before
indicating that the output is valid.

A "real" integer ALU would place the answers onto the output bus after
only one cycle (sync).
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmutil.extend import exts
18 from nmutil.gtkw import write_gtkw
19
20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
23 is_engine_pysim)
24
25 from soc.decoder.power_enums import MicrOp, Function, CryIn
26
27 from soc.fu.alu.alu_input_record import CompALUOpSubset
28 from soc.fu.cr.cr_input_record import CompCROpSubset
29
30 import operator
31
32
class Adder(Elaboratable):
    """Combinatorial adder: ``o = a + b``, or ``o = (~a) + b`` when
    ``invert_in`` is set.
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()
        # select the (optionally bit-inverted) first operand, then add
        first = Mux(self.invert_in, ~self.a, self.a)
        m.d.comb += self.o.eq(first + self.b)
        return m
47
48
class Subtractor(Elaboratable):
    """Combinatorial subtractor: ``o = a - b``."""

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
59
60
class Multiplier(Elaboratable):
    """Combinatorial multiplier: ``o = a * b`` (assigned to a
    ``width``-bit output).
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
71
72
class Shifter(Elaboratable):
    """Combinatorial right-shifter: ``o = a >> b``."""

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()
        # NOTE(review): the mask is all-ones over `width` bits and b is
        # itself `width` bits wide, so this does not appear to narrow the
        # shift amount -- confirm whether log2(width) bits was intended
        mask = Const((1 << self.width) - 1)
        btrunc = Signal(self.width)
        m.d.comb += [
            btrunc.eq(self.b & mask),
            self.o.eq(self.a >> btrunc),
        ]
        return m
86
87
class SignExtend(Elaboratable):
    """Drives ``o`` with ``exts(a, 8, width)``.

    Presumably sign-extends the low 8 bits of ``a`` to ``width`` bits;
    see nmutil.extend.exts for the exact semantics.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.o = Signal(width, name="exts_o")

    def elaborate(self, platform):
        m = Module()
        extended = exts(self.a, 8, self.width)
        m.d.comb += self.o.eq(extended)
        return m
98
99
class Dummy:
    """Empty attribute holder, used to mimic the nmutil pipeline API."""
102
103
class DummyALU(Elaboratable):
    """Three-input test ALU that simply copies input ``a`` to the output.

    Exposes ``p`` / ``n`` attribute holders so that it looks like an
    nmutil pipeline stage (valid_i/ready_o on the input side,
    valid_o/ready_i on the output side).
    """

    def __init__(self, width):
        # make look like nmutil pipeline API: input ("previous") side...
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        # ...and output ("next") side
        self.n = Dummy()
        self.n.data_o = Dummy()
        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operand inputs, named i1..i3
        operands = [Signal(width, name="i%d" % idx) for idx in (1, 2, 3)]
        self.i = Array(operands)
        self.a, self.b, self.c = operands
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        p, n = self.p, self.n

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(p.valid_i):
            # input is valid: if "ready" was not yet signalled, signal it
            # now and initialise the output and the counter
            with m.If(~p.ready_o):
                m.d.sync += [
                    p.ready_o.eq(1),
                    self.o.eq(self.a),
                    self.counter.eq(1),
                ]
                m.d.comb += go_now.eq(1)
        with m.Else():
            # input is no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and ops
            m.d.sync += p.ready_o.eq(0)

        # the counter is running: when it gets to 1 (or straight away,
        # with go_now), fire the output
        fire = (self.counter == 1) | go_now
        with m.If(fire):
            m.d.sync += n.valid_o.eq(1)
            # recipient acknowledged: reset back to known-good state
            # (these later assignments override the ones made above)
            with m.If(n.ready_i & n.valid_o):
                m.d.sync += [
                    n.valid_o.eq(0),
                    self.counter.eq(0),  # reset the counter
                    self.o.eq(0),  # clear the output for tidiness sake
                ]

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b, self.c, self.o)

    def ports(self):
        return [*self]
178
179
class ALU(Elaboratable):
    """Fake multi-cycle integer ALU with an nmutil-pipeline-like interface.

    The operation is selected by ``op.insn_type``.  ADD, MUL_L64, SHR,
    EXTS and EXTSWSLI results are registered and presented once a
    per-operation countdown reaches one; every other operation is passed
    straight through as a zero-delay subtract.
    """

    def __init__(self, width):
        # make look like nmutil pipeline API: input ("previous") side...
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        # ...and output ("next") side
        self.n = Dummy()
        self.n.data_o = Dummy()
        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # two operand inputs, named i1 and i2
        operands = [Signal(width, name="i1"), Signal(width, name="i2")]
        self.i = Array(operands)
        self.a, self.b = operands
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()

        m.submodules.add = add = Adder(self.width)
        m.submodules.mul = mul = Multiplier(self.width)
        m.submodules.shf = shf = Shifter(self.width)
        m.submodules.sub = sub = Subtractor(self.width)
        m.submodules.ext_sign = ext_sign = SignExtend(self.width)

        insn = self.op.insn_type

        # really should not activate absolutely all ALU inputs like this
        for unit in (add, mul, shf, sub):
            m.d.comb += unit.a.eq(self.a)
            m.d.comb += unit.b.eq(self.b)
        # EXTS sign extends the first input
        with m.If(insn == MicrOp.OP_EXTS):
            m.d.comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(insn == MicrOp.OP_EXTSWSLI):
            m.d.comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # the sequencer is idle at count zero, and done at count one
        alu_idle = Signal(reset_less=True)
        alu_done = Signal(reset_less=True)
        m.d.comb += [
            alu_idle.eq(self.counter == 0),
            alu_done.eq(self.counter == 1),
        ]

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += [
                self.p.ready_o.eq(self.n.ready_i),
                self.n.valid_o.eq(self.p.valid_i),
            ]
        with m.Else():
            # sequential ALU handshake: ready_o is raised whenever the
            # ALU is idle, valid_o whenever the countdown is at one
            m.d.comb += [
                self.p.ready_o.eq(alu_idle),
                self.n.valid_o.eq(alu_done),
            ]

        # holds the registered ALU result while the countdown runs
        alu_r = Signal(self.width)

        # NOTE: all of these are fake, just something to test.
        # (opcode, result to register, initial countdown value)
        delayed_ops = (
            (MicrOp.OP_MUL_L64, mul.o, 5),
            (MicrOp.OP_SHR, shf.o, 1),
            (MicrOp.OP_ADD, add.o, 3),
            (MicrOp.OP_EXTS, ext_sign.o, 1),
            (MicrOp.OP_EXTSWSLI, ext_sign.o, 1),
        )

        with m.If(alu_idle):
            with m.If(self.p.valid_i):
                # as this is a "fake" pipeline, just grab the output right
                # now, and start the countdown for the selected operation
                for pos, (opcode, result, delay) in enumerate(delayed_ops):
                    branch = m.Elif if pos else m.If
                    with branch(insn == opcode):
                        m.d.sync += alu_r.eq(result)
                        m.d.sync += self.counter.eq(delay)
                # others take no delay (SUB is zero-delay, so there is
                # no need to register it)
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # count down while busy; the final step from one to zero
            # only happens once ready_i is asserted
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b, self.o)
        yield from (self.p.valid_i, self.p.ready_o)
        yield from (self.n.valid_o, self.n.ready_i)

    def ports(self):
        return [*self]
325
326
class BranchOp(Elaboratable):
    """Combinatorial comparison: ``o`` is 1 when ``op(a, b)`` holds,
    otherwise 0.  ``op`` is a two-argument callable applied to the
    input signals.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        condition = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(condition, 1, 0))
        return m
338
339
class BranchALU(Elaboratable):
    """Fake branch-compare ALU.

    ``op`` selects one of four comparisons of ``a`` against ``b``
    (0: gt, 1: lt, 2: eq, 3: ne).  The comparison result is registered
    when the input is accepted, and is flagged valid after a countdown.
    """

    def __init__(self, width):
        # make look like nmutil pipeline API: input ("previous") side...
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        # ...and output ("next") side
        self.n = Dummy()
        self.n.data_o = Dummy()
        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        # two operand inputs, named i1 and i2
        operands = [Signal(width, name="i1"), Signal(width, name="i2")]
        self.i = Array(operands)
        self.a, self.b = operands
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        m = Module()
        p, n = self.p, self.n

        m.submodules.bgt = bgt = BranchOp(self.width, operator.gt)
        m.submodules.blt = blt = BranchOp(self.width, operator.lt)
        m.submodules.beq = beq = BranchOp(self.width, operator.eq)
        m.submodules.bne = bne = BranchOp(self.width, operator.ne)

        # comparators, in the order in which `op` selects them
        comparators = (bgt, blt, beq, bne)
        for cmp_unit in comparators:
            m.d.comb += cmp_unit.a.eq(self.a)
            m.d.comb += cmp_unit.b.eq(self.b)

        # testing no-delay ALU (never driven here: the assignment that
        # would raise it during acceptance is disabled)
        go_now = Signal(reset_less=True)

        with m.If(p.valid_i):
            # input is valid: if "ready" was not yet signalled, signal it
            # now and initialise
            with m.If(~p.ready_o):
                m.d.sync += p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output
                # of the selected comparator right now
                with m.Switch(self.op):
                    for select, cmp_unit in enumerate(comparators):
                        with m.Case(select):
                            m.d.sync += self.o.eq(cmp_unit.o)
                # branch to take 5 cycles (fake)
                m.d.sync += self.counter.eq(5)
        with m.Else():
            # input is no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and ops
            m.d.sync += p.ready_o.eq(0)

        # the counter is running: when it gets to 1, fire the output
        fire = (self.counter == 1) | go_now
        with m.If(fire):
            m.d.sync += n.valid_o.eq(1)
            # recipient acknowledged: reset back to known-good state
            # (these later assignments override the ones made above)
            with m.If(n.ready_i & n.valid_o):
                m.d.sync += [
                    n.valid_o.eq(0),
                    self.counter.eq(0),  # reset the counter
                    self.o.eq(0),  # clear the output for tidiness sake
                ]

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from (self.op, self.a, self.b, self.o)

    def ports(self):
        return [*self]
423
424
def run_op(dut, a, b, op, inv_a=0):
    """Simulation helper: run one operation through `dut`, return result.

    Presents the operands and opcode with valid_i raised, waits for
    ready_o, clears the inputs, then waits for valid_o and reads `dut.o`.
    """
    # present the operation, with both valid_i and ready_i raised
    stimulus = (
        dut.a.eq(a),
        dut.b.eq(b),
        dut.op.insn_type.eq(op),
        dut.op.invert_in.eq(inv_a),
        dut.n.ready_i.eq(0),
        dut.p.valid_i.eq(1),
        dut.n.ready_i.eq(1),
    )
    for stmt in stimulus:
        yield stmt
    yield

    # wait for the ALU to accept our input data
    while True:
        accepted = yield dut.p.ready_o
        if accepted:
            break
        yield

    # withdraw the inputs
    idle = (
        dut.p.valid_i.eq(0),
        dut.a.eq(0),
        dut.b.eq(0),
        dut.op.insn_type.eq(0),
        dut.op.invert_in.eq(0),
    )
    for stmt in idle:
        yield stmt

    # wait for the ALU to present the output data
    while True:
        produced = yield dut.n.valid_o
        if produced:
            break
        yield

    # latch the result and lower ready_i
    result = yield dut.o
    yield dut.n.ready_i.eq(0)

    return result
454
455
def alu_sim(dut):
    """Sequential simulation process: run a handful of operations through
    the ALU one at a time, checking each result.
    """
    # (label, run_op operands, run_op options, expected result)
    checks = (
        ("add", (5, 3, MicrOp.OP_ADD), {}, 8),
        ("mul", (2, 3, MicrOp.OP_MUL_L64), {}, 6),
        ("add-inv", (5, 3, MicrOp.OP_ADD), {"inv_a": 1}, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("sub", (5, 3, MicrOp.OP_CMP), {}, 2),
        ("shr", (13, 2, MicrOp.OP_SHR), {}, 3),
    )
    for label, operands, options, expected in checks:
        result = yield from run_op(dut, *operands, **options)
        print("alu_sim " + label, result)
        assert (result == expected)
478
479
def test_alu():
    """Simulate a 16-bit ALU with the sequential test process, writing a
    VCD trace plus GTKWave document, then dump the design as RTLIL.
    """
    alu = ALU(width=16)
    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    rtlil_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
488
489
def test_alu_parallel():
    """Simulate a 16-bit ALU with independent producer and consumer
    processes.

    Compare with the sequential test implementation (test_alu): here the
    producer does not wait for a result before presenting the next input.
    """
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        # present input data and assert valid_i
        for stmt in (dut.a.eq(a),
                     dut.b.eq(b),
                     dut.op.insn_type.eq(op),
                     dut.op.invert_in.eq(inv_a),
                     dut.p.valid_i.eq(1)):
            yield stmt
        yield
        # wait for ready_o to be asserted
        while True:
            accepted = yield dut.p.ready_o
            if accepted:
                break
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        for stmt in (dut.p.valid_i.eq(0),
                     dut.a.eq(0),
                     dut.b.eq(0),
                     dut.op.insn_type.eq(0),
                     dut.op.invert_in.eq(0)):
            yield stmt

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while True:
            produced = yield dut.n.valid_o
            if produced:
                break
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be
        # no visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be
        # ready, before presenting the next input
        # ((a, b, op, inv_a), wait states after sending)
        stimuli = (
            ((5, 3, MicrOp.OP_ADD, 0), 2),           # 5 + 3
            ((2, 3, MicrOp.OP_MUL_L64, 0), 0),       # 2 * 3
            ((5, 3, MicrOp.OP_ADD, 1), 1),           # (-5) + 3
            # 5 - 3: note that this is a zero-delay operation
            ((5, 3, MicrOp.OP_NOP, 0), 2),
            ((13, 2, MicrOp.OP_SHR, 0), 0),          # 13 >> 2
            ((13, 2, MicrOp.OP_EXTS, 0), 0),         # sign extend 13
            # sign extend -128 (8 bits), from the first input
            ((0x80, 2, MicrOp.OP_EXTS, 0), 0),
            # sign extend -128 (8 bits), from the second input
            ((2, 0x80, MicrOp.OP_EXTSWSLI, 0), 0),
        )
        for operands, wait_states in stimuli:
            yield from send(*operands)
            for _ in range(wait_states):
                yield

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results are preserved
        # (expected result, wait states after receiving)
        expectations = (
            (8, 0),        # 5 + 3 = 8
            (6, 2),        # 2 * 3 = 6
            (65533, 0),    # (-5) + 3 = -2 (unsigned equivalent to -2)
            # 5 - 3 = 2: note that this is a zero-delay operation
            # this, and the previous result, will be received back-to-back
            # (check the output waveform to see this)
            (2, 2),
            (3, 0),        # 13 >> 2 = 3
            (13, 0),       # sign extend 13 = 13
            (0xFF80, 0),   # sign extend -128 (8 bits) = -128 (16 bits)
            (0xFF80, 0),   # sign extend -128 (8 bits) = -128 (16 bits)
        )
        yield
        for expected, wait_states in expectations:
            result = yield from receive()
            assert (result == expected)
            for _ in range(wait_states):
                yield

    for process in (producer, consumer):
        sim.add_sync_process(process)
    with sim.write_vcd("test_alu_parallel.vcd"):
        sim.run()
604
605
def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
                   pysim=True):
    """Common function to write the GTKWave documents for this module

    The matching VCD file name is derived from `gtkw_name` by swapping
    the '.gtkw' extension for '.vcd'.  When `sub_module` is given, the
    traces are looked up below the simulator's top module rather than
    in 'top'.
    """
    # the opcode trace name depends on the simulation engine in use
    if pysim:
        insn_type_trace = 'op__insn_type'
    else:
        insn_type_trace = 'op__insn_type[6:0]'
    gtkwave_desc = [
        'clk',
        'i1[15:0]',
        'i2[15:0]',
        insn_type_trace,
        'op__invert_in',
        'valid_i',
        'ready_o',
        'valid_o',
        'ready_i',
        'alu_o[15:0]',
    ]
    # determine the module name of the DUT
    if sub_module is None:
        module = 'top'
    else:
        module = nmigen_sim_top_module + sub_module
    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
    write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
               loc=__file__, clk_period=clk_period, base='signed')
628
629
if __name__ == "__main__":
    # run both the sequential and the parallel ALU tests
    for test in (test_alu, test_alu_parallel):
        test()

    # disabled: RTLIL dump of the branch ALU
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)