Implement and test NOP in the test ALU
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
3 This ALU is *deliberately* designed to add in (unnecessary) delays into
4 different operations so as to be able to test the 6600-style matrices
5 and the CompUnits. Countdown timers wait for (defined) periods before
6 indicating that the output is valid
7
8 A "real" integer ALU would place the answers onto the output bus after
9 only one cycle (sync)
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmutil.extend import exts
18 from nmutil.gtkw import write_gtkw
19
20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
23 is_engine_pysim)
24
25 from soc.decoder.decode2execute1 import Data
26 from soc.decoder.power_enums import MicrOp, Function, CryIn
27
28 from soc.fu.alu.alu_input_record import CompALUOpSubset
29 from soc.fu.cr.cr_input_record import CompCROpSubset
30
31 import operator
32
33
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of operand `a`.

    Attributes:
        invert_in: when set, `a` is bitwise-inverted before the addition.
        a, b: operands, `width` bits each.
        o: result, `width` bits, named "add_o".
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # both candidate sums are built up-front; the If/Else picks one
        inverted_sum = (~self.a) + self.b
        plain_sum = self.a + self.b

        with m.If(self.invert_in):
            comb += self.o.eq(inverted_sum)
        with m.Else():
            comb += self.o.eq(plain_sum)

        return m
48
49
class Subtractor(Elaboratable):
    """Combinatorial subtractor: `o` is driven with `a - b`.

    Attributes:
        a, b: operands, `width` bits each.
        o: result, `width` bits, named "sub_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
60
61
class Multiplier(Elaboratable):
    """Combinatorial multiplier: `o` is driven with `a * b`.

    Attributes:
        a, b: operands, `width` bits each.
        o: result, `width` bits, named "mul_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
72
73
class Shifter(Elaboratable):
    """Combinatorial right-shifter: `o` is `a` shifted right by `b`.

    Attributes:
        width: bit-width of the operands and of the result.
        a: value to be shifted.
        b: shift amount.
        o: result, named "shf_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()

        # the shift amount is ANDed with an all-ones mask of `width` bits
        # before use (b is itself declared `width` bits wide)
        mask = Const((1 << self.width) - 1)
        btrunc = Signal(self.width)

        m.d.comb += [
            btrunc.eq(self.b & mask),
            self.o.eq(self.a >> btrunc),
        ]
        return m
87
88
class SignExtend(Elaboratable):
    """Drives `o` with `exts(a, 8, width)`.

    The tests in this file expect an input of 0x80 to come out as 0xFF80
    at width 16, i.e. an 8-bit value sign-extended to the full width.

    Attributes:
        width: bit-width of the input and of the result.
        a: input value.
        o: result, named "exts_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.o = Signal(width, name="exts_o")

    def elaborate(self, platform):
        m = Module()
        extended = exts(self.a, 8, self.width)
        m.d.comb += self.o.eq(extended)
        return m
99
100
class Dummy:
    """Empty attribute holder, used to mimic the nmutil pipeline API."""
103
104
class DummyALU(Elaboratable):
    """Three-input test ALU that simply copies input `a` to the output.

    The object exposes `p` (input side) and `n` (output side) attribute
    holders so that it looks like an nmutil pipeline stage:

    * `p.valid_i` / `p.ready_o`: input handshake
    * `n.valid_o` / `n.ready_i`: output handshake
    * `p.data_i.a`, `.b`, `.c`, `p.data_i.ctx.op`: operands and opcode
    * `n.data_o.o`: result
    """

    def __init__(self, width):
        # attribute holders, to make this look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompCROpSubset()

        # three operand inputs, one result output
        operands = [
            Signal(width, name="i1"),
            Signal(width, name="i2"),
            Signal(width, name="i3"),
        ]
        self.i = Array(operands)
        self.a, self.b, self.c = operands
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid: have we already answered "ready"?
            with m.If(~self.p.ready_o):
                # not yet: answer "ready", capture input a as the result,
                # and start the (one-count) sequence
                sync += [
                    self.p.ready_o.eq(1),
                    self.o.eq(self.a),
                ]
                comb += go_now.eq(1)
                sync += self.counter.eq(1)

        with m.Else():
            # input is no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            sync += self.p.ready_o.eq(0)

        # the counter is running: when it reaches 1 (or immediately, on
        # go_now), flag the output as valid
        fire = (self.counter == 1) | go_now
        with m.If(fire):
            sync += self.n.valid_o.eq(1)

        # the recipient acknowledged a valid output: back to known-good state
        acked = self.n.ready_i & self.n.valid_o
        with m.If(acked):
            sync += self.n.valid_o.eq(0)
            sync += self.counter.eq(0)  # reset the counter
            sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b, self.c, self.o)

    def ports(self):
        return [*self]
179
180
class ALU(Elaboratable):
    """Two-input test ALU with deliberately different per-opcode delays.

    The object exposes `p` (input side) and `n` (output side) attribute
    holders so that it looks like an nmutil pipeline stage.

    Delay loaded into the counter when an operation is accepted:

    * OP_MUL_L64: 5
    * OP_ADD: 3
    * OP_SHR, OP_EXTS, OP_EXTSWSLI: 1
    * anything else: no delay; the handshake is passed straight through
      and the data output is driven from the subtractor

    `o.ok` is asserted for every opcode except OP_NOP.
    """

    def __init__(self, width):
        # attribute holders, to make this look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")

        # two operand inputs, one result output (data + ok)
        operands = [
            Signal(width, name="i1"),
            Signal(width, name="i2"),
        ]
        self.i = Array(operands)
        self.a, self.b = operands
        self.out = Array([Data(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # functional sub-units
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)
        ext_sign = SignExtend(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub
        m.submodules.ext_sign = ext_sign

        # opcode decodes, shared by the selection chains below
        insn = self.op.insn_type
        is_add = insn == MicrOp.OP_ADD
        is_mul = insn == MicrOp.OP_MUL_L64
        is_shr = insn == MicrOp.OP_SHR
        is_exts = insn == MicrOp.OP_EXTS
        is_extswsli = insn == MicrOp.OP_EXTSWSLI

        # really should not activate absolutely all ALU inputs like this
        for unit in (add, mul, shf, sub):
            comb += unit.a.eq(self.a)
            comb += unit.b.eq(self.b)

        # EXTS sign extends the first input
        with m.If(is_exts):
            comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(is_extswsli):
            comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            comb += [
                self.p.ready_o.eq(self.n.ready_i),
                self.n.valid_o.eq(self.p.valid_i),
            ]
        with m.Else():
            # sequential ALU handshake:
            # ready_o is driven from alu_idle, valid_o from alu_done
            comb += [
                self.p.ready_o.eq(alu_idle),
                self.n.valid_o.eq(alu_done),
            ]

        # register holding the result of the delayed operations
        alu_r = Signal(self.width)

        # NOP doesn't output anything
        with m.If(insn != MicrOp.OP_NOP):
            comb += self.o.ok.eq(1)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(is_add):
                    sync += alu_r.eq(add.o)
                with m.Elif(is_mul):
                    sync += alu_r.eq(mul.o)
                with m.Elif(is_shr):
                    sync += alu_r.eq(shf.o)
                with m.Elif(is_exts):
                    sync += alu_r.eq(ext_sign.o)
                with m.Elif(is_extswsli):
                    sync += alu_r.eq(ext_sign.o)
                # the zero-delay result is not registered

                # NOTE: all of these delays are fake, just something to test

                # MUL: counter starts at 5
                with m.If(is_mul):
                    sync += self.counter.eq(5)
                # SHIFT: counter starts at 1, straight away
                with m.Elif(is_shr):
                    sync += self.counter.eq(1)
                # ADD: counter starts at 3
                with m.Elif(is_add):
                    sync += self.counter.eq(3)
                # EXTS: counter starts at 1
                with m.Elif(is_exts):
                    sync += self.counter.eq(1)
                # EXTSWSLI: counter starts at 1
                with m.Elif(is_extswsli):
                    sync += self.counter.eq(1)
                # all others take no delay
                with m.Else():
                    comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: count down, except while done and not acknowledged
            sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            comb += self.o.data.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            comb += self.o.data.eq(alu_r)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b)
        yield from self.o.ports()
        yield from (
            self.p.valid_i,
            self.p.ready_o,
            self.n.valid_o,
            self.n.ready_i,
        )

    def ports(self):
        return [*self]
329
330
class BranchOp(Elaboratable):
    """Comparison unit: `o` is 1 when `op(a, b)` holds, otherwise 0.

    Attributes:
        a, b: operands, `width` bits each.
        o: result, `width` bits.
        op: two-argument callable applied to (a, b), e.g. `operator.gt`.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        condition = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(condition, 1, 0))
        return m
342
343
class BranchALU(Elaboratable):
    """Test branch-compare ALU with a fixed (fake) five-count delay.

    The 2-bit `op` selects the comparison applied to `a` and `b`:
    0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal.

    The object exposes `p` (input side) and `n` (output side) attribute
    holders carrying the valid/ready handshake signals.
    """

    def __init__(self, width):
        # attribute holders, to make this look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = Signal(2)

        # two operand inputs, one result output
        operands = [
            Signal(width, name="i1"),
            Signal(width, name="i2"),
        ]
        self.i = Array(operands)
        self.a, self.b = operands
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # one comparison unit per supported branch condition
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne

        # list position doubles as the `op` encoding in the Switch below
        comparators = [bgt, blt, beq, bne]
        for unit in comparators:
            comb += unit.a.eq(self.a)
            comb += unit.b.eq(self.b)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid: have we already answered "ready"?
            with m.If(~self.p.ready_o):
                # not yet: answer "ready" and initialise
                sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                with m.Switch(self.op):
                    for select, unit in enumerate(comparators):
                        with m.Case(select):
                            sync += self.o.eq(unit.o)
                # branch to take 5 cycles (fake)
                sync += self.counter.eq(5)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input is no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            sync += self.p.ready_o.eq(0)

        # the counter is running: when it reaches 1 (or immediately, on
        # go_now), flag the output as valid
        fire = (self.counter == 1) | go_now
        with m.If(fire):
            sync += self.n.valid_o.eq(1)

        # the recipient acknowledged a valid output: back to known-good state
        acked = self.n.ready_i & self.n.valid_o
        with m.If(acked):
            sync += self.n.valid_o.eq(0)
            sync += self.counter.eq(0)  # reset the counter
            sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from (self.op, self.a, self.b, self.o)

    def ports(self):
        return [*self]
427
428
def run_op(dut, a, b, op, inv_a=0):
    """Simulation generator: run one operation through `dut`, return o.data.

    Args:
        dut: the ALU under test.
        a, b: operand values.
        op: value driven onto `dut.op.insn_type`.
        inv_a: value driven onto `dut.op.invert_in` (default 0).

    Returns:
        The value read from `dut.o.data` once `dut.n.valid_o` is seen high.
    """
    # present operands and opcode, then raise valid_i and ready_i
    present = (
        dut.a.eq(a),
        dut.b.eq(b),
        dut.op.insn_type.eq(op),
        dut.op.invert_in.eq(inv_a),
        dut.n.ready_i.eq(0),
        dut.p.valid_i.eq(1),
        dut.n.ready_i.eq(1),
    )
    for statement in present:
        yield statement
    yield

    # wait for the ALU to accept our input data
    while not (yield dut.p.ready_o):
        yield

    # input accepted: drop valid_i and clear the input side
    withdraw = (
        dut.p.valid_i.eq(0),
        dut.a.eq(0),
        dut.b.eq(0),
        dut.op.insn_type.eq(0),
        dut.op.invert_in.eq(0),
    )
    for statement in withdraw:
        yield statement

    # wait for the ALU to present the output data
    while not (yield dut.n.valid_o):
        yield

    # latch the result and lower ready_i
    result = yield dut.o.data
    yield dut.n.ready_i.eq(0)

    return result
458
459
def alu_sim(dut):
    """Simulation generator: run a fixed series of operations through `dut`.

    Each case is run with `run_op`; the result is printed and then checked
    against the expected value.
    """
    # (label, a, b, opcode, inv_a, expected result)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_CMP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )
    for label, a, b, opcode, inv_a, expected in cases:
        result = yield from run_op(dut, a, b, opcode, inv_a=inv_a)
        print(label, result)
        assert (result == expected)
482
483
def test_alu():
    """Simulate a 16-bit ALU with `alu_sim`, then dump it as RTLIL.

    Writes test_alusim.gtkw, test_alusim.vcd and test_alu.il.
    """
    alu = ALU(width=16)
    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)

    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    rtlil_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
492
493
def test_alu_parallel():
    """Exercise a 16-bit ALU with independent producer/consumer processes.

    Compare with the sequential test implementation, `test_alu`: here the
    producer does not wait for a result before presenting the next input.
    Writes test_alu_parallel.gtkw and test_alu_parallel.vcd.
    """
    m = Module()
    dut = ALU(width=16)
    m.submodules.alu = dut
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        # present input data and assert valid_i
        for statement in (
            dut.a.eq(a),
            dut.b.eq(b),
            dut.op.insn_type.eq(op),
            dut.op.invert_in.eq(inv_a),
            dut.p.valid_i.eq(1),
        ):
            yield statement
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        for statement in (
            dut.p.valid_i.eq(0),
            dut.a.eq(0),
            dut.b.eq(0),
            dut.op.insn_type.eq(0),
            dut.op.invert_in.eq(0),
        ):
            yield statement

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read result
        result = yield dut.o.data
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def expect(value):
        # receive one result and check it
        result = yield from receive()
        assert (result == value)

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, MicrOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, MicrOp.OP_MUL_L64)
        # (-5) + 3
        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1)
        yield
        # NOP: takes the ALU's zero-delay path, on which the data output
        # is driven from the subtractor (5 - 3)
        yield from send(5, 3, MicrOp.OP_NOP)
        yield
        yield
        # 13 >> 2
        yield from send(13, 2, MicrOp.OP_SHR)
        # sign extend 13
        yield from send(13, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits), taken from the first operand
        yield from send(0x80, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits), taken from the second operand
        yield from send(2, 0x80, MicrOp.OP_EXTSWSLI)

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results is preserved
        yield
        # 5 + 3 = 8
        yield from expect(8)
        # 2 * 3 = 6
        yield from expect(6)
        yield
        yield
        # (-5) + 3 = -2
        yield from expect(65533)  # unsigned equivalent to -2
        # zero-delay operation: data output is 5 - 3 = 2
        # this, and the previous result, will be received back-to-back
        # (check the output waveform to see this)
        yield from expect(2)
        yield
        yield
        # 13 >> 2 = 3
        yield from expect(3)
        # sign extend 13 = 13
        yield from expect(13)
        # sign extend -128 (8 bits) = -128 (16 bits)
        yield from expect(0xFF80)
        # sign extend -128 (8 bits) = -128 (16 bits)
        yield from expect(0xFF80)

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    with sim.write_vcd("test_alu_parallel.vcd"):
        sim.run()
608
609
def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
                   pysim=True):
    """Common function to write the GTKWave documents for this module.

    Args:
        gtkw_name: output file name; the matching VCD name is derived by
            replacing '.gtkw' with '.vcd'.
        clk_period: clock period passed on to `write_gtkw`.
        sub_module: name of the DUT sub-module below the simulator's top
            module, or None when the DUT itself is 'top'.
        pysim: selects the trace name used for `op__insn_type` (without a
            bit range when true, with '[6:0]' otherwise).
    """
    # the opcode trace is named differently depending on the `pysim` flag
    if pysim:
        insn_type_trace = 'op__insn_type'
    else:
        insn_type_trace = 'op__insn_type[6:0]'

    gtkwave_desc = [
        'clk',
        'i1[15:0]',
        'i2[15:0]',
        insn_type_trace,
        'op__invert_in',
        'valid_i',
        'ready_o',
        'valid_o',
        'ready_i',
        'alu_o[15:0]',
    ]

    # determine the module name of the DUT
    if sub_module is None:
        module = 'top'
    else:
        module = nmigen_sim_top_module + sub_module

    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
    write_gtkw(gtkw_name, vcd_name, gtkwave_desc, module=module,
               loc=__file__, clk_period=clk_period, base='signed')
632
633
if __name__ == "__main__":
    # run both ALU simulations when executed as a script
    for run_test in (test_alu, test_alu_parallel):
        run_test()

    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)