Disable data value output on NOP
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
3 This ALU is *deliberately* designed to add in (unnecessary) delays into
4 different operations so as to be able to test the 6600-style matrices
5 and the CompUnits. Countdown timers wait for (defined) periods before
6 indicating that the output is valid
7
8 A "real" integer ALU would place the answers onto the output bus after
9 only one cycle (sync)
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmutil.extend import exts
18 from nmutil.gtkw import write_gtkw
19
20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
23 is_engine_pysim)
24
25 from soc.decoder.decode2execute1 import Data
26 from soc.decoder.power_enums import MicrOp, Function, CryIn
27
28 from soc.fu.alu.alu_input_record import CompALUOpSubset
29 from soc.fu.cr.cr_input_record import CompCROpSubset
30
31 import operator
32
33
class Adder(Elaboratable):
    """Combinatorial adder with an optional bitwise inversion of operand a.

    Ports:

    * invert_in: when set, the complement of ``a`` is added instead of ``a``
    * a, b: the two operands, ``width`` bits each
    * o: the result, ``width`` bits (signal name "add_o")
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()
        # pick either a or its bitwise complement as the first operand
        first_operand = Mux(self.invert_in, ~self.a, self.a)
        m.d.comb += self.o.eq(first_operand + self.b)
        return m
48
49
class Subtractor(Elaboratable):
    """Combinatorial subtractor: drives ``o`` with ``a - b``.

    ``a``, ``b`` and ``o`` are all ``width`` bits; the output signal is
    named "sub_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
60
61
class Multiplier(Elaboratable):
    """Combinatorial multiplier: drives ``o`` with ``a * b``.

    ``a``, ``b`` and ``o`` are all ``width`` bits; the output signal is
    named "mul_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
72
73
class Shifter(Elaboratable):
    """Combinatorial right-shifter: drives ``o`` with ``a >> b``.

    The shift amount ``b`` is first ANDed with an all-ones constant of
    ``width`` bits and stored in the intermediate signal ``btrunc``.
    The output signal is named "shf_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()
        # NOTE(review): the mask is as wide as b itself, so it does not
        # appear to shorten the shift amount -- confirm the intent
        all_ones = (1 << self.width) - 1
        btrunc = Signal(self.width)
        m.d.comb += [
            btrunc.eq(self.b & Const(all_ones)),
            self.o.eq(self.a >> btrunc),
        ]
        return m
87
88
class SignExtend(Elaboratable):
    """Sign-extension unit: drives ``o`` with ``exts(a, 8, width)``.

    Going by the tests in this file, an input of 0x80 produces 0xFF80
    when ``width`` is 16.  The output signal is named "exts_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.o = Signal(width, name="exts_o")

    def elaborate(self, platform):
        m = Module()
        extended = exts(self.a, 8, self.width)
        m.d.comb += self.o.eq(extended)
        return m
99
100
class Dummy:
    """Empty attribute holder.

    The ALU classes in this file hang signals off instances of this class
    (``p.data_i``, ``n.data_o``, ...) so that they look like the nmutil
    pipeline API.
    """
103
104
class DummyALU(Elaboratable):
    """Minimal three-input "ALU" that simply copies input ``a`` to ``o``.

    The handshake signals are arranged under ``p`` (input side: valid_i,
    ready_o) and ``n`` (output side: valid_o, ready_i) so that the class
    looks like the nmutil pipeline API.  The operation record is a
    CompCROpSubset, which this class stores and exposes but never decodes.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        self.a, self.b, self.c = i[0], i[1], i[2]
        # single output, also reachable by index through self.out
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the handshake logic; returns the nmigen Module."""
        m = Module()

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # the "result" is just a registered copy of input a;
                # go_now requests valid_o straight away (see below)
                m.d.sync += self.o.eq(self.a)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (this does not look at ready_i)
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the operation record's ports, then a, b, c and o."""
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return everything yielded by ``__iter__`` as a list."""
        return list(self)
179
180
class ALU(Elaboratable):
    """Fake multi-cycle ALU with a valid/ready handshake on both sides.

    Operations, selected by ``op.insn_type``:

    * OP_MUL_L64: registers the multiplier output, loads the counter with 5
    * OP_ADD: registers the adder output, loads the counter with 3
    * OP_SHR, OP_EXTS, OP_EXTSWSLI: register the shifter / sign-extender
      output, load the counter with 1
    * anything else: zero-delay (``go_now``), the subtractor output is
      passed straight through

    Outputs are two Data records: ``o`` (``width`` bits, "alu_o") and
    ``cr`` (3 bits, "alu_cr"), each with ``data`` and ``ok`` fields.
    ``o.ok`` is raised whenever the current ``op.insn_type`` is not OP_NOP.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # two operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # two outputs (result and condition register), also reachable
        # by index through self.out
        out = []
        out.append(Data(width, name="alu_o"))
        out.append(Data(3, name="alu_cr"))
        self.out = Array(out)
        self.o = self.out[0]
        self.cr = self.out[1]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the arithmetic sub-modules and the sequencer around them."""
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)
        ext_sign = SignExtend(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub
        m.submodules.ext_sign = ext_sign

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]
        # EXTS sign extends the first input
        with m.If(self.op.insn_type == MicrOp.OP_EXTS):
            m.d.comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
            m.d.comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted while the counter sits at one
            m.d.comb += self.n.valid_o.eq(alu_done)

        # registered copy of the selected sub-module result, presented
        # on o.data once the counter reaches one
        alu_r = Signal(self.width)

        # condition register output enable
        cr_ok_r = Signal()

        # NOP doesn't output anything
        # NOTE(review): o.ok follows the *current* insn_type input, it is
        # not registered alongside alu_r -- confirm this is intended
        with m.If(self.op.insn_type != MicrOp.OP_NOP):
            m.d.comb += self.o.ok.eq(1)
        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += alu_r.eq(ext_sign.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += alu_r.eq(ext_sign.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the counter with 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT to take 1, straight away
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD to take 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # EXTS to take 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += self.counter.eq(1)
                # EXTSWSLI to take 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += self.counter.eq(1)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

                # store rc bit, to enable cr output later
                m.d.sync += cr_ok_r.eq(self.op.rc.rc)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: count down, except that the final step from one
            # to zero waits for ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            # zero-delay: data only when o.ok is set, taken from the
            # subtractor; cr.ok comes directly from the rc input
            with m.If(self.o.ok):
                m.d.comb += self.o.data.eq(sub.o)
            m.d.comb += self.cr.ok.eq(self.op.rc.rc)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            with m.If(self.o.ok):
                m.d.comb += self.o.data.eq(alu_r)
            m.d.comb += self.cr.ok.eq(cr_ok_r)

        # determine condition register bits based on the data output value
        with m.If(self.cr.ok):
            # all bits clear -> 0b001
            with m.If(~self.o.data.any()):
                m.d.comb += self.cr.data.eq(0b001)
            # top bit set -> 0b010
            with m.Elif(self.o.data[-1]):
                m.d.comb += self.cr.data.eq(0b010)
            # anything else -> 0b100
            with m.Else():
                m.d.comb += self.cr.data.eq(0b100)

        return m

    def __iter__(self):
        """Yield the op record's ports, a, b, the ports of o, and the
        four handshake signals.
        """
        # NOTE(review): the ports of self.cr are not yielded here, so
        # ports() does not list the condition register output -- confirm
        # whether that is deliberate
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield from self.o.ports()
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return everything yielded by ``__iter__`` as a list."""
        return list(self)
352
353
class BranchOp(Elaboratable):
    """Two-input comparison: ``o`` is 1 when ``op(a, b)`` holds, else 0.

    ``op`` is a two-argument callable applied to the input signals
    (BranchALU passes functions from the ``operator`` module).
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        condition = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(condition, 1, 0))
        return m
365
366
class BranchALU(Elaboratable):
    """Fake branch-compare ALU with a valid/ready handshake.

    The 2-bit ``op`` selects which comparison result is registered into
    ``o``: 0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal
    (the order of the BranchOp list in ``elaborate``).  Accepting an
    operation loads the counter with 5.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        # two operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single output, also reachable by index through self.out
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        """Build the four comparators and the handshake logic."""
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparator sees both operands all of the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        go_now = Signal(reset_less=True)  # testing no-delay ALU
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                with m.Switch(self.op):
                    # one case per comparator, numbered in list order
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                # branch to take 5 cycles (fake)
                m.d.sync += self.counter.eq(5)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        # (go_now is never driven in this class: its only assignment is
        # commented out above)
        with m.If((self.counter == 1) | go_now):
            # flag the output as valid (this does not look at ready_i)
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield op, a, b and o."""
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        """Return everything yielded by ``__iter__`` as a list."""
        return list(self)
450
451
def run_op(dut, a, b, op, inv_a=0):
    """Simulation coroutine: push one operation through ``dut``.

    Drives the operands, the instruction type and the invert flag, raises
    valid_i and ready_i, waits for ready_o, clears the inputs again, waits
    for valid_o, and finally reads ``dut.o.data``.

    :param dut: the ALU under test
    :param a: value for the first operand
    :param b: value for the second operand
    :param op: value for ``op.insn_type``
    :param inv_a: value for ``op.invert_in``
    :returns: the value read from ``dut.o.data``
    """
    # present the operation; ready_i is written 0 and then 1 before the
    # first clock, exactly as listed
    present_inputs = (
        dut.a.eq(a),
        dut.b.eq(b),
        dut.op.insn_type.eq(op),
        dut.op.invert_in.eq(inv_a),
        dut.n.ready_i.eq(0),
        dut.p.valid_i.eq(1),
        dut.n.ready_i.eq(1),
    )
    for statement in present_inputs:
        yield statement
    yield

    # wait for the ALU to accept our input data
    while True:
        accepted = yield dut.p.ready_o
        if accepted:
            break
        yield

    # withdraw the request and zero all of the inputs
    clear_inputs = (
        dut.p.valid_i.eq(0),
        dut.a.eq(0),
        dut.b.eq(0),
        dut.op.insn_type.eq(0),
        dut.op.invert_in.eq(0),
    )
    for statement in clear_inputs:
        yield statement

    # wait for the ALU to present the output data
    while True:
        output_valid = yield dut.n.valid_o
        if output_valid:
            break
        yield

    # latch the result and lower ready_i
    result = yield dut.o.data
    yield dut.n.ready_i.eq(0)

    return result
481
482
def alu_sim(dut):
    """Simulation coroutine: run a fixed list of operations through ``dut``
    one after another with run_op, printing and checking every result.
    """
    # (label, a, b, instruction type, invert flag, expected result)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_CMP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )
    for label, a, b, insn_type, invert, expected in cases:
        result = yield from run_op(dut, a, b, insn_type, inv_a=invert)
        print(label, result)
        assert result == expected
505
506
def test_alu():
    """Simulate a 16-bit ALU with the sequential alu_sim stimulus.

    Writes the GTKWave document "test_alusim.gtkw", the waveform
    "test_alusim.vcd" and the RTLIL conversion "test_alu.il".
    """
    dut = ALU(width=16)
    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
    processes = {"sync": alu_sim(dut)}
    run_simulation(dut, processes, vcd_name='test_alusim.vcd')

    # also convert the design to RTLIL
    il_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(il_text)
515
516
def test_alu_parallel():
    """Simulate a 16-bit ALU with independent producer and consumer
    processes.

    The producer presents operations without waiting for their results;
    the consumer collects and checks the results in the same order.
    Writes "test_alu_parallel.gtkw" and "test_alu_parallel.vcd".
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0, rc=0):
        """Present one operation to the ALU and wait for ready_o."""
        # present input data and assert valid_i
        yield dut.a.eq(a)
        yield dut.b.eq(b)
        yield dut.op.insn_type.eq(op)
        yield dut.op.invert_in.eq(inv_a)
        yield dut.op.rc.rc.eq(rc)
        yield dut.p.valid_i.eq(1)
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        yield dut.p.valid_i.eq(0)
        yield dut.a.eq(0)
        yield dut.b.eq(0)
        yield dut.op.insn_type.eq(0)
        yield dut.op.invert_in.eq(0)
        yield dut.op.rc.rc.eq(0)

    def receive():
        """Wait for valid_o, then return the tuple (o.data, cr.data)."""
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read results
        result = yield dut.o.data
        cr = yield dut.cr.data
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result, cr

    def producer():
        """Stimulus process: send the test operations in order."""
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, MicrOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, MicrOp.OP_MUL_L64, rc=1)
        # (-6) + 3
        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1, rc=1)
        yield
        # 5 - 3
        # note that this is a zero-delay operation
        yield from send(5, 3, MicrOp.OP_CMP)
        yield
        yield
        # NOP
        yield from send(5, 3, MicrOp.OP_NOP)
        # 13 >> 2
        yield from send(13, 2, MicrOp.OP_SHR)
        # sign extend 13
        yield from send(13, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits)
        yield from send(0x80, 2, MicrOp.OP_EXTS, rc=1)
        # sign extend -128 (8 bits), passed as the second operand
        yield from send(2, 0x80, MicrOp.OP_EXTSWSLI)

    def consumer():
        """Checker process: receive each result and compare it."""
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results are preserved
        yield
        # 5 + 3 = 8
        result = yield from receive()
        assert result[0] == 8
        # 2 * 3 = 6
        result = yield from receive()
        assert result == (6, 0b100)
        yield
        yield
        # (-6) + 3 = -3
        result = yield from receive()
        assert result == (65533, 0b010)  # unsigned equivalent to -3
        # 5 - 3 = 2
        # note that this is a zero-delay operation
        # this, and the previous result, will be received back-to-back
        # (check the output waveform to see this)
        result = yield from receive()
        assert result[0] == 2
        yield
        yield
        # NOP: the handshake still completes, the values are not checked
        yield from receive()
        # 13 >> 2 = 3
        result = yield from receive()
        assert result[0] == 3
        # sign extend 13 = 13
        result = yield from receive()
        assert result[0] == 13
        # sign extend -128 (8 bits) = -128 (16 bits)
        result = yield from receive()
        assert result == (0xFF80, 0b010)
        # sign extend -128 (8 bits, second operand) = -128 (16 bits)
        result = yield from receive()
        assert result[0] == 0xFF80

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd("test_alu_parallel.vcd")
    with sim_writer:
        sim.run()
638
639
def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
                   pysim=True):
    """Write the GTKWave document shared by the ALU tests in this module.

    :param gtkw_name: output file name; the matching VCD file name is
                      derived by replacing '.gtkw' with '.vcd'
    :param clk_period: passed through to write_gtkw
    :param sub_module: when given, the traces are looked up under
                       nmigen_sim_top_module + sub_module instead of 'top'
    :param pysim: selects the trace name used for op__insn_type (with or
                  without the explicit [6:0] bit range)
    """
    if pysim:
        insn_type_trace = 'op__insn_type'
    else:
        insn_type_trace = 'op__insn_type[6:0]'

    traces = [
        'clk',
        'i1[15:0]',
        'i2[15:0]',
        insn_type_trace,
        'op__invert_in',
        'valid_i',
        'ready_o',
        'valid_o',
        'ready_i',
        'alu_o[15:0]',
        'alu_o_ok',
        'alu_cr[2:0]',
        'alu_cr_ok',
    ]

    # determine the module name of the DUT
    dut_module = ('top' if sub_module is None
                  else nmigen_sim_top_module + sub_module)

    write_gtkw(gtkw_name, gtkw_name.replace('.gtkw', '.vcd'), traces,
               module=dut_module, loc=__file__, clk_period=clk_period,
               base='signed')
665
666
# When run as a script, execute both ALU simulations (sequential, then
# parallel producer/consumer).
if __name__ == "__main__":
    test_alu()
    test_alu_parallel()

    # disabled: RTLIL conversion of the branch ALU
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)