Add condition register (CR) output
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
3 This ALU is *deliberately* designed to add in (unnecessary) delays into
4 different operations so as to be able to test the 6600-style matrices
5 and the CompUnits. Countdown timers wait for (defined) periods before
6 indicating that the output is valid
7
8 A "real" integer ALU would place the answers onto the output bus after
9 only one cycle (sync)
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmutil.extend import exts
18 from nmutil.gtkw import write_gtkw
19
20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
23 is_engine_pysim)
24
25 from soc.decoder.decode2execute1 import Data
26 from soc.decoder.power_enums import MicrOp, Function, CryIn
27
28 from soc.fu.alu.alu_input_record import CompALUOpSubset
29 from soc.fu.cr.cr_input_record import CompCROpSubset
30
31 import operator
32
33
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of operand ``a``.

    Ports:

    * ``invert_in``: when set, ``a`` is bitwise-inverted before the addition
    * ``a``, ``b``: operands, ``width`` bits each
    * ``o``: the sum, ``width`` bits (explicitly named "add_o")
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        # o = ~a + b when inverting, o = a + b otherwise
        with m.If(self.invert_in):
            comb += self.o.eq(~self.a + self.b)
        with m.Else():
            comb += self.o.eq(self.a + self.b)
        return m
48
49
class Subtractor(Elaboratable):
    """Combinatorial subtractor: ``o = a - b``.

    All three ports are ``width`` bits; the output is explicitly
    named "sub_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        difference = self.a - self.b
        comb += self.o.eq(difference)
        return m
60
61
class Multiplier(Elaboratable):
    """Combinatorial multiplier: ``o = a * b``.

    All three ports are ``width`` bits, so only the low ``width`` bits
    of the product are kept.  The output is explicitly named "mul_o".
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        product = self.a * self.b
        comb += self.o.eq(product)
        return m
72
73
class Shifter(Elaboratable):
    """Combinatorial right-shifter: ``o = a >> b``.

    All ports are ``width`` bits; the output is explicitly named "shf_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        btrunc = Signal(self.width)
        # NOTE: the mask is ``width`` ones wide, the same width as ``b``,
        # so it does not actually discard any bits of the shift amount
        mask = Const((1 << self.width) - 1)
        comb += btrunc.eq(self.b & mask)
        comb += self.o.eq(self.a >> btrunc)
        return m
87
88
class SignExtend(Elaboratable):
    """Sign-extend the low 8 bits of ``a`` to ``width`` bits.

    The extension is delegated to ``nmutil.extend.exts``; the output
    is explicitly named "exts_o".
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.o = Signal(width, name="exts_o")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        # source field is 8 bits wide, result is the full ALU width
        extended = exts(self.a, 8, self.width)
        comb += self.o.eq(extended)
        return m
99
100
class Dummy:
    """Empty attribute holder.

    The ALU classes in this file hang ``data_i``, ``data_o``, ``ctx`` and
    the handshake signals off instances of this class so that they look
    like the nmutil pipeline API.
    """
103
104
class DummyALU(Elaboratable):
    """Three-input test ALU that simply copies input ``a`` to the output.

    The object is dressed up to look like an nmutil pipeline stage:
    ``p`` carries the incoming handshake (``valid_i`` / ``ready_o``) and
    input data, ``n`` the outgoing handshake (``valid_o`` / ``ready_i``)
    and output data.  Inputs are also available as the Array ``i`` and
    the single output as the Array ``out``.
    """

    def __init__(self, width):
        # make look like nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()
        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operands, named i1..i3
        inputs = [Signal(width, name="i%d" % idx) for idx in range(1, 4)]
        self.i = Array(inputs)
        self.a, self.b, self.c = inputs
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # valid input: only act if "ready" has not been raised yet
            with m.If(~self.p.ready_o):
                # raise "ready", capture input a, and start the sequence
                sync += self.p.ready_o.eq(1)

                sync += self.o.eq(self.a)
                comb += go_now.eq(1)
                sync += self.counter.eq(1)

        with m.Else():
            # input no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            sync += self.p.ready_o.eq(0)

        # fire the output when the counter reaches 1 (or immediately)
        with m.If((self.counter == 1) | go_now):
            sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            # recipient acknowledged: reset back to known-good.
            sync += self.n.valid_o.eq(0)
            sync += self.counter.eq(0)  # reset the counter
            sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from self.op.ports()
        for sig in (self.a, self.b, self.c, self.o):
            yield sig

    def ports(self):
        return list(self)
179
180
class ALU(Elaboratable):
    """Fake multi-cycle integer ALU with a condition register (CR) output.

    The object is dressed up to look like an nmutil pipeline stage:
    ``p`` carries the incoming handshake (``valid_i`` / ``ready_o``) and
    input data, ``n`` the outgoing handshake (``valid_o`` / ``ready_i``)
    and output data.

    Inputs: ``a`` and ``b`` (also the Array ``i``) and the operation
    record ``op``.  Outputs (also the Array ``out``):

    * ``o``: ``width``-bit result, a Data record (``data`` / ``ok``)
    * ``cr``: 3-bit condition register, a Data record (``data`` / ``ok``)

    Latencies are deliberately artificial: OP_MUL_L64 loads the counter
    with 5, OP_ADD with 3, OP_SHR / OP_EXTS / OP_EXTSWSLI with 1, and
    every other operation is zero-delay and outputs ``a - b``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        out = []
        out.append(Data(width, name="alu_o"))
        out.append(Data(3, name="alu_cr"))
        self.out = Array(out)
        self.o = self.out[0]
        self.cr = self.out[1]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)
        ext_sign = SignExtend(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub
        m.submodules.ext_sign = ext_sign

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]
        # EXTS sign extends the first input
        with m.If(self.op.insn_type == MicrOp.OP_EXTS):
            m.d.comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
            m.d.comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is asserted on the final cycle of the countdown
            m.d.comb += self.n.valid_o.eq(alu_done)

        # registered ALU result, presented once the countdown reaches one
        alu_r = Signal(self.width)

        # condition register output enable
        cr_ok_r = Signal()

        # NOP doesn't output anything
        with m.If(self.op.insn_type != MicrOp.OP_NOP):
            m.d.comb += self.o.ok.eq(1)
        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += alu_r.eq(ext_sign.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += alu_r.eq(ext_sign.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the countdown with 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: load the countdown with 1, done straight away
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: load the countdown with 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # EXTS: load the countdown with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += self.counter.eq(1)
                # EXTSWSLI: load the countdown with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += self.counter.eq(1)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

                # store rc bit, to enable cr output later
                m.d.sync += cr_ok_r.eq(self.op.rc.rc)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: count down; the final step from one to zero
            # only happens once the recipient asserts ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.data.eq(sub.o)
            m.d.comb += self.cr.ok.eq(self.op.rc.rc)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.data.eq(alu_r)
            m.d.comb += self.cr.ok.eq(cr_ok_r)

        # determine condition register bits based on the data output value:
        # 0b001 for a zero result, 0b010 when the top bit is set,
        # 0b100 otherwise
        with m.If(self.cr.ok):
            with m.If(~self.o.data.any()):
                m.d.comb += self.cr.data.eq(0b001)
            with m.Elif(self.o.data[-1]):
                m.d.comb += self.cr.data.eq(0b010)
            with m.Else():
                m.d.comb += self.cr.data.eq(0b100)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield from self.o.ports()
        # the CR output is part of the interface too: without this it is
        # not listed by ports() and so is not exported as a module port
        yield from self.cr.ports()
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return every interface signal of the ALU as a list."""
        return list(self)
350
351
class BranchOp(Elaboratable):
    """Combinatorial comparison unit.

    ``op`` is a two-argument callable (such as ``operator.gt``) applied
    to the inputs ``a`` and ``b``; the output ``o`` is 1 when the
    comparison holds and 0 otherwise.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        condition = self.op(self.a, self.b)
        comb += self.o.eq(Mux(condition, 1, 0))
        return m
363
364
class BranchALU(Elaboratable):
    """Fake branch-compare ALU.

    The 2-bit ``op`` selects one of four comparisons of ``a`` against
    ``b``: 0 is greater-than, 1 less-than, 2 equal, 3 not-equal.  The
    comparison result is captured when the input is accepted and the
    countdown is loaded with 5.

    The object is dressed up to look like an nmutil pipeline stage:
    ``p`` carries the incoming handshake (``valid_i`` / ``ready_o``),
    ``n`` the outgoing handshake (``valid_o`` / ``ready_i``).
    """

    def __init__(self, width):
        # make look like nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()
        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        # two operands, named i1 and i2
        inputs = [Signal(width, name="i%d" % idx) for idx in range(1, 3)]
        self.i = Array(inputs)
        self.a, self.b = inputs
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # one comparison unit per opcode, in opcode order
        comparisons = (
            ("bgt", operator.gt),
            ("blt", operator.lt),
            ("beq", operator.eq),
            ("bne", operator.ne),
        )
        units = [BranchOp(self.width, compare) for _, compare in comparisons]
        for (label, _), unit in zip(comparisons, units):
            setattr(m.submodules, label, unit)
        # every unit sees both operands
        for unit in units:
            comb += [
                unit.a.eq(self.a),
                unit.b.eq(self.b),
            ]

        go_now = Signal(reset_less=True)  # testing no-delay ALU
        with m.If(self.p.valid_i):
            # valid input: only act if "ready" has not been raised yet
            with m.If(~self.p.ready_o):
                # raise "ready" and initialise
                sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                with m.Switch(self.op):
                    for opcode, unit in enumerate(units):
                        with m.Case(opcode):
                            sync += self.o.eq(unit.o)
                # branch to take 5 cycles (fake)
                sync += self.counter.eq(5)
                #comb += go_now.eq(1)
        with m.Else():
            # input no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            sync += self.p.ready_o.eq(0)

        # fire the output when the counter reaches 1 (or immediately)
        with m.If((self.counter == 1) | go_now):
            sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            # recipient acknowledged: reset back to known-good.
            sync += self.n.valid_o.eq(0)
            sync += self.counter.eq(0)  # reset the counter
            sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        for sig in (self.op, self.a, self.b, self.o):
            yield sig

    def ports(self):
        return list(self)
448
449
def run_op(dut, a, b, op, inv_a=0):
    """Simulation helper: push one operation through ``dut``, return its result.

    Presents operands ``a`` / ``b``, the instruction type ``op`` and the
    invert flag ``inv_a``, waits for ``ready_o``, clears the inputs, waits
    for ``valid_o`` and finally reads ``dut.o.data``.  This is a generator
    intended to be driven by the simulator via ``yield from``.
    """
    # present the inputs and raise both handshake signals
    present = (
        dut.a.eq(a),
        dut.b.eq(b),
        dut.op.insn_type.eq(op),
        dut.op.invert_in.eq(inv_a),
        dut.n.ready_i.eq(0),
        dut.p.valid_i.eq(1),
        dut.n.ready_i.eq(1),
    )
    for stmt in present:
        yield stmt
    yield

    # wait for the ALU to accept our input data
    while True:
        accepted = yield dut.p.ready_o
        if accepted:
            break
        yield

    # withdraw the request and zero the inputs
    withdraw = (
        dut.p.valid_i.eq(0),
        dut.a.eq(0),
        dut.b.eq(0),
        dut.op.insn_type.eq(0),
        dut.op.invert_in.eq(0),
    )
    for stmt in withdraw:
        yield stmt

    # wait for the ALU to present the output data
    while True:
        presented = yield dut.n.valid_o
        if presented:
            break
        yield

    # latch the result and lower ready_i
    result = yield dut.o.data
    yield dut.n.ready_i.eq(0)

    return result
479
480
def alu_sim(dut):
    """Sequential ALU test: run a fixed list of operations one at a time.

    Each entry is pushed through ``run_op``; the result is printed and
    compared against the expected value.
    """
    # (label, a, b, op, inv_a, expected)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_CMP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )
    for label, a, b, op, inv_a, expected in cases:
        result = yield from run_op(dut, a, b, op, inv_a=inv_a)
        print(label, result)
        assert (result == expected)
503
504
def test_alu():
    """Run the sequential ALU simulation and dump the design as RTLIL.

    Writes test_alusim.gtkw, test_alusim.vcd and test_alu.il.
    """
    alu = ALU(width=16)
    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    rtlil_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
513
514
def test_alu_parallel():
    """Exercise the ALU with independent producer and consumer processes.

    Unlike the sequential test, the producer does not wait for a result
    before presenting the next input; the consumer collects the results
    at its own pace and checks them in order.  Writes
    test_alu_parallel.gtkw and test_alu_parallel.vcd.
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    dut = ALU(width=16)
    m.submodules.alu = dut
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0, rc=0):
        # present input data and assert valid_i
        present = (
            dut.a.eq(a),
            dut.b.eq(b),
            dut.op.insn_type.eq(op),
            dut.op.invert_in.eq(inv_a),
            dut.op.rc.rc.eq(rc),
            dut.p.valid_i.eq(1),
        )
        for stmt in present:
            yield stmt
        yield
        # wait for ready_o to be asserted
        while True:
            accepted = yield dut.p.ready_o
            if accepted:
                break
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        withdraw = (
            dut.p.valid_i.eq(0),
            dut.a.eq(0),
            dut.b.eq(0),
            dut.op.insn_type.eq(0),
            dut.op.invert_in.eq(0),
            dut.op.rc.rc.eq(0),
        )
        for stmt in withdraw:
            yield stmt

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while True:
            presented = yield dut.n.valid_o
            if presented:
                break
            yield
        # read results
        data = yield dut.o.data
        cr_bits = yield dut.cr.data
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return data, cr_bits

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # (a, b, op, extra keyword arguments, idle cycles afterwards)
        stimuli = (
            # 5 + 3
            (5, 3, MicrOp.OP_ADD, {}, 2),
            # 2 * 3
            (2, 3, MicrOp.OP_MUL_L64, {"rc": 1}, 0),
            # (-6) + 3
            (5, 3, MicrOp.OP_ADD, {"inv_a": 1, "rc": 1}, 1),
            # 5 - 3
            # note that this is a zero-delay operation
            (5, 3, MicrOp.OP_CMP, {}, 2),
            # NOP
            (5, 3, MicrOp.OP_NOP, {}, 0),
            # 13 >> 2
            (13, 2, MicrOp.OP_SHR, {}, 0),
            # sign extend 13
            (13, 2, MicrOp.OP_EXTS, {}, 0),
            # sign extend -128 (8 bits), first operand
            (0x80, 2, MicrOp.OP_EXTS, {"rc": 1}, 0),
            # sign extend -128 (8 bits), second operand
            (2, 0x80, MicrOp.OP_EXTSWSLI, {}, 0),
        )
        for a, b, op, extra, idle in stimuli:
            yield from send(a, b, op, **extra)
            for _ in range(idle):
                yield

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results are preserved
        # (expected data, expected cr, idle cycles afterwards)
        # cr is None when only the data is checked; both are None when
        # the result is received but not checked at all
        expectations = (
            # 5 + 3 = 8
            (8, None, 0),
            # 2 * 3 = 6
            (6, 0b100, 2),
            # (-6) + 3 = -3, i.e. 65533 as an unsigned 16-bit value
            (65533, 0b010, 0),
            # 5 - 3 = 2
            # note that this is a zero-delay operation
            # this, and the previous result, will be received back-to-back
            # (check the output waveform to see this)
            (2, None, 2),
            # NOP
            (None, None, 0),
            # 13 >> 2 = 3
            (3, None, 0),
            # sign extend 13 = 13
            (13, None, 0),
            # sign extend -128 (8 bits) = -128 (16 bits)
            (0xFF80, 0b010, 0),
            # sign extend -128 (8 bits) = -128 (16 bits)
            (0xFF80, None, 0),
        )
        yield
        for want_data, want_cr, idle in expectations:
            result = yield from receive()
            if want_cr is not None:
                assert result == (want_data, want_cr)
            elif want_data is not None:
                assert result[0] == want_data
            for _ in range(idle):
                yield

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    with sim.write_vcd("test_alu_parallel.vcd"):
        sim.run()
636
637
def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
                   pysim=True):
    """Common function to write the GTKWave documents for this module

    ``gtkw_name`` is the output file; the matching VCD name is derived by
    swapping the '.gtkw' suffix for '.vcd'.  ``sub_module`` names the DUT
    when it is instantiated below the simulator's top module.  ``pysim``
    selects how the instruction-type trace is spelled.
    """
    # the instruction-type trace carries an explicit bit range when not
    # running under pysim
    if pysim:
        insn_type_trace = 'op__insn_type'
    else:
        insn_type_trace = 'op__insn_type[6:0]'
    traces = [
        'clk',
        'i1[15:0]',
        'i2[15:0]',
        insn_type_trace,
        'op__invert_in',
        'valid_i',
        'ready_o',
        'valid_o',
        'ready_i',
        'alu_o[15:0]',
        'alu_o_ok',
        'alu_cr[2:0]',
        'alu_cr_ok',
    ]
    # determine the module name of the DUT
    if sub_module is None:
        dut_module = 'top'
    else:
        dut_module = nmigen_sim_top_module + sub_module
    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
    write_gtkw(gtkw_name, vcd_name, traces, module=dut_module,
               loc=__file__, clk_period=clk_period, base='signed')
663
664
# When executed as a script, run the sequential and then the parallel
# ALU simulation.
if __name__ == "__main__":
    test_alu()
    test_alu_parallel()

    # Disabled: convert the BranchALU to RTLIL and save it to
    # test_branch_alu.il.
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)