83354d44f16cb8fc105f98e6e20c57d8c302986d
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
3 This ALU is *deliberately* designed to add in (unnecessary) delays into
4 different operations so as to be able to test the 6600-style matrices
5 and the CompUnits. Countdown timers wait for (defined) periods before
6 indicating that the output is valid
7
8 A "real" integer ALU would place the answers onto the output bus after
9 only one cycle (sync)
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmutil.extend import exts
18 from nmutil.gtkw import write_gtkw
19
20 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
21 # Also, check out the cxxsim nmigen branch, and latest yosys from git
22 from nmutil.sim_tmp_alternative import (Simulator, nmigen_sim_top_module,
23 is_engine_pysim)
24
25 from soc.decoder.decode2execute1 import Data
26 from soc.decoder.power_enums import MicrOp, Function, CryIn
27
28 from soc.fu.alu.alu_input_record import CompALUOpSubset
29 from soc.fu.cr.cr_input_record import CompCROpSubset
30
31 import operator
32
33
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of input ``a``.

    ``o`` is driven with ``a + b`` normally, or with ``(~a) + b`` while
    ``invert_in`` is set.  No carry-in is added in either case.
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()
        # choose the (possibly inverted) first operand, then add just once
        first_operand = Mux(self.invert_in, ~self.a, self.a)
        m.d.comb += self.o.eq(first_operand + self.b)
        return m
48
49
class Subtractor(Elaboratable):
    """Combinatorial subtractor: drives the ``width``-bit ``o`` with ``a - b``."""

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
60
61
class Multiplier(Elaboratable):
    """Combinatorial multiplier: drives the ``width``-bit ``o`` with ``a * b``."""

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
72
73
class Shifter(Elaboratable):
    """Combinatorial right-shifter: drives ``o`` with ``a >> b``.

    The shift amount is first copied into a ``width``-bit intermediate
    signal after being ANDed with an all-ones constant of ``width`` bits.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()
        # constant with the low ``width`` bits set
        all_ones = Const((1 << self.width) - 1)
        # masked copy of the shift amount (signal name kept as "btrunc")
        btrunc = Signal(self.width)
        m.d.comb += btrunc.eq(self.b & all_ones)
        m.d.comb += self.o.eq(self.a >> btrunc)
        return m
87
88
class SignExtend(Elaboratable):
    """Drives ``o`` with ``exts(a, 8, width)``.

    NOTE(review): presumably this sign-extends the low 8 bits of ``a`` to
    ``width`` bits (the parallel test in this file expects 0x80 to become
    0xFF80 at width 16) -- confirm against nmutil.extend.exts.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.o = Signal(width, name="exts_o")

    def elaborate(self, platform):
        m = Module()
        extended = exts(self.a, 8, self.width)
        m.d.comb += self.o.eq(extended)
        return m
99
100
class Dummy:
    """Empty attribute holder, used to build pipeline-API look-alike objects."""
103
104
class DummyALU(Elaboratable):
    """Stand-in ALU that copies input ``a`` to the output ``o``.

    ``p`` (input side) and ``n`` (output side) are plain attribute holders
    carrying valid/ready handshake signals, arranged to look like the
    nmutil pipeline API.  Inputs ``b`` and ``c`` and the ``op`` record are
    declared but not used by ``elaborate``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # handshake signals: p = towards the producer, n = towards the consumer
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompCROpSubset()
        # three operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        i.append(Signal(width, name="i3"))
        self.i = Array(i)
        self.a, self.b, self.c = i[0], i[1], i[2]
        # single output, also reachable by index through self.out
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # capture input a as the result, and start the sequencer
                m.d.sync += self.o.eq(self.a)
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)

        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # set the output as valid if the recipient is ready for it
            m.d.sync += self.n.valid_o.eq(1)
        # NOTE: this block comes after the one above, so its valid_o
        # assignment takes priority when both conditions are true
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        # op record ports first, then the three inputs and the output
        # (the handshake signals are not included here)
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield self.c
        yield self.o

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
179
180
class ALU(Elaboratable):
    """Fake multi-cycle ALU with a valid/ready handshake on both sides.

    An operation is accepted while the sequencer is idle (counter == 0) and
    ``p.valid_i`` is set.  Depending on ``op.insn_type`` the counter is
    loaded with:

    * OP_MUL_L64: 5
    * OP_ADD: 3
    * OP_SHR, OP_EXTS, OP_EXTSWSLI: 1
    * anything else: nothing; the operation is "zero-delay", the handshake
      signals are passed straight through and the subtractor output is
      presented combinatorially.

    For the counted operations the result is registered on acceptance and
    presented on ``o.data`` only while the counter equals one, which is
    also when ``n.valid_o`` is asserted.  ``cr.data`` is derived from
    ``o.data``; ``cr.ok`` follows ``op.rc.rc``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # handshake signals: p = towards the producer, n = towards the consumer
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")
        # two operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # two outputs: the result, and a 3-bit condition register value
        out = []
        out.append(Data(width, name="alu_o"))
        out.append(Data(3, name="alu_cr"))
        self.out = Array(out)
        self.o = self.out[0]
        self.cr = self.out[1]
        self.width = width
        # more "look like nmutil pipeline API"
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)
        ext_sign = SignExtend(self.width)

        m.submodules.add = add
        m.submodules.mul = mul
        m.submodules.shf = shf
        m.submodules.sub = sub
        m.submodules.ext_sign = ext_sign

        # really should not activate absolutely all ALU inputs like this
        for mod in [add, mul, shf, sub]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]
        # EXTS sign extends the first input
        with m.If(self.op.insn_type == MicrOp.OP_EXTS):
            m.d.comb += ext_sign.a.eq(self.a)
        # EXTSWSLI sign extends the second input
        with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
            m.d.comb += ext_sign.a.eq(self.b)

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        go_now = Signal(reset_less=True)  # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # with a combinatorial, no-delay ALU, just pass through
            # the handshake signals to the other side
            m.d.comb += self.p.ready_o.eq(self.n.ready_i)
            m.d.comb += self.n.valid_o.eq(self.p.valid_i)
        with m.Else():
            # sequential ALU handshake:
            # ready_o is asserted whenever the ALU is idle
            m.d.comb += self.p.ready_o.eq(alu_idle)
            # valid_o is the internally generated "done" indication, above
            m.d.comb += self.n.valid_o.eq(alu_done)

        # register holding the result of a counted (non zero-delay) operation
        alu_r = Signal(self.width)

        # output masks
        # NOP and ILLEGAL don't output anything
        with m.If((self.op.insn_type != MicrOp.OP_NOP) &
                  (self.op.insn_type != MicrOp.OP_ILLEGAL)):
            m.d.comb += self.o.ok.eq(1)
        # CR is output when rc bit is active
        m.d.comb += self.cr.ok.eq(self.op.rc.rc)

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the output right now
                with m.If(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += alu_r.eq(shf.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += alu_r.eq(ext_sign.o)
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += alu_r.eq(ext_sign.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these are fake, just something to test

                # MUL: load the counter with 5
                with m.If(self.op.insn_type == MicrOp.OP_MUL_L64):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: load the counter with 1 (done straight away)
                with m.Elif(self.op.insn_type == MicrOp.OP_SHR):
                    m.d.sync += self.counter.eq(1)
                # ADD: load the counter with 3
                with m.Elif(self.op.insn_type == MicrOp.OP_ADD):
                    m.d.sync += self.counter.eq(3)
                # EXTS: load the counter with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTS):
                    m.d.sync += self.counter.eq(1)
                # EXTSWSLI: load the counter with 1
                with m.Elif(self.op.insn_type == MicrOp.OP_EXTSWSLI):
                    m.d.sync += self.counter.eq(1)
                # others to take no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: count down while not yet done, and take the final
            # step from one to zero only when the recipient is ready
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.data.eq(sub.o)
        # only present the result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.data.eq(alu_r)

        # determine condition register bits based on the data output value
        with m.If(~self.o.data.any()):
            # result is zero
            m.d.comb += self.cr.data.eq(0b001)
        with m.Elif(self.o.data[-1]):
            # top bit of the result is set
            m.d.comb += self.cr.data.eq(0b010)
        with m.Else():
            # non-zero, top bit clear
            m.d.comb += self.cr.data.eq(0b100)

        return m

    def __iter__(self):
        # NOTE(review): the cr output is not yielded here, so it will not
        # be among the ports passed to rtlil.convert -- confirm intended
        yield from self.op.ports()
        yield self.a
        yield self.b
        yield from self.o.ports()
        yield self.p.valid_i
        yield self.p.ready_o
        yield self.n.valid_o
        yield self.n.ready_i

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
346
347
class BranchOp(Elaboratable):
    """Combinatorial comparison of ``a`` and ``b``.

    ``op`` is a two-argument callable (BranchALU passes functions from the
    ``operator`` module); ``o`` is driven with 1 when ``op(a, b)`` holds,
    and with 0 otherwise.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        condition = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(condition, 1, 0))
        return m
359
360
class BranchALU(Elaboratable):
    """Fake branch-compare ALU with a valid/ready handshake.

    The 2-bit ``op`` selects one of four comparisons of ``a`` and ``b``:
    0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal.  The
    selected comparison result is registered into ``o`` when the input is
    accepted, and the counter is loaded with 5.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        # handshake signals: p = towards the producer, n = towards the consumer
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        # two operand inputs, also reachable by index through self.i
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        # single output, also reachable by index through self.out
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width

    def elaborate(self, platform):
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparison unit sees both operands all of the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        go_now = Signal(reset_less=True)  # testing no-delay ALU
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                # (the case index matches the position in the list below)
                with m.Switch(self.op):
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                # branch to take 5 cycles (fake)
                m.d.sync += self.counter.eq(5)
                # go_now is never driven here (zero-delay path disabled):
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # set the output as valid if the recipient is ready for it
            m.d.sync += self.n.valid_o.eq(1)
        # NOTE: this block comes after the one above, so its valid_o
        # assignment takes priority when both conditions are true
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        # operation selector, both inputs, then the output
        # (the handshake signals are not included here)
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        """Return the signals produced by ``__iter__`` as a list."""
        return list(self)
444
445
def run_op(dut, a, b, op, inv_a=0):
    """Simulation process helper: run one operation and return its result.

    Presents the operands ``a`` and ``b``, the operation ``op`` and the
    ``inv_a`` flag to ``dut`` together with ``p.valid_i`` and ``n.ready_i``,
    waits for ``p.ready_o``, withdraws the inputs, waits for ``n.valid_o``
    and finally reads ``dut.o.data``.

    This is a generator, to be driven by the simulator with ``yield from``.
    Its return value is whatever the simulator sends back for the
    ``dut.o.data`` read.
    """
    # present the operands and the operation
    yield dut.a.eq(a)
    yield dut.b.eq(b)
    yield dut.op.insn_type.eq(op)
    yield dut.op.invert_in.eq(inv_a)
    # assert valid_i, and declare that we are ready to accept the result.
    # (ready_i was previously set to 0 and then to 1 with no clock tick in
    # between; only the final value is ever seen, so it is set just once)
    yield dut.p.valid_i.eq(1)
    yield dut.n.ready_i.eq(1)
    yield

    # wait for the ALU to accept our input data
    while not (yield dut.p.ready_o):
        yield

    # withdraw the request and clear all of the inputs
    yield dut.p.valid_i.eq(0)
    yield dut.a.eq(0)
    yield dut.b.eq(0)
    yield dut.op.insn_type.eq(0)
    yield dut.op.invert_in.eq(0)

    # wait for the ALU to present the output data
    while not (yield dut.n.valid_o):
        yield

    # latch the result and lower ready_i
    result = yield dut.o.data
    yield dut.n.ready_i.eq(0)

    return result
475
476
def alu_sim(dut):
    """Sequential test process: run a fixed list of operations, one at a
    time through ``run_op``, printing and checking each result.
    """
    # each entry: (printed label, a, b, operation, inv_a, expected result)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_CMP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )
    for label, a, b, op, inv_a, expected in cases:
        result = yield from run_op(dut, a, b, op, inv_a=inv_a)
        print(label, result)
        assert (result == expected)
499
500
def test_alu():
    """Run the sequential ALU simulation, then dump the design as RTLIL.

    Writes test_alusim.gtkw, test_alusim.vcd and test_alu.il to the
    current directory.
    """
    dut = ALU(width=16)
    write_alu_gtkw("test_alusim.gtkw", clk_period=10e-9)
    processes = {"sync": alu_sim(dut)}
    run_simulation(dut, processes, vcd_name='test_alusim.vcd')

    # convert to RTLIL text and save it
    rtlil_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
509
510
def test_alu_parallel():
    """Test the ALU with independent producer and consumer processes.

    The producer sends operations without waiting for their results, and
    the consumer collects and checks the results, each as its own sync
    process.  Writes test_alu_parallel.gtkw and test_alu_parallel.vcd.
    """
    # Compare with the sequential test implementation, above.
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    write_alu_gtkw("test_alu_parallel.gtkw", sub_module='alu',
                   pysim=is_engine_pysim())

    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0, rc=0):
        # present input data and assert valid_i
        yield dut.a.eq(a)
        yield dut.b.eq(b)
        yield dut.op.insn_type.eq(op)
        yield dut.op.invert_in.eq(inv_a)
        yield dut.op.rc.rc.eq(rc)
        yield dut.p.valid_i.eq(1)
        yield
        # wait for ready_o to be asserted
        while not (yield dut.p.ready_o):
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        yield dut.p.valid_i.eq(0)
        yield dut.a.eq(0)
        yield dut.b.eq(0)
        yield dut.op.insn_type.eq(0)
        yield dut.op.invert_in.eq(0)
        yield dut.op.rc.rc.eq(0)

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while not (yield dut.n.valid_o):
            yield
        # read results
        result = yield dut.o.data
        cr = yield dut.cr.data
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        # returned as a (data, condition register) pair
        return result, cr

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # 5 + 3
        yield from send(5, 3, MicrOp.OP_ADD)
        yield
        yield
        # 2 * 3
        yield from send(2, 3, MicrOp.OP_MUL_L64, rc=1)
        # (-6) + 3
        yield from send(5, 3, MicrOp.OP_ADD, inv_a=1, rc=1)
        yield
        # 5 - 3
        # note that this is a zero-delay operation
        yield from send(5, 3, MicrOp.OP_CMP)
        yield
        yield
        # NOP
        yield from send(5, 3, MicrOp.OP_NOP)
        # 13 >> 2
        yield from send(13, 2, MicrOp.OP_SHR)
        # sign extend 13
        yield from send(13, 2, MicrOp.OP_EXTS)
        # sign extend -128 (8 bits)
        yield from send(0x80, 2, MicrOp.OP_EXTS, rc=1)
        # sign extend -128 (8 bits), this time given as the second operand
        yield from send(2, 0x80, MicrOp.OP_EXTSWSLI)

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results is preserved
        yield
        # 5 + 3 = 8
        result = yield from receive()
        assert result[0] == 8
        # 2 * 3 = 6
        result = yield from receive()
        assert result == (6, 0b100)
        yield
        yield
        # (-6) + 3 = -3
        result = yield from receive()
        assert result == (65533, 0b010)  # unsigned equivalent to -3
        # 5 - 3 = 2
        # note that this is a zero-delay operation
        # this, and the previous result, will be received back-to-back
        # (check the output waveform to see this)
        result = yield from receive()
        assert result[0] == 2
        yield
        yield
        # NOP (result is received, but not checked)
        yield from receive()
        # 13 >> 2 = 3
        result = yield from receive()
        assert result[0] == 3
        # sign extend 13 = 13
        result = yield from receive()
        assert result[0] == 13
        # sign extend -128 (8 bits) = -128 (16 bits)
        result = yield from receive()
        assert result == (0xFF80, 0b010)
        # sign extend -128 (8 bits) = -128 (16 bits)
        result = yield from receive()
        assert result[0] == 0xFF80

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd("test_alu_parallel.vcd")
    with sim_writer:
        sim.run()
632
633
def write_alu_gtkw(gtkw_name, clk_period=1e-6, sub_module=None,
                   pysim=True):
    """Common function to write the GTKWave documents for this module

    gtkw_name: name of the GTKWave file to write; the matching VCD file
               name is derived by replacing '.gtkw' with '.vcd'
    clk_period: passed on unchanged to write_gtkw
    sub_module: when given, appended to nmigen_sim_top_module to form the
                module name of the DUT; otherwise the module is 'top'
    pysim: selects how the op__insn_type trace is named
           (without a bit range when True, with '[6:0]' otherwise)
    """
    if pysim:
        insn_type_trace = 'op__insn_type'
    else:
        insn_type_trace = 'op__insn_type[6:0]'
    traces = [
        'clk',
        'i1[15:0]', 'i2[15:0]',
        insn_type_trace, 'op__invert_in',
        'valid_i', 'ready_o', 'valid_o', 'ready_i',
        'alu_o[15:0]', 'alu_o_ok',
        'alu_cr[2:0]', 'alu_cr_ok',
    ]
    # determine the module name of the DUT
    if sub_module is None:
        module = 'top'
    else:
        module = nmigen_sim_top_module + sub_module
    vcd_name = gtkw_name.replace('.gtkw', '.vcd')
    write_gtkw(gtkw_name, vcd_name, traces, module=module,
               loc=__file__, clk_period=clk_period, base='signed')
659
660
if __name__ == "__main__":
    # run the sequential test first, then the parallel one
    for selftest in (test_alu, test_alu_parallel):
        selftest()

    # disabled: RTLIL dump of the branch ALU
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)