first attempt at 3) of
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
This ALU is *deliberately* designed to add (unnecessary) delays to
different operations so as to be able to test the 6600-style matrices
and the CompUnits. Countdown timers wait for (defined) periods before
indicating that the output is valid.

A "real" integer ALU would place the answers onto the output bus after
only one cycle (sync).
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17
18 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
19 # Also, check out the cxxsim nmigen branch, and latest yosys from git
20 from nmutil.sim_tmp_alternative import Simulator
21
22 from soc.decoder.power_enums import MicrOp, Function, CryIn
23
24 from soc.fu.alu.alu_input_record import CompALUOpSubset
25 from soc.fu.cr.cr_input_record import CompCROpSubset
26
27 import operator
28
29
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of operand ``a``.

    Attributes:
        invert_in: 1-bit select; when set, ``~a`` is added to ``b``
            instead of ``a``.
        a, b: operands, ``width`` bits each.
        o: result, ``width`` bits (signal name ``add_o``).
    """

    def __init__(self, width):
        self.invert_in = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        """Build the adder logic; ``platform`` is not used."""
        m = Module()

        # the two candidate results; invert_in chooses which drives the output
        inverted_sum = (~self.a) + self.b
        plain_sum = self.a + self.b

        with m.If(self.invert_in):
            m.d.comb += self.o.eq(inverted_sum)
        with m.Else():
            m.d.comb += self.o.eq(plain_sum)

        return m
44
45
class Subtractor(Elaboratable):
    """Combinatorial subtractor: ``o = a - b``.

    All three ports are ``width`` bits wide; the output signal is
    named ``sub_o``.
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        """Build the subtractor logic; ``platform`` is not used."""
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
56
57
class Multiplier(Elaboratable):
    """Combinatorial multiplier: ``o = a * b``.

    All three ports are ``width`` bits wide; the output signal is
    named ``mul_o``.
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        """Build the multiplier logic; ``platform`` is not used."""
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
68
69
class Shifter(Elaboratable):
    """Combinatorial right-shifter: ``o = a >> b``.

    All three ports are ``width`` bits wide; the output signal is
    named ``shf_o``.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        """Build the shifter logic; ``platform`` is not used."""
        m = Module()

        # intermediate copy of the shift amount.  NOTE: the mask covers all
        # ``width`` bits, the same width as ``b``, so it leaves ``b`` unchanged
        btrunc = Signal(self.width)
        all_ones = Const((1 << self.width)-1)

        m.d.comb += [
            btrunc.eq(self.b & all_ones),
            self.o.eq(self.a >> btrunc),
        ]
        return m
83
84
class Dummy:
    """Empty attribute container.

    The ALU classes in this file hang signals and nested containers off
    instances of this class in order to resemble the nmutil pipeline API.
    """
87
88
class DummyALU(Elaboratable):
    """Three-operand stand-in ALU that copies operand ``a`` to the output.

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and on
    ``n`` (``valid_o`` / ``ready_i``).  Operands are ``a``, ``b`` and ``c``
    (also reachable as the Array ``i``); the result is ``o`` (also
    reachable as ``out[0]``).  ``op`` is a CompCROpSubset record.
    """

    def __init__(self, width):
        # bare containers, laid out to look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals on the input (p) and output (n) sides
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompCROpSubset()

        # three operand signals, named i1..i3
        operands = [Signal(width, name="i%d" % idx) for idx in (1, 2, 3)]
        self.i = Array(operands)
        self.a, self.b, self.c = operands

        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # expose the same objects through the pipeline-API-style containers
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the handshake state machine; ``platform`` is not used."""
        m = Module()

        # combinatorial pulse, raised in the cycle the input is accepted
        go_now = Signal(reset_less=True)

        with m.If(self.p.valid_i):
            # valid input that has not been acknowledged yet
            with m.If(~self.p.ready_o):
                # acknowledge it, capture operand a as the result and
                # set the counter to 1
                m.d.sync += [
                    self.p.ready_o.eq(1),
                    self.o.eq(self.a),
                    self.counter.eq(1),
                ]
                m.d.comb += go_now.eq(1)

        with m.Else():
            # input is no longer valid, so withdraw the acknowledgement.
            # a "proper" ALU would have had to register the opcode and
            # the operands
            m.d.sync += self.p.ready_o.eq(0)

        # counter at 1 (or go_now raised): flag the output as valid
        fire = (self.counter == 1) | go_now
        with m.If(fire):
            m.d.sync += self.n.valid_o.eq(1)

        # the recipient took the result: return to the known-good state.
        # these assignments come later, so they override the ones above
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += [
                self.n.valid_o.eq(0),
                self.counter.eq(0),  # reset the counter
                self.o.eq(0),  # clear the output for tidiness' sake
            ]

        # count down to 1; going from 1 to 0 only happens on acknowledgement
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the ports of ``op``, then the operands and the output."""
        yield from self.op.ports()
        for port in (self.a, self.b, self.c, self.o):
            yield port

    def ports(self):
        """Return everything produced by iterating this object, as a list."""
        return list(self)
163
164
class ALU(Elaboratable):
    """Fake multi-cycle integer ALU with a valid/ready handshake.

    A countdown (``counter``) adds a per-operation delay before the
    result is presented:

    * OP_MUL_L64: counter starts at 5
    * OP_ADD: counter starts at 3
    * OP_SHR: counter starts at 1
    * any other operation: zero delay, the subtractor output is passed
      straight through

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and on
    ``n`` (``valid_o`` / ``ready_i``).  Operands are ``a`` and ``b`` (also
    reachable as the Array ``i``); the result is ``o`` (also ``out[0]``).
    ``op`` is a CompALUOpSubset record.
    """

    def __init__(self, width):
        # bare containers, laid out to look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals on the input (p) and output (n) sides
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")

        # two operand signals, named i1 and i2
        operands = [Signal(width, name="i%d" % idx) for idx in (1, 2)]
        self.i = Array(operands)
        self.a, self.b = operands

        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # expose the same objects through the pipeline-API-style containers
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the arithmetic units and sequencer; ``platform`` is unused."""
        m = Module()

        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)

        # register every unit as a submodule and feed it both operands
        # (really should not activate absolutely all ALU inputs like this)
        named_units = (("add", add), ("mul", mul), ("shf", shf), ("sub", sub))
        for unit_name, unit in named_units:
            setattr(m.submodules, unit_name, unit)
        for _, unit in named_units:
            m.d.comb += [
                unit.a.eq(self.a),
                unit.b.eq(self.b),
            ]

        # pass invert (and carry later)
        m.d.comb += add.invert_in.eq(self.op.invert_in)

        # raised combinatorially when a zero-delay operation is accepted
        go_now = Signal(reset_less=True)

        # the sequencer is idle when the count is zero ...
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ... and done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # handshake handling depends on the kind of operation
        with m.If(go_now):
            # zero-delay operation: the handshake signals are simply
            # passed through to the other side
            m.d.comb += [
                self.p.ready_o.eq(self.n.ready_i),
                self.n.valid_o.eq(self.p.valid_i),
            ]
        with m.Else():
            # sequential operation: ready_o is raised while idle and
            # valid_o while done
            m.d.comb += [
                self.p.ready_o.eq(alu_idle),
                self.n.valid_o.eq(alu_done),
            ]

        # register holding the result of a delayed operation
        alu_r = Signal(self.width)

        # opcode comparisons, used by both decision chains below
        insn_type = self.op.insn_type
        is_add = insn_type == MicrOp.OP_ADD
        is_mul = insn_type == MicrOp.OP_MUL_L64
        is_shr = insn_type == MicrOp.OP_SHR

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # this is a "fake" pipeline: capture the result right away.
                # the subtractor result is never registered, it is only
                # used on the zero-delay path
                with m.If(is_add):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(is_mul):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(is_shr):
                    m.d.sync += alu_r.eq(shf.o)

                # NOTE: all of these delays are fake, just something to test

                # MUL loads a count of 5
                with m.If(is_mul):
                    m.d.sync += self.counter.eq(5)
                # SHIFT loads a count of 1
                with m.Elif(is_shr):
                    m.d.sync += self.counter.eq(1)
                # ADD loads a count of 3
                with m.Elif(is_add):
                    m.d.sync += self.counter.eq(3)
                # everything else takes no delay
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: keep counting down, except while done and the
            # recipient is not yet ready
            m.d.sync += self.counter.eq(self.counter - 1)

        # output selection: zero-delay result, or the registered one
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # the registered result is only presented while done
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        """Yield the ports of ``op``, the data ports, then the handshake."""
        yield from self.op.ports()
        for port in (self.a, self.b, self.o,
                     self.p.valid_i, self.p.ready_o,
                     self.n.valid_o, self.n.ready_i):
            yield port

    def ports(self):
        """Return everything produced by iterating this object, as a list."""
        return list(self)
292
293
class BranchOp(Elaboratable):
    """Combinatorial comparison unit.

    ``op`` is a two-argument callable (the callers in this file pass
    functions from the ``operator`` module) applied to ``a`` and ``b``.
    ``o`` is driven with 1 when the comparison holds and 0 otherwise.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        """Build the comparison logic; ``platform`` is not used."""
        m = Module()
        comparison = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(comparison, 1, 0))
        return m
305
306
class BranchALU(Elaboratable):
    """Fake branch-compare ALU with a fixed delay and valid/ready handshake.

    The 2-bit ``op`` selects the comparison applied to ``a`` and ``b``:
    0 = greater-than, 1 = less-than, 2 = equal, 3 = not-equal.  When an
    input is accepted the selected comparison result is registered into
    ``o`` and ``counter`` is loaded with 5.

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and on
    ``n`` (``valid_o`` / ``ready_i``).  Operands are also reachable as the
    Array ``i`` and the result as ``out[0]``.
    """

    def __init__(self, width):
        self.p = Dummy()  # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()  # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API": populate the containers
        # created above, in the same way ALU and DummyALU do, instead of
        # leaving them empty
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        """Build the comparison units and sequencer; ``platform`` is unused."""
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparison unit sees both operands all of the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # testing no-delay ALU.  NOTE: nothing drives go_now in this class
        # (the assignment below is commented out), so it stays at zero
        go_now = Signal(reset_less=True)
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right
                # now: the value of op is the index of the unit to use
                with m.Switch(self.op):
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                # branch to take 5 cycles (fake)
                m.d.sync += self.counter.eq(5)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # set the output as valid if the recipient is ready for it
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0)  # reset the counter
            m.d.sync += self.o.eq(0)  # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        """Yield the operation select, both operands and the output."""
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        """Return everything produced by iterating this object, as a list."""
        return list(self)
390
391
def run_op(dut, a, b, op, inv_a=0):
    """Simulation helper: push one operation through ``dut``.

    This is a generator meant to be driven by the simulator (use it with
    ``yield from``).  It presents the operands and operation, waits for
    ``p.ready_o``, clears the inputs, waits for ``n.valid_o`` and then
    reads the output.

    Args:
        dut: the ALU under test.
        a, b: operand values.
        op: value assigned to ``dut.op.insn_type``.
        inv_a: value assigned to ``dut.op.invert_in``.

    Returns:
        The value read from ``dut.o`` once ``n.valid_o`` was seen high.
    """
    # present the inputs and raise valid_i; ready_i is set low and then
    # high, one assignment after the other
    setup = (
        dut.a.eq(a),
        dut.b.eq(b),
        dut.op.insn_type.eq(op),
        dut.op.invert_in.eq(inv_a),
        dut.n.ready_i.eq(0),
        dut.p.valid_i.eq(1),
        dut.n.ready_i.eq(1),
    )
    for assignment in setup:
        yield assignment
    yield

    # wait for the ALU to accept our input data
    while True:
        accepted = yield dut.p.ready_o
        if accepted:
            break
        yield

    # withdraw the request and zero all of the inputs
    teardown = (
        dut.p.valid_i.eq(0),
        dut.a.eq(0),
        dut.b.eq(0),
        dut.op.insn_type.eq(0),
        dut.op.invert_in.eq(0),
    )
    for assignment in teardown:
        yield assignment

    # wait for the ALU to present the output data
    while True:
        output_valid = yield dut.n.valid_o
        if output_valid:
            break
        yield

    # latch the result and lower ready_i
    result = yield dut.o
    yield dut.n.ready_i.eq(0)

    return result
421
422
def alu_sim(dut):
    """Simulation process: run five operations one after another.

    Each result is printed and checked against its expected value.
    The expected values assume a 16-bit ALU (65533 is ~5 + 3 in 16 bits).
    """
    # (label to print, a, b, operation, inv_a, expected result)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_NOP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )
    for label, a, b, op, inv_a, expected in cases:
        result = yield from run_op(dut, a, b, op, inv_a=inv_a)
        print(label, result)
        assert result == expected
445
446
def test_alu():
    """Simulate a 16-bit ALU with ``alu_sim``, then write it out as RTLIL.

    Writes the waveform to ``test_alusim.vcd`` and the RTLIL text to
    ``test_alu.il``, both relative to the current directory.
    """
    alu = ALU(width=16)

    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    rtlil_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(rtlil_text)
454
455
def test_alu_parallel():
    """Drive a 16-bit ALU from independent producer and consumer processes.

    Compare with the sequential test (``test_alu``): here the producer
    does not wait for a result before presenting the next input.
    Writes ``test_alu_parallel.vcd`` and ``test_alu_parallel.gtkw``.
    """
    m = Module()
    dut = ALU(width=16)
    m.submodules.alu = dut
    sim = Simulator(m)
    sim.add_clock(1e-6)

    def idle(cycles):
        # wait states: let the given number of clock cycles pass
        for _ in range(cycles):
            yield

    def send(a, b, op, inv_a=0):
        # present input data and assert valid_i
        for assignment in (dut.a.eq(a),
                           dut.b.eq(b),
                           dut.op.insn_type.eq(op),
                           dut.op.invert_in.eq(inv_a),
                           dut.p.valid_i.eq(1)):
            yield assignment
        yield
        # wait for ready_o to be asserted
        while True:
            accepted = yield dut.p.ready_o
            if accepted:
                break
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        for assignment in (dut.p.valid_i.eq(0),
                           dut.a.eq(0),
                           dut.b.eq(0),
                           dut.op.insn_type.eq(0),
                           dut.op.invert_in.eq(0)):
            yield assignment

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while True:
            output_valid = yield dut.n.valid_o
            if output_valid:
                break
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # (a, b, operation, inv_a, wait states afterwards)
        schedule = (
            (5, 3, MicrOp.OP_ADD, 0, 2),      # 5 + 3
            (2, 3, MicrOp.OP_MUL_L64, 0, 0),  # 2 * 3
            (5, 3, MicrOp.OP_ADD, 1, 1),      # (-5) + 3
            (5, 3, MicrOp.OP_NOP, 0, 2),      # 5 - 3, a zero-delay operation
            (13, 2, MicrOp.OP_SHR, 0, 0),     # 13 >> 2
        )
        for a, b, op, inv_a, wait_states in schedule:
            yield from send(a, b, op, inv_a=inv_a)
            yield from idle(wait_states)

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results is preserved
        yield
        # (expected result, wait states afterwards)
        expectations = (
            (8, 0),      # 5 + 3 = 8
            (6, 2),      # 2 * 3 = 6
            (65533, 0),  # (-5) + 3 = -2, as its unsigned equivalent
            # 5 - 3 = 2: a zero-delay operation.  this, and the previous
            # result, will be received back-to-back
            # (check the output waveform to see this)
            (2, 2),
            (3, 0),      # 13 >> 2 = 3
        )
        for expected, wait_states in expectations:
            result = yield from receive()
            assert result == expected
            yield from idle(wait_states)

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    sim_writer = sim.write_vcd(
        "test_alu_parallel.vcd",
        "test_alu_parallel.gtkw",
        traces=dut.ports()
    )
    with sim_writer:
        sim.run()
556
557
if __name__ == "__main__":
    # run the sequential test first, then the producer/consumer test
    for selftest in (test_alu, test_alu_parallel):
        selftest()

    # RTLIL generation for the branch ALU is currently disabled:
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)