rename InternalOp to MicrOp
[soc.git] / src / soc / experiment / alu_hier.py
1 """*Experimental* ALU: based on nmigen alu_hier.py, includes branch-compare ALU
2
3 This ALU is *deliberately* designed to add in (unnecessary) delays into
4 different operations so as to be able to test the 6600-style matrices
5 and the CompUnits. Countdown timers wait for (defined) periods before
6 indicating that the output is valid
7
8 A "real" integer ALU would place the answers onto the output bus after
9 only one cycle (sync)
10 """
11
12 from nmigen import Elaboratable, Signal, Module, Const, Mux, Array
13 from nmigen.hdl.rec import Record, Layout
14 from nmigen.cli import main
15 from nmigen.cli import verilog, rtlil
16 from nmigen.compat.sim import run_simulation
17 from nmigen.back.pysim import Simulator, Settle
18
19 from soc.decoder.power_enums import MicrOp, Function, CryIn
20
21 from soc.fu.alu.alu_input_record import CompALUOpSubset
22 from soc.fu.cr.cr_input_record import CompCROpSubset
23
24 import operator
25
26
27
28
class Adder(Elaboratable):
    """Combinatorial adder with optional bitwise inversion of operand ``a``.

    Ports:

    * ``invert_a`` -- 1 bit; when set, ``~a`` is used in place of ``a``
    * ``a``, ``b`` -- ``width``-bit operands
    * ``o``        -- ``width``-bit result (signal name ``add_o``)
    """

    def __init__(self, width):
        self.invert_a = Signal()
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="add_o")

    def elaborate(self, platform):
        m = Module()

        # build both candidate sums up front, then pick one
        plain_sum = self.a + self.b
        inverted_sum = (~self.a) + self.b

        with m.If(self.invert_a):
            m.d.comb += self.o.eq(inverted_sum)
        with m.Else():
            m.d.comb += self.o.eq(plain_sum)

        return m
43
44
class Subtractor(Elaboratable):
    """Combinatorial subtractor: ``o = a - b``.

    ``a``, ``b`` and ``o`` are all ``width`` bits wide; the output signal
    is named ``sub_o``.
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="sub_o")

    def elaborate(self, platform):
        m = Module()
        difference = self.a - self.b
        m.d.comb += self.o.eq(difference)
        return m
55
56
class Multiplier(Elaboratable):
    """Combinatorial multiplier: ``o = a * b``.

    ``a``, ``b`` and ``o`` are all ``width`` bits wide; the output signal
    is named ``mul_o``.
    """

    def __init__(self, width):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="mul_o")

    def elaborate(self, platform):
        m = Module()
        product = self.a * self.b
        m.d.comb += self.o.eq(product)
        return m
67
68
class Shifter(Elaboratable):
    """Combinatorial right-shifter: ``o = a >> b``.

    ``a``, ``b`` and ``o`` are all ``width`` bits wide; the output signal
    is named ``shf_o``.
    """

    def __init__(self, width):
        self.width = width
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width, name="shf_o")

    def elaborate(self, platform):
        m = Module()

        # shift amount, routed through an intermediate signal.
        # NOTE: the mask has all `width` low bits set and b is itself
        # `width` bits wide, so no bits of b are actually dropped here.
        btrunc = Signal(self.width)
        full_mask = Const((1<<self.width)-1)

        m.d.comb += [
            btrunc.eq(self.b & full_mask),
            self.o.eq(self.a >> btrunc),
        ]
        return m
82
class Dummy:
    """Empty attribute container.

    Instances are used by the ALU classes in this file as plain namespaces
    onto which pipeline-style attributes are attached.
    """
85
86
class DummyALU(Elaboratable):
    """Test ALU that simply copies input ``a`` to its output.

    Three ``width``-bit inputs (``i1``..``i3``, also available as ``a``,
    ``b``, ``c`` and through the ``i`` Array) and one ``width``-bit output
    (``alu_o``, also ``o`` / ``out[0]``) are provided.  Only ``a`` is read
    by ``elaborate``; ``b``, ``c`` and the ``op`` record are exposed as
    ports but not consulted.

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and ``n``
    (``valid_o`` / ``ready_i``), laid out to resemble the nmutil pipeline
    API.
    """

    def __init__(self, width):
        # bare namespaces, arranged to look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompCROpSubset()

        # operand inputs
        operands = [Signal(width, name=label) for label in ("i1", "i2", "i3")]
        self.i = Array(operands)
        self.a, self.b, self.c = operands

        # single result output
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # pipeline-API style aliases onto the same objects
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.p.data_i.c = self.c
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        prev, nxt = self.p, self.n

        go_now = Signal(reset_less=True) # testing no-delay ALU

        with m.If(prev.valid_i):
            # input is valid: has "ready" already been signalled?
            with m.If(~prev.ready_o):
                # not yet: signal ready and capture the (pass-through) result
                m.d.sync += [
                    prev.ready_o.eq(1),
                    self.o.eq(self.a),
                ]
                m.d.comb += go_now.eq(1)
                m.d.sync += self.counter.eq(1)
        with m.Else():
            # input is no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and operands
            m.d.sync += prev.ready_o.eq(0)

        # raise valid_o once the counter is at 1 (or immediately on go_now)
        fire_output = (self.counter == 1) | go_now
        with m.If(fire_output):
            m.d.sync += nxt.valid_o.eq(1)

        # recipient acknowledged: return to the known-good idle state
        acknowledged = nxt.ready_i & nxt.valid_o
        with m.If(acknowledged):
            m.d.sync += [
                nxt.valid_o.eq(0),
                self.counter.eq(0),  # reset the counter
                self.o.eq(0),        # clear the output for tidiness' sake
            ]

        # count down to 1 (the 1 -> 0 step only happens on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b, self.c, self.o)

    def ports(self):
        return [*self]
161
162
class ALU(Elaboratable):
    """Experimental multi-latency ALU with a valid/ready handshake.

    Two ``width``-bit inputs (``i1``, ``i2``; also ``a``, ``b`` and the
    ``i`` Array) feed an adder, multiplier, shifter and subtractor in
    parallel.  ``op.insn_type`` selects which result is used and what value
    the countdown ``counter`` is loaded with:

    * ``OP_MUL_L64`` -- multiplier result, counter loaded with 5
    * ``OP_SHR``     -- shifter result, counter loaded with 1
    * ``OP_ADD``     -- adder result, counter loaded with 3
    * anything else  -- subtractor result, passed through combinatorially
      with no delay (``go_now``)

    ``n.valid_o`` is asserted when the counter equals 1; the counter steps
    from 1 back to 0 only once ``n.ready_i`` is seen.

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and ``n``
    (``valid_o`` / ``ready_i``), laid out to resemble the nmutil pipeline
    API.
    """

    def __init__(self, width):
        # bare namespaces, arranged to look like the nmutil pipeline API
        self.p = Dummy()
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy()
        self.n.data_o = Dummy()

        # handshake signals
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()

        self.counter = Signal(4)
        self.op = CompALUOpSubset(name="op")

        # operand inputs
        operands = [Signal(width, name=label) for label in ("i1", "i2")]
        self.i = Array(operands)
        self.a, self.b = operands

        # single result output
        self.out = Array([Signal(width, name="alu_o")])
        self.o = self.out[0]
        self.width = width

        # pipeline-API style aliases onto the same objects
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        add = Adder(self.width)
        mul = Multiplier(self.width)
        shf = Shifter(self.width)
        sub = Subtractor(self.width)

        units = (("add", add), ("mul", mul), ("shf", shf), ("sub", sub))

        # register every unit as a named submodule
        for unit_name, unit in units:
            setattr(m.submodules, unit_name, unit)

        # really should not activate absolutely all ALU inputs like this
        for _, unit in units:
            m.d.comb += [
                unit.a.eq(self.a),
                unit.b.eq(self.b),
            ]

        # pass invert (and carry later)
        m.d.comb += add.invert_a.eq(self.op.invert_a)

        go_now = Signal(reset_less=True) # testing no-delay ALU

        # ALU sequencer is idle when the count is zero
        alu_idle = Signal(reset_less=True)
        m.d.comb += alu_idle.eq(self.counter == 0)

        # ALU sequencer is done when the count is one
        alu_done = Signal(reset_less=True)
        m.d.comb += alu_done.eq(self.counter == 1)

        # select handshake handling according to ALU type
        with m.If(go_now):
            # combinatorial, no-delay operation: pass the handshake
            # signals straight through to the other side
            m.d.comb += [
                self.p.ready_o.eq(self.n.ready_i),
                self.n.valid_o.eq(self.p.valid_i),
            ]
        with m.Else():
            # sequential operation: ready_o reflects "sequencer idle",
            # valid_o reflects "sequencer done"
            m.d.comb += [
                self.p.ready_o.eq(alu_idle),
                self.n.valid_o.eq(alu_done),
            ]

        # register holding the result until it is presented on the output
        alu_r = Signal(self.width)

        # opcode decode, shared by the two selection chains below
        insn = self.op.insn_type
        is_add = insn == MicrOp.OP_ADD
        is_mul = insn == MicrOp.OP_MUL_L64
        is_shr = insn == MicrOp.OP_SHR

        with m.If(alu_idle):
            with m.If(self.p.valid_i):

                # as this is a "fake" pipeline, just grab the result right now
                with m.If(is_add):
                    m.d.sync += alu_r.eq(add.o)
                with m.Elif(is_mul):
                    m.d.sync += alu_r.eq(mul.o)
                with m.Elif(is_shr):
                    m.d.sync += alu_r.eq(shf.o)
                # SUB is zero-delay, no need to register

                # NOTE: all of these delays are fake, just something to test

                # MUL: counter starts at 5
                with m.If(is_mul):
                    m.d.sync += self.counter.eq(5)
                # SHIFT: counter starts at 1, i.e. done straight away
                with m.Elif(is_shr):
                    m.d.sync += self.counter.eq(1)
                # ADD: counter starts at 3
                with m.Elif(is_add):
                    m.d.sync += self.counter.eq(3)
                # everything else: no delay at all
                with m.Else():
                    m.d.comb += go_now.eq(1)

        with m.Elif(~alu_done | self.n.ready_i):
            # not idle: keep counting down, except that the final step
            # (done -> idle) waits for ready_i
            m.d.sync += self.counter.eq(self.counter - 1)

        # choose between zero-delay output, or registered
        with m.If(go_now):
            m.d.comb += self.o.eq(sub.o)
        # only present the registered result at the last computation cycle
        with m.Elif(alu_done):
            m.d.comb += self.o.eq(alu_r)

        return m

    def __iter__(self):
        yield from self.op.ports()
        yield from (self.a, self.b, self.o)
        yield from (self.p.valid_i, self.p.ready_o)
        yield from (self.n.valid_o, self.n.ready_i)

    def ports(self):
        return [*self]
290
291
class BranchOp(Elaboratable):
    """Combinatorial comparison unit.

    ``op`` is a two-argument callable (e.g. ``operator.gt``) applied to the
    ``width``-bit inputs ``a`` and ``b``.  The ``width``-bit output ``o`` is
    driven with 1 when the comparison holds and 0 otherwise.
    """

    def __init__(self, width, op):
        self.a = Signal(width)
        self.b = Signal(width)
        self.o = Signal(width)
        self.op = op

    def elaborate(self, platform):
        m = Module()
        condition = self.op(self.a, self.b)
        m.d.comb += self.o.eq(Mux(condition, 1, 0))
        return m
303
304
class BranchALU(Elaboratable):
    """Experimental branch-compare ALU with a valid/ready handshake.

    The 2-bit ``op`` selects one of four comparisons of the ``width``-bit
    inputs ``a`` and ``b`` (``i1`` / ``i2``, also the ``i`` Array):

    * 0 -- greater-than
    * 1 -- less-than
    * 2 -- equal
    * 3 -- not-equal

    The selected comparison result is registered into ``o`` (``out[0]``)
    when the input is accepted, and the countdown ``counter`` is loaded
    with 5.  ``n.valid_o`` is registered high once the counter reaches 1
    and is cleared, together with the counter and ``o``, when ``n.ready_i``
    acknowledges it.

    Handshake signals live on ``p`` (``valid_i`` / ``ready_o``) and ``n``
    (``valid_o`` / ``ready_i``), laid out to resemble the nmutil pipeline
    API.
    """

    def __init__(self, width):
        self.p = Dummy() # make look like nmutil pipeline API
        self.p.data_i = Dummy()
        self.p.data_i.ctx = Dummy()
        self.n = Dummy() # make look like nmutil pipeline API
        self.n.data_o = Dummy()
        self.p.valid_i = Signal()
        self.p.ready_o = Signal()
        self.n.ready_i = Signal()
        self.n.valid_o = Signal()
        self.counter = Signal(4)
        self.op = Signal(2)
        i = []
        i.append(Signal(width, name="i1"))
        i.append(Signal(width, name="i2"))
        self.i = Array(i)
        self.a, self.b = i[0], i[1]
        self.out = Array([Signal(width)])
        self.o = self.out[0]
        self.width = width
        # more "look like nmutil pipeline API": populate the data containers
        # created above, matching what ALU and DummyALU do, so that the
        # pipeline-style attribute paths resolve on this class as well
        self.p.data_i.ctx.op = self.op
        self.p.data_i.a = self.a
        self.p.data_i.b = self.b
        self.n.data_o.o = self.o

    def elaborate(self, platform):
        m = Module()
        bgt = BranchOp(self.width, operator.gt)
        blt = BranchOp(self.width, operator.lt)
        beq = BranchOp(self.width, operator.eq)
        bne = BranchOp(self.width, operator.ne)

        m.submodules.bgt = bgt
        m.submodules.blt = blt
        m.submodules.beq = beq
        m.submodules.bne = bne
        # every comparator sees both operands all of the time
        for mod in [bgt, blt, beq, bne]:
            m.d.comb += [
                mod.a.eq(self.a),
                mod.b.eq(self.b),
            ]

        # never driven at present (see the commented-out assignment below),
        # so it stays at its reset value
        go_now = Signal(reset_less=True) # testing no-delay ALU
        with m.If(self.p.valid_i):
            # input is valid. next check, if we already said "ready" or not
            with m.If(~self.p.ready_o):
                # we didn't say "ready" yet, so say so and initialise
                m.d.sync += self.p.ready_o.eq(1)

                # as this is a "fake" pipeline, just grab the output right now
                # (the Case index is the position in the comparator list)
                with m.Switch(self.op):
                    for i, mod in enumerate([bgt, blt, beq, bne]):
                        with m.Case(i):
                            m.d.sync += self.o.eq(mod.o)
                m.d.sync += self.counter.eq(5) # branch to take 5 cycles (fake)
                #m.d.comb += go_now.eq(1)
        with m.Else():
            # input says no longer valid, so drop ready as well.
            # a "proper" ALU would have had to sync in the opcode and a/b ops
            m.d.sync += self.p.ready_o.eq(0)

        # ok so the counter's running: when it gets to 1, fire the output
        with m.If((self.counter == 1) | go_now):
            # set the output as valid if the recipient is ready for it
            m.d.sync += self.n.valid_o.eq(1)
        with m.If(self.n.ready_i & self.n.valid_o):
            m.d.sync += self.n.valid_o.eq(0)
            # recipient said it was ready: reset back to known-good.
            m.d.sync += self.counter.eq(0) # reset the counter
            m.d.sync += self.o.eq(0) # clear the output for tidiness sake

        # countdown to 1 (transition from 1 to 0 only on acknowledgement)
        with m.If(self.counter > 1):
            m.d.sync += self.counter.eq(self.counter - 1)

        return m

    def __iter__(self):
        yield self.op
        yield self.a
        yield self.b
        yield self.o

    def ports(self):
        return list(self)
387
def run_op(dut, a, b, op, inv_a=0):
    """Simulation helper: push one operation through ``dut``, return its result.

    This is a generator intended to be driven by the simulator (use it with
    ``yield from``).  It presents the operands and opcode with ``valid_i``
    and ``ready_i`` raised, waits for ``ready_o``, clears the inputs, waits
    for ``valid_o``, reads ``dut.o`` and finally lowers ``ready_i``.

    :param dut: ALU under test; must provide ``a``, ``b``, ``o``,
                ``op.insn_type``, ``op.invert_a``, ``p.valid_i``,
                ``p.ready_o``, ``n.valid_o`` and ``n.ready_i``
    :param a: value for operand ``a``
    :param b: value for operand ``b``
    :param op: value for ``op.insn_type``
    :param inv_a: value for ``op.invert_a`` (default 0)
    :returns: the value read from ``dut.o`` once ``valid_o`` was seen
    """
    # present the operation.  ready_i is raised right away: there is no
    # point in first writing 0 to it, as both writes would land before
    # the clock edge below and only the last one would be visible.
    yield dut.a.eq(a)
    yield dut.b.eq(b)
    yield dut.op.insn_type.eq(op)
    yield dut.op.invert_a.eq(inv_a)
    yield dut.p.valid_i.eq(1)
    yield dut.n.ready_i.eq(1)
    yield

    # wait for the ALU to accept our input data
    while not (yield dut.p.ready_o):
        yield

    # input accepted: withdraw valid_i and clear the operands
    yield dut.p.valid_i.eq(0)
    yield dut.a.eq(0)
    yield dut.b.eq(0)
    yield dut.op.insn_type.eq(0)
    yield dut.op.invert_a.eq(0)

    # wait for the ALU to present the output data
    while not (yield dut.n.valid_o):
        yield

    # latch the result and lower ready_i
    result = yield dut.o
    yield dut.n.ready_i.eq(0)

    return result
417
418
def alu_sim(dut):
    """Sequential simulation process: run a fixed list of operations.

    Each operation is pushed through ``run_op`` one at a time; the result is
    printed and checked against the expected value.
    """
    # (label, a, b, opcode, invert_a, expected result)
    cases = (
        ("alu_sim add", 5, 3, MicrOp.OP_ADD, 0, 8),
        ("alu_sim mul", 2, 3, MicrOp.OP_MUL_L64, 0, 6),
        # expected value presumably assumes a 16-bit ALU, as built by test_alu
        ("alu_sim add-inv", 5, 3, MicrOp.OP_ADD, 1, 65533),
        # test zero-delay ALU
        # don't have OP_SUB, so use any other
        ("alu_sim sub", 5, 3, MicrOp.OP_NOP, 0, 2),
        ("alu_sim shr", 13, 2, MicrOp.OP_SHR, 0, 3),
    )

    for label, a, b, opcode, invert, expected in cases:
        result = yield from run_op(dut, a, b, opcode, inv_a=invert)
        print(label, result)
        assert result == expected
441
442
def test_alu():
    """Run the sequential ALU simulation, then dump the ALU as RTLIL.

    Writes ``test_alusim.vcd`` (simulation trace) and ``test_alu.il``
    (RTLIL text) into the current directory.
    """
    alu = ALU(width=16)
    processes = {"sync": alu_sim(alu)}
    run_simulation(alu, processes, vcd_name='test_alusim.vcd')

    il_text = rtlil.convert(alu, ports=alu.ports())
    with open("test_alu.il", "w") as il_file:
        il_file.write(il_text)
450
451
def test_alu_parallel():
    """Drive the ALU from independent producer and consumer processes.

    Compare with the sequential test implementation, ``test_alu``: here the
    producer does not wait for a result before presenting the next input,
    and the consumer collects results at its own pace.  Writes
    ``test_alu_parallel.vcd`` and ``test_alu_parallel.gtkw``.
    """
    m = Module()
    m.submodules.alu = dut = ALU(width=16)
    sim = Simulator(m)
    sim.add_clock(1e-6)

    def send(a, b, op, inv_a=0):
        # present input data and assert valid_i
        setup = (
            dut.a.eq(a),
            dut.b.eq(b),
            dut.op.insn_type.eq(op),
            dut.op.invert_a.eq(inv_a),
            dut.p.valid_i.eq(1),
        )
        for statement in setup:
            yield statement
        yield
        # wait for ready_o to be asserted
        while True:
            accepted = yield dut.p.ready_o
            if accepted:
                break
            yield
        # clear input data and negate valid_i
        # if send is called again immediately afterwards, there will be no
        # visible transition (they will not be negated, after all)
        teardown = (
            dut.p.valid_i.eq(0),
            dut.a.eq(0),
            dut.b.eq(0),
            dut.op.insn_type.eq(0),
            dut.op.invert_a.eq(0),
        )
        for statement in teardown:
            yield statement

    def receive():
        # signal readiness to receive data
        yield dut.n.ready_i.eq(1)
        yield
        # wait for valid_o to be asserted
        while True:
            available = yield dut.n.valid_o
            if available:
                break
            yield
        # read result
        result = yield dut.o
        # negate ready_i
        # if receive is called again immediately afterwards, there will be no
        # visible transition (it will not be negated, after all)
        yield dut.n.ready_i.eq(0)
        return result

    def producer():
        # send a few test cases, interspersed with wait states
        # note that, for this test, we do not wait for the result to be ready,
        # before presenting the next input
        # each entry: (a, b, opcode, invert_a, wait states afterwards)
        schedule = (
            (5, 3, MicrOp.OP_ADD, 0, 2),       # 5 + 3
            (2, 3, MicrOp.OP_MUL_L64, 0, 0),   # 2 * 3
            (5, 3, MicrOp.OP_ADD, 1, 1),       # (-5) + 3
            (5, 3, MicrOp.OP_NOP, 0, 2),       # 5 - 3 (zero-delay operation)
            (13, 2, MicrOp.OP_SHR, 0, 0),      # 13 >> 2
        )
        for a, b, opcode, invert, idle_after in schedule:
            yield from send(a, b, opcode, inv_a=invert)
            for _ in range(idle_after):
                yield

    def consumer():
        # receive and check results, interspersed with wait states
        # the consumer is not in step with the producer, but the
        # order of the results are preserved
        # each entry: (wait states beforehand, expected result)
        schedule = (
            (1, 8),       # 5 + 3 = 8
            (0, 6),       # 2 * 3 = 6
            (2, 65533),   # (-5) + 3 = -2, unsigned equivalent
            # 5 - 3 = 2: a zero-delay operation, so this and the previous
            # result will be received back-to-back
            # (check the output waveform to see this)
            (0, 2),
            (2, 3),       # 13 >> 2 = 3
        )
        for idle_before, expected in schedule:
            for _ in range(idle_before):
                yield
            result = yield from receive()
            assert result == expected

    sim.add_sync_process(producer)
    sim.add_sync_process(consumer)
    with sim.write_vcd(
        "test_alu_parallel.vcd",
        "test_alu_parallel.gtkw",
        traces=dut.ports()
    ):
        sim.run()
552
553
if __name__ == "__main__":
    # run both ALU simulations when executed as a script
    for run_test in (test_alu, test_alu_parallel):
        run_test()

    # RTLIL dump of the branch ALU, currently disabled:
    # alu = BranchALU(width=16)
    # vl = rtlil.convert(alu, ports=alu.ports())
    # with open("test_branch_alu.il", "w") as f:
    #     f.write(vl)
562