add regspec capability to MultiCompUnit
[soc.git] / src / soc / experiment / compalu_multi.py
1 from nmigen.compat.sim import run_simulation
2 from nmigen.cli import verilog, rtlil
3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
5
6 from nmutil.latch import SRLatch, latchregister
7 from nmutil.iocontrol import RecordObject
8
9 from soc.decoder.power_decoder2 import Data
10 from soc.decoder.power_enums import InternalOp
11
12
13 """ Computation Unit (aka "ALU Manager").
14
15 This module runs a "revolving door" set of three latches, based on
16 * Issue
17 * Go_Read
18 * Go_Write
19 where one of them cannot be set on any given cycle.
20
21 * When issue is first raised, a busy signal is sent out.
22 The src1 and src2 registers and the operand can be latched in
23 at this point
24
25 * Read request is set, which is acknowledged through the Scoreboard
26 to the priority picker, which generates (one and only one) Go_Read
27 at a time. One of those will (eventually) be this Computation Unit.
28
29 * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
30 src1/src2/operand in place), and the ALU is told to proceed.
31
32 * when the ALU pipeline is ready, this activates "write request release",
33 and the ALU's output is captured into a temporary register.
34
35 * Write request release is *HELD UP* (prevented from proceeding) if shadowN
36 is asserted LOW. This is how all speculation, precise exceptions,
37 predication - everything - is achieved.
38
39 * Write request release will go through a similar process as Read request,
40 resulting (eventually) in Go_Write being asserted.
41
42 * When Go_Write is asserted, two things happen: (1) the data in the temp
43 register is placed combinatorially onto the output, and (2) the
44 req_l latch is cleared, busy is dropped, and the Comp Unit is back
45 through its revolving door to do another task.
46
47 Note that the read and write latches are held synchronously for one cycle,
48 i.e. that when Go_Read comes in, one cycle is given in which the incoming
49 register (broadcast over a Regfile Read Port) may have time to be latched.
50
51 It is REQUIRED that Go_Read be held valid only for one cycle, and it is
52 REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
53 Go_Read is asserted HI.
54
55 Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
56 likewise be dropped exactly one cycle after assertion of Go_Write.
57
58 When Go_Die is asserted then strictly speaking the entire FSM should be
59 fully reset and that includes sending a cancellation request to the ALU.
60 (XXX TODO: alu "go die" is not presently wired up)
61 """
62
def go_record(n, name):
    """Build the paired go/release Record used for a request handshake.

    :n:    width (in bits) of both fields
    :name: name given to the Record

    Returns a Record with a fan-in "go" field and a fan-out "rel" field,
    both flagged as reset-less.
    """
    layout = [
        ('go', n, DIR_FANIN),
        ('rel', n, DIR_FANOUT),
    ]
    rec = Record(layout, name=name)
    for field in (rec.go, rec.rel):
        field.reset_less = True
    return rec
69
70
def get_regspec_bitwidth(regspec, srcdest, idx):
    """Return the total number of bits described by one regspec entry.

    :regspec: indexable pair of sequences; each entry is a tuple whose
              third element is the bit-range string
    :srcdest: which sequence of the regspec to look in (callers in this
              file pass 0 for sources and 1 for destinations)
    :idx:     position of the entry within that sequence

    The bit-range string is a comma-separated list in which each item is
    either a single bit ("5", counts as one bit) or an inclusive
    "start:end" range ("0:15", counts as end-start+1 bits).
    """
    bitspec = regspec[srcdest][idx]
    wid = 0
    for item in bitspec[2].split(","):
        bounds = item.split(":")
        if len(bounds) == 1:  # only one bit
            wid += 1
        else:
            start, end = map(int, bounds)
            wid += (end - start) + 1
    return wid
84
85
class CompUnitRecord(RecordObject):
    """CompUnitRecord

    base class for Computation Units, to provide a uniform API
    and allow "record.connect" etc. to be used, particularly when
    it comes to connecting multiple Computation Units up as a block
    (very laborious)

    LDSTCompUnitRecord should derive from this class and add the
    additional signals it requires

    :subkls: the class (not an instance) needed to construct the opcode
    :rwid:   either an integer (specifies width of all regs) or a "regspec"
    :n_src:  number of src operands.  only used when rwid is an integer:
             with a regspec the count is the length of its first entry
    :n_dst:  number of dest operands.  only used when rwid is an integer:
             with a regspec the count is the length of its second entry
    :name:   name passed on to RecordObject
    """
    def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
        RecordObject.__init__(self, name)
        self._rwid = rwid
        if not isinstance(rwid, int):
            # rwid: a regspec.  the operand counts come from the spec
            # itself, so n_src/n_dst may be left at their None defaults
            # (previously the loops below did range(None) in that case)
            n_src, n_dst = len(rwid[0]), len(rwid[1])
        # rwid: integer (covers all registers) - counts are as passed in
        self._n_src, self._n_dst = n_src, n_dst
        self._subkls = subkls

        # one input signal per source operand, exposed as src1_i, src2_i...
        src = []
        for i in range(n_src):
            sname = "src%d_i" % (i + 1)  # numbering to match src1/src2
            sreg = Signal(self._get_srcwid(i), name=sname, reset_less=True)
            setattr(self, sname, sreg)
            src.append(sreg)
        self._src_i = src

        # one signal per destination operand, exposed as dest1_i, dest2_i...
        dst = []
        for i in range(n_dst):
            dname = "dest%d_i" % (i + 1)  # numbering to match dest1/2...
            dreg = Signal(self._get_dstwid(i), name=dname, reset_less=True)
            setattr(self, dname, dreg)
            dst.append(dreg)
        self._dest = dst

        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()  # go die (reset)

        # operation / data input
        self.oper_i = subkls()  # operand

        # output (busy/done)
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.done_o = Signal(reset_less=True)

    def _get_dstwid(self, i):
        """Width of destination operand i (index 1 of a regspec)."""
        if isinstance(self._rwid, int):
            return self._rwid
        return get_regspec_bitwidth(self._rwid, 1, i)

    def _get_srcwid(self, i):
        """Width of source operand i (index 0 of a regspec)."""
        if isinstance(self._rwid, int):
            return self._rwid
        return get_regspec_bitwidth(self._rwid, 0, i)
153
class MultiCompUnit(Elaboratable):
    """Computation Unit managing an ALU with several src/dest operands.

    Wraps a CompUnitRecord (available as self.cu) and re-exports its
    signals under convenience names; elaborate() builds the issue /
    read / write latch logic around the ALU.
    """
    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
        """MultiCompUnit

        * :rwid:        width of register latches: either an integer
                        (same width for all) or a "regspec"
        * :alu:         the ALU (pipeline, FSM) - must conform to
                        nmutil Pipe API
        * :opsubsetkls: the subset of Decode2ExecuteType
        * :n_src:       number of src operands
        * :n_dst:       number of destination operands
        """
        self.rwid = rwid
        self.opsubsetkls = opsubsetkls
        self.alu = alu  # actual ALU - set as a "submodule" of the CU
        self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)

        # take the operand counts from the signals the record really
        # created, so the latch widths used in elaborate() always match
        # the src/dest lists
        n_src, n_dst = len(cu._src_i), len(cu._dest)
        self.n_src, self.n_dst = n_src, n_dst

        for i in range(n_src):
            j = i + 1  # name numbering to match src1/src2
            name = "src%d_i" % j
            setattr(self, name, getattr(cu, name))

        for i in range(n_dst):
            j = i + 1  # name numbering to match dest1/2...
            name = "dest%d_i" % j
            setattr(self, name, getattr(cu, name))

        # convenience names
        self.rd = cu.rd
        self.wr = cu.wr
        self.go_rd_i = self.rd.go  # temporary naming
        self.go_wr_i = self.wr.go  # temporary naming
        self.rd_rel_o = self.rd.rel  # temporary naming
        self.req_rel_o = self.wr.rel  # temporary naming
        self.issue_i = cu.issue_i
        self.shadown_i = cu.shadown_i
        self.go_die_i = cu.go_die_i

        # operation / data input
        self.oper_i = cu.oper_i
        self.src_i = cu._src_i

        self.busy_o = cu.busy_o
        self.dest = cu._dest
        self.data_o = self.dest[0]  # Dest out
        self.done_o = cu.done_o

    def elaborate(self, platform):
        """Build and return the Module implementing the latch logic."""
        m = Module()
        m.submodules.alu = self.alu
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
        m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
        m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")

        # ALU only proceeds when all src are ready.  rd_rel_o is delayed
        # so combine it with go_rd_i.  if all bits are set we're good
        all_rd = Signal(reset_less=True)
        m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
                              (((~self.rd.rel) | self.rd.go).all()))

        # write_requests all done
        # req_done works because any one of the last of the writes
        # is enough, when combined with when read-phase is done (rst_l.q)
        wr_any = Signal(reset_less=True)
        req_done = Signal(reset_less=True)
        m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
        m.d.comb += wr_any.eq(self.wr.go.bool())
        m.d.comb += req_done.eq(rst_l.q & wr_any)

        # shadow/go_die
        reset = Signal(reset_less=True)
        rst_r = Signal(reset_less=True)  # reset latch off
        reset_w = Signal(self.n_dst, reset_less=True)
        reset_r = Signal(self.n_src, reset_less=True)
        m.d.comb += reset.eq(req_done | self.go_die_i)
        m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
        m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
        m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))

        # read-done,wr-proceed latch
        m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
        m.d.comb += rok_l.r.eq(self.alu.p_ready_o)  # off when ALU acknowledges

        # wr-done, back-to-start latch
        m.d.comb += rst_l.s.eq(all_rd)  # set when read-phase is fully done
        m.d.comb += rst_l.r.eq(rst_r)  # *off* on issue

        # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
        m.d.sync += opc_l.s.eq(self.issue_i)  # set on issue
        m.d.sync += opc_l.r.eq(self.alu.n_valid_o & req_done)  # reset on ALU

        # src operand latch (not using go_wr_i)
        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
        m.d.sync += src_l.r.eq(reset_r)

        # dest operand latch (not using issue_i)
        m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
        m.d.sync += req_l.r.eq(reset_w)

        # create a latch/register for the operand
        oper_r = self.opsubsetkls()
        latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")

        # and for each output from the ALU.  these hold *destination*
        # values, so they are sized from the destination width (sizing
        # them from the source spec gave the wrong width - or an index
        # error - whenever the regspec inputs and outputs differ)
        drl = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            data_r = Signal(self.cu._get_dstwid(i), name=name,
                            reset_less=True)
            latchregister(m, self.alu.out[i], data_r, req_l.q[i], name)
            drl.append(data_r)

        # pass the operation to the ALU
        m.d.comb += self.alu.op.eq(oper_r)

        # create list of src/alu-src/src-latch.  override 1st and 2nd one
        # below.  in the case, for ALU and Logical pipelines, we assume RB
        # is the 2nd operand in the input "regspec".  see for example
        # soc.fu.alu.pipe_data.ALUInputData
        # TODO: assume RA is the 1st operand, zero_a detection is needed.
        sl = []
        for i in range(self.n_src):
            sl.append([self.src_i[i], self.alu.i[i], src_l.q[i]])

        # if the operand subset has "zero_a" we implicitly assume that means
        # src_i[0] is an INT register type where zero can be multiplexed in,
        # instead.  see https://bugs.libre-soc.org/show_bug.cgi?id=336
        #if hasattr(oper_r, "zero_a"):
            # select zero immediate if opcode says so.  however also change
            # the latch to trigger *from* the opcode latch instead.
            # ...
            # ...

        # if the operand subset has "imm_data" we implicitly assume that
        # means "this is an INT ALU/Logical FU jobbie, RB is multiplexed
        # with the immediate"
        if hasattr(oper_r, "imm_data"):
            # select immediate if opcode says so.  however also change the
            # latch to trigger *from* the opcode latch instead.
            op_is_imm = oper_r.imm_data.imm_ok
            src2_or_imm = Signal(self.cu._get_srcwid(1), reset_less=True)
            src_sel = Signal(reset_less=True)
            m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
            m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                           self.src2_i))
            # overwrite 2nd src-latch with immediate-muxed stuff
            sl[1][0] = src2_or_imm
            sl[1][2] = src_sel

        # create a latch/register for src1/src2 (even if it is a copy of
        # an immediate)
        for i in range(self.n_src):
            src, alusrc, latch = sl[i]
            latchregister(m, src, alusrc, latch, name="src_r%d" % i)

        # -----
        # outputs
        # -----

        # all request signals gated by busy_o.  prevents picker problems
        m.d.comb += self.busy_o.eq(opc_l.q)  # busy out
        bro = Repl(self.busy_o, self.n_src)
        m.d.comb += self.rd.rel.eq(src_l.q & bro)  # src1/src2 req rel

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        # go_read is only valid for one clock!
        with m.If(all_rd):  # src operands ready, GO!
            with m.If(~self.alu.p_ready_o):  # no ACK yet
                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid

        brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
        # only proceed if ALU says its output is valid
        with m.If(self.alu.n_valid_o):
            # when ALU ready, write req release out.  waits for shadow
            m.d.comb += self.wr.rel.eq(req_l.q & brd)
            # when output latch is ready, and ALU says ready, accept ALU
            # output
            with m.If(reset):
                # tells ALU "thanks got it"
                m.d.comb += self.alu.n_ready_i.eq(1)

        # output the data from the latch on go_write
        for i in range(self.n_dst):
            with m.If(self.wr.go[i]):
                m.d.comb += self.dest[i].eq(drl[i])

        return m

    def __iter__(self):
        """Yield every port signal, in a fixed order.

        Sources and destinations are taken from the src_i / dest lists so
        that units with other than two sources or one destination work;
        for two sources and one destination this is
        src1_i, src2_i ... data_o exactly as before.
        """
        yield self.rd.go
        yield self.wr.go
        yield self.issue_i
        yield self.shadown_i
        yield self.go_die_i
        yield from self.oper_i.ports()
        yield from self.src_i
        yield self.busy_o
        yield self.rd.rel
        yield self.wr.rel
        yield from self.dest

    def ports(self):
        """Return the port signals as a list (see __iter__)."""
        return list(self)
354
355
def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
    """Simulation process: push one operation through dut, return data_o.

    :dut:    the unit under test (accessed via issue_i, src_i, oper_i,
             rd, wr and data_o)
    :a:      value driven onto src_i[0]
    :b:      value driven onto src_i[1]
    :op:     value driven onto oper_i.insn_type
    :inv_a:  value driven onto oper_i.invert_a
    :imm:    value driven onto oper_i.imm_data.imm
    :imm_ok: value driven onto oper_i.imm_data.imm_ok

    A bare "yield" advances the simulation by one clock; "yield <signal>"
    reads a value back.  Returns the value of data_o sampled one clock
    after wr.go[0] was raised.
    """
    # start with issue low for one clock
    yield dut.issue_i.eq(0)
    yield
    # set up operands and operation, then pulse issue for one clock
    yield dut.src_i[0].eq(a)
    yield dut.src_i[1].eq(b)
    yield dut.oper_i.insn_type.eq(op)
    yield dut.oper_i.invert_a.eq(inv_a)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield
    # raise go-read on both sources and clock until a read-release is seen
    yield dut.rd.go.eq(0b11)
    while True:
        yield
        rd_rel_o = yield dut.rd.rel
        print ("rd_rel", rd_rel_o)
        if rd_rel_o:
            break
    # one more clock, then drop go-read
    yield
    yield dut.rd.go.eq(0)
    req_rel_o = yield dut.wr.rel
    result = yield dut.data_o
    print ("req_rel", req_rel_o, result)
    # clock until a write-release is seen (checked *before* each clock)
    while True:
        req_rel_o = yield dut.wr.rel
        result = yield dut.data_o
        print ("req_rel", req_rel_o, result)
        if req_rel_o:
            break
        yield
    # pulse go-write on destination 0 and sample the output while it is high
    yield dut.wr.go[0].eq(1)
    yield
    result = yield dut.data_o
    print ("result", result)
    yield dut.wr.go[0].eq(0)
    yield
    return result
395
396
def scoreboard_sim(dut):
    """Simulation process: run three OP_ADD cases of 5 and 2 through dut.

    Each case is (extra keyword arguments for op_sim, expected result);
    the cases are run in order and each result is asserted.
    """
    cases = (
        ({"inv_a": 0, "imm": 8, "imm_ok": 1}, 13),  # immediate replaces b
        ({}, 7),                                    # plain add
        ({"inv_a": 1}, 65532),                      # first operand inverted
    )
    for extra, expected in cases:
        result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, **extra)
        assert result == expected
407
408
def test_compunit():
    """Build a MultiCompUnit with integer width 16, dump RTLIL, simulate.

    Writes test_compunit1.il and test_compunit1.vcd (paths are relative,
    so they land in the current working directory).
    """
    from alu_hier import ALU
    from soc.fu.alu.alu_input_record import CompALUOpSubset

    top = Module()
    dut = MultiCompUnit(16, ALU(16), CompALUOpSubset)
    top.submodules.cu = dut

    il_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_compunit1.il", "w") as il_file:
        il_file.write(il_text)

    run_simulation(top, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
423
424
def test_compunit_regspec1():
    """Build a MultiCompUnit from a regspec, dump RTLIL, and simulate.

    The regspec describes two 16-bit inputs and one 16-bit output.
    Writes test_compunit_regspec1.il and test_compunit_regspec1.vcd
    (paths are relative, so they land in the current working directory).
    """
    from alu_hier import ALU
    from soc.fu.alu.alu_input_record import CompALUOpSubset

    inspec = [('INT', 'a', '0:15'),
              ('INT', 'b', '0:15')]
    outspec = [('INT', 'o', '0:15'),
               ]

    regspec = (inspec, outspec)

    m = Module()
    alu = ALU(16)
    dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
    m.submodules.cu = dut

    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_compunit_regspec1.il", "w") as f:
        f.write(vl)

    # the trace gets its own file name: it used to be written to
    # test_compunit1.vcd, clobbering the output of test_compunit()
    run_simulation(m, scoreboard_sim(dut),
                   vcd_name='test_compunit_regspec1.vcd')
446
447
# when run as a script, execute both unit tests in order
if __name__ == '__main__':
    for run_test in (test_compunit, test_compunit_regspec1):
        run_test()