split out RegSpecs into separate module
[soc.git] / src / soc / experiment / compalu_multi.py
1 from nmigen.compat.sim import run_simulation
2 from nmigen.cli import verilog, rtlil
3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
5
6 from nmutil.latch import SRLatch, latchregister
7 from nmutil.iocontrol import RecordObject
8
9 from soc.decoder.power_decoder2 import Data
10 from soc.decoder.power_enums import InternalOp
11 from soc.fu.regspec import RegSpec, RegSpecALUAPI
12
13
14 """ Computation Unit (aka "ALU Manager").
15
16 This module runs a "revolving door" set of three latches, based on
17 * Issue
18 * Go_Read
19 * Go_Write
20 where one of them cannot be set on any given cycle.
21
22 * When issue is first raised, a busy signal is sent out.
23 The src1 and src2 registers and the operand can be latched in
24 at this point
25
26 * Read request is set, which is acknowledged through the Scoreboard
27 to the priority picker, which generates (one and only one) Go_Read
28 at a time. One of those will (eventually) be this Computation Unit.
29
30 * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
31 src1/src2/operand in place), and the ALU is told to proceed.
32
33 * when the ALU pipeline is ready, this activates "write request release",
34 and the ALU's output is captured into a temporary register.
35
36 * Write request release is *HELD UP* (prevented from proceeding) if shadowN
37 is asserted LOW. This is how all speculation, precise exceptions,
38 predication - everything - is achieved.
39
40 * Write request release will go through a similar process as Read request,
41 resulting (eventually) in Go_Write being asserted.
42
43 * When Go_Write is asserted, two things happen: (1) the data in the temp
44 register is placed combinatorially onto the output, and (2) the
45 req_l latch is cleared, busy is dropped, and the Comp Unit is back
46 through its revolving door to do another task.
47
48 Note that the read and write latches are held synchronously for one cycle,
49 i.e. that when Go_Read comes in, one cycle is given in which the incoming
50 register (broadcast over a Regfile Read Port) may have time to be latched.
51
52 It is REQUIRED that Go_Read be held valid only for one cycle, and it is
53 REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
54 Go_Read is asserted HI.
55
56 Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
57 likewise be dropped exactly one cycle after assertion of Go_Write.
58
59 When Go_Die is asserted then strictly speaking the entire FSM should be
60 fully reset and that includes sending a cancellation request to the ALU.
61 (XXX TODO: alu "go die" is not presently wired up)
62 """
63
def go_record(n, name):
    """Build a "go / release" handshake Record.

    The Record has two fields, each ``n`` bits wide and each marked
    reset-less:

    * ``go``  - direction DIR_FANIN
    * ``rel`` - direction DIR_FANOUT

    :n: width (in bits) of both fields
    :name: name given to the Record
    """
    layout = [('go', n, DIR_FANIN),
              ('rel', n, DIR_FANOUT)]
    rec = Record(layout, name=name)
    for field in (rec.go, rec.rel):
        field.reset_less = True
    return rec
70
71 # see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
72
class CompUnitRecord(RegSpec, RecordObject):
    """CompUnitRecord

    Common signal bundle for Computation Units.  Having one uniform
    Record means "record.connect" and friends can be used when wiring
    many Computation Units together as a block (otherwise very laborious).

    LDSTCompUnitRecord should derive from this class and add the
    additional signals it requires.

    :subkls: the class (not an instance) used to construct the opcode
             record; it is called with no arguments
    :rwid:   either an integer (width of all regs) or a "regspec"
    :n_src:  number of source operands (src1_i, src2_i, ...)
    :n_dst:  number of destination operands (dest1_i, dest2_i, ...)
    :name:   forwarded to RecordObject.__init__

    see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
    """
    def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
        RegSpec.__init__(self, rwid, n_src, n_dst)
        # NOTE(review): name is passed positionally - confirm that the first
        # positional parameter of RecordObject.__init__ really is the name.
        RecordObject.__init__(self, name)
        self._subkls = subkls

        # NOTE(review): n_src / n_dst default to None but are fed straight
        # to range() below, so callers presumably always supply integers.

        # source operand inputs: attributes src1_i, src2_i, ... (1-based
        # to match the src1/src2 naming), also collected in self._src_i
        src_sigs = []
        for idx in range(n_src):
            signame = "src%d_i" % (idx + 1)
            width = self._get_srcwid(idx)
            sig = Signal(width, name=signame, reset_less=True)
            setattr(self, signame, sig)
            src_sigs.append(sig)
        self._src_i = src_sigs

        # destination operands: attributes dest1_i, dest2_i, ... (1-based),
        # also collected in self._dest
        dst_sigs = []
        for idx in range(n_dst):
            signame = "dest%d_i" % (idx + 1)
            width = self._get_dstwid(idx)
            sig = Signal(width, name=signame, reset_less=True)
            setattr(self, signame, sig)
            dst_sigs.append(sig)
        self._dest = dst_sigs

        # go/release handshakes: one bit per source / per destination
        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out

        # control inputs
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()  # go die (reset)

        # operation / data input
        self.oper_i = subkls()  # operand

        # output (busy/done)
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.done_o = Signal(reset_less=True)
126
127
class MultiCompUnit(RegSpecALUAPI, Elaboratable):
    """Computation Unit ("ALU Manager") for an ALU with multiple operands.

    Wraps a CompUnitRecord (self.cu) and re-exports its signals under
    convenience names, then in elaborate() builds the issue / go_read /
    go_write latch sequence that feeds operands to the ALU and captures
    its results.

    self.alu, get_in(), get_out() and get_op() are not defined in this
    class; they are presumably provided by RegSpecALUAPI.
    """

    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
        """MultiCompUnit

        * :rwid:        width of register latches (TODO: allocate per regspec)
        * :alu:         the ALU (pipeline, FSM) - must conform to nmutil
                        Pipe API
        * :opsubsetkls: the subset of Decode2ExecuteType
        * :n_src:       number of src operands
        * :n_dst:       number of destination operands
        """
        RegSpecALUAPI.__init__(self, rwid, alu)
        self.n_src, self.n_dst = n_src, n_dst
        self.opsubsetkls = opsubsetkls
        self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)

        # re-export src1_i, src2_i, ... from the record
        for i in range(n_src):
            j = i + 1  # name numbering to match src1/src2
            name = "src%d_i" % j
            setattr(self, name, getattr(cu, name))

        # re-export dest1_i, dest2_i, ... from the record
        for i in range(n_dst):
            j = i + 1  # name numbering to match dest1/2...
            name = "dest%d_i" % j
            setattr(self, name, getattr(cu, name))

        # convenience names
        self.rd = cu.rd
        self.wr = cu.wr
        self.go_rd_i = self.rd.go  # temporary naming
        self.go_wr_i = self.wr.go  # temporary naming
        self.rd_rel_o = self.rd.rel  # temporary naming
        self.req_rel_o = self.wr.rel  # temporary naming
        self.issue_i = cu.issue_i
        self.shadown_i = cu.shadown_i
        self.go_die_i = cu.go_die_i

        # operation / data input
        self.oper_i = cu.oper_i
        self.src_i = cu._src_i

        self.busy_o = cu.busy_o
        self.dest = cu._dest
        self.data_o = self.dest[0]  # Dest out
        self.done_o = cu.done_o

    def elaborate(self, platform):
        """Build and return the nmigen Module for this Computation Unit."""
        m = Module()
        m.submodules.alu = self.alu
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
        m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
        m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")

        # ALU only proceeds when all src are ready.  rd_rel_o is delayed
        # so combine it with go_rd_i.  if all bits are set we're good
        all_rd = Signal(reset_less=True)
        m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
                              (((~self.rd.rel) | self.rd.go).all()))

        # write_requests all done
        # req_done works because any one of the last of the writes
        # is enough, when combined with when read-phase is done (rst_l.q)
        wr_any = Signal(reset_less=True)
        req_done = Signal(reset_less=True)
        m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
        m.d.comb += wr_any.eq(self.wr.go.bool())
        m.d.comb += req_done.eq(rst_l.q & wr_any)

        # shadow/go_die
        reset = Signal(reset_less=True)
        rst_r = Signal(reset_less=True)  # reset latch off
        reset_w = Signal(self.n_dst, reset_less=True)
        reset_r = Signal(self.n_src, reset_less=True)
        m.d.comb += reset.eq(req_done | self.go_die_i)
        m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
        m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
        m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))

        # read-done,wr-proceed latch
        m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
        m.d.comb += rok_l.r.eq(self.alu.p.ready_o)  # off when ALU acks

        # wr-done, back-to-start latch
        m.d.comb += rst_l.s.eq(all_rd)  # set when read-phase is fully done
        m.d.comb += rst_l.r.eq(rst_r)  # *off* on issue

        # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
        m.d.sync += opc_l.s.eq(self.issue_i)  # set on issue
        m.d.sync += opc_l.r.eq(self.alu.n.valid_o & req_done)  # reset on ALU

        # src operand latch (not using go_wr_i)
        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
        m.d.sync += src_l.r.eq(reset_r)

        # dest operand latch (not using issue_i)
        m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
        m.d.sync += req_l.r.eq(reset_w)

        # create a latch/register for the operand
        oper_r = self.opsubsetkls()
        latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")

        # and for each output from the ALU
        drl = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            # data_r holds ALU *output* i and later drives self.dest[i],
            # so it takes the destination width of regspec entry i
            data_r = Signal(self.cu._get_dstwid(i), name=name,
                            reset_less=True)
            latchregister(m, self.get_out(i), data_r, req_l.q[i], name)
            drl.append(data_r)

        # pass the operation to the ALU
        m.d.comb += self.get_op().eq(oper_r)

        # create list of src/alu-src/src-latch.  override 1st and 2nd one
        # below.  in the case, for ALU and Logical pipelines, we assume RB
        # is the 2nd operand in the input "regspec".  see for example
        # soc.fu.alu.pipe_data.ALUInputData
        # TODO: assume RA is the 1st operand, zero_a detection is needed.
        sl = []
        for i in range(self.n_src):
            sl.append([self.src_i[i], self.get_in(i), src_l.q[i]])

        # if the operand subset has "zero_a" we implicitly assume that means
        # src_i[0] is an INT register type where zero can be multiplexed in,
        # instead.  see https://bugs.libre-soc.org/show_bug.cgi?id=336
        #if hasattr(oper_r, "zero_a"):
            # select zero immediate if opcode says so. however also change
            # the latch to trigger *from* the opcode latch instead.
            # ...
            # ...

        # if the operand subset has "imm_data" we implicitly assume that
        # means "this is an INT ALU/Logical FU jobbie, RB is multiplexed
        # with the immediate"
        if hasattr(oper_r, "imm_data"):
            # select immediate if opcode says so. however also change the
            # latch to trigger *from* the opcode latch instead.
            op_is_imm = oper_r.imm_data.imm_ok
            src2_or_imm = Signal(self.cu._get_srcwid(1), reset_less=True)
            src_sel = Signal(reset_less=True)
            m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
            m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                           self.src2_i))
            # overwrite 2nd src-latch with immediate-muxed stuff
            sl[1][0] = src2_or_imm
            sl[1][2] = src_sel

        # create a latch/register for src1/src2 (even if it is a copy of
        # an immediate)
        for i in range(self.n_src):
            src, alusrc, latch = sl[i]
            latchregister(m, src, alusrc, latch, name="src_r%d" % i)

        # -----
        # outputs
        # -----

        # all request signals gated by busy_o.  prevents picker problems
        m.d.comb += self.busy_o.eq(opc_l.q)  # busy out
        bro = Repl(self.busy_o, self.n_src)
        m.d.comb += self.rd.rel.eq(src_l.q & bro)  # src1/src2 req rel

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        # go_read is only valid for one clock!
        with m.If(all_rd):  # src operands ready, GO!
            with m.If(~self.alu.p.ready_o):  # no ACK yet
                m.d.comb += self.alu.p.valid_i.eq(1)  # so indicate valid

        brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
        # only proceed if ALU says its output is valid
        with m.If(self.alu.n.valid_o):
            # when ALU ready, write req release out. waits for shadow
            m.d.comb += self.wr.rel.eq(req_l.q & brd)
            # when output latch is ready, and ALU says ready, accept ALU
            # output
            with m.If(reset):
                # tells ALU "thanks got it"
                m.d.comb += self.alu.n.ready_i.eq(1)

        # output the data from the latch on go_write
        for i in range(self.n_dst):
            with m.If(self.wr.go[i]):
                m.d.comb += self.dest[i].eq(drl[i])

        return m

    def __iter__(self):
        """Yield the externally visible signals, inputs first."""
        yield self.rd.go
        yield self.wr.go
        yield self.issue_i
        yield self.shadown_i
        yield self.go_die_i
        yield from self.oper_i.ports()
        # every source operand (src1_i, src2_i, ...), however many exist
        yield from self.src_i
        yield self.busy_o
        yield self.rd.rel
        yield self.wr.rel
        # every destination; the first entry is data_o
        yield from self.dest

    def ports(self):
        """Return the signals produced by __iter__ as a list."""
        return list(self)
327
328
def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
    """Simulation process: push one operation through the Comp Unit.

    Drives issue, go_read and go_write on ``dut`` in sequence and returns
    the value read from ``dut.data_o`` one clock after ``wr.go[0]`` is
    raised.  A bare ``yield`` advances the simulation by one clock.

    :dut:    the MultiCompUnit under test
    :a:      value placed on src_i[0]
    :b:      value placed on src_i[1]
    :op:     value placed on oper_i.insn_type
    :inv_a:  value placed on oper_i.invert_a
    :imm:    value placed on oper_i.imm_data.imm
    :imm_ok: value placed on oper_i.imm_data.imm_ok
    """
    # start with issue low for one clock
    yield dut.issue_i.eq(0)
    yield

    # set up operands and operation, then pulse issue for one clock
    yield dut.src_i[0].eq(a)
    yield dut.src_i[1].eq(b)
    oper = dut.oper_i
    yield oper.insn_type.eq(op)
    yield oper.invert_a.eq(inv_a)
    yield oper.imm_data.imm.eq(imm)
    yield oper.imm_data.imm_ok.eq(imm_ok)
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield

    # raise go_read on both sources and clock until a read-release appears
    yield dut.rd.go.eq(0b11)
    rd_pending = 0
    while not rd_pending:
        yield
        rd_pending = yield dut.rd.rel
        print("rd_rel", rd_pending)
    yield
    yield dut.rd.go.eq(0)

    # report write-release / data once, then poll until release is seen
    wr_pending = yield dut.wr.rel
    result = yield dut.data_o
    print("req_rel", wr_pending, result)
    while True:
        wr_pending = yield dut.wr.rel
        result = yield dut.data_o
        print("req_rel", wr_pending, result)
        if wr_pending:
            break
        yield

    # pulse go_write for destination 0 and sample the output
    yield dut.wr.go[0].eq(1)
    yield
    result = yield dut.data_o
    print("result", result)
    yield dut.wr.go[0].eq(0)
    yield
    return result
368
369
def scoreboard_sim(dut):
    """Simulation process: run three OP_ADD operations and check results.

    Each case calls op_sim with a=5, b=2 and asserts on the value it
    returns.
    """
    cases = (
        # immediate selected in place of src2: 5 + 8
        (dict(inv_a=0, imm=8, imm_ok=1), 13),
        # plain register add: 5 + 2
        ({}, 7),
        # invert_a set: 65532 == ((~5) + 2) & 0xffff, presumably because
        # the 16-bit ALU inverts operand a before adding
        (dict(inv_a=1), 65532),
    )
    for extra_args, expected in cases:
        result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD,
                                   **extra_args)
        assert result == expected
380
381
def test_compunit():
    """Build a 16-bit MultiCompUnit, dump its RTLIL and simulate it.

    Writes "test_compunit1.il" and runs scoreboard_sim with a VCD trace
    in "test_compunit1.vcd".
    """
    from alu_hier import ALU
    from soc.fu.alu.alu_input_record import CompALUOpSubset

    top = Module()
    dut = MultiCompUnit(16, ALU(16), CompALUOpSubset)
    top.submodules.cu = dut

    # convert the unit on its own (not the wrapper module) to RTLIL
    il_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_compunit1.il", "w") as il_file:
        il_file.write(il_text)

    run_simulation(top, scoreboard_sim(dut), vcd_name='test_compunit1.vcd')
396
397
def test_compunit_regspec1():
    """Build a MultiCompUnit from a regspec, dump its RTLIL and simulate.

    Same flow as the integer-width test, but the register widths come
    from an (inspec, outspec) regspec pair instead of a single integer.
    Writes "test_compunit_regspec1.il" and a VCD trace with the matching
    name, so the trace of the integer-width test is not overwritten.
    """
    from alu_hier import ALU
    from soc.fu.alu.alu_input_record import CompALUOpSubset

    # two INT inputs and one INT output, each with bit-range '0:15'
    inspec = [('INT', 'a', '0:15'),
              ('INT', 'b', '0:15')]
    outspec = [('INT', 'o', '0:15'),
               ]

    regspec = (inspec, outspec)

    m = Module()
    alu = ALU(16)
    dut = MultiCompUnit(regspec, alu, CompALUOpSubset)
    m.submodules.cu = dut

    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_compunit_regspec1.il", "w") as f:
        f.write(vl)

    run_simulation(m, scoreboard_sim(dut),
                   vcd_name='test_compunit_regspec1.vcd')
419
420
if __name__ == '__main__':
    # when run as a script, execute both self-tests in order
    for selftest in (test_compunit, test_compunit_regspec1):
        selftest()