start to morph MultiCompUnit to take "regspec" as the way to decide the latch and
[soc.git] / src / soc / experiment / compalu_multi.py
1 from nmigen.compat.sim import run_simulation
2 from nmigen.cli import verilog, rtlil
3 from nmigen import Module, Signal, Mux, Elaboratable, Repl, Array, Record
4 from nmigen.hdl.rec import (DIR_FANIN, DIR_FANOUT)
5
6 from nmutil.latch import SRLatch, latchregister
7 from nmutil.iocontrol import RecordObject
8
9 from soc.decoder.power_decoder2 import Data
10 from soc.decoder.power_enums import InternalOp
11
12
13 """ Computation Unit (aka "ALU Manager").
14
15 This module runs a "revolving door" set of three latches, based on
16 * Issue
17 * Go_Read
18 * Go_Write
19 where one of them cannot be set on any given cycle.
20
21 * When issue is first raised, a busy signal is sent out.
22 The src1 and src2 registers and the operand can be latched in
23 at this point
24
25 * Read request is set, which is acknowledged through the Scoreboard
26 to the priority picker, which generates (one and only one) Go_Read
27 at a time. One of those will (eventually) be this Computation Unit.
28
29 * Once Go_Read is set, the src1/src2/operand latch door shuts (locking
30 src1/src2/operand in place), and the ALU is told to proceed.
31
32 * when the ALU pipeline is ready, this activates "write request release",
33 and the ALU's output is captured into a temporary register.
34
35 * Write request release is *HELD UP* (prevented from proceeding) if shadowN
36 is asserted LOW. This is how all speculation, precise exceptions,
37 predication - everything - is achieved.
38
39 * Write request release will go through a similar process as Read request,
40 resulting (eventually) in Go_Write being asserted.
41
42 * When Go_Write is asserted, two things happen: (1) the data in the temp
43 register is placed combinatorially onto the output, and (2) the
44 req_l latch is cleared, busy is dropped, and the Comp Unit is back
45 through its revolving door to do another task.
46
47 Note that the read and write latches are held synchronously for one cycle,
48 i.e. that when Go_Read comes in, one cycle is given in which the incoming
49 register (broadcast over a Regfile Read Port) may have time to be latched.
50
51 It is REQUIRED that Go_Read be held valid only for one cycle, and it is
52 REQUIRED that the corresponding Read_Req be dropped exactly one cycle after
53 Go_Read is asserted HI.
54
55 Likewise for Go_Write: this is asserted for one cycle, and Req_Writes must
56 likewise be dropped exactly one cycle after assertion of Go_Write.
57
58 When Go_Die is asserted then strictly speaking the entire FSM should be
59 fully reset and that includes sending a cancellation request to the ALU.
60 (XXX TODO: alu "go die" is not presently wired up)
61 """
62
def go_record(n, name):
    """Create a named go/rel handshake Record.

    The record has two fields, each ``n`` bits wide:

    * ``go``  - declared with direction DIR_FANIN
    * ``rel`` - declared with direction DIR_FANOUT

    Both fields are marked ``reset_less`` before the record is returned.

    :n: width (in bits) of both the ``go`` and ``rel`` fields
    :name: name given to the Record
    """
    layout = [
        ('go', n, DIR_FANIN),
        ('rel', n, DIR_FANOUT),
    ]
    rec = Record(layout, name=name)
    for field in (rec.go, rec.rel):
        field.reset_less = True
    return rec
69
70
class CompUnitRecord(RecordObject):
    """Uniform signal bundle for Computation Units.

    Gives every Computation Unit the same set of named signals so that
    "record.connect" etc. can be used, particularly when wiring multiple
    Computation Units together as a block (very laborious by hand).

    LDSTCompUnitRecord should derive from this class and add the
    additional signals it requires.

    :subkls: the class (not an instance) needed to construct the opcode
    :rwid: width of each source / destination register signal
    :n_src: number of source signals created (src1_i, src2_i, ...)
    :n_dst: number of destination signals created (dest1_i, dest2_i, ...)
    :name: optional name, handed to RecordObject
    """
    def __init__(self, subkls, rwid, n_src, n_dst, name=None):
        RecordObject.__init__(self, name)
        self._n_src = n_src
        self._n_dst = n_dst
        self._rwid = rwid
        self._subkls = subkls

        # source operands: numbered from 1 so the attribute names come
        # out as src1_i, src2_i, ...
        src_regs = []
        for idx in range(1, n_src + 1):
            regname = "src%d_i" % idx
            reg = Signal(rwid, name=regname, reset_less=True)
            setattr(self, regname, reg)
            src_regs.append(reg)
        self._src_i = src_regs

        # destinations: numbered from 1 likewise (dest1_i, dest2_i, ...)
        dst_regs = []
        for idx in range(1, n_dst + 1):
            regname = "dest%d_i" % idx
            reg = Signal(rwid, name=regname, reset_less=True)
            setattr(self, regname, reg)
            dst_regs.append(reg)
        self._dest = dst_regs

        # handshake records: one "go" and one "rel" bit per operand
        self.rd = go_record(n_src, name="rd")   # read in, req out
        self.wr = go_record(n_dst, name="wr")   # write in, req out

        # control inputs
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.shadown_i = Signal(reset=1)        # shadow fn, defaults to ON
        self.go_die_i = Signal()                # go die (reset)

        # operation / data input: an instance of the supplied class
        self.oper_i = subkls()

        # status outputs (busy/done)
        self.busy_o = Signal(reset_less=True)   # fn busy out
        self.done_o = Signal(reset_less=True)
120
121
class MultiCompUnit(Elaboratable):
    """Computation Unit ("ALU Manager") wrapped around a single ALU.

    Holds a CompUnitRecord (``self.cu``) and re-exports its signals as
    attributes of this object.  ``elaborate`` builds the latches that
    sequence issue -> go-read -> ALU -> go-write around ``self.alu``.
    """

    def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
        """MultiCompUnit

        * :rwid: width of register latches (TODO: allocate per regspec)
        * :alu: the ALU (pipeline, FSM) - must conform to nmutil Pipe API
        * :opsubsetkls: the subset of Decode2ExecuteType
        * :n_src: number of src operands
        * :n_dst: number of destination operands

        NOTE(review): elaborate() muxes the immediate into the *second*
        source operand (index 1, src2_i), so it needs n_src >= 2, and
        data_o aliases the first destination, so n_dst >= 1 is needed.
        """
        self.n_src, self.n_dst = n_src, n_dst
        self.rwid = rwid
        self.opsubsetkls = opsubsetkls
        self.alu = alu # actual ALU - set as a "submodule" of the CU
        self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)

        # mirror the record's per-operand signals onto this object under
        # the same names (src1_i, src2_i, ... / dest1_i, dest2_i, ...)
        for i in range(n_src):
            j = i + 1 # name numbering to match src1/src2
            name = "src%d_i" % j
            setattr(self, name, getattr(cu, name))

        for i in range(n_dst):
            j = i + 1 # name numbering to match dest1/2...
            name = "dest%d_i" % j
            setattr(self, name, getattr(cu, name))

        # convenience names
        self.rd = cu.rd
        self.wr = cu.wr
        self.go_rd_i = self.rd.go # temporary naming
        self.go_wr_i = self.wr.go # temporary naming
        self.rd_rel_o = self.rd.rel # temporary naming
        self.req_rel_o = self.wr.rel # temporary naming
        self.issue_i = cu.issue_i
        self.shadown_i = cu.shadown_i
        self.go_die_i = cu.go_die_i

        # operation / data input
        self.oper_i = cu.oper_i
        self.src_i = cu._src_i

        self.busy_o = cu.busy_o
        self.dest = cu._dest
        self.data_o = self.dest[0] # Dest out
        self.done_o = cu.done_o

    def elaborate(self, platform):
        """Build and return the Module implementing the Computation Unit.

        The ALU is added as a submodule together with five SR latches
        (src, opc, req, rst, rdok).
        """
        m = Module()
        m.submodules.alu = self.alu
        # NOTE(review): first positional SRLatch argument is presumably
        # "sync" (the other latches pass sync=False by keyword) - confirm
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
        m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
        m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")

        # ALU only proceeds when all src are ready.  rd_rel_o is delayed
        # so combine it with go_rd_i.  if all bits are set we're good
        all_rd = Signal(reset_less=True)
        m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
                              (((~self.rd.rel) | self.rd.go).all()))

        # write_requests all done
        # req_done works because any one of the last of the writes
        # is enough, when combined with when read-phase is done (rst_l.q)
        wr_any = Signal(reset_less=True)
        req_done = Signal(reset_less=True)
        m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
        m.d.comb += wr_any.eq(self.wr.go.bool())
        m.d.comb += req_done.eq(rst_l.q & wr_any)

        # shadow/go_die
        reset = Signal(reset_less=True)
        rst_r = Signal(reset_less=True) # reset latch off
        reset_w = Signal(self.n_dst, reset_less=True)
        reset_r = Signal(self.n_src, reset_less=True)
        m.d.comb += reset.eq(req_done | self.go_die_i)
        m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
        m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
        m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))

        # read-done,wr-proceed latch
        m.d.comb += rok_l.s.eq(self.issue_i) # set up when issue starts
        m.d.comb += rok_l.r.eq(self.alu.p_ready_o) # off when ALU acknowledges

        # wr-done, back-to-start latch
        m.d.comb += rst_l.s.eq(all_rd) # set when read-phase is fully done
        m.d.comb += rst_l.r.eq(rst_r) # *off* on issue

        # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
        m.d.sync += opc_l.s.eq(self.issue_i) # set on issue
        m.d.sync += opc_l.r.eq(self.alu.n_valid_o & req_done) # reset on ALU

        # src operand latch (not using go_wr_i)
        m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
        m.d.sync += src_l.r.eq(reset_r)

        # dest operand latch (not using issue_i)
        m.d.sync += req_l.s.eq(Repl(all_rd, self.n_dst))
        m.d.sync += req_l.r.eq(reset_w)

        # create a latch/register for the operand
        oper_r = self.opsubsetkls()
        latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_r")

        # and for each output from the ALU
        drl = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            data_r = Signal(self.rwid, name=name, reset_less=True)
            latchregister(m, self.alu.out[i], data_r, req_l.q[i], name)
            drl.append(data_r)

        # pass the operation to the ALU
        m.d.comb += self.alu.op.eq(oper_r)

        # create list of src/alu-src/src-latch.  override 2nd one below
        sl = []
        for i in range(self.n_src):
            sl.append([self.src_i[i], self.alu.i[i], src_l.q[i]])

        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        op_is_imm = oper_r.imm_data.imm_ok
        src2_or_imm = Signal(self.rwid, reset_less=True)
        src_sel = Signal(reset_less=True)
        m.d.comb += src_sel.eq(Mux(op_is_imm, opc_l.q, src_l.q[1]))
        m.d.comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                       self.src2_i))
        # overwrite 2nd src-latch with immediate-muxed stuff
        sl[1][0] = src2_or_imm
        sl[1][2] = src_sel

        # create a latch/register for src1/src2
        for i in range(self.n_src):
            src, alusrc, latch = sl[i]
            latchregister(m, src, alusrc, latch, name="src_r%d" % i)

        # -----
        # outputs
        # -----

        # all request signals gated by busy_o.  prevents picker problems
        m.d.comb += self.busy_o.eq(opc_l.q) # busy out
        bro = Repl(self.busy_o, self.n_src)
        m.d.comb += self.rd.rel.eq(src_l.q & bro) # src1/src2 req rel

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        # go_read is only valid for one clock!
        with m.If(all_rd): # src operands ready, GO!
            with m.If(~self.alu.p_ready_o): # no ACK yet
                m.d.comb += self.alu.p_valid_i.eq(1) # so indicate valid

        brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
        # only proceed if ALU says its output is valid
        with m.If(self.alu.n_valid_o):
            # when ALU ready, write req release out.  waits for shadow
            m.d.comb += self.wr.rel.eq(req_l.q & brd)
            # when output latch is ready, and ALU says ready, accept ALU output
            with m.If(reset):
                m.d.comb += self.alu.n_ready_i.eq(1) # tells ALU "thanks got it"

        # output the data from the latch on go_write
        for i in range(self.n_dst):
            with m.If(self.wr.go[i]):
                m.d.comb += self.dest[i].eq(drl[i])

        return m

    def __iter__(self):
        """Yield the signals that make up this unit's port list.

        All source and destination signals are yielded, however many
        n_src / n_dst were requested.  For the defaults (n_src=2,
        n_dst=1) this is src1_i, src2_i and data_o.
        """
        yield self.rd.go
        yield self.wr.go
        yield self.issue_i
        yield self.shadown_i
        yield self.go_die_i
        yield from self.oper_i.ports()
        yield from self.src_i # src1_i, src2_i, ...
        yield self.busy_o
        yield self.rd.rel
        yield self.wr.rel
        yield from self.dest # dest[0] is data_o

    def ports(self):
        """Return the port signals (see __iter__) as a list."""
        return list(self)
307
308
def op_sim(dut, a, b, op, inv_a=0, imm=0, imm_ok=0):
    """Simulation process: push one operation through ``dut``.

    Sets up the two source operands and the operation fields, pulses
    issue_i for one cycle, asserts go-read on both sources until a
    read-release is seen, waits for a write-release, pulses go-write on
    destination 0 and returns the value then read from data_o.

    :dut: the unit under test (uses issue_i, src_i, oper_i, rd, wr, data_o)
    :a: value driven onto src_i[0]
    :b: value driven onto src_i[1]
    :op: value driven onto oper_i.insn_type
    :inv_a: value driven onto oper_i.invert_a
    :imm: value driven onto oper_i.imm_data.imm
    :imm_ok: value driven onto oper_i.imm_data.imm_ok
    """
    oper = dut.oper_i

    # start from issue low for one cycle
    yield dut.issue_i.eq(0)
    yield

    # present operands and operation, raising issue last
    stimulus = (
        (dut.src_i[0], a),
        (dut.src_i[1], b),
        (oper.insn_type, op),
        (oper.invert_a, inv_a),
        (oper.imm_data.imm, imm),
        (oper.imm_data.imm_ok, imm_ok),
        (dut.issue_i, 1),
    )
    for sig, value in stimulus:
        yield sig.eq(value)
    yield
    # issue is held for exactly one cycle
    yield dut.issue_i.eq(0)
    yield

    # go-read on both sources; step until any read-release bit is seen
    yield dut.rd.go.eq(0b11)
    rd_rel = 0
    while not rd_rel:
        yield
        rd_rel = yield dut.rd.rel
        print ("rd_rel", rd_rel)
    yield
    yield dut.rd.go.eq(0)

    # sample once, then poll each cycle until a write-release shows up
    wr_rel = yield dut.wr.rel
    result = yield dut.data_o
    print ("req_rel", wr_rel, result)
    while True:
        wr_rel = yield dut.wr.rel
        result = yield dut.data_o
        print ("req_rel", wr_rel, result)
        if wr_rel:
            break
        yield

    # one-cycle go-write on destination 0, reading the output meanwhile
    yield dut.wr.go[0].eq(1)
    yield
    result = yield dut.data_o
    print ("result", result)
    yield dut.wr.go[0].eq(0)
    yield
    return result
348
349
def scoreboard_sim(dut):
    """Simulation process: run three OP_ADD operations and check results.

    Every case uses a=5, b=2; only the optional op_sim keyword
    arguments differ.  Each result must equal the expected value.
    """
    cases = (
        (dict(inv_a=0, imm=8, imm_ok=1), 13),   # immediate flagged ok
        (dict(), 7),                            # plain add
        (dict(inv_a=1), 65532),                 # invert_a set
    )
    for kwargs, expected in cases:
        result = yield from op_sim(dut, 5, 2, InternalOp.OP_ADD, **kwargs)
        assert result == expected
360
361
def test_scoreboard():
    """Build a 16-bit MultiCompUnit around ALU, dump RTLIL, and simulate.

    Writes the RTLIL text to "test_compalu.il" and runs scoreboard_sim
    with a VCD trace written to "test_compalu.vcd".
    """
    from alu_hier import ALU
    from soc.fu.alu.alu_input_record import CompALUOpSubset

    width = 16
    dut = MultiCompUnit(width, ALU(width), CompALUOpSubset)

    # the simulation runs on a top-level module that contains the unit
    top = Module()
    top.submodules.cu = dut

    il_text = rtlil.convert(dut, ports=dut.ports())
    with open("test_compalu.il", "w") as il_file:
        il_file.write(il_text)

    run_simulation(top, scoreboard_sim(dut), vcd_name='test_compalu.vcd')
376
# Script entry point: when this file is executed directly (rather than
# imported), run the scoreboard test.
if __name__ == '__main__':
    test_scoreboard()