"""Computation Unit (aka "ALU Manager").

Manages a Pipeline or FSM, ensuring that the start and end time are 100%
monitored.  At no time may the ALU proceed without this module notifying
the Dependency Matrices.  At no time is a result production "abandoned".
This module blocks (indicates busy) starting from when it first receives
an opcode until it receives notification that
its result(s) have been successfully stored in the regfile(s)

Documented at http://libre-soc.org/3d_gpu/architecture/compunit
"""
13 from nmigen
import Module
, Signal
, Mux
, Elaboratable
, Repl
, Cat
, Const
14 from nmigen
.hdl
.rec
import (Record
, DIR_FANIN
, DIR_FANOUT
)
16 from nmutil
.latch
import SRLatch
, latchregister
17 from nmutil
.iocontrol
import RecordObject
18 from nmutil
.util
import rising_edge
20 from soc
.fu
.regspec
import RegSpec
, RegSpecALUAPI
def find_ok(fields):
    """find_ok helper function - finds field ending in "_ok"

    :fields: an iterable of field names (iterating a dict of fields
             yields its keys, which is how elaborate() uses it)

    Returns the first name that ends in "_ok", or None when no name
    matches, so callers must be prepared for the None case.
    """
    for field_name in fields:
        if field_name.endswith("_ok"):
            return field_name
    return None
def go_record(n, name):
    """Create the go/release handshake Record used by a Computation Unit.

    :n:    width (in bits) of both the go_i and rel_o fields
    :name: name given to the Record

    The Record has two fields: go_i (DIR_FANIN) and rel_o (DIR_FANOUT).
    Both are marked reset_less.  The Record is returned so that callers
    can store it and access .go_i / .rel_o on it.
    """
    r = Record([('go_i', n, DIR_FANIN),
                ('rel_o', n, DIR_FANOUT)], name=name)
    r.go_i.reset_less = True
    r.rel_o.reset_less = True
    return r
# see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
class CompUnitRecord(RegSpec, RecordObject):
    """CompUnitRecord

    base class for Computation Units, to provide a uniform API
    and allow "record.connect" etc. to be used, particularly when
    it comes to connecting multiple Computation Units up as a block

    LDSTCompUnitRecord should derive from this class and add the
    additional signals it requires

    :subkls: the class (not an instance) needed to construct the opcode
    :rwid:   either an integer (specifies width of all regs) or a "regspec"
    :n_src:  number of source operands, passed through to RegSpec
    :n_dst:  number of destination operands, passed through to RegSpec
    :name:   used to name the opcode record ("oper_i_<name>")

    see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
    """

    def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
        RegSpec.__init__(self, rwid, n_src, n_dst)
        RecordObject.__init__(self)
        # use the operand counts as recorded by RegSpec, not the raw args
        n_src, n_dst = self._n_src, self._n_dst

        # create source operands: src1_i, src2_i, ...
        src = []
        for i in range(n_src):
            j = i + 1  # name numbering to match src1/src2
            sname = "src%d_i" % j
            rw = self._get_srcwid(i)
            sreg = Signal(rw, name=sname, reset_less=True)
            setattr(self, sname, sreg)
            src.append(sreg)
        # ordered list of the source operands (MultiCompUnit reads _src_i)
        self._src_i = src

        # create dest operands: dest1_o, dest2_o, ...
        dst = []
        for i in range(n_dst):
            j = i + 1  # name numbering to match dest1/2...
            dname = "dest%d_o" % j
            rw = self._get_dstwid(i)
            # dreg = Data(rw, name=name) XXX ??? output needs to be a Data type?
            dreg = Signal(rw, name=dname, reset_less=True)
            setattr(self, dname, dreg)
            dst.append(dreg)
        # ordered list of the dest operands, mirroring _src_i
        self._dest = dst

        # operation / data input
        self.oper_i = subkls(name="oper_i_%s" % name)  # operand

        # create read/write and other scoreboard signalling
        self.rd = go_record(n_src, name="cu_rd")  # read in, req out
        self.wr = go_record(n_dst, name="cu_wr")  # write in, req out
        # read / write masks, one bit per operand
        self.rdmaskn = Signal(n_src, name="cu_rdmaskn_i", reset_less=True)
        self.wrmask = Signal(n_dst, name="cu_wrmask_o", reset_less=True)

        # issue in
        self.issue_i = Signal(name="cu_issue_i", reset_less=True)
        # shadow function, defaults to ON
        self.shadown_i = Signal(name="cu_shadown_i", reset=1)
        # go-die in
        self.go_die_i = Signal(name="cu_go_die_i")

        # status outputs
        self.busy_o = Signal(name="cu_busy_o", reset_less=True)  # fn busy out
        self.done_o = Signal(name="cu_done_o", reset_less=True)
        self.alu_done_o = Signal(name="cu_alu_done_o", reset_less=True)
112 class MultiCompUnit(RegSpecALUAPI
, Elaboratable
):
def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1, name=None,
             sync_rw=True):
    """MultiCompUnit

    * :rwid:        width of register latches (TODO: allocate per regspec)
    * :alu:         ALU (pipeline, FSM) - must conform to nmutil Pipe API
    * :opsubsetkls: subset of Decode2ExecuteType
    * :n_src:       number of src operands
    * :n_dst:       number of destination operands
    * :name:        name of the ALU submodule (defaults to "alu"); also
                    passed to CompUnitRecord
    * :sync_rw:     stored as self.sync_rw
                    NOTE(review): the default shown here is assumed,
                    confirm against callers
    """
    RegSpecALUAPI.__init__(self, rwid, alu)
    self.sync_rw = sync_rw
    self.alu_name = name or "alu"
    self.opsubsetkls = opsubsetkls
    self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst,
                                  name=name)
    # take the operand counts from the record rather than the arguments
    n_src, n_dst = self.n_src, self.n_dst = cu._n_src, cu._n_dst
    print("n_src %d n_dst %d" % (self.n_src, self.n_dst))

    # convenience names for src operands
    for i in range(n_src):
        j = i + 1  # name numbering to match src1/src2
        sname = "src%d_i" % j
        setattr(self, sname, getattr(cu, sname))

    # convenience names for dest operands.  the signals are also
    # collected, in order, so that self.dest[i] can be indexed.
    dest = []
    for i in range(n_dst):
        j = i + 1  # name numbering to match dest1/2...
        dname = "dest%d_o" % j
        dreg = getattr(cu, dname)
        setattr(self, dname, dreg)
        dest.append(dreg)

    # more convenience names
    self.rd = cu.rd
    self.wr = cu.wr
    self.rdmaskn = cu.rdmaskn
    self.wrmask = cu.wrmask
    self.alu_done_o = cu.alu_done_o
    self.go_rd_i = self.rd.go_i  # temporary naming
    self.go_wr_i = self.wr.go_i  # temporary naming
    self.rd_rel_o = self.rd.rel_o  # temporary naming
    self.req_rel_o = self.wr.rel_o  # temporary naming
    self.issue_i = cu.issue_i
    self.shadown_i = cu.shadown_i
    self.go_die_i = cu.go_die_i

    # operation / data input
    self.oper_i = cu.oper_i
    self.src_i = cu._src_i
    self.dest = dest

    self.busy_o = cu.busy_o

    self.o_data = self.dest[0]  # Dest out
    self.done_o = cu.done_o
def _mux_op(self, m, sl, op_is_imm, imm, i):
    """Multiplex an immediate into source operand *i*.

    * :m:         the Module to which the comb statements are added
    * :sl:        list of [src, alu-src, src-latch, req-gate] entries;
                  entry *i* is modified in place
    * :op_is_imm: condition selecting the immediate over the register
    * :imm:       the immediate value
    * :i:         index of the source operand to override

    Requires self.opc_l and self.src_i to be set up already.
    """
    # select imm if opcode says so.  however also change the latch
    # to trigger *from* the opcode latch instead.
    src_or_imm = Signal(self.cu._get_srcwid(i), reset_less=True)
    src_sel = Signal(reset_less=True)
    m.d.comb += src_sel.eq(Mux(op_is_imm, self.opc_l.q, sl[i][2]))
    m.d.comb += src_or_imm.eq(Mux(op_is_imm, imm, self.src_i[i]))
    # overwrite 1st src-latch with immediate-muxed stuff
    sl[i][0] = src_or_imm
    # without this store src_sel is never used and the latch trigger
    # would stay on the plain src latch
    sl[i][2] = src_sel
    sl[i][3] = ~op_is_imm  # change rd.rel[i] gate condition
def elaborate(self, platform):
    """Build the Computation Unit logic around self.alu.

    Creates the SR latches (src, opc, req, rst, rdok, alui, alu), wires
    the read/write release and go signals to them, latches the source
    operands into the ALU inputs and the ALU outputs into data
    registers, and drives busy_o, done_o, wrmask, rd.rel_o, wr.rel_o
    and the dest outputs.  Returns the Module.
    """
    m = Module()
    # NOTE(review): domain selection reconstructed from self.sync_rw,
    # confirm which domain each setting is meant to pick
    if self.sync_rw:
        rw_domain = m.d.sync
    else:
        rw_domain = m.d.comb

    # generate a pulse on system reset, to reset any latches, if needed
    system_reset = Signal(reset=1)
    m.d.sync += system_reset.eq(0)

    # add the ALU to the MultiCompUnit only if it is a "real" ALU
    # see AllFunctionUnits as to why: a FunctionUnitBaseMulti
    # only has one "real" ALU but multiple pseudo front-ends,
    # aka "ReservationStations" (ALUProxy "fronts")
    if isinstance(self.alu, Elaboratable):
        setattr(m.submodules, self.alu_name, self.alu)
    m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
    m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
    m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
    m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
    m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")
    # _mux_op needs access to the opcode latch
    self.opc_l, self.src_l = opc_l, src_l

    # ALU only proceeds when all src are ready.  rd_rel_o is delayed
    # so combine it with go_rd_i.  if all bits are set we're good
    all_rd = Signal(reset_less=True)
    m.d.comb += all_rd.eq(self.busy_o &  # rok_l.q & # XXX LOOP
                          (((~self.rd.rel_o) | self.rd.go_i).all()))

    # generate read-done pulse
    all_rd_pulse = Signal(reset_less=True)
    m.d.comb += all_rd_pulse.eq(rising_edge(m, all_rd))  # XXX LOOP

    # create rising pulse from alu valid condition.
    alu_done = self.cu.alu_done_o
    alu_pulse = Signal(reset_less=True)
    alu_pulsem = Signal(self.n_dst, reset_less=True)
    m.d.comb += alu_done.eq(self.alu.n.o_valid)
    m.d.comb += alu_pulse.eq(rising_edge(m, alu_done))
    m.d.comb += alu_pulsem.eq(Repl(alu_pulse, self.n_dst))

    # sigh bug where req_l gets both set and reset raised at same time
    prev_wr_go = Signal(self.n_dst)
    brd = Repl(self.busy_o, self.n_dst)
    m.d.sync += prev_wr_go.eq(self.wr.go_i & brd)

    # write_requests all done
    # req_done works because any one of the last of the writes
    # is enough, when combined with when read-phase is done (rst_l.q)
    wr_any = Signal(reset_less=True)
    req_done = Signal(reset_less=True)
    m.d.comb += self.done_o.eq(self.busy_o & ~self.wr.rel_o.bool())
    m.d.comb += wr_any.eq(self.wr.go_i.bool() | prev_wr_go.bool())
    m.d.comb += req_done.eq(wr_any & ~self.alu.n.i_ready &
                            (req_l.q == 0))
    # argh, complicated hack: if there are no regs to write,
    # instead of waiting for regs that are never going to happen,
    # we indicate "done" when the ALU is "done"
    with m.If((self.wrmask == 0) &
              self.alu.n.i_ready & self.alu.n.o_valid & self.busy_o):
        m.d.comb += req_done.eq(1)

    # reset conditions (go_die forces everything off)
    reset = Signal(reset_less=True)
    rst_r = Signal(reset_less=True)  # reset latch off
    reset_w = Signal(self.n_dst, reset_less=True)
    reset_r = Signal(self.n_src, reset_less=True)
    m.d.comb += reset.eq(req_done | self.go_die_i)
    m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
    m.d.comb += reset_w.eq(self.wr.go_i |
                           Repl(self.go_die_i, self.n_dst))
    m.d.comb += reset_r.eq(self.rd.go_i |
                           Repl(rst_r, self.n_src))

    # read-done,wr-proceed latch
    rw_domain += rok_l.s.eq(self.issue_i)  # set up when issue starts
    rw_domain += rok_l.r.eq(self.alu.n.o_valid &
                            self.busy_o)  # ALUdone LOOP

    # wr-done, back-to-start latch
    rw_domain += rst_l.s.eq(all_rd)  # set when read-phase is fully done
    rw_domain += rst_l.r.eq(rst_r)  # *off* on issue

    # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
    m.d.sync += opc_l.s.eq(self.issue_i)  # set on issue
    m.d.sync += opc_l.r.eq(req_done)  # reset on ALU

    # src operand latch (not using go_wr_i) ANDed with rdmask
    rdmaskn = Signal(self.n_src)
    latchregister(m, self.rdmaskn, rdmaskn, self.issue_i, name="rdmask_l")
    m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src) & ~rdmaskn)
    m.d.sync += src_l.r.eq(reset_r)

    # dest operand latch (not using issue_i)
    rw_domain += req_l.s.eq(alu_pulsem & self.wrmask)
    m.d.comb += req_l.r.eq(reset_w | prev_wr_go |
                           Repl(system_reset, self.n_dst))

    # pass operation to the ALU (sync: plenty time to wait for src reads)
    # NOTE(review): assumes RegSpecALUAPI provides get_op() alongside
    # get_in() / get_out() - confirm
    op = self.get_op()
    with m.If(self.issue_i):
        m.d.sync += op.eq(self.oper_i)

    # and for each output from the ALU: capture when ALU output is valid
    drl = []   # latched data registers, one per dest
    wrok = []  # per-dest "write is wanted" conditions
    for i in range(self.n_dst):
        name = "data_r%d" % i
        lro = self.get_out(i)
        # default: output is always ok, unless an "_ok" field says otherwise
        ok = Const(1, 1)
        data_r_ok = Const(1, 1)
        if isinstance(lro, Record):
            print("wr fields", i, lro, lro.fields)
            data_r = Record.like(lro, name=name)
            # bye-bye abstract interface design..
            fname = find_ok(lro.fields)
            # NOTE(review): guard added because find_ok may find nothing;
            # the always-ok defaults above are then kept - confirm
            if fname:
                ok = getattr(lro, fname)
                data_r_ok = getattr(data_r, fname)
            # write-ok is based on the incoming output's ok flag, gated
            # by busy.  also including the latched ok flag was tried:
            # XXX fails - wrok.append((ok|data_r_ok) & self.busy_o)
            wrok.append(ok & self.busy_o)
        else:
            data_r = Signal.like(lro, name=name)
            # really should retire this but it's part of unit tests
            wrok.append(ok & self.busy_o)
        #latchregister(m, lro, data_r, ok & self.busy_o, name=name)
        latchregister(m, lro, data_r, alu_pulse, name=name)
        # this comes after latchregister so that it overrides it on issue
        with m.If(self.issue_i):
            m.d.comb += data_r.eq(0)
        drl.append(data_r)

    # ok, above we collated anything with an "ok" on the output side
    # now actually use those to create a write-mask.  this basically
    # is now the Function Unit API tells the Comp Unit "do not request
    # a regfile port because this particular output is not valid"
    m.d.comb += self.wrmask.eq(Cat(*wrok))

    # create list of src/alu-src/src-latch.  override 1st and 2nd one below.
    # in the case, for ALU and Logical pipelines, we assume RB is the
    # 2nd operand in the input "regspec".  see for example
    # soc.fu.alu.pipe_data.ALUInputData
    sl = []
    print("src_i", self.src_i)
    for i in range(self.n_src):
        sl.append([self.src_i[i], self.get_in(i), src_l.q[i], Const(1, 1)])

    # if the operand subset has "zero_a" we implicitly assume that means
    # src_i[0] is an INT reg type where zero can be multiplexed in, instead.
    # see https://bugs.libre-soc.org/show_bug.cgi?id=336
    if hasattr(op, "zero_a"):
        # select zero imm if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        self._mux_op(m, sl, op.zero_a, 0, 0)

    # if the operand subset has "imm_data" we implicitly assume that means
    # "this is an INT ALU/Logical FU jobbie, RB is muxed with the immediate"
    if hasattr(op, "imm_data"):
        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        op_is_imm = op.imm_data.ok
        imm = op.imm_data.data
        self._mux_op(m, sl, op_is_imm, imm, 1)

    # create a latch/register for src1/src2 (even if it is a copy of imm)
    for i in range(self.n_src):
        src, alusrc, latch, _ = sl[i]
        reg = latchregister(m, src, alusrc, latch, name="src_r%d" % i)
        # rdmask stops src latches from being set.  clear all if not busy
        with m.If(~self.busy_o):
            m.d.sync += reg.eq(0)

    # -----
    # ALU connection / interaction
    # -----

    # on a go_read, tell the ALU we're accepting data.
    m.submodules.alui_l = alui_l = SRLatch(False, name="alui")
    m.d.comb += self.alu.p.i_valid.eq(alui_l.q)
    m.d.sync += alui_l.r.eq(self.alu.p.o_ready & alui_l.q)
    m.d.comb += alui_l.s.eq(all_rd_pulse)

    # ALU output "ready" side.  alu "ready" indication stays hi until
    # the ALU output is seen to be valid (alu_l is reset on n.o_valid)
    m.submodules.alu_l = alu_l = SRLatch(False, name="alu")
    m.d.comb += self.alu.n.i_ready.eq(alu_l.q)
    m.d.sync += alu_l.r.eq(self.alu.n.o_valid & alu_l.q)
    m.d.comb += alu_l.s.eq(all_rd_pulse)  # XXX LOOP

    # -----
    # outputs
    # -----

    # get req gate conditions (after _mux_op has had a chance to alter them)
    slg = Cat(*(entry[3] for entry in sl))
    # all request signals gated by busy_o.  prevents picker problems
    m.d.comb += self.busy_o.eq(opc_l.q)  # busy out

    # read-release gated by busy (and read-mask)
    if True:  # self.sync_rw: - experiment (doesn't work)
        bro = Repl(self.busy_o, self.n_src)
    else:
        bro = Repl(self.busy_o | self.issue_i, self.n_src)
    m.d.comb += self.rd.rel_o.eq(src_l.q & bro & slg)

    # write-release gated by busy and by shadow (and write-mask)
    brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
    m.d.comb += self.wr.rel_o.eq(req_l.q_int & brd)

    # output the data from the latch on go_write
    for i in range(self.n_dst):
        with m.If(self.wr.go_i[i] & self.busy_o):
            m.d.comb += self.dest[i].eq(drl[i])

    return m
391 def get_fu_out(self
, i
):
400 yield from self
.oper_i
.ports()