"""Computation Unit (aka "ALU Manager").

Manages a Pipeline or FSM, ensuring that the start and end time are 100%
monitored.  At no time may the ALU proceed without this module notifying
the Dependency Matrices.  At no time is a result production "abandoned".
This module blocks (indicates busy) starting from when it first receives
an opcode until it receives notification that
its result(s) have been successfully stored in the regfile(s).

Documented at http://libre-soc.org/3d_gpu/architecture/compunit
"""
13 from nmigen
.compat
.sim
import run_simulation
, Settle
14 from nmigen
.cli
import verilog
, rtlil
15 from nmigen
import Module
, Signal
, Mux
, Elaboratable
, Repl
, Array
, Cat
, Const
16 from nmigen
.hdl
.rec
import (Record
, DIR_FANIN
, DIR_FANOUT
)
18 from nmutil
.latch
import SRLatch
, latchregister
19 from nmutil
.iocontrol
import RecordObject
21 from soc
.decoder
.power_decoder2
import Data
22 from soc
.decoder
.power_enums
import InternalOp
23 from soc
.fu
.regspec
import RegSpec
, RegSpecALUAPI
def go_record(n, name):
    """Build the go/rel handshake Record used for a group of ports.

    :n: width (in bits) of both the ``go`` and the ``rel`` field
    :name: name given to the Record

    Returns a Record with a ``go`` field (DIR_FANIN) and a ``rel`` field
    (DIR_FANOUT).  Both fields are marked reset_less.
    """
    r = Record([('go', n, DIR_FANIN),
                ('rel', n, DIR_FANOUT)], name=name)
    r.go.reset_less = True
    r.rel.reset_less = True
    # callers assign the result (e.g. to .rd / .wr) and then access
    # its .go and .rel fields, so the record must be handed back.
    return r
33 # see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
class CompUnitRecord(RegSpec, RecordObject):
    """CompUnitRecord

    base class for Computation Units, to provide a uniform API
    and allow "record.connect" etc. to be used, particularly when
    it comes to connecting multiple Computation Units up as a block

    LDSTCompUnitRecord should derive from this class and add the
    additional signals it requires

    :subkls: the class (not an instance) needed to construct the opcode
    :rwid: either an integer (specifies width of all regs) or a "regspec"
    :n_src: number of source operands, passed through to RegSpec
    :n_dst: number of destination operands, passed through to RegSpec
    :name: name of the record, passed through to RecordObject

    see https://libre-soc.org/3d_gpu/architecture/regfile/ section on regspecs
    """
    def __init__(self, subkls, rwid, n_src=None, n_dst=None, name=None):
        RegSpec.__init__(self, rwid, n_src, n_dst)
        RecordObject.__init__(self, name)

        # use the operand counts as established by RegSpec, not the
        # (possibly None) constructor arguments
        n_src, n_dst = self._n_src, self._n_dst

        # create source operands: one Signal per operand, available both
        # as an attribute (src1_i, src2_i...) and, in order, via _src_i
        src = []
        for i in range(n_src):
            j = i + 1  # name numbering to match src1/src2
            opname = "src%d_i" % j
            rw = self._get_srcwid(i)
            sreg = Signal(rw, name=opname, reset_less=True)
            setattr(self, opname, sreg)
            src.append(sreg)
        self._src_i = src

        # create dest operands: same scheme (dest1_o, dest2_o... / _dest)
        dst = []
        for i in range(n_dst):
            j = i + 1  # name numbering to match dest1/2...
            opname = "dest%d_o" % j
            rw = self._get_dstwid(i)
            dreg = Signal(rw, name=opname, reset_less=True)
            setattr(self, opname, dreg)
            dst.append(dreg)
        self._dest = dst

        # operation / data input
        self.oper_i = subkls(name="oper_i")  # operand

        # create read/write and other scoreboard signalling
        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()  # go die (reset)

        # status outputs
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.done_o = Signal(reset_less=True)
94 class MultiCompUnit(RegSpecALUAPI
, Elaboratable
):
def __init__(self, rwid, alu, opsubsetkls, n_src=2, n_dst=1):
    """MultiCompUnit

    * :rwid:        width of register latches (TODO: allocate per regspec)
    * :alu:         ALU (pipeline, FSM) - must conform to nmutil Pipe API
    * :opsubsetkls: subset of Decode2ExecuteType
    * :n_src:       number of src operands
    * :n_dst:       number of destination operands
    """
    RegSpecALUAPI.__init__(self, rwid, alu)
    self.opsubsetkls = opsubsetkls
    self.cu = cu = CompUnitRecord(opsubsetkls, rwid, n_src, n_dst)
    # the record decides the final operand counts: adopt them both as
    # locals and as attributes
    n_src, n_dst = self.n_src, self.n_dst = cu._n_src, cu._n_dst
    print ("n_src %d n_dst %d" % (self.n_src, self.n_dst))

    # convenience names for src operands
    for i in range(n_src):
        j = i + 1  # name numbering to match src1/src2
        name = "src%d_i" % j
        setattr(self, name, getattr(cu, name))

    # convenience names for dest operands.  the same signals are also
    # collected, in order, so that they can be indexed as self.dest[i]
    dest = []
    for i in range(n_dst):
        j = i + 1  # name numbering to match dest1/2...
        name = "dest%d_o" % j
        dreg = getattr(cu, name)
        setattr(self, name, dreg)
        dest.append(dreg)

    # more convenience names
    self.rd = cu.rd
    self.wr = cu.wr
    self.go_rd_i = self.rd.go  # temporary naming
    self.go_wr_i = self.wr.go  # temporary naming
    self.rd_rel_o = self.rd.rel  # temporary naming
    self.req_rel_o = self.wr.rel  # temporary naming
    self.issue_i = cu.issue_i
    self.shadown_i = cu.shadown_i
    self.go_die_i = cu.go_die_i

    # operation / data input
    self.oper_i = cu.oper_i
    self.src_i = cu._src_i
    self.dest = dest

    self.busy_o = cu.busy_o
    self.data_o = self.dest[0]  # Dest out
    self.done_o = cu.done_o
def _mux_op(self, m, sl, op_is_imm, imm, i):
    """Multiplex an immediate into source operand ``i``.

    :m: the Module to which the combinatorial statements are added
    :sl: list of per-operand [src, alu-src, latch, gate] entries;
         entry ``i`` is modified in place
    :op_is_imm: condition selecting the immediate over the register
    :imm: the immediate value (or constant) to select
    :i: index of the source operand being overridden
    """
    # select imm if opcode says so.  however also change the latch
    # to trigger *from* the opcode latch instead.
    src_or_imm = Signal(self.cu._get_srcwid(i), reset_less=True)
    src_sel = Signal(reset_less=True)
    m.d.comb += src_sel.eq(Mux(op_is_imm, self.opc_l.q, self.src_l.q[i]))
    m.d.comb += src_or_imm.eq(Mux(op_is_imm, imm, self.src_i[i]))
    # overwrite 1st src-latch with immediate-muxed stuff
    sl[i][0] = src_or_imm
    # slot 2 is the latch-enable: without this, src_sel would be
    # computed but never used and the latch would keep its old trigger
    sl[i][2] = src_sel
    sl[i][3] = ~op_is_imm  # change rd.rel[i] gate condition
def elaborate(self, platform):
    """Build the Computation Unit logic around the ALU.

    :platform: unused here

    Creates the latches (src, opc, req, rst, rdok), registers the
    operation, the source operands and the ALU outputs, and wires up
    the rd/wr go/rel handshakes.  Returns the constructed Module.
    """
    m = Module()
    m.submodules.alu = self.alu
    m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
    m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
    m.submodules.req_l = req_l = SRLatch(False, self.n_dst, name="req")
    m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")
    m.submodules.rok_l = rok_l = SRLatch(sync=False, name="rdok")
    # _mux_op needs access to these two latches
    self.opc_l, self.src_l = opc_l, src_l

    # ALU only proceeds when all src are ready.  rd_rel_o is delayed
    # so combine it with go_rd_i.  if all bits are set we're good
    all_rd = Signal(reset_less=True)
    m.d.comb += all_rd.eq(self.busy_o & rok_l.q &
                          (((~self.rd.rel) | self.rd.go).all()))

    # write_requests all done
    # req_done works because any one of the last of the writes
    # is enough, when combined with when read-phase is done (rst_l.q)
    wr_any = Signal(reset_less=True)
    req_done = Signal(reset_less=True)
    m.d.comb += self.done_o.eq(self.busy_o & ~(self.wr.rel.bool()))
    m.d.comb += wr_any.eq(self.wr.go.bool())
    m.d.comb += req_done.eq(rst_l.q & wr_any)

    # create rising pulse from alu valid condition.
    alu_done = Signal(reset_less=True)
    alu_done_dly = Signal(reset_less=True)
    alu_pulse = Signal(reset_less=True)
    m.d.comb += alu_done.eq(self.alu.n.valid_o)
    m.d.sync += alu_done_dly.eq(alu_done)
    m.d.comb += alu_pulse.eq(alu_done & ~alu_done_dly)

    # reset conditions for the latches.  go_die_i forces all of them
    reset = Signal(reset_less=True)
    rst_r = Signal(reset_less=True)  # reset latch off
    reset_w = Signal(self.n_dst, reset_less=True)
    reset_r = Signal(self.n_src, reset_less=True)
    m.d.comb += reset.eq(req_done | self.go_die_i)
    m.d.comb += rst_r.eq(self.issue_i | self.go_die_i)
    m.d.comb += reset_w.eq(self.wr.go | Repl(self.go_die_i, self.n_dst))
    m.d.comb += reset_r.eq(self.rd.go | Repl(self.go_die_i, self.n_src))

    # read-done,wr-proceed latch
    m.d.comb += rok_l.s.eq(self.issue_i)  # set up when issue starts
    m.d.comb += rok_l.r.eq(self.alu.p.ready_o)  # off when ALU acknowledges

    # wr-done, back-to-start latch
    m.d.comb += rst_l.s.eq(all_rd)  # set when read-phase is fully done
    m.d.comb += rst_l.r.eq(rst_r)  # *off* on issue

    # opcode latch (not using go_rd_i) - inverted so that busy resets to 0
    m.d.sync += opc_l.s.eq(self.issue_i)  # set on issue
    m.d.sync += opc_l.r.eq(self.alu.n.valid_o & req_done)  # reset on ALU

    # src operand latch (not using go_wr_i)
    m.d.sync += src_l.s.eq(Repl(self.issue_i, self.n_src))
    m.d.sync += src_l.r.eq(reset_r)

    # dest operand latch (not using issue_i)
    m.d.comb += req_l.s.eq(Repl(alu_pulse, self.n_dst))
    m.d.sync += req_l.r.eq(reset_w)

    # create a latch/register for the operand
    oper_r = self.opsubsetkls(name="oper_r")
    latchregister(m, self.oper_i, oper_r, self.issue_i, "oper_l")

    # and for each output from the ALU.  the registers are kept in drl
    # so that they can be driven onto the dest ports on go_write
    drl = []
    for i in range(self.n_dst):
        name = "data_r%d" % i
        data_r = Signal(self.cu._get_dstwid(i), name=name, reset_less=True)
        latchregister(m, self.get_out(i), data_r, req_l.q[i], name + "_l")
        drl.append(data_r)

    # pass the operation to the ALU
    m.d.comb += self.get_op().eq(oper_r)

    # create list of src/alu-src/src-latch.  override 1st and 2nd one below.
    # in the case, for ALU and Logical pipelines, we assume RB is the
    # 2nd operand in the input "regspec".  see for example
    # soc.fu.alu.pipe_data.ALUInputData
    sl = []
    print ("src_i", self.src_i)
    for i in range(self.n_src):
        sl.append([self.src_i[i], self.get_in(i), src_l.q[i], Const(1, 1)])

    # if the operand subset has "zero_a" we implicitly assume that means
    # src_i[0] is an INT reg type where zero can be multiplexed in, instead.
    # see https://bugs.libre-soc.org/show_bug.cgi?id=336
    if hasattr(oper_r, "zero_a"):
        # select zero imm if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        self._mux_op(m, sl, oper_r.zero_a, 0, 0)

    # if the operand subset has "imm_data" we implicitly assume that means
    # "this is an INT ALU/Logical FU jobbie, RB is muxed with the immediate"
    if hasattr(oper_r, "imm_data"):
        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        op_is_imm = oper_r.imm_data.imm_ok
        imm = oper_r.imm_data.imm
        self._mux_op(m, sl, op_is_imm, imm, 1)

    # create a latch/register for src1/src2 (even if it is a copy of imm)
    for i in range(self.n_src):
        src, alusrc, latch, _ = sl[i]
        latchregister(m, src, alusrc, latch, name="src_r%d" % i)

    # -----
    # outputs
    # -----

    slg = Cat(*map(lambda x: x[3], sl))  # get req gate conditions
    # all request signals gated by busy_o.  prevents picker problems
    m.d.comb += self.busy_o.eq(opc_l.q)  # busy out
    bro = Repl(self.busy_o, self.n_src)
    m.d.comb += self.rd.rel.eq(src_l.q & bro & slg)  # src1/src2 req rel

    # on a go_read, tell the ALU we're accepting data.
    # NOTE: this spells TROUBLE if the ALU isn't ready!
    # go_read is only valid for one clock!
    with m.If(all_rd):  # src operands ready, GO!
        with m.If(~self.alu.p.ready_o):  # no ACK yet
            m.d.comb += self.alu.p.valid_i.eq(1)  # so indicate valid

    brd = Repl(self.busy_o & self.shadown_i, self.n_dst)
    # only proceed if ALU says its output is valid
    with m.If(self.alu.n.valid_o):
        # when ALU ready, write req release out.  waits for shadow
        m.d.comb += self.wr.rel.eq(req_l.q & brd)
        # when output latch is ready, and ALU says ready, accept ALU output
        # NOTE(review): "reset" (req_done | go_die_i) is computed above and
        # had no other user, so it is taken to be the gate described by the
        # comment above - confirm against the reference implementation.
        with m.If(reset):
            m.d.comb += self.alu.n.ready_i.eq(1)  # tells ALU "got it"

    # output the data from the latch on go_write
    for i in range(self.n_dst):
        with m.If(self.wr.go[i]):
            m.d.comb += self.dest[i].eq(drl[i])

    return m
303 yield from self
.oper_i
.ports()
315 def op_sim(dut
, a
, b
, op
, inv_a
=0, imm
=0, imm_ok
=0, zero_a
=0):
316 yield dut
.issue_i
.eq(0)
318 yield dut
.src_i
[0].eq(a
)
319 yield dut
.src_i
[1].eq(b
)
320 yield dut
.oper_i
.insn_type
.eq(op
)
321 yield dut
.oper_i
.invert_a
.eq(inv_a
)
322 yield dut
.oper_i
.imm_data
.imm
.eq(imm
)
323 yield dut
.oper_i
.imm_data
.imm_ok
.eq(imm_ok
)
324 yield dut
.oper_i
.zero_a
.eq(zero_a
)
325 yield dut
.issue_i
.eq(1)
327 yield dut
.issue_i
.eq(0)
329 if not imm_ok
or not zero_a
:
330 yield dut
.rd
.go
.eq(0b11)
333 rd_rel_o
= yield dut
.rd
.rel
334 print ("rd_rel", rd_rel_o
)
337 yield dut
.rd
.go
.eq(0)
338 if len(dut
.src_i
) == 3:
339 yield dut
.rd
.go
.eq(0b100)
342 rd_rel_o
= yield dut
.rd
.rel
343 print ("rd_rel", rd_rel_o
)
346 yield dut
.rd
.go
.eq(0)
348 req_rel_o
= yield dut
.wr
.rel
349 result
= yield dut
.data_o
350 print ("req_rel", req_rel_o
, result
)
352 req_rel_o
= yield dut
.wr
.rel
353 result
= yield dut
.data_o
354 print ("req_rel", req_rel_o
, result
)
358 yield dut
.wr
.go
[0].eq(1)
360 result
= yield dut
.data_o
361 print ("result", result
)
362 yield dut
.wr
.go
[0].eq(0)
367 def scoreboard_sim_dummy(dut
):
368 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_NOP
, inv_a
=0,
370 assert result
== 5, result
372 result
= yield from op_sim(dut
, 9, 2, InternalOp
.OP_NOP
, inv_a
=0,
374 assert result
== 9, result
377 def scoreboard_sim(dut
):
378 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_ADD
, inv_a
=0,
382 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_ADD
)
385 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_ADD
, inv_a
=1)
386 assert result
== 65532
388 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_ADD
, zero_a
=1,
392 result
= yield from op_sim(dut
, 5, 2, InternalOp
.OP_ADD
, zero_a
=1)
397 from alu_hier
import ALU
398 from soc
.fu
.alu
.alu_input_record
import CompALUOpSubset
402 dut
= MultiCompUnit(16, alu
, CompALUOpSubset
)
403 m
.submodules
.cu
= dut
405 vl
= rtlil
.convert(dut
, ports
=dut
.ports())
406 with
open("test_compunit1.il", "w") as f
:
409 run_simulation(m
, scoreboard_sim(dut
), vcd_name
='test_compunit1.vcd')
412 class CompUnitParallelTest
:
413 def __init__(self
, dut
):
416 # Operation cycle should not take longer than this:
417 self
.MAX_BUSY_WAIT
= 50
419 # Minimum duration in which issue_i will be kept inactive,
420 # during which busy_o must remain low.
421 self
.MIN_BUSY_LOW
= 5
423 # Number of cycles to stall until the assertion of go.
424 # One value, for each port. Can be zero, for no delay.
425 self
.RD_GO_DELAY
= [0, 3]
427 # store common data for the input operation of the processes
430 self
.inv_a
= self
.zero_a
= 0
431 self
.imm
= self
.imm_ok
= 0
436 print("Begin parallel test.")
437 yield from self
.operation(5, 2, InternalOp
.OP_ADD
, inv_a
=0,
440 def operation(self
, a
, b
, op
, inv_a
=0, imm
=0, imm_ok
=0, zero_a
=0):
441 # store data for the operation
450 # trigger operation cycle
451 yield from self
.issue()
454 # issue_i starts inactive
455 yield self
.dut
.issue_i
.eq(0)
457 for n
in range(self
.MIN_BUSY_LOW
):
459 # busy_o must remain inactive. It cannot rise on its own.
460 busy_o
= yield self
.dut
.busy_o
463 # activate issue_i to begin the operation cycle
464 yield self
.dut
.issue_i
.eq(1)
466 # at the same time, present the operation
467 yield self
.dut
.oper_i
.insn_type
.eq(self
.op
)
468 yield self
.dut
.oper_i
.invert_a
.eq(self
.inv_a
)
469 yield self
.dut
.oper_i
.imm_data
.imm
.eq(self
.imm
)
470 yield self
.dut
.oper_i
.imm_data
.imm_ok
.eq(self
.imm_ok
)
471 yield self
.dut
.oper_i
.zero_a
.eq(self
.zero_a
)
473 # give one cycle for the CompUnit to latch the data
476 # busy_o must keep being low in this cycle, because issue_i was
477 # low on the previous cycle.
478 # It cannot rise on its own.
479 # Also, busy_o and issue_i must never be active at the same time, ever.
480 busy_o
= yield self
.dut
.busy_o
484 yield self
.dut
.issue_i
.eq(0)
486 # deactivate inputs along with issue_i, so we can be sure the data
487 # was latched at the correct cycle
488 yield self
.dut
.oper_i
.insn_type
.eq(0)
489 yield self
.dut
.oper_i
.invert_a
.eq(0)
490 yield self
.dut
.oper_i
.imm_data
.imm
.eq(0)
491 yield self
.dut
.oper_i
.imm_data
.imm_ok
.eq(0)
492 yield self
.dut
.oper_i
.zero_a
.eq(0)
495 # wait for busy_o to lower
496 # timeout after self.MAX_BUSY_WAIT cycles
497 for n
in range(self
.MAX_BUSY_WAIT
):
498 # sample busy_o in the current cycle
499 busy_o
= yield self
.dut
.busy_o
501 # operation cycle ends when busy_o becomes inactive
505 # if busy_o is still active, a timeout has occurred
506 # TODO: Uncomment this, once the test is complete:
510 print("If you are reading this, "
511 "it's because the above test failed, as expected,\n"
512 "with a timeout. It must pass, once the test is complete.")
515 print("If you are reading this, "
516 "it's because the above test unexpectedly passed.")
518 def rd(self
, rd_idx
):
519 # wait for issue_i to rise
521 issue_i
= yield self
.dut
.issue_i
524 # issue_i has not risen yet, so rd must keep low
525 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
529 # we do not want rd to rise on an immediate operand
530 # if it is immediate, exit the process
531 # TODO: don't exit the process, monitor rd instead to ensure it
532 # doesn't rise on its own
533 if (self
.zero_a
and rd_idx
== 0) or (self
.imm_ok
and rd_idx
== 1):
536 # issue_i has risen. rel must rise on the next cycle
537 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
540 # stall for additional cycles. Check that rel doesn't fall on its own
541 for n
in range(self
.RD_GO_DELAY
[rd_idx
]):
543 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
546 # Before asserting "go", make sure "rel" has risen.
547 # The use of Settle allows "go" to be set combinatorially,
548 # rising on the same cycle as "rel".
550 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
553 # assert go for one cycle
554 yield self
.dut
.rd
.go
[rd_idx
].eq(1)
557 # rel must keep high, since go was inactive in the last cycle
558 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
561 # finish the go one-clock pulse
562 yield self
.dut
.rd
.go
[rd_idx
].eq(0)
565 # rel must have gone low in response to go being high
566 # on the previous cycle
567 rel
= yield self
.dut
.rd
.rel
[rd_idx
]
570 # TODO: also when dut.rd.go is set, put the expected value into
571 # the src_i. use dut.get_in[rd_idx] to do so
573 def wr(self
, wr_idx
):
574 # monitor self.dut.wr.req[rd_idx] and sets dut.wr.go[idx] for one cycle
576 # TODO: also when dut.wr.go is set, check the output against the
577 # self.expected_o and assert. use dut.get_out(wr_idx) to do so.
579 def run_simulation(self
, vcd_name
):
580 run_simulation(self
.dut
, [self
.driver(),
581 self
.rd(0), # one read port (a)
582 self
.rd(1), # one read port (b)
583 self
.wr(0), # one write port (o)
588 def test_compunit_regspec3():
589 from alu_hier
import DummyALU
590 from soc
.fu
.alu
.alu_input_record
import CompALUOpSubset
592 inspec
= [('INT', 'a', '0:15'),
593 ('INT', 'b', '0:15'),
594 ('INT', 'c', '0:15')]
595 outspec
= [('INT', 'o', '0:15'),
598 regspec
= (inspec
, outspec
)
602 dut
= MultiCompUnit(regspec
, alu
, CompALUOpSubset
)
603 m
.submodules
.cu
= dut
605 run_simulation(m
, scoreboard_sim_dummy(dut
),
606 vcd_name
='test_compunit_regspec3.vcd')
609 def test_compunit_regspec1():
610 from alu_hier
import ALU
611 from soc
.fu
.alu
.alu_input_record
import CompALUOpSubset
613 inspec
= [('INT', 'a', '0:15'),
614 ('INT', 'b', '0:15')]
615 outspec
= [('INT', 'o', '0:15'),
618 regspec
= (inspec
, outspec
)
622 dut
= MultiCompUnit(regspec
, alu
, CompALUOpSubset
)
623 m
.submodules
.cu
= dut
625 vl
= rtlil
.convert(dut
, ports
=dut
.ports())
626 with
open("test_compunit_regspec1.il", "w") as f
:
629 run_simulation(m
, scoreboard_sim(dut
),
630 vcd_name
='test_compunit_regspec1.vcd')
632 test
= CompUnitParallelTest(dut
)
633 test
.run_simulation("test_compunit_parallel.vcd")
636 if __name__
== '__main__':
638 test_compunit_regspec1()
639 test_compunit_regspec3()