1 """ LOAD / STORE Computation Unit.
3 This module covers POWER9-compliant Load and Store operations,
4 with selection on each between immediate and indexed mode as
5 options for the calculation of the Effective Address (EA),
6 and also "update" mode which optionally stores that EA into
7 an additional register.
9 Stores are activated when Go_Store is enabled. They use the ALU to
10 compute the "Effective Address" and, when ready (go_st_i and the
11 ALU ready), the operand (src3_i) is stored at the computed address.
13 Loads are activated when Go_Write[0] is enabled. They also use the ALU
14 to compute the EA; the data comes out of the PortInterface (at any
15 time) and is captured by the LDSTCompUnit.
17 Both LD and ST may request that the address be computed from summing
18 operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
19 the immediate (from the opcode).
21 Both LD and ST may also request "update" mode (op_is_update) which
22 activates the use of Go_Write[1] to control storage of the EA into
23 a *second* operand in the register file.
25 Thus this module has *TWO* write-requests to the register file and
26 *THREE* read-requests to the register file.
28 It's a multi-level Finite State Machine that (unfortunately) nmigen.FSM
29 is not suited to (nmigen.FSM is clock-driven, and some aspects of
30 the FSM below are *combinatorial*).
32 * One FSM covers Operand collection and address-side communication
33 with the LD/ST PortInterface. Its role ends when "RD_DONE" is asserted.
35 * A second FSM activates to cover LD. It activates if op_is_ld is true.
37 * A third FSM activates to cover ST. It activates if op_is_st is true.
39 * The "overall" (fourth) FSM coordinates the progression and completion
40 of the three other FSMs, firing "WR_RESET" which switches off "busy"
43 https://libre-soc.org/3d_gpu/ld_st_comp_unit.jpg
45 Links including to walk-through videos:
46 * https://libre-soc.org/3d_gpu/architecture/6600scoreboard/
49 from nmigen
.compat
.sim
import run_simulation
50 from nmigen
.cli
import verilog
, rtlil
51 from nmigen
import Module
, Signal
, Mux
, Cat
, Elaboratable
, Array
52 from nmigen
.hdl
.rec
import Record
, Layout
54 from nmutil
.latch
import SRLatch
, latchregister
56 from soc
.experiment
.compalu_multi
import go_record
57 from soc
.experiment
.l0_cache
import PortInterface
58 from soc
.experiment
.testmem
import TestMemory
59 from soc
.decoder
.power_enums
import InternalOp
61 from soc
.experiment
.alu_hier
import CompALUOpSubset
63 from soc
.decoder
.power_enums
import InternalOp
, Function
66 class CompLDSTOpSubset(Record
):
69 a copy of the relevant subset information from Decode2Execute1Type
70 needed for LD/ST operations. use with eq_from_execute1 (below) to
73 def __init__(self
, name
=None):
74 layout
= (('insn_type', InternalOp
),
75 ('imm_data', Layout((("imm", 64), ("imm_ok", 1)))),
78 ('data_len', 4), # TODO: should be in separate CompLDSTSubset
83 Record
.__init
__(self
, Layout(layout
), name
=name
)
85 # grrr. Record does not have kwargs
86 self
.insn_type
.reset_less
= True
87 self
.is_32bit
.reset_less
= True
88 self
.is_signed
.reset_less
= True
89 self
.data_len
.reset_less
= True
90 self
.byte_reverse
.reset_less
= True
91 self
.sign_extend
.reset_less
= True
92 self
.update
.reset_less
= True
94 def eq_from_execute1(self
, other
):
95 """ use this to copy in from Decode2Execute1Type
98 for fname
, sig
in self
.fields
.items():
99 eqfrom
= other
.fields
[fname
]
100 res
.append(sig
.eq(eqfrom
))
104 return [self
.insn_type
,
114 class LDSTCompUnit(Elaboratable
):
115 """ LOAD / STORE Computation Unit
120 * :rwid: register width
121 * :alu: an ALU module
122 * :mem: a Memory Module (read-write capable)
123 * :src_i: Source Operands (RA/RB/RC) - managed by rd[0-2] go/req
128 * :oper_i: operation being carried out (POWER9 decode LD/ST subset)
129 * :issue_i: LD/ST is being "issued".
130 * :shadown_i: Inverted-shadow is being held (stops STORE *and* WRITE)
131 * :go_rd_i: read is being actioned (latches in src regs)
132 * :go_wr_i: write mode (exactly like ALU CompUnit)
133 * :go_ad_i: address is being actioned (triggers actual mem LD)
134 * :go_st_i: store is being actioned (triggers actual mem STORE)
135 * :go_die_i: resets the unit back to "wait for issue"
137 Control Signals (Out)
138 ---------------------
140 * :busy_o: function unit is busy
141 * :rd_rel_o: request src1/src2
142 * :adr_rel_o: request address (from mem)
143 * :sto_rel_o: request store (to mem)
144 * :req_rel_o: request write (result)
145 * :load_mem_o: activate memory LOAD
146 * :stwd_mem_o: activate memory STORE
148 Note: load_mem_o, stwd_mem_o and req_rel_o MUST all be acknowledged
149 in a single cycle and the CompUnit set back to doing another op.
150 This means deasserting go_st_i, go_ad_i or go_wr_i as appropriate
151 depending on whether the operation is a STORE, LD, or a straight
152 ALU operation respectively.
156 * :data_o: Dest out (LD) - managed by wr[0] go/req
157 * :addr_o: Address out (LD or ST) - managed by wr[1] go/req
160 def __init__(self
, rwid
, alu
, mem
, debugtest
=False):
164 self
.debugtest
= debugtest
166 # POWER-compliant LD/ST has index and update: *fixed* number of ports
167 self
.n_src
= n_src
= 3 # RA, RB, RT/RS
168 self
.n_dst
= n_dest
= 2 # RA, RT/RS
170 self
.counter
= Signal(4)
172 for i
in range(n_src
):
173 j
= i
+ 1 # name numbering to match src1/src2
174 src
.append(Signal(rwid
, name
="src%d_i" % j
, reset_less
=True))
177 for i
in range(n_dst
):
178 j
= i
+ 1 # name numbering to match dest1/2...
179 dst
.append(Signal(rwid
, name
="dest%d_i" % j
, reset_less
=True))
181 self
.rd
= go_record(n_src
, name
="rd") # read in, req out
182 self
.wr
= go_record(n_dst
, name
="wr") # write in, req out
183 self
.go_rd_i
= self
.rd
.go
# temporary naming
184 self
.go_wr_i
= self
.wr
.go
# temporary naming
185 self
.rd_rel_o
= self
.rd
.rel
# temporary naming
186 self
.req_rel_o
= self
.wr
.rel
# temporary naming
188 self
.ad
= go_record(1, name
="ad") # address go in, req out
189 self
.st
= go_record(1, name
="st") # store go in, req out
190 self
.go_ad_i
= self
.ad
.go
# temp naming: go address in
191 self
.go_st_i
= self
.st
.go
# temp naming: go store in
192 self
.issue_i
= Signal(reset_less
=True) # fn issue in
193 self
.isalu_i
= Signal(reset_less
=True) # fn issue as ALU in
194 self
.shadown_i
= Signal(reset
=1) # shadow function, defaults to ON
195 self
.go_die_i
= Signal() # go die (reset)
197 # operation / data input
198 self
.oper_i
= CompALUOpSubset() # operand
199 self
.src_i
= Array(src
)
200 self
.src1_i
= src
[0] # oper1 in: RA
201 self
.src2_i
= src
[1] # oper2 in: RB
202 self
.src3_i
= src
[3] # oper2 in: RC (RS)
205 self
.busy_o
= Signal(reset_less
=True) # fn busy out
206 self
.dest
= Array(dst
)
207 self
.data_o
= dst
[0] # Dest1 out: RT
209 self
.adr_rel_o
= self
.ad
.rel
# request address (from mem)
210 self
.sto_rel_o
= self
.st
.rel
# request store (to mem)
211 self
.done_o
= Signal(reset_less
=True) # final release signal
212 self
.addr_o
= dst
[1] # Address out (LD or ST) - Update => RA
214 # hmm... TODO... move these to outside of LDSTCompUnit?
215 self
.load_mem_o
= Signal(reset_less
=True) # activate memory LOAD
216 self
.stwd_mem_o
= Signal(reset_less
=True) # activate memory STORE
217 self
.ld_o
= Signal(reset_less
=True) # operation is a LD
218 self
.st_o
= Signal(reset_less
=True) # operation is a ST
220 def elaborate(self
, platform
):
225 m
.submodules
.alu
= self
.alu
226 #m.submodules.mem = self.mem
227 m
.submodules
.opc_l
= opc_l
= SRLatch(sync
=False, name
="opc")
228 m
.submodules
.src_l
= src_l
= SRLatch(sync
=False, self
.n_src
, name
="src")
229 m
.submodules
.alu_l
= alu_l
= SRLatch(sync
=False, name
="alu")
230 m
.submodules
.adr_l
= adr_l
= SRLatch(sync
=False, name
="adr")
231 m
.submodules
.lod_l
= lod_l
= SRLatch(sync
=False, name
="lod")
232 m
.submodules
.sto_l
= sto_l
= SRLatch(sync
=False, name
="sto")
233 m
.submodules
.wri_l
= wri_l
= SRLatch(sync
=False, self
.n_dst
, name
="req")
234 m
.submodules
.rst_l
= sto_l
= SRLatch(sync
=False, name
="rst")
237 reset_b
= Signal(reset_less
=True) # reset opcode
238 reset_w
= Signal(self
.n_dst
, reset_less
=True) # reset write
239 reset_a
= Signal(reset_less
=True) # reset adr latch
240 reset_r
= Signal(self
.n_src
, reset_less
=True) # reset src
241 reset_s
= Signal(reset_less
=True) # reset store
242 wr_reset
= Signal(reset_less
=True) # final reset condition
243 comb
+= reset_b
.eq(wr_reset | self
.go_die_i
)
244 comb
+= reset_w
.eq(self
.wr
.go | self
.go_die_i
)
245 comb
+= reset_s
.eq(self
.go_st_i | self
.go_die_i
)
246 comb
+= reset_r
.eq(self
.rd
.go |
Repl(self
.go_die_i
, self
.n_src
))
247 comb
+= reset_a
.eq(self
.go_ad_i | self
.go_die_i
)
250 op_alu
= Signal(reset_less
=True)
251 op_is_ld
= Signal(reset_less
=True)
252 op_is_st
= Signal(reset_less
=True)
254 # ALU/LD data output control
255 alu_valid
= Signal(reset_less
=True) # ALU operands are valid
256 alu_ok
= Signal(reset_less
=True) # ALU out ok (1 clock delay valid)
257 alulatch
= Signal(reset_less
=True)
258 ldlatch
= Signal(reset_less
=True)
259 wr_any
= Signal(reset_less
=True) # any write (incl. store)
260 rd_done
= Signal(reset_less
=True) # all *necessary* operands read
261 wr_reset
= Signal(reset_less
=True) # final reset condition
264 src2_r
= Signal(self
.rwid
, reset_less
=True)
266 # select immediate or src2 reg to add
267 src2_or_imm
= Signal(self
.rwid
, reset_less
=True)
268 src_sel
= Signal(reset_less
=True)
270 # issue can be either issue_i or issue_alu_i (isalu_i)
271 issue_i
= Signal(reset_less
=True)
272 comb
+= issue_i
.eq(self
.issue_i | self
.isalu_i
)
274 # Ripple-down the latches, each one set cancels the previous.
275 # NOTE: use sync to stop combinatorial loops.
277 # opcode latch - inverted so that busy resets to 0
278 sync
+= opc_l
.s
.eq(issue_i
) # XXX NOTE: INVERTED FROM book!
279 sync
+= opc_l
.r
.eq(reset_b
) # XXX NOTE: INVERTED FROM book!
282 sync
+= src_l
.s
.eq(Repl(issue_i
, self
.n_src
))
283 sync
+= src_l
.r
.eq(reset_r
)
286 sync
+= adr_l
.s
.eq(self
.rd
.go
)
287 sync
+= adr_l
.r
.eq(reset_a
)
290 sync
+= wri_l
.s
.eq(self
.go_ad_i | self
.go_st_i | self
.wr
.go
)
291 sync
+= wri_l
.r
.eq(reset_w
)
294 sync
+= sto_l
.s
.eq(self
.rd
.go
) # XXX not sure which
295 sync
+= sto_l
.r
.eq(reset_s
)
297 # create a latch/register for the operand
298 oper_r
= CompALUOpSubset() # Dest register
299 latchregister(m
, self
.oper_i
, oper_r
, self
.issue_i
, name
="oper_r")
301 # and for each input from the incoming src operands
303 for i
in range(self
.n_src
):
305 src_r
= Signal(self
.rwid
, name
=name
, reset_less
=True)
306 latchregister(m
, self
.src_i
[i
], data_r
, src_l
.q
[i
], name
)
309 # and for each output from the ALU
311 for i
in range(self
.n_dst
):
312 name
= "data_r%d" % i
313 data_r
= Signal(self
.rwid
, name
=name
, reset_less
=True)
314 latchregister(m
, self
.alu
.out
[i
], data_r
, req_l
.q
[i
], name
)
317 # and one for the output from the ALU (for the EA)
318 addr_r
= Signal(self
.rwid
, reset_less
=True) # Effective Address Latch
319 latchregister(m
, self
.alu
.o
, addr_r
, alulatch
, "ea_r")
321 # and pass the operation to the ALU
322 comb
+= self
.alu
.op
.eq(oper_r
)
323 comb
+= self
.alu
.op
.insn_type
.eq(InternalOp
.OP_ADD
) # override insn_type
325 # ok let's connect (and name) the 3 src latched regs created above
326 comb
+= self
.alu
.i
[0].eq(srl
[0]) # Op1 goes straight to ALU input 1
327 op2
= srl
[0] # op2 needs to be muxed (imm select)
328 st_data
= srl
[2] # op3 is for STORE operations
330 # select immediate if opcode says so (and put that into ALU input 2)
331 op_is_imm
= oper_r
.imm_data
.imm_ok
332 src2_or_imm
= Signal(self
.rwid
, reset_less
=True)
333 m
.d
.comb
+= src2_or_imm
.eq(Mux(op_is_imm
, oper_r
.imm_data
.imm
, op2
))
334 comb
+= self
.alu
.i
[1].eq(src2_or_imm
) # src2_or_imm into ALU input 2
336 # outputs: busy and release signals
338 comb
+= self
.busy_o
.eq(opc_l
.q
) # busy out
339 comb
+= self
.rd
.rel
.eq(src_l
.q
& busy_o
) # src1/src2 req rel
340 comb
+= self
.sto_rel_o
.eq(sto_l
.q
& busy_o
& self
.shadown_i
& op_is_st
)
342 # request release enabled based on if op is a LD/ST or a plain ALU
343 # if op is an ADD/SUB or a LD, req_rel activates.
344 wr_q
= Signal(reset_less
=True)
345 comb
+= wr_q
.eq(wri_l
.q
& (~op_ldst | op_is_ld
))
347 comb
+= alulatch
.eq((op_ldst
& self
.adr_rel_o
) |
348 (~op_ldst
& self
.wr
.rel
))
350 # select immediate if opcode says so. however also change the latch
351 # to trigger *from* the opcode latch instead.
352 comb
+= src_sel
.eq(Mux(op_is_imm
, opc_l
.qn
, src_l
.q
))
353 comb
+= src2_or_imm
.eq(Mux(op_is_imm
, oper_r
.imm_data
.imm
,
356 # create a latch/register for src1/src2 (include immediate select)
357 latchregister(m
, self
.src1_i
, self
.alu
.a
, src_l
.q
, name
="src1_r")
358 latchregister(m
, self
.src2_i
, src2_r
, src_l
.q
, name
="src2_r")
359 latchregister(m
, src2_or_imm
, self
.alu
.b
, src_sel
, name
="imm_r")
361 # decode bits of operand (latched)
362 comb
+= op_is_st
.eq(oper_r
.insn_type
== InternalOp
.OP_STORE
) # ST
363 comb
+= op_is_ld
.eq(oper_r
.insn_type
== InternalOp
.OP_LOAD
) # LD
364 op_is_update
= oper_r
.update
# UPDATE
365 comb
+= op_ldst
.eq(op_is_ld | op_is_st
)
366 comb
+= self
.load_mem_o
.eq(op_is_ld
& self
.go_ad_i
)
367 comb
+= self
.stwd_mem_o
.eq(op_is_st
& self
.go_st_i
)
368 comb
+= self
.ld_o
.eq(op_is_ld
)
369 comb
+= self
.st_o
.eq(op_is_st
)
371 # on a go_read, tell the ALU we're accepting data.
372 # NOTE: this spells TROUBLE if the ALU isn't ready!
373 # go_read is only valid for one clock!
374 with m
.If(self
.rd
.go
): # src operands ready, GO!
375 with m
.If(~self
.alu
.p_ready_o
): # no ACK yet
376 m
.d
.comb
+= self
.alu
.p_valid_i
.eq(1) # so indicate valid
378 # only proceed if ALU says its output is valid
379 with m
.If(self
.alu
.n_valid_o
):
380 # write req release out. waits until shadow is dropped.
381 comb
+= self
.wr
.rel
.eq(wr_q
& busy_o
& self
.shadown_i
)
382 # address release only happens on LD/ST, and is shadowed.
383 comb
+= self
.adr_rel_o
.eq(adr_l
.q
& busy_o
&
385 # when output latch is ready, and ALU says ready, accept ALU output
386 with m
.If(self
.wr
.rel
):
387 # tells ALU "thanks got it"
388 m
.d
.comb
+= self
.alu
.n_ready_i
.eq(1)
390 # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
391 comb
+= self
.done_o
.eq((self
.wr
.rel
& ~op_ldst
) |
392 (self
.adr_rel_o
& op_ldst
))
394 # put the register directly onto the output bus on a go_write
395 # this is "ALU mode". go_wr_i *must* be deasserted on next clock
396 with m
.If(self
.wr
.go
):
397 comb
+= self
.data_o
.eq(data_r
)
399 # "LD/ST" mode: put the register directly onto the *address* bus
400 with m
.If(self
.go_ad_i | self
.go_st_i
):
401 comb
+= self
.addr_o
.eq(data_r
)
403 # TODO: think about moving these to another module
408 # connect ST to memory. NOTE: unit *must* be set back
409 # to start again by dropping go_st_i on next clock
410 with m
.If(self
.stwd_mem_o
):
411 wrport
= self
.mem
.wrport
412 comb
+= wrport
.addr
.eq(self
.addr_o
)
413 comb
+= wrport
.data
.eq(src2_r
)
414 comb
+= wrport
.en
.eq(1)
416 # connect LD to memory. NOTE: unit *must* be set back
417 # to start again by dropping go_ad_i on next clock
418 rdport
= self
.mem
.rdport
419 ldd_r
= Signal(self
.rwid
, reset_less
=True) # Dest register
421 latchregister(m
, rdport
.data
, ldd_r
, ldlatch
, "ldo_r")
422 sync
+= ldlatch
.eq(self
.load_mem_o
)
423 with m
.If(self
.load_mem_o
):
424 comb
+= rdport
.addr
.eq(self
.addr_o
)
425 # comb += rdport.en.eq(1) # only when transparent=False
427 # if LD-latch, put ld-reg out onto output
428 with m
.If(ldlatch | self
.load_mem_o
):
429 comb
+= self
.data_o
.eq(ldd_r
)
442 yield from self
.oper_i
.ports()
443 yield from self
.src_i
450 yield self
.load_mem_o
451 yield self
.stwd_mem_o
459 print("wait for", sig
, v
)
468 def store(dut
, src1
, src2
, imm
, imm_ok
=True):
469 yield dut
.oper_i
.insn_type
.eq(InternalOp
.OP_STORE
)
470 yield dut
.src1_i
.eq(src1
)
471 yield dut
.src2_i
.eq(src2
)
472 yield dut
.oper_i
.imm_data
.imm
.eq(imm
)
473 yield dut
.oper_i
.imm_data
.imm_ok
.eq(imm_ok
)
474 yield dut
.issue_i
.eq(1)
476 yield dut
.issue_i
.eq(0)
478 yield dut
.rd
.go
.eq(0b11)
479 yield from wait_for(dut
.rd
.rel
)
480 yield dut
.rd
.go
.eq(0)
481 yield from wait_for(dut
.adr_rel_o
)
482 yield dut
.go_st_i
.eq(1)
483 yield from wait_for(dut
.sto_rel_o
)
484 wait_for(dut
.stwd_mem_o
)
485 yield dut
.go_st_i
.eq(0)
489 def load(dut
, src1
, src2
, imm
, imm_ok
=True):
490 yield dut
.oper_i
.insn_type
.eq(InternalOp
.OP_LOAD
)
491 yield dut
.src1_i
.eq(src1
)
492 yield dut
.src2_i
.eq(src2
)
493 yield dut
.oper_i
.imm_data
.imm
.eq(imm
)
494 yield dut
.oper_i
.imm_data
.imm_ok
.eq(imm_ok
)
495 yield dut
.issue_i
.eq(1)
497 yield dut
.issue_i
.eq(0)
499 yield dut
.rd
.go
.eq(0b11)
500 yield from wait_for(dut
.rd
.rel
)
501 yield dut
.rd
.go
.eq(0)
502 yield from wait_for(dut
.adr_rel_o
)
503 yield dut
.go_ad_i
.eq(1)
504 yield from wait_for(dut
.busy_o
)
506 data
= (yield dut
.data_o
)
507 yield dut
.go_ad_i
.eq(0)
508 # wait_for(dut.stwd_mem_o)
512 def add(dut
, src1
, src2
, imm
, imm_ok
=False):
513 yield dut
.oper_i
.insn_type
.eq(InternalOp
.OP_ADD
)
514 yield dut
.src1_i
.eq(src1
)
515 yield dut
.src2_i
.eq(src2
)
516 yield dut
.oper_i
.imm_data
.imm
.eq(imm
)
517 yield dut
.oper_i
.imm_data
.imm_ok
.eq(imm_ok
)
518 yield dut
.issue_i
.eq(1)
520 yield dut
.issue_i
.eq(0)
522 yield dut
.rd
.go
.eq(1)
523 yield from wait_for(dut
.rd
.rel
)
524 yield dut
.rd
.go
.eq(0)
525 yield from wait_for(dut
.wr
.rel
)
526 yield dut
.wr
.go
.eq(1)
527 yield from wait_for(dut
.busy_o
)
529 data
= (yield dut
.data_o
)
530 yield dut
.wr
.go
.eq(0)
532 # wait_for(dut.stwd_mem_o)
536 def scoreboard_sim(dut
):
537 # two STs (different addresses)
538 yield from store(dut
, 4, 3, 2)
539 yield from store(dut
, 2, 9, 2)
541 # two LDs (deliberately LD from the 1st address then 2nd)
542 data
= yield from load(dut
, 4, 0, 2)
543 assert data
== 0x0003
544 data
= yield from load(dut
, 2, 0, 2)
545 assert data
== 0x0009
549 data
= yield from add(dut
, 4, 3, 0xfeed)
552 # and an add-immediate
553 data
= yield from add(dut
, 4, 0xdeef, 2, imm_ok
=True)
class TestLDSTCompUnit(LDSTCompUnit):
    """LDSTCompUnit bundled with its own ALU and TestMemory, for simulation.

    The base unit takes an ALU and a memory as constructor arguments; this
    subclass creates both from the register width so that a test only has
    to supply ``rwid``.
    """

    def __init__(self, rwid):
        # Import through the same package path as the module-level
        # "from soc.experiment.alu_hier import ..." line.  A bare
        # "alu_hier" only resolves when the script is run from inside its
        # own directory, and would load the same file a second time under
        # a different module name.
        from soc.experiment.alu_hier import ALU

        alu = ALU(rwid)
        mem = TestMemory(rwid, 8)
        self.alu = alu
        self.mem = mem
        LDSTCompUnit.__init__(self, rwid, alu, mem)

    def elaborate(self, platform):
        """Elaborate the base unit, then attach the memory as a submodule.

        Returns the module produced by ``LDSTCompUnit.elaborate`` with
        ``self.mem`` registered as ``m.submodules.mem``.
        """
        m = LDSTCompUnit.elaborate(self, platform)
        m.submodules.mem = self.mem
        # Hand the elaborated module back to the caller; without this the
        # method would return None.
        return m
571 def test_scoreboard():
573 dut
= TestLDSTCompUnit(16)
574 vl
= rtlil
.convert(dut
, ports
=dut
.ports())
575 with
open("test_ldst_comp.il", "w") as f
:
578 run_simulation(dut
, scoreboard_sim(dut
), vcd_name
='test_ldst_comp.vcd')
581 if __name__
== '__main__':