1 """ LOAD / STORE Computation Unit.

This module covers POWER9-compliant Load and Store operations,
with selection on each between immediate and indexed mode as
options for the calculation of the Effective Address (EA),
and also "update" mode which optionally stores that EA into
an additional register.

Stores are activated when Go_Store is enabled, and use the ALU to
compute the "Effective Address".  When ready (go_st_i and the
ALU ready) the operand (src3_i) is stored in the computed address.

Loads are activated when Go_Write[0] is enabled.  They also use the ALU
to compute the EA, and the data comes out (at any time from the
PortInterface), and is captured by the LDCompSTUnit.

Both LD and ST may request that the address be computed from summing
operand1 (src[0]) with operand2 (src[1]) *or* by summing operand1 with
the immediate (from the opcode).

Both LD and ST may also request "update" mode (op_is_update) which
activates the use of Go_Write[1] to control storage of the EA into
a *second* operand in the register file.

Thus this module has *TWO* write-requests to the register file and
*THREE* read-requests to the register file.

It's a multi-level Finite State Machine that (unfortunately) nmigen.FSM
is not suited to (nmigen.FSM is clock-driven, and some aspects of
the FSM below are *combinatorial*).

* One FSM covers Operand collection and communication address-side
  with the LD/ST PortInterface.  Its role ends when "RD_DONE" is asserted.

* A second FSM activates to cover LD.  It activates if op_is_ld is true.

* A third FSM activates to cover ST.  It activates if op_is_st is true.

* The "overall" (fourth) FSM coordinates the progression and completion
  of the three other FSMs, firing "WR_RESET" which switches off "busy".

https://libre-soc.org/3d_gpu/ld_st_comp_unit.jpg

Links including to walk-through videos:
* https://libre-soc.org/3d_gpu/architecture/6600scoreboard/
"""
49 from nmigen
.compat
.sim
import run_simulation
50 from nmigen
.cli
import verilog
, rtlil
51 from nmigen
import Module
, Signal
, Mux
, Cat
, Elaboratable
, Array
52 from nmigen
.hdl
.rec
import Record
, Layout
54 from nmutil
.latch
import SRLatch
, latchregister
56 from soc
.experiment
.compalu_multi
import go_record
57 from soc
.experiment
.l0_cache
import PortInterface
58 from soc
.experiment
.testmem
import TestMemory
59 from soc
.decoder
.power_enums
import InternalOp
61 from soc
.experiment
.alu_hier
import CompALUOpSubset
63 from soc
.decoder
.power_enums
import InternalOp
, Function
class CompLDSTOpSubset(Record):
    """Decoded-operation record for LD/ST operations.

    A copy of the relevant subset of information from Decode2Execute1Type
    needed for LD/ST operations.  Use eq_from_execute1 (below) to copy the
    matching fields in from a decode record.
    """

    def __init__(self, name=None):
        # Every field that is marked reset_less further down has to be
        # declared here, otherwise the attribute lookup fails.
        # NOTE(review): the flags other than data_len are assumed to be
        # one bit wide - confirm against Decode2Execute1Type.
        layout = (('insn_type', InternalOp),
                  ('imm_data', Layout((("imm", 64), ("imm_ok", 1)))),
                  ('is_32bit', 1),
                  ('is_signed', 1),
                  ('data_len', 4),  # TODO: should be in separate CompLDSTSubset
                  ('byte_reverse', 1),
                  ('sign_extend', 1),
                  ('update', 1))

        Record.__init__(self, Layout(layout), name=name)

        # grrr.  Record does not have kwargs, so reset_less is applied
        # to the individual fields after construction.
        for fname in ('insn_type', 'is_32bit', 'is_signed', 'data_len',
                      'byte_reverse', 'sign_extend', 'update'):
            self.fields[fname].reset_less = True

    def eq_from_execute1(self, other):
        """ use this to copy in from Decode2Execute1Type

        Returns a list of assignments, one per field of this record, each
        taking its value from the same-named field of *other*.
        """
        return [sig.eq(other.fields[fname])
                for fname, sig in self.fields.items()]

    def ports(self):
        """Return the signals making up this record as a flat list.

        NOTE(review): the list members after insn_type are assumed to be
        every remaining field of the layout, in order - confirm.
        """
        return [self.insn_type,
                self.imm_data.imm,
                self.imm_data.imm_ok,
                self.is_32bit,
                self.is_signed,
                self.data_len,
                self.byte_reverse,
                self.sign_extend,
                self.update]
class LDSTCompUnit(Elaboratable):
    """ LOAD / STORE Computation Unit

    Inputs
    ------

    * :rwid:   register width
    * :alu:    an ALU module
    * :mem:    a Memory Module (read-write capable)
    * :src_i:  Source Operands (RA/RB/RC) - managed by rd[0-2] go/req

    Control Signals (In)
    --------------------

    * :oper_i:     operation being carried out (POWER9 decode LD/ST subset)
    * :issue_i:    LD/ST is being "issued".
    * :shadown_i:  Inverted-shadow is being held (stops STORE *and* WRITE)
    * :go_rd_i:    read is being actioned (latches in src regs)
    * :go_wr_i:    write mode (exactly like ALU CompUnit)
    * :go_ad_i:    address is being actioned (triggers actual mem LD)
    * :go_st_i:    store is being actioned (triggers actual mem STORE)
    * :go_die_i:   resets the unit back to "wait for issue"

    Control Signals (Out)
    ---------------------

    * :busy_o:      function unit is busy
    * :rd_rel_o:    request src1/src2
    * :adr_rel_o:   request address (from mem)
    * :sto_rel_o:   request store (to mem)
    * :req_rel_o:   request write (result)
    * :load_mem_o:  activate memory LOAD
    * :stwd_mem_o:  activate memory STORE

    Note: load_mem_o, stwd_mem_o and req_rel_o MUST all be acknowledged
    in a single cycle and the CompUnit set back to doing another op.
    This means deasserting go_st_i, go_ad_i or go_wr_i as appropriate
    depending on whether the operation is a STORE, LD, or a straight
    ALU operation respectively.

    Control Data (Out)
    ------------------

    * :data_o:     Dest out (LD)          - managed by wr[0] go/req
    * :addr_o:     Address out (LD or ST) - managed by wr[1] go/req
    """

    def __init__(self, rwid, alu, mem, debugtest=False):
        # elaborate() reads all of these back off the instance.
        self.rwid = rwid
        self.alu = alu
        self.mem = mem
        self.debugtest = debugtest

        # POWER-compliant LD/ST has index and update: *fixed* number of ports
        self.n_src = n_src = 3  # RA, RB, RT/RS
        self.n_dst = n_dst = 2  # RA, RT/RS

        self.counter = Signal(4)

        # name numbering starts at 1 to match src1/src2...
        src = [Signal(rwid, name="src%d_i" % (i + 1), reset_less=True)
               for i in range(n_src)]
        # ...and dest1/dest2
        dst = [Signal(rwid, name="dest%d_i" % (i + 1), reset_less=True)
               for i in range(n_dst)]

        # control (go in, release out)
        self.rd = go_record(n_src, name="rd")  # read in, req out
        self.wr = go_record(n_dst, name="wr")  # write in, req out
        self.go_rd_i = self.rd.go       # temporary naming
        self.go_wr_i = self.wr.go       # temporary naming
        self.rd_rel_o = self.rd.rel     # temporary naming
        self.req_rel_o = self.wr.rel    # temporary naming

        self.ad = go_record(1, name="ad")  # address go in, req out
        self.st = go_record(1, name="st")  # store go in, req out
        self.go_ad_i = self.ad.go       # temp naming: go address in
        self.go_st_i = self.st.go       # temp naming: go store in
        self.issue_i = Signal(reset_less=True)  # fn issue in
        self.isalu_i = Signal(reset_less=True)  # fn issue as ALU in
        self.shadown_i = Signal(reset=1)  # shadow function, defaults to ON
        self.go_die_i = Signal()          # go die (reset)

        # operation / data input
        self.oper_i = CompALUOpSubset()  # operand
        self.src_i = Array(src)
        self.src1_i = src[0]  # oper1 in: RA
        self.src2_i = src[1]  # oper2 in: RB
        self.src3_i = src[2]  # oper3 in: RC (RS) - third and last source

        # outputs
        self.busy_o = Signal(reset_less=True)  # fn busy out
        self.dest = Array(dst)
        self.data_o = dst[0]  # Dest1 out: RT

        self.adr_rel_o = self.ad.rel  # request address (from mem)
        self.sto_rel_o = self.st.rel  # request store (to mem)
        self.done_o = Signal(reset_less=True)  # final release signal
        self.addr_o = dst[1]  # Address out (LD or ST) - Update => RA

        # hmm... TODO... move these to outside of LDSTCompUnit?
        self.load_mem_o = Signal(reset_less=True)  # activate memory LOAD
        self.stwd_mem_o = Signal(reset_less=True)  # activate memory STORE
        self.ld_o = Signal(reset_less=True)  # operation is a LD
        self.st_o = Signal(reset_less=True)  # operation is a ST

    def elaborate(self, platform):
        """Build the latch chain, ALU hookup and memory port wiring.

        Returns the nmigen Module.  The memory itself is *not* added as a
        submodule here; only its rdport/wrport are driven.
        """
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        m.submodules.alu = self.alu
        # width-carrying latches pass (sync, width) positionally
        m.submodules.opc_l = opc_l = SRLatch(sync=False, name="opc")
        m.submodules.src_l = src_l = SRLatch(False, self.n_src, name="src")
        m.submodules.alu_l = alu_l = SRLatch(sync=False, name="alu")
        m.submodules.adr_l = adr_l = SRLatch(sync=False, name="adr")
        m.submodules.lod_l = lod_l = SRLatch(sync=False, name="lod")
        m.submodules.sto_l = sto_l = SRLatch(sync=False, name="sto")
        m.submodules.wri_l = wri_l = SRLatch(False, self.n_dst, name="req")
        # own local name, so that sto_l keeps referring to the "sto" latch
        m.submodules.rst_l = rst_l = SRLatch(sync=False, name="rst")

        # reset conditions: go_die_i resets every latch
        reset_b = Signal(reset_less=True)
        reset_w = Signal(self.n_dst, reset_less=True)  # reset write
        reset_a = Signal(reset_less=True)              # reset adr latch
        reset_s = Signal(reset_less=True)
        reset_r = Signal(reset_less=True)
        comb += reset_b.eq(self.go_st_i | self.wr.go |
                           self.go_ad_i | self.go_die_i)
        comb += reset_w.eq(self.wr.go | self.go_die_i)
        comb += reset_s.eq(self.go_st_i | self.go_die_i)
        comb += reset_r.eq(self.rd.go | self.go_die_i)
        comb += reset_a.eq(self.go_ad_i | self.go_die_i)

        # opcode decode
        op_alu = Signal(reset_less=True)
        op_is_ld = Signal(reset_less=True)
        op_is_st = Signal(reset_less=True)
        op_ldst = Signal(reset_less=True)  # LD *or* ST; driven further down
        op_is_imm = Signal(reset_less=True)

        # ALU/LD data output control
        alulatch = Signal(reset_less=True)
        ldlatch = Signal(reset_less=True)

        # latched copy of src2
        src2_r = Signal(self.rwid, reset_less=True)

        # select immediate or src2 reg to add
        src2_or_imm = Signal(self.rwid, reset_less=True)
        src_sel = Signal(reset_less=True)

        # issue can be either issue_i or issue_alu_i (isalu_i)
        issue_i = Signal(reset_less=True)
        comb += issue_i.eq(self.issue_i | self.isalu_i)

        # Ripple-down the latches, each one set cancels the previous.
        # NOTE: use sync to stop combinatorial loops.

        # opcode latch - inverted so that busy resets to 0
        sync += opc_l.s.eq(issue_i)  # XXX NOTE: INVERTED FROM book!
        sync += opc_l.r.eq(reset_b)  # XXX NOTE: INVERTED FROM book!

        # src operand latch
        sync += src_l.s.eq(issue_i)
        sync += src_l.r.eq(reset_r)

        # addr latch
        sync += adr_l.s.eq(self.rd.go)
        sync += adr_l.r.eq(reset_a)

        # dest operand latch
        sync += wri_l.s.eq(self.go_ad_i | self.go_st_i | self.wr.go)
        sync += wri_l.r.eq(reset_w)

        # store latch
        sync += sto_l.s.eq(self.rd.go)  # XXX not sure which
        sync += sto_l.r.eq(reset_s)

        # create a latch/register for the operand
        oper_r = CompALUOpSubset()  # Dest register
        latchregister(m, self.oper_i, oper_r, self.issue_i, name="oper_r")

        # and for each output from the ALU; each one is gated by the
        # matching bit of the n_dst-wide "req" latch (wri_l).
        # NOTE(review): this indexes self.alu.out[i] while the EA latch
        # below reads self.alu.o - confirm which of the two the ALU exposes.
        data_latches = []
        for i in range(self.n_dst):
            name = "data_r%d" % i
            latch = Signal(self.rwid, name=name, reset_less=True)
            latchregister(m, self.alu.out[i], latch, wri_l.q[i], name)
            data_latches.append(latch)
        # NOTE(review): both data_o and addr_o below are fed from the
        # *last* latch created by the loop - confirm that is intended.
        data_r = data_latches[-1]

        # and one for the output from the ALU (for the EA)
        addr_r = Signal(self.rwid, reset_less=True)  # Effective Address Latch
        latchregister(m, self.alu.o, addr_r, alulatch, "ea_r")

        # and pass the operation to the ALU
        comb += self.alu.op.eq(oper_r)
        comb += self.alu.op.insn_type.eq(InternalOp.OP_ADD)  # override insn_type

        # outputs: busy and release signals
        busy_o = self.busy_o
        comb += self.busy_o.eq(opc_l.q)  # busy out
        comb += self.rd.rel.eq(src_l.q & busy_o)  # src1/src2 req rel
        comb += self.sto_rel_o.eq(sto_l.q & busy_o & self.shadown_i & op_is_st)

        # request release enabled based on if op is a LD/ST or a plain ALU
        # if op is an ADD/SUB or a LD, req_rel activates.
        wr_q = Signal(reset_less=True)
        comb += wr_q.eq(wri_l.q & (~op_ldst | op_is_ld))

        comb += alulatch.eq((op_ldst & self.adr_rel_o) |
                            (~op_ldst & self.wr.rel))

        # select immediate if opcode says so.  however also change the latch
        # to trigger *from* the opcode latch instead.
        comb += src_sel.eq(Mux(op_is_imm, opc_l.qn, src_l.q))
        comb += src2_or_imm.eq(Mux(op_is_imm, oper_r.imm_data.imm,
                                   self.src2_i))

        # create a latch/register for src1/src2 (include immediate select)
        latchregister(m, self.src1_i, self.alu.a, src_l.q, name="src1_r")
        latchregister(m, self.src2_i, src2_r, src_l.q, name="src2_r")
        latchregister(m, src2_or_imm, self.alu.b, src_sel, name="imm_r")

        # decode bits of operand (latched)
        comb += op_is_imm.eq(oper_r.imm_data.imm_ok)  # IMM mode
        comb += op_is_st.eq(oper_r.insn_type == InternalOp.OP_STORE)  # ST
        comb += op_is_ld.eq(oper_r.insn_type == InternalOp.OP_LOAD)  # LD
        # NOTE(review): not consumed anywhere yet, and presumes the
        # CompALUOpSubset held in oper_r exposes an "update" field - confirm.
        op_is_update = oper_r.update  # UPDATE
        comb += op_ldst.eq(op_is_ld | op_is_st)
        comb += self.load_mem_o.eq(op_is_ld & self.go_ad_i)
        comb += self.stwd_mem_o.eq(op_is_st & self.go_st_i)
        comb += self.ld_o.eq(op_is_ld)
        comb += self.st_o.eq(op_is_st)

        # on a go_read, tell the ALU we're accepting data.
        # NOTE: this spells TROUBLE if the ALU isn't ready!
        # go_read is only valid for one clock!
        with m.If(self.rd.go):  # src operands ready, GO!
            with m.If(~self.alu.p_ready_o):  # no ACK yet
                m.d.comb += self.alu.p_valid_i.eq(1)  # so indicate valid

        # only proceed if ALU says its output is valid
        with m.If(self.alu.n_valid_o):
            # write req release out.  waits until shadow is dropped.
            comb += self.wr.rel.eq(wr_q & busy_o & self.shadown_i)
            # address release only happens on LD/ST, and is shadowed.
            comb += self.adr_rel_o.eq(adr_l.q & busy_o &
                                      op_ldst & self.shadown_i)
            # when output latch is ready, and ALU says ready, accept ALU output
            with m.If(self.wr.rel):
                # tells ALU "thanks got it"
                m.d.comb += self.alu.n_ready_i.eq(1)

        # provide "done" signal: select req_rel for non-LD/ST, adr_rel for LD/ST
        comb += self.done_o.eq((self.wr.rel & ~op_ldst) |
                               (self.adr_rel_o & op_ldst))

        # put the register directly onto the output bus on a go_write
        # this is "ALU mode".  go_wr_i *must* be deasserted on next clock
        with m.If(self.wr.go):
            comb += self.data_o.eq(data_r)

        # "LD/ST" mode: put the register directly onto the *address* bus
        with m.If(self.go_ad_i | self.go_st_i):
            comb += self.addr_o.eq(data_r)

        # TODO: think about moving these to another module

        # connect ST to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_st_i on next clock
        with m.If(self.stwd_mem_o):
            wrport = self.mem.wrport
            comb += wrport.addr.eq(self.addr_o)
            comb += wrport.data.eq(src2_r)
            comb += wrport.en.eq(1)

        # connect LD to memory.  NOTE: unit *must* be set back
        # to start again by dropping go_ad_i on next clock
        rdport = self.mem.rdport
        ldd_r = Signal(self.rwid, reset_less=True)  # Dest register
        # latch the LD data; ldlatch is load_mem_o delayed by one clock
        latchregister(m, rdport.data, ldd_r, ldlatch, "ldo_r")
        sync += ldlatch.eq(self.load_mem_o)
        with m.If(self.load_mem_o):
            comb += rdport.addr.eq(self.addr_o)
            # rdport.en is only needed when transparent=False

        # if LD-latch, put ld-reg out onto output
        with m.If(ldlatch | self.load_mem_o):
            comb += self.data_o.eq(ldd_r)

        return m
420 yield from self
.oper_i
.ports()
421 yield from self
.src_i
428 yield self
.load_mem_o
429 yield self
.stwd_mem_o
437 print("wait for", sig
, v
)
def store(dut, src1, src2, imm, imm_ok=True):
    """Simulation process: drive one OP_STORE through the unit.

    * :dut:     unit under test
    * :src1:    value placed on src1_i
    * :src2:    value placed on src2_i
    * :imm:     immediate operand
    * :imm_ok:  whether the immediate is flagged as valid

    A bare ``yield`` advances the simulation by one clock; a yielded
    ``.eq()`` only schedules the assignment.
    """
    yield dut.oper_i.insn_type.eq(InternalOp.OP_STORE)
    yield dut.src1_i.eq(src1)
    yield dut.src2_i.eq(src2)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)

    # issue must be held across a clock edge, otherwise raising it and
    # lowering it again back-to-back means the pulse is never seen.
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield

    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_st_i.eq(1)
    yield from wait_for(dut.sto_rel_o)
    # NOTE(review): there used to be a bare wait_for(dut.stwd_mem_o) call
    # here.  Without "yield from" it only created a generator and never
    # waited, so it has been dropped to keep the timing as it was.
    # Use "yield from wait_for(dut.stwd_mem_o)" if a real wait is wanted.
    yield dut.go_st_i.eq(0)
    yield
def load(dut, src1, src2, imm, imm_ok=True):
    """Simulation process: drive one OP_LOAD and return the loaded value.

    * :dut:     unit under test
    * :src1:    value placed on src1_i
    * :src2:    value placed on src2_i
    * :imm:     immediate operand
    * :imm_ok:  whether the immediate is flagged as valid

    Returns the value sampled from ``dut.data_o`` while go_ad_i is
    still asserted.
    """
    yield dut.oper_i.insn_type.eq(InternalOp.OP_LOAD)
    yield dut.src1_i.eq(src1)
    yield dut.src2_i.eq(src2)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)

    # hold issue across a clock edge so that the pulse is actually seen
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield

    yield dut.rd.go.eq(0b11)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)
    yield from wait_for(dut.adr_rel_o)
    yield dut.go_ad_i.eq(1)
    yield from wait_for(dut.busy_o)
    yield
    data = (yield dut.data_o)
    yield dut.go_ad_i.eq(0)
    # hand the sampled value back: callers compare it with what was stored
    return data
def add(dut, src1, src2, imm, imm_ok=False):
    """Simulation process: drive one OP_ADD and return the result.

    * :dut:     unit under test
    * :src1:    value placed on src1_i
    * :src2:    value placed on src2_i
    * :imm:     immediate operand
    * :imm_ok:  whether the immediate is flagged as valid (default: not)

    Returns the value sampled from ``dut.data_o`` while wr.go is still
    asserted.
    """
    yield dut.oper_i.insn_type.eq(InternalOp.OP_ADD)
    yield dut.src1_i.eq(src1)
    yield dut.src2_i.eq(src2)
    yield dut.oper_i.imm_data.imm.eq(imm)
    yield dut.oper_i.imm_data.imm_ok.eq(imm_ok)

    # hold issue across a clock edge so that the pulse is actually seen
    yield dut.issue_i.eq(1)
    yield
    yield dut.issue_i.eq(0)
    yield

    yield dut.rd.go.eq(1)
    yield from wait_for(dut.rd.rel)
    yield dut.rd.go.eq(0)
    yield from wait_for(dut.wr.rel)
    yield dut.wr.go.eq(1)
    yield from wait_for(dut.busy_o)
    yield
    data = (yield dut.data_o)
    yield dut.wr.go.eq(0)
    yield
    # hand the sampled value back to the caller
    return data
def scoreboard_sim(dut):
    """Top-level simulation process: two STs, two LDs, then two ADDs.

    Each store is read back and compared with the value written.  The
    two add results are not checked.
    """
    # (base address, value) pairs; the immediate offset is always 2
    stores = ((4, 3), (2, 9))

    # two STs (different addresses)
    for base, value in stores:
        yield from store(dut, base, value, 2)

    # two LDs (deliberately LD from the 1st address then 2nd)
    for base, value in stores:
        loaded = yield from load(dut, base, 0, 2)
        assert loaded == value

    # a register-register add...
    yield from add(dut, 4, 3, 0xfeed)

    # and an add-immediate
    yield from add(dut, 4, 0xdeef, 2, imm_ok=True)
class TestLDSTCompUnit(LDSTCompUnit):
    """LDSTCompUnit bundled with its own ALU and TestMemory, for simulation.
    """

    def __init__(self, rwid):
        # same package path as the module-level alu_hier import, so that
        # alu_hier is not loaded a second time under a different name
        from soc.experiment.alu_hier import ALU
        self.alu = alu = ALU(rwid)
        self.mem = mem = TestMemory(rwid, 8)
        LDSTCompUnit.__init__(self, rwid, alu, mem)

    def elaborate(self, platform):
        """Elaborate the unit and attach the test memory as a submodule."""
        m = LDSTCompUnit.elaborate(self, platform)
        m.submodules.mem = self.mem
        # elaborate() has to hand the module back to its caller
        return m
def test_scoreboard():
    """Convert a 16-bit test unit to RTLIL, then run scoreboard_sim on it.

    Writes "test_ldst_comp.il" (RTLIL) and "test_ldst_comp.vcd" (waveform)
    into the current directory.
    """
    dut = TestLDSTCompUnit(16)
    vl = rtlil.convert(dut, ports=dut.ports())
    with open("test_ldst_comp.il", "w") as f:
        # actually store the converted design in the file just opened
        f.write(vl)

    run_simulation(dut, scoreboard_sim(dut), vcd_name='test_ldst_comp.vcd')
# Script entry point: run the scoreboard test when executed directly.
if __name__ == '__main__':
    test_scoreboard()