3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
18 from nmigen
.compat
.sim
import run_simulation
, Settle
19 from nmigen
.cli
import verilog
, rtlil
20 from nmigen
import Module
, Signal
, Mux
, Elaboratable
, Array
, Cat
21 from nmutil
.iocontrol
import RecordObject
22 from nmigen
.utils
import log2_int
23 from nmigen
.hdl
.rec
import Record
, Layout
25 from nmutil
.latch
import SRLatch
, latchregister
26 from soc
.decoder
.power_decoder2
import Data
27 from soc
.decoder
.power_enums
import InternalOp
28 from soc
.regfile
.regfile
import ortreereduce
29 from nmutil
.util
import treereduce
31 from soc
.experiment
.compldst
import CompLDSTOpSubset
32 from soc
.decoder
.power_decoder2
import Data
33 #from nmutil.picker import PriorityPicker
34 from nmigen
.lib
.coding
import PriorityEncoder
36 # for testing purposes
37 from soc
.experiment
.testmem
import TestMemory
class PortInterface(RecordObject):
    """PortInterface

    defines the interface - the API - that the LDSTCompUnit connects
    to.  note that this is NOT a "fire-and-forget" interface.  the
    LDSTCompUnit *must* be kept apprised that the request is in
    progress, and only when it has a 100% successful completion rate
    can the notification be given (busy dropped).

    The interface FSM rules are as follows:

    * if busy_o is asserted, a LD/ST is in progress.  further
      requests may not be made until busy_o is deasserted.

    * only one of is_ld_i or is_st_i may be asserted.  busy_o
      will immediately be asserted and remain asserted.

    * addr.ok is to be asserted when the LD/ST address is known.
      addr.data is to be valid on the same cycle.

      addr.ok and addr.data must REMAIN asserted until busy_o
      is de-asserted.  this ensures that there is no need
      for the L0 Cache/Buffer to have an additional address latch
      (because the LDSTCompUnit already has it)

    * addr_ok_o (or addr_exc_o) must be waited for.  these will
      be asserted *only* for one cycle and one cycle only.

    * addr_exc_o will be asserted if there is no chance that the
      memory request may be fulfilled.

      busy_o is deasserted on the same cycle as addr_exc_o is asserted.

    * conversely: addr_ok_o must *ONLY* be asserted if there is a
      HUNDRED PERCENT guarantee that the memory request will be
      fulfilled.

    * for a LD, ld.ok will be asserted - for only one clock cycle -
      at any point in the future that is acceptable to the underlying
      Memory subsystem.  the recipient MUST latch ld.data on that cycle.

      busy_o is deasserted on the same cycle as ld.ok is asserted.

    * for a ST, st.ok may be asserted only after addr_ok_o had been
      asserted, alongside valid st.data at the same time.  st.ok
      must only be asserted for one cycle.

      the underlying Memory is REQUIRED to pick up that data and
      guarantee its delivery.  no back-acknowledgement is required.

      busy_o is deasserted on the cycle AFTER st.ok is asserted.
    """

    def __init__(self, name=None, regwid=64, addrwid=48):
        # regwid: bit-width of the LD/ST data paths (default 64)
        # addrwid: bit-width of the address (default 48)
        self._addrwid = addrwid

        RecordObject.__init__(self, name=name)

        # distinguish op type (ld/st)
        self.is_ld_i = Signal(reset_less=True)
        self.is_st_i = Signal(reset_less=True)
        self.op = CompLDSTOpSubset()  # hm insn_type ld/st duplicates here

        self.busy_o = Signal(reset_less=True)    # do not use if busy
        self.go_die_i = Signal(reset_less=True)  # back to reset
        self.addr = Data(addrwid, "addr_i")      # addr/addr-ok
        # addr is valid (TLB, L1 etc.)
        self.addr_ok_o = Signal(reset_less=True)
        self.addr_exc_o = Signal(reset_less=True)  # TODO, "type" of exception

        # ld.ok/ld.data: set by the L0 Cache/Buf on LD completion
        self.ld = Data(regwid, "ld_data_o")  # ok to be set by L0 Cache/Buf
        # st.ok/st.data: set by the CompUnit after addr_ok_o
        self.st = Data(regwid, "st_data_i")  # ok to be set by CompUnit
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    * one incoming PortInterface
    * two *OUTGOING* PortInterfaces
    * uses LDSTSplitter to do it

    (actually, thinking about it LDSTSplitter could simply be
    modified to conform to PortInterface: one in, two out)

    once that is done each pair of ports may be wired directly
    to the dual ports of L0CacheBuffer
    """

    def __init__(self):
        # create the outgoing-port list *before* populating it: the
        # previous code assigned to self.outp[0]/self.outp[1] without
        # ever creating self.outp, which raises AttributeError.
        self.outp = [PortInterface(name="outp_0"),
                     PortInterface(name="outp_1")]
        self.inp = PortInterface(name="inp")
        # TODO: elaborate function (wire inp -> outp via LDSTSplitter)
class DataMergerRecord(Record):
    """
    {data: 128 bit, byte_enable: 16 bit}
    """

    def __init__(self, name=None):
        # layout per the class docstring: 128-bit data plus a 16-bit
        # byte-enable (one en bit per byte of data).  the 'en' field was
        # missing from the layout tuple, leaving it syntactically
        # unterminated; DataMerger reduces over both "data" and "en".
        layout = (('data', 128),
                  ('en', 16))
        Record.__init__(self, Layout(layout), name=name)

        #FIXME: make resetless
153 # TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix.
    Identifies (picks) one (any) row, then uses that row,
    based on matching address bits, to merge (OR) all data
    rows into the output.

    Basically, by the time DataMerger is used, all of its incoming data is
    determined not to conflict.  The last step before actually submitting
    the request to the Memory Subsystem is to work out which requests,
    on the same 128-bit cache line, can be "merged" due to them being:
    (A) on the same address (bits 4 and above) (B) having byte-enable
    lines that (as previously mentioned) do not conflict.

    Therefore, put simply, this module will:
    (1) pick a row (any row) and identify it by an index labelled "idx"
    (2) merge all byte-enable lines which are on that same address, as
        indicated by addr_match_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  bits across the diagonal
                       (addr_array_i[x][x]) will always be set,
                       to indicate "active".
        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :data_o: an Output Record of same type
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size
        ul = []  # NOTE(review): accumulator line elided in copy reviewed
        for i in range(array_size):
            ul.append(Signal(array_size,
                             reset_less=True,  # NOTE(review): arg elided; reconstructed — confirm
                             name="addr_match_%d" % i))
        self.addr_array_i = Array(ul)

        ul = []  # NOTE(review): accumulator line elided in copy reviewed
        for i in range(array_size):
            ul.append(DataMergerRecord())
        self.data_i = Array(ul)
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        # NOTE(review): several original lines of this method were elided
        # in the copy reviewed (module setup, "idx", the pick-valid guard).
        # Reconstructed from the visible statements and the class
        # docstring's (1)/(2) description — confirm against full source.
        m = Module()
        comb = m.d.comb
        # (1) pick a row: priority-encode which addr_match rows are non-zero
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        for j in range(self.array_size):
            comb += pick.i[j].eq(self.addr_array_i[j].bool())
        idx = pick.o  # NOTE(review): "idx" binding reconstructed — confirm
        # (2) merge: for every row whose address matches the picked row's,
        #     gate its data/en through, then OR-reduce everything together
        l = []  # NOTE(review): accumulator reconstructed — confirm
        for j in range(self.array_size):
            select = self.addr_array_i[idx][j]
            r = DataMergerRecord()
            with m.If(select):  # NOTE(review): guard reconstructed — confirm
                comb += r.eq(self.data_i[j])
            l.append(r)  # NOTE(review): reconstructed — confirm
        comb += self.data_o.data.eq(ortreereduce(l, "data"))
        comb += self.data_o.en.eq(ortreereduce(l, "en"))
        return m  # NOTE(review): reconstructed — confirm
class LDSTPort(Elaboratable):
    """One LD/ST port: wraps a PortInterface and tracks its busy state."""

    def __init__(self, idx, regwid=64, addrwid=48):
        # idx is only used to give the PortInterface a unique name
        self.pi = PortInterface("ldst_port%d" % idx, regwid, addrwid)

    def elaborate(self, platform):
        m = Module()  # NOTE(review): line elided in copy reviewed — confirm
        comb, sync = m.d.comb, m.d.sync

        # busy_l: holds busy_o; cyc_l: sync latch giving ST its extra cycle
        m.submodules.busy_l = busy_l = SRLatch(False, name="busy")
        m.submodules.cyc_l = cyc_l = SRLatch(True, name="cyc")
        comb += cyc_l.s.eq(0)
        comb += cyc_l.r.eq(0)

        # this is a little weird: we let the L0Cache/Buffer set
        # the outputs: this module just monitors "state".

        # LD/ST requested activates "busy"
        with m.If(self.pi.is_ld_i | self.pi.is_st_i):
            comb += busy_l.s.eq(1)

        # monitor for an exception or the completion of LD.
        with m.If(self.pi.addr_exc_o):
            comb += busy_l.r.eq(1)

        # however ST needs one cycle before busy is reset
        with m.If(self.pi.st.ok | self.pi.ld.ok):
            comb += cyc_l.s.eq(1)

        with m.If(cyc_l.q):  # NOTE(review): guard elided; reconstructed — confirm
            comb += cyc_l.r.eq(1)
            comb += busy_l.r.eq(1)

        # busy latch outputs to interface
        comb += self.pi.busy_o.eq(busy_l.q)

        return m  # NOTE(review): line elided in copy reviewed — confirm

    def __iter__(self):  # NOTE(review): header elided; reconstructed — confirm
        yield self.pi.is_ld_i
        yield self.pi.is_st_i
        yield from self.pi.op.ports()
        yield self.pi.go_die_i
        yield from self.pi.addr.ports()
        yield self.pi.addr_ok_o
        yield self.pi.addr_exc_o

        yield from self.pi.ld.ports()
        yield from self.pi.st.ports()
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.
    """

    def __init__(self, n_units, mem, regwid=64, addrwid=48):
        # n_units: number of LDSTPorts to create
        # mem: memory (TestMemory-compatible: has rdport/wrport)
        self.n_units = n_units
        self.mem = mem  # NOTE(review): line elided; reconstructed — confirm
        ul = []  # NOTE(review): line elided; reconstructed — confirm
        for i in range(n_units):
            ul.append(LDSTPort(i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()  # NOTE(review): line elided; reconstructed — confirm
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        for i in range(self.n_units):
            setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        m.submodules.st_active = st_active = SRLatch(False, name="st_active")
        m.submodules.ld_active = ld_active = SRLatch(False, name="ld_active")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.adrok_l = adrok_l = SRLatch(False, name="addr_acked")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.ldpick = ldpick = PriorityEncoder(self.n_units)
        m.submodules.stpick = stpick = PriorityEncoder(self.n_units)

        # NOTE(review): lds/sts appear unused below — confirm against
        # full source before removing.
        lds = Signal(self.n_units, reset_less=True)
        sts = Signal(self.n_units, reset_less=True)
        ldi = []  # NOTE(review): line elided; reconstructed — confirm
        sti = []  # NOTE(review): line elided; reconstructed — confirm
        for i in range(self.n_units):
            pi = self.dports[i].pi
            ldi.append(pi.is_ld_i & pi.busy_o)  # accumulate ld-req signals
            sti.append(pi.is_st_i & pi.busy_o)  # accumulate st-req signals
        # put the requests into the priority-pickers
        comb += ldpick.i.eq(Cat(*ldi))
        comb += stpick.i.eq(Cat(*sti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        ld_idx = Signal(nbits, reset_less=False)
        st_idx = Signal(nbits, reset_less=False)
        # use these because of the sync-and-comb pass-through capability
        latchregister(m, ldpick.o, ld_idx, idx_l.qn, name="ld_idx_l")
        latchregister(m, stpick.o, st_idx, idx_l.qn, name="st_idx_l")

        # convenience variables to reference the "picked" port
        ldport = self.dports[ld_idx].pi
        stport = self.dports[st_idx].pi
        # and the memory ports
        rdport = self.mem.rdport
        wrport = self.mem.wrport

        # Priority-Pickers pick one and only one request, capture its index.
        # from that point on this code *only* "listens" to that port.

        sync += adrok_l.s.eq(0)
        comb += adrok_l.r.eq(0)
        with m.If(~ldpick.n):  # a LD request is pending (encoder "none"=0)
            comb += ld_active.s.eq(1)  # activate LD mode
            comb += idx_l.r.eq(1)  # pick (and capture) the port index
        with m.Elif(~stpick.n):  # otherwise a ST request is pending
            comb += st_active.s.eq(1)  # activate ST mode
            comb += idx_l.r.eq(1)  # pick (and capture) the port index

        # from this point onwards, with the port "picked", it stays picked
        # until ld_active (or st_active) are de-asserted.

        # if now in "LD" mode: wait for addr_ok, then send the address out
        # to memory, acknowledge address, and send out LD data
        with m.If(ld_active.q):
            with m.If(ldport.addr.ok & adrok_l.qn):
                comb += rdport.addr.eq(ldport.addr.data)  # addr ok, send thru
                comb += ldport.addr_ok_o.eq(1)  # acknowledge addr ok
                sync += adrok_l.s.eq(1)         # and pull "ack" latch

        # if now in "ST" mode: likewise do the same but with "ST"
        # to memory, acknowledge address, and send out LD data
        with m.If(st_active.q):
            with m.If(stport.addr.ok):
                comb += wrport.addr.eq(stport.addr.data)  # addr ok, send thru
                with m.If(adrok_l.qn):
                    comb += stport.addr_ok_o.eq(1)  # acknowledge addr ok
                    sync += adrok_l.s.eq(1)         # and pull "ack" latch

        # NOTE: in both these, below, the port itself takes care
        # of de-asserting its "busy_o" signal, based on either ld.ok going
        # high (by us, here) or by st.ok going high (by the LDSTCompUnit).

        # for LD mode, when addr has been "ok'd", assume that (because this
        # is a "Memory" test-class) the memory read data is valid.
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)
        with m.If(ld_active.q & adrok_l.q):
            comb += ldport.ld.data.eq(rdport.data)  # put data out
            comb += ldport.ld.ok.eq(1)              # indicate data valid
            comb += reset_l.s.eq(1)  # reset mode after 1 cycle

        # for ST mode, when addr has been "ok'd", wait for incoming "ST ok"
        with m.If(st_active.q & stport.st.ok):
            comb += wrport.data.eq(stport.st.data)  # write st to mem
            comb += wrport.en.eq(1)                 # enable write
            comb += reset_l.s.eq(1)  # reset mode after 1 cycle

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.s.eq(1)  # deactivate port-index selector
            comb += ld_active.r.eq(1)  # leave the ST active for 1 cycle
            comb += st_active.r.eq(1)  # leave the ST active for 1 cycle
            comb += reset_l.r.eq(1)    # clear reset
            comb += adrok_l.r.eq(1)    # address reset

        return m  # NOTE(review): line elided; reconstructed — confirm

    def __iter__(self):  # NOTE(review): header elided; reconstructed — confirm
        for p in self.dports:
            yield from p.pi.ports()  # NOTE(review): body elided; reconstructed
class TstL0CacheBuffer(Elaboratable):
    """Test fixture: an L0CacheBuffer wired to a TestMemory."""

    def __init__(self, n_units=3, regwid=16, addrwid=4):
        self.mem = TestMemory(regwid, addrwid)
        self.l0 = L0CacheBuffer(n_units, self.mem, regwid, addrwid)

    def elaborate(self, platform):
        m = Module()  # NOTE(review): line elided; reconstructed — confirm
        m.submodules.mem = self.mem
        m.submodules.l0 = self.l0
        return m  # NOTE(review): line elided; reconstructed — confirm

    def ports(self):  # NOTE(review): header elided; reconstructed — confirm
        # expose the L0 ports plus the raw memory read/write ports,
        # so the simulation/vcd can observe memory traffic directly
        yield from self.l0.ports()
        yield self.mem.rdport.addr
        yield self.mem.rdport.data
        yield self.mem.wrport.addr
        yield self.mem.wrport.data
def wait_busy(port, no=False):
    """Simulation generator: poll until port busy_o equals `no`.

    NOTE(review): the loop and exit condition were elided in the copy
    reviewed; reconstructed from the "wait until not busy" call sites
    (no=False => return once busy_o reads back 0) — confirm.
    """
    while True:
        busy = yield port.pi.busy_o
        print("busy", no, busy)
        if busy == no:  # NOTE(review): reconstructed — confirm
            break
        yield
449 addr_ok
= yield port
.pi
.addr_ok_o
450 print("addrok", addr_ok
)
458 ldok
= yield port
.pi
.ld
.ok
def l0_cache_st(dut, addr, data):
    """Simulation generator: issue a ST of `data` to `addr` on port 1,
    following the PortInterface FSM rules (addr first, then a one-cycle
    st.ok pulse)."""
    l0 = dut.l0               # NOTE(review): setup lines elided; reconstructed
    port1 = l0.dports[1]      # NOTE(review): reconstructed — confirm

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a ST on the port.  address first:
    yield port1.pi.is_st_i.eq(1)  # indicate ST

    yield port1.pi.addr.data.eq(addr)  # set address
    yield port1.pi.addr.ok.eq(1)  # set ok
    yield from wait_addr(port1)  # wait until addr ok
    # yield # not needed, just for checking
    # yield # not needed, just for checking
    # assert "ST" for one cycle (required by the API)
    yield port1.pi.st.data.eq(data)
    yield port1.pi.st.ok.eq(1)
    yield  # NOTE(review): the one-cycle gap was elided; reconstructed — confirm
    yield port1.pi.st.ok.eq(0)

    # can go straight to reset.
    yield port1.pi.is_st_i.eq(0)  # end
    yield port1.pi.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, False) # wait until not busy
def l0_cache_ld(dut, addr, expected):
    """Simulation generator: issue a LD from `addr` on port 1 and return
    the data read.

    NOTE(review): `expected` is unused in the visible body — the caller
    performs the comparison; confirm against full source.
    """
    l0 = dut.l0               # NOTE(review): setup lines elided; reconstructed
    port1 = l0.dports[1]      # NOTE(review): reconstructed — confirm

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a LD on the port.  address first:
    yield port1.pi.is_ld_i.eq(1)  # indicate LD

    yield port1.pi.addr.data.eq(addr)  # set address
    yield port1.pi.addr.ok.eq(1)  # set ok
    yield from wait_addr(port1)  # wait until addr ok

    yield from wait_ldok(port1)  # wait until ld ok
    data = yield port1.pi.ld.data

    # cleanup
    yield port1.pi.is_ld_i.eq(0)  # end
    yield port1.pi.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, no=False) # wait until not busy

    # callers bind this via "result = yield from l0_cache_ld(...)"
    return data  # NOTE(review): line elided; reconstructed — confirm
def l0_cache_ldst(dut):
    """Simulation generator: ST two values, LD them back, check round-trip."""
    yield  # NOTE(review): reconstructed — confirm
    # NOTE(review): the original data constants were elided in the copy
    # reviewed; any two distinct regwid-sized values exercise the path.
    data = 0xbeef
    data2 = 0xf00f
    yield from l0_cache_st(dut, 0x2, data)
    yield from l0_cache_st(dut, 0x3, data2)
    result = yield from l0_cache_ld(dut, 0x2, data)
    result2 = yield from l0_cache_ld(dut, 0x3, data2)
    yield  # NOTE(review): reconstructed — confirm
    assert data == result, "data %x != %x" % (result, data)
    assert data2 == result2, "data2 %x != %x" % (result2, data2)
def data_merger_merge(dut):
    """Simulation generator for DataMerger: check all-zero inputs give an
    all-zero output, then set row 0 to match every address and verify the
    OR-merged data pattern."""
    #starting with all inputs zero
    yield Settle()  # NOTE(review): settle line elided; reconstructed — confirm
    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert en == 0, "en must be zero"
    assert data == 0, "data must be zero"
    yield  # NOTE(review): reconstructed — confirm

    # row 0 matches all 8 addresses; each row drives a distinct en bit
    # and a 0xFF byte in a distinct 16-bit lane of data
    yield dut.addr_array_i[0].eq(0xFF)
    for j in range(dut.array_size):
        yield dut.data_i[j].en.eq(1 << j)
        yield dut.data_i[j].data.eq(0xFF << (16*j))
    yield Settle()  # NOTE(review): reconstructed — confirm

    en = yield dut.data_o.en
    data = yield dut.data_o.data
    # merged output: 0xFF in the low byte of each of the 8 16-bit lanes
    assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
def test_l0_cache():  # NOTE(review): header elided — name inferred from
    # the vcd filename and the __main__ convention; confirm.
    """Run the LD/ST round-trip simulation on TstL0CacheBuffer."""
    dut = TstL0CacheBuffer()
    #vl = rtlil.convert(dut, ports=dut.ports())
    #with open("test_basic_l0_cache.il", "w") as f:
    #    f.write(vl)

    run_simulation(dut, l0_cache_ldst(dut),
                   vcd_name='test_l0_cache_basic.vcd')
def test_data_merger():
    """Run the merge-check simulation on a DataMerger."""
    # NOTE(review): dut construction elided in the copy reviewed;
    # array_size=8 inferred from data_merger_merge's 0xFF (8-bit)
    # address-match mask — confirm.
    dut = DataMerger(array_size=8)
    #vl = rtlil.convert(dut, ports=dut.ports())
    #with open("test_data_merger.il", "w") as f:
    #    f.write(vl)

    run_simulation(dut, data_merger_merge(dut),
                   vcd_name='test_data_merger.vcd')
579 if __name__
== '__main__':