3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
18 from nmigen
.compat
.sim
import run_simulation
, Settle
19 from nmigen
.cli
import verilog
, rtlil
20 from nmigen
import Module
, Signal
, Mux
, Elaboratable
, Array
, Cat
21 from nmutil
.iocontrol
import RecordObject
22 from nmigen
.utils
import log2_int
23 from nmigen
.hdl
.rec
import Record
, Layout
25 from nmutil
.latch
import SRLatch
, latchregister
26 from soc
.decoder
.power_decoder2
import Data
27 from soc
.decoder
.power_enums
import MicrOp
28 from soc
.regfile
.regfile
import ortreereduce
29 from nmutil
.util
import treereduce
31 from soc
.decoder
.power_decoder2
import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen
.lib
.coding
import PriorityEncoder
34 from soc
.scoreboard
.addr_split
import LDSTSplitter
35 from soc
.scoreboard
.addr_match
import LenExpand
37 # for testing purposes
38 from soc
.config
.test
.test_loadstore
import TestMemPspec
39 from soc
.config
.loadstore
import ConfigMemoryPortInterface
40 from soc
.experiment
.pimem
import PortInterface
41 from soc
.config
.test
.test_pi2ls
import pi_ld
, pi_st
, pi_ldst
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    * one incoming PortInterface
    * two *OUTGOING* PortInterfaces
    * uses LDSTSplitter to do it

    (actually, thinking about it LDSTSplitter could simply be
    modified to conform to PortInterface: one in, two out)

    once that is done each pair of ports may be wired directly
    to the dual ports of L0CacheBuffer

    The split is carried out so that, regardless of alignment or
    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
    of the address, whilst outgoing PortInterface[1] takes
    bit 4 == 1.

    PortInterface *may* need to be changed so that the length is
    a binary number (accepting values 1-16).
    """

    def __init__(self):
        # two outgoing ports: one per 128-bit "half" of the dual-port cache
        self.outp = [PortInterface(name="outp_0"),
                     PortInterface(name="outp_1")]
        # single incoming port, to be split across the two outputs
        self.inp = PortInterface(name="inp")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        # data-width 64, addr-width 48, shift 4 (128-bit cache lines)
        # NOTE(review): constants assumed from the file's other 64/48 usage
        # -- confirm against LDSTSplitter's signature
        m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
        comb += splitter.addr_i.eq(self.inp.addr)  # XXX
        # wiring is incomplete: length/valid/data paths still TODO
        #comb += splitter.len_i.eq()
        #comb += splitter.valid_i.eq()
        comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
        comb += splitter.is_st_i.eq(self.inp.is_st_i)
        #comb += splitter.st_data_i.eq()
        #comb += splitter.sld_valid_i.eq()
        #comb += splitter.sld_data_i.eq()
        #comb += splitter.sst_valid_i.eq()
        return m
class DataMergerRecord(Record):
    """
    {data: 128 bit, byte_enable: 16 bit}
    """

    def __init__(self, name=None):
        # 128-bit data plus a 16-bit byte-enable ("en") field:
        # one enable bit per byte of data.  the 'en' entry was missing
        # from the layout even though self.en is set below - restored.
        layout = (('data', 128),
                  ('en', 16))
        Record.__init__(self, Layout(layout), name=name)

        # reset_less: these fields are combinatorially driven,
        # no synchronous reset required
        self.data.reset_less = True
        self.en.reset_less = True
# TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix.
    Identifies (picks) one (any) row, then uses that row,
    based on matching address bits, to merge (OR) all data
    rows into the output.

    Basically, by the time DataMerger is used, all of its incoming data is
    determined not to conflict.  The last step before actually submitting
    the request to the Memory Subsystem is to work out which requests,
    on the same 128-bit cache line, can be "merged" due to them being:
    (A) on the same address (bits 4 and above) (B) having byte-enable
    lines that (as previously mentioned) do not conflict.

    Therefore, put simply, this module will:
    (1) pick a row (any row) and identify it by an index labelled "idx"
    (2) merge all byte-enable lines which are on that same address, as
        indicated by addr_match_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  bits across the diagonal
                       (addr_array_i[x][x]) will always be set,
                       to indicate "active".
        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :data_o: an Output Record of same type
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size

        # one address-match vector (one bit per column) per row
        ul = []
        for i in range(array_size):
            ul.append(Signal(array_size,
                             reset_less=True,
                             name="addr_match_%d" % i))
        self.addr_array_i = Array(ul)

        # incoming data records (one per row) and the merged output
        ul = []
        for i in range(array_size):
            ul.append(DataMergerRecord())
        self.data_i = Array(ul)
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # (1) pick a row: encoder input bit j is set when row j's
        # address-match vector is non-zero
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        for j in range(self.array_size):
            comb += pick.i[j].eq(self.addr_array_i[j].bool())
        valid = ~pick.n  # pick.n == "no inputs set"
        idx = pick.o     # index of the picked row

        # (2) merge: OR together every row selected by the picked row's
        # address-match vector (non-selected rows contribute zero)
        with m.If(valid):
            l = []
            for j in range(self.array_size):
                select = self.addr_array_i[idx][j]
                r = DataMergerRecord()
                with m.If(select):
                    comb += r.eq(self.data_i[j])
                l.append(r)
            comb += self.data_o.data.eq(ortreereduce(l, "data"))
            comb += self.data_o.en.eq(ortreereduce(l, "en"))

        return m
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 1).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        self.n_units = n_units
        self.pimem = pimem  # memory-side PortInterface provider
        self.regwid = regwid
        self.addrwid = addrwid
        # one PortInterface per LD/ST Computation Unit
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        # for i in range(self.n_units):
        #     setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class
        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        # gather one "request pending" bit per port
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)  # & pi.busy_o
            ldsti.append(busy)  # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index when the encoder has a valid
        # input (pick.n is "none set")
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # route the picked port through to the memory PortInterface
        comb += self.pimem.connect_port(port)
        with m.If(~self.pimem.pi.busy_o):
            comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)  # clear reset

        return m

    def __iter__(self):
        # flatten all per-port signals for convert()/simulation
        for p in self.dports:
            yield from p.ports()

    def ports(self):
        return list(self)
class TstL0CacheBuffer(Elaboratable):
    """Test wrapper: L0CacheBuffer plus a config-selected memory
    PortInterface, for simulation and ilang conversion.
    """

    def __init__(self, pspec, n_units=3):
        regwid = pspec.reg_wid
        addrwid = pspec.addr_wid
        self.cmpi = ConfigMemoryPortInterface(pspec)
        self.pimem = self.cmpi.pi
        # addrwid is doubled here -- NOTE(review): presumably to cover
        # the wider internal address range; confirm against pspec usage
        self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid << 1)

    def elaborate(self, platform):
        m = Module()
        m.submodules.pimem = self.pimem
        m.submodules.l0 = self.l0
        if hasattr(self.cmpi, 'lsmem'):  # hmmm not happy about this
            m.submodules.lsmem = self.cmpi.lsmem.lsi
        return m

    def ports(self):
        # expose all sub-interface signals for convert()/simulation
        yield from self.cmpi.ports()
        yield from self.l0.ports()
        yield from self.pimem.ports()
def wait_busy(port, no=False):
    """Simulation helper: wait until port.busy_o equals `no`.

    Generator coroutine - to be run under nmigen simulation, where a
    bare `yield` advances one clock cycle and `yield sig` reads a signal.
    """
    while True:
        busy = yield port.busy_o
        print("busy", no, busy)
        if bool(busy) == no:
            break
        yield


def wait_addr(port):
    """Simulation helper: wait until port.addr_ok_o deasserts."""
    while True:
        addr_ok = yield port.addr_ok_o
        print("addrok", addr_ok)
        if not addr_ok:
            break
        yield


def wait_ldok(port):
    """Simulation helper: wait until the load-data "ok" flag asserts."""
    while True:
        ldok = yield port.ld.ok
        print("ldok", ldok)
        if ldok:
            break
        yield
def l0_cache_st(dut, addr, data, datalen):
    """Perform a store of `data` (length `datalen`) at `addr` through the
    L0 cache-buffer's PortInterface.  Returns the pi_st coroutine for the
    simulator to run.

    Fix: `data` was accepted but not passed through to pi_st, whose
    signature is (pi, addr, data, datalen, ...) -- the store had no data.
    """
    return pi_st(dut.l0, addr, data, datalen)
def l0_cache_ld(dut, addr, datalen, expected):
    # Perform a load of `datalen` bytes at `addr` through the L0
    # cache-buffer's PortInterface; returns the pi_ld coroutine.
    # NOTE(review): `expected` is accepted but never checked here --
    # presumably the caller compares the returned value; confirm.
    return pi_ld(dut.l0, addr, datalen)
def l0_cache_ldst(arg, dut):
    # Run the combined load/store test helper against the first
    # ld/st PortInterface of the L0 cache-buffer.
    # `arg` is the unittest.TestCase instance (used by pi_ldst for asserts).
    port0 = dut.l0.dports[0]
    return pi_ldst(arg, port0)
def data_merger_merge(dut):
    """Simulation stimulus for DataMerger.

    Checks the all-zero idle state, then sets row 0's address-match
    vector to select every row, gives each row a distinct en bit and a
    distinct 16-bit-aligned data byte, and checks the ORed result.
    """
    # starting with all inputs zero
    yield Settle()
    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert en == 0, "en must be zero"
    assert data == 0, "data must be zero"
    yield

    # row 0 matches all rows (assumes array_size == 8: 0xFF is 8 bits)
    yield dut.addr_array_i[0].eq(0xFF)
    for j in range(dut.array_size):
        yield dut.data_i[j].en.eq(1 << j)
        yield dut.data_i[j].data.eq(0xFF << (16*j))
    yield Settle()

    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
    # fix: en was read but never checked - each of the 8 rows
    # contributes one enable bit, so the OR is 0xff
    assert en == 0xff, "en must be 0xff"
    yield
class TestL0Cache(unittest.TestCase):
    """Build TstL0CacheBuffer against two memory back-ends, write out
    the ilang, and run the load/store simulation against each.
    """

    def test_l0_cache_test_bare_wb(self):
        # NOTE(review): width kwargs reconstructed to match what
        # TstL0CacheBuffer reads (reg_wid, addr_wid) -- confirm values
        pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
        with open("test_basic_l0_cache_bare_wb.il", "w") as f:
            # fix: file was opened but the converted ilang never written
            f.write(vl)

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic_bare_wb.vcd')

    def test_l0_cache_testpi(self):
        pspec = TestMemPspec(ldst_ifacetype='testpi',
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
        with open("test_basic_l0_cache.il", "w") as f:
            # fix: file was opened but the converted ilang never written
            f.write(vl)

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic_testpi.vcd')
class TestDataMerger(unittest.TestCase):

    def test_data_merger(self):
        # 8-entry merger: matches data_merger_merge's 0xFF (8-bit)
        # address-match stimulus
        dut = DataMerger(8)
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        run_simulation(dut, data_merger_merge(dut),
                       vcd_name='test_data_merger.vcd')
class TestDualPortSplitter(unittest.TestCase):

    def test_dual_port_splitter(self):
        # instantiation-only smoke test: DualPortSplitter's wiring is
        # incomplete, so conversion and simulation remain commented out
        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        # run_simulation(dut, data_merger_merge(dut),
        #                vcd_name='test_dual_port_splitter.vcd')
if __name__ == '__main__':
    # exit=False so the interpreter survives the test run (usable from
    # an interactive session / when chained with other scripts)
    unittest.main(exit=False)