3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
18 from nmigen
.compat
.sim
import run_simulation
, Settle
19 from nmigen
.cli
import verilog
, rtlil
20 from nmigen
import Module
, Signal
, Mux
, Elaboratable
, Array
, Cat
21 from nmutil
.iocontrol
import RecordObject
22 from nmigen
.utils
import log2_int
23 from nmigen
.hdl
.rec
import Record
, Layout
25 from nmutil
.latch
import SRLatch
, latchregister
26 from soc
.decoder
.power_decoder2
import Data
27 from soc
.decoder
.power_enums
import InternalOp
28 from soc
.regfile
.regfile
import ortreereduce
29 from nmutil
.util
import treereduce
31 from soc
.decoder
.power_decoder2
import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen
.lib
.coding
import PriorityEncoder
34 from soc
.scoreboard
.addr_split
import LDSTSplitter
35 from soc
.scoreboard
.addr_match
import LenExpand
37 # for testing purposes
38 from soc
.config
.test
.test_loadstore
import TestMemPspec
39 from soc
.config
.loadstore
import ConfigMemoryPortInterface
40 from soc
.experiment
.pimem
import PortInterface
41 from soc
.config
.test
.test_pi2ls
import pi_ld
, pi_st
, pi_ldst
# DualPortSplitter: owns one incoming PortInterface (inp), a list of two
# outgoing PortInterfaces (outp), and an LDSTSplitter submodule.
# NOTE(review): this listing is a token-per-line extraction. The leading
# integers are the original file's line numbers, and some original lines
# are absent, so the constructor header and the bindings of `m` / `comb`
# used in elaborate() are not visible here.
45 class DualPortSplitter(Elaboratable
):
48 * one incoming PortInterface
49 * two *OUTGOING* PortInterfaces
50 * uses LDSTSplitter to do it
52 (actually, thinking about it LDSTSplitter could simply be
53 modified to conform to PortInterface: one in, two out)
55 once that is done each pair of ports may be wired directly
56 to the dual ports of L0CacheBuffer
58 The split is carried out so that, regardless of alignment or
59 mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
60 of the address, whilst outgoing PortInterface[1] takes
63 PortInterface *may* need to be changed so that the length is
64 a binary number (accepting values 1-16).
# outgoing side: two PortInterfaces, named outp_0 and outp_1
67 self
.outp
= [PortInterface(name
="outp_0"),
68 PortInterface(name
="outp_1")]
# incoming side: a single PortInterface named inp
69 self
.inp
= PortInterface(name
="inp")
72 def elaborate(self
, platform
):
# the splitter is registered as submodule "splitter" and is constructed
# as LDSTSplitter(64, 48, 4)
75 m
.submodules
.splitter
= splitter
= LDSTSplitter(64, 48, 4)
# only the address and the is_ld / is_st flags of inp are driven into the
# splitter; every other splitter input is still commented out below
76 comb
+= splitter
.addr_i
.eq(self
.inp
.addr
) #XXX
77 #comb += splitter.len_i.eq()
78 #comb += splitter.valid_i.eq()
79 comb
+= splitter
.is_ld_i
.eq(self
.inp
.is_ld_i
)
80 comb
+= splitter
.is_st_i
.eq(self
.inp
.is_st_i
)
81 #comb += splitter.st_data_i.eq()
82 #comb += splitter.sld_valid_i.eq()
83 #comb += splitter.sld_data_i.eq()
84 #comb += splitter.sst_valid_i.eq()
# DataMergerRecord: Record subclass used as the element type of
# DataMerger's inputs and output. It has a 128-bit "data" field and an
# "en" field.
# NOTE(review): the layout entry that declares "en" is on an original line
# missing from this view; the descriptive text below gives it as a 16-bit
# byte-enable - confirm against the full file.
88 class DataMergerRecord(Record
):
90 {data: 128 bit, byte_enable: 16 bit}
93 def __init__(self
, name
=None):
# layout tuple handed to Layout(); `name` is passed through to Record
94 layout
= (('data', 128),
96 Record
.__init
__(self
, Layout(layout
), name
=name
)
# both fields are marked reset_less
98 self
.data
.reset_less
= True
99 self
.en
.reset_less
= True
102 # TODO: formal verification
# DataMerger: takes array_size address-match Signals (addr_array_i) and an
# Array of array_size DataMergerRecords (data_i), and drives one
# DataMergerRecord output (data_o) from ortreereduce() over a list of
# records.
# NOTE(review): several original lines of this class are missing from this
# view (the bindings of `ul`, `m`, `comb`, `idx` and `l`, and whatever
# gates the per-row assignment), so the comments describe only what the
# visible statements do. The prose below says "addr_match_i"; the
# attribute in the code is addr_array_i.
103 class DataMerger(Elaboratable
):
106 Merges data based on an address-match matrix.
107 Identifies (picks) one (any) row, then uses that row,
108 based on matching address bits, to merge (OR) all data
109 rows into the output.
111 Basically, by the time DataMerger is used, all of its incoming data is
112 determined not to conflict. The last step before actually submitting
113 the request to the Memory Subsystem is to work out which requests,
114 on the same 128-bit cache line, can be "merged" due to them being:
115 (A) on the same address (bits 4 and above) (B) having byte-enable
116 lines that (as previously mentioned) do not conflict.
118 Therefore, put simply, this module will:
119 (1) pick a row (any row) and identify it by an index labelled "idx"
120 (2) merge all byte-enable lines which are on that same address, as
121 indicated by addr_match_i[idx], onto the output
124 def __init__(self
, array_size
):
126 :addr_array_i: an NxN Array of Signals with bits set indicating address
127 match. bits across the diagonal (addr_array_i[x][x])
128 will always be set, to indicate "active".
129 :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
130 :data_o: an Output Record of same type
131 {data: 128 bit, byte_enable: 16 bit}
133 self
.array_size
= array_size
# one array_size-bit Signal per row, named addr_match_<i>
135 for i
in range(array_size
):
136 ul
.append(Signal(array_size
,
138 name
="addr_match_%d" % i
))
139 self
.addr_array_i
= Array(ul
)
# one input DataMergerRecord per row, plus a single output record
142 for i
in range(array_size
):
143 ul
.append(DataMergerRecord())
144 self
.data_i
= Array(ul
)
145 self
.data_o
= DataMergerRecord()
147 def elaborate(self
, platform
):
# PriorityEncoder over the rows: input bit j is set when any bit of
# addr_array_i[j] is set (.bool())
151 m
.submodules
.pick
= pick
= PriorityEncoder(self
.array_size
)
152 for j
in range(self
.array_size
):
153 comb
+= pick
.i
[j
].eq(self
.addr_array_i
[j
].bool())
# per row j: `select` is bit j of the idx-th address-match row, and a
# fresh record r is assigned from data_i[j].
# NOTE(review): how `select` gates that assignment, and how r is added
# to the list `l`, is on lines not visible in this view.
159 for j
in range(self
.array_size
):
160 select
= self
.addr_array_i
[idx
][j
]
161 r
= DataMergerRecord()
163 comb
+= r
.eq(self
.data_i
[j
])
# both output fields come from ortreereduce over `l`, by field name
165 comb
+= self
.data_o
.data
.eq(ortreereduce(l
,"data"))
166 comb
+= self
.data_o
.en
.eq(ortreereduce(l
,"en"))
171 class L0CacheBuffer(Elaboratable
):
174 Note that the final version will have *two* interfaces per LDSTCompUnit,
175 to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
176 interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).
178 This version is to be used for test purposes (and actively maintained
179 for such, rather than "replaced")
181 There are much better ways to implement this. However it's only
182 a "demo" / "test" class, and one important aspect: it responds
183 combinatorially, where a nmigen FSM's state-changes only activate
184 on clock-sync boundaries.
186 Note: the data byte-order is *not* expected to be normalised (LE/BE)
187 by this class. That task is taken care of by LDSTCompUnit.
# Constructor: records n_units and addrwid, and builds one PortInterface
# per unit (named ldst_port<i>, created with regwid and addrwid), collected
# into the Array self.dports.
# NOTE(review): `pimem` and `regwid` are parameters and elaborate() reads
# self.pimem, but the statements that store them - and the one that binds
# `ul` - are on original lines missing from this view.
190 def __init__(self
, n_units
, pimem
, regwid
=64, addrwid
=48):
191 self
.n_units
= n_units
194 self
.addrwid
= addrwid
196 for i
in range(n_units
):
197 ul
.append(PortInterface("ldst_port%d" % i
, regwid
, addrwid
))
198 self
.dports
= Array(ul
)
# elaborate: builds two SRLatches (idx_l, reset_l) and a PriorityEncoder
# over the per-port "is_ld_i | is_st_i" request bits, captures the encoder
# output into `idx` via latchregister, connects the selected port
# (self.dports[idx]) to self.pimem, and uses reset_l to release idx_l once
# self.pimem.pi.busy_o is low.
# NOTE(review): the bindings of `m`, `ldsti` and `pi`, and some enclosing
# conditions, are on original lines missing from this view; statement
# order is left untouched.
200 def elaborate(self
, platform
):
202 comb
, sync
= m
.d
.comb
, m
.d
.sync
204 # connect the ports as modules
205 #for i in range(self.n_units):
206 # setattr(m.submodules, "port%d" % i, self.dports[i])
208 # state-machine latches
209 m
.submodules
.idx_l
= idx_l
= SRLatch(False, name
="idx_l")
210 m
.submodules
.reset_l
= reset_l
= SRLatch(True, name
="reset")
212 # find one LD (or ST) and do it. only one per cycle.
213 # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
214 # LD/STs using mask-expansion - see LenExpand class
216 m
.submodules
.pick
= pick
= PriorityEncoder(self
.n_units
)
# one request bit per unit: load-or-store requested (the busy_o term is
# commented out in the expression below)
219 for i
in range(self
.n_units
):
221 busy
= (pi
.is_ld_i | pi
.is_st_i
)# & pi.busy_o
222 ldsti
.append(busy
) # accumulate ld/st-req
223 # put the requests into the priority-picker
224 comb
+= pick
.i
.eq(Cat(*ldsti
))
226 # hmm, have to select (record) the right port index
227 nbits
= log2_int(self
.n_units
, False)
228 idx
= Signal(nbits
, reset_less
=False)
230 # use these because of the sync-and-comb pass-through capability
231 latchregister(m
, pick
.o
, idx
, idx_l
.q
, name
="idx_l")
233 # convenience variables to reference the "picked" port
234 port
= self
.dports
[idx
]
236 # pick (and capture) the port index
238 comb
+= idx_l
.s
.eq(1)
240 # from this point onwards, with the port "picked", it stays picked
241 # until idx_l is deasserted
242 comb
+= reset_l
.s
.eq(0)
243 comb
+= reset_l
.r
.eq(0)
246 comb
+= self
.pimem
.connect_port(port
)
247 with m
.If(~self
.pimem
.pi
.busy_o
):
248 comb
+= reset_l
.s
.eq(1) # reset when no longer busy
# NOTE(review): reset_delay is registered from reset_l.q but is not read
# anywhere in the visible part of this method.
250 # ugly hack, due to simultaneous addr req-go acknowledge
251 reset_delay
= Signal(reset_less
=True)
252 sync
+= reset_delay
.eq(reset_l
.q
)
254 # after waiting one cycle (reset_l is "sync" mode), reset the port
255 with m
.If(reset_l
.q
):
256 comb
+= idx_l
.r
.eq(1) # deactivate port-index selector
257 comb
+= reset_l
.r
.eq(1) # clear reset
262 for p
in self
.dports
:
# TstL0CacheBuffer: test harness that builds a TestMemPspec, a
# ConfigMemoryPortInterface from it (self.cmpi), takes that object's `pi`
# as self.pimem, and wraps it in an L0CacheBuffer (self.l0).
# NOTE(review): the remaining TestMemPspec arguments, the binding of `m`
# in elaborate(), and the header of the method that contains the two
# `yield from` statements are on original lines missing from this view.
266 class TstL0CacheBuffer(Elaboratable
):
267 def __init__(self
, n_units
=3, regwid
=16, addrwid
=4, ifacetype
='testpi'):
268 pspec
= TestMemPspec(ldst_ifacetype
=ifacetype
,
272 self
.cmpi
= ConfigMemoryPortInterface(pspec
)
273 self
.pimem
= self
.cmpi
.pi
# note the L0CacheBuffer is given addrwid<<1, not addrwid
274 self
.l0
= L0CacheBuffer(n_units
, self
.pimem
, regwid
, addrwid
<<1)
276 def elaborate(self
, platform
):
# register the port-interface memory and the L0 buffer as submodules;
# lsmem is added only when self.cmpi has such an attribute
278 m
.submodules
.pimem
= self
.pimem
279 m
.submodules
.l0
= self
.l0
280 if hasattr(self
.cmpi
, 'lsmem'): # hmmm not happy about this
281 m
.submodules
.lsmem
= self
.cmpi
.lsmem
.lsi
286 yield from self
.l0
.ports()
287 yield from self
.pimem
# wait_busy: simulation helper (generator) that reads port.busy_o and
# prints it together with `no`.
# NOTE(review): the lines around these statements (original 291 and 294
# onwards) are missing from this view, so the loop / exit condition that
# uses `no` and `busy` cannot be described from here.
290 def wait_busy(port
, no
=False):
292 busy
= yield port
.busy_o
293 print("busy", no
, busy
)
301 addr_ok
= yield port
.addr_ok_o
302 print("addrok", addr_ok
)
310 ldok
= yield port
.ld
.ok
def l0_cache_st(dut, addr, data, datalen):
    """Thin wrapper: forward a store request on ``dut.l0`` to ``pi_st``.

    Returns whatever ``pi_st(dut.l0, addr, datalen)`` returns.

    NOTE(review): ``data`` is accepted but never passed on to ``pi_st``;
    check against ``pi_st``'s signature whether that is intended.
    """
    target = dut.l0
    return pi_st(target, addr, datalen)
def l0_cache_ld(dut, addr, datalen, expected):
    """Thin wrapper: forward a load request on ``dut.l0`` to ``pi_ld``.

    Returns whatever ``pi_ld(dut.l0, addr, datalen)`` returns.

    NOTE(review): ``expected`` is accepted but not used here; any
    comparison against it has to be done by the caller.
    """
    target = dut.l0
    return pi_ld(target, addr, datalen)
def l0_cache_ldst(arg, dut):
    """Run ``pi_ldst`` against the first port (index 0) of ``dut.l0``.

    ``arg`` is handed to ``pi_ldst`` unchanged; the result of ``pi_ldst``
    is returned as-is.
    """
    return pi_ldst(arg, dut.l0.dports[0])
# data_merger_merge: simulation process for a DataMerger `dut`.
# Visible steps: read data_o.en / data_o.data and assert both are zero;
# set addr_array_i[0] to 0xFF; give each row j an enable of 1 << j and
# data of 0xFF << (16*j); read the outputs again and compare data_o.data
# with a constant.
# NOTE(review): the statements between these steps (presumably bare
# yields / settles) are on original lines missing from this view.
330 def data_merger_merge(dut
):
332 #starting with all inputs zero
334 en
= yield dut
.data_o
.en
335 data
= yield dut
.data_o
.data
336 assert en
== 0, "en must be zero"
337 assert data
== 0, "data must be zero"
# row 0 is marked as matching the first eight rows (0xFF)
340 yield dut
.addr_array_i
[0].eq(0xFF)
341 for j
in range(dut
.array_size
):
342 yield dut
.data_i
[j
].en
.eq(1 << j
)
343 yield dut
.data_i
[j
].data
.eq(0xFF << (16*j
))
# the expected value equals the OR of 0xFF << (16*j) for j = 0..7.
# NOTE(review): `en` is re-read here but not checked in the visible lines.
346 en
= yield dut
.data_o
.en
347 data
= yield dut
.data_o
.data
348 assert data
== 0xff00ff00ff00ff00ff00ff00ff00ff
# TestL0Cache: two test methods with the same shape. Each builds a
# TstL0CacheBuffer with regwid=64 (ifacetype 'test_bare_wb' / 'testpi'),
# converts it with rtlil.convert(ports=[]), opens an .il file for writing,
# and runs l0_cache_ldst under run_simulation with a VCD trace.
# NOTE(review): the bodies of the `with open(...)` blocks are on original
# lines missing from this view.
353 class TestL0Cache(unittest
.TestCase
):
355 def test_l0_cache_test_bare_wb(self
):
357 dut
= TstL0CacheBuffer(regwid
=64, ifacetype
='test_bare_wb')
358 vl
= rtlil
.convert(dut
, ports
=[])# TODOdut.ports())
359 with
open("test_basic_l0_cache_bare_wb.il", "w") as f
:
362 run_simulation(dut
, l0_cache_ldst(self
, dut
),
363 vcd_name
='test_l0_cache_basic_bare_wb.vcd')
365 def test_l0_cache_testpi(self
):
367 dut
= TstL0CacheBuffer(regwid
=64, ifacetype
='testpi')
368 vl
= rtlil
.convert(dut
, ports
=[])# TODOdut.ports())
369 with
open("test_basic_l0_cache.il", "w") as f
:
372 run_simulation(dut
, l0_cache_ldst(self
, dut
),
373 vcd_name
='test_l0_cache_basic_testpi.vcd')
# TestDataMerger: runs the data_merger_merge process against `dut` under
# run_simulation, writing test_data_merger.vcd. RTLIL export is commented
# out.
# NOTE(review): the line that constructs `dut` is missing from this view.
376 class TestDataMerger(unittest
.TestCase
):
378 def test_data_merger(self
):
381 #vl = rtlil.convert(dut, ports=dut.ports())
382 #with open("test_data_merger.il", "w") as f:
385 run_simulation(dut
, data_merger_merge(dut
),
386 vcd_name
='test_data_merger.vcd')
# TestDualPortSplitter: currently only constructs a DualPortSplitter; the
# RTLIL export and the simulation run are commented out.
# NOTE(review): the commented-out lines still refer to
# "test_data_merger.il" and data_merger_merge, i.e. they look copied from
# TestDataMerger and would need adapting before being re-enabled.
389 class TestDualPortSplitter(unittest
.TestCase
):
391 def test_dual_port_splitter(self
):
393 dut
= DualPortSplitter()
394 #vl = rtlil.convert(dut, ports=dut.ports())
395 #with open("test_data_merger.il", "w") as f:
398 #run_simulation(dut, data_merger_merge(dut),
399 # vcd_name='test_dual_port_splitter.vcd')
# Script entry point: run every TestCase defined in this module.
if __name__ == '__main__':
    # exit=False: report the results without calling sys.exit()
    unittest.main(exit=False)