This first version is intended for prototyping and test purposes:
it has "direct" access to Memory.

The intention is that this version remains an integral part of the
test infrastructure, and, just as with minerva's memory arrangement,
a dynamic runtime config *selects* alternative memory arrangements
rather than *replaces and discards* this code.

Links:

* https://bugs.libre-soc.org/show_bug.cgi?id=216
* https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
import unittest

from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
from nmigen.cli import verilog, rtlil
from nmigen.compat.sim import run_simulation, Settle
from nmigen.hdl.rec import Record, Layout
from nmigen.lib.coding import PriorityEncoder
from nmigen.utils import log2_int

from nmutil.iocontrol import RecordObject
from nmutil.latch import SRLatch, latchregister
from nmutil.util import treereduce

from openpower.decoder.power_decoder2 import Data
from openpower.decoder.power_enums import MicrOp

from soc.regfile.regfile import ortreereduce
#from nmutil.picker import PriorityPicker
from soc.scoreboard.addr_split import LDSTSplitter
from soc.scoreboard.addr_match import LenExpand

# for testing purposes
from soc.config.test.test_loadstore import TestMemPspec
from soc.config.loadstore import ConfigMemoryPortInterface
from soc.experiment.pimem import PortInterface
from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
class L0CacheBuffer2(Elaboratable):
    """L0 Cache / Buffer, prototype version based on LDSTSplitter.

    Creates one PortInterface per unit and wires each one through an
    LDSTSplitter submodule.  State-machine latches are still TODO.
    """

    def __init__(self, n_units=8, regwid=64, addrwid=48):
        self.n_units = n_units
        self.regwid = regwid
        self.addrwid = addrwid
        ul = []  # fix: accumulator list must exist before use
        for i in range(self.n_units):
            ul += [PortInterface()]
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()  # fix: Module was never created
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        for i in range(self.n_units):
            d = LDSTSplitter(64, 48, 4, self.dports[i])
            setattr(m.submodules, "ldst_splitter%d" % i, d)

        # state-machine latches TODO
        return m  # fix: elaborate() must return the Module
class DataMergerRecord(Record):
    """Record with layout {data: 128 bit, byte_enable: 16 bit}.

    The byte-enable field is named ``en``.
    """

    def __init__(self, name=None):
        # fix: the ('en', 16) field was missing from the layout even
        # though self.en is referenced below (and the docstring promises
        # a 16-bit byte_enable field)
        layout = (('data', 128),
                  ('en', 16))
        Record.__init__(self, Layout(layout), name=name)
        # combinatorially-driven: no reset value needed
        self.data.reset_less = True
        self.en.reset_less = True
class CacheRecord(Record):
    """Record describing one cache-line request, split into an even and
    an odd 128-bit half (address, per-half select bits, byte-masks and
    data).
    """

    def __init__(self, name=None):
        # fix: only 'addr' and 'bytemask_even' survived in the layout,
        # but every field below has reset_less set on it.  Widths for
        # the restored fields are inferred (128-bit data halves, 16-bit
        # byte-masks to match) -- NOTE(review): confirm a_even/a_odd
        # widths against the even/odd address users of this record.
        layout = (('addr', 37),
                  ('a_even', 7),
                  ('bytemask_even', 16),
                  ('data_even', 128),
                  ('a_odd', 7),
                  ('bytemask_odd', 16),
                  ('data_odd', 128))
        Record.__init__(self, Layout(layout), name=name)
        # combinatorially-driven: no reset value needed
        self.addr.reset_less = True
        self.a_even.reset_less = True
        self.bytemask_even.reset_less = True
        self.data_even.reset_less = True
        self.a_odd.reset_less = True
        self.bytemask_odd.reset_less = True
        self.data_odd.reset_less = True
# TODO: formal verification

class DataMerger(Elaboratable):
    """Merges data based on an address-match matrix.

    Identifies (picks) one (any) row, then uses that row,
    based on matching address bits, to merge (OR) all data
    rows into the output.

    Basically, by the time DataMerger is used, all of its incoming data is
    determined not to conflict.  The last step before actually submitting
    the request to the Memory Subsystem is to work out which requests,
    on the same 128-bit cache line, can be "merged" due to them being:
    (A) on the same address (bits 4 and above) (B) having byte-enable
    lines that (as previously mentioned) do not conflict.

    Therefore, put simply, this module will:
    (1) pick a row (any row) and identify it by an index labelled "idx"
    (2) merge all byte-enable lines which are on that same address, as
        indicated by addr_match_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  bits across the diagonal
                       (addr_array_i[x][x]) will always be set, to
                       indicate "active".
        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :data_o: an Output Record of same type
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size

        ul = []  # fix: accumulator list must exist before use
        for i in range(array_size):
            ul.append(Signal(array_size,
                             reset_less=True,
                             name="addr_match_%d" % i))
        self.addr_array_i = Array(ul)

        ul = []
        for i in range(array_size):
            ul.append(DataMergerRecord())
        self.data_i = Array(ul)
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()  # fix: Module was never created
        comb = m.d.comb

        # (1) pick any valid row: a row is valid if any of its
        # address-match bits are set
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        for j in range(self.array_size):
            comb += pick.i[j].eq(self.addr_array_i[j].bool())
        valid = ~pick.n  # encoder "n" is high when no input is set
        idx = pick.o

        # (2) merge: OR together every row whose address matches the
        # picked row's address-match vector
        with m.If(valid):
            rows = []  # fix: row accumulator and conditional select were lost
            for j in range(self.array_size):
                select = self.addr_array_i[idx][j]
                r = DataMergerRecord()
                with m.If(select):
                    comb += r.eq(self.data_i[j])
                rows.append(r)
            comb += self.data_o.data.eq(ortreereduce(rows, "data"))
            comb += self.data_o.en.eq(ortreereduce(rows, "en"))
        return m  # fix: elaborate() must return the Module
class TstDataMerger2(Elaboratable):
    """Test wrapper: feeds n_units CacheRecord inputs into an odd and an
    even DataMerger and exposes the two merged 128-bit data outputs.
    """

    def __init__(self):
        self.data_odd = Signal(128, reset_less=True)
        self.data_even = Signal(128, reset_less=True)
        self.n_units = 8  # fix: used throughout but was never assigned
        ul = []  # fix: accumulator list must exist before use
        for i in range(self.n_units):
            ul.append(CacheRecord())
        self.input_array = Array(ul)

    def addr_match(self, j, addr):
        """Return a Cat() of one bit per unit: addr[j] compared against
        every unit's address (the diagonal bit j==k is always 1).
        """
        ret = []  # fix: accumulator list must exist before use
        for k in range(self.n_units):
            ret += [(addr[j] == addr[k])]
        return Cat(*ret)  # fix: the match vector must be returned

    def elaborate(self, platform):
        m = Module()  # fix: Module was never created
        m.submodules.dm_odd = dm_odd = DataMerger(self.n_units)
        m.submodules.dm_even = dm_even = DataMerger(self.n_units)

        # build the full even/odd addresses: line address plus half-select
        addr_even = []  # fix: accumulators must exist before use
        addr_odd = []
        for j in range(self.n_units):
            inp = self.input_array[j]
            addr_even += [Cat(inp.addr, inp.a_even)]
            addr_odd += [Cat(inp.addr, inp.a_odd)]

        # wire each input record into both mergers, with the per-unit
        # address-match rows computed from the even/odd address lists
        for j in range(self.n_units):
            inp = self.input_array[j]
            m.d.comb += dm_even.data_i[j].en.eq(inp.bytemask_even)
            m.d.comb += dm_odd.data_i[j].en.eq(inp.bytemask_odd)
            m.d.comb += dm_even.data_i[j].data.eq(inp.data_even)
            m.d.comb += dm_odd.data_i[j].data.eq(inp.data_odd)
            m.d.comb += dm_even.addr_array_i[j].eq(
                self.addr_match(j, addr_even))
            m.d.comb += dm_odd.addr_array_i[j].eq(
                self.addr_match(j, addr_odd))

        m.d.comb += self.data_odd.eq(dm_odd.data_o.data)
        m.d.comb += self.data_even.eq(dm_even.data_o.data)
        return m  # fix: elaborate() must return the Module
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 1).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        self.n_units = n_units
        self.pimem = pimem  # fix: used by elaborate() but was never stored
        self.regwid = regwid
        self.addrwid = addrwid
        ul = []  # fix: accumulator list must exist before use
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()  # fix: Module was never created
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        # for i in range(self.n_units):
        #     setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class
        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        ldsti = []  # fix: request accumulator must exist before use
        for i in range(self.n_units):
            pi = self.dports[i]  # fix: per-port binding was lost
            busy = (pi.is_ld_i | pi.is_st_i)  # & pi.busy_o
            ldsti.append(busy)  # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index -- only when the encoder
        # actually found a request (pick.n means "none valid")
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # pass the picked port through to the memory port-interface
        comb += self.pimem.connect_port(port)
        with m.If(~self.pimem.pi.busy_o):
            comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)  # clear reset

        return m  # fix: elaborate() must return the Module

    def __iter__(self):
        for p in self.dports:
            yield from p.ports()

    def ports(self):
        return list(self)
class TstL0CacheBuffer(Elaboratable):
    """Test wrapper: connects an L0CacheBuffer to a memory port-interface
    built by ConfigMemoryPortInterface from the given pspec.
    """

    def __init__(self, pspec, n_units=3):
        regwid = pspec.reg_wid
        addrwid = pspec.addr_wid
        self.cmpi = ConfigMemoryPortInterface(pspec)
        self.pimem = self.cmpi.pi
        # NOTE(review): addrwid is doubled here -- confirm that is the
        # intended address width for the L0 buffer ports
        self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid << 1)

    def elaborate(self, platform):
        m = Module()  # fix: Module was never created
        m.submodules.pimem = self.pimem
        m.submodules.l0 = self.l0
        if hasattr(self.cmpi, 'lsmem'):  # hmmm not happy about this
            m.submodules.lsmem = self.cmpi.lsmem.lsi
        return m  # fix: elaborate() must return the Module

    def ports(self):
        # generator over all sub-interface ports (for conversion/sim)
        yield from self.cmpi.ports()
        yield from self.l0.ports()
        yield from self.pimem.ports()
def wait_busy(port, no=False):
    """Simulation helper: loop until port.busy_o equals *no*."""
    while True:
        busy = yield port.busy_o
        print("busy", no, busy)
        if no == busy:
            break
        yield


def wait_addr(port):
    """Simulation helper: loop until port.addr_ok_o goes low."""
    while True:
        addr_ok = yield port.addr_ok_o
        print("addrok", addr_ok)
        if not addr_ok:
            break
        yield


def wait_ldok(port):
    """Simulation helper: loop until the load-data "ok" flag goes high."""
    while True:
        ldok = yield port.ld.ok
        print("ldok", ldok)
        if ldok:
            break
        yield
def l0_cache_st(dut, addr, data, datalen):
    """Issue a store of *data* at *addr* through the L0 buffer.

    fix: the *data* argument was being dropped from the pi_st call,
    so the store carried no payload -- pass it through.
    """
    return pi_st(dut.l0, addr, data, datalen)
def l0_cache_ld(dut, addr, datalen, expected):
    """Issue a load at *addr* through the L0 buffer.

    *expected* is accepted for call-site symmetry with other helpers;
    it is not checked here.
    """
    return pi_ld(dut.l0, addr, datalen)
def l0_cache_ldst(arg, dut):
    """Run the generic load/store test over port 0 of the L0 buffer."""
    first_port = dut.l0.dports[0]
    return pi_ldst(arg, first_port)
def data_merger_merge(dut):
    """Simulation process for DataMerger: with all inputs zero the
    output must be zero; then drive a distinct en/data value per row,
    match every row against row 0, and check the OR-merged result.
    """
    # starting with all inputs zero
    yield Settle()  # fix: comb logic must settle before sampling
    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert en == 0, "en must be zero"
    assert data == 0, "data must be zero"
    yield

    # row 0 matches all rows -> everything merges onto the output
    yield dut.addr_array_i[0].eq(0xFF)
    for j in range(dut.array_size):
        yield dut.data_i[j].en.eq(1 << j)
        yield dut.data_i[j].data.eq(0xFF << (16 * j))
    yield Settle()  # fix: settle again after driving the inputs

    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert en == 0xff  # fix: en result was sampled but never checked
    yield
def data_merger_test2(dut):
    """Minimal smoke-test process: settle the combinatorial logic with
    all inputs at zero, then run a couple of idle cycles.

    fix: the function had no body at all (syntax error); restored as a
    minimal generator so run_simulation() has a process to drive.
    """
    # starting with all inputs zero
    yield Settle()
    yield
    yield
class TestL0Cache(unittest.TestCase):
    """Convert TstL0CacheBuffer to RTLIL and run the ld/st simulation
    for two memory-interface configurations."""

    def test_l0_cache_test_bare_wb(self):
        # NOTE(review): pspec keyword widths restored from the sibling
        # test below -- confirm against TestMemPspec defaults
        pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODO dut.ports())
        with open("test_basic_l0_cache_bare_wb.il", "w") as f:
            f.write(vl)  # fix: the with-block had no body

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic_bare_wb.vcd')

    def test_l0_cache_testpi(self):
        pspec = TestMemPspec(ldst_ifacetype='testpi',
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODO dut.ports())
        with open("test_basic_l0_cache.il", "w") as f:
            f.write(vl)  # fix: the with-block had no body

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic_testpi.vcd')
class TestDataMerger(unittest.TestCase):
    """Smoke-test the TstDataMerger2 wrapper under simulation."""

    def test_data_merger(self):
        dut = TstDataMerger2()
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        run_simulation(dut, data_merger_test2(dut),
                       vcd_name='test_data_merger.vcd')
class TestDualPortSplitter(unittest.TestCase):
    """Placeholder test for DualPortSplitter."""

    # fix: DualPortSplitter is not imported or defined anywhere in this
    # file, so instantiating it raises NameError -- skip until the
    # import (or the class itself) is restored.
    @unittest.skip("DualPortSplitter is not defined/imported in this file")
    def test_dual_port_splitter(self):
        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:

        # run_simulation(dut, data_merger_merge(dut),
        #                vcd_name='test_dual_port_splitter.vcd')
if __name__ == '__main__':
    # fix: the guard had no body; run the unittest classes defined above
    unittest.main()