42ef061072d6b6b1511fa9e16061286744b27153
[soc.git] / src / soc / experiment / l0_cache.py
1 """L0 Cache/Buffer
2
3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
5
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
10
11 Links:
12
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
15
16 """
17
18 from nmigen.compat.sim import run_simulation, Settle
19 from nmigen.cli import verilog, rtlil
20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
21 from nmutil.iocontrol import RecordObject
22 from nmigen.utils import log2_int
23 from nmigen.hdl.rec import Record, Layout
24
25 from nmutil.latch import SRLatch, latchregister
26 from openpower.decoder.power_decoder2 import Data
27 from openpower.decoder.power_enums import MicrOp
28 from soc.regfile.regfile import ortreereduce
29 from nmutil.util import treereduce
30
31 from openpower.decoder.power_decoder2 import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen.lib.coding import PriorityEncoder
34 from soc.scoreboard.addr_split import LDSTSplitter
35 from soc.scoreboard.addr_match import LenExpand
36
37 # for testing purposes
38 from soc.config.test.test_loadstore import TestMemPspec
39 from soc.config.loadstore import ConfigMemoryPortInterface
40 from soc.experiment.pimem import PortInterface
41 from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
42 import unittest
43
class L0CacheBuffer2(Elaboratable):
    """L0CacheBuffer2

    Holds an Array of n_units PortInterfaces; elaboration attaches one
    LDSTSplitter per port as a named submodule.
    """
    def __init__(self, n_units=8, regwid=64, addrwid=64):
        self.n_units = n_units
        self.regwid = regwid
        self.addrwid = addrwid
        self.dports = Array([PortInterface() for _ in range(self.n_units)])

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # attach one LDSTSplitter per port, as named submodules
        for idx, port in enumerate(self.dports):
            splitter = LDSTSplitter(64, 64, 4, port)
            setattr(m.submodules, "ldst_splitter%d" % idx, splitter)

        # state-machine latches TODO
        return m
67
class DataMergerRecord(Record):
    """A 128-bit data word paired with a 16-bit byte-enable mask:
    {data: 128 bit, en: 16 bit}
    """

    def __init__(self, name=None):
        layout = (('data', 128),
                  ('en', 16))
        Record.__init__(self, Layout(layout), name=name)

        # neither field needs a reset value
        for field in (self.data, self.en):
            field.reset_less = True
80
class CacheRecord(Record):
    """One dual-bank cache request: a 37-bit address plus, for each of the
    even and odd 128-bit banks, a 7-bit sub-address, a 16-bit byte-mask
    and a 128-bit data word.
    """
    def __init__(self, name=None):
        layout = (('addr', 37),
                  ('a_even', 7),
                  ('bytemask_even', 16),
                  ('data_even', 128),
                  ('a_odd', 7),
                  ('bytemask_odd', 16),
                  ('data_odd', 128))
        Record.__init__(self, Layout(layout), name=name)

        # no field carries a reset value
        for fieldname, _width in layout:
            getattr(self, fieldname).reset_less = True
99
100
101
102 # TODO: formal verification
# TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data rows according to an address-match matrix.  A priority
    encoder picks one (any) active row; every row whose address matches
    the picked one (addr_match_i[idx][j] set) is OR-ed into the output.

    By the time DataMerger is used, the incoming requests are already
    known not to conflict: the final step before submission to the
    Memory Subsystem is to combine requests that hit the same 128-bit
    cache line (same address bits 4 and up) and whose byte-enable lines
    do not overlap.

    In short:
    (1) pick any one row, index "idx"
    (2) OR together the byte-enables and data of every row flagged in
        addr_match_i[idx], producing the merged output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  diagonal bits (addr_array_i[x][x])
                       are always set, meaning "active".
        :i_data: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :o_data: an Output Record of the same shape
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size
        # note: signal names must stay "addr_match_%d" for waveform/test use
        self.addr_array_i = Array([Signal(array_size, reset_less=True,
                                          name="addr_match_%d" % row)
                                   for row in range(array_size)])
        self.i_data = Array([DataMergerRecord() for _ in range(array_size)])
        self.o_data = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # (1) priority-pick one active row
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        comb += [pick.i[row].eq(self.addr_array_i[row].bool())
                 for row in range(self.array_size)]
        idx = pick.o

        # (2) OR-merge every row whose address matches the picked row
        with m.If(~pick.n):  # ~pick.n: at least one row is active
            masked = []
            for row in range(self.array_size):
                rec = DataMergerRecord()
                # gate each row's contribution on its address-match bit
                with m.If(self.addr_array_i[idx][row]):
                    comb += rec.eq(self.i_data[row])
                masked.append(rec)
            comb += self.o_data.data.eq(ortreereduce(masked, "data"))
            comb += self.o_data.en.eq(ortreereduce(masked, "en"))

        return m
169
class TstDataMerger2(Elaboratable):
    """Test harness wiring two DataMergers (even/odd 128-bit banks) to an
    Array of 8 CacheRecord inputs, comparing full bank addresses
    (addr concatenated with a_even / a_odd) to build the match matrices.
    """
    def __init__(self):
        self.data_odd = Signal(128, reset_less=True)
        self.data_even = Signal(128, reset_less=True)
        self.n_units = 8
        self.input_array = Array([CacheRecord() for _ in range(self.n_units)])

    def addr_match(self, j, addr):
        # one bit per unit: does unit j's address equal unit k's?
        return Cat(*[(addr[j] == addr[k]) for k in range(self.n_units)])

    def elaborate(self, platform):
        m = Module()
        m.submodules.dm_odd = dm_odd = DataMerger(self.n_units)
        m.submodules.dm_even = dm_even = DataMerger(self.n_units)

        # full per-bank addresses: line address plus bank sub-address
        addr_even = [Cat(inp.addr, inp.a_even) for inp in self.input_array]
        addr_odd = [Cat(inp.addr, inp.a_odd) for inp in self.input_array]

        for j, inp in enumerate(self.input_array):
            m.d.comb += [
                dm_even.i_data[j].en.eq(inp.bytemask_even),
                dm_odd.i_data[j].en.eq(inp.bytemask_odd),
                dm_even.i_data[j].data.eq(inp.data_even),
                dm_odd.i_data[j].data.eq(inp.data_odd),
                dm_even.addr_array_i[j].eq(self.addr_match(j, addr_even)),
                dm_odd.addr_array_i[j].eq(self.addr_match(j, addr_odd)),
            ]

        m.d.comb += self.data_odd.eq(dm_odd.o_data.data)
        m.d.comb += self.data_even.eq(dm_even.o_data.data)
        return m
210
211
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 1).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this. However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class. That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=64):
        # n_units: number of LD/ST PortInterfaces arbitrated between
        # pimem:   the single downstream memory PortInterface that the
        #          picked request is routed to
        # regwid / addrwid: data and address widths for each port
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        # for i in range(self.n_units):
        #     setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches:
        # idx_l   - holds the picked port index for the request's duration
        # reset_l - triggers the one-cycle-delayed release of idx_l
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it. only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)  # & pi.busy_o
            ldsti.append(busy)  # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index
        with m.If(~pick.n):  # ~pick.n: at least one request present
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # while picked, route the selected port through to memory
        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.busy_o):
                comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE(review): reset_delay is written but never read here --
        # presumably kept so the signal survives into the waveform; verify.
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)  # clear reset

        return m

    def __iter__(self):
        # yields every sub-signal of every port, for test-harness use
        for p in self.dports:
            yield from p.ports()

    def ports(self):
        # flat list of all port signals (see __iter__)
        return list(self)
308
309
class TstL0CacheBuffer(Elaboratable):
    """Test wrapper: builds a ConfigMemoryPortInterface from pspec and
    wraps its PortInterface in an L0CacheBuffer with n_units ports.
    """
    def __init__(self, pspec, n_units=3):
        self.pspec = pspec
        self.cmpi = ConfigMemoryPortInterface(pspec)
        self.pimem = self.cmpi.pi
        self.l0 = L0CacheBuffer(n_units, self.pimem,
                                pspec.reg_wid, pspec.addr_wid << 1)

    def elaborate(self, platform):
        m = Module()
        m.submodules.pimem = self.pimem
        m.submodules.l0 = self.l0

        # no backing store configured: nothing further to wire up
        if not hasattr(self.cmpi, 'lsmem'):
            return m

        # really bad hack, the LoadStore1 classes already have the
        # lsi (LoadStoreInterface) as a submodule - skip those.
        # hmmm not happy about this - should not be digging down and
        # putting modules in
        if self.pspec.ldst_ifacetype not in ['mmu_cache_wb',
                                             'test_mmu_cache_wb']:
            m.submodules.lsmem = self.cmpi.lsmem.lsi

        return m

    def ports(self):
        yield from self.cmpi.ports()
        yield from self.l0.ports()
        yield from self.pimem.ports()
342
343
def wait_busy(port, no=False):
    """Simulation process: step until bool(port.busy_o) == no.
    (no=False waits for busy to deassert; no=True waits for busy.)"""
    done = False
    while not done:
        state = yield port.busy_o
        print("busy", no, state)
        done = (bool(state) == no)
        if not done:
            yield
351
352
def wait_addr(port):
    """Simulation process: step until port.addr_ok_o is deasserted."""
    waiting = True
    while waiting:
        ok = yield port.addr_ok_o
        print("addrok", ok)
        waiting = bool(ok)
        if waiting:
            yield
360
361
def wait_ldok(port):
    """Simulation process: step until port.ld.ok is asserted."""
    success = 0
    while not success:
        success = yield port.ld.ok
        print("ldok", success)
        if not success:
            yield
369
370
def l0_cache_st(dut, addr, data, datalen):
    # Issue a store of datalen bytes at addr via dut.l0's PortInterface.
    # NOTE(review): *data* is accepted but never forwarded to pi_st --
    # looks like a dropped argument; confirm against pi_st's signature
    # in soc.config.test.test_pi2ls.
    return pi_st(dut.l0, addr, datalen)
373
374
def l0_cache_ld(dut, addr, datalen, expected):
    # Issue a load of datalen bytes at addr via dut.l0's PortInterface.
    # NOTE(review): *expected* is accepted but never checked here --
    # presumably the caller compares the returned value; verify.
    return pi_ld(dut.l0, addr, datalen)
377
378
def l0_cache_ldst(arg, dut):
    """Run the combined ld/st simulation against port 0 of dut.l0."""
    return pi_ldst(arg, dut.l0.dports[0])
382
383
def data_merger_merge(dut):
    """Simulation process for DataMerger: with all inputs idle the output
    must be zero; then row 0 is marked as matching all 8 rows, each row
    given distinct data/enables, and the OR-merged output is checked."""
    def outputs():
        # read back the merged enable mask and data word
        en = yield dut.o_data.en
        data = yield dut.o_data.data
        return en, data

    # starting with all inputs zero
    yield Settle()
    en, data = yield from outputs()
    assert en == 0, "en must be zero"
    assert data == 0, "data must be zero"
    yield

    # row 0 matches every row; give each row a distinct byte-enable
    # bit and a distinct 0xFF byte position
    yield dut.addr_array_i[0].eq(0xFF)
    for row in range(dut.array_size):
        yield dut.i_data[row].en.eq(1 << row)
        yield dut.i_data[row].data.eq(0xFF << (16 * row))
    yield Settle()

    en, data = yield from outputs()
    assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert en == 0xff
    yield
404
def data_merger_test2(dut):
    """Placeholder simulation process: settle, then idle two cycles."""
    yield Settle()
    for _ in range(2):
        yield
410
411
class TestL0Cache(unittest.TestCase):

    def _run_case(self, ifacetype, il_name, vcd_name):
        # build the DUT for the given interface type, dump its RTLIL,
        # then run the basic ld/st simulation
        pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                             addr_wid=64,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
        with open(il_name, "w") as f:
            f.write(vl)

        run_simulation(dut, l0_cache_ldst(self, dut), vcd_name=vcd_name)

    def test_l0_cache_test_bare_wb(self):
        self._run_case('test_bare_wb',
                       "test_basic_l0_cache_bare_wb.il",
                       'test_l0_cache_basic_bare_wb.vcd')

    def test_l0_cache_testpi(self):
        self._run_case('testpi',
                       "test_basic_l0_cache.il",
                       'test_l0_cache_basic_testpi.vcd')
441
442
class TestDataMerger(unittest.TestCase):

    def test_data_merger(self):
        dut = TstDataMerger2()
        # rtlil conversion disabled for now:
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        run_simulation(dut, data_merger_test2(dut),
                       vcd_name='test_data_merger.vcd')
454
455
456
class TestDualPortSplitter(unittest.TestCase):

    def test_dual_port_splitter(self):

        # NOTE(review): DualPortSplitter is neither defined in this file
        # nor imported, so this test raises NameError if run -- confirm
        # the intended import (presumably from soc.experiment).
        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        # run_simulation(dut, data_merger_merge(dut),
        #                vcd_name='test_dual_port_splitter.vcd')
468
469
# run all the above unit tests when executed as a script
if __name__ == '__main__':
    unittest.main()