format some tests
[soc.git] / src / soc / experiment / l0_cache.py
1 """L0 Cache/Buffer
2
3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
5
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
10
11 Links:
12
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
15
16 """
17
18 from nmigen.compat.sim import run_simulation, Settle
19 from nmigen.cli import verilog, rtlil
20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
21 from nmutil.iocontrol import RecordObject
22 from nmigen.utils import log2_int
23 from nmigen.hdl.rec import Record, Layout
24
25 from nmutil.latch import SRLatch, latchregister
26 from soc.decoder.power_decoder2 import Data
27 from soc.decoder.power_enums import MicrOp
28 from soc.regfile.regfile import ortreereduce
29 from nmutil.util import treereduce
30
31 from soc.decoder.power_decoder2 import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen.lib.coding import PriorityEncoder
34 from soc.scoreboard.addr_split import LDSTSplitter
35 from soc.scoreboard.addr_match import LenExpand
36
37 # for testing purposes
38 from soc.config.test.test_loadstore import TestMemPspec
39 from soc.config.loadstore import ConfigMemoryPortInterface
40 from soc.experiment.pimem import PortInterface
41 from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
42 import unittest
43
44
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    Work-in-progress block intended to fan one PortInterface out to two:

    * ``inp``: the one incoming PortInterface
    * ``outp``: a list holding the two *OUTGOING* PortInterfaces
    * an LDSTSplitter submodule is used to do the splitting

    (actually, thinking about it, LDSTSplitter could simply be modified
    to conform to PortInterface: one in, two out)

    Once that is done, each pair of ports may be wired directly to the
    dual ports of L0CacheBuffer.

    The split is to be carried out so that, regardless of alignment or
    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0 of the
    address, whilst outgoing PortInterface[1] takes bit 4 == 1.

    PortInterface *may* need to be changed so that the length is a
    binary number (accepting values 1-16).

    Current state: only the address and the LD/ST indicators of ``inp``
    are connected to the splitter; ``outp`` is not driven yet.
    """

    def __init__(self):
        port_bit4_clear = PortInterface(name="outp_0")
        port_bit4_set = PortInterface(name="outp_1")
        self.outp = [port_bit4_clear, port_bit4_set]
        self.inp = PortInterface(name="inp")
        # debug output, kept as-is
        print(self.outp)

    def elaborate(self, platform):
        """Instantiate the LDSTSplitter and hook up the incoming port."""
        m = Module()

        splitter = LDSTSplitter(64, 48, 4)
        m.submodules.splitter = splitter

        src = self.inp
        m.d.comb += [
            splitter.addr_i.eq(src.addr),  # XXX
            splitter.is_ld_i.eq(src.is_ld_i),
            splitter.is_st_i.eq(src.is_st_i),
        ]

        # TODO: the following splitter inputs are not connected yet:
        #   splitter.len_i, splitter.valid_i, splitter.st_data_i,
        #   splitter.sld_valid_i, splitter.sld_data_i,
        #   splitter.sst_valid_i
        return m
87
88
class DataMergerRecord(Record):
    """Record with a 128-bit ``data`` field and a 16-bit ``en`` field.

    {data: 128 bit, en (byte-enable): 16 bit}

    Both fields are flagged reset-less after construction.
    """

    def __init__(self, name=None):
        fields = [('data', 128), ('en', 16)]
        Record.__init__(self, Layout(fields), name=name)

        for member in (self.data, self.en):
            member.reset_less = True
101
102
103 # TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix: one (any) active row
    is picked, and that row's address-match bits then decide which of
    the incoming data rows get ORed together onto the output.

    By the time DataMerger is used, all of its incoming data has been
    determined not to conflict.  The last step before actually
    submitting the request to the Memory Subsystem is to work out which
    requests, on the same 128-bit cache line, can be "merged" due to
    them being: (A) on the same address (bits 4 and above) and
    (B) having byte-enable lines that do not conflict.

    Put simply, this module will:
    (1) pick a row (any row) and identify it by an index ("idx")
    (2) merge all data/byte-enable rows which are on that same address,
        as indicated by addr_array_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :array_size:   N, the number of rows
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  bits across the diagonal
                       (addr_array_i[x][x]) will always be set, to
                       indicate "active".
        :data_i:       an Nx Array of Records {data: 128 bit, en: 16 bit}
        :data_o:       an Output Record of same type
                       {data: 128 bit, en: 16 bit}
        """
        self.array_size = array_size

        match_rows = [Signal(array_size,
                             reset_less=True,
                             name="addr_match_%d" % i)
                      for i in range(array_size)]
        self.addr_array_i = Array(match_rows)

        self.data_i = Array([DataMergerRecord() for _ in range(array_size)])
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        n_rows = self.array_size

        # (1) pick a row: a row is a candidate when any of its
        # address-match bits is set
        m.submodules.pick = pick = PriorityEncoder(n_rows)
        comb += [pick.i[j].eq(self.addr_array_i[j].bool())
                 for j in range(n_rows)]
        row_found = ~pick.n
        chosen = pick.o

        # (2) merge: each input row is copied into a temporary record only
        # when the chosen row's match bit for it is set (the temporary is
        # otherwise left unassigned), then all temporaries are OR-reduced
        with m.If(row_found):
            gated = []
            for j in range(n_rows):
                tmp = DataMergerRecord()
                with m.If(self.addr_array_i[chosen][j]):
                    comb += tmp.eq(self.data_i[j])
                gated.append(tmp)
            comb += [
                self.data_o.data.eq(ortreereduce(gated, "data")),
                self.data_o.en.eq(ortreereduce(gated, "en")),
            ]

        return m
170
171
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.

    What this version does: it picks one of its ``n_units`` ports that is
    requesting a LD or ST, latches that port's index, and connects the
    picked port to ``pimem`` until ``pimem.pi.busy_o`` drops.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        """
        :n_units: number of PortInterfaces to create (see ``dports``)
        :pimem:   memory-side object; elaborate() uses its
                  ``connect_port(port)`` method and ``pi.busy_o`` signal
        :regwid:  passed through to each PortInterface
        :addrwid: passed through to each PortInterface
        """
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        # one PortInterface per unit, named ldst_port0, ldst_port1, ...
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        # an Array, so that a port can be selected by a Signal index
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        # for i in range(self.n_units):
        #     setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        # idx_l: set while a port is picked.  reset_l: requests release of
        # the picked port (per the comment further down, reset_l is the
        # "sync" mode latch, which is presumably what the True/False first
        # argument selects - confirm against nmutil.latch.SRLatch)
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        # one request bit per port: set when the port asks for a LD or a ST
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)  # & pi.busy_o
            ldsti.append(busy)  # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        # need_pow2=False: n_units does not have to be a power of two
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        # (pick.o is captured into idx under control of idx_l.q)
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index
        # (pick.n is the PriorityEncoder's "no input set" flag)
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        # defaults for the reset latch: both are overridden further down
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # while a port is picked, connect it to the memory interface
        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.busy_o):
                comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE(review): reset_delay is assigned here but not read anywhere
        # else in this class
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)  # clear reset

        return m

    def ports(self):
        """Yield everything yielded by each port's own ports() method."""
        for p in self.dports:
            yield from p.ports()
265
266
class TstL0CacheBuffer(Elaboratable):
    """Test wrapper: an L0CacheBuffer in front of a configured memory port.

    A ConfigMemoryPortInterface is built from ``pspec``; its ``pi``
    attribute is handed to the L0CacheBuffer as the memory side.
    """

    def __init__(self, pspec, n_units=3):
        """
        :pspec:   parameter spec; ``reg_wid`` and ``addr_wid`` are read
        :n_units: number of ports on the L0CacheBuffer
        """
        reg_width = pspec.reg_wid
        addr_width = pspec.addr_wid
        self.cmpi = ConfigMemoryPortInterface(pspec)
        self.pimem = self.cmpi.pi
        # the L0CacheBuffer ports get twice the configured address width
        self.l0 = L0CacheBuffer(n_units, self.pimem,
                                regwid=reg_width,
                                addrwid=addr_width << 1)

    def elaborate(self, platform):
        m = Module()
        subs = m.submodules
        subs.pimem = self.pimem
        subs.l0 = self.l0
        # only some memory configurations carry an "lsmem"
        if hasattr(self.cmpi, 'lsmem'):  # hmmm not happy about this
            subs.lsmem = self.cmpi.lsmem.lsi

        return m

    def ports(self):
        """Yield the ports of cmpi, then l0, then pimem."""
        for part in (self.cmpi, self.l0, self.pimem):
            yield from part.ports()
288
289
def wait_busy(port, no=False):
    """Simulation helper: poll ``port.busy_o`` until it matches ``no``.

    Each pass samples ``port.busy_o`` (by yielding it), prints the
    sample, and finishes once ``bool(sample) == no``.  Otherwise a bare
    ``yield`` lets the simulation advance before sampling again.

    With the default ``no=False`` this waits until the port is no
    longer busy.
    """
    while True:
        sample = yield port.busy_o
        print("busy", no, sample)
        if bool(sample) != no:
            yield  # not there yet: let the simulation advance, re-sample
            continue
        return
297
298
def wait_addr(port):
    """Simulation helper: poll ``port.addr_ok_o`` until it reads false.

    Each pass samples ``port.addr_ok_o`` (by yielding it) and prints the
    sample.  A false sample ends the wait; a true one is followed by a
    bare ``yield`` so the simulation can advance before the next sample.

    NOTE(review): this returns when addr_ok is *not* set, which looks
    inverted relative to the function name - confirm with callers.
    """
    still_set = True
    while still_set:
        still_set = yield port.addr_ok_o
        print("addrok", still_set)
        if still_set:
            yield  # let the simulation advance, then sample again
306
307
def wait_ldok(port):
    """Simulation helper: poll ``port.ld.ok`` until it reads true.

    Each pass samples ``port.ld.ok`` (by yielding it) and prints the
    sample.  A true sample ends the wait; a false one is followed by a
    bare ``yield`` so the simulation can advance before the next sample.
    """
    while True:
        ld_ok = yield port.ld.ok
        print("ldok", ld_ok)
        if not ld_ok:
            yield  # let the simulation advance, then sample again
            continue
        return
315
316
def l0_cache_st(dut, addr, data, datalen):
    """Forward a store request on ``dut.l0`` to ``pi_st``.

    Returns whatever ``pi_st(dut.l0, addr, datalen)`` returns.

    NOTE(review): ``data`` is accepted but never passed on to ``pi_st``
    - confirm against pi_st's signature whether this call is complete.
    """
    target = dut.l0
    return pi_st(target, addr, datalen)
319
320
def l0_cache_ld(dut, addr, datalen, expected):
    """Forward a load request on ``dut.l0`` to ``pi_ld``.

    Returns whatever ``pi_ld(dut.l0, addr, datalen)`` returns.

    NOTE(review): ``expected`` is accepted but not used here.
    """
    target = dut.l0
    return pi_ld(target, addr, datalen)
323
324
def l0_cache_ldst(arg, dut):
    """Run ``pi_ldst`` against the first port of ``dut.l0``.

    ``arg`` is handed to ``pi_ldst`` unchanged (the tests in this file
    pass the TestCase instance), together with ``dut.l0.dports[0]``.
    The result is what the tests give to run_simulation as the process.
    """
    return pi_ldst(arg, dut.l0.dports[0])
328
329
def data_merger_merge(dut):
    """Simulation process exercising a DataMerger.

    First checks that the outputs are zero while all inputs are zero,
    then sets the low eight match bits of row 0, gives every input row
    a distinct enable bit and data byte, and checks the ORed result.
    """
    print("data_merger")

    # starting with all inputs zero
    yield Settle()
    en_seen = yield dut.data_o.en
    data_seen = yield dut.data_o.data
    assert en_seen == 0, "en must be zero"
    assert data_seen == 0, "data must be zero"
    yield

    # row 0: match bits 0-7 set
    yield dut.addr_array_i[0].eq(0xFF)
    # row k contributes enable bit k and a 0xFF byte at bit offset 16*k
    for k in range(dut.array_size):
        rec = dut.data_i[k]
        yield rec.en.eq(1 << k)
        yield rec.data.eq(0xFF << (16*k))
    yield Settle()

    en_seen = yield dut.data_o.en
    data_seen = yield dut.data_o.data
    assert data_seen == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert en_seen == 0xff
    yield
351
352
class TestL0Cache(unittest.TestCase):
    """Runs the LD/ST port test through TstL0CacheBuffer for two
    different ``ldst_ifacetype`` memory configurations."""

    def _convert_and_simulate(self, ifacetype, il_filename, vcd_filename):
        """Build the DUT for ``ifacetype``, dump RTLIL, then simulate."""
        pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)

        il_text = rtlil.convert(dut, ports=[])  # TODOdut.ports())
        with open(il_filename, "w") as il_file:
            il_file.write(il_text)

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name=vcd_filename)

    def test_l0_cache_test_bare_wb(self):
        self._convert_and_simulate('test_bare_wb',
                                   "test_basic_l0_cache_bare_wb.il",
                                   'test_l0_cache_basic_bare_wb.vcd')

    def test_l0_cache_testpi(self):
        self._convert_and_simulate('testpi',
                                   "test_basic_l0_cache.il",
                                   'test_l0_cache_basic_testpi.vcd')
382
383
class TestDataMerger(unittest.TestCase):
    """Simulates an 8-row DataMerger with the data_merger_merge process."""

    def test_data_merger(self):
        merger = DataMerger(8)

        # RTLIL export is currently disabled:
        # vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        process = data_merger_merge(merger)
        run_simulation(merger, process,
                       vcd_name='test_data_merger.vcd')
395
396
class TestDualPortSplitter(unittest.TestCase):
    """Placeholder test: only checks that DualPortSplitter can be
    constructed.  RTLIL export and simulation are still disabled."""

    def test_dual_port_splitter(self):
        splitter = DualPortSplitter()

        # disabled for now:
        # vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        # run_simulation(dut, data_merger_merge(dut),
        #                vcd_name='test_dual_port_splitter.vcd')
408
409
if __name__ == '__main__':
    # exit=False: unittest.main() returns after running the tests instead
    # of calling sys.exit()
    unittest.main(exit=False)