simplified L0CacheBuffer down to a "PortInterface Arbiter"
[soc.git] / src / soc / experiment / l0_cache.py
1 """L0 Cache/Buffer
2
3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
5
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
10
11 Links:
12
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
15
16 """
17
18 from nmigen.compat.sim import run_simulation, Settle
19 from nmigen.cli import verilog, rtlil
20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
21 from nmutil.iocontrol import RecordObject
22 from nmigen.utils import log2_int
23 from nmigen.hdl.rec import Record, Layout
24
25 from nmutil.latch import SRLatch, latchregister
26 from soc.decoder.power_decoder2 import Data
27 from soc.decoder.power_enums import InternalOp
28 from soc.regfile.regfile import ortreereduce
29 from nmutil.util import treereduce
30
31 from soc.decoder.power_decoder2 import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen.lib.coding import PriorityEncoder
34 from soc.scoreboard.addr_split import LDSTSplitter
35 from soc.scoreboard.addr_match import LenExpand
36
37 # for testing purposes
38 from soc.experiment.testmem import TestMemory # TODO: replace with TMLSUI
39 # TODO: from soc.experiment.testmem import TestMemoryLoadStoreUnit
40 from soc.experiment.pimem import PortInterface, TestMemoryPortInterface
41
42 import unittest
43
44
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    * one incoming PortInterface
    * two *OUTGOING* PortInterfaces
    * uses LDSTSplitter to do it

    (actually, thinking about it LDSTSplitter could simply be
    modified to conform to PortInterface: one in, two out)

    once that is done each pair of ports may be wired directly
    to the dual ports of L0CacheBuffer

    The split is carried out so that, regardless of alignment or
    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
    of the address, whilst outgoing PortInterface[1] takes
    bit 4 == 1.

    PortInterface *may* need to be changed so that the length is
    a binary number (accepting values 1-16).
    """
    def __init__(self):
        # one incoming request port, two outgoing (even/odd) ports
        # (debug print of self.outp removed)
        self.outp = [PortInterface(name="outp_0"),
                     PortInterface(name="outp_1")]
        self.inp = PortInterface(name="inp")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
        # XXX wiring is incomplete: only the address and the LD/ST
        # direction flags are connected so far; the commented lines
        # below are the remaining TODO connections
        comb += splitter.addr_i.eq(self.inp.addr) #XXX
        #comb += splitter.len_i.eq()
        #comb += splitter.valid_i.eq()
        comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
        comb += splitter.is_st_i.eq(self.inp.is_st_i)
        #comb += splitter.st_data_i.eq()
        #comb += splitter.sld_valid_i.eq()
        #comb += splitter.sld_data_i.eq()
        #comb += splitter.sst_valid_i.eq()
        return m
86
87
class DataMergerRecord(Record):
    """A {data: 128 bit, en: 16 bit byte-enable} signal bundle.

    Both fields are flagged reset_less: they are driven purely
    combinatorially by DataMerger, so no reset value is needed.
    """

    def __init__(self, name=None):
        super().__init__(Layout((('data', 128),
                                 ('en', 16))), name=name)
        for field in (self.data, self.en):
            field.reset_less = True
100
101
# TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix.
    Identifies (picks) one (any) row, then uses that row,
    based on matching address bits, to merge (OR) all data
    rows into the output.

    By the time DataMerger is used, all incoming data has been
    determined not to conflict.  The final step before submitting
    the request to the Memory Subsystem is to work out which requests,
    on the same 128-bit cache line, can be "merged" because they are
    (A) on the same address (bits 4 and above) and (B) have
    non-conflicting byte-enable lines.

    Put simply, this module will:
    (1) pick a row (any row), identified by index "idx"
    (2) merge all byte-enable lines which are on that same address,
        as indicated by addr_match_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  bits across the diagonal
                       (addr_array_i[x][x]) will always be set, to
                       indicate "active".
        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :data_o: an Output Record of same type
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size
        self.addr_array_i = Array(Signal(array_size,
                                         reset_less=True,
                                         name="addr_match_%d" % i)
                                  for i in range(array_size))
        self.data_i = Array(DataMergerRecord() for _ in range(array_size))
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # (1) pick a row: any row with at least one match bit set
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        comb += [pick.i[row].eq(self.addr_array_i[row].bool())
                 for row in range(self.array_size)]
        valid = ~pick.n
        idx = pick.o

        # (2) merge: OR together every row whose address matches the
        # picked row's address (per addr_array_i[idx])
        with m.If(valid):
            gated = []
            for row in range(self.array_size):
                rec = DataMergerRecord()
                # rec stays zero unless this row matches the picked one
                with m.If(self.addr_array_i[idx][row]):
                    comb += rec.eq(self.data_i[row])
                gated.append(rec)
            comb += self.data_o.data.eq(ortreereduce(gated, "data"))
            comb += self.data_o.en.eq(ortreereduce(gated, "en"))

        return m
169
170
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 1).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        # :n_units: number of requester-side PortInterfaces (one per
        #           LDSTCompUnit)
        # :pimem:   the single memory-side PortInterface provider that
        #           the winning request is routed to
        # :regwid:  data width in bits
        # :addrwid: address width in bits
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        # one requester-facing PortInterface per unit
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        #for i in range(self.n_units):
        #    setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches: idx_l holds the winning port index,
        # reset_l schedules the one-cycle-delayed release of the port
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it. only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)
        # NOTE(review): lenexp is instantiated but never connected in
        # this module yet - presumably reserved for the mask-expansion
        # TODO above; confirm before removing
        m.submodules.lenexp = lenexp = LenExpand(4, 8)

        # gather one request bit per port: a port is requesting when it
        # raises is_ld_i or is_st_i
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)# & pi.busy_o
            ldsti.append(busy) # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability:
        # idx follows pick.o combinatorially while idx_l.q is high, and
        # holds the captured value afterwards
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index as soon as any request is
        # present (pick.n low means "at least one input bit set")
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # while a port is selected, route it through to the memory-side
        # PortInterface; when the memory side goes non-busy the
        # transaction is complete, so schedule a reset.
        # NOTE(review): the double ".pi.pi" attribute chain depends on
        # TestMemoryPortInterface's internal structure - confirm against
        # soc.experiment.pimem
        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.pi.busy_o):
                comb += reset_l.s.eq(1) # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE(review): reset_delay is written but never read - appears
        # to be a leftover; confirm before removal
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1) # deactivate port-index selector
            comb += reset_l.r.eq(1) # clear reset

        return m

    def ports(self):
        # expose every requester-side PortInterface signal (e.g. for
        # rtlil conversion / simulation tracing)
        for p in self.dports:
            yield from p.ports()
265
266
class TstL0CacheBuffer(Elaboratable):
    """Test harness: an L0CacheBuffer wired to a TestMemory-backed
    PortInterface (TestMemoryPortInterface).

    Note the address width handed to both sub-modules is addrwid<<1,
    matching the original configuration.
    """
    def __init__(self, n_units=3, regwid=16, addrwid=4):
        self.pimem = TestMemoryPortInterface(regwid, addrwid << 1)
        self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid << 1)

    def elaborate(self, platform):
        m = Module()
        m.submodules.pimem = self.pimem
        m.submodules.l0 = self.l0

        return m

    def ports(self):
        yield from self.l0.ports()
        # bugfix: was "yield from self.pimem", which yields the module
        # object itself rather than its signals; call ports() so the
        # memory-side signals are actually exposed
        yield from self.pimem.ports()
282
283
def wait_busy(port, no=False):
    """Spin (one sim cycle at a time) until bool(port.busy_o) == no.

    With no=False this waits for busy_o to deassert; with no=True it
    waits for busy_o to assert.
    """
    while True:
        state = yield port.busy_o
        print("busy", no, state)
        if bool(state) == no:
            return
        yield
291
292
def wait_addr(port):
    """Spin until port.addr_ok_o reads as falsy.

    NOTE(review): despite the name and the callers' "wait until addr ok"
    comments, the loop exits when addr_ok_o is *low* (``if not addr_ok``)
    - presumably the acknowledge has already pulsed by the time this is
    polled; confirm against the PortInterface protocol.
    """
    while True:
        ok = yield port.addr_ok_o
        print("addrok", ok)
        if not ok:
            return
        yield
300
301
def wait_ldok(port):
    """Spin until port.ld.ok asserts, i.e. load data has become valid."""
    while True:
        ok = yield port.ld.ok
        print("ldok", ok)
        if ok:
            return
        yield
309
310
def l0_cache_st(dut, addr, data, datalen):
    """Drive a single ST transaction through L0 port 1.

    Waits for the port to be free, sets address + length, waits for the
    address handshake, then asserts st.data/st.ok for exactly one cycle
    (required by the PortInterface API) before releasing the port.

    (removed unused locals: l0/mem/port0 were assigned but never used)
    """
    port1 = dut.l0.dports[1]

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a ST on the port. address first:
    yield port1.is_st_i.eq(1)  # indicate ST
    yield port1.data_len.eq(datalen)  # ST length (1/2/4/8)

    yield port1.addr.data.eq(addr)  # set address
    yield port1.addr.ok.eq(1)  # set ok
    yield from wait_addr(port1)  # wait until addr ok
    # assert "ST" for one cycle (required by the API)
    yield port1.st.data.eq(data)
    yield port1.st.ok.eq(1)
    yield
    yield port1.st.ok.eq(0)

    # can go straight to reset.
    yield port1.is_st_i.eq(0)  # end
    yield port1.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, False) # wait until not busy
339
340
def l0_cache_ld(dut, addr, datalen, expected):
    """Drive a single LD transaction through L0 port 0; return the data.

    `expected` is not checked here (the caller performs the comparison);
    it is kept in the signature for compatibility.

    (removed unused locals: mem and port2 (dports[2]) were assigned but
    never used)
    """
    port1 = dut.l0.dports[0]

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a LD on the port. address first:
    yield port1.is_ld_i.eq(1)  # indicate LD
    yield port1.data_len.eq(datalen)  # LD length (1/2/4/8)

    yield port1.addr.data.eq(addr)  # set address
    yield port1.addr.ok.eq(1)  # set ok
    yield from wait_addr(port1)  # wait until addr ok

    yield from wait_ldok(port1)  # wait until ld ok
    data = yield port1.ld.data

    # cleanup
    yield port1.is_ld_i.eq(0)  # end
    yield port1.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, no=False) # wait until not busy

    return data
368
369
def l0_cache_ldst(arg, dut):
    """Store two 16-bit values, load them back, and assert they match.

    :arg: the unittest.TestCase instance (used for assertEqual)
    :dut: a TstL0CacheBuffer

    (removed unused local `addr` and dead commented-out code)
    """
    yield
    data = 0xbeef
    data2 = 0xf00f
    yield from l0_cache_st(dut, 0x2, data, 2)
    yield from l0_cache_st(dut, 0x4, data2, 2)
    result = yield from l0_cache_ld(dut, 0x2, 2, data)
    result2 = yield from l0_cache_ld(dut, 0x4, 2, data2)
    yield
    arg.assertEqual(data, result, "data %x != %x" % (result, data))
    arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2))
383
384
def data_merger_merge(dut):
    """Exercise DataMerger: all-zero inputs, then a full 8-row merge."""
    print("data_merger")
    # with every input at zero the output must also be zero
    yield Settle()
    out_en = yield dut.data_o.en
    out_data = yield dut.data_o.data
    assert out_en == 0, "en must be zero"
    assert out_data == 0, "data must be zero"
    yield

    # mark row 0 as matching every row, and give each row a distinct
    # byte-enable bit and a distinct 0xFF data lane
    yield dut.addr_array_i[0].eq(0xFF)
    for row in range(dut.array_size):
        yield dut.data_i[row].en.eq(1 << row)
        yield dut.data_i[row].data.eq(0xFF << (16 * row))
    yield Settle()

    # all rows should have been OR-merged onto the output
    out_en = yield dut.data_o.en
    out_data = yield dut.data_o.data
    assert out_data == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert out_en == 0xff
    yield
406
407
class TestL0Cache(unittest.TestCase):
    """Runs the LD/ST round-trip test against TstL0CacheBuffer."""

    def test_l0_cache(self):
        dut = TstL0CacheBuffer(regwid=64)
        # to dump RTLIL for inspection:
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_basic_l0_cache.il", "w") as f:
        #    f.write(vl)

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic.vcd')
419
420
class TestDataMerger(unittest.TestCase):
    """Runs the data_merger_merge simulation against an 8-row DataMerger."""

    def test_data_merger(self):
        dut = DataMerger(8)
        # to dump RTLIL for inspection:
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_data_merger.il", "w") as f:
        #    f.write(vl)

        run_simulation(dut, data_merger_merge(dut),
                       vcd_name='test_data_merger.vcd')
432
433
class TestDualPortSplitter(unittest.TestCase):
    """Instantiation-only check: simulation stays disabled until
    DualPortSplitter's wiring is completed."""

    def test_dual_port_splitter(self):
        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_data_merger.il", "w") as f:
        #    f.write(vl)

        #run_simulation(dut, data_merger_merge(dut),
        #               vcd_name='test_dual_port_splitter.vcd')
445
446
# run the unit tests when executed directly; exit=False keeps the
# interpreter alive (useful from interactive sessions)
if __name__ == '__main__':
    unittest.main(exit=False)
449