ha! have to explicitly specify the ports when writing out to ilang or verilog
[soc.git] / src / soc / experiment / l0_cache.py
1 """L0 Cache/Buffer
2
3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
5
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
10
11 Links:
12
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
15
16 """
17
18 from nmigen.compat.sim import run_simulation, Settle
19 from nmigen.cli import verilog, rtlil
20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
21 from nmutil.iocontrol import RecordObject
22 from nmigen.utils import log2_int
23 from nmigen.hdl.rec import Record, Layout
24
25 from nmutil.latch import SRLatch, latchregister
26 from soc.decoder.power_decoder2 import Data
27 from soc.decoder.power_enums import MicrOp
28 from soc.regfile.regfile import ortreereduce
29 from nmutil.util import treereduce
30
31 from soc.decoder.power_decoder2 import Data
32 #from nmutil.picker import PriorityPicker
33 from nmigen.lib.coding import PriorityEncoder
34 from soc.scoreboard.addr_split import LDSTSplitter
35 from soc.scoreboard.addr_match import LenExpand
36
37 # for testing purposes
38 from soc.config.test.test_loadstore import TestMemPspec
39 from soc.config.loadstore import ConfigMemoryPortInterface
40 from soc.experiment.pimem import PortInterface
41 from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
42 import unittest
43
44
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    * one incoming PortInterface
    * two *OUTGOING* PortInterfaces
    * uses LDSTSplitter to do it

    (actually, thinking about it LDSTSplitter could simply be
     modified to conform to PortInterface: one in, two out)

    Once that is done, each pair of ports may be wired directly
    to the dual ports of L0CacheBuffer.

    The split is carried out so that, regardless of alignment or
    mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
    of the address, whilst outgoing PortInterface[1] takes
    bit 4 == 1.

    PortInterface *may* need to be changed so that the length is
    a binary number (accepting values 1-16).

    NOTE: work in progress.  Only the address and the LD/ST request
    flags of the incoming port are wired to the splitter so far; the
    two outgoing ports are created but not yet driven.
    """

    def __init__(self):
        # Constructing the hardware description is kept free of side
        # effects: no debug printing of the port list here.
        self.outp = [PortInterface(name="outp_0"),
                     PortInterface(name="outp_1")]
        self.inp = PortInterface(name="inp")

    def elaborate(self, platform):
        """Instantiate the LDSTSplitter and wire up the known inputs.

        Returns the nmigen Module containing the splitter submodule.
        """
        m = Module()
        comb = m.d.comb
        m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
        comb += splitter.addr_i.eq(self.inp.addr)  # XXX
        # TODO: the splitter inputs below are not connected yet
        #comb += splitter.len_i.eq()
        #comb += splitter.valid_i.eq()
        comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
        comb += splitter.is_st_i.eq(self.inp.is_st_i)
        #comb += splitter.st_data_i.eq()
        #comb += splitter.sld_valid_i.eq()
        #comb += splitter.sld_data_i.eq()
        #comb += splitter.sst_valid_i.eq()
        return m
87
88
class DataMergerRecord(Record):
    """Record pairing a 128-bit data field with a 16-bit enable field.

    Layout: {data: 128 bit, en: 16 bit}

    Both fields are marked reset_less.
    """

    def __init__(self, name=None):
        fields = (('data', 128),
                  ('en', 16))
        Record.__init__(self, Layout(fields), name=name)

        # neither field takes part in reset
        for field in (self.data, self.en):
            field.reset_less = True
101
102
103 # TODO: formal verification
class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix.
    One row of the matrix is picked (by a priority encoder), and the
    match bits of that row select which data rows are ORed together
    into the output.

    By the time DataMerger is used, all of its incoming data is
    determined not to conflict.  The last step before actually
    submitting the request to the Memory Subsystem is to work out which
    requests, on the same 128-bit cache line, can be "merged" due to
    them being: (A) on the same address (bits 4 and above) and
    (B) having byte-enable lines that do not conflict.

    Put simply, this module will:
    (1) pick a row and identify it by an index
    (2) merge all data/enable rows flagged in addr_array_i[index]
        onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match.  Bits across the diagonal
                       (addr_array_i[x][x]) will always be set, to
                       indicate "active".
        :data_i: an N-entry Array of DataMergerRecord
                 {data: 128 bit, en: 16 bit}
        :data_o: an output DataMergerRecord of the same type
        """
        self.array_size = array_size
        self.addr_array_i = Array([
            Signal(array_size,
                   reset_less=True,
                   name="addr_match_%d" % row)
            for row in range(array_size)
        ])
        self.data_i = Array([DataMergerRecord()
                             for _ in range(array_size)])
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # (1) pick a row: a row is a candidate when any match bit is set
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        for row in range(self.array_size):
            comb += pick.i[row].eq(self.addr_array_i[row].bool())
        have_row = ~pick.n
        chosen = pick.o

        # (2) merge: gate each input row by its match bit in the chosen
        # row, then OR-reduce the gated rows field by field
        with m.If(have_row):
            gated = []
            for col in range(self.array_size):
                rec = DataMergerRecord()
                with m.If(self.addr_array_i[chosen][col]):
                    comb += rec.eq(self.data_i[col])
                gated.append(rec)
            comb += self.data_o.data.eq(ortreereduce(gated, "data"))
            comb += self.data_o.en.eq(ortreereduce(gated, "en"))

        return m
170
171
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per
    LDSTCompUnit, to cover mis-aligned requests, as well as *two*
    128-bit L1 Cache interfaces: one for odd (addr[4] == 1) and one
    for even (addr[4] == 0).

    This version is to be used for test purposes (and actively
    maintained for such, rather than "replaced").

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised
    (LE/BE) by this class.  That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        """
        :n_units: number of LD/ST PortInterfaces to create
        :pimem:   memory-side object; elaborate() uses its
                  connect_port() method and its pi.busy_o signal
        :regwid:  register width handed to each PortInterface
        :addrwid: address width handed to each PortInterface
        """
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        self.dports = Array([
            PortInterface("ldst_port%d" % unit, regwid, addrwid)
            for unit in range(n_units)
        ])

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # state-machine latches
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class
        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        # one request bit per port (LD or ST asserted), fed to the picker
        requests = [self.dports[unit].is_ld_i | self.dports[unit].is_st_i
                    for unit in range(self.n_units)]
        comb += pick.i.eq(Cat(*requests))

        # the index of the picked port has to be recorded
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # latchregister gives sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index whenever there is a request
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays
        # picked until idx_l is deasserted.  defaults first: the
        # conditional assignments further down override these.
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.busy_o):
                comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)  # clear reset

        return m

    def __iter__(self):
        # flatten the signals of every LD/ST port, in port order
        for dport in self.dports:
            yield from dport.ports()

    def ports(self):
        """Return the signals of all LD/ST ports as a list."""
        return list(self)
268
269
class TstL0CacheBuffer(Elaboratable):
    """Test harness: an L0CacheBuffer wired to a configured memory port.

    The memory side comes from ConfigMemoryPortInterface(pspec); its
    ``pi`` attribute is handed to the L0CacheBuffer as ``pimem``.
    """

    def __init__(self, pspec, n_units=3):
        self.cmpi = ConfigMemoryPortInterface(pspec)
        self.pimem = self.cmpi.pi
        self.l0 = L0CacheBuffer(n_units, self.pimem,
                                pspec.reg_wid, pspec.addr_wid << 1)

    def elaborate(self, platform):
        m = Module()
        m.submodules.pimem = self.pimem
        m.submodules.l0 = self.l0
        # only some interface types provide an lsmem (hmmm not happy
        # about this)
        if hasattr(self.cmpi, 'lsmem'):
            m.submodules.lsmem = self.cmpi.lsmem.lsi

        return m

    def ports(self):
        """Yield the ports of cmpi, then l0, then pimem."""
        for part in (self.cmpi, self.l0, self.pimem):
            yield from part.ports()
291
292
def wait_busy(port, no=False):
    """Simulation helper: poll port.busy_o until bool(busy) equals *no*.

    Each round reads port.busy_o, prints the sample, and either
    finishes (on a match) or yields once before polling again.
    """
    while True:
        sample = yield port.busy_o
        print("busy", no, sample)
        if bool(sample) == no:
            return
        yield
300
301
def wait_addr(port):
    """Simulation helper: poll port.addr_ok_o until it reads as false.

    Each round reads port.addr_ok_o, prints the sample, and either
    finishes (sample is falsy) or yields once before polling again.
    """
    while True:
        sample = yield port.addr_ok_o
        print("addrok", sample)
        if not sample:
            return
        yield
309
310
def wait_ldok(port):
    """Simulation helper: poll port.ld.ok until it reads as true.

    Each round reads port.ld.ok, prints the sample, and either
    finishes (sample is truthy) or yields once before polling again.
    """
    while True:
        sample = yield port.ld.ok
        print("ldok", sample)
        if sample:
            return
        yield
318
319
def l0_cache_st(dut, addr, data, datalen):
    """Issue a store of *data* (*datalen* bytes) at *addr* via port 0.

    Returns whatever pi_st returns for the first LD/ST port of dut.l0.
    """
    # Use the first PortInterface, as l0_cache_ldst does, rather than
    # the L0CacheBuffer object itself, and forward the store data
    # (previously dropped).
    # NOTE(review): assumes pi_st takes (port, addr, data, datalen) --
    # confirm against soc.config.test.test_pi2ls.
    port0 = dut.l0.dports[0]
    return pi_st(port0, addr, data, datalen)
322
323
def l0_cache_ld(dut, addr, datalen, expected):
    """Issue a load of *datalen* bytes at *addr* via port 0.

    Returns whatever pi_ld returns for the first LD/ST port of dut.l0.
    *expected* is accepted for interface compatibility but is not
    checked here.
    """
    # Use the first PortInterface, as l0_cache_ldst does, rather than
    # the L0CacheBuffer object itself.
    # NOTE(review): assumes pi_ld takes (port, addr, datalen) --
    # confirm against soc.config.test.test_pi2ls.
    port0 = dut.l0.dports[0]
    return pi_ld(port0, addr, datalen)
326
327
def l0_cache_ldst(arg, dut):
    """Run pi_ldst against the first LD/ST port of dut.l0.

    *arg* is passed through to pi_ldst unchanged.
    """
    return pi_ldst(arg, dut.l0.dports[0])
331
332
def data_merger_merge(dut):
    """Simulation process checking DataMerger's merged output.

    First checks that the output is zero with all inputs at zero, then
    sets addr_array_i[0] to 0xFF, gives each input row its own enable
    bit and data byte, and checks the ORed result.
    """
    print("data_merger")

    # starting with all inputs zero
    yield Settle()
    en_out = yield dut.data_o.en
    data_out = yield dut.data_o.data
    assert en_out == 0, "en must be zero"
    assert data_out == 0, "data must be zero"
    yield

    # row 0 matches every row; row k drives enable bit k and 0xFF at
    # bit offset 16*k
    yield dut.addr_array_i[0].eq(0xFF)
    for row in range(dut.array_size):
        yield dut.data_i[row].en.eq(1 << row)
        yield dut.data_i[row].data.eq(0xFF << (16*row))
    yield Settle()

    en_out = yield dut.data_o.en
    data_out = yield dut.data_o.data
    assert data_out == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert en_out == 0xff
    yield
354
355
class TestL0Cache(unittest.TestCase):
    """Convert and simulate TstL0CacheBuffer for two interface types."""

    def _convert_and_simulate(self, ifacetype, il_name, vcd_name):
        # build the DUT, write its RTLIL to il_name, then simulate the
        # LD/ST sequence with a VCD trace written to vcd_name
        pspec = TestMemPspec(ldst_ifacetype=ifacetype,
                             addr_wid=48,
                             mask_wid=8,
                             reg_wid=64)
        dut = TstL0CacheBuffer(pspec)
        vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
        with open(il_name, "w") as f:
            f.write(vl)

        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name=vcd_name)

    def test_l0_cache_test_bare_wb(self):
        self._convert_and_simulate('test_bare_wb',
                                   "test_basic_l0_cache_bare_wb.il",
                                   'test_l0_cache_basic_bare_wb.vcd')

    def test_l0_cache_testpi(self):
        self._convert_and_simulate('testpi',
                                   "test_basic_l0_cache.il",
                                   'test_l0_cache_basic_testpi.vcd')
385
386
class TestDataMerger(unittest.TestCase):
    """Simulate an 8-row DataMerger with the data_merger_merge process."""

    def test_data_merger(self):
        merger = DataMerger(8)
        # TODO: RTLIL output is disabled (DataMerger has no ports())
        #vl = rtlil.convert(merger, ports=merger.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        run_simulation(merger, data_merger_merge(merger),
                       vcd_name='test_data_merger.vcd')
398
399
class TestDualPortSplitter(unittest.TestCase):
    """Smoke test: a DualPortSplitter can be constructed."""

    def test_dual_port_splitter(self):
        # construction only; conversion and simulation are still TODO
        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        # with open("test_data_merger.il", "w") as f:
        #     f.write(vl)

        # run_simulation(dut, data_merger_merge(dut),
        #                vcd_name='test_dual_port_splitter.vcd')
411
412
# run every TestCase in this module when executed as a script
if __name__ == "__main__":
    unittest.main()