1 """L0 Cache/Buffer
2
3 This first version is intended for prototyping and test purposes:
4 it has "direct" access to Memory.
5
6 The intention is that this version remains an integral part of the
7 test infrastructure, and, just as with minerva's memory arrangement,
8 a dynamic runtime config *selects* alternative memory arrangements
9 rather than *replaces and discards* this code.
10
11 Links:
12
13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
15
16 """

from nmigen.compat.sim import run_simulation, Settle
from nmigen.cli import verilog, rtlil
from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.hdl.rec import Record, Layout

from nmutil.latch import SRLatch, latchregister
from soc.decoder.power_decoder2 import Data
from soc.decoder.power_enums import InternalOp
from soc.regfile.regfile import ortreereduce
from nmutil.util import treereduce

from soc.experiment.compldst import CompLDSTOpSubset
# from nmutil.picker import PriorityPicker
from nmigen.lib.coding import PriorityEncoder

# for testing purposes
from soc.experiment.testmem import TestMemory

class PortInterface(RecordObject):
    """PortInterface

    defines the interface - the API - that the LDSTCompUnit connects
    to. note that this is NOT a "fire-and-forget" interface. the
    LDSTCompUnit *must* be kept apprised that the request is in
    progress, and only on 100% successful completion can the
    notification be given (busy dropped).

    The interface FSM rules are as follows:

    * if busy_o is asserted, a LD/ST is in progress. further
      requests may not be made until busy_o is deasserted.

    * only one of is_ld_i or is_st_i may be asserted. busy_o
      will immediately be asserted and remain asserted.

    * addr.ok is to be asserted when the LD/ST address is known.
      addr.data is to be valid on the same cycle.

      addr.ok and addr.data must REMAIN asserted until busy_o
      is de-asserted. this ensures that there is no need
      for the L0 Cache/Buffer to have an additional address latch
      (because the LDSTCompUnit already has it)

    * addr_ok_o (or addr_exc_o) must be waited for. these will
      be asserted for one cycle and one cycle only.

    * addr_exc_o will be asserted if there is no chance that the
      memory request may be fulfilled.

      busy_o is deasserted on the same cycle as addr_exc_o is asserted.

    * conversely: addr_ok_o must *ONLY* be asserted if there is a
      HUNDRED PERCENT guarantee that the memory request will be
      fulfilled.

    * for a LD, ld.ok will be asserted - for only one clock cycle -
      at any point in the future that is acceptable to the underlying
      Memory subsystem. the recipient MUST latch ld.data on that cycle.

      busy_o is deasserted on the same cycle as ld.ok is asserted.

    * for a ST, st.ok may be asserted only after addr_ok_o had been
      asserted, alongside valid st.data at the same time. st.ok
      must only be asserted for one cycle.

      the underlying Memory is REQUIRED to pick up that data and
      guarantee its delivery. no back-acknowledgement is required.

      busy_o is deasserted on the cycle AFTER st.ok is asserted.
    """

    def __init__(self, name=None, regwid=64, addrwid=48):

        self._regwid = regwid
        self._addrwid = addrwid

        RecordObject.__init__(self, name=name)

        # distinguish op type (ld/st)
        self.is_ld_i = Signal(reset_less=True)
        self.is_st_i = Signal(reset_less=True)
        self.op = CompLDSTOpSubset()  # hm insn_type ld/st duplicates here

        # common signals
        self.busy_o = Signal(reset_less=True)      # do not use if busy
        self.go_die_i = Signal(reset_less=True)    # back to reset
        self.addr = Data(addrwid, "addr_i")        # addr/addr-ok
        # addr is valid (TLB, L1 etc.)
        self.addr_ok_o = Signal(reset_less=True)
        self.addr_exc_o = Signal(reset_less=True)  # TODO, "type" of exception

        # LD/ST
        self.ld = Data(regwid, "ld_data_o")  # ok to be set by L0 Cache/Buf
        self.st = Data(regwid, "st_data_i")  # ok to be set by CompUnit

    # TODO: elaborate function


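# NOTE: illustrative sketch only, not part of the upstream API:
# example_ld_handshake is a hypothetical simulation process showing the LD
# handshake sequence described in the PortInterface docstring above.  It
# assumes "pi" is a PortInterface driven against a responder such as
# L0CacheBuffer; the full test version used in this file is l0_cache_ld(),
# further down.
def example_ld_handshake(pi, addr):
    yield pi.is_ld_i.eq(1)           # request a LD: busy_o will assert
    yield pi.addr.data.eq(addr)      # present the address...
    yield pi.addr.ok.eq(1)           # ...and mark it valid
    while not (yield pi.addr_ok_o):  # addr_ok_o is a single-cycle strobe
        yield
    while not (yield pi.ld.ok):      # ld.ok is also a single-cycle strobe
        yield
    data = yield pi.ld.data          # ld.data must be captured on that cycle
    yield pi.is_ld_i.eq(0)           # busy_o drops on the same cycle as ld.ok
    yield pi.addr.ok.eq(0)
    return data

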
class DualPortSplitter(Elaboratable):
    """DualPortSplitter

    * one incoming PortInterface
    * two *OUTGOING* PortInterfaces
    * uses LDSTSplitter to do it

    (actually, thinking about it LDSTSplitter could simply be
    modified to conform to PortInterface: one in, two out)

    once that is done each pair of ports may be wired directly
    to the dual ports of L0CacheBuffer
    """
    def __init__(self):
        self.outp = [PortInterface(name="outp_0"),
                     PortInterface(name="outp_1")]
        self.inp = PortInterface(name="inp")
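
    # NOTE: hedged placeholder sketch only, not the upstream implementation:
    # a minimal elaborate() so the class is at least elaboratable.  The real
    # work - instantiating LDSTSplitter and wiring self.inp to self.outp[0]
    # and self.outp[1] as described in the docstring above - is still TODO.
    def elaborate(self, platform):
        m = Module()
        # TODO: add LDSTSplitter submodule and connect the three interfaces
        return m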


class DataMergerRecord(Record):
    """
    {data: 128 bit, byte_enable: 16 bit}
    """

    def __init__(self, name=None):
        layout = (('data', 128),
                  ('en', 16)
                  )

        Record.__init__(self, Layout(layout), name=name)

        # FIXME: make resetless

# TODO: formal verification

class DataMerger(Elaboratable):
    """DataMerger

    Merges data based on an address-match matrix.
    Identifies (picks) one (any) row, then uses that row,
    based on matching address bits, to merge (OR) all data
    rows into the output.

    Basically, by the time DataMerger is used, all of its incoming data is
    determined not to conflict. The last step before actually submitting
    the request to the Memory Subsystem is to work out which requests,
    on the same 128-bit cache line, can be "merged" due to them being:
    (A) on the same address (bits 4 and above) (B) having byte-enable
    lines that (as previously mentioned) do not conflict.

    Therefore, put simply, this module will:
    (1) pick a row (any row) and identify it by an index labelled "idx"
    (2) merge all byte-enable lines which are on that same address, as
        indicated by addr_match_i[idx], onto the output
    """

    def __init__(self, array_size):
        """
        :addr_array_i: an NxN Array of Signals with bits set indicating
                       address match. bits across the diagonal
                       (addr_array_i[x][x]) will always be set, to
                       indicate "active".
        :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
        :data_o: an Output Record of same type
                 {data: 128 bit, byte_enable: 16 bit}
        """
        self.array_size = array_size
        ul = []
        for i in range(array_size):
            ul.append(Signal(array_size,
                             reset_less=True,
                             name="addr_match_%d" % i))
        self.addr_array_i = Array(ul)

        ul = []
        for i in range(array_size):
            ul.append(DataMergerRecord())
        self.data_i = Array(ul)
        self.data_o = DataMergerRecord()

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        # (1) pick a row
        m.submodules.pick = pick = PriorityEncoder(self.array_size)
        for j in range(self.array_size):
            comb += pick.i[j].eq(self.addr_array_i[j].bool())
        valid = ~pick.n
        idx = pick.o
        # (2) merge
        with m.If(valid):
            l = []
            for j in range(self.array_size):
                select = self.addr_array_i[idx][j]
                r = DataMergerRecord()
                with m.If(select):
                    comb += r.eq(self.data_i[j])
                l.append(r)
            comb += self.data_o.data.eq(ortreereduce(l, "data"))
            comb += self.data_o.en.eq(ortreereduce(l, "en"))

        return m


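# NOTE: illustrative sketch only (data_merge_model is a hypothetical helper,
# not part of the upstream code): a plain-Python model of the OR-merge that
# DataMerger.elaborate performs.  "addr_match" is the row of addr_array_i
# picked by the PriorityEncoder and "rows" is a list of (data, en) tuples
# corresponding to data_i.  With addr_match=0xFF and
# rows[j] = (0xFF << (16*j), 1 << j) for j in 0..7, it returns the values
# asserted in data_merger_merge() below.
def data_merge_model(addr_match, rows):
    data, en = 0, 0
    for j, (d, e) in enumerate(rows):
        if (addr_match >> j) & 1:   # row j is on the same cache line
            data |= d               # OR-merge the data...
            en |= e                 # ...and the byte-enable lines
    return data, en

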
class LDSTPort(Elaboratable):
    def __init__(self, idx, regwid=64, addrwid=48):
        self.pi = PortInterface("ldst_port%d" % idx, regwid, addrwid)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # latches
        m.submodules.busy_l = busy_l = SRLatch(False, name="busy")
        m.submodules.cyc_l = cyc_l = SRLatch(True, name="cyc")
        comb += cyc_l.s.eq(0)
        comb += cyc_l.r.eq(0)

        # this is a little weird: we let the L0Cache/Buffer set
        # the outputs: this module just monitors "state".

        # LD/ST requested activates "busy"
        with m.If(self.pi.is_ld_i | self.pi.is_st_i):
            comb += busy_l.s.eq(1)

        # monitor for an exception or the completion of LD.
        with m.If(self.pi.addr_exc_o):
            comb += busy_l.r.eq(1)

        # however ST needs one cycle before busy is reset
        with m.If(self.pi.st.ok | self.pi.ld.ok):
            comb += cyc_l.s.eq(1)

        with m.If(cyc_l.q):
            comb += cyc_l.r.eq(1)
            comb += busy_l.r.eq(1)

        # busy latch outputs to interface
        comb += self.pi.busy_o.eq(busy_l.q)

        return m

    def __iter__(self):
        yield self.pi.is_ld_i
        yield self.pi.is_st_i
        yield from self.pi.op.ports()
        yield self.pi.busy_o
        yield self.pi.go_die_i
        yield from self.pi.addr.ports()
        yield self.pi.addr_ok_o
        yield self.pi.addr_exc_o

        yield from self.pi.ld.ports()
        yield from self.pi.st.ports()

    def ports(self):
        return list(self)


class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this. However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, whereas an nmigen FSM's state-changes only activate
    on clock-sync boundaries.
    """

    def __init__(self, n_units, mem, regwid=64, addrwid=48):
        self.n_units = n_units
        self.mem = mem
        ul = []
        for i in range(n_units):
            ul.append(LDSTPort(i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        for i in range(self.n_units):
            setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        m.submodules.st_active = st_active = SRLatch(False, name="st_active")
        m.submodules.ld_active = ld_active = SRLatch(False, name="ld_active")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.adrok_l = adrok_l = SRLatch(False, name="addr_acked")

        # find one LD (or ST) and do it. only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.ldpick = ldpick = PriorityEncoder(self.n_units)
        m.submodules.stpick = stpick = PriorityEncoder(self.n_units)

        lds = Signal(self.n_units, reset_less=True)
        sts = Signal(self.n_units, reset_less=True)
        ldi = []
        sti = []
        for i in range(self.n_units):
            pi = self.dports[i].pi
            ldi.append(pi.is_ld_i & pi.busy_o)  # accumulate ld-req signals
            sti.append(pi.is_st_i & pi.busy_o)  # accumulate st-req signals
        # put the requests into the priority-pickers
        comb += ldpick.i.eq(Cat(*ldi))
        comb += stpick.i.eq(Cat(*sti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        ld_idx = Signal(nbits, reset_less=False)
        st_idx = Signal(nbits, reset_less=False)
        # use these because of the sync-and-comb pass-through capability
        latchregister(m, ldpick.o, ld_idx, idx_l.qn, name="ld_idx_l")
        latchregister(m, stpick.o, st_idx, idx_l.qn, name="st_idx_l")

        # convenience variables to reference the "picked" port
        ldport = self.dports[ld_idx].pi
        stport = self.dports[st_idx].pi
        # and the memory ports
        rdport = self.mem.rdport
        wrport = self.mem.wrport

        # Priority-Pickers pick one and only one request, capture its index.
        # from that point on this code *only* "listens" to that port.

        sync += adrok_l.s.eq(0)
        comb += adrok_l.r.eq(0)
        with m.If(~ldpick.n):
            comb += ld_active.s.eq(1)  # activate LD mode
            comb += idx_l.r.eq(1)      # pick (and capture) the port index
        with m.Elif(~stpick.n):
            comb += st_active.s.eq(1)  # activate ST mode
            comb += idx_l.r.eq(1)      # pick (and capture) the port index

        # from this point onwards, with the port "picked", it stays picked
        # until ld_active (or st_active) are de-asserted.

        # if now in "LD" mode: wait for addr_ok, then send the address out
        # to memory, acknowledge address, and send out LD data
        with m.If(ld_active.q):
            with m.If(ldport.addr.ok & adrok_l.qn):
                comb += rdport.addr.eq(ldport.addr.data)  # addr ok, send thru
                comb += ldport.addr_ok_o.eq(1)  # acknowledge addr ok
                sync += adrok_l.s.eq(1)         # and pull "ack" latch

        # if now in "ST" mode: likewise wait for addr_ok, send the address
        # out to memory, and acknowledge the address
        with m.If(st_active.q):
            with m.If(stport.addr.ok):
                comb += wrport.addr.eq(stport.addr.data)  # addr ok, send thru
                with m.If(adrok_l.qn):
                    comb += stport.addr_ok_o.eq(1)  # acknowledge addr ok
                    sync += adrok_l.s.eq(1)         # and pull "ack" latch

        # NOTE: in both these, below, the port itself takes care
        # of de-asserting its "busy_o" signal, based on either ld.ok going
        # high (by us, here) or by st.ok going high (by the LDSTCompUnit).

        # for LD mode, when addr has been "ok'd", assume that (because this
        # is a "Memory" test-class) the memory read data is valid.
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)
        with m.If(ld_active.q & adrok_l.q):
            comb += ldport.ld.data.eq(rdport.data)  # put data out
            comb += ldport.ld.ok.eq(1)              # indicate data valid
            comb += reset_l.s.eq(1)                 # reset mode after 1 cycle

        # for ST mode, when addr has been "ok'd", wait for incoming "ST ok"
        with m.If(st_active.q & stport.st.ok):
            comb += wrport.data.eq(stport.st.data)  # write st to mem
            comb += wrport.en.eq(1)                 # enable write
            comb += reset_l.s.eq(1)                 # reset mode after 1 cycle

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.s.eq(1)       # deactivate port-index selector
            comb += ld_active.r.eq(1)   # leave the LD active for 1 cycle
            comb += st_active.r.eq(1)   # leave the ST active for 1 cycle
            comb += reset_l.r.eq(1)     # clear reset
            comb += adrok_l.r.eq(1)     # address reset

        return m

    def ports(self):
        for p in self.dports:
            yield from p.ports()


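# NOTE: illustrative sketch only (demo_priority_pick and its vcd filename are
# hypothetical, not part of the upstream test suite): demonstrates the
# behaviour relied on by the Priority-Picker step above - the PriorityEncoder
# reports the index of the lowest-numbered port that is requesting.
def demo_priority_pick():
    dut = PriorityEncoder(4)       # four "ports"

    def process():
        yield dut.i.eq(0b0110)     # ports 1 and 2 both requesting
        yield Settle()
        assert (yield dut.n) == 0  # at least one request is present
        assert (yield dut.o) == 1  # lowest-numbered port (1) is picked

    run_simulation(dut, process(), vcd_name='demo_priority_pick.vcd')

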
class TstL0CacheBuffer(Elaboratable):
    def __init__(self, n_units=3, regwid=16, addrwid=4):
        self.mem = TestMemory(regwid, addrwid)
        self.l0 = L0CacheBuffer(n_units, self.mem, regwid, addrwid)

    def elaborate(self, platform):
        m = Module()
        m.submodules.mem = self.mem
        m.submodules.l0 = self.l0

        return m

    def ports(self):
        yield from self.l0.ports()
        yield self.mem.rdport.addr
        yield self.mem.rdport.data
        yield self.mem.wrport.addr
        yield self.mem.wrport.data
        # TODO: mem ports


def wait_busy(port, no=False):
    while True:
        busy = yield port.pi.busy_o
        print("busy", no, busy)
        if bool(busy) == no:
            break
        yield


def wait_addr(port):
    while True:
        addr_ok = yield port.pi.addr_ok_o
        print("addrok", addr_ok)
        if not addr_ok:
            break
        yield


def wait_ldok(port):
    while True:
        ldok = yield port.pi.ld.ok
        print("ldok", ldok)
        if ldok:
            break
        yield


def l0_cache_st(dut, addr, data):
    l0 = dut.l0
    mem = dut.mem
    port0 = l0.dports[0]
    port1 = l0.dports[1]

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a ST on the port. address first:
    yield port1.pi.is_st_i.eq(1)  # indicate ST

    yield port1.pi.addr.data.eq(addr)  # set address
    yield port1.pi.addr.ok.eq(1)       # set ok
    yield from wait_addr(port1)        # wait until addr ok
    # yield # not needed, just for checking
    # yield # not needed, just for checking
    # assert "ST" for one cycle (required by the API)
    yield port1.pi.st.data.eq(data)
    yield port1.pi.st.ok.eq(1)
    yield
    yield port1.pi.st.ok.eq(0)

    # can go straight to reset.
    yield port1.pi.is_st_i.eq(0)  # end
    yield port1.pi.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, False)  # wait until not busy


def l0_cache_ld(dut, addr, expected):

    l0 = dut.l0
    mem = dut.mem
    port0 = l0.dports[0]
    port1 = l0.dports[1]

    # have to wait until not busy
    yield from wait_busy(port1, no=False)  # wait until not busy

    # set up a LD on the port. address first:
    yield port1.pi.is_ld_i.eq(1)  # indicate LD

    yield port1.pi.addr.data.eq(addr)  # set address
    yield port1.pi.addr.ok.eq(1)       # set ok
    yield from wait_addr(port1)        # wait until addr ok

    yield from wait_ldok(port1)        # wait until ld ok
    data = yield port1.pi.ld.data

    # cleanup
    yield port1.pi.is_ld_i.eq(0)  # end
    yield port1.pi.addr.ok.eq(0)  # set !ok
    # yield from wait_busy(port1, no=False)  # wait until not busy

    return data


def l0_cache_ldst(dut):
    yield
    addr = 0x2
    data = 0xbeef
    data2 = 0xf00f
    # data = 0x4
    yield from l0_cache_st(dut, 0x2, data)
    yield from l0_cache_st(dut, 0x3, data2)
    result = yield from l0_cache_ld(dut, 0x2, data)
    result2 = yield from l0_cache_ld(dut, 0x3, data2)
    yield
    assert data == result, "data %x != %x" % (result, data)
    assert data2 == result2, "data2 %x != %x" % (result2, data2)


def data_merger_merge(dut):
    print("data_merger")
    # starting with all inputs zero
    yield Settle()
    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert en == 0, "en must be zero"
    assert data == 0, "data must be zero"
    yield

    yield dut.addr_array_i[0].eq(0xFF)
    for j in range(dut.array_size):
        yield dut.data_i[j].en.eq(1 << j)
        yield dut.data_i[j].data.eq(0xFF << (16*j))
    yield Settle()

    # row 0 matches all 8 ports, so every data/en row is OR-merged together
    en = yield dut.data_o.en
    data = yield dut.data_o.data
    assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
    assert en == 0xff
    yield


def test_l0_cache():

    dut = TstL0CacheBuffer()
    # vl = rtlil.convert(dut, ports=dut.ports())
    # with open("test_basic_l0_cache.il", "w") as f:
    #     f.write(vl)

    run_simulation(dut, l0_cache_ldst(dut),
                   vcd_name='test_l0_cache_basic.vcd')


def test_data_merger():

    dut = DataMerger(8)
    # vl = rtlil.convert(dut, ports=dut.ports())
    # with open("test_data_merger.il", "w") as f:
    #     f.write(vl)

    run_simulation(dut, data_merger_merge(dut),
                   vcd_name='test_data_merger.vcd')


if __name__ == '__main__':
    test_l0_cache()
    test_data_merger()