# icache.py: rearrange the code within the base class ICache
# [soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Const
from nmigen.cli import main
from nmigen.cli import rtlil
from nmigen.utils import log2_int

from nmutil.iocontrol import RecordObject
from nmutil.byterev import byte_reverse
from nmutil.mask import Mask

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)
from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)
from soc.experiment.plru import PLRU
from soc.experiment.cache_ram import CacheRam
42
43 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache-reload state machine.

    The original listed the member names with no values, which is a
    NameError at class-creation time; Enum members need explicit values.
    """
    IDLE     = 0
    CLR_TAG  = 1
    WAIT_ACK = 2
49
50 # type reg_internal_t is record
51 # -- Cache hit state (Latches for 1 cycle BRAM access)
52 # hit_way : way_t;
53 # hit_nia : std_ulogic_vector(63 downto 0);
54 # hit_smark : std_ulogic;
55 # hit_valid : std_ulogic;
56 #
57 # -- Cache miss state (reload state machine)
58 # state : state_t;
59 # wb : wishbone_master_out;
60 # store_way : way_t;
61 # store_index : index_t;
62 # store_row : row_t;
63 # store_tag : cache_tag_t;
64 # store_valid : std_ulogic;
65 # end_row_ix : row_in_line_t;
66 # rows_valid : row_per_line_valid_t;
67 #
68 # -- TLB miss state
69 # fetch_failed : std_ulogic;
70 # end record;
class RegInternal(RecordObject):
    """Internal latched state of the icache (VHDL reg_internal_t).

    Groups the one-cycle BRAM-access hit latches, the reload state
    machine registers, and the TLB-miss flag into one record.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (latches for 1 cycle BRAM access)
        self.hit_way   = Signal(NUM_WAYS)
        self.hit_nia   = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        # Bug fix: `State()` cannot be instantiated (an Enum call needs a
        # member value).  The FSM state lives in hardware, so it must be
        # a Signal with the enum as its shape.
        self.state       = Signal(State, reset=State.IDLE)
        self.wb          = WBMasterOut()
        # NOTE(review): widths below follow the original; way/index/row
        # counts would normally want log2 widths — confirm against the
        # constants' definitions.
        self.store_way   = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row   = Signal(BRAM_ROWS)
        self.store_tag   = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix  = Signal(ROW_LINE_BITS)
        self.rows_valid  = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
93
94 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
95 #
96 # entity icache is
97 # generic (
98 # SIM : boolean := false;
99 # -- Line size in bytes
100 # LINE_SIZE : positive := 64;
101 # -- BRAM organisation: We never access more than wishbone_data_bits
102 # -- at a time so to save resources we make the array only that wide,
103 # -- and use consecutive indices for to make a cache "line"
104 # --
105 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
106 # -- so 64-bits)
107 # ROW_SIZE : positive := wishbone_data_bits / 8;
108 # -- Number of lines in a set
109 # NUM_LINES : positive := 32;
110 # -- Number of ways
111 # NUM_WAYS : positive := 4;
112 # -- L1 ITLB number of entries (direct mapped)
113 # TLB_SIZE : positive := 64;
114 # -- L1 ITLB log_2(page_size)
115 # TLB_LG_PGSZ : positive := 12;
116 # -- Number of real address bits that we store
117 # REAL_ADDR_BITS : positive := 56;
118 # -- Non-zero to enable log data collection
119 # LOG_LENGTH : natural := 0
120 # );
121 # port (
122 # clk : in std_ulogic;
123 # rst : in std_ulogic;
124 #
125 # i_in : in Fetch1ToIcacheType;
126 # i_out : out IcacheToDecode1Type;
127 #
128 # m_in : in MmuToIcacheType;
129 #
130 # stall_in : in std_ulogic;
131 # stall_out : out std_ulogic;
132 # flush_in : in std_ulogic;
133 # inval_in : in std_ulogic;
134 #
135 # wishbone_out : out wishbone_master_out;
136 # wishbone_in : in wishbone_slave_out;
137 #
138 # log_out : out std_ulogic_vector(53 downto 0)
139 # );
140 # end entity icache;
141 # 64 bit direct mapped icache. All instructions are 4B aligned.
142 class ICache(Elaboratable):
143 """64 bit direct mapped icache. All instructions are 4B aligned."""
def __init__(self):
    """Set up icache geometry parameters and port signals.

    64 bit direct mapped icache.  All instructions are 4B aligned.
    Ports mirror the VHDL entity declaration above.
    """
    self.SIM = 0
    # Line size in bytes
    self.LINE_SIZE = 64
    # BRAM organisation: We never access more than wishbone_data_bits
    # at a time so to save resources we make the array only that wide,
    # and use consecutive indices to make a cache "line"
    #
    # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits).
    # Bug fix: use integer division — '/' yields a float, which poisons
    # every width calculation derived from ROW_SIZE.
    self.ROW_SIZE = WB_DATA_BITS // 8
    # Number of lines in a set
    self.NUM_LINES = 32
    # Number of ways
    self.NUM_WAYS = 4
    # L1 ITLB number of entries (direct mapped)
    self.TLB_SIZE = 64
    # L1 ITLB log_2(page_size)
    self.TLB_LG_PGSZ = 12
    # Number of real address bits that we store
    self.REAL_ADDR_BITS = 56
    # Non-zero to enable log data collection
    self.LOG_LENGTH = 0

    # Instruction fetch request/response ports
    self.i_in = Fetch1ToICacheType()
    self.i_out = ICacheToDecode1Type()

    # MMU interface (TLB loads / invalidations)
    self.m_in = MMUToICacheType()

    self.stall_in = Signal()
    self.stall_out = Signal()
    self.flush_in = Signal()
    self.inval_in = Signal()

    # Wishbone master interface used for cache reloads
    self.wb_out = WBMasterOut()
    self.wb_in = WBSlaveOut()

    self.log_out = Signal(54)
180
181 # -- Return the cache line index (tag index) for an address
182 # function get_index(addr: std_ulogic_vector(63 downto 0))
183 # return index_t is
184 # begin
185 # return to_integer(unsigned(
186 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
187 # ));
188 # end;
189 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) for an address."""
    line_index = addr[LINE_OFF_BITS:SET_SIZE_BITS]
    return line_index
192
193 # -- Return the cache row index (data memory) for an address
194 # function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
195 # begin
196 # return to_integer(unsigned(
197 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
198 # ));
199 # end;
200 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory) for an address."""
    row_index = addr[ROW_OFF_BITS:SET_SIZE_BITS]
    return row_index
203
204 # -- Return the index of a row within a line
205 # function get_row_of_line(row: row_t) return row_in_line_t is
206 # variable row_v : unsigned(ROW_BITS-1 downto 0);
207 # begin
208 # row_v := to_unsigned(row, ROW_BITS);
209 # return row_v(ROW_LINEBITS-1 downto 0);
210 # end;
211 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its cache line.

    Bug fix: the original computed the slice but never returned it, so
    every caller received None.
    """
    return row[:ROW_LINE_BITS]
214
215 # -- Returns whether this is the last row of a line
216 # function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t)
217 # return boolean is
218 # begin
219 # return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
220 # end;
221 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Return whether *addr* addresses the last row of a cache line."""
    row_within_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_within_line == last
224
225 # -- Returns whether this is the last row of a line
226 # function is_last_row(row: row_t; last: row_in_line_t) return boolean is
227 # begin
228 # return get_row_of_line(row) = last;
229 # end;
230 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Return whether *row* is the last row of its cache line."""
    row_in_line = get_row_of_line(row)
    return row_in_line == last
233
234 # -- Return the address of the next row in the current cache line
235 # function next_row_addr(addr: wishbone_addr_type)
236 # return std_ulogic_vector is
237 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
238 # variable result : wishbone_addr_type;
239 # begin
240 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
241 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
242 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
243 # result := addr;
244 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
245 # return result;
246 # end;
247 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Return the address of the next row in the current cache line.

    The VHDL increments only the row-index field of the address
    (bits ROW_OFF_BITS..LINE_OFF_BITS-1) and leaves all other bits
    untouched, so the adder is only ROW_LINE_BITS wide and the row
    index wraps within the line.  The original body was empty (a
    SyntaxError); this implements the same semantics with Cat.
    """
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    # Truncating the sum back to ROW_LINE_BITS discards the carry,
    # giving the intentional wrap-around within the line.
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])
251
252 # -- Return the next row in the current cache line. We use a dedicated
253 # -- function in order to limit the size of the generated adder to be
254 # -- only the bits within a cache line (3 bits with default settings)
255 # function next_row(row: row_t) return row_t is
256 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
257 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
258 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
259 # begin
260 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
261 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
262 # row_v(ROW_LINEBITS-1 downto 0) :=
263 # std_ulogic_vector(unsigned(row_idx) + 1);
264 # return to_integer(unsigned(row_v));
265 # end;
266 # Return the next row in the current cache line. We use a dedicated
267 # function in order to limit the size of the generated adder to be
268 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row in the current cache line.

    We use a dedicated function in order to limit the size of the
    generated adder to be only the bits within a cache line
    (ROW_LINE_BITS, 3 bits with default settings).  The original body
    was empty (a SyntaxError); this mirrors the VHDL: increment the
    low ROW_LINE_BITS, keep the upper bits, wrap on carry.
    """
    row_idx = row[:ROW_LINE_BITS] + 1
    return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
272
273 # -- Read the instruction word for the given address in the
274 # -- current cache row
275 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
276 # data: cache_row_t) return std_ulogic_vector is
277 # variable word: integer range 0 to INSN_PER_ROW-1;
278 # begin
279 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
280 # return data(31+word*32 downto word*32);
281 # end;
282 # Read the instruction word for the given address
283 # in the current cache row
def read_insn_word(addr, data):
    """Read the 32-bit instruction word selected by *addr* out of cache
    row *data*.

    Bug fixes: the VHDL slice addr(INSN_BITS+2-1 downto 2) is
    addr[2:INSN_BITS+2] in Python (the original had +3, one bit too
    many), and *word* is a Signal, so the 32-bit chunk must be picked
    with word_select — Python slicing needs constant bounds.
    """
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
287
288 # -- Get the tag value from the address
289 # function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0))
290 # return cache_tag_t is
291 # begin
292 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
293 # end;
294 # Get the tag value from the address
def get_tag(addr):
    """Extract the tag field from a real address."""
    tag = addr[SET_SIZE_BITS:REAL_ADDR_BITS]
    return tag
297
298 # -- Read a tag from a tag memory row
299 # function read_tag(way: way_t; tagset: cache_tags_set_t)
300 # return cache_tag_t is
301 # begin
302 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
303 # end;
304 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Read one way's tag out of a tag-memory row."""
    lo = way * TAG_BITS
    return tagset[lo:lo + TAG_BITS]
307
308 # -- Write a tag to tag memory row
309 # procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
310 # tag: cache_tag_t) is
311 # begin
312 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
313 # end;
314 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment writing *tag* into *way*'s slot of *tagset*.

    Bug fix: plain item-assignment (tagset[a:b] = tag) is a Python
    __setitem__, which an nmigen Value does not support and which, even
    if it did, would produce no statement to add to a domain.  The
    caller does `sync += write_tag(...)`, so return the .eq() statement.
    """
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)
317
318 # -- Simple hash for direct-mapped TLB index
319 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
320 # return tlb_index_t is
321 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
322 # begin
323 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
324 # xor addr(
325 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
326 # TLB_LG_PGSZ + TLB_BITS
327 # )
328 # xor addr(
329 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
330 # TLB_LG_PGSZ + 2 * TLB_BITS
331 # );
332 # return to_integer(unsigned(hash));
333 # end;
334 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple xor-fold hash of an effective address, used as the
    direct-mapped ITLB index.

    Bug fixes: the original's continuation lines had no parentheses
    (SyntaxError), misspelled TLB_LG_PGSZ as TLB_LG_PGSZE, and shadowed
    the builtin `hash`.
    """
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
           ^ addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
           ^ addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
340
341 # -- Generate a cache RAM for each way
342 # rams: for i in 0 to NUM_WAYS-1 generate
343 # signal do_read : std_ulogic;
344 # signal do_write : std_ulogic;
345 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
346 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
347 # signal dout : cache_row_t;
348 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
349 # begin
350 # way: entity work.cache_ram
351 # generic map (
352 # ROW_BITS => ROW_BITS,
353 # WIDTH => ROW_SIZE_BITS
354 # )
355 # port map (
356 # clk => clk,
357 # rd_en => do_read,
358 # rd_addr => rd_addr,
359 # rd_data => dout,
360 # wr_sel => wr_sel,
361 # wr_addr => wr_addr,
362 # wr_data => wishbone_in.dat
363 # );
364 # process(all)
365 # begin
366 # do_read <= not (stall_in or use_previous);
367 # do_write <= '0';
368 # if wishbone_in.ack = '1' and replace_way = i then
369 # do_write <= '1';
370 # end if;
371 # cache_out(i) <= dout;
372 # rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
373 # wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
374 # for i in 0 to ROW_SIZE-1 loop
375 # wr_sel(i) <= do_write;
376 # end loop;
377 # end process;
378 # end generate;
def rams(self, m):
    """Generate one CacheRam instance per way and wire up its read and
    write ports (port of the VHDL `rams` generate block).

    Bug fixes vs. the original: missing colon on the for statement,
    `wr_add` typo, rd_data assigned in the wrong direction, missing
    `comb +=` on the do_write set, `&`/`==` precedence in the ack
    compare, and `Signal(x)` used where a plain connection was meant.
    """
    comb = m.d.comb

    do_read  = Signal()
    do_write = Signal()
    rd_addr  = Signal(ROW_BITS)
    wr_addr  = Signal(ROW_BITS)
    _d_out   = Signal(ROW_SIZE_BITS)
    wr_sel   = Signal(ROW_SIZE)

    for i in range(NUM_WAYS):
        way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
        # A submodule must be registered or it is never elaborated
        m.submodules += way

        comb += way.rd_en.eq(do_read)
        comb += way.rd_addr.eq(rd_addr)
        # The RAM drives rd_data: read FROM it, not into it
        comb += _d_out.eq(way.rd_data)
        comb += way.wr_sel.eq(wr_sel)
        comb += way.wr_addr.eq(wr_addr)
        # Reload data comes straight off the wishbone
        # (assumes the slave-out data field is `dat`, matching r.wb.dat
        # elsewhere in this file — TODO confirm against WBSlaveOut)
        comb += way.wr_data.eq(wb_in.dat)

        comb += do_read.eq(~(stall_in | use_previous))
        comb += do_write.eq(0)

        # nmigen's '&' binds tighter than '==': parenthesise the compare
        with m.If(wb_in.ack & (replace_way == i)):
            comb += do_write.eq(1)

        comb += cache_out[i].eq(_d_out)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)
        for j in range(ROW_SIZE):
            comb += wr_sel[j].eq(do_write)
409
410 # -- Generate PLRUs
411 # maybe_plrus: if NUM_WAYS > 1 generate
412 # begin
413 # plrus: for i in 0 to NUM_LINES-1 generate
414 # -- PLRU interface
415 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
416 # signal plru_acc_en : std_ulogic;
417 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
418 #
419 # begin
420 # plru : entity work.plru
421 # generic map (
422 # BITS => WAY_BITS
423 # )
424 # port map (
425 # clk => clk,
426 # rst => rst,
427 # acc => plru_acc,
428 # acc_en => plru_acc_en,
429 # lru => plru_out
430 # );
431 #
432 # process(all)
433 # begin
434 # -- PLRU interface
435 # if get_index(r.hit_nia) = i then
436 # plru_acc_en <= r.hit_valid;
437 # else
438 # plru_acc_en <= '0';
439 # end if;
440 # plru_acc <=
441 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
442 # plru_victim(i) <= plru_out;
443 # end process;
444 # end generate;
445 # end generate;
def maybe_plrus(self, m):
    """Generate a PLRU victim picker per cache line, only when the cache
    is set-associative (port of the VHDL `maybe_plrus` generate block).

    Bug fixes vs. the original: `comb += m.d.comb` (should be `=`),
    NUM_WAYS tested with m.If even though it is an elaboration-time
    Python constant, plru.lru driven instead of read, and plru.acc /
    plru.acc_en driven from two places at once.
    """
    comb = m.d.comb

    # Elaboration-time decision: a direct-mapped cache needs no PLRU
    if NUM_WAYS <= 1:
        return

    for i in range(NUM_LINES):
        plru = PLRU(WAY_BITS)
        m.submodules += plru

        # PLRU output (victim way for this line); the PLRU drives lru
        plru_out = Signal(WAY_BITS)
        comb += plru_out.eq(plru.lru)

        # Only the line currently being hit trains its PLRU
        with m.If(get_index(r.hit_nia) == i):
            comb += plru.acc_en.eq(r.hit_valid)
        with m.Else():
            comb += plru.acc_en.eq(0)

        comb += plru.acc.eq(r.hit_way)
        comb += plru_victim[i].eq(plru_out)
469
470 # -- TLB hit detection and real address generation
471 # itlb_lookup : process(all)
472 # variable pte : tlb_pte_t;
473 # variable ttag : tlb_tag_t;
474 # begin
475 # tlb_req_index <= hash_ea(i_in.nia);
476 # pte := itlb_ptes(tlb_req_index);
477 # ttag := itlb_tags(tlb_req_index);
478 # if i_in.virt_mode = '1' then
479 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
480 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
481 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
482 # ra_valid <= itlb_valids(tlb_req_index);
483 # else
484 # ra_valid <= '0';
485 # end if;
486 # eaa_priv <= pte(3);
487 # else
488 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
489 # ra_valid <= '1';
490 # eaa_priv <= '1';
491 # end if;
492 #
493 # -- no IAMR, so no KUEP support for now
494 # priv_fault <= eaa_priv and not i_in.priv_mode;
495 # access_ok <= ra_valid and not priv_fault;
496 # end process;
497 # TLB hit detection and real address generation
def itlb_lookup(self, m):
    """TLB hit detection and real address generation.

    Bug fixes vs. the original: TLB_LB_PGSZ typo, and the virt-mode
    branch dropped the VHDL's `eaa_priv <= pte(3)` assignment, leaving
    the privilege bit undriven on translated accesses.
    """
    comb = m.d.comb

    comb += tlb_req_index.eq(hash_ea(i_in.nia))
    comb += pte.eq(itlb_ptes[tlb_req_index])
    comb += ttag.eq(itlb_tags[tlb_req_index])

    with m.If(i_in.virt_mode):
        # Real address = PTE frame number, page offset from the NIA
        comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

        with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
            comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
        with m.Else():
            comb += ra_valid.eq(0)

        # pte bit 3 is the privileged-access bit (VHDL: eaa_priv <= pte(3))
        comb += eaa_priv.eq(pte[3])

    with m.Else():
        comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
        comb += ra_valid.eq(1)
        comb += eaa_priv.eq(1)

    # No IAMR, so no KUEP support for now
    comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
    comb += access_ok.eq(ra_valid & ~priv_fault)
525
526 # -- iTLB update
527 # itlb_update: process(clk)
528 # variable wr_index : tlb_index_t;
529 # begin
530 # if rising_edge(clk) then
531 # wr_index := hash_ea(m_in.addr);
532 # if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
533 # -- clear all valid bits
534 # for i in tlb_index_t loop
535 # itlb_valids(i) <= '0';
536 # end loop;
537 # elsif m_in.tlbie = '1' then
538 # -- clear entry regardless of hit or miss
539 # itlb_valids(wr_index) <= '0';
540 # elsif m_in.tlbld = '1' then
541 # itlb_tags(wr_index) <= m_in.addr(
542 # 63 downto TLB_LG_PGSZ + TLB_BITS
543 # );
544 # itlb_ptes(wr_index) <= m_in.pte;
545 # itlb_valids(wr_index) <= '1';
546 # end if;
547 # end if;
548 # end process;
549 # iTLB update
def itlb_update(self, m):
    """iTLB update: handle MMU tlbie (invalidate) and tlbld (load).

    Bug fixes vs. the original: `itlb_vlaids` typo; wr_index was driven
    with sync (one cycle late — the VHDL uses a same-cycle variable, so
    it must be combinatorial); Signal(TLB_SIZE) allocated a 64-bit-wide
    signal where a log2(TLB_SIZE)-bit index is meant; and a placeholder
    string literal sat inside the reset condition.
    """
    comb = m.d.comb
    sync = m.d.sync

    # Same-cycle index (VHDL variable), TLB_BITS == log2(TLB_SIZE) wide
    wr_index = Signal(TLB_BITS)
    comb += wr_index.eq(hash_ea(m_in.addr))

    # NOTE(review): the VHDL's explicit rst term is dropped here — in
    # nmigen the sync domain's reset clears registered state.  TODO
    # confirm itlb_valid_bits are declared with reset=0.
    with m.If(m_in.tlbie & m_in.doall):
        # Clear all valid bits
        for i in range(TLB_SIZE):
            sync += itlb_valid_bits[i].eq(0)

    with m.Elif(m_in.tlbie):
        # Clear entry regardless of hit or miss
        sync += itlb_valid_bits[wr_index].eq(0)

    with m.Elif(m_in.tlbld):
        sync += itlb_tags[wr_index].eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
        sync += itlb_ptes[wr_index].eq(m_in.pte)
        sync += itlb_valid_bits[wr_index].eq(1)
571
572 # -- Cache hit detection, output to fetch2 and other misc logic
573 # icache_comb : process(all)
574 # Cache hit detection, output to fetch2 and other misc logic
def icache_comb(self, m):
    """Cache hit detection, output to fetch2 and other misc logic
    (port of the VHDL `icache_comb` process).

    Bug fixes vs. the original: `m.else()` → `m.Else()`, `i_in.rq` →
    `i_in.req`, `cahce_tags` typo, wrong zero-pad width in the req_laddr
    Cat, the dropped `req_hit_way <= hit_way` assignment, and a
    placeholder string literal inside the hit/miss gating expression.
    """
    comb = m.d.comb

    is_hit  = Signal()
    # NOTE(review): follows the original's width; WAY_BITS would suffice
    hit_way = Signal(NUM_WAYS)

    # i_in.sequential means that i_in.nia this cycle is 4 more than last
    # cycle.  If we read more than 32 bits at a time, had a cache hit
    # last cycle, and we don't want the first 32-bit chunk then we can
    # keep the data we read last cycle and just use that.
    with m.If(i_in.nia[2:INSN_BITS+2] != 0):
        comb += use_previous.eq(i_in.sequential & r.hit_valid)
    with m.Else():
        comb += use_previous.eq(0)

    # Extract line, row and tag from request
    comb += req_index.eq(get_index(i_in.nia))
    comb += req_row.eq(get_row(i_in.nia))
    comb += req_tag.eq(get_tag(real_addr))

    # Calculate address of beginning of cache row, will be used for
    # cache miss processing if needed.  The high pad must bring the
    # total to 64 bits, i.e. 64 - REAL_ADDR_BITS zeros (the original
    # padded by REAL_ADDR_BITS, giving the wrong overall width).
    comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                 Const(0, 64 - REAL_ADDR_BITS)))

    # Test if pending request is a hit on any way.  A way also counts
    # while it is being reloaded, once the needed row has arrived.
    for i in range(NUM_WAYS):
        with m.If(i_in.req &
                  (cache_valid_bits[req_index][i] |
                   ((r.state == State.WAIT_ACK)
                    & (req_index == r.store_index)
                    & (i == r.store_way)
                    & r.rows_valid[req_row % ROW_PER_LINE]))):
            with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

    # Generate the "hit" and "miss" signals for the synchronous blocks.
    # (The VHDL also gated on rst = '0'; nmigen's sync-domain reset
    # makes that term unnecessary here.)
    with m.If(i_in.req & access_ok & ~flush_in):
        comb += req_is_hit.eq(is_hit)
        comb += req_is_miss.eq(~is_hit)
    with m.Else():
        comb += req_is_hit.eq(0)
        comb += req_is_miss.eq(0)

    # VHDL: req_hit_way <= hit_way (this assignment was dropped in the
    # original port, leaving req_hit_way undriven)
    comb += req_hit_way.eq(hit_way)

    # The way to replace on a miss
    with m.If(r.state == State.CLR_TAG):
        comb += replace_way.eq(plru_victim[r.store_index])
    with m.Else():
        comb += replace_way.eq(r.store_way)

    # Output instruction from current cache row
    #
    # Note: This is a mild violation of our design principle of having
    # pipeline stages output from a clean latch.  In this case we output
    # the result of a mux.  The alternative would be output an entire
    # row which I prefer not to do just yet as it would force fetch2 to
    # know about some of the cache geometry information.
    comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out[r.hit_way]))
    comb += i_out.valid.eq(r.hit_valid)
    comb += i_out.nia.eq(r.hit_nia)
    comb += i_out.stop_mark.eq(r.hit_smark)
    comb += i_out.fetch_failed.eq(r.fetch_failed)

    # Stall fetch1 if we have a miss on cache or TLB or a protection fault
    comb += stall_out.eq(~(is_hit & access_ok))

    # Wishbone requests output (from the cache miss reload machine)
    comb += wb_out.eq(r.wb)
725
726 # -- Cache hit synchronous machine
727 # icache_hit : process(clk)
728 # Cache hit synchronous machine
def icache_hit(self, m):
    """Cache hit synchronous machine (port of the VHDL `icache_hit`
    process): latch hit results for fetch2 one cycle after the BRAM read.

    Bug fix vs. the original: a placeholder string literal sat inside
    the flush condition; the explicit rst term is handled by nmigen's
    sync-domain reset and is dropped.
    """
    sync = m.d.sync

    # Keep outputs to fetch2 unchanged on a stall, except that flush
    # sets valid to 0.  If use_previous, keep the same data as last
    # cycle and use the second half.
    with m.If(stall_in | use_previous):
        with m.If(flush_in):
            sync += r.hit_valid.eq(0)

    with m.Else():
        # On a hit, latch the request for the next cycle, when the BRAM
        # data will be available on the cache_out output of the
        # corresponding way
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)

            # NOTE(review): unlike the VHDL `report`, this print fires
            # once at elaboration time, not per simulated cycle — kept
            # as a debug stub until a proper sim hook exists.
            print(f"cache hit nia:{i_in.nia}, IR:{i_in.virt_mode}, "
                  f"SM:{i_in.stop_mark}, idx:{req_index}, "
                  f"tag:{req_tag}, way:{req_hit_way}, RA:{real_addr}")

    with m.If(~stall_in):
        # Send stop marks and NIA down regardless of validity
        sync += r.hit_smark.eq(i_in.stop_mark)
        sync += r.hit_nia.eq(i_in.nia)
787
788 # -- Cache miss/reload synchronous machine
789 # icache_miss : process(clk)
790 # Cache miss/reload synchronous machine
791 def icache_miss(self, m):
792 comb = m.d.comb
793 sync = m.d.sync
794
795 # variable tagset : cache_tags_set_t;
796 # variable stbs_done : boolean;
797
798 tagset = Signal(TAG_RAM_WIDTH)
799 stbs_done = Signal()
800
801 # begin
802 # if rising_edge(clk) then
803 # -- On reset, clear all valid bits to force misses
804 # if rst = '1' then
805 # On reset, clear all valid bits to force misses
806 with m.If('''TODO rst nmigen'''):
807 # for i in index_t loop
808 # cache_valids(i) <= (others => '0');
809 # end loop;
810 for i in Signal(NUM_LINES):
811 sync += cache_valid_bits[i].eq(~1)
812
813 # r.state <= IDLE;
814 # r.wb.cyc <= '0';
815 # r.wb.stb <= '0';
816 sync += r.state.eq(State.IDLE)
817 sync += r.wb.cyc.eq(0)
818 sync += r.wb.stb.eq(0)
819
820 # -- We only ever do reads on wishbone
821 # r.wb.dat <= (others => '0');
822 # r.wb.sel <= "11111111";
823 # r.wb.we <= '0';
824 # We only ever do reads on wishbone
825 sync += r.wb.dat.eq(~1)
826 sync += r.wb.sel.eq(Const(0b11111111, 8))
827 sync += r.wb.we.eq(0)
828
829 # -- Not useful normally but helps avoiding tons of sim warnings
830 # r.wb.adr <= (others => '0');
831 # Not useful normally but helps avoiding tons of sim warnings
832 sync += r.wb.adr.eq(~1)
833
834 # else
835 with m.Else():
836 # -- Process cache invalidations
837 # if inval_in = '1' then
838 # for i in index_t loop
839 # cache_valids(i) <= (others => '0');
840 # end loop;
841 # r.store_valid <= '0';
842 # end if;
843 # Process cache invalidations
844 with m.If(inval_in):
845 for i in range(NUM_LINES):
846 sync += cache_valid_bits[i].eq(~1)
847
848 sync += r.store_valid.eq(0)
849
850 # -- Main state machine
851 # case r.state is
852 # Main state machine
853 with m.Switch(r.state):
854
855 # when IDLE =>
856 with m.Case(State.IDLE):
857 # -- Reset per-row valid flags, only used in WAIT_ACK
858 # for i in 0 to ROW_PER_LINE - 1 loop
859 # r.rows_valid(i) <= '0';
860 # end loop;
861 # Reset per-row valid flags, onlyy used in WAIT_ACK
862 for i in range(ROW_PER_LINE):
863 sync += r.rows_valid[i].eq(0)
864
865 # -- We need to read a cache line
866 # if req_is_miss = '1' then
867 # report "cache miss nia:" & to_hstring(i_in.nia) &
868 # " IR:" & std_ulogic'image(i_in.virt_mode) &
869 # " SM:" & std_ulogic'image(i_in.stop_mark) &
870 # " idx:" & integer'image(req_index) &
871 # " way:" & integer'image(replace_way) &
872 # " tag:" & to_hstring(req_tag) &
873 # " RA:" & to_hstring(real_addr);
874 # We need to read a cache line
875 with m.If(req_is_miss):
876 print(f"cache miss nia:{i_in.nia} " \
877 f"IR:{i_in.virt_mode} " \
878 f"SM:{i_in.stop_mark} idx:{req_index} " \
879 f"way:{replace_way} tag:{req_tag} " \
880 f"RA:{real_addr}")
881
882 # -- Keep track of our index and way for
883 # -- subsequent stores
884 # r.store_index <= req_index;
885 # r.store_row <= get_row(req_laddr);
886 # r.store_tag <= req_tag;
887 # r.store_valid <= '1';
888 # r.end_row_ix <=
889 # get_row_of_line(get_row(req_laddr)) - 1;
890 # Keep track of our index and way
891 # for subsequent stores
892 sync += r.store_index.eq(req_index)
893 sync += r.store_row.eq(get_row(req_laddr))
894 sync += r.store_tag.eq(req_tag)
895 sync += r.store_valid.eq(1)
896 sync += r.end_row_ix.eq(
897 get_row_of_line(get_row(req_laddr)) - 1
898 )
899
900 # -- Prep for first wishbone read. We calculate the
901 # -- address of the start of the cache line and
902 # -- start the WB cycle.
903 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
904 # r.wb.cyc <= '1';
905 # r.wb.stb <= '1';
906 # Prep for first wishbone read. We calculate the
907 # address of the start of the cache line and
908 # start the WB cycle.
909 sync += r.wb.adr.eq(
910 req_laddr[:r.wb.adr'''left?''']
911 )
912
913 # -- Track that we had one request sent
914 # r.state <= CLR_TAG;
915 # Track that we had one request sent
916 sync += r.state.eq(State.CLR_TAG)
917 # end if;
918
919 # when CLR_TAG | WAIT_ACK =>
920 with m.Case(State.CLR_TAG, State.WAIT_ACK):
921 # if r.state = CLR_TAG then
922 with m.If(r.state == State.CLR_TAG):
923 # -- Get victim way from plru
924 # r.store_way <= replace_way;
925 # Get victim way from plru
926 sync += r.store_way.eq(replace_way)
927 #
928 # -- Force misses on that way while reloading that line
929 # cache_valids(req_index)(replace_way) <= '0';
930 # Force misses on that way while
931 # realoading that line
932 sync += cache_valid_bits[
933 req_index
934 ][replace_way].eq(0)
935
936 # -- Store new tag in selected way
937 # for i in 0 to NUM_WAYS-1 loop
938 # if i = replace_way then
939 # tagset := cache_tags(r.store_index);
940 # write_tag(i, tagset, r.store_tag);
941 # cache_tags(r.store_index) <= tagset;
942 # end if;
943 # end loop;
944 for i in range(NUM_WAYS):
945 with m.If(i == replace_way):
946 comb += tagset.eq(
947 cache_tags[r.store_index]
948 )
949 sync += write_tag(i, tagset, r.store_tag)
950 sync += cache_tags(r.store_index).eq(
951 tagset
952 )
953
954 # r.state <= WAIT_ACK;
955 sync += r.state.eq(State.WAIT_ACK)
956 # end if;
957
958 # -- Requests are all sent if stb is 0
959 # stbs_done := r.wb.stb = '0';
960 # Requests are all sent if stb is 0
961 comb += stbs_done.eq(r.wb.stb == 0)
962
963 # -- If we are still sending requests, was one accepted ?
964 # if wishbone_in.stall = '0' and not stbs_done then
965 # If we are still sending requests, was one accepted?
966 with m.If(~wb_in.stall & ~stbs_done):
967 # -- That was the last word ? We are done sending.
968 # -- Clear stb and set stbs_done so we can handle
969 # -- an eventual last ack on the same cycle.
970 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
971 # r.wb.stb <= '0';
972 # stbs_done := true;
973 # end if;
974 # That was the last word ? We are done sending.
975 # Clear stb and set stbs_done so we can handle
976 # an eventual last ack on the same cycle.
977 with m.If(is_last_row_addr(
978 r.wb.adr, r.end_row_ix)):
979 sync += r.wb.stb.eq(0)
980 stbs_done.eq(1)
981
982 # -- Calculate the next row address
983 # r.wb.adr <= next_row_addr(r.wb.adr);
984 # Calculate the next row address
985 sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
986 # end if;
987
988 # -- Incoming acks processing
989 # if wishbone_in.ack = '1' then
990 # Incoming acks processing
991 with m.If(wb_in.ack):
992 # r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
993 sync += r.rows_valid[
994 r.store_row & ROW_PER_LINE
995 ].eq(1)
996
997 # -- Check for completion
998 # if stbs_done and
999 # is_last_row(r.store_row, r.end_row_ix) then
1000 # Check for completion
1001 with m.If(stbs_done & is_last_row(
1002 r.store_row, r.end_row_ix)):
1003 # -- Complete wishbone cycle
1004 # r.wb.cyc <= '0';
1005 # Complete wishbone cycle
1006 sync += r.wb.cyc.eq(0)
1007
1008 # -- Cache line is now valid
1009 # cache_valids(r.store_index)(replace_way) <=
1010 # r.store_valid and not inval_in;
1011 # Cache line is now valid
1012 sync += cache_valid_bits[
1013 r.store_index
1014 ][relace_way].eq(
1015 r.store_valid & ~inval_in
1016 )
1017
1018 # -- We are done
1019 # r.state <= IDLE;
1020 # We are done
1021 sync += r.state.eq(State.IDLE)
1022 # end if;
1023
1024 # -- Increment store row counter
1025 # r.store_row <= next_row(r.store_row);
1026 # Increment store row counter
1027 sync += store_row.eq(next_row(r.store_row))
1028 # end if;
1029 # end case;
1030 # end if;
1031 #
1032 # -- TLB miss and protection fault processing
1033 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1034 # r.fetch_failed <= '0';
1035 # elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
1036 # r.fetch_failed <= '1';
1037 # end if;
1038 # TLB miss and protection fault processing
1039 with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
1040 sync += r.fetch_failed.eq(0)
1041
1042 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1043 sync += r.fetch_failed.eq(1)
1044 # end if;
1045 # end process;
1046
1047 # icache_log: if LOG_LENGTH > 0 generate
1048 def icache_log(self, m, log_out):
1049 comb = m.d.comb
1050 sync = m.d.sync
1051
1052 # -- Output data to logger
1053 # signal log_data : std_ulogic_vector(53 downto 0);
1054 # begin
1055 # data_log: process(clk)
1056 # variable lway: way_t;
1057 # variable wstate: std_ulogic;
1058 # Output data to logger
1059 for i in range(LOG_LENGTH)
1060 # Output data to logger
1061 log_data = Signal(54)
1062 lway = Signal(NUM_WAYS)
1063 wstate = Signal()
1064
1065 # begin
1066 # if rising_edge(clk) then
1067 # lway := req_hit_way;
1068 # wstate := '0';
1069 comb += lway.eq(req_hit_way)
1070 comb += wstate.eq(0)
1071
1072 # if r.state /= IDLE then
1073 # wstate := '1';
1074 # end if;
1075 with m.If(r.state != State.IDLE):
1076 comb += wstate.eq(1)
1077
1078 # log_data <= i_out.valid &
1079 # i_out.insn &
1080 # wishbone_in.ack &
1081 # r.wb.adr(5 downto 3) &
1082 # r.wb.stb & r.wb.cyc &
1083 # wishbone_in.stall &
1084 # stall_out &
1085 # r.fetch_failed &
1086 # r.hit_nia(5 downto 2) &
1087 # wstate &
1088 # std_ulogic_vector(to_unsigned(lway, 3)) &
1089 # req_is_hit & req_is_miss &
1090 # access_ok &
1091 # ra_valid;
1092 sync += log_data.eq(Cat(
1093 ra_valid, access_ok, req_is_miss, req_is_hit,
1094 lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6],
1095 r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
1096 r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
1097 i_out.valid
1098 ))
1099 # end if;
1100 # end process;
1101 # log_out <= log_data;
1102 comb += log_out.eq(log_data)
1103 # end generate;
1104 # end;
1105
1106 def elaborate(self, platform):
1107 # architecture rtl of icache is
1108 # constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
1109 # -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
1110 # constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
1111 # -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
1112 # -- icache
1113 # constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
1114 # -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
1115 # constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
1116 # -- Bit fields counts in the address
1117 #
1118 # -- INSN_BITS is the number of bits to select an instruction in a row
1119 # constant INSN_BITS : natural := log2(INSN_PER_ROW);
1120 # -- ROW_BITS is the number of bits to select a row
1121 # constant ROW_BITS : natural := log2(BRAM_ROWS);
1122 # -- ROW_LINEBITS is the number of bits to select a row within a line
1123 # constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
1124 # -- LINE_OFF_BITS is the number of bits for the offset in a cache line
1125 # constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
1126 # -- ROW_OFF_BITS is the number of bits for the offset in a row
1127 # constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
1128 # -- INDEX_BITS is the number of bits to select a cache line
1129 # constant INDEX_BITS : natural := log2(NUM_LINES);
1130 # -- SET_SIZE_BITS is the log base 2 of the set size
1131 # constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
1132 # -- TAG_BITS is the number of bits of the tag part of the address
1133 # constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
1134 # -- WAY_BITS is the number of bits to select a way
1135 # constant WAY_BITS : natural := log2(NUM_WAYS);
1136
1137 ROW_SIZE_BITS = ROW_SIZE * 8
1138 # ROW_PER_LINE is the number of row
1139 # (wishbone) transactions in a line
1140 ROW_PER_LINE = LINE_SIZE / ROW_SIZE
1141 # BRAM_ROWS is the number of rows in
1142 # BRAM needed to represent the full icache
1143 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
1144 # INSN_PER_ROW is the number of 32bit
1145 # instructions per BRAM row
1146 INSN_PER_ROW = ROW_SIZE_BITS / 32
1147
1148 # Bit fields counts in the address
1149 #
1150 # INSN_BITS is the number of bits to
1151 # select an instruction in a row
1152 INSN_BITS = log2_int(INSN_PER_ROW)
1153 # ROW_BITS is the number of bits to
1154 # select a row
1155 ROW_BITS = log2_int(BRAM_ROWS)
1156 # ROW_LINEBITS is the number of bits to
1157 # select a row within a line
1158 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
1159 # LINE_OFF_BITS is the number of bits for
1160 # the offset in a cache line
1161 LINE_OFF_BITS = log2_int(LINE_SIZE)
1162 # ROW_OFF_BITS is the number of bits for
1163 # the offset in a row
1164 ROW_OFF_BITS = log2_int(ROW_SIZE)
1165 # INDEX_BITS is the number of bits to
1166 # select a cache line
1167 INDEX_BITS = log2_int(NUM_LINES)
1168 # SET_SIZE_BITS is the log base 2 of
1169 # the set size
1170 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
1171 # TAG_BITS is the number of bits of
1172 # the tag part of the address
1173 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
1174 # WAY_BITS is the number of bits to
1175 # select a way
1176 WAY_BITS = log2_int(NUM_WAYS)
1177 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
1178
1179 # -- Example of layout for 32 lines of 64 bytes:
1180 # --
1181 # -- .. tag |index| line |
1182 # -- .. | row | |
1183 # -- .. | | | |00| zero (2)
1184 # -- .. | | |-| | INSN_BITS (1)
1185 # -- .. | |---| | ROW_LINEBITS (3)
1186 # -- .. | |--- - --| LINE_OFF_BITS (6)
1187 # -- .. | |- --| ROW_OFF_BITS (3)
1188 # -- .. |----- ---| | ROW_BITS (8)
1189 # -- .. |-----| | INDEX_BITS (5)
1190 # -- .. --------| | TAG_BITS (53)
1191 # Example of layout for 32 lines of 64 bytes:
1192 #
1193 # .. tag |index| line |
1194 # .. | row | |
1195 # .. | | | |00| zero (2)
1196 # .. | | |-| | INSN_BITS (1)
1197 # .. | |---| | ROW_LINEBITS (3)
1198 # .. | |--- - --| LINE_OFF_BITS (6)
1199 # .. | |- --| ROW_OFF_BITS (3)
1200 # .. |----- ---| | ROW_BITS (8)
1201 # .. |-----| | INDEX_BITS (5)
1202 # .. --------| | TAG_BITS (53)
1203
1204 # subtype row_t is integer range 0 to BRAM_ROWS-1;
1205 # subtype index_t is integer range 0 to NUM_LINES-1;
1206 # subtype way_t is integer range 0 to NUM_WAYS-1;
1207 # subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
1208 #
1209 # -- The cache data BRAM organized as described above for each way
1210 # subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
1211 #
1212 # -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
1213 # -- not handle a clean (commented) definition of the cache tags as a 3d
1214 # -- memory. For now, work around it by putting all the tags
1215 # subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
1216 # -- type cache_tags_set_t is array(way_t) of cache_tag_t;
1217 # -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
1218 # constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
1219 # subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
1220 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
1221 def CacheTagArray():
1222 return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
1223
1224 # -- The cache valid bits
1225 # subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
1226 # type cache_valids_t is array(index_t) of cache_way_valids_t;
1227 # type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
1228 def CacheValidBitsArray():
1229 return Array(Signal() for x in ROW_PER_LINE)
1230
1231 def RowPerLineValidArray():
1232 return Array(Signal() for x in range ROW_PER_LINE)
1233
1234 # -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1235 # signal cache_tags : cache_tags_array_t;
1236 # signal cache_valids : cache_valids_t;
1237 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1238 cache_tags = CacheTagArray()
1239 cache_valid_bits = CacheValidBitsArray()
1240
1241 # attribute ram_style : string;
1242 # attribute ram_style of cache_tags : signal is "distributed";
1243 # TODO to be passed to nigmen as ram attributes
1244 # attribute ram_style : string;
1245 # attribute ram_style of cache_tags : signal is "distributed";
1246
1247 # -- L1 ITLB.
1248 # constant TLB_BITS : natural := log2(TLB_SIZE);
1249 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
1250 # constant TLB_PTE_BITS : natural := 64;
1251 TLB_BITS = log2_int(TLB_SIZE)
1252 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
1253 TLB_PTE_BITS = 64
1254
1255 # subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
1256 # type tlb_valids_t is array(tlb_index_t) of std_ulogic;
1257 # subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
1258 # type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
1259 # subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
1260 # type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
1261 def TLBValidBitsArray():
1262 return Array(Signal() for x in range(TLB_SIZE))
1263
1264 def TLBTagArray():
1265 return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))
1266
1267 def TLBPTEArray():
1268 return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE))
1269
1270 # signal itlb_valids : tlb_valids_t;
1271 # signal itlb_tags : tlb_tags_t;
1272 # signal itlb_ptes : tlb_ptes_t;
1273 # attribute ram_style of itlb_tags : signal is "distributed";
1274 # attribute ram_style of itlb_ptes : signal is "distributed";
1275 itlb_valid_bits = TLBValidBitsArray()
1276 itlb_tags = TLBTagArray()
1277 itlb_ptes = TLBPTEArray()
1278 # TODO to be passed to nmigen as ram attributes
1279 # attribute ram_style of itlb_tags : signal is "distributed";
1280 # attribute ram_style of itlb_ptes : signal is "distributed";
1281
1282 # -- Privilege bit from PTE EAA field
1283 # signal eaa_priv : std_ulogic;
1284 # Privilege bit from PTE EAA field
1285 eaa_priv = Signal()
1286
1287
1288 # signal r : reg_internal_t;
1289 r = RegInternal()
1290
1291 # -- Async signals on incoming request
1292 # signal req_index : index_t;
1293 # signal req_row : row_t;
1294 # signal req_hit_way : way_t;
1295 # signal req_tag : cache_tag_t;
1296 # signal req_is_hit : std_ulogic;
1297 # signal req_is_miss : std_ulogic;
1298 # signal req_laddr : std_ulogic_vector(63 downto 0);
1299 # Async signal on incoming request
1300 req_index = Signal(NUM_LINES)
1301 req_row = Signal(BRAM_ROWS)
1302 req_hit_way = Signal(NUM_WAYS)
1303 req_tag = Signal(TAG_BITS)
1304 req_is_hit = Signal()
1305 req_is_miss = Signal()
1306 req_laddr = Signal(64)
1307
1308 # signal tlb_req_index : tlb_index_t;
1309 # signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
1310 # signal ra_valid : std_ulogic;
1311 # signal priv_fault : std_ulogic;
1312 # signal access_ok : std_ulogic;
1313 # signal use_previous : std_ulogic;
1314 tlb_req_index = Signal(TLB_SIZE)
1315 real_addr = Signal(REAL_ADDR_BITS)
1316 ra_valid = Signal()
1317 priv_fault = Signal()
1318 access_ok = Signal()
1319 use_previous = Signal()
1320
1321 # -- Cache RAM interface
1322 # type cache_ram_out_t is array(way_t) of cache_row_t;
1323 # signal cache_out : cache_ram_out_t;
1324 # Cache RAM interface
1325 def CacheRamOut():
1326 return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))
1327
1328 cache_out = CacheRamOut()
1329
1330 # -- PLRU output interface
1331 # type plru_out_t is array(index_t) of
1332 # std_ulogic_vector(WAY_BITS-1 downto 0);
1333 # signal plru_victim : plru_out_t;
1334 # signal replace_way : way_t;
1335 # PLRU output interface
1336 def PLRUOut():
1337 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
1338
1339 plru_victim = PLRUOut()
1340 replace_way = Signal(NUM_WAYS)
1341
1342 # begin
1343 #
1344 # assert LINE_SIZE mod ROW_SIZE = 0;
1345 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
1346 # severity FAILURE;
1347 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
1348 # severity FAILURE;
1349 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
1350 # severity FAILURE;
1351 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
1352 # severity FAILURE;
1353 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
1354 # report "geometry bits don't add up" severity FAILURE;
1355 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
1356 # report "geometry bits don't add up" severity FAILURE;
1357 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
1358 # report "geometry bits don't add up" severity FAILURE;
1359 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
1360 # report "geometry bits don't add up" severity FAILURE;
1361 #
1362 # sim_debug: if SIM generate
1363 # debug: process
1364 # begin
1365 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
1366 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
1367 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
1368 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
1369 # report "INSN_BITS = " & natural'image(INSN_BITS);
1370 # report "ROW_BITS = " & natural'image(ROW_BITS);
1371 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
1372 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
1373 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
1374 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
1375 # report "TAG_BITS = " & natural'image(TAG_BITS);
1376 # report "WAY_BITS = " & natural'image(WAY_BITS);
1377 # wait;
1378 # end process;
1379 # end generate;
1380