icache.py fix spelling, syntax
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique

from nmigen import Cat, Const, Elaboratable, Module, ResetSignal, Signal
from nmigen.cli import main
from nmigen.cli import rtlil
from nmigen.util import log2_int
from nmutil.byterev import byte_reverse
from nmutil.iocontrol import RecordObject
from nmutil.mask import Mask

from soc.experiment.mem_types import (
    Fetch1ToICacheType,
    ICacheToDecode1Type,
    MMUToICacheType,
)

# NOTE(review): the original spelled the first record "WbMasterOut", but the
# rest of this file instantiates "WBMasterOut" -- confirm against wb_types.
from experiment.wb_types import (
    WB_ADDR_BITS,
    WB_DATA_BITS,
    WB_SEL_BITS,
    WBAddrType,
    WBDataType,
    WBSelType,
    WBMasterOut,
    WBSlaveOut,
    WBMasterOutVector,
    WBSlaveOutVector,
    WBIOMasterOut,
    WBIOSlaveOut,
)
41
42
43 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache reload (miss handling) state machine.

    The numeric values follow the declaration order used for the members,
    so IDLE is 0 and is therefore the natural power-on value.
    """
    IDLE = 0      # no reload in progress; waiting for a miss
    CLR_TAG = 1   # victim way chosen, its valid bit cleared, new tag stored
    WAIT_ACK = 2  # waiting for the remaining wishbone acks of the reload
49
50 # type reg_internal_t is record
51 # -- Cache hit state (Latches for 1 cycle BRAM access)
52 # hit_way : way_t;
53 # hit_nia : std_ulogic_vector(63 downto 0);
54 # hit_smark : std_ulogic;
55 # hit_valid : std_ulogic;
56 #
57 # -- Cache miss state (reload state machine)
58 # state : state_t;
59 # wb : wishbone_master_out;
60 # store_way : way_t;
61 # store_index : index_t;
62 # store_row : row_t;
63 # store_tag : cache_tag_t;
64 # store_valid : std_ulogic;
65 # end_row_ix : row_in_line_t;
66 # rows_valid : row_per_line_valid_t;
67 #
68 # -- TLB miss state
69 # fetch_failed : std_ulogic;
70 # end record;
class RegInternal(RecordObject):
    """Internal registers of the icache (VHDL ``reg_internal_t``).

    Groups three sets of state: the latched hit information (held for the
    one cycle the BRAM read takes), the reload state machine registers, and
    the TLB-miss flag.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        # NOTE(review): the VHDL way_t/index_t/row_t types are integer
        # ranges; the widths below are kept as they were and are wider
        # than strictly required -- confirm before narrowing.
        self.hit_way = Signal(NUM_WAYS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        # An Enum cannot be instantiated without a value; the register is
        # an Enum-shaped Signal that resets to IDLE.
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut()
        self.store_way = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row = Signal(BRAM_ROWS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
93
94 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
95 #
96 # entity icache is
97 # generic (
98 # SIM : boolean := false;
99 # -- Line size in bytes
100 # LINE_SIZE : positive := 64;
101 # -- BRAM organisation: We never access more than wishbone_data_bits
102 # -- at a time so to save resources we make the array only that wide,
103 # -- and use consecutive indices for to make a cache "line"
104 # --
105 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
106 # -- so 64-bits)
107 # ROW_SIZE : positive := wishbone_data_bits / 8;
108 # -- Number of lines in a set
109 # NUM_LINES : positive := 32;
110 # -- Number of ways
111 # NUM_WAYS : positive := 4;
112 # -- L1 ITLB number of entries (direct mapped)
113 # TLB_SIZE : positive := 64;
114 # -- L1 ITLB log_2(page_size)
115 # TLB_LG_PGSZ : positive := 12;
116 # -- Number of real address bits that we store
117 # REAL_ADDR_BITS : positive := 56;
118 # -- Non-zero to enable log data collection
119 # LOG_LENGTH : natural := 0
120 # );
121 # port (
122 # clk : in std_ulogic;
123 # rst : in std_ulogic;
124 #
125 # i_in : in Fetch1ToIcacheType;
126 # i_out : out IcacheToDecode1Type;
127 #
128 # m_in : in MmuToIcacheType;
129 #
130 # stall_in : in std_ulogic;
131 # stall_out : out std_ulogic;
132 # flush_in : in std_ulogic;
133 # inval_in : in std_ulogic;
134 #
135 # wishbone_out : out wishbone_master_out;
136 # wishbone_in : in wishbone_slave_out;
137 #
138 # log_out : out std_ulogic_vector(53 downto 0)
139 # );
140 # end entity icache;
141 # 64 bit direct mapped icache. All instructions are 4B aligned.
142 class ICache(Elaboratable):
143 """64 bit direct mapped icache. All instructions are 4B aligned."""
def __init__(self):
    """Set up the cache geometry parameters and the external ports."""
    self.SIM = 0
    # Line size in bytes
    self.LINE_SIZE = 64
    # BRAM organisation: We never access more than wishbone_data_bits
    # at a time so to save resources we make the array only that wide,
    # and use consecutive indices for to make a cache "line"
    #
    # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits).
    # Integer division: the value is used as a bit width and loop bound,
    # so it must stay an int (true division would yield a float).
    self.ROW_SIZE = WB_DATA_BITS // 8
    # Number of lines in a set
    self.NUM_LINES = 32
    # Number of ways
    self.NUM_WAYS = 4
    # L1 ITLB number of entries (direct mapped)
    self.TLB_SIZE = 64
    # L1 ITLB log_2(page_size)
    self.TLB_LG_PGSZ = 12
    # Number of real address bits that we store
    self.REAL_ADDR_BITS = 56
    # Non-zero to enable log data collection
    self.LOG_LENGTH = 0

    # Request from fetch1 and response towards decode1
    self.i_in = Fetch1ToICacheType()
    self.i_out = ICacheToDecode1Type()

    # TLB maintenance requests from the MMU
    self.m_in = MMUToICacheType()

    # Pipeline control
    self.stall_in = Signal()
    self.stall_out = Signal()
    self.flush_in = Signal()
    self.inval_in = Signal()

    # Wishbone master interface used for cache line reloads
    self.wb_out = WBMasterOut()
    self.wb_in = WBSlaveOut()

    self.log_out = Signal(54)
180
181 # -- Return the cache line index (tag index) for an address
182 # function get_index(addr: std_ulogic_vector(63 downto 0))
183 # return index_t is
184 # begin
185 # return to_integer(unsigned(
186 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
187 # ));
188 # end;
189 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) bits of ``addr``.

    Selects bits LINE_OFF_BITS (inclusive) up to SET_SIZE_BITS (exclusive).
    """
    index_bits = slice(LINE_OFF_BITS, SET_SIZE_BITS)
    return addr[index_bits]
192
193 # -- Return the cache row index (data memory) for an address
194 # function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
195 # begin
196 # return to_integer(unsigned(
197 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
198 # ));
199 # end;
200 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory row) bits of ``addr``.

    Selects bits ROW_OFF_BITS (inclusive) up to SET_SIZE_BITS (exclusive).
    """
    row_bits = slice(ROW_OFF_BITS, SET_SIZE_BITS)
    return addr[row_bits]
203
204 # -- Return the index of a row within a line
205 # function get_row_of_line(row: row_t) return row_in_line_t is
206 # variable row_v : unsigned(ROW_BITS-1 downto 0);
207 # begin
208 # row_v := to_unsigned(row, ROW_BITS);
209 # return row_v(ROW_LINEBITS-1 downto 0);
210 # end;
211 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of ``row`` within its cache line.

    This is the low ROW_LINE_BITS bits of the row number. The value is
    returned explicitly; without a return the callers would compare or
    subtract from ``None``.
    """
    return row[:ROW_LINE_BITS]
214
215 # -- Returns whether this is the last row of a line
216 # function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t)
217 # return boolean is
218 # begin
219 # return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
220 # end;
221 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Return whether ``addr`` points at row ``last`` of its cache line.

    The row-within-line field of the address is bits ROW_OFF_BITS
    (inclusive) up to LINE_OFF_BITS (exclusive).
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
224
225 # -- Returns whether this is the last row of a line
226 # function is_last_row(row: row_t; last: row_in_line_t) return boolean is
227 # begin
228 # return get_row_of_line(row) = last;
229 # end;
230 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Return whether ``row`` is row ``last`` of its cache line."""
    current = get_row_of_line(row)
    return current == last
233
234 # -- Return the address of the next row in the current cache line
235 # function next_row_addr(addr: wishbone_addr_type)
236 # return std_ulogic_vector is
237 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
238 # variable result : wishbone_addr_type;
239 # begin
240 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
241 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
242 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
243 # result := addr;
244 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
245 # return result;
246 # end;
247 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Return the address of the next row in the current cache line.

    Only the row-within-line field (bits ROW_OFF_BITS up to LINE_OFF_BITS)
    is incremented, and the increment wraps within that field; the bits
    below and above it are passed through unchanged. This keeps the adder
    as narrow as the field, as in the VHDL reference.
    """
    field_width = LINE_OFF_BITS - ROW_OFF_BITS
    # The sum is one bit wider than the field; drop the carry so the
    # increment wraps inside the cache line.
    row_idx = (addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)[:field_width]
    return Cat(addr[:ROW_OFF_BITS], row_idx, addr[LINE_OFF_BITS:])
251
252 # -- Return the next row in the current cache line. We use a dedicated
253 # -- function in order to limit the size of the generated adder to be
254 # -- only the bits within a cache line (3 bits with default settings)
255 # function next_row(row: row_t) return row_t is
256 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
257 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
258 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
259 # begin
260 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
261 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
262 # row_v(ROW_LINEBITS-1 downto 0) :=
263 # std_ulogic_vector(unsigned(row_idx) + 1);
264 # return to_integer(unsigned(row_v));
265 # end;
266 # Return the next row in the current cache line. We use a dedicated
267 # function in order to limit the size of the generated adder to be
268 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row in the current cache line.

    Only the low ROW_LINE_BITS bits (the row-within-line index) are
    incremented, wrapping within the line; the upper bits of the row
    number are unchanged. A dedicated function limits the generated adder
    to the bits within a cache line.
    """
    # Drop the carry bit so the increment wraps inside the line.
    row_idx = (row[:ROW_LINE_BITS] + 1)[:ROW_LINE_BITS]
    return Cat(row_idx, row[ROW_LINE_BITS:])
272
273 # -- Read the instruction word for the given address in the
274 # -- current cache row
275 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
276 # data: cache_row_t) return std_ulogic_vector is
277 # variable word: integer range 0 to INSN_PER_ROW-1;
278 # begin
279 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
280 # return data(31+word*32 downto word*32);
281 # end;
282 # Read the instruction word for the given address
283 # in the current cache row
def read_insn_word(addr, data):
    """Read the instruction word for ``addr`` out of cache row ``data``.

    The word number within the row is address bits 2 (inclusive) up to
    INSN_BITS + 2 (exclusive), matching the VHDL range
    ``INSN_BITS+2-1 downto 2``. The selected 32-bit word is returned.
    """
    word = addr[2:INSN_BITS + 2]
    # ``word`` is a hardware value, so a Python slice cannot be used here;
    # word_select builds the equivalent 32-bit-wide multiplexer.
    return data.word_select(word, 32)
287
288 # -- Get the tag value from the address
289 # function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0))
290 # return cache_tag_t is
291 # begin
292 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
293 # end;
294 # Get the tag value from the address
def get_tag(addr):
    """Return the tag bits of ``addr``.

    Selects bits SET_SIZE_BITS (inclusive) up to REAL_ADDR_BITS (exclusive).
    """
    tag_bits = slice(SET_SIZE_BITS, REAL_ADDR_BITS)
    return addr[tag_bits]
297
298 # -- Read a tag from a tag memory row
299 # function read_tag(way: way_t; tagset: cache_tags_set_t)
300 # return cache_tag_t is
301 # begin
302 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
303 # end;
304 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Return the TAG_BITS-wide tag of ``way`` from tag memory row ``tagset``."""
    low = way * TAG_BITS
    high = low + TAG_BITS
    return tagset[low:high]
307
308 # -- Write a tag to tag memory row
309 # procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
310 # tag: cache_tag_t) is
311 # begin
312 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
313 # end;
314 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return the assignment that stores ``tag`` into ``way`` of ``tagset``.

    A slice of a hardware value cannot be assigned with ``=``; the
    assignment statement is built with ``.eq()`` and returned so that the
    caller can add it to a domain (``sync += write_tag(...)``).

    ``way`` is used in a Python slice, so it is expected to be a Python
    int, as in the per-way loop that calls this function.
    """
    low = way * TAG_BITS
    return tagset[low:low + TAG_BITS].eq(tag)
317
318 # -- Simple hash for direct-mapped TLB index
319 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
320 # return tlb_index_t is
321 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
322 # begin
323 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
324 # xor addr(
325 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
326 # TLB_LG_PGSZ + TLB_BITS
327 # )
328 # xor addr(
329 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
330 # TLB_LG_PGSZ + 2 * TLB_BITS
331 # );
332 # return to_integer(unsigned(hash));
333 # end;
334 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple hash for the direct-mapped TLB index.

    XORs together the three consecutive TLB_BITS-wide fields of ``addr``
    that start at bit TLB_LG_PGSZ, and returns the TLB_BITS-wide result.
    """
    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
    for field in (1, 2):
        low = TLB_LG_PGSZ + field * TLB_BITS
        hsh = hsh ^ addr[low:low + TLB_BITS]
    return hsh
340
341 # -- Generate a cache RAM for each way
342 # rams: for i in 0 to NUM_WAYS-1 generate
343 # signal do_read : std_ulogic;
344 # signal do_write : std_ulogic;
345 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
346 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
347 # signal dout : cache_row_t;
348 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
349 # begin
350 # way: entity work.cache_ram
351 # generic map (
352 # ROW_BITS => ROW_BITS,
353 # WIDTH => ROW_SIZE_BITS
354 # )
355 # port map (
356 # clk => clk,
357 # rd_en => do_read,
358 # rd_addr => rd_addr,
359 # rd_data => dout,
360 # wr_sel => wr_sel,
361 # wr_addr => wr_addr,
362 # wr_data => wishbone_in.dat
363 # );
364 # process(all)
365 # begin
366 # do_read <= not (stall_in or use_previous);
367 # do_write <= '0';
368 # if wishbone_in.ack = '1' and replace_way = i then
369 # do_write <= '1';
370 # end if;
371 # cache_out(i) <= dout;
372 # rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
373 # wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
374 # for i in 0 to ROW_SIZE-1 loop
375 # wr_sel(i) <= do_write;
376 # end loop;
377 # end process;
378 # end generate;
def rams(self, m):
    """Generate a cache RAM for each way and wire up its ports.

    Each way gets its own CacheRam instance and its own set of control
    signals, mirroring the per-iteration signals of the VHDL generate
    loop. A way is written when a wishbone ack arrives and it is the way
    being replaced; all ways are read unless the pipeline is stalled or
    the previous row is being reused.

    NOTE(review): ``use_previous``, ``replace_way``, ``cache_out``,
    ``req_row`` and ``r`` are not defined in this method; they are assumed
    to be provided by the enclosing scope -- confirm.
    """
    comb = m.d.comb
    wb_in = self.wb_in
    stall_in = self.stall_in

    for i in range(NUM_WAYS):
        # Per-way signals: sharing one set across all ways would give
        # do_write/wr_sel several conflicting drivers.
        do_read = Signal(name=f"do_read_{i}")
        do_write = Signal(name=f"do_write_{i}")
        rd_addr = Signal(ROW_BITS, name=f"rd_addr_{i}")
        wr_addr = Signal(ROW_BITS, name=f"wr_addr_{i}")
        d_out = Signal(ROW_SIZE_BITS, name=f"d_out_{i}")
        wr_sel = Signal(ROW_SIZE, name=f"wr_sel_{i}")

        way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
        # The RAM must be registered as a submodule to be elaborated.
        setattr(m.submodules, f"cache_ram_{i}", way)

        comb += way.rd_en.eq(do_read)
        comb += way.rd_addr.eq(rd_addr)
        # rd_data is the RAM's output: it drives d_out, not the reverse.
        comb += d_out.eq(way.rd_data)
        comb += way.wr_sel.eq(wr_sel)
        comb += way.wr_addr.eq(wr_addr)
        comb += way.wr_data.eq(wb_in.dat)

        comb += do_read.eq(~(stall_in | use_previous))
        comb += do_write.eq(0)

        with m.If(wb_in.ack & (replace_way == i)):
            comb += do_write.eq(1)

        comb += cache_out[i].eq(d_out)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)
        # All byte lanes of the row are written together.
        for j in range(ROW_SIZE):
            comb += wr_sel[j].eq(do_write)
409
410 # -- Generate PLRUs
411 # maybe_plrus: if NUM_WAYS > 1 generate
412 # begin
413 # plrus: for i in 0 to NUM_LINES-1 generate
414 # -- PLRU interface
415 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
416 # signal plru_acc_en : std_ulogic;
417 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
418 #
419 # begin
420 # plru : entity work.plru
421 # generic map (
422 # BITS => WAY_BITS
423 # )
424 # port map (
425 # clk => clk,
426 # rst => rst,
427 # acc => plru_acc,
428 # acc_en => plru_acc_en,
429 # lru => plru_out
430 # );
431 #
432 # process(all)
433 # begin
434 # -- PLRU interface
435 # if get_index(r.hit_nia) = i then
436 # plru_acc_en <= r.hit_valid;
437 # else
438 # plru_acc_en <= '0';
439 # end if;
440 # plru_acc <=
441 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
442 # plru_victim(i) <= plru_out;
443 # end process;
444 # end generate;
445 # end generate;
def maybe_plrus(self, m):
    """Generate one PLRU instance per cache line (only if NUM_WAYS > 1).

    Each PLRU is told about an access when the latched hit falls on its
    line, and its ``lru`` output is exposed as the victim way for that
    line.

    NOTE(review): ``r`` and ``plru_victim`` are not defined in this
    method; they are assumed to be provided by the enclosing scope --
    confirm.
    """
    comb = m.d.comb

    # NUM_WAYS is a build-time constant, so this is a plain Python test
    # (the VHDL uses an "if ... generate"), not a hardware condition.
    if NUM_WAYS <= 1:
        return

    for i in range(NUM_LINES):
        plru = PLRU(WAY_BITS)
        # The PLRU must be registered as a submodule to be elaborated.
        setattr(m.submodules, f"plru_{i}", plru)

        # PLRU interface
        with m.If(get_index(r.hit_nia) == i):
            comb += plru.acc_en.eq(r.hit_valid)

        with m.Else():
            comb += plru.acc_en.eq(0)

        comb += plru.acc.eq(r.hit_way)
        # lru is the PLRU's output; it must only be read here.
        comb += plru_victim[i].eq(plru.lru)
468
469 # -- TLB hit detection and real address generation
470 # itlb_lookup : process(all)
471 # variable pte : tlb_pte_t;
472 # variable ttag : tlb_tag_t;
473 # begin
474 # tlb_req_index <= hash_ea(i_in.nia);
475 # pte := itlb_ptes(tlb_req_index);
476 # ttag := itlb_tags(tlb_req_index);
477 # if i_in.virt_mode = '1' then
478 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
479 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
480 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
481 # ra_valid <= itlb_valids(tlb_req_index);
482 # else
483 # ra_valid <= '0';
484 # end if;
485 # eaa_priv <= pte(3);
486 # else
487 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
488 # ra_valid <= '1';
489 # eaa_priv <= '1';
490 # end if;
491 #
492 # -- no IAMR, so no KUEP support for now
493 # priv_fault <= eaa_priv and not i_in.priv_mode;
494 # access_ok <= ra_valid and not priv_fault;
495 # end process;
496 # TLB hit detection and real address generation
def itlb_lookup(self, m):
    """TLB hit detection and real address generation.

    In virtual mode the real address is the page number taken from the
    PTE combined with the page offset of the fetch address, and it is
    valid only if the TLB tag matches and the entry is valid. In real
    mode the fetch address is used directly. A privilege fault is raised
    when the page is privileged and the fetch is not in privileged mode.

    NOTE(review): ``tlb_req_index``, ``itlb_ptes``, ``itlb_tags``,
    ``itlb_valid_bits``, ``real_addr``, ``ra_valid``, ``eaa_priv``,
    ``priv_fault`` and ``access_ok`` are not defined in this method; they
    are assumed to be provided by the enclosing scope, with the itlb_*
    tables indexable by a Signal -- confirm.
    """
    comb = m.d.comb
    i_in = self.i_in

    comb += tlb_req_index.eq(hash_ea(i_in.nia))
    # VHDL variables: plain Python names for the selected table entries.
    pte = itlb_ptes[tlb_req_index]
    ttag = itlb_tags[tlb_req_index]

    with m.If(i_in.virt_mode):
        # Cat is LSB first: page offset, then the real page number.
        comb += real_addr.eq(Cat(
            i_in.nia[:TLB_LG_PGSZ],
            pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
        ))

        with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
            comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

        with m.Else():
            comb += ra_valid.eq(0)

        # eaa_priv <= pte(3)
        comb += eaa_priv.eq(pte[3])

    with m.Else():
        comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
        comb += ra_valid.eq(1)
        comb += eaa_priv.eq(1)

    # No IAMR, so no KUEP support for now
    comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
    comb += access_ok.eq(ra_valid & ~priv_fault)
524
525 # -- iTLB update
526 # itlb_update: process(clk)
527 # variable wr_index : tlb_index_t;
528 # begin
529 # if rising_edge(clk) then
530 # wr_index := hash_ea(m_in.addr);
531 # if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
532 # -- clear all valid bits
533 # for i in tlb_index_t loop
534 # itlb_valids(i) <= '0';
535 # end loop;
536 # elsif m_in.tlbie = '1' then
537 # -- clear entry regardless of hit or miss
538 # itlb_valids(wr_index) <= '0';
539 # elsif m_in.tlbld = '1' then
540 # itlb_tags(wr_index) <= m_in.addr(
541 # 63 downto TLB_LG_PGSZ + TLB_BITS
542 # );
543 # itlb_ptes(wr_index) <= m_in.pte;
544 # itlb_valids(wr_index) <= '1';
545 # end if;
546 # end if;
547 # end process;
548 # iTLB update
def itlb_update(self, m):
    """iTLB update: handle invalidations and loads requested by the MMU.

    On reset or a "tlbie all" every valid bit is cleared; a plain tlbie
    clears the addressed entry regardless of hit or miss; a tlbld writes
    the tag and PTE of the addressed entry and marks it valid.

    NOTE(review): ``itlb_valid_bits``, ``itlb_tags`` and ``itlb_ptes`` are
    not defined in this method; they are assumed to be provided by the
    enclosing scope and indexable by a Signal -- confirm.
    """
    comb = m.d.comb
    sync = m.d.sync
    m_in = self.m_in

    # wr_index is a VHDL variable, i.e. it takes its value immediately;
    # it is therefore combinatorial. Registering it would make the update
    # use the index of the previous cycle's request.
    wr_index = Signal(TLB_BITS)
    comb += wr_index.eq(hash_ea(m_in.addr))

    with m.If(ResetSignal() | (m_in.tlbie & m_in.doall)):
        # Clear all valid bits
        for i in range(TLB_SIZE):
            sync += itlb_valid_bits[i].eq(0)

    with m.Elif(m_in.tlbie):
        # Clear entry regardless of hit or miss
        sync += itlb_valid_bits[wr_index].eq(0)

    with m.Elif(m_in.tlbld):
        sync += itlb_tags[wr_index].eq(
            m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
        )
        sync += itlb_ptes[wr_index].eq(m_in.pte)
        sync += itlb_valid_bits[wr_index].eq(1)
570
571 # -- Cache hit detection, output to fetch2 and other misc logic
572 # icache_comb : process(all)
573 # Cache hit detection, output to fetch2 and other misc logic
def icache_comb(self, m):
    """Cache hit detection, output to fetch2 and other misc logic.

    Combinatorial process: decides whether last cycle's row can be
    reused, splits the request into index/row/tag, tests every way for a
    hit, derives the hit/miss strobes and the replacement way, and drives
    the instruction output, the stall output and the wishbone output.

    NOTE(review): ``r``, ``use_previous``, ``req_index``, ``req_row``,
    ``req_tag``, ``req_laddr``, ``real_addr``, ``access_ok``,
    ``cache_valid_bits``, ``cache_tags``, ``cache_out``, ``plru_victim``,
    ``req_is_hit``, ``req_is_miss``, ``req_hit_way`` and ``replace_way``
    are not defined in this method; they are assumed to be provided by
    the enclosing scope -- confirm.
    """
    # variable is_hit  : std_ulogic;
    # variable hit_way : way_t;
    comb = m.d.comb

    i_in = self.i_in
    i_out = self.i_out
    flush_in = self.flush_in
    stall_out = self.stall_out
    wb_out = self.wb_out

    is_hit = Signal()
    hit_way = Signal(NUM_WAYS)

    # begin
    #     -- i_in.sequential means that i_in.nia this cycle is 4 more than
    #     -- last cycle.  If we read more than 32 bits at a time, had a
    #     -- cache hit last cycle, and we don't want the first 32-bit chunk
    #     -- then we can keep the data we read last cycle and just use that.
    #     if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
    #         use_previous <= i_in.sequential and r.hit_valid;
    #     else
    #         use_previous <= '0';
    #     end if;
    # i_in.sequential means that i_in.nia this cycle is 4 more than
    # last cycle.  If we read more than 32 bits at a time, had a
    # cache hit last cycle, and we don't want the first 32-bit chunk
    # then we can keep the data we read last cycle and just use that.
    with m.If(i_in.nia[2:INSN_BITS + 2] != 0):
        comb += use_previous.eq(i_in.sequential & r.hit_valid)

    with m.Else():
        comb += use_previous.eq(0)

    #     -- Extract line, row and tag from request
    #     req_index <= get_index(i_in.nia);
    #     req_row <= get_row(i_in.nia);
    #     req_tag <= get_tag(real_addr);
    # Extract line, row and tag from request
    comb += req_index.eq(get_index(i_in.nia))
    comb += req_row.eq(get_row(i_in.nia))
    comb += req_tag.eq(get_tag(real_addr))

    #     -- Calculate address of beginning of cache row, will be
    #     -- used for cache miss processing if needed
    #     req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
    #                  real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
    #                  (ROW_OFF_BITS-1 downto 0 => '0');
    # Calculate address of beginning of cache row, will be
    # used for cache miss processing if needed.
    # Cat is LSB first: zeroed row offset, then the real address bits.
    # The bits above REAL_ADDR_BITS are left to the zero-extension done
    # by .eq(), so no explicit upper padding is needed.
    comb += req_laddr.eq(Cat(
        Const(0, ROW_OFF_BITS),
        real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]
    ))

    #     -- Test if pending request is a hit on any way
    #     hit_way := 0;
    #     is_hit := '0';
    #     for i in way_t loop
    #         if i_in.req = '1' and
    #             (cache_valids(req_index)(i) = '1' or
    #              (r.state = WAIT_ACK and
    #               req_index = r.store_index and
    #               i = r.store_way and
    #               r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
    #             if read_tag(i, cache_tags(req_index)) = req_tag then
    #                 hit_way := i;
    #                 is_hit := '1';
    #             end if;
    #         end if;
    #     end loop;
    # Test if pending request is a hit on any way.
    # A way also counts as present while it is being reloaded, provided
    # the requested row of the line has already arrived.
    for i in range(NUM_WAYS):
        reloading_row_valid = ((r.state == State.WAIT_ACK)
                               & (req_index == r.store_index)
                               & (r.store_way == i)
                               & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req
                  & (cache_valid_bits[req_index][i] | reloading_row_valid)):
            with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

    #     -- Generate the "hit" and "miss" signals for the synchronous blocks
    #     if i_in.req = '1' and access_ok = '1' and flush_in = '0'
    #         and rst = '0' then
    #         req_is_hit  <= is_hit;
    #         req_is_miss <= not is_hit;
    #     else
    #         req_is_hit  <= '0';
    #         req_is_miss <= '0';
    #     end if;
    #     req_hit_way <= hit_way;
    # Generate the "hit" and "miss" signals for the synchronous blocks
    with m.If(i_in.req & access_ok & ~flush_in & ~ResetSignal()):
        comb += req_is_hit.eq(is_hit)
        comb += req_is_miss.eq(~is_hit)

    with m.Else():
        comb += req_is_hit.eq(0)
        comb += req_is_miss.eq(0)

    comb += req_hit_way.eq(hit_way)

    #     -- The way to replace on a miss
    #     if r.state = CLR_TAG then
    #         replace_way <= to_integer(unsigned(plru_victim(r.store_index)));
    #     else
    #         replace_way <= r.store_way;
    #     end if;
    # The way to replace on a miss
    with m.If(r.state == State.CLR_TAG):
        comb += replace_way.eq(plru_victim[r.store_index])

    with m.Else():
        comb += replace_way.eq(r.store_way)

    #     -- Output instruction from current cache row
    #     --
    #     -- Note: This is a mild violation of our design principle of
    #     -- having pipeline stages output from a clean latch. In this
    #     -- case we output the result of a mux. The alternative would
    #     -- be output an entire row which I prefer not to do just yet
    #     -- as it would force fetch2 to know about some of the cache
    #     -- geometry information.
    #     i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
    #     i_out.valid <= r.hit_valid;
    #     i_out.nia <= r.hit_nia;
    #     i_out.stop_mark <= r.hit_smark;
    #     i_out.fetch_failed <= r.fetch_failed;
    # Output instruction from current cache row
    #
    # Note: This is a mild violation of our design principle of
    # having pipeline stages output from a clean latch. In this
    # case we output the result of a mux. The alternative would
    # be output an entire row which I prefer not to do just yet
    # as it would force fetch2 to know about some of the cache
    # geometry information.
    comb += i_out.insn.eq(
        read_insn_word(r.hit_nia, cache_out[r.hit_way])
    )
    comb += i_out.valid.eq(r.hit_valid)
    comb += i_out.nia.eq(r.hit_nia)
    comb += i_out.stop_mark.eq(r.hit_smark)
    comb += i_out.fetch_failed.eq(r.fetch_failed)

    #     -- Stall fetch1 if we have a miss on cache or TLB
    #     -- or a protection fault
    #     stall_out <= not (is_hit and access_ok);
    # Stall fetch1 if we have a miss on cache or TLB
    # or a protection fault
    comb += stall_out.eq(~(is_hit & access_ok))

    #     -- Wishbone requests output (from the cache miss reload machine)
    #     wishbone_out <= r.wb;
    # Wishbone requests output (from the cache miss reload machine)
    comb += wb_out.eq(r.wb)
    # end process;
724
725 # -- Cache hit synchronous machine
726 # icache_hit : process(clk)
727 # Cache hit synchronous machine
def icache_hit(self, m):
    """Cache hit synchronous machine.

    Latches the hit information for the cycle in which the BRAM data
    becomes available. On a stall, or when last cycle's row is reused,
    the outputs are held, except that reset or flush clears the valid
    flag. The stop mark and NIA are passed down whenever not stalled.

    NOTE(review): ``r``, ``use_previous``, ``req_is_hit``,
    ``req_hit_way``, ``req_index``, ``req_tag`` and ``real_addr`` are not
    defined in this method; they are assumed to be provided by the
    enclosing scope -- confirm.
    """
    sync = m.d.sync

    i_in = self.i_in
    stall_in = self.stall_in
    flush_in = self.flush_in

    # begin
    #     if rising_edge(clk) then
    #         -- keep outputs to fetch2 unchanged on a stall
    #         -- except that flush or reset sets valid to 0
    #         -- If use_previous, keep the same data as last
    #         -- cycle and use the second half
    #         if stall_in = '1' or use_previous = '1' then
    #             if rst = '1' or flush_in = '1' then
    #                 r.hit_valid <= '0';
    #             end if;
    # keep outputs to fetch2 unchanged on a stall
    # except that flush or reset sets valid to 0
    # If use_previous, keep the same data as last
    # cycle and use the second half
    with m.If(stall_in | use_previous):
        with m.If(ResetSignal() | flush_in):
            sync += r.hit_valid.eq(0)
    #         else
    #             -- On a hit, latch the request for the next cycle,
    #             -- when the BRAM data will be available on the
    #             -- cache_out output of the corresponding way
    #             r.hit_valid <= req_is_hit;
    #             if req_is_hit = '1' then
    #                 r.hit_way <= req_hit_way;
    with m.Else():
        # On a hit, latch the request for the next cycle,
        # when the BRAM data will be available on the
        # cache_out output of the corresponding way
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)

            #     report "cache hit nia:" & to_hstring(i_in.nia) &
            #         " IR:" & std_ulogic'image(i_in.virt_mode) &
            #         " SM:" & std_ulogic'image(i_in.stop_mark) &
            #         " idx:" & integer'image(req_index) &
            #         " tag:" & to_hstring(req_tag) &
            #         " way:" & integer'image(req_hit_way) &
            #         " RA:" & to_hstring(real_addr);
            # NOTE(review): this print runs once, while the design is
            # being built, and shows the signal objects rather than
            # their simulated values; it is not a per-hit report.
            print(f"cache hit nia:{i_in.nia}, IR:{i_in.virt_mode}, "
                  f"SM:{i_in.stop_mark}, idx:{req_index}, "
                  f"tag:{req_tag}, way:{req_hit_way}, RA:{real_addr}")
    #             end if;
    #         end if;
    #         if stall_in = '0' then
    #             -- Send stop marks and NIA down regardless of validity
    #             r.hit_smark <= i_in.stop_mark;
    #             r.hit_nia <= i_in.nia;
    #         end if;
    with m.If(~stall_in):
        # Send stop marks and NIA down regardless of validity
        sync += r.hit_smark.eq(i_in.stop_mark)
        sync += r.hit_nia.eq(i_in.nia)
    #     end if;
    # end process;
786
787 # -- Cache miss/reload synchronous machine
788 # icache_miss : process(clk)
789 # Cache miss/reload synchronous machine
790 def icache_miss(self, m):
791 comb = m.d.comb
792 sync = m.d.sync
793
794 # variable tagset : cache_tags_set_t;
795 # variable stbs_done : boolean;
796
797 tagset = Signal(TAG_RAM_WIDTH)
798 stbs_done = Signal()
799
800 # begin
801 # if rising_edge(clk) then
802 # -- On reset, clear all valid bits to force misses
803 # if rst = '1' then
804 # On reset, clear all valid bits to force misses
805 with m.If('''TODO rst nmigen'''):
806 # for i in index_t loop
807 # cache_valids(i) <= (others => '0');
808 # end loop;
809 for i in Signal(NUM_LINES):
810 sync += cache_valid_bits[i].eq(~1)
811
812 # r.state <= IDLE;
813 # r.wb.cyc <= '0';
814 # r.wb.stb <= '0';
815 sync += r.state.eq(State.IDLE)
816 sync += r.wb.cyc.eq(0)
817 sync += r.wb.stb.eq(0)
818
819 # -- We only ever do reads on wishbone
820 # r.wb.dat <= (others => '0');
821 # r.wb.sel <= "11111111";
822 # r.wb.we <= '0';
823 # We only ever do reads on wishbone
824 sync += r.wb.dat.eq(~1)
825 sync += r.wb.sel.eq(Const(0b11111111, 8))
826 sync += r.wb.we.eq(0)
827
828 # -- Not useful normally but helps avoiding tons of sim warnings
829 # r.wb.adr <= (others => '0');
830 # Not useful normally but helps avoiding tons of sim warnings
831 sync += r.wb.adr.eq(~1)
832
833 # else
834 with m.Else():
835 # -- Process cache invalidations
836 # if inval_in = '1' then
837 # for i in index_t loop
838 # cache_valids(i) <= (others => '0');
839 # end loop;
840 # r.store_valid <= '0';
841 # end if;
842 # Process cache invalidations
843 with m.If(inval_in):
844 for i in range(NUM_LINES):
845 sync += cache_valid_bits[i].eq(~1)
846
847 sync += r.store_valid.eq(0)
848
849 # -- Main state machine
850 # case r.state is
851 # Main state machine
852 with m.Switch(r.state):
853
854 # when IDLE =>
855 with m.Case(State.IDLE):
856 # -- Reset per-row valid flags, only used in WAIT_ACK
857 # for i in 0 to ROW_PER_LINE - 1 loop
858 # r.rows_valid(i) <= '0';
859 # end loop;
860 # Reset per-row valid flags, only used in WAIT_ACK
861 for i in range(ROW_PER_LINE):
862 sync += r.rows_valid[i].eq(0)
863
864 # -- We need to read a cache line
865 # if req_is_miss = '1' then
866 # report "cache miss nia:" & to_hstring(i_in.nia) &
867 # " IR:" & std_ulogic'image(i_in.virt_mode) &
868 # " SM:" & std_ulogic'image(i_in.stop_mark) &
869 # " idx:" & integer'image(req_index) &
870 # " way:" & integer'image(replace_way) &
871 # " tag:" & to_hstring(req_tag) &
872 # " RA:" & to_hstring(real_addr);
873 # We need to read a cache line
874 with m.If(req_is_miss):
875 print(f"cache miss nia:{i_in.nia} " \
876 f"IR:{i_in.virt_mode} " \
877 f"SM:{i_in.stop_mark} idx:{req_index} " \
878 f"way:{replace_way} tag:{req_tag} " \
879 f"RA:{real_addr}")
880
881 # -- Keep track of our index and way for
882 # -- subsequent stores
883 # r.store_index <= req_index;
884 # r.store_row <= get_row(req_laddr);
885 # r.store_tag <= req_tag;
886 # r.store_valid <= '1';
887 # r.end_row_ix <=
888 # get_row_of_line(get_row(req_laddr)) - 1;
889 # Keep track of our index and way
890 # for subsequent stores
891 sync += r.store_index.eq(req_index)
892 sync += r.store_row.eq(get_row(req_laddr))
893 sync += r.store_tag.eq(req_tag)
894 sync += r.store_valid.eq(1)
895 sync += r.end_row_ix.eq(
896 get_row_of_line(get_row(req_laddr)) - 1
897 )
898
899 # -- Prep for first wishbone read. We calculate the
900 # -- address of the start of the cache line and
901 # -- start the WB cycle.
902 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
903 # r.wb.cyc <= '1';
904 # r.wb.stb <= '1';
905 # Prep for first wishbone read. We calculate the
906 # address of the start of the cache line and
907 # start the WB cycle.
908 sync += r.wb.adr.eq(
909 req_laddr[:r.wb.adr '''left?''']
910 )
911
912 # -- Track that we had one request sent
913 # r.state <= CLR_TAG;
914 # Track that we had one request sent
915 sync += r.state.eq(State.CLR_TAG)
916 # end if;
917
918 # when CLR_TAG | WAIT_ACK =>
919 with m.Case(State.CLR_TAG, State.WAIT_ACK):
920 # if r.state = CLR_TAG then
921 with m.If(r.state == State.CLR_TAG):
922 # -- Get victim way from plru
923 # r.store_way <= replace_way;
924 # Get victim way from plru
925 sync += r.store_way.eq(replace_way)
926 #
927 # -- Force misses on that way while reloading that line
928 # cache_valids(req_index)(replace_way) <= '0';
929 # Force misses on that way while
930 # reloading that line
931 sync += cache_valid_bits[
932 req_index
933 ][replace_way].eq(0)
934
935 # -- Store new tag in selected way
936 # for i in 0 to NUM_WAYS-1 loop
937 # if i = replace_way then
938 # tagset := cache_tags(r.store_index);
939 # write_tag(i, tagset, r.store_tag);
940 # cache_tags(r.store_index) <= tagset;
941 # end if;
942 # end loop;
943 for i in range(NUM_WAYS):
944 with m.If(i == replace_way):
945 comb += tagset.eq(
946 cache_tags[r.store_index]
947 )
948 sync += write_tag(i, tagset, r.store_tag)
949 sync += cache_tags(r.store_index).eq(
950 tagset
951 )
952
953 # r.state <= WAIT_ACK;
954 sync += r.state.eq(State.WAIT_ACK)
955 # end if;
956
957 # -- Requests are all sent if stb is 0
958 # stbs_done := r.wb.stb = '0';
959 # Requests are all sent if stb is 0
960 comb += stbs_done.eq(r.wb.stb == 0)
961
962 # -- If we are still sending requests, was one accepted ?
963 # if wishbone_in.stall = '0' and not stbs_done then
964 # If we are still sending requests, was one accepted?
965 with m.If(~wb_in.stall & ~stbs_done):
966 # -- That was the last word ? We are done sending.
967 # -- Clear stb and set stbs_done so we can handle
968 # -- an eventual last ack on the same cycle.
969 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
970 # r.wb.stb <= '0';
971 # stbs_done := true;
972 # end if;
973 # That was the last word ? We are done sending.
974 # Clear stb and set stbs_done so we can handle
975 # an eventual last ack on the same cycle.
976 with m.If(is_last_row_addr(
977 r.wb.adr, r.end_row_ix)):
978 sync += r.wb.stb.eq(0)
979 stbs_done.eq(1)
980
981 # -- Calculate the next row address
982 # r.wb.adr <= next_row_addr(r.wb.adr);
983 # Calculate the next row address
984 sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
985 # end if;
986
987 # -- Incoming acks processing
988 # if wishbone_in.ack = '1' then
989 # Incoming acks processing
990 with m.If(wb_in.ack):
991 # r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
992 sync += r.rows_valid[
993 r.store_row & ROW_PER_LINE
994 ].eq(1)
995
996 # -- Check for completion
997 # if stbs_done and
998 # is_last_row(r.store_row, r.end_row_ix) then
999 # Check for completion
1000 with m.If(stbs_done & is_last_row(
1001 r.store_row, r.end_row_ix)):
1002 # -- Complete wishbone cycle
1003 # r.wb.cyc <= '0';
1004 # Complete wishbone cycle
1005 sync += r.wb.cyc.eq(0)
1006
1007 # -- Cache line is now valid
1008 # cache_valids(r.store_index)(replace_way) <=
1009 # r.store_valid and not inval_in;
1010 # Cache line is now valid
1011 sync += cache_valid_bits[
1012 r.store_index
1013 ][relace_way].eq(
1014 r.store_valid & ~inval_in
1015 )
1016
1017 # -- We are done
1018 # r.state <= IDLE;
1019 # We are done
1020 sync += r.state.eq(State.IDLE)
1021 # end if;
1022
1023 # -- Increment store row counter
1024 # r.store_row <= next_row(r.store_row);
1025 # Increment store row counter
1026 sync += store_row.eq(next_row(r.store_row))
1027 # end if;
1028 # end case;
1029 # end if;
1030 #
1031 # -- TLB miss and protection fault processing
1032 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1033 # r.fetch_failed <= '0';
1034 # elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
1035 # r.fetch_failed <= '1';
1036 # end if;
1037 # TLB miss and protection fault processing
1038 with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
1039 sync += r.fetch_failed.eq(0)
1040
1041 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1042 sync += r.fetch_failed.eq(1)
1043 # end if;
1044 # end process;
1045
1046 # icache_log: if LOG_LENGTH > 0 generate
def icache_log(self, m, log_out):
    """Drive ``log_out`` with a registered 54-bit snapshot of icache state.

    Translation of the microwatt ``icache_log`` generate block
    (``if LOG_LENGTH > 0 generate``): nothing is added to the module
    unless LOG_LENGTH is greater than zero.

    m       -- module under construction; statements are added to its
               comb and sync domains
    log_out -- signal that is combinatorially driven from the latched
               log word

    NOTE(review): r, req_hit_way, req_is_hit, req_is_miss, access_ok,
    ra_valid, stall_out, wb_in and i_out are neither parameters nor
    locals of this method. They must be reachable from the enclosing
    scope for this to elaborate -- confirm how they are meant to be
    passed in.
    """
    comb = m.d.comb
    sync = m.d.sync

    # icache_log: if LOG_LENGTH > 0 generate
    # The logger is generated once or not at all; it is not replicated
    # LOG_LENGTH times.
    if LOG_LENGTH <= 0:
        return

    # signal log_data : std_ulogic_vector(53 downto 0);
    log_data = Signal(54)
    # The VHDL packs the way as std_ulogic_vector(to_unsigned(lway, 3)),
    # so the way field is declared 3 bits wide here; the assignment
    # from req_hit_way truncates or zero-extends to that width.
    lway = Signal(3)
    wstate = Signal()

    # lway and wstate are process variables in the VHDL, hence comb.
    comb += lway.eq(req_hit_way)
    comb += wstate.eq(0)
    # wstate flags that the reload state machine is not IDLE
    with m.If(r.state != State.IDLE):
        comb += wstate.eq(1)

    # VHDL layout, most significant field first:
    #   i_out.valid & i_out.insn & wishbone_in.ack &
    #   r.wb.adr(5 downto 3) & r.wb.stb & r.wb.cyc &
    #   wishbone_in.stall & stall_out & r.fetch_failed &
    #   r.hit_nia(5 downto 2) & wstate & lway(3 bits) &
    #   req_is_hit & req_is_miss & access_ok & ra_valid
    # Cat() takes its first argument as the least significant bits,
    # so the fields are listed in the reverse order.
    # (the 54-bit total presumably relies on i_out.insn being 32 bits
    # wide -- TODO confirm against ICacheToDecode1Type)
    sync += log_data.eq(Cat(
        ra_valid, access_ok, req_is_miss, req_is_hit,
        lway, wstate, r.hit_nia[2:6],
        r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
        r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
        i_out.valid
    ))

    # log_out <= log_data;
    comb += log_out.eq(log_data)
1104
1105 def elaborate(self, platform):
1106 # architecture rtl of icache is
1107 # constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
1108 # -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
1109 # constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
1110 # -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
1111 # -- icache
1112 # constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
1113 # -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
1114 # constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
1115 # -- Bit fields counts in the address
1116 #
1117 # -- INSN_BITS is the number of bits to select an instruction in a row
1118 # constant INSN_BITS : natural := log2(INSN_PER_ROW);
1119 # -- ROW_BITS is the number of bits to select a row
1120 # constant ROW_BITS : natural := log2(BRAM_ROWS);
1121 # -- ROW_LINEBITS is the number of bits to select a row within a line
1122 # constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
1123 # -- LINE_OFF_BITS is the number of bits for the offset in a cache line
1124 # constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
1125 # -- ROW_OFF_BITS is the number of bits for the offset in a row
1126 # constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
1127 # -- INDEX_BITS is the number of bits to select a cache line
1128 # constant INDEX_BITS : natural := log2(NUM_LINES);
1129 # -- SET_SIZE_BITS is the log base 2 of the set size
1130 # constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
1131 # -- TAG_BITS is the number of bits of the tag part of the address
1132 # constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
1133 # -- WAY_BITS is the number of bits to select a way
1134 # constant WAY_BITS : natural := log2(NUM_WAYS);
1135
1136 ROW_SIZE_BITS = ROW_SIZE * 8
1137 # ROW_PER_LINE is the number of row
1138 # (wishbone) transactions in a line
1139 ROW_PER_LINE = LINE_SIZE / ROW_SIZE
1140 # BRAM_ROWS is the number of rows in
1141 # BRAM needed to represent the full icache
1142 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
1143 # INSN_PER_ROW is the number of 32bit
1144 # instructions per BRAM row
1145 INSN_PER_ROW = ROW_SIZE_BITS / 32
1146
1147 # Bit fields counts in the address
1148 #
1149 # INSN_BITS is the number of bits to
1150 # select an instruction in a row
1151 INSN_BITS = log2_int(INSN_PER_ROW)
1152 # ROW_BITS is the number of bits to
1153 # select a row
1154 ROW_BITS = log2_int(BRAM_ROWS)
1155 # ROW_LINEBITS is the number of bits to
1156 # select a row within a line
1157 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
1158 # LINE_OFF_BITS is the number of bits for
1159 # the offset in a cache line
1160 LINE_OFF_BITS = log2_int(LINE_SIZE)
1161 # ROW_OFF_BITS is the number of bits for
1162 # the offset in a row
1163 ROW_OFF_BITS = log2_int(ROW_SIZE)
1164 # INDEX_BITS is the number of bits to
1165 # select a cache line
1166 INDEX_BITS = log2_int(NUM_LINES)
1167 # SET_SIZE_BITS is the log base 2 of
1168 # the set size
1169 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
1170 # TAG_BITS is the number of bits of
1171 # the tag part of the address
1172 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
1173 # WAY_BITS is the number of bits to
1174 # select a way
1175 WAY_BITS = log2_int(NUM_WAYS)
1176 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
1177
1178 # -- Example of layout for 32 lines of 64 bytes:
1179 # --
1180 # -- .. tag |index| line |
1181 # -- .. | row | |
1182 # -- .. | | | |00| zero (2)
1183 # -- .. | | |-| | INSN_BITS (1)
1184 # -- .. | |---| | ROW_LINEBITS (3)
1185 # -- .. | |--- - --| LINE_OFF_BITS (6)
1186 # -- .. | |- --| ROW_OFF_BITS (3)
1187 # -- .. |----- ---| | ROW_BITS (8)
1188 # -- .. |-----| | INDEX_BITS (5)
1189 # -- .. --------| | TAG_BITS (53)
1190 # Example of layout for 32 lines of 64 bytes:
1191 #
1192 # .. tag |index| line |
1193 # .. | row | |
1194 # .. | | | |00| zero (2)
1195 # .. | | |-| | INSN_BITS (1)
1196 # .. | |---| | ROW_LINEBITS (3)
1197 # .. | |--- - --| LINE_OFF_BITS (6)
1198 # .. | |- --| ROW_OFF_BITS (3)
1199 # .. |----- ---| | ROW_BITS (8)
1200 # .. |-----| | INDEX_BITS (5)
1201 # .. --------| | TAG_BITS (53)
1202
1203 # subtype row_t is integer range 0 to BRAM_ROWS-1;
1204 # subtype index_t is integer range 0 to NUM_LINES-1;
1205 # subtype way_t is integer range 0 to NUM_WAYS-1;
1206 # subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
1207 #
1208 # -- The cache data BRAM organized as described above for each way
1209 # subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
1210 #
1211 # -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
1212 # -- not handle a clean (commented) definition of the cache tags as a 3d
1213 # -- memory. For now, work around it by putting all the tags
1214 # subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
1215 # -- type cache_tags_set_t is array(way_t) of cache_tag_t;
1216 # -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
1217 # constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
1218 # subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
1219 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """Return the tag storage: NUM_LINES signals of TAG_RAM_WIDTH bits.

    Each entry holds the tags of all ways of one line packed side by
    side (TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS).
    """
    tag_sets = [Signal(TAG_RAM_WIDTH) for _ in range(NUM_LINES)]
    return Array(tag_sets)
1222
1223 # -- The cache valid bits
1224 # subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
1225 # type cache_valids_t is array(index_t) of cache_way_valids_t;
1226 # type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Return the cache valid bits: one NUM_WAYS-wide signal per line.

    Mirrors the VHDL types
        subtype cache_way_valids_t is
            std_ulogic_vector(NUM_WAYS-1 downto 0);
        type cache_valids_t is array(index_t) of cache_way_valids_t;
    so that the result can be indexed as
    cache_valid_bits[line_index][way], which is how the reload state
    machine uses it.
    """
    # One entry per cache line (index_t is 0 .. NUM_LINES-1), one bit
    # per way inside each entry.
    return Array(Signal(NUM_WAYS) for _ in range(NUM_LINES))
1229
def RowPerLineValidArray():
    """Return one 1-bit valid flag per row of a cache line.

    Mirrors the VHDL type
        type row_per_line_valid_t is
            array(0 to ROW_PER_LINE - 1) of std_ulogic;
    """
    return Array(Signal() for _ in range(ROW_PER_LINE))
1232
1233 # -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1234 # signal cache_tags : cache_tags_array_t;
1235 # signal cache_valids : cache_valids_t;
1236 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1237 cache_tags = CacheTagArray()
1238 cache_valid_bits = CacheValidBitsArray()
1239
1240 # attribute ram_style : string;
1241 # attribute ram_style of cache_tags : signal is "distributed";
1242 # TODO to be passed to nmigen as ram attributes
1243 # attribute ram_style : string;
1244 # attribute ram_style of cache_tags : signal is "distributed";
1245
1246 # -- L1 ITLB.
1247 # constant TLB_BITS : natural := log2(TLB_SIZE);
1248 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
1249 # constant TLB_PTE_BITS : natural := 64;
1250 TLB_BITS = log2_int(TLB_SIZE)
1251 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
1252 TLB_PTE_BITS = 64
1253
1254 # subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
1255 # type tlb_valids_t is array(tlb_index_t) of std_ulogic;
1256 # subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
1257 # type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
1258 # subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
1259 # type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Return one 1-bit valid flag per TLB entry (TLB_SIZE entries)."""
    valid_flags = [Signal() for _ in range(TLB_SIZE)]
    return Array(valid_flags)
1262
def TLBTagArray():
    """Return TLB_SIZE tag signals, each TLB_EA_TAG_BITS wide."""
    ea_tags = [Signal(TLB_EA_TAG_BITS) for _ in range(TLB_SIZE)]
    return Array(ea_tags)
1265
def TLBPTEArray():
    """Return TLB_SIZE PTE signals, each TLB_PTE_BITS wide.

    Mirrors the VHDL types
        subtype tlb_pte_t is
            std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
        type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
    """
    return Array(Signal(TLB_PTE_BITS) for _ in range(TLB_SIZE))
1268
1269 # signal itlb_valids : tlb_valids_t;
1270 # signal itlb_tags : tlb_tags_t;
1271 # signal itlb_ptes : tlb_ptes_t;
1272 # attribute ram_style of itlb_tags : signal is "distributed";
1273 # attribute ram_style of itlb_ptes : signal is "distributed";
1274 itlb_valid_bits = TLBValidBitsArray()
1275 itlb_tags = TLBTagArray()
1276 itlb_ptes = TLBPTEArray()
1277 # TODO to be passed to nmigen as ram attributes
1278 # attribute ram_style of itlb_tags : signal is "distributed";
1279 # attribute ram_style of itlb_ptes : signal is "distributed";
1280
1281 # -- Privilege bit from PTE EAA field
1282 # signal eaa_priv : std_ulogic;
1283 # Privilege bit from PTE EAA field
1284 eaa_priv = Signal()
1285
1286
1287 # signal r : reg_internal_t;
1288 r = RegInternal()
1289
1290 # -- Async signals on incoming request
1291 # signal req_index : index_t;
1292 # signal req_row : row_t;
1293 # signal req_hit_way : way_t;
1294 # signal req_tag : cache_tag_t;
1295 # signal req_is_hit : std_ulogic;
1296 # signal req_is_miss : std_ulogic;
1297 # signal req_laddr : std_ulogic_vector(63 downto 0);
1298 # Async signal on incoming request
1299 req_index = Signal(NUM_LINES)
1300 req_row = Signal(BRAM_ROWS)
1301 req_hit_way = Signal(NUM_WAYS)
1302 req_tag = Signal(TAG_BITS)
1303 req_is_hit = Signal()
1304 req_is_miss = Signal()
1305 req_laddr = Signal(64)
1306
1307 # signal tlb_req_index : tlb_index_t;
1308 # signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
1309 # signal ra_valid : std_ulogic;
1310 # signal priv_fault : std_ulogic;
1311 # signal access_ok : std_ulogic;
1312 # signal use_previous : std_ulogic;
1313 tlb_req_index = Signal(TLB_SIZE)
1314 real_addr = Signal(REAL_ADDR_BITS)
1315 ra_valid = Signal()
1316 priv_fault = Signal()
1317 access_ok = Signal()
1318 use_previous = Signal()
1319
1320 # -- Cache RAM interface
1321 # type cache_ram_out_t is array(way_t) of cache_row_t;
1322 # signal cache_out : cache_ram_out_t;
1323 # Cache RAM interface
def CacheRamOut():
    """Return one ROW_SIZE_BITS-wide row output signal per way."""
    way_rows = [Signal(ROW_SIZE_BITS) for _ in range(NUM_WAYS)]
    return Array(way_rows)
1326
1327 cache_out = CacheRamOut()
1328
1329 # -- PLRU output interface
1330 # type plru_out_t is array(index_t) of
1331 # std_ulogic_vector(WAY_BITS-1 downto 0);
1332 # signal plru_victim : plru_out_t;
1333 # signal replace_way : way_t;
1334 # PLRU output interface
def PLRUOut():
    """Return one WAY_BITS-wide PLRU victim signal per cache line."""
    victims = [Signal(WAY_BITS) for _ in range(NUM_LINES)]
    return Array(victims)
1337
1338 plru_victim = PLRUOut()
1339 replace_way = Signal(NUM_WAYS)
1340
1341 # begin
1342 #
1343 # assert LINE_SIZE mod ROW_SIZE = 0;
1344 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
1345 # severity FAILURE;
1346 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
1347 # severity FAILURE;
1348 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
1349 # severity FAILURE;
1350 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
1351 # severity FAILURE;
1352 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
1353 # report "geometry bits don't add up" severity FAILURE;
1354 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
1355 # report "geometry bits don't add up" severity FAILURE;
1356 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
1357 # report "geometry bits don't add up" severity FAILURE;
1358 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
1359 # report "geometry bits don't add up" severity FAILURE;
1360 #
1361 # sim_debug: if SIM generate
1362 # debug: process
1363 # begin
1364 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
1365 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
1366 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
1367 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
1368 # report "INSN_BITS = " & natural'image(INSN_BITS);
1369 # report "ROW_BITS = " & natural'image(ROW_BITS);
1370 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
1371 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
1372 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
1373 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
1374 # report "TAG_BITS = " & natural'image(TAG_BITS);
1375 # report "WAY_BITS = " & natural'image(WAY_BITS);
1376 # wait;
1377 # end process;
1378 # end generate;
1379