crossreference to bugreport
[soc.git] / src / soc / scoreboard / addr_match.py
1 """ Load / Store partial address matcher
2
3 Related bugreports:
4 * http://bugs.libre-riscv.org/show_bug.cgi?id=216
5
6 Loads and Stores do not need a full match (CAM), they need "good enough"
7 avoidance. Around 11 bits on a 64-bit address is "good enough".
8
9 The simplest way to use this module is to ignore not only the top bits,
10 but also the bottom bits as well: in this case (this RV64 processor),
11 enough to cover a DWORD (64-bit). that means ignore the bottom 4 bits,
12 due to the possibility of 64-bit LD/ST being misaligned.
13
14 To reiterate: the use of this module is an *optimisation*. All it has
15 to do is cover the cases that are *definitely* matches (by checking 11
16 bits or so), and if a few opportunities for parallel LD/STs are missed
17 because the top (or bottom) bits weren't checked, so what: all that
18 happens is: the mis-matched addresses are LD/STd on single-cycles. Big Deal.
19
20 However, if we wanted to enhance this algorithm (without using a CAM and
21 without using expensive comparators) probably the best way to do so would
22 be to turn the last 16 bits into a byte-level bitmap. LD/ST on a byte
23 would have 1 of the 16 bits set. LD/ST on a DWORD would have 8 of the 16
24 bits set (offset if the LD/ST was misaligned). TODO.
25
26 Notes:
27
28 > I have used bits <11:6> as they are not translated (4KB pages)
29 > and larger than a cache line (64 bytes).
30 > I have used bits <11:4> when the L1 cache was QuadW sized and
31 > the L2 cache was Line sized.
32 """
33
34 from nmigen.compat.sim import run_simulation
35 from nmigen.cli import verilog, rtlil
36 from nmigen import Module, Signal, Const, Array, Cat, Elaboratable
37 from nmigen.lib.coding import Decoder
38
39 from nmutil.latch import latchregister, SRLatch
40
41
class PartialAddrMatch(Elaboratable):
    """A partial address matcher

    Holds n_adr address slots, each bitwid bits wide, and compares every
    latched address against every other latched address.

    Inputs:

    * addrs_i:   one address per slot
    * addr_we_i: write-enable.  Declared and exported as a port, but not
                 used by the logic in this class.
    * addr_en_i: one bit per slot, drives the "set" input of the slot latch
    * addr_rs_i: one bit per slot, drives the "reset" input of the slot latch

    Outputs:

    * addr_nomatch_a_o[i]: one bit per slot j.  Bit j is set when slot j's
                           latch is active and is_match(i, j) is false.
    * addr_nomatch_o:      one bit per slot.  Bit i is set when slot i's
                           latch is active and addr_nomatch_a_o[i] is equal
                           to the full set of active latches, i.e. no
                           active slot matched slot i.
    """
    def __init__(self, n_adr, bitwid):
        self.n_adr = n_adr
        self.bitwid = bitwid
        # inputs
        self.addrs_i = Array(Signal(bitwid, name="addr") for i in range(n_adr))
        self.addr_we_i = Signal(n_adr, reset_less=True) # write-enable
        self.addr_en_i = Signal(n_adr, reset_less=True) # address latched in
        self.addr_rs_i = Signal(n_adr, reset_less=True) # address deactivated

        # output: a nomatch for each address plus individual nomatch signals
        self.addr_nomatch_o = Signal(n_adr, name="nomatch_o", reset_less=True)
        self.addr_nomatch_a_o = Array(Signal(n_adr, reset_less=True,
                                             name="nomatch_array_o") \
                                      for i in range(n_adr))

    def elaborate(self, platform):
        m = Module()
        return self._elaborate(m, platform)

    def _elaborate(self, m, platform):
        """Adds the latches and the comparison logic to module m.

        Stores the latch (self.l) and the latched addresses (self.addrs_r)
        on the instance so that subclasses can refer to them.
        """
        comb = m.d.comb

        # array of address-latches
        m.submodules.l = self.l = l = SRLatch(llen=self.n_adr, sync=False)
        self.addrs_r = addrs_r = Array(Signal(self.bitwid, reset_less=True,
                                              name="a_r") \
                                       for i in range(self.n_adr))

        # latch set/reset
        comb += l.s.eq(self.addr_en_i)
        comb += l.r.eq(self.addr_rs_i)

        # copy in addresses (and "enable" signals)
        for i in range(self.n_adr):
            latchregister(m, self.addrs_i[i], addrs_r[i], l.q[i])

        # is there a clash, yes/no
        matchgrp = []
        for i in range(self.n_adr):
            # is_match is a method so that subclasses can override the
            # comparison
            match = [self.is_match(i, j) for j in range(self.n_adr)]
            # only slots whose latch is active take part in the result
            comb += self.addr_nomatch_a_o[i].eq(~Cat(*match) & l.q)
            # "no match at all" for slot i: every active slot is a non-match
            matchgrp.append(self.addr_nomatch_a_o[i] == l.q)
        comb += self.addr_nomatch_o.eq(Cat(*matchgrp) & l.q)

        return m

    def is_match(self, i, j):
        """Returns an expression that is true when slots i and j clash."""
        if i == j:
            return Const(0) # don't match against self!
        return self.addrs_r[i] == self.addrs_r[j]

    def __iter__(self):
        yield from self.addrs_i
        yield self.addr_we_i
        yield self.addr_en_i
        # the latch reset is an input as well: without it in the port list
        # a design converted with ports=self.ports() has no reset port
        yield self.addr_rs_i
        yield from self.addr_nomatch_a_o
        yield self.addr_nomatch_o

    def ports(self):
        return list(self)
108
109
class LenExpand(Elaboratable):
    """LenExpand: expands binary length (and LSBs of an address) into unary

    this basically produces a bitmap of which *bytes* are to be read (written)
    in memory.  examples:

    (bit_len=4) len=4, addr=0b0011 => 0b1111 << addr
                                   => 0b1111000
    (bit_len=4) len=8, addr=0b0101 => 0b11111111 << addr
                                   => 0b1111111100000

    Inputs:

    * len_i:  length, in binary (bit_len bits wide)
    * addr_i: number of bit positions to shift the mask by (bit_len bits wide)

    Outputs:

    * explen_o: the shifted unary mask, 1<<(bit_len+1) bits wide
    """

    def __init__(self, bit_len):
        self.bit_len = bit_len
        self.len_i = Signal(bit_len, reset_less=True)
        self.addr_i = Signal(bit_len, reset_less=True)
        self.explen_o = Signal(1<<(bit_len+1), reset_less=True)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # temp: unary mask with len_i bits set.  (1<<len)-1 gives exactly
        # "len" ones, as in the examples in the class docstring; shifting
        # by 1+len would give one bit too many.
        binlen = Signal((1<<self.bit_len)+1, reset_less=True)
        comb += binlen.eq((Const(1, self.bit_len+1) << self.len_i) - 1)
        # move the mask up to the position given by addr_i
        comb += self.explen_o.eq(binlen << self.addr_i)

        return m

    def ports(self):
        return [self.len_i, self.addr_i, self.explen_o,]
141
142
class PartialAddrBitmap(PartialAddrMatch):
    """PartialAddrBitMap

    makes two comparisons for each address, with each (addr,len)
    being extended to an unary byte-map.

    two comparisons are needed because when an address is misaligned,
    the byte-map is split into two halves.  example:

    address = 0b1011011, len=8 => 0b101 and shift of 11 (0b1011)
                                  len in unary is 0b0000 0000 1111 1111
                                  when shifted becomes TWO addresses:

    * 0b101   and a byte-map of 0b1111 1000 0000 0000 (len-mask shifted by 11)
    * 0b101+1 and a byte-map of 0b0000 0000 0000 0111 (overlaps onto next 16)

    therefore, because this now covers two addresses, we need *two*
    comparisons per address *not* one.

    Constructor arguments:

    * n_adr:  number of address slots
    * bitwid: number of address LSBs to turn into unary (kept as lsbwid)
    * bitlen: width of the full address (faddrs_i)

    Note: after construction self.bitwid holds bitlen-bitwid (the width of
    the compared part of the address, also available as self.midlen),
    because that is the width handed to the PartialAddrMatch constructor.
    """
    def __init__(self, n_adr, bitwid, bitlen):
        # the LSB count needs its own attribute: PartialAddrMatch.__init__
        # assigns self.bitwid, which would otherwise overwrite it
        self.lsbwid = bitwid # number of bits to turn into unary
        self.midlen = bitlen-bitwid
        PartialAddrMatch.__init__(self, n_adr, self.midlen)

        # input: length of the LOAD/STORE
        self.len_i = Array(Signal(bitwid, reset_less=True,
                                  name="len") for i in range(n_adr))
        # input: full address
        self.faddrs_i = Array(Signal(bitlen, reset_less=True,
                                     name="fadr") for i in range(n_adr))

        # intermediary: address + 1 (same width as the compared address)
        self.addr1s = Array(Signal(self.midlen, reset_less=True,
                                   name="adr1") \
                            for i in range(n_adr))

    def elaborate(self, platform):
        m = PartialAddrMatch.elaborate(self, platform)
        comb = m.d.comb

        # intermediaries
        l = self.l
        # NOTE(review): width derived from the LSB count, which is what the
        # constructor argument originally stored in self.bitwid was meant
        # to be - confirm against the intended byte-map width.
        expwid = 1+self.lsbwid # XXX assume LD/ST no greater than 8
        # explen_i is not driven by anything in this method yet
        explen_i = Array(Signal(expwid, reset_less=True,
                                name="a_li") \
                         for i in range(self.n_adr))
        lenexp_r = Array(Signal(expwid, reset_less=True,
                                name="a_lr") \
                         for i in range(self.n_adr))

        # copy the top bits (everything above the lsbwid LSBs) of the
        # full addresses into the addresses that get compared
        for i in range(self.n_adr):
            comb += self.addrs_i[i].eq(self.faddrs_i[i][self.lsbwid:])

        # copy in lengths and latch them
        for i in range(self.n_adr):
            latchregister(m, explen_i[i], lenexp_r[i], l.q[i])

        # add one to intermediate addresses
        for i in range(self.n_adr):
            comb += self.addr1s[i].eq(self.addrs_r[i]+1)

        # put the bottom bits into the LenExpanders.  One is for
        # non-aligned stores.  (TODO: not implemented here yet)

        return m

    def is_match(self, i, j):
        """Returns an expression that is true when slots i and j clash.

        Currently the same comparison as PartialAddrMatch.is_match.
        """
        if i == j:
            return Const(0) # don't match against self!
        return self.addrs_r[i] == self.addrs_r[j]

    def __iter__(self):
        yield from self.faddrs_i
        yield from self.len_i
        yield self.addr_we_i
        yield self.addr_en_i
        # the latch reset is an input as well: without it in the port list
        # a design converted with ports=self.ports() has no reset port
        yield self.addr_rs_i
        yield from self.addr_nomatch_a_o
        yield self.addr_nomatch_o

    def ports(self):
        return list(self)
225
def part_addr_sim(dut):
    """Simulation stimulus for a PartialAddrMatch-style design.

    Generator for run_simulation: a yielded assignment is applied by the
    simulator, and a bare yield advances one clock.  Only attributes that
    the matcher declares are driven: n_adr, addrs_i, addr_en_i and
    addr_rs_i.

    Sequence:

    1. put the same address into every slot and latch all slots
    2. release slot 0
    3. latch a different address into slot 0
    4. release every slot
    """
    all_slots = (1 << dut.n_adr) - 1

    # 1. identical address everywhere, then latch all slots
    for i in range(dut.n_adr):
        yield dut.addrs_i[i].eq(1)
    yield dut.addr_en_i.eq(all_slots)
    yield
    yield dut.addr_en_i.eq(0)
    yield

    # 2. release slot 0
    yield dut.addr_rs_i.eq(1)
    yield
    yield dut.addr_rs_i.eq(0)
    yield

    # 3. re-use slot 0 with an address that differs from the others
    yield dut.addrs_i[0].eq(2)
    yield dut.addr_en_i.eq(1)
    yield
    yield dut.addr_en_i.eq(0)
    yield

    # 4. release everything
    yield dut.addr_rs_i.eq(all_slots)
    yield
    yield dut.addr_rs_i.eq(0)
    yield
245
def test_part_addr():
    """Dumps RTLIL for each design in this module, then simulates the matcher.

    Writes test_len_expand.il, test_part_bit.il and test_part_addr.il, and
    runs part_addr_sim against the PartialAddrMatch instance, producing
    test_part_addr.vcd.
    """
    def dump_rtlil(design, path):
        # convert first, so the output file is only opened once the
        # conversion has produced its text
        text = rtlil.convert(design, ports=design.ports())
        with open(path, "w") as out:
            out.write(text)

    dump_rtlil(LenExpand(4), "test_len_expand.il")
    dump_rtlil(PartialAddrBitmap(3, 4, 10), "test_part_bit.il")

    matcher = PartialAddrMatch(3, 10)
    dump_rtlil(matcher, "test_part_addr.il")

    run_simulation(matcher, part_addr_sim(matcher),
                   vcd_name='test_part_addr.vcd')
263
# when executed as a script: write the RTLIL dumps and run the simulation
if __name__ == '__main__':
    test_part_addr()