argh, have LD-bitreverse select the offset from RA REMAP schedule
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_ldst.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.isa.caller import ISACaller
6 from openpower.decoder.power_decoder import (create_pdecode)
7 from openpower.decoder.power_decoder2 import (PowerDecode2)
8 from openpower.simulator.program import Program
9 from openpower.decoder.isa.caller import ISACaller, SVP64State
10 from openpower.decoder.selectable_int import SelectableInt
11 from openpower.decoder.orderedset import OrderedSet
12 from openpower.decoder.isa.all import ISA
13 from openpower.decoder.isa.test_caller import Register, run_tst
14 from openpower.sv.trans.svp64 import SVP64Asm
15 from openpower.consts import SVP64CROffs
16 from openpower.decoder.helpers import fp64toselectable
17 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
18 )
19 from copy import deepcopy
20
21
class DecoderTestCase(FHDLTestCase):
    """SVP64 vector load/store tests.

    Each test assembles a short SVP64 program, runs it through ``run_tst``
    with a preset SVSTATE (VL and MAXVL), and then checks the memory dump
    and the register file of the simulator that ``run_tst`` returns.
    """

    @staticmethod
    def _make_svstate(vl):
        """Return an SVP64State whose VL and MAXVL are both ``vl``.

        The packed SVSTATE value is printed in binary as a debugging aid.
        """
        svstate = SVP64State()
        svstate.vl = vl  # VL
        svstate.maxvl = vl  # MAXVL
        print("SVSTATE", bin(svstate.asint()))
        return svstate

    def _check_gpr_run(self, sim, first, values):
        """Assert that consecutive GPRs starting at ``first`` hold ``values``.

        ``values`` is a sequence of plain integers; each is compared as a
        64-bit SelectableInt against ``sim.gpr(first + offset)``.
        """
        for offset, value in enumerate(values):
            self.assertEqual(sim.gpr(first + offset),
                             SelectableInt(value, 64))

    def _check_regs(self, sim, expected):
        """Assert that GPRs 0..31 of ``sim`` match ``expected[0..31]``."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    def _check_fpregs(self, sim, expected):
        """Assert that FPRs 0..31 of ``sim`` match ``expected[0..31]``."""
        for i in range(32):
            self.assertEqual(sim.fpr(i), SelectableInt(expected[i], 64))

    def test_sv_load_store_elementstride(self):
        """Element-strided vector store (sv.stw/els) then load (sv.lwz/els).

        note: element stride mode is only enabled when RA is a scalar
        and when the immediate is non-zero

        element stride is computed as:
        for i in range(VL):
            EA = (RA|0) + EXTS(D) * i
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0008",
                        "addi 4, 0, 0x1234",
                        "addi 5, 0, 0x1235",
                        "sv.stw/els 4.v, 24(1)",    # EA = r1 + 24*i
                        "sv.lwz/els 8.v, 24(1)"])   # EA = r1 + 24*i
        lst = list(lst)

        svstate = self._make_svstate(2)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print(mem)
            # contents of memory expected at:
            #    element 0:   r1=0x10, D=24, => EA = 0x10+24*0 = 16 (0x10)
            #    element 1:   r1=0x10, D=24, => EA = 0x10+24*1 = 40 (0x28)
            # therefore, at address 0x10 ==> 0x1234
            # therefore, at address 0x28 ==> 0x1235
            expected_mem = [(16, 0x1234),
                            (40, 0x1235)]
            self.assertEqual(mem, expected_mem)
            print(sim.gpr(1))
            self._check_gpr_run(sim, 8, [0x1234, 0x1235])

    def test_sv_load_store_unitstride(self):
        """Unit-strided vector store (sv.stw) then load (sv.lwz).

        note: unit stride mode is only enabled when RA is a scalar.

        unit stride is computed as:
        for i in range(VL):
            EA = (RA|0) + EXTS(D) + LDSTsize * i
        where for stw and lwz, LDSTsize is 4 because it is 32-bit words
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0008",
                        "addi 8, 0, 0x1234",
                        "addi 9, 0, 0x1235",
                        "sv.stw 8.v, 8(1)",     # EA = r1 + 8 + 4*i
                        "sv.lwz 12.v, 8(1)"])   # EA = r1 + 8 + 4*i
        lst = list(lst)

        svstate = self._make_svstate(2)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)
            # contents of memory expected at:
            #    element 0: r1=0x10, D=8, wordlen=4 => EA = 0x10+8+4*0 = 0x18
            #    element 1: r1=0x10, D=8, wordlen=4 => EA = 0x10+8+4*1 = 0x1c
            # therefore, at address 0x18 ==> 0x1234
            # therefore, at address 0x1c ==> 0x1235
            # the dump reports both words as one entry at address 24 (0x18)
            self.assertEqual(mem, [(24, 0x123500001234)])
            print(sim.gpr(1))
            self._check_gpr_run(sim, 12, [0x1234, 0x1235])

    def test_sv_load_store_bitreverse(self):
        """Unit-strided store (sv.stw) then bit-reversed load (sv.lwzbr).

        note: bitreverse mode is... odd.  it's the butterfly generator
        from Cooley-Tukey FFT:
        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms

        bitreverse LD is computed as:
        for i in range(VL):
            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 5, 0, 0x101",
                        "addi 6, 0, 0x202",
                        "addi 7, 0, 0x303",
                        "addi 8, 0, 0x404",
                        "sv.stw 5.v, 0(1)",         # EA = r1 + 0 + 4*i
                        "sv.lwzbr 12.v, 4(1), 2"])  # bit-reversed
        lst = list(lst)

        svstate = self._make_svstate(4)

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print(mem)

            self.assertEqual(mem, [(16, 0x020200000101),
                                   (24, 0x040400000303)])
            print(sim.gpr(1))
            # from STs: the source registers are left untouched
            self._check_gpr_run(sim, 5, [0x101, 0x202, 0x303, 0x404])
            # r1=0x10, RC=0, offs=4: contents of memory expected at:
            #    element 0: EA = r1 + bitrev(0b00)*4 => 0x10 + 0b00*4 => 0x10
            #    element 1: EA = r1 + bitrev(0b01)*4 => 0x10 + 0b10*4 => 0x18
            #    element 2: EA = r1 + bitrev(0b10)*4 => 0x10 + 0b01*4 => 0x14
            #    element 3: EA = r1 + bitrev(0b11)*4 => 0x10 + 0b11*4 => 0x1c
            # therefore loaded from (bit-reversed indexing):
            #    r12 => mem[0x10] which was stored from r5
            #    r13 => mem[0x18] which was stored from r7
            #    r14 => mem[0x14] which was stored from r6
            #    r15 => mem[0x1c] which was stored from r8
            self._check_gpr_run(sim, 12, [0x101, 0x303, 0x202, 0x404])

    def test_sv_load_store_bitreverse2(self):
        """Floating-point store (sv.stfs) then bit-reversed load (sv.lfsbr).

        note: bitreverse mode is... odd.  it's the butterfly generator
        from Cooley-Tukey FFT:
        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms

        bitreverse LD is computed as:
        for i in range(VL):
            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans

        Only the FPR file is checked here; the memory dump is printed
        but not asserted on.
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "sv.stfs 4.v, 0(1)",        # EA = r1 + 0 + 4*i
                        "sv.lfsbr 12.v, 4(1), 2"])  # bit-reversed
        lst = list(lst)

        svstate = self._make_svstate(4)

        # f4..f7 hold the vector that gets stored
        fprs = [0] * 32
        fprs[4] = fp64toselectable(1.0)
        fprs[5] = fp64toselectable(2.0)
        fprs[6] = fp64toselectable(3.0)
        fprs[7] = fp64toselectable(4.0)

        # expected results, remember that bit-reversed load has been done
        expected_fprs = deepcopy(fprs)
        expected_fprs[12] = fprs[4]  # 0b00 -> 0b00
        expected_fprs[13] = fprs[6]  # 0b01 -> 0b10
        expected_fprs[14] = fprs[5]  # 0b10 -> 0b01
        expected_fprs[15] = fprs[7]  # 0b11 -> 0b11

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            mem = sim.mem.dump(printout=False)
            print("mem dump")
            print(mem)

            print("FPRs")
            sim.fpr.dump()

            self._check_fpregs(sim, expected_fprs)

    def test_sv_load_store_bitreverse_remap_matrix(self):
        """Bit-reversed load (sv.lwzbr) combined with a Matrix REMAP.

        note: bitreverse mode is... odd.  it's the butterfly generator
        from Cooley-Tukey FFT:
        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms

        bitreverse LD is computed as:
        for i in range(VL):
            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans

        in this case however it is REMAPed via a Matrix Multiply Schedule,
        which is set up as 4x2.
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 4, 0, 0x101",
                        "addi 5, 0, 0x202",
                        "addi 6, 0, 0x303",
                        "addi 7, 0, 0x404",
                        "addi 8, 0, 0x505",
                        "addi 9, 0, 0x606",
                        "addi 10, 0, 0x707",
                        "addi 11, 0, 0x808",
                        "sv.stw 4.v, 0(1)",         # EA = r1 + 0 + 4*i
                        "svshape 4, 4, 2, 0, 0",
                        "svremap 31, 1, 2, 3, 0, 0, 0, 1",
                        "sv.lwzbr 12.v, 4(1), 2"])  # bit-reversed
        lst = list(lst)

        # SVSTATE (in this case, VL=8)
        svstate = self._make_svstate(8)

        regs = [0] * 64

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_regs=regs)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)

            self.assertEqual(mem, [(16, 0x020200000101),
                                   (24, 0x040400000303),
                                   (32, 0x060600000505),
                                   (40, 0x080800000707)])
            print(sim.gpr(1))
            # from STs: the source registers r4..r11 are left untouched
            self._check_gpr_run(sim, 4, [0x101, 0x202, 0x303, 0x404,
                                         0x505, 0x606, 0x707, 0x808])
            # combination of bit-reversed load with a Matrix REMAP
            # schedule
            self._check_gpr_run(sim, 12, [0x101, 0x505, 0x303, 0x707,
                                          0x202, 0x606, 0x404, 0x808])

    def test_sv_load_store_bitreverse_remap_halfswap(self):
        """Bit-reversed load (sv.lwzbr) combined with a half-swap REMAP.

        bitreverse LD is computed as:
        for i in range(VL):
            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans

        on top of the bit-reversal is a REMAP for half-swaps for DCT
        in-place.
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 4, 0, 0x001",
                        "addi 5, 0, 0x102",
                        "addi 6, 0, 0x203",
                        "addi 7, 0, 0x304",
                        "addi 8, 0, 0x405",
                        "addi 9, 0, 0x506",
                        "addi 10, 0, 0x607",
                        "addi 11, 0, 0x708",
                        "sv.stw 4.v, 0(1)",         # EA = r1 + 0 + 4*i
                        "svshape 8, 1, 1, 6, 0",
                        "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                        "sv.lwzbr 12.v, 4(1), 2",   # bit-reversed
                        ])
        lst = list(lst)

        # SVSTATE (in this case, VL=8)
        svstate = self._make_svstate(8)

        regs = [0] * 64

        # values stored from r4..r11, in element order
        avi = [0x001, 0x102, 0x203, 0x304, 0x405, 0x506, 0x607, 0x708]
        n = len(avi)
        levels = n.bit_length() - 1
        # expected load order: half-swap the vector, then index the
        # result with the bit-reversed element number
        ri = [reverse_bits(i, levels) for i in range(n)]
        halfswapped = halfrev2(avi, False)
        av = [halfswapped[idx] for idx in ri]

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_regs=regs)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)

            self.assertEqual(mem, [(16, 0x010200000001),
                                   (24, 0x030400000203),
                                   (32, 0x050600000405),
                                   (40, 0x070800000607)])
            # from STs: the source registers r4..r11 are left untouched
            for i, stored in enumerate(avi):
                print("st gpr", i, sim.gpr(i+4), hex(stored))
                self.assertEqual(sim.gpr(i+4), SelectableInt(stored, 64))
            # combination of bit-reversed load with a DCT half-swap REMAP
            # schedule
            for i, loaded in enumerate(av):
                print("ld gpr", i, sim.gpr(i+12), hex(loaded))
                self.assertEqual(sim.gpr(i+12), SelectableInt(loaded, 64))

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None, initial_fprs=None):
        """Run ``prog`` through ``run_tst`` and return the simulator.

        ``initial_regs`` and ``initial_fprs`` default to 32 zeroes each
        when not given.  The GPR file is dumped after the run as a
        debugging aid.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        if initial_fprs is None:
            initial_fprs = [0] * 32
        simulator = run_tst(prog, initial_regs, svstate=svstate,
                            initial_fprs=initial_fprs)
        simulator.gpr.dump()
        return simulator
441
442
# run every test in this file when it is executed directly as a script
if __name__ == "__main__":
    unittest.main()