fix LD/ST bitreverse with Matrix REMAP to instead be non-bitreversed.
[openpower-isa.git] / src / openpower / decoder / isa / test_caller_svp64_ldst.py
1 from nmigen import Module, Signal
2 from nmigen.back.pysim import Simulator, Delay, Settle
3 from nmutil.formaltest import FHDLTestCase
4 import unittest
5 from openpower.decoder.isa.caller import ISACaller
6 from openpower.decoder.power_decoder import (create_pdecode)
7 from openpower.decoder.power_decoder2 import (PowerDecode2)
8 from openpower.simulator.program import Program
9 from openpower.decoder.isa.caller import ISACaller, SVP64State
10 from openpower.decoder.selectable_int import SelectableInt
11 from openpower.decoder.orderedset import OrderedSet
12 from openpower.decoder.isa.all import ISA
13 from openpower.decoder.isa.test_caller import Register, run_tst
14 from openpower.sv.trans.svp64 import SVP64Asm
15 from openpower.consts import SVP64CROffs
16 from openpower.decoder.helpers import fp64toselectable
17 from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
18 )
19 from copy import deepcopy
20
21
class DecoderTestCase(FHDLTestCase):
    """SVP64 vector load/store tests executed through ``run_tst``.

    Each test assembles a short SVP64 program, runs it with a preset
    SVSTATE (VL == MAXVL) and then checks the simulator's memory dump
    and register files against hand-computed expectations.
    """

    def _check_regs(self, sim, expected):
        """Assert that GPRs 0..31 of sim match expected (as 64-bit)."""
        for i in range(32):
            self.assertEqual(sim.gpr(i), SelectableInt(expected[i], 64))

    def _check_fpregs(self, sim, expected):
        """Assert that FPRs 0..31 of sim match expected (as 64-bit)."""
        for i in range(32):
            self.assertEqual(sim.fpr(i), SelectableInt(expected[i], 64))

    def _make_svstate(self, vl):
        """Return an SVP64State with VL and MAXVL both set to vl.

        The resulting state is printed in binary, as a debugging aid.
        """
        svstate = SVP64State()
        svstate.vl = vl  # VL
        svstate.maxvl = vl  # MAXVL
        print("SVSTATE", bin(svstate.asint()))
        return svstate

    def test_sv_load_store_elementstride(self):
        """Element-strided vector store followed by the matching load.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0008
            addi 4, 0, 0x1234
            addi 5, 0, 0x1235
            sv.stw/els 4.v, 24(1)
            sv.lwz/els 8.v, 24(1)

        note: element stride mode is only enabled when RA is a scalar
        and when the immediate is non-zero

        element stride is computed as:
            for i in range(VL):
                EA = (RA|0) + EXTS(D) * i
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0008",
                        "addi 4, 0, 0x1234",
                        "addi 5, 0, 0x1235",
                        "sv.stw/els 4.v, 24(1)",  # EA = r1 + 24*i
                        "sv.lwz/els 8.v, 24(1)"])  # EA = r1 + 24*i
        lst = list(lst)

        svstate = self._make_svstate(2)  # VL=2

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print(mem)
            # contents of memory expected at:
            #    element 0: r1=0x10, D=24, => EA = 0x10+24*0 = 16 (0x10)
            #    element 1: r1=0x10, D=24, => EA = 0x10+24*1 = 40 (0x28)
            # therefore, at address 0x10 ==> 0x1234
            # therefore, at address 0x28 ==> 0x1235
            expected_mem = [(16, 0x1234),
                            (40, 0x1235)]
            self.assertEqual(mem, expected_mem)
            print(sim.gpr(1))
            self.assertEqual(sim.gpr(8), SelectableInt(0x1234, 64))
            self.assertEqual(sim.gpr(9), SelectableInt(0x1235, 64))

    def test_sv_load_store_unitstride(self):
        """Unit-strided vector store followed by the matching load.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0008
            addi 8, 0, 0x1234
            addi 9, 0, 0x1235
            sv.stw 8.v, 8(1)
            sv.lwz 12.v, 8(1)

        note: unit stride mode is only enabled when RA is a scalar.

        unit stride is computed as:
            for i in range(VL):
                EA = (RA|0) + EXTS(D) + LDSTsize * i
        where for stw and lwz, LDSTsize is 4 because it is 32-bit words
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0008",
                        "addi 8, 0, 0x1234",
                        "addi 9, 0, 0x1235",
                        "sv.stw 8.v, 8(1)",  # EA = r1 + 8 + wordlen*i
                        "sv.lwz 12.v, 8(1)"])  # EA = r1 + 8 + wordlen*i
        lst = list(lst)

        svstate = self._make_svstate(2)  # VL=2

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)
            # contents of memory expected at:
            #    element 0: r1=0x10, D=8, wordlen=4 => EA = 0x10+8+4*0 = 0x18
            #    element 1: r1=0x10, D=8, wordlen=4 => EA = 0x10+8+4*1 = 0x1c
            # therefore, at address 0x18 ==> 0x1234
            # therefore, at address 0x1c ==> 0x1235
            # both words fall in the single dump entry at address 24 (0x18)
            self.assertEqual(mem, [(24, 0x123500001234)])
            print(sim.gpr(1))
            self.assertEqual(sim.gpr(12), SelectableInt(0x1234, 64))
            self.assertEqual(sim.gpr(13), SelectableInt(0x1235, 64))

    def test_sv_load_store_bitreverse(self):
        """Unit-strided store followed by a bit-reversed integer load.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0000
            addi 5, 0, 0x101
            addi 6, 0, 0x202
            addi 7, 0, 0x303
            addi 8, 0, 0x404
            sv.stw 5.v, 0(1)
            sv.lwzbr 12.v, 4(1), 2

        note: bitreverse mode is... odd.  it's the butterfly generator
        from Cooley-Tukey FFT:
        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms

        bitreverse LD is computed as:
            for i in range(VL):
                EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        NOTE(review): the expected values below step by 4 bytes per
        bit-reversed index; confirm the formula above against the spec.

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 5, 0, 0x101",
                        "addi 6, 0, 0x202",
                        "addi 7, 0, 0x303",
                        "addi 8, 0, 0x404",
                        "sv.stw 5.v, 0(1)",  # EA = r1 + 0 + wordlen*i
                        "sv.lwzbr 12.v, 4(1), 2"])  # bit-reversed
        lst = list(lst)

        svstate = self._make_svstate(4)  # VL=4

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate)
            mem = sim.mem.dump(printout=False)
            print(mem)

            self.assertEqual(mem, [(16, 0x020200000101),
                                   (24, 0x040400000303)])
            print(sim.gpr(1))
            # from STs: the source registers must be left untouched
            stored = [0x101, 0x202, 0x303, 0x404]
            for offs, val in enumerate(stored):
                self.assertEqual(sim.gpr(5 + offs), SelectableInt(val, 64))
            # r1=0x10, r2 (the RC operand)=0, offs=4: loads expected from:
            #    element 0: EA = r1 + bitrev(0b00)*4 => 0x10 + 0b00*4 => 0x10
            #    element 1: EA = r1 + bitrev(0b01)*4 => 0x10 + 0b10*4 => 0x18
            #    element 2: EA = r1 + bitrev(0b10)*4 => 0x10 + 0b01*4 => 0x14
            #    element 3: EA = r1 + bitrev(0b11)*4 => 0x10 + 0b11*4 => 0x1c
            # therefore loaded from (bit-reversed indexing):
            #    r12 => mem[0x10] which was stored from r5
            #    r13 => mem[0x18] which was stored from r7
            #    r14 => mem[0x14] which was stored from r6
            #    r15 => mem[0x1c] which was stored from r8
            self.assertEqual(sim.gpr(12), SelectableInt(0x101, 64))
            self.assertEqual(sim.gpr(13), SelectableInt(0x303, 64))
            self.assertEqual(sim.gpr(14), SelectableInt(0x202, 64))
            self.assertEqual(sim.gpr(15), SelectableInt(0x404, 64))

    def test_sv_load_store_bitreverse2(self):
        """Floating-point variant of the bit-reversed load test.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0000
            sv.stfs 4.v, 0(1)
            sv.lfsbr 12.v, 4(1), 2

        note: bitreverse mode is... odd.  it's the butterfly generator
        from Cooley-Tukey FFT:
        https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms

        bitreverse LD is computed as:
            for i in range(VL):
                EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "sv.stfs 4.v, 0(1)",  # EA = r1 + 0 + wordlen*i
                        "sv.lfsbr 12.v, 4(1), 2"])  # bit-reversed
        lst = list(lst)

        svstate = self._make_svstate(4)  # VL=4

        # source vector f4..f7 = 1.0, 2.0, 3.0, 4.0
        fprs = [0] * 32
        fprs[4] = fp64toselectable(1.0)
        fprs[5] = fp64toselectable(2.0)
        fprs[6] = fp64toselectable(3.0)
        fprs[7] = fp64toselectable(4.0)

        # expected results, remember that bit-reversed load has been done
        expected_fprs = deepcopy(fprs)
        expected_fprs[12] = fprs[4]  # 0b00 -> 0b00
        expected_fprs[13] = fprs[6]  # 0b01 -> 0b10
        expected_fprs[14] = fprs[5]  # 0b10 -> 0b01
        expected_fprs[15] = fprs[7]  # 0b11 -> 0b11

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_fprs=fprs)
            mem = sim.mem.dump(printout=False)
            print("mem dump")
            print(mem)

            print("FPRs")
            sim.fpr.dump()

            self._check_fpregs(sim, expected_fprs)

    def test_sv_load_store_remap_matrix(self):
        """Plain vector load under a Matrix REMAP schedule.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0000
            addi 4, 0, 0x101   ... through ...   addi 18, 0, 0xf0f
            sv.stw 4.v, 0(1)
            svshape 3, 3, 4, 0, 0
            svremap 1, 1, 2, 0, 0, 0, 0, 1
            sv.lwz 20.v, 0(1)

        REMAPed a LD operation via a Matrix Multiply Schedule,
        which is set up as 3x4 result
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 4, 0, 0x101",
                        "addi 5, 0, 0x202",
                        "addi 6, 0, 0x303",
                        "addi 7, 0, 0x404",
                        "addi 8, 0, 0x505",
                        "addi 9, 0, 0x606",
                        "addi 10, 0, 0x707",
                        "addi 11, 0, 0x808",
                        "addi 12, 0, 0x909",
                        "addi 13, 0, 0xa0a",
                        "addi 14, 0, 0xb0b",
                        "addi 15, 0, 0xc0c",
                        "addi 16, 0, 0xd0d",
                        "addi 17, 0, 0xe0e",
                        "addi 18, 0, 0xf0f",
                        "sv.stw 4.v, 0(1)",  # EA = r1 + 0 + wordlen*i
                        "svshape 3, 3, 4, 0, 0",
                        "svremap 1, 1, 2, 0, 0, 0, 0, 1",
                        "sv.lwz 20.v, 0(1)",
                        ])
        lst = list(lst)

        svstate = self._make_svstate(12)  # VL=12

        regs = [0] * 64

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_regs=regs)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)

            # VL=12, so only r4..r15 are stored (twelve 32-bit words)
            self.assertEqual(mem, [(16, 0x020200000101),
                                   (24, 0x040400000303),
                                   (32, 0x060600000505),
                                   (40, 0x080800000707),
                                   (48, 0x0a0a00000909),
                                   (56, 0x0c0c00000b0b)])
            print(sim.gpr(1))
            # from STs: the source registers must be left untouched
            stored = [0x101, 0x202, 0x303, 0x404,
                      0x505, 0x606, 0x707, 0x808]
            for offs, val in enumerate(stored):
                self.assertEqual(sim.gpr(4 + offs), SelectableInt(val, 64))
            # plain (non-bit-reversed) load with a Matrix REMAP schedule:
            # each group of three destination registers gets one value
            for i in range(3):
                self.assertEqual(sim.gpr(20+i), SelectableInt(0x101, 64))
                self.assertEqual(sim.gpr(23+i), SelectableInt(0x505, 64))
                self.assertEqual(sim.gpr(26+i), SelectableInt(0x909, 64))
                self.assertEqual(sim.gpr(29+i), SelectableInt(0x202, 64))

    def test_sv_load_store_bitreverse_remap_halfswap(self):
        """Bit-reversed load combined with a DCT half-swap REMAP.

        program under test:
            addi 1, 0, 0x0010
            addi 2, 0, 0x0000
            addi 4, 0, 0x001   ... through ...   addi 11, 0, 0x708
            sv.stw 4.v, 0(1)
            svshape 8, 1, 1, 6, 0
            svremap 1, 0, 0, 0, 0, 0, 0, 1
            sv.lwzbr 12.v, 4(1), 2

        bitreverse LD is computed as:
            for i in range(VL):
                EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC

        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11

        and thus creates the butterfly needed for one iteration of FFT.
        the RC (shift) is to be able to offset the LDs by Radix-2 spans

        on top of the bit-reversal is a REMAP for half-swaps for DCT
        in-place.
        """
        lst = SVP64Asm(["addi 1, 0, 0x0010",
                        "addi 2, 0, 0x0000",
                        "addi 4, 0, 0x001",
                        "addi 5, 0, 0x102",
                        "addi 6, 0, 0x203",
                        "addi 7, 0, 0x304",
                        "addi 8, 0, 0x405",
                        "addi 9, 0, 0x506",
                        "addi 10, 0, 0x607",
                        "addi 11, 0, 0x708",
                        "sv.stw 4.v, 0(1)",  # EA = r1 + 0 + wordlen*i
                        "svshape 8, 1, 1, 6, 0",
                        "svremap 1, 0, 0, 0, 0, 0, 0, 1",
                        "sv.lwzbr 12.v, 4(1), 2",  # bit-reversed
                        ])
        lst = list(lst)

        svstate = self._make_svstate(8)  # VL=8

        regs = [0] * 64

        # reference result: half-swap the stored vector, then pick its
        # elements in bit-reversed index order
        avi = [0x001, 0x102, 0x203, 0x304, 0x405, 0x506, 0x607, 0x708]
        n = len(avi)
        levels = n.bit_length() - 1
        ri = [reverse_bits(i, levels) for i in range(n)]
        swapped = halfrev2(avi, False)
        av = [swapped[idx] for idx in ri]

        with Program(lst, bigendian=False) as program:
            sim = self.run_tst_program(program, svstate=svstate,
                                       initial_regs=regs)
            mem = sim.mem.dump(printout=False)
            print("Mem")
            print(mem)

            self.assertEqual(mem, [(16, 0x010200000001),
                                   (24, 0x030400000203),
                                   (32, 0x050600000405),
                                   (40, 0x070800000607)])
            # from STs: the source registers must be left untouched
            for i, val in enumerate(avi):
                print("st gpr", i, sim.gpr(i+4), hex(val))
                self.assertEqual(sim.gpr(i+4), SelectableInt(val, 64))
            # combination of bit-reversed load with a DCT half-swap REMAP
            # schedule
            for i, val in enumerate(av):
                print("ld gpr", i, sim.gpr(i+12), hex(val))
                self.assertEqual(sim.gpr(i+12), val)

    def run_tst_program(self, prog, initial_regs=None,
                        svstate=None, initial_fprs=None):
        """Run prog through run_tst and return the resulting simulator.

        initial_regs and initial_fprs each default to 32 zeroed
        registers.  The GPR file is dumped before returning.
        """
        if initial_regs is None:
            initial_regs = [0] * 32
        if initial_fprs is None:
            initial_fprs = [0] * 32
        simulator = run_tst(prog, initial_regs, svstate=svstate,
                            initial_fprs=initial_fprs)
        simulator.gpr.dump()
        return simulator
435
436
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()