rename divrem2du->divmod2du for consistency with PowerISA mod* instructions
[openpower-isa.git] / openpower / isa / simplev.mdwn
1 <!-- This defines Draft SVP64 instructions to augment PowerISA Version 3.0 -->
2 <!-- These are not described in book 1 -->
3
4 # svstep
5
6 SVL-Form
7
8 * svstep RT,SVi,vf (Rc=0)
9 * svstep. RT,SVi,vf (Rc=1)
10
11 Pseudo-code:
12
13 if SVi[3:4] = 0b11 then
14 # store subvl, pack and unpack in SVSTATE
15 SVSTATE[53] <- SVi[5]
16 SVSTATE[54] <- SVi[6]
17 RT <- [0]*62 || SVSTATE[53:54]
18 else
19 step <- SVSTATE_NEXT(SVi, vf)
20 RT <- [0]*57 || step
21
22 Special Registers Altered:
23
24 CR0 (if Rc=1)
25
26 # setvl
27
28 SVL-Form
29
30 * setvl RT,RA,SVi,vf,vs,ms (Rc=0)
31 * setvl. RT,RA,SVi,vf,vs,ms (Rc=1)
32
33 Pseudo-code:
34
35 overflow <- 0b0
36 if (vf & (¬vs) & ¬(ms)) = 1 then
37 step <- SVSTATE_NEXT(SVi, 0b0)
38 if _RT != 0 then
39 GPR(_RT) <- [0]*57 || step
40 else
41 VLimm <- SVi + 1
42 # set or get MVL
43 if ms = 1 then MVL <- VLimm[0:6]
44 else MVL <- SVSTATE[0:6]
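45 # set or get VL (NOTE(review): the second "_RA != 0" test in this chain repeats the first and looks unreachable - confirm)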
46 if vs = 0 then VL <- SVSTATE[7:13]
47 else if _RA != 0 then
48 if (RA) >u 0b1111111 then
49 VL <- 0b1111111
50 overflow <- 0b1
51 else VL <- (RA)[57:63]
52 else if _RA != 0 then VL <- (RA)[57:63]
53 else if _RT = 0 then VL <- VLimm[0:6]
54 else if CTR >u 0b1111111 then
55 VL <- 0b1111111
56 overflow <- 0b1
57 else VL <- CTR[57:63]
58 # limit VL to within MVL
59 if VL >u MVL then
60 overflow <- 0b1
61 VL <- MVL
62 SVSTATE[0:6] <- MVL
63 SVSTATE[7:13] <- VL
64 if _RT != 0 then
65 GPR(_RT) <- [0]*57 || VL
66 if ((¬vs) & ¬(ms)) = 0 then
67 # set requested Vertical-First mode, clear persist
68 SVSTATE[63] <- vf
69 SVSTATE[62] <- 0b0
70
71 Special Registers Altered:
72
73 CR0 (if Rc=1)
74
75 # svremap
76
77 SVRM-Form
78
79 * svremap SVme,mi0,mi1,mi2,mo0,mo1,pst
80
81 Pseudo-code:
82
83 # registers RA RB RC RT EA/FRS SVSHAPE0-3 indices
84 SVSTATE[32:33] <- mi0
85 SVSTATE[34:35] <- mi1
86 SVSTATE[36:37] <- mi2
87 SVSTATE[38:39] <- mo0
88 SVSTATE[40:41] <- mo1
89 # enable bit for RA RB RC RT EA/FRS
90 SVSTATE[42:46] <- SVme
91 # persistence bit (applies to more than one instruction)
92 SVSTATE[62] <- pst
93
94 Special Registers Altered:
95
96 None
97
98 # svshape
99
100 SVM-Form
101
102 * svshape SVxd,SVyd,SVzd,SVrm,vf
103
104 Pseudo-code:
105
106 # for convenience, VL to be calculated and stored in SVSTATE
107 vlen <- [0] * 7
108 mscale[0:5] <- 0b000001 # for scaling MAXVL
109 itercount[0:6] <- [0] * 7
110 SVSTATE[0:31] <- [0] * 32
111 # only overwrite REMAP if "persistence" is zero
112 if (SVSTATE[62] = 0b0) then
113 SVSTATE[32:33] <- 0b00
114 SVSTATE[34:35] <- 0b00
115 SVSTATE[36:37] <- 0b00
116 SVSTATE[38:39] <- 0b00
117 SVSTATE[40:41] <- 0b00
118 SVSTATE[42:46] <- 0b00000
119 SVSTATE[62] <- 0b0
120 SVSTATE[63] <- 0b0
121 # clear out all SVSHAPEs
122 SVSHAPE0[0:31] <- [0] * 32
123 SVSHAPE1[0:31] <- [0] * 32
124 SVSHAPE2[0:31] <- [0] * 32
125 SVSHAPE3[0:31] <- [0] * 32
126 # set schedule up for multiply
127 if (SVrm = 0b0000) then
128 # VL in Matrix Multiply is xd*yd*zd
129 xd <- (0b00 || SVxd) + 1
130 yd <- (0b00 || SVyd) + 1
131 zd <- (0b00 || SVzd) + 1
132 n <- xd * yd * zd
133 vlen[0:6] <- n[14:20]
134 # set up template in SVSHAPE0, then copy to 1-3
135 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
136 SVSHAPE0[6:11] <- (0b0 || SVyd) # ydim
137 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim
138 SVSHAPE0[28:29] <- 0b11 # skip z
139 # copy
140 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
141 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
142 SVSHAPE3[0:31] <- SVSHAPE0[0:31]
143 # set up FRA
144 SVSHAPE1[18:20] <- 0b001 # permute x,z,y
145 SVSHAPE1[28:29] <- 0b01 # skip z
146 # FRC
147 SVSHAPE2[18:20] <- 0b001 # permute x,z,y
148 SVSHAPE2[28:29] <- 0b11 # skip y
149 # set schedule up for FFT butterfly
150 if (SVrm = 0b0001) then
151 # calculate O(N log2 N)
152 n <- [0] * 3
153 do while n < 5
154 if SVxd[4-n] = 0 then
155 leave
156 n <- n + 1
157 n <- ((0b0 || SVxd) + 1) * n
158 vlen[0:6] <- n[1:7]
159 # set up template in SVSHAPE0, then copy to 1-2
160 # for FRA and FRT
161 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
162 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT)
163 mscale <- (0b0 || SVzd) + 1
164 SVSHAPE0[30:31] <- 0b01 # Butterfly mode
165 # copy
166 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
167 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
168 # set up FRB and FRS
169 SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
170 # FRC (coefficients)
171 SVSHAPE2[28:29] <- 0b10 # k schedule
172 # set schedule up for (i)DCT Inner butterfly
173 # SVrm Mode 2 (Mode 10 for iDCT) is for pre-calculated coefficients,
174 # SVrm Mode 4 (Mode 12 for iDCT) is for on-the-fly (Vertical-First Mode)
175 if ((SVrm = 0b0010) | (SVrm = 0b0100) |
176 (SVrm = 0b1010) | (SVrm = 0b1100)) then
177 # calculate O(N log2 N)
178 n <- [0] * 3
179 do while n < 5
180 if SVxd[4-n] = 0 then
181 leave
182 n <- n + 1
183 n <- ((0b0 || SVxd) + 1) * n
184 vlen[0:6] <- n[1:7]
185 # set up template in SVSHAPE0, then copy to 1-3
186 # set up FRB and FRS
187 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
188 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
189 mscale <- (0b0 || SVzd) + 1
190 if (SVrm = 0b1010) | (SVrm = 0b1100) then
191 SVSHAPE0[30:31] <- 0b11 # iDCT mode
192 SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode
193 else
194 SVSHAPE0[30:31] <- 0b01 # DCT mode
195 SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode
196 SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop
197 if (SVrm = 0b1100) | (SVrm = 0b0100) then
198 SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4
199 else
200 SVSHAPE0[6:11] <- 0b000001 # (i)DCT Inner Butterfly mode 2
201 # copy
202 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
203 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
204 if (SVrm != 0b0100) & (SVrm != 0b1100) then
205 SVSHAPE3[0:31] <- SVSHAPE0[0:31]
206 # for FRA and FRT
207 SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule
208 # for cos coefficient
209 SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule
210 SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1
211 if (SVrm != 0b0100) & (SVrm != 0b1100) then
212 SVSHAPE3[28:29] <- 0b11 # size schedule
213 # set schedule up for (i)DCT Outer butterfly
214 if (SVrm = 0b0011) | (SVrm = 0b1011) then
215 # calculate O(N log2 N) number of outer butterfly overlapping adds
216 vlen[0:6] <- [0] * 7
217 n <- 0b000
218 size <- 0b0000001
219 itercount[0:6] <- (0b00 || SVxd) + 0b0000001
220 itercount[0:6] <- (0b0 || itercount[0:5])
221 do while n < 5
222 if SVxd[4-n] = 0 then
223 leave
224 n <- n + 1
225 count <- (itercount - 0b0000001) * size
226 vlen[0:6] <- vlen + count[7:13]
227 size[0:6] <- (size[1:6] || 0b0)
228 itercount[0:6] <- (0b0 || itercount[0:5])
229 # set up template in SVSHAPE0, then copy to 1-2
230 # set up FRB and FRS
231 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
232 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
233 mscale <- (0b0 || SVzd) + 1
234 if (SVrm = 0b1011) then
235 SVSHAPE0[30:31] <- 0b11 # iDCT mode
236 SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode
237 SVSHAPE0[21:23] <- 0b101 # "inverse" on outer and inner loop
238 else
239 SVSHAPE0[30:31] <- 0b01 # DCT mode
240 SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode
241 SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode
242 # copy
243 SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
244 SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
245 # for FRA and FRT
246 SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
247 # reset costable "striding" to 1
248 SVSHAPE2[12:17] <- 0b000000
249 # set schedule up for DCT COS table generation
250 if (SVrm = 0b0101) | (SVrm = 0b1101) then
251 # calculate O(N log2 N)
252 vlen[0:6] <- [0] * 7
253 itercount[0:6] <- (0b00 || SVxd) + 0b0000001
254 itercount[0:6] <- (0b0 || itercount[0:5])
255 n <- [0] * 3
256 do while n < 5
257 if SVxd[4-n] = 0 then
258 leave
259 n <- n + 1
260 vlen[0:6] <- vlen + itercount
261 itercount[0:6] <- (0b0 || itercount[0:5])
262 # set up template in SVSHAPE0, then copy to 1-2
263 # set up FRB and FRS
264 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
265 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
266 mscale <- (0b0 || SVzd) + 1
267 SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
268 SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode
269 if (SVrm = 0b0101) then
270 SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop for DCT
271 # copy
272 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
273 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
274 # for cos coefficient
275 SVSHAPE1[28:29] <- 0b10 # ci schedule
276 SVSHAPE2[28:29] <- 0b11 # size schedule
277 # set schedule up for iDCT / DCT inverse of half-swapped ordering
278 if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
279 vlen[0:6] <- (0b00 || SVxd) + 0b0000001
280 # set up template in SVSHAPE0
281 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
282 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
283 mscale <- (0b0 || SVzd) + 1
284 if (SVrm = 0b1110) then
285 SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap
286 if (SVrm = 0b1111) then
287 SVSHAPE0[30:31] <- 0b01 # FFT mode
288 else
289 SVSHAPE0[30:31] <- 0b11 # DCT mode
290 SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode
291 # set schedule up for parallel reduction
292 if (SVrm = 0b0111) then
293 # calculate the total number of operations (brute-force)
294 vlen[0:6] <- [0] * 7
295 itercount[0:6] <- (0b00 || SVxd) + 0b0000001
296 step[0:6] <- 0b0000001
297 i[0:6] <- 0b0000000
298 do while step <u itercount
299 newstep <- step[1:6] || 0b0
300 j[0:6] <- 0b0000000
301 do while (j+step <u itercount)
302 j <- j + newstep
303 i <- i + 1
304 step <- newstep
305 # VL in Parallel-Reduce is the number of operations
306 vlen[0:6] <- i
307 # set up template in SVSHAPE0, then copy to SVSHAPE1 (only two shapes are needed)
308 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
309 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
310 mscale <- (0b0 || SVzd) + 1
311 SVSHAPE0[30:31] <- 0b10 # parallel reduce submode
312 # copy
313 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
314 # set up right operand (left operand 28:29 is zero)
315 SVSHAPE1[28:29] <- 0b01 # right operand
316 # set VL, MVL and Vertical-First
317 m[0:12] <- vlen * mscale
318 maxvl[0:6] <- m[6:12]
319 SVSTATE[0:6] <- maxvl # MAXVL
320 SVSTATE[7:13] <- vlen # VL
321 SVSTATE[63] <- vf
322
323 Special Registers Altered:
324
325 None
326
327 # svindex
328
329 SVI-Form
330
331 * svindex SVG,rmm,SVd,ew,SVyx,mm,sk
332
333 Pseudo-code:
334
335 # based on nearest MAXVL compute other dimension
336 MVL <- SVSTATE[0:6]
337 d <- [0] * 6
338 dim <- SVd+1
339 do while d*dim <u ([0]*4 || MVL)
340 d <- d + 1
341 # set up template, then copy once location identified
342 shape <- [0]*32
343 shape[30:31] <- 0b00 # mode
344 if SVyx = 0 then
345 shape[18:20] <- 0b110 # indexed xd/yd
346 shape[0:5] <- (0b0 || SVd) # xdim
347 if sk = 0 then shape[6:11] <- 0 # ydim
348 else shape[6:11] <- 0b111111 # ydim max
349 else
350 shape[18:20] <- 0b111 # indexed yd/xd
351 if sk = 1 then shape[6:11] <- 0 # ydim
352 else shape[6:11] <- d-1 # ydim max
353 shape[0:5] <- (0b0 || SVd) # ydim
354 shape[12:17] <- (0b0 || SVG) # SVGPR
355 shape[28:29] <- ew # element-width override (NOTE(review): overwritten by the skip setting on the next two lines - confirm intended field)
356 if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
357 else shape[28:29] <- 0b00 # no skipping
358 # select the mode for updating SVSHAPEs
359 SVSTATE[62] <- mm # set or clear persistence
360 if mm = 0 then
361 # clear out all SVSHAPEs first
362 SVSHAPE0[0:31] <- [0] * 32
363 SVSHAPE1[0:31] <- [0] * 32
364 SVSHAPE2[0:31] <- [0] * 32
365 SVSHAPE3[0:31] <- [0] * 32
366 SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
367 SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
368 idx <- 0
369 for bit = 0 to 4
370 if rmm[4-bit] then
371 # activate requested shape
372 if idx = 0 then SVSHAPE0 <- shape
373 if idx = 1 then SVSHAPE1 <- shape
374 if idx = 2 then SVSHAPE2 <- shape
375 if idx = 3 then SVSHAPE3 <- shape
376 SVSTATE[bit*2+32:bit*2+33] <- idx
377 # increment shape index, modulo 4
378 if idx = 3 then idx <- 0
379 else idx <- idx + 1
380 else
381 # refined SVSHAPE/REMAP update mode
382 bit <- rmm[0:2]
383 idx <- rmm[3:4]
384 if idx = 0 then SVSHAPE0 <- shape
385 if idx = 1 then SVSHAPE1 <- shape
386 if idx = 2 then SVSHAPE2 <- shape
387 if idx = 3 then SVSHAPE3 <- shape
388 SVSTATE[bit*2+32:bit*2+33] <- idx
389 SVSTATE[46-bit] <- 1
390
391 Special Registers Altered:
392
393 None
394
395 # svshape2
396
397 SVM2-Form
398
399 * svshape2 SVo,SVyx,rmm,SVd,sk,mm
400
401 Pseudo-code:
402
403 # based on nearest MAXVL compute other dimension
404 MVL <- SVSTATE[0:6]
405 d <- [0] * 6
406 dim <- SVd+1
407 do while d*dim <u ([0]*4 || MVL)
408 d <- d + 1
409 # set up template, then copy once location identified
410 shape <- [0]*32
411 shape[30:31] <- 0b00 # mode
412 shape[0:5] <- (0b0 || SVd) # x/ydim
413 if SVyx = 0 then
414 shape[18:20] <- 0b000 # ordering xd/yd(/zd)
415 if sk = 0 then shape[6:11] <- 0 # ydim
416 else shape[6:11] <- 0b111111 # ydim max
417 else
418 shape[18:20] <- 0b010 # ordering yd/xd(/zd)
419 if sk = 1 then shape[6:11] <- 0 # ydim
420 else shape[6:11] <- d-1 # ydim max
421 # offset (the prime purpose of this instruction)
422 shape[24:27] <- SVo # offset
423 if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
424 else shape[28:29] <- 0b00 # no skipping
425 # select the mode for updating SVSHAPEs
426 SVSTATE[62] <- mm # set or clear persistence
427 if mm = 0 then
428 # clear out all SVSHAPEs first
429 SVSHAPE0[0:31] <- [0] * 32
430 SVSHAPE1[0:31] <- [0] * 32
431 SVSHAPE2[0:31] <- [0] * 32
432 SVSHAPE3[0:31] <- [0] * 32
433 SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
434 SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
435 idx <- 0
436 for bit = 0 to 4
437 if rmm[4-bit] then
438 # activate requested shape
439 if idx = 0 then SVSHAPE0 <- shape
440 if idx = 1 then SVSHAPE1 <- shape
441 if idx = 2 then SVSHAPE2 <- shape
442 if idx = 3 then SVSHAPE3 <- shape
443 SVSTATE[bit*2+32:bit*2+33] <- idx
444 # increment shape index, modulo 4
445 if idx = 3 then idx <- 0
446 else idx <- idx + 1
447 else
448 # refined SVSHAPE/REMAP update mode
449 bit <- rmm[0:2]
450 idx <- rmm[3:4]
451 if idx = 0 then SVSHAPE0 <- shape
452 if idx = 1 then SVSHAPE1 <- shape
453 if idx = 2 then SVSHAPE2 <- shape
454 if idx = 3 then SVSHAPE3 <- shape
455 SVSTATE[bit*2+32:bit*2+33] <- idx
456 SVSTATE[46-bit] <- 1
457
458 Special Registers Altered:
459
460 None
461