bitmanip.mdwn: avoid overflow for m variable
[openpower-isa.git] / openpower / isa / simplev.mdwn
1 <!-- This defines Draft SVP64 instructions to augment PowerISA Version 3.0 -->
2 <!-- These are not described in book 1 -->
3
4 # svstep
5
6 SVL-Form
7
8 * svstep RT,SVi,vf (Rc=0)
9 * svstep. RT,SVi,vf (Rc=1)
10
11 Pseudo-code:
12
13 if SVi[3:4] = 0b11 then
14 # SVi[3:4] = 0b11: store subvl, pack and unpack in SVSTATE (SVi[5:6] into SVSTATE[53:54])
15 SVSTATE[53] <- SVi[5]
16 SVSTATE[54] <- SVi[6]
17 RT <- [0]*62 || SVSTATE[53:54] # return the two bits just stored, zero-extended
18 else
19 step <- SVSTATE_NEXT(SVi, vf) # SVSTATE_NEXT is not defined in this section
20 RT <- [0]*57 || step # presumably step is 7 bits wide (57 + 7 = 64) - TODO confirm
21
22 Special Registers Altered:
23
24 CR0 (if Rc=1)
25
26 # setvl
27
28 SVL-Form
29
30 * setvl RT,RA,SVi,vf,vs,ms (Rc=0)
31 * setvl. RT,RA,SVi,vf,vs,ms (Rc=1)
32
33 Pseudo-code:
34
35 overflow <- 0b0 # raised below whenever VL is clamped; not read in this pseudo-code (presumably feeds CR0 - TODO confirm)
36 VLimm <- SVi + 1 # immediate value is SVi plus one
37 # set MVL from the immediate (ms = 1) or get it from SVSTATE[0:6]
38 if ms = 1 then MVL <- VLimm[0:6]
39 else MVL <- SVSTATE[0:6]
40 # set or get VL: vs = 0 reads SVSTATE[7:13], otherwise VL comes from (RA), the immediate or CTR
41 if vs = 0 then VL <- SVSTATE[7:13]
42 else if _RA != 0 then
43 if (RA) >u 0b1111111 then
44 VL <- 0b1111111 # saturate at the 7-bit maximum
45 overflow <- 0b1
46 else VL <- (RA)[57:63]
47 else if _RT = 0 then VL <- VLimm[0:6] # _RA = 0 and _RT = 0: use the immediate
48 else if CTR >u 0b1111111 then
49 VL <- 0b1111111 # saturate at the 7-bit maximum
50 overflow <- 0b1
51 else VL <- CTR[57:63] # _RA = 0 and _RT != 0: VL taken from CTR
52 # limit VL to within MVL, flagging overflow when it had to be reduced
53 if VL >u MVL then
54 overflow <- 0b1
55 VL <- MVL
56 SVSTATE[0:6] <- MVL
57 SVSTATE[7:13] <- VL
58 if _RT != 0 then
59 GPR(_RT) <- [0]*57 || VL # VL zero-extended into the target register
60 if ((¬vs) & ¬(ms)) = 0 then
61 # set requested Vertical-First mode, clear persist (done unless vs = 0 and ms = 0)
62 SVSTATE[63] <- vf
63 SVSTATE[62] <- 0b0
64
65 Special Registers Altered:
66
67 CR0 (if Rc=1)
68
69 # svremap
70
71 SVRM-Form
72
73 * svremap SVme,mi0,mi1,mi2,mo0,mo1,pst
74
75 Pseudo-code:
76
77 # 2-bit SVSHAPE0-3 indices for registers RA RB RC RT EA/FRS (mi0, mi1, mi2, mo0, mo1)
78 SVSTATE[32:33] <- mi0
79 SVSTATE[34:35] <- mi1
80 SVSTATE[36:37] <- mi2
81 SVSTATE[38:39] <- mo0
82 SVSTATE[40:41] <- mo1
83 # enable bits for RA RB RC RT EA/FRS (five bits, SVSTATE[42:46])
84 SVSTATE[42:46] <- SVme
85 # persistence bit (applies to more than one instruction)
86 SVSTATE[62] <- pst
87
88 Special Registers Altered:
89
90 None
91
92 # svshape
93
94 SVM-Form
95
96 * svshape SVxd,SVyd,SVzd,SVrm,vf
97
98 Pseudo-code:
99
100 # for convenience, VL to be calculated and stored in SVSTATE
101 vlen <- [0] * 7 # 7-bit VL, filled in by whichever SVrm mode matches below
102 mscale[0:5] <- 0b000001 # for scaling MAXVL (defaults to a scale of 1)
103 itercount[0:6] <- [0] * 7
104 SVSTATE[0:31] <- [0] * 32 # zero the first 32 bits of SVSTATE
105 # only overwrite REMAP (SVSTATE[32:46]) if "persistence" (SVSTATE[62]) is zero
106 if (SVSTATE[62] = 0b0) then
107 SVSTATE[32:33] <- 0b00
108 SVSTATE[34:35] <- 0b00
109 SVSTATE[36:37] <- 0b00
110 SVSTATE[38:39] <- 0b00
111 SVSTATE[40:41] <- 0b00
112 SVSTATE[42:46] <- 0b00000
113 SVSTATE[62] <- 0b0
114 SVSTATE[63] <- 0b0
115 # clear out all SVSHAPEs (each is written as 32 bits)
116 SVSHAPE0[0:31] <- [0] * 32
117 SVSHAPE1[0:31] <- [0] * 32
118 SVSHAPE2[0:31] <- [0] * 32
119 SVSHAPE3[0:31] <- [0] * 32
120 # set schedule up for multiply (SVrm = 0b0000)
121 if (SVrm = 0b0000) then
122 # VL in Matrix Multiply is xd*yd*zd, each dimension being its field value plus one
123 xd <- (0b00 || SVxd) + 1
124 yd <- (0b00 || SVyd) + 1
125 zd <- (0b00 || SVzd) + 1
126 n <- xd * yd * zd
127 vlen[0:6] <- n[14:20] # 7 bits of the product; presumably its low 7 bits (21-bit n) - TODO confirm widths
128 # set up template in SVSHAPE0, then copy to 1-3
129 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
130 SVSHAPE0[6:11] <- (0b0 || SVyd) # ydim
131 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim
132 SVSHAPE0[28:29] <- 0b11 # skip z (NOTE(review): 0b11 is labelled "skip y" for SVSHAPE2 below - confirm)
133 # copy
134 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
135 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
136 SVSHAPE3[0:31] <- SVSHAPE0[0:31]
137 # set up FRA (SVSHAPE1)
138 SVSHAPE1[18:20] <- 0b001 # permute x,z,y
139 SVSHAPE1[28:29] <- 0b01 # skip z
140 # FRC (SVSHAPE2)
141 SVSHAPE2[18:20] <- 0b001 # permute x,z,y
142 SVSHAPE2[28:29] <- 0b11 # skip y
143 # set schedule up for FFT butterfly (SVrm = 0b0001)
144 if (SVrm = 0b0001) then
145 # calculate O(N log2 N): n counts consecutive 1 bits of SVxd from SVxd[4] towards SVxd[0] (at most 5)
146 n <- [0] * 3
147 do while n < 5
148 if SVxd[4-n] = 0 then
149 leave
150 n <- n + 1
151 n <- ((0b0 || SVxd) + 1) * n # (SVxd + 1) multiplied by the bit count
152 vlen[0:6] <- n[1:7]
153 # set up template in SVSHAPE0, then copy to 1-2
154 # for FRA and FRT
155 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
156 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D FFT)
157 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
158 SVSHAPE0[30:31] <- 0b01 # Butterfly mode
159 # copy
160 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
161 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
162 # set up FRB and FRS
163 SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
164 # FRC (coefficients)
165 SVSHAPE2[28:29] <- 0b10 # k schedule
166 # set schedule up for (i)DCT Inner butterfly
167 # SVrm Mode 2 (Mode 10, 0b1010, for iDCT) is for pre-calculated coefficients,
168 # SVrm Mode 4 (Mode 12, 0b1100, for iDCT) is for on-the-fly (Vertical-First Mode)
169 if ((SVrm = 0b0010) | (SVrm = 0b0100) |
170 (SVrm = 0b1010) | (SVrm = 0b1100)) then
171 # calculate O(N log2 N): n counts consecutive 1 bits of SVxd from SVxd[4] towards SVxd[0] (at most 5)
172 n <- [0] * 3
173 do while n < 5
174 if SVxd[4-n] = 0 then
175 leave
176 n <- n + 1
177 n <- ((0b0 || SVxd) + 1) * n # (SVxd + 1) multiplied by the bit count
178 vlen[0:6] <- n[1:7]
179 # set up template in SVSHAPE0, then copy to 1-3 (SVSHAPE3 only for modes 0b0010 and 0b1010)
180 # set up FRB and FRS
181 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
182 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
183 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
184 if (SVrm = 0b1010) | (SVrm = 0b1100) then
185 SVSHAPE0[30:31] <- 0b11 # iDCT mode
186 SVSHAPE0[18:20] <- 0b011 # iDCT Inner Butterfly sub-mode
187 else
188 SVSHAPE0[30:31] <- 0b01 # DCT mode
189 SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode
190 SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop
191 if (SVrm = 0b1100) | (SVrm = 0b0100) then
192 SVSHAPE0[6:11] <- 0b000011 # (i)DCT Inner Butterfly mode 4
193 else
194 SVSHAPE0[6:11] <- 0b000001 # (i)DCT Inner Butterfly mode 2
195 # copy
196 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
197 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
198 if (SVrm != 0b0100) & (SVrm != 0b1100) then
199 SVSHAPE3[0:31] <- SVSHAPE0[0:31]
200 # for FRA and FRT
201 SVSHAPE0[28:29] <- 0b01 # j+halfstep schedule
202 # for cos coefficient
203 SVSHAPE2[28:29] <- 0b10 # ci (k for mode 4) schedule
204 SVSHAPE2[12:17] <- 0b000000 # reset costable "striding" to 1
205 if (SVrm != 0b0100) & (SVrm != 0b1100) then
206 SVSHAPE3[28:29] <- 0b11 # size schedule
207 # set schedule up for (i)DCT Outer butterfly (SVrm = 0b0011, or 0b1011 for iDCT)
208 if (SVrm = 0b0011) | (SVrm = 0b1011) then
209 # calculate O(N log2 N) number of outer butterfly overlapping adds
210 vlen[0:6] <- [0] * 7
211 n <- 0b000
212 size <- 0b0000001
213 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 # start from SVxd plus one
214 itercount[0:6] <- (0b0 || itercount[0:5]) # halve (shift right one bit)
215 do while n < 5
216 if SVxd[4-n] = 0 then
217 leave
218 n <- n + 1
219 count <- (itercount - 0b0000001) * size
220 vlen[0:6] <- vlen + count[7:13]
221 size[0:6] <- (size[1:6] || 0b0) # double (shift left one bit)
222 itercount[0:6] <- (0b0 || itercount[0:5]) # halve (shift right one bit)
223 # set up template in SVSHAPE0, then copy to 1-2
224 # set up FRB and FRS
225 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
226 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
227 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
228 if (SVrm = 0b1011) then
229 SVSHAPE0[30:31] <- 0b11 # iDCT mode
230 SVSHAPE0[18:20] <- 0b011 # iDCT Outer Butterfly sub-mode
231 SVSHAPE0[21:23] <- 0b101 # "inverse" on outer and inner loop
232 else
233 SVSHAPE0[30:31] <- 0b01 # DCT mode
234 SVSHAPE0[18:20] <- 0b100 # DCT Outer Butterfly sub-mode
235 SVSHAPE0[6:11] <- 0b000010 # DCT Butterfly mode
236 # copy
237 SVSHAPE1[0:31] <- SVSHAPE0[0:31] # j+halfstep schedule
238 SVSHAPE2[0:31] <- SVSHAPE0[0:31] # costable coefficients
239 # for FRA and FRT
240 SVSHAPE1[28:29] <- 0b01 # j+halfstep schedule
241 # reset costable "striding" to 1
242 SVSHAPE2[12:17] <- 0b000000
243 # set schedule up for DCT COS table generation (SVrm = 0b0101 or 0b1101)
244 if (SVrm = 0b0101) | (SVrm = 0b1101) then
245 # calculate O(N log2 N): sum of the successively halved iteration counts
246 vlen[0:6] <- [0] * 7
247 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 # start from SVxd plus one
248 itercount[0:6] <- (0b0 || itercount[0:5]) # halve (shift right one bit)
249 n <- [0] * 3
250 do while n < 5
251 if SVxd[4-n] = 0 then
252 leave
253 n <- n + 1
254 vlen[0:6] <- vlen + itercount
255 itercount[0:6] <- (0b0 || itercount[0:5]) # halve (shift right one bit)
256 # set up template in SVSHAPE0, then copy to 1-2
257 # set up FRB and FRS
258 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
259 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
260 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
261 SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode
262 SVSHAPE0[6:11] <- 0b000100 # DCT Inner Butterfly COS-gen mode
263 if (SVrm = 0b0101) then
264 SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop for DCT
265 # copy
266 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
267 SVSHAPE2[0:31] <- SVSHAPE0[0:31]
268 # for cos coefficient
269 SVSHAPE1[28:29] <- 0b10 # ci schedule
270 SVSHAPE2[28:29] <- 0b11 # size schedule
271 # set schedule up for iDCT / DCT inverse of half-swapped ordering (SVrm = 0b0110, 0b1110 or 0b1111)
272 if (SVrm = 0b0110) | (SVrm = 0b1110) | (SVrm = 0b1111) then
273 vlen[0:6] <- (0b00 || SVxd) + 0b0000001 # VL is SVxd plus one
274 # set up template in SVSHAPE0 (no other SVSHAPE is written in this mode)
275 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
276 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
277 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
278 if (SVrm = 0b1110) then
279 SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap
280 if (SVrm = 0b1111) then
281 SVSHAPE0[30:31] <- 0b01 # FFT mode
282 else
283 SVSHAPE0[30:31] <- 0b11 # DCT mode (NOTE(review): 0b11 is labelled "iDCT mode" in the butterfly sections - confirm)
284 SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode
285 # set schedule up for parallel reduction (SVrm = 0b0111)
286 if (SVrm = 0b0111) then
287 # calculate the total number of operations (brute-force): i counts inner-loop iterations
288 vlen[0:6] <- [0] * 7
289 itercount[0:6] <- (0b00 || SVxd) + 0b0000001 # SVxd plus one
290 step[0:6] <- 0b0000001
291 i[0:6] <- 0b0000000
292 do while step <u itercount
293 newstep <- step[1:6] || 0b0 # double step (shift left one bit)
294 j[0:6] <- 0b0000000
295 do while (j+step <u itercount)
296 j <- j + newstep
297 i <- i + 1
298 step <- newstep
299 # VL in Parallel-Reduce is the number of operations
300 vlen[0:6] <- i
301 # set up template in SVSHAPE0, then copy to 1. only 2 needed
302 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim
303 SVSHAPE0[12:17] <- (0b0 || SVzd) # zdim - "striding" (2D DCT)
304 mscale <- (0b0 || SVzd) + 1 # MAXVL scale factor: SVzd plus one
305 SVSHAPE0[30:31] <- 0b10 # parallel reduce submode
306 # copy
307 SVSHAPE1[0:31] <- SVSHAPE0[0:31]
308 # set up right operand (left operand 28:29 is zero)
309 SVSHAPE1[28:29] <- 0b01 # right operand
310 # set VL, MVL and Vertical-First
311 m[0:12] <- vlen * mscale # 13-bit product of VL and the MAXVL scale
312 maxvl[0:6] <- m[6:12] # keep the last 7 bits of the product
313 SVSTATE[0:6] <- maxvl # MAXVL
314 SVSTATE[7:13] <- vlen # VL
315 SVSTATE[63] <- vf
316
317 Special Registers Altered:
318
319 None
320
321 # svindex
322
323 SVI-Form
324
325 * svindex SVG,rmm,SVd,ew,SVyx,mm,sk
326
327 Pseudo-code:
328
329 # based on nearest MAXVL compute other dimension: d is raised until d*dim is no longer below MVL
330 MVL <- SVSTATE[0:6]
331 d <- [0] * 6
332 dim <- SVd+1
333 do while d*dim <u ([0]*4 || MVL)
334 d <- d + 1
335 # set up template, then copy once location identified
336 shape <- [0]*32
337 shape[30:31] <- 0b00 # mode
338 if SVyx = 0 then
339 shape[18:20] <- 0b110 # indexed xd/yd
340 shape[0:5] <- (0b0 || SVd) # xdim
341 if sk = 0 then shape[6:11] <- 0 # ydim
342 else shape[6:11] <- 0b111111 # ydim max
343 else
344 shape[18:20] <- 0b111 # indexed yd/xd
345 if sk = 1 then shape[6:11] <- 0 # ydim
346 else shape[6:11] <- d-1 # ydim max
347 shape[0:5] <- (0b0 || SVd) # ydim
348 shape[12:17] <- (0b0 || SVG) # SVGPR
349 shape[28:29] <- ew # element-width override
350 shape[21] <- sk # skip 1st dimension
351 # select the mode for updating SVSHAPEs
352 SVSTATE[62] <- mm # set or clear persistence
353 if mm = 0 then
354 # clear out all SVSHAPEs first
355 SVSHAPE0[0:31] <- [0] * 32
356 SVSHAPE1[0:31] <- [0] * 32
357 SVSHAPE2[0:31] <- [0] * 32
358 SVSHAPE3[0:31] <- [0] * 32
359 SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
360 SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
361 idx <- 0 # next SVSHAPE to hand out
362 for bit = 0 to 4
363 if rmm[4-bit] then
364 # activate requested shape: each set rmm bit receives the next SVSHAPE in turn
365 if idx = 0 then SVSHAPE0 <- shape
366 if idx = 1 then SVSHAPE1 <- shape
367 if idx = 2 then SVSHAPE2 <- shape
368 if idx = 3 then SVSHAPE3 <- shape
369 SVSTATE[bit*2+32:bit*2+33] <- idx
370 # increment shape index, modulo 4
371 if idx = 3 then idx <- 0
372 else idx <- idx + 1
373 else
374 # refined SVSHAPE/REMAP update mode: rmm[0:2] picks the REMAP field, rmm[3:4] the SVSHAPE to overwrite
375 bit <- rmm[0:2]
376 idx <- rmm[3:4]
377 if idx = 0 then SVSHAPE0 <- shape
378 if idx = 1 then SVSHAPE1 <- shape
379 if idx = 2 then SVSHAPE2 <- shape
380 if idx = 3 then SVSHAPE3 <- shape
381 SVSTATE[bit*2+32:bit*2+33] <- idx
382 SVSTATE[46-bit] <- 1 # set the matching enable bit; assumes bit <= 4 so the index stays in SVSTATE[42:46] - TODO confirm
383
384 Special Registers Altered:
385
386 None
387
388 # svshape2
389
390 SVM2-Form
391
392 * svshape2 SVo,SVyx,rmm,SVd,sk,mm
393
394 Pseudo-code:
395
396 # based on nearest MAXVL compute other dimension: d is raised until d*dim is no longer below MVL
397 MVL <- SVSTATE[0:6]
398 d <- [0] * 6
399 dim <- SVd+1
400 do while d*dim <u ([0]*4 || MVL)
401 d <- d + 1
402 # set up template, then copy once location identified
403 shape <- [0]*32
404 shape[30:31] <- 0b00 # mode
405 shape[0:5] <- (0b0 || SVd) # x/ydim
406 if SVyx = 0 then
407 shape[18:20] <- 0b000 # ordering xd/yd(/zd)
408 if sk = 0 then shape[6:11] <- 0 # ydim
409 else shape[6:11] <- 0b111111 # ydim max
410 else
411 shape[18:20] <- 0b010 # ordering yd/xd(/zd)
412 if sk = 1 then shape[6:11] <- 0 # ydim
413 else shape[6:11] <- d-1 # ydim max
414 # offset (the prime purpose of this instruction)
415 shape[24:27] <- SVo # offset
416 if sk = 1 then shape[28:29] <- 0b01 # skip 1st dimension
417 else shape[28:29] <- 0b00 # no skipping
418 # select the mode for updating SVSHAPEs
419 SVSTATE[62] <- mm # set or clear persistence
420 if mm = 0 then
421 # clear out all SVSHAPEs first
422 SVSHAPE0[0:31] <- [0] * 32
423 SVSHAPE1[0:31] <- [0] * 32
424 SVSHAPE2[0:31] <- [0] * 32
425 SVSHAPE3[0:31] <- [0] * 32
426 SVSTATE[32:41] <- [0] * 10 # clear REMAP.mi/o
427 SVSTATE[42:46] <- rmm # rmm exactly REMAP.SVme
428 idx <- 0 # next SVSHAPE to hand out
429 for bit = 0 to 4
430 if rmm[4-bit] then
431 # activate requested shape: each set rmm bit receives the next SVSHAPE in turn
432 if idx = 0 then SVSHAPE0 <- shape
433 if idx = 1 then SVSHAPE1 <- shape
434 if idx = 2 then SVSHAPE2 <- shape
435 if idx = 3 then SVSHAPE3 <- shape
436 SVSTATE[bit*2+32:bit*2+33] <- idx
437 # increment shape index, modulo 4
438 if idx = 3 then idx <- 0
439 else idx <- idx + 1
440 else
441 # refined SVSHAPE/REMAP update mode: rmm[0:2] picks the REMAP field, rmm[3:4] the SVSHAPE to overwrite
442 bit <- rmm[0:2]
443 idx <- rmm[3:4]
444 if idx = 0 then SVSHAPE0 <- shape
445 if idx = 1 then SVSHAPE1 <- shape
446 if idx = 2 then SVSHAPE2 <- shape
447 if idx = 3 then SVSHAPE3 <- shape
448 SVSTATE[bit*2+32:bit*2+33] <- idx
449 SVSTATE[46-bit] <- 1 # set the matching enable bit; assumes bit <= 4 so the index stays in SVSTATE[42:46] - TODO confirm
450
451 Special Registers Altered:
452
453 None
454