.set img_ptr, 3
.set stride, 4
.set var, 5
.set bd, 6 # bitdepth_min_8

.set pred, 3 # predicate for last stage, reuse r3

.set ptr_copy, 7 # copy of img_ptr
.set ptr_orig, 2 # second copy of img_ptr, kept for the reloads below

.set max, 2 # max result (reuses r2 once ptr_orig is no longer needed)
.set retval, 3 # return value

.set divt, 8 # div_table[15]
.set cost, 24 # cost array, 8 elements
.set img, 32 # img array, 8x8 = 64 elements
.set psum, 96 # partial sums are accumulated here
.set psum_alt, 64 # reuses the upper half of img once those values are consumed


.machine libresoc
.file "cdef_tmpl_svp64_real.c"
.abiversion 2
.section ".text"
.align 2
.globl cdef_find_dir_svp64_real
.type cdef_find_dir_svp64_real, @function
cdef_find_dir_svp64_real:
.L0:
.cfi_startproc
# Load the div_table array; in the C reference it is
# div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
# To make the calculations easier, we mirror the same elements in reverse,
# insert 105 in the middle, and simply set VL=15.
li divt+0, 840
li divt+1, 420
li divt+2, 280
li divt+3, 210
li divt+4, 168
li divt+5, 140
li divt+6, 120
li divt+7, 105
li divt+8, 120
li divt+9, 140
li divt+10, 168
li divt+11, 210
li divt+12, 280
li divt+13, 420
li divt+14, 840
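# For reference, a sketch of the mirrored layout, assuming the 7-entry
# table from the C reference (div_table15 is an illustrative name):
#   static const int div_table7[7]   = { 840, 420, 280, 210, 168, 140, 120 };
#   static const int div_table15[15] = { 840, 420, 280, 210, 168, 140, 120, 105,
#                                        120, 140, 168, 210, 280, 420, 840 };
#   // div_table15[n] == div_table7[n] for n < 7, 105 at n == 7, and
#   // div_table7[14 - n] for n > 7, so one 15-wide multiply covers
#   // both the n and 14 - n terms of the cost loops below.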

mr ptr_copy, img_ptr
mr ptr_orig, img_ptr

.L1:
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
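# A rough C equivalent of this load/normalise step (a sketch; assumes
# 16-bit pixels as sv.lha implies, and a stride given in bytes):
#   const int16_t *p = img_ptr;
#   int img[64];
#   for (int y = 0; y < 8; y++) {
#       for (int x = 0; x < 8; x++)
#           img[y * 8 + x] = (p[x] >> bitdepth_min_8) - 128; // px
#       p = (const int16_t *)((const char *)p + stride);
#   }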

# Zero psum registers for partial_sum_hv
setvl 0,0,16,0,1,1 # Set VL to 16 elements
sv.ori *psum, 0, 0

# First do the horizontal partial sums:
# partial_sum_hv[0][y] += px;
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.add/mr psum+0, psum+0, *img+0
sv.add/mr psum+1, psum+1, *img+8
sv.add/mr psum+2, psum+2, *img+16
sv.add/mr psum+3, psum+3, *img+24
sv.add/mr psum+4, psum+4, *img+32
sv.add/mr psum+5, psum+5, *img+40
sv.add/mr psum+6, psum+6, *img+48
sv.add/mr psum+7, psum+7, *img+56

# Next the vertical partial sums:
# partial_sum_hv[1][x] += px;
sv.add/mr *psum+8, *psum+8, *img+0
sv.add/mr *psum+8, *psum+8, *img+8
sv.add/mr *psum+8, *psum+8, *img+16
sv.add/mr *psum+8, *psum+8, *img+24
sv.add/mr *psum+8, *psum+8, *img+32
sv.add/mr *psum+8, *psum+8, *img+40
sv.add/mr *psum+8, *psum+8, *img+48
sv.add/mr *psum+8, *psum+8, *img+56

# Zero cost registers
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.ori *cost, 0, 0

# cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
sv.maddld/mr cost+2, *psum, *psum, cost+2
# cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6

# cost[2] *= 105
# cost[6] *= 105
mulli cost+2, cost+2, 105
mulli cost+6, cost+6, 105
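# For reference, the scalar C this stage implements (px(x, y) is an
# illustrative helper standing for the normalised pixel computed above):
#   int partial_sum_hv[2][8] = { { 0 } };
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 8; x++) {
#           partial_sum_hv[0][y] += px(x, y); // horizontal: one sum per row
#           partial_sum_hv[1][x] += px(x, y); // vertical: one sum per column
#       }
#   for (int n = 0; n < 8; n++) {
#       cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
#       cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
#   }
#   cost[2] *= 105;
#   cost[6] *= 105;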

# We're done with the partial_sum_hv values, so we can reuse the registers
# for partial_sum_diag
# Zero psum registers for partial_sum_diag
setvl 0,0,30,0,1,1 # Set VL to 30 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# First row of diagonal partial sums:
# partial_sum_diag[0][y + x] += px;
sv.add/mr *psum+0, *psum+0, *img+0
sv.add/mr *psum+1, *psum+1, *img+8
sv.add/mr *psum+2, *psum+2, *img+16
sv.add/mr *psum+3, *psum+3, *img+24
sv.add/mr *psum+4, *psum+4, *img+32
sv.add/mr *psum+5, *psum+5, *img+40
sv.add/mr *psum+6, *psum+6, *img+48
sv.add/mr *psum+7, *psum+7, *img+56

# Second row of diagonal partial sums:
# partial_sum_diag[1][7 + y - x] += px;
sv.add/mr *psum+15, *psum+15, *img+56
sv.add/mr *psum+16, *psum+16, *img+48
sv.add/mr *psum+17, *psum+17, *img+40
sv.add/mr *psum+18, *psum+18, *img+32
sv.add/mr *psum+19, *psum+19, *img+24
sv.add/mr *psum+20, *psum+20, *img+16
sv.add/mr *psum+21, *psum+21, *img+8
sv.add/mr *psum+22, *psum+22, *img+0
# These are accumulated in reverse order, but since they only feed a
# sum of squares weighted by the symmetric div_table, order does not matter.

# cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
#             partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
# Produce squares of all values
setvl 0,0,15,0,1,1 # Set VL to 15 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+0, *psum+0, cost+0

# Similarly for cost[4]
# cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
#             partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
sv.mulld *psum+15, *psum+15, *psum+15
sv.mulld *psum+15, *psum+15, *divt
sv.add/mr cost+4, *psum+15, cost+4
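# The mirrored 15-entry divt folds the reference loop into a single pass.
# A sketch of the equivalence (ps standing for partial_sum_diag[0]):
#   // Reference form: 7 mirrored pairs plus the middle term
#   for (int n = 0; n < 7; n++)
#       cost[0] += (ps[n] * ps[n] + ps[14 - n] * ps[14 - n]) * div_table7[n];
#   cost[0] += ps[7] * ps[7] * 105;
#   // Folded form computed above: one 15-element square/multiply/reduce
#   for (int n = 0; n < 15; n++)
#       cost[0] += ps[n] * ps[n] * div_table15[n];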

# First row of alt partial sums:
# partial_sum_alt [0][y + (x >> 1)] += px;
# These are essentially calculated the following way
# (horiz axis: x, vert axis: y, quantity y + (x >> 1)):
#
# We calculate this in a similar manner to the diagonal
# partial sums, but first we have to do pair-wise addition
# on all the elements of the img matrix, compressing the rows
# to half size in the process
#
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |      | 0 | 0 | 1 | 2 | 3 |
# | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |      | 1 |   | 1 | 2 | 3 | 4 |
# | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |      | 2 |   | 2 | 3 | 4 | 5 |
# | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |  ->  | 3 |   | 3 | 4 | 5 | 6 |
# | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |      | 4 |   | 4 | 5 | 6 | 7 |
# | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |      | 5 |   | 5 | 6 | 7 | 8 |
# | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |      | 6 |   | 6 | 7 | 8 | 9 |
# | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |      | 7 |   | 7 | 8 | 9 | a |
#
setvl 0,0,16,0,1,1 # Set VL to 16 elements
ori pred, 0, 0b0101010101010101
sv.add/sm=r3 *psum+0, *img, *img+1
sv.add/sm=r3 *psum+16, *img+16, *img+17
# Copy the even-numbered registers only
sv.ori/sm=r3 *img+0, *psum+0, 0
sv.ori/sm=r3 *img+8, *psum+16, 0
# Process the next 32 elements
sv.add/sm=r3 *psum+0, *img+32, *img+33
sv.add/sm=r3 *psum+16, *img+48, *img+49
# Copy their sums (again even-numbered registers only)
sv.ori/sm=r3 *img+16, *psum+0, 0
sv.ori/sm=r3 *img+24, *psum+16, 0
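# A sketch of the pair-wise compression just performed (cimg is an
# illustrative name for the half-width intermediate):
#   int cimg[8][4];
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 4; x++)
#           cimg[y][x] = img[y * 8 + 2 * x] + img[y * 8 + 2 * x + 1];
#   // After compression, element (y, x) of cimg contributes to
#   // partial_sum_alt[0][y + x], which the shifted adds below implement.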

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum_alt, 0, 0

setvl 0,0,4,0,1,1 # Set VL to 4 elements
sv.add *psum_alt+0, *psum_alt+0, *img+0
sv.add *psum_alt+1, *psum_alt+1, *img+4
sv.add *psum_alt+2, *psum_alt+2, *img+8
sv.add *psum_alt+3, *psum_alt+3, *img+12
sv.add *psum_alt+4, *psum_alt+4, *img+16
sv.add *psum_alt+5, *psum_alt+5, *img+20
sv.add *psum_alt+6, *psum_alt+6, *img+24
sv.add *psum_alt+7, *psum_alt+7, *img+28

# We need to reshape div_table to ease the calculations:
# elements 3-7 will be multiplied by 105, while elements 0-2 and 8-10
# will be multiplied by 420, 210, 140 and 140, 210, 420 respectively, so:
li divt+0, 420
li divt+1, 210
li divt+2, 140
setvl 0,0,5,0,1,1 # Set VL to 5 elements
sv.ori *divt+3, 0, 105
li divt+8, 140
li divt+9, 210
li divt+10, 420

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[1] += partial_sum_alt[0][3 + m] * partial_sum_alt[0][3 + m];
# cost[1] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[1] += (partial_sum_alt[0][m] * partial_sum_alt[0][m] +
#                 partial_sum_alt[0][10 - m] * partial_sum_alt[0][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
sv.mulld *psum_alt+0, *psum_alt+0, *divt
sv.add/mr cost+1, *psum_alt+0, cost+1
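# With the reshaped table, that whole computation collapses to one
# 11-element pass. A sketch of the equivalence (pa standing for
# partial_sum_alt[0]):
#   // divt = { 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420 }
#   for (int m = 0; m < 11; m++)
#       cost[1] += pa[m] * pa[m] * divt[m];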

# Next row of partial_sum_alts,
# partial_sum_alt [1][3 + y - (x >> 1)] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 3 | 3 | 2 | 2 | 1 | 1 | 0 | 0 |      | 0 |   | 3 | 2 | 1 | 0 |
# | 1 | 4 | 4 | 3 | 3 | 2 | 2 | 1 | 1 |      | 1 |   | 4 | 3 | 2 | 1 |
# | 2 | 5 | 5 | 4 | 4 | 3 | 3 | 2 | 2 |      | 2 |   | 5 | 4 | 3 | 2 |
# | 3 | 6 | 6 | 5 | 5 | 4 | 4 | 3 | 3 |  ->  | 3 |   | 6 | 5 | 4 | 3 |
# | 4 | 7 | 7 | 6 | 6 | 5 | 5 | 4 | 4 |      | 4 |   | 7 | 6 | 5 | 4 |
# | 5 | 8 | 8 | 7 | 7 | 6 | 6 | 5 | 5 |      | 5 |   | 8 | 7 | 6 | 5 |
# | 6 | 9 | 9 | 8 | 8 | 7 | 7 | 6 | 6 |      | 6 |   | 9 | 8 | 7 | 6 |
# | 7 | a | a | 9 | 9 | 8 | 8 | 7 | 7 |      | 7 | a | 9 | 8 | 7 |

setvl 0,0,32,0,1,1 # clear everything
sv.ori *psum, 0, 0

# Same method, but unfortunately we now have to load img again.
# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
mr ptr_copy, ptr_orig
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128

setvl 0,0,16,0,1,1 # Set VL to 16 elements
ori pred, 0, 0b0101010101010101
sv.add/sm=r3 *psum+0, *img, *img+1
sv.add/sm=r3 *psum+16, *img+16, *img+17
# Copy the even-numbered registers only
sv.ori/sm=r3 *img+0, *psum+0, 0
sv.ori/sm=r3 *img+8, *psum+16, 0
# Process the next 32 elements
sv.add/sm=r3 *psum+0, *img+32, *img+33
sv.add/sm=r3 *psum+16, *img+48, *img+49
# Copy their sums (again even-numbered registers only)
sv.ori/sm=r3 *img+16, *psum+0, 0
sv.ori/sm=r3 *img+24, *psum+16, 0

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum_alt, 0, 0

setvl 0,0,4,0,1,1 # Set VL to 4 elements
sv.add *psum_alt+7, *psum_alt+7, *img+0
sv.add *psum_alt+6, *psum_alt+6, *img+4
sv.add *psum_alt+5, *psum_alt+5, *img+8
sv.add *psum_alt+4, *psum_alt+4, *img+12
sv.add *psum_alt+3, *psum_alt+3, *img+16
sv.add *psum_alt+2, *psum_alt+2, *img+20
sv.add *psum_alt+1, *psum_alt+1, *img+24
sv.add *psum_alt+0, *psum_alt+0, *img+28
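# Note: each 4-wide slice lands with its bin indices reversed relative
# to the true partial_sum_alt[1] layout (code bin m holds true bin
# 10 - m). Since cost[3] below is a sum of squares weighted by the
# symmetric divt (divt[m] == divt[10 - m]), the reversal is harmless:
#   sum(pa[10 - m]^2 * divt[m]) == sum(pa[m]^2 * divt[m])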

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[3] += partial_sum_alt[1][3 + m] * partial_sum_alt[1][3 + m];
# cost[3] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[3] += (partial_sum_alt[1][m] * partial_sum_alt[1][m] +
#                 partial_sum_alt[1][10 - m] * partial_sum_alt[1][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
sv.mulld *psum_alt+0, *psum_alt+0, *divt
sv.add/mr cost+3, *psum_alt+0, cost+3

#setvl 0,0,64,0,1,1 # clear everything
#sv.ori *img, 0, 0
#setvl 0,0,32,0,1,1 # clear everything
#sv.ori *psum, 0, 0

# Next row of partial_sum_alts,
# partial_sum_alt [2][3 - (y >> 1) + x] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 0 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 1 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 2 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 3 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 4 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 5 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |

# We calculate this in a similar manner to the diagonal partial sums,
# but first we have to do pair-wise addition, this time across rows,
# on all the elements of the img matrix, compressing the columns
# to half size in the process.

# Similar method; unfortunately we have to load img yet again.
# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
mr ptr_copy, ptr_orig
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# sum rows 0 & 1, index +3
sv.add *psum+3, *psum+3, *img+0
sv.add *psum+3, *psum+3, *img+8
# sum rows 2 & 3, index +2
sv.add *psum+2, *psum+2, *img+16
sv.add *psum+2, *psum+2, *img+24
# sum rows 4 & 5, index +1
sv.add *psum+1, *psum+1, *img+32
sv.add *psum+1, *psum+1, *img+40
# sum rows 6 & 7, index +0
sv.add *psum+0, *psum+0, *img+48
sv.add *psum+0, *psum+0, *img+56
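# A sketch of what the eight adds above compute: pairs of rows share
# the same (y >> 1), so each row pair is summed into one 8-wide slot,
# placed at decreasing offsets +3, +2, +1, +0:
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 8; x++)
#           partial_sum_alt[2][3 - (y >> 1) + x] += px(x, y);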

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[5] += partial_sum_alt[2][3 + m] * partial_sum_alt[2][3 + m];
# cost[5] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[5] += (partial_sum_alt[2][m] * partial_sum_alt[2][m] +
#                 partial_sum_alt[2][10 - m] * partial_sum_alt[2][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+5, *psum+0, cost+5

# Next row of partial_sum_alts,
# partial_sum_alt [3][(y >> 1) + x] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 2 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 3 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 4 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 5 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 6 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 6 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 7 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 7 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |

# This calculation is similar to the previous one, and since we have
# enough registers available, we don't have to reload img.

setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# sum rows 0 & 1, index +0
sv.add *psum+0, *psum+0, *img+0
sv.add *psum+0, *psum+0, *img+8
# sum rows 2 & 3, index +1
sv.add *psum+1, *psum+1, *img+16
sv.add *psum+1, *psum+1, *img+24
# sum rows 4 & 5, index +2
sv.add *psum+2, *psum+2, *img+32
sv.add *psum+2, *psum+2, *img+40
# sum rows 6 & 7, index +3
sv.add *psum+3, *psum+3, *img+48
sv.add *psum+3, *psum+3, *img+56

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[7] += partial_sum_alt[3][3 + m] * partial_sum_alt[3][3 + m];
# cost[7] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[7] += (partial_sum_alt[3][m] * partial_sum_alt[3][m] +
#                 partial_sum_alt[3][10 - m] * partial_sum_alt[3][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+7, *psum+0, cost+7

mr max, cost+5
setvl 0,0,8,0,1,1 # Set VL to 8 elements
#sv.minmax/mr max, max, *cost, 3 # MMM=maxs
sv.cmp 0, 0, *cost, max
svstep retval, 5, 1
# sv.addi/m=eq retval,*,0
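# For reference, a sketch of the selection this final stage still has
# to produce, assuming the scalar C takes dav1d's cdef_find_dir form
# (the var computation is not implemented above):
#   int best_dir = 0;
#   unsigned best_cost = cost[0];
#   for (int n = 1; n < 8; n++)
#       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
#   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
#   return best_dir;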
blr
.long 0
.byte 0,0,0,0,0,0,0,0
.cfi_endproc
.LFE27:
.size cdef_find_dir_svp64_real,.-cdef_find_dir_svp64_real
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",@progbits