.set img_ptr, 3
.set stride, 4
.set var, 5
.set bd, 6 # bitdepth_min_8

.set pred, 3 # predicate for last stage, reuse r3

.set ptr_copy, 7 # copy of img_ptr
.set ptr_orig, 2 # second copy of img_ptr, kept for the reloads below

.set max, 2 # max result (reuses r2 once ptr_orig is no longer needed)
.set retval, 3 # return value

.set divt, 8 # div_table[15]
.set cost, 24 # cost array, 8 elements
.set img, 32 # img array, 8x8 = 64 elements
.set psum, 96 # partial sums are accumulated here
.set psum_alt, 64 # reuses the upper half of img once those values are consumed


.machine libresoc
.file "cdef_tmpl_svp64_real.c"
.abiversion 2
.section ".text"
.align 2
.globl cdef_find_dir_svp64_real
.type cdef_find_dir_svp64_real, @function
cdef_find_dir_svp64_real:
.L0:
.cfi_startproc
# Load the div_table array; in the C reference it is
# div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
# To make the calculations easier, we mirror the same elements in reverse,
# insert 105 in the middle, and simply set VL=15.
li divt+0, 840
li divt+1, 420
li divt+2, 280
li divt+3, 210
li divt+4, 168
li divt+5, 140
li divt+6, 120
li divt+7, 105
li divt+8, 120
li divt+9, 140
li divt+10, 168
li divt+11, 210
li divt+12, 280
li divt+13, 420
li divt+14, 840
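# For reference, a sketch of the mirrored layout, assuming the 7-entry
# table from the C reference (div_table15 is an illustrative name):
#   static const int div_table7[7]   = { 840, 420, 280, 210, 168, 140, 120 };
#   static const int div_table15[15] = { 840, 420, 280, 210, 168, 140, 120, 105,
#                                        120, 140, 168, 210, 280, 420, 840 };
#   // div_table15[n] == div_table7[n] for n < 7, 105 at n == 7, and
#   // div_table7[14 - n] for n > 7, so one 15-wide multiply covers
#   // both the n and 14 - n terms of the cost loops below.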

mr ptr_copy, img_ptr
mr ptr_orig, img_ptr

.L1:
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
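# A rough C equivalent of this load/normalise step (a sketch; assumes
# 16-bit pixels as sv.lha implies, and a stride given in bytes):
#   const int16_t *p = img_ptr;
#   int img[64];
#   for (int y = 0; y < 8; y++) {
#       for (int x = 0; x < 8; x++)
#           img[y * 8 + x] = (p[x] >> bitdepth_min_8) - 128; // px
#       p = (const int16_t *)((const char *)p + stride);
#   }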

# Zero psum registers for partial_sum_hv
setvl 0,0,16,0,1,1 # Set VL to 16 elements
sv.ori *psum, 0, 0

# First do the horizontal partial sums:
# partial_sum_hv[0][y] += px;
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.add/mr psum+0, psum+0, *img+0
sv.add/mr psum+1, psum+1, *img+8
sv.add/mr psum+2, psum+2, *img+16
sv.add/mr psum+3, psum+3, *img+24
sv.add/mr psum+4, psum+4, *img+32
sv.add/mr psum+5, psum+5, *img+40
sv.add/mr psum+6, psum+6, *img+48
sv.add/mr psum+7, psum+7, *img+56

# Next the vertical partial sums:
# partial_sum_hv[1][x] += px;
sv.add/mr *psum+8, *psum+8, *img+0
sv.add/mr *psum+8, *psum+8, *img+8
sv.add/mr *psum+8, *psum+8, *img+16
sv.add/mr *psum+8, *psum+8, *img+24
sv.add/mr *psum+8, *psum+8, *img+32
sv.add/mr *psum+8, *psum+8, *img+40
sv.add/mr *psum+8, *psum+8, *img+48
sv.add/mr *psum+8, *psum+8, *img+56

# Zero cost registers
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.ori *cost, 0, 0

# cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
sv.maddld/mr cost+2, *psum, *psum, cost+2
# cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6

# cost[2] *= 105
# cost[6] *= 105
mulli cost+2, cost+2, 105
mulli cost+6, cost+6, 105
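# For reference, the scalar C this stage implements (px(x, y) is an
# illustrative helper standing for the normalised pixel computed above):
#   int partial_sum_hv[2][8] = { { 0 } };
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 8; x++) {
#           partial_sum_hv[0][y] += px(x, y); // horizontal: one sum per row
#           partial_sum_hv[1][x] += px(x, y); // vertical: one sum per column
#       }
#   for (int n = 0; n < 8; n++) {
#       cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
#       cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
#   }
#   cost[2] *= 105;
#   cost[6] *= 105;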

# We're done with the partial_sum_hv values, so we can reuse the registers
# for partial_sum_diag
# Zero psum registers for partial_sum_diag
setvl 0,0,30,0,1,1 # Set VL to 30 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# First row of diagonal partial sums:
# partial_sum_diag[0][y + x] += px;
sv.add/mr *psum+0, *psum+0, *img+0
sv.add/mr *psum+1, *psum+1, *img+8
sv.add/mr *psum+2, *psum+2, *img+16
sv.add/mr *psum+3, *psum+3, *img+24
sv.add/mr *psum+4, *psum+4, *img+32
sv.add/mr *psum+5, *psum+5, *img+40
sv.add/mr *psum+6, *psum+6, *img+48
sv.add/mr *psum+7, *psum+7, *img+56

# Second row of diagonal partial sums:
# partial_sum_diag[1][7 + y - x] += px;
sv.add/mr *psum+15, *psum+15, *img+56
sv.add/mr *psum+16, *psum+16, *img+48
sv.add/mr *psum+17, *psum+17, *img+40
sv.add/mr *psum+18, *psum+18, *img+32
sv.add/mr *psum+19, *psum+19, *img+24
sv.add/mr *psum+20, *psum+20, *img+16
sv.add/mr *psum+21, *psum+21, *img+8
sv.add/mr *psum+22, *psum+22, *img+0
# These are accumulated in reverse order, but since they only feed a
# sum of squares weighted by the symmetric div_table, order does not matter.

# cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
#             partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
# Produce squares of all values
setvl 0,0,15,0,1,1 # Set VL to 15 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+0, *psum+0, cost+0

# Similarly for cost[4]
# cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
#             partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
sv.mulld *psum+15, *psum+15, *psum+15
sv.mulld *psum+15, *psum+15, *divt
sv.add/mr cost+4, *psum+15, cost+4
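# The mirrored 15-entry divt folds the reference loop into a single pass.
# A sketch of the equivalence (ps standing for partial_sum_diag[0]):
#   // Reference form: 7 mirrored pairs plus the middle term
#   for (int n = 0; n < 7; n++)
#       cost[0] += (ps[n] * ps[n] + ps[14 - n] * ps[14 - n]) * div_table7[n];
#   cost[0] += ps[7] * ps[7] * 105;
#   // Folded form computed above: one 15-element square/multiply/reduce
#   for (int n = 0; n < 15; n++)
#       cost[0] += ps[n] * ps[n] * div_table15[n];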

# First row of alt partial sums:
# partial_sum_alt [0][y + (x >> 1)] += px;
# These are essentially calculated the following way
# (horiz axis: x, vert axis: y, quantity y + (x >> 1)):
#
# We calculate this in a similar manner to the diagonal
# partial sums, but first we have to do pair-wise addition
# on all the elements of the img matrix, compressing the rows
# to half size in the process
#
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |      | 0 | 0 | 1 | 2 | 3 |
# | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |      | 1 |   | 1 | 2 | 3 | 4 |
# | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |      | 2 |   | 2 | 3 | 4 | 5 |
# | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |  ->  | 3 |   | 3 | 4 | 5 | 6 |
# | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |      | 4 |   | 4 | 5 | 6 | 7 |
# | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |      | 5 |   | 5 | 6 | 7 | 8 |
# | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |      | 6 |   | 6 | 7 | 8 | 9 |
# | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |      | 7 |   | 7 | 8 | 9 | a |
#
setvl 0,0,16,0,1,1 # Set VL to 16 elements
ori pred, 0, 0b0101010101010101
sv.add/sm=r3 *psum+0, *img, *img+1
sv.add/sm=r3 *psum+16, *img+16, *img+17
# Copy the even-numbered registers only
sv.ori/sm=r3 *img+0, *psum+0, 0
sv.ori/sm=r3 *img+8, *psum+16, 0
# Process the next 32 elements
sv.add/sm=r3 *psum+0, *img+32, *img+33
sv.add/sm=r3 *psum+16, *img+48, *img+49
# Copy their sums (again even-numbered registers only)
sv.ori/sm=r3 *img+16, *psum+0, 0
sv.ori/sm=r3 *img+24, *psum+16, 0
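# A sketch of the pair-wise compression just performed (cimg is an
# illustrative name for the half-width intermediate):
#   int cimg[8][4];
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 4; x++)
#           cimg[y][x] = img[y * 8 + 2 * x] + img[y * 8 + 2 * x + 1];
#   // After compression, element (y, x) of cimg contributes to
#   // partial_sum_alt[0][y + x], which the shifted adds below implement.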

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum_alt, 0, 0

setvl 0,0,4,0,1,1 # Set VL to 4 elements
sv.add *psum_alt+0, *psum_alt+0, *img+0
sv.add *psum_alt+1, *psum_alt+1, *img+4
sv.add *psum_alt+2, *psum_alt+2, *img+8
sv.add *psum_alt+3, *psum_alt+3, *img+12
sv.add *psum_alt+4, *psum_alt+4, *img+16
sv.add *psum_alt+5, *psum_alt+5, *img+20
sv.add *psum_alt+6, *psum_alt+6, *img+24
sv.add *psum_alt+7, *psum_alt+7, *img+28

# We need to reshape div_table to ease the calculations:
# elements 3-7 will be multiplied by 105, while elements 0-2 and 8-10
# will be multiplied by 420, 210, 140 and 140, 210, 420 respectively, so:
li divt+0, 420
li divt+1, 210
li divt+2, 140
setvl 0,0,5,0,1,1 # Set VL to 5 elements
sv.ori *divt+3, 0, 105
li divt+8, 140
li divt+9, 210
li divt+10, 420

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[1] += partial_sum_alt[0][3 + m] * partial_sum_alt[0][3 + m];
# cost[1] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[1] += (partial_sum_alt[0][m] * partial_sum_alt[0][m] +
#                 partial_sum_alt[0][10 - m] * partial_sum_alt[0][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
sv.mulld *psum_alt+0, *psum_alt+0, *divt
sv.add/mr cost+1, *psum_alt+0, cost+1
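# With the reshaped table, that whole computation collapses to one
# 11-element pass. A sketch of the equivalence (pa standing for
# partial_sum_alt[0]):
#   // divt = { 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420 }
#   for (int m = 0; m < 11; m++)
#       cost[1] += pa[m] * pa[m] * divt[m];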

# Next row of partial_sum_alts,
# partial_sum_alt [1][3 + y - (x >> 1)] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 3 | 3 | 2 | 2 | 1 | 1 | 0 | 0 |      | 0 |   | 3 | 2 | 1 | 0 |
# | 1 | 4 | 4 | 3 | 3 | 2 | 2 | 1 | 1 |      | 1 |   | 4 | 3 | 2 | 1 |
# | 2 | 5 | 5 | 4 | 4 | 3 | 3 | 2 | 2 |      | 2 |   | 5 | 4 | 3 | 2 |
# | 3 | 6 | 6 | 5 | 5 | 4 | 4 | 3 | 3 |  ->  | 3 |   | 6 | 5 | 4 | 3 |
# | 4 | 7 | 7 | 6 | 6 | 5 | 5 | 4 | 4 |      | 4 |   | 7 | 6 | 5 | 4 |
# | 5 | 8 | 8 | 7 | 7 | 6 | 6 | 5 | 5 |      | 5 |   | 8 | 7 | 6 | 5 |
# | 6 | 9 | 9 | 8 | 8 | 7 | 7 | 6 | 6 |      | 6 |   | 9 | 8 | 7 | 6 |
# | 7 | a | a | 9 | 9 | 8 | 8 | 7 | 7 |      | 7 | a | 9 | 8 | 7 |

setvl 0,0,32,0,1,1 # clear everything
sv.ori *psum, 0, 0

# Same method, but unfortunately we now have to load img again.
# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
mr ptr_copy, ptr_orig
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128

setvl 0,0,16,0,1,1 # Set VL to 16 elements
ori pred, 0, 0b0101010101010101
sv.add/sm=r3 *psum+0, *img, *img+1
sv.add/sm=r3 *psum+16, *img+16, *img+17
# Copy the even-numbered registers only
sv.ori/sm=r3 *img+0, *psum+0, 0
sv.ori/sm=r3 *img+8, *psum+16, 0
# Process the next 32 elements
sv.add/sm=r3 *psum+0, *img+32, *img+33
sv.add/sm=r3 *psum+16, *img+48, *img+49
# Copy their sums (again even-numbered registers only)
sv.ori/sm=r3 *img+16, *psum+0, 0
sv.ori/sm=r3 *img+24, *psum+16, 0

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum_alt, 0, 0

setvl 0,0,4,0,1,1 # Set VL to 4 elements
sv.add *psum_alt+7, *psum_alt+7, *img+0
sv.add *psum_alt+6, *psum_alt+6, *img+4
sv.add *psum_alt+5, *psum_alt+5, *img+8
sv.add *psum_alt+4, *psum_alt+4, *img+12
sv.add *psum_alt+3, *psum_alt+3, *img+16
sv.add *psum_alt+2, *psum_alt+2, *img+20
sv.add *psum_alt+1, *psum_alt+1, *img+24
sv.add *psum_alt+0, *psum_alt+0, *img+28
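# Note: each 4-wide slice lands with its bin indices reversed relative
# to the true partial_sum_alt[1] layout (code bin m holds true bin
# 10 - m). Since cost[3] below is a sum of squares weighted by the
# symmetric divt (divt[m] == divt[10 - m]), the reversal is harmless:
#   sum(pa[10 - m]^2 * divt[m]) == sum(pa[m]^2 * divt[m])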

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[3] += partial_sum_alt[1][3 + m] * partial_sum_alt[1][3 + m];
# cost[3] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[3] += (partial_sum_alt[1][m] * partial_sum_alt[1][m] +
#                 partial_sum_alt[1][10 - m] * partial_sum_alt[1][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
sv.mulld *psum_alt+0, *psum_alt+0, *divt
sv.add/mr cost+3, *psum_alt+0, cost+3

#setvl 0,0,64,0,1,1 # clear everything
#sv.ori *img, 0, 0
#setvl 0,0,32,0,1,1 # clear everything
#sv.ori *psum, 0, 0

# Next row of partial_sum_alts,
# partial_sum_alt [2][3 - (y >> 1) + x] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 0 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 1 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 2 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 3 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 4 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 5 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |

# We calculate this in a similar manner to the diagonal partial sums,
# but first we have to do pair-wise addition, this time across rows,
# on all the elements of the img matrix, compressing the columns
# to half size in the process.

# Similar method; unfortunately we have to load img yet again.
# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
# Load 8x8 16-bit elements from ptr_copy in groups of 8, one stride apart
mr ptr_copy, ptr_orig
setvl 0,0,8,0,1,1 # Set VL to 8 elements
sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
sv.lha *img+8, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+16, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+24, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+32, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+40, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+48, 0(ptr_copy)
add ptr_copy, ptr_copy, stride
sv.lha *img+56, 0(ptr_copy)

setvl 0,0,64,0,1,1 # Set VL to 64 elements
sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128

# clear registers to hold the values
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# sum rows 0 & 1, index +3
sv.add *psum+3, *psum+3, *img+0
sv.add *psum+3, *psum+3, *img+8
# sum rows 2 & 3, index +2
sv.add *psum+2, *psum+2, *img+16
sv.add *psum+2, *psum+2, *img+24
# sum rows 4 & 5, index +1
sv.add *psum+1, *psum+1, *img+32
sv.add *psum+1, *psum+1, *img+40
# sum rows 6 & 7, index +0
sv.add *psum+0, *psum+0, *img+48
sv.add *psum+0, *psum+0, *img+56
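# A sketch of what the eight adds above compute: pairs of rows share
# the same (y >> 1), so each row pair is summed into one 8-wide slot,
# placed at decreasing offsets +3, +2, +1, +0:
#   for (int y = 0; y < 8; y++)
#       for (int x = 0; x < 8; x++)
#           partial_sum_alt[2][3 - (y >> 1) + x] += px(x, y);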

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[5] += partial_sum_alt[2][3 + m] * partial_sum_alt[2][3 + m];
# cost[5] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[5] += (partial_sum_alt[2][m] * partial_sum_alt[2][m] +
#                 partial_sum_alt[2][10 - m] * partial_sum_alt[2][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+5, *psum+0, cost+5

# Next row of partial_sum_alts,
# partial_sum_alt [3][(y >> 1) + x] += px;
#
# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
# | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 2 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 3 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 4 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 5 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 6 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 6 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
# | 7 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 7 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |

# This calculation is similar to the previous one, and since we have
# enough registers available, we don't have to reload img.

setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.ori *psum, 0, 0

setvl 0,0,8,0,1,1 # Set VL to 8 elements
# sum rows 0 & 1, index +0
sv.add *psum+0, *psum+0, *img+0
sv.add *psum+0, *psum+0, *img+8
# sum rows 2 & 3, index +1
sv.add *psum+1, *psum+1, *img+16
sv.add *psum+1, *psum+1, *img+24
# sum rows 4 & 5, index +2
sv.add *psum+2, *psum+2, *img+32
sv.add *psum+2, *psum+2, *img+40
# sum rows 6 & 7, index +3
sv.add *psum+3, *psum+3, *img+48
sv.add *psum+3, *psum+3, *img+56

# Now the following is equivalent to:
# for (int m = 0; m < 5; m++)
#     cost[7] += partial_sum_alt[3][3 + m] * partial_sum_alt[3][3 + m];
# cost[7] *= 105;
# for (int m = 0; m < 3; m++) {
#     const int d = div_table[2 * m + 1];
#     cost[7] += (partial_sum_alt[3][m] * partial_sum_alt[3][m] +
#                 partial_sum_alt[3][10 - m] * partial_sum_alt[3][10 - m]) * d;
# }
setvl 0,0,11,0,1,1 # Set VL to 11 elements
sv.mulld *psum+0, *psum+0, *psum+0
sv.mulld *psum+0, *psum+0, *divt
sv.add/mr cost+7, *psum+0, cost+7

mr max, cost+5
setvl 0,0,8,0,1,1 # Set VL to 8 elements
#sv.minmax/mr max, max, *cost, 3 # MMM=maxs
sv.cmp 0, 0, *cost, max
svstep retval, 5, 1
# sv.addi/m=eq retval,*,0
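# For reference, a sketch of the selection this final stage still has
# to produce, assuming the scalar C takes dav1d's cdef_find_dir form
# (the var computation is not implemented above):
#   int best_dir = 0;
#   unsigned best_cost = cost[0];
#   for (int n = 1; n < 8; n++)
#       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
#   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
#   return best_dir;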
blr
.long 0
.byte 0,0,0,0,0,0,0,0
.cfi_endproc
.LFE27:
.size cdef_find_dir_svp64_real,.-cdef_find_dir_svp64_real
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",@progbits