From: Konstantinos Margaritis
Date: Fri, 14 Oct 2022 00:16:59 +0000 (+0000)
Subject: first working version
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=58a3e55f1b813dcada9f12f5e28f377bff69185c;p=openpower-isa.git

first working version
---

diff --git a/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s b/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
index adbe99c4..c0111407 100644
--- a/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
+++ b/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
@@ -1,17 +1,21 @@
-.set y, 1
-.set x, 2
-
 .set img_ptr, 3
 .set stride, 4
 .set var, 5
 .set bd, 6 # bitdepth_min_8
-.set cost, 7 # cost array, 8 elements
-.set divt, 14 # div_table[8]
-.set img, 24 # img array, 8x8 = 64 elements
-.set psum, 88 # We will place the results of the psums here
-.set tmp, 108 # temporary elements
-.set tmp2, 116 # temporary elements
+.set pred, 3 # predicate for last stage, reuse r3
+
+.set ptr_copy, 7 # working copy of img_ptr
+.set ptr_orig, 2 # original img_ptr, kept for the reloads
+
+.set max, 2 # max result
+.set retval, 3 # return value
+
+.set divt, 8 # div_table[15]
+.set cost, 24 # cost array, 8 elements
+.set img, 32 # img array, 8x8 = 64 elements
+.set psum, 96 # We will place the results of the psums here
+.set psum_alt, 64 # reuses the top half of img once that data is no longer needed
 
 .machine libresoc
 
@@ -24,180 +28,425 @@ cdef_find_dir_svp64_real:
 .L0:
 .cfi_startproc
- # Load div_table[7] array
+ # Load the div_table array; originally it is
 # div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
- li divt+0, 840
- li divt+1, 420
- li divt+2, 280
- li divt+3, 210
- li divt+4, 168
- li divt+5, 140
- li divt+6, 120
- li divt+7, 105 # Add 105 as element 8 of the divt table
- # saves having to do special case for it
+ # however, to make calculations easier, we add the same elements
+ # in reverse order, with 105 in the middle, and just set VL=15
+ li divt+0, 840
+ li divt+1, 420
+ li divt+2, 280
+ li divt+3, 210
+ li divt+4, 168
+ li divt+5, 140
+ li divt+6, 120
+ li divt+7, 105
+ li divt+8, 120
+ li divt+9, 140
+ li divt+10, 168
+ li divt+11, 210
+ li divt+12, 280
+ li divt+13, 420
+ li divt+14, 840
+
+ mr ptr_copy, img_ptr
+ mr ptr_orig, img_ptr
 .L1:
- # Load 8x8 8-bit elements from img_ptr in groups of 8 with stride
- setvl 0,0,8,0,1,1 # Set VL to 8 elements
- sv.lha *img, 0(img_ptr) # Load 8 ints from (img_ptr)
- add img_ptr, img_ptr, stride # Advance img_ptr by stride
- sv.lha *img + 8, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 16, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 24, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 32, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 40, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 48, 0(img_ptr)
- add img_ptr, img_ptr, stride
- sv.lha *img + 56, 0(img_ptr)
-
- setvl 0,0,64,0,1,1 # Set VL to 64 elements
- sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
- sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
+ # Load the 8x8 image elements (halfwords) from ptr_copy in groups of 8 with stride
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
+ add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
+ sv.lha *img + 8, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 16, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 24, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 32, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 40, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 48, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 56, 0(ptr_copy)
+
+ setvl 0,0,64,0,1,1 # Set VL to 64 elements
+ sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
+ sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
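+
+ # For reference, the load/normalize stage above corresponds roughly to
+ # this C (img2d is an illustrative name for the 8x8 working copy, not a
+ # symbol in this file):
+ #   for (int y = 0; y < 8; y++)
+ #       for (int x = 0; x < 8; x++)
+ #           img2d[y][x] = (ptr[y * stride + x] >> bitdepth_min_8) - 128;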
 # Zero psum registers for partial_sum_hv
- setvl 0,0,16,0,1,1 # Set VL to 16 elements
- sv.ori *psum, 0, 0
+ setvl 0,0,16,0,1,1 # Set VL to 16 elements
+ sv.ori *psum, 0, 0
 
 # First do the horizontal partial sums:
 # partial_sum_hv[0][y] += px;
- setvl 0,0,8,0,1,1 # Set VL to 8 elements
- sv.add/mr psum+0, psum+0, *img+0
- sv.add/mr psum+1, psum+1, *img+8
- sv.add/mr psum+2, psum+2, *img+16
- sv.add/mr psum+3, psum+3, *img+24
- sv.add/mr psum+4, psum+4, *img+32
- sv.add/mr psum+5, psum+5, *img+40
- sv.add/mr psum+6, psum+6, *img+48
- sv.add/mr psum+7, psum+7, *img+56
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ sv.add/mr psum+0, psum+0, *img+0
+ sv.add/mr psum+1, psum+1, *img+8
+ sv.add/mr psum+2, psum+2, *img+16
+ sv.add/mr psum+3, psum+3, *img+24
+ sv.add/mr psum+4, psum+4, *img+32
+ sv.add/mr psum+5, psum+5, *img+40
+ sv.add/mr psum+6, psum+6, *img+48
+ sv.add/mr psum+7, psum+7, *img+56
 
 # Next the vertical partial sums:
 # partial_sum_hv[1][x] += px;
- sv.add/mr *psum+8, *psum+8, *img+0
- sv.add/mr *psum+8, *psum+8, *img+8
- sv.add/mr *psum+8, *psum+8, *img+16
- sv.add/mr *psum+8, *psum+8, *img+24
- sv.add/mr *psum+8, *psum+8, *img+32
- sv.add/mr *psum+8, *psum+8, *img+40
- sv.add/mr *psum+8, *psum+8, *img+48
- sv.add/mr *psum+8, *psum+8, *img+56
+ sv.add/mr *psum+8, *psum+8, *img+0
+ sv.add/mr *psum+8, *psum+8, *img+8
+ sv.add/mr *psum+8, *psum+8, *img+16
+ sv.add/mr *psum+8, *psum+8, *img+24
+ sv.add/mr *psum+8, *psum+8, *img+32
+ sv.add/mr *psum+8, *psum+8, *img+40
+ sv.add/mr *psum+8, *psum+8, *img+48
+ sv.add/mr *psum+8, *psum+8, *img+56
 
 # Zero cost registers
- setvl 0,0,8,0,1,1 # Set VL to 8 elements
- sv.ori *cost, 0, 0
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ sv.ori *cost, 0, 0
 
 # cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
- sv.maddld/mr cost+2, *psum, *psum, cost+2
+ sv.maddld/mr cost+2, *psum, *psum, cost+2
 # cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
- sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6
+ sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6
 # cost[2] *= 105
 # cost[6] *= 105
- mulli cost+2, cost+2, 105
- mulli cost+6, cost+6, 105
+ mulli cost+2, cost+2, 105
+ mulli cost+6, cost+6, 105
 
 # We're done with the partial_sum_hv values, so we can reuse the
 # registers for partial_sum_diag
 
 # Zero psum registers for partial_sum_diag
- setvl 0,0,30,0,1,1 # Set VL to 30 elements
- sv.ori *psum, 0, 0
+ setvl 0,0,30,0,1,1 # Set VL to 30 elements
+ sv.ori *psum, 0, 0
 
 setvl 0,0,8,0,1,1 # Set VL to 8 elements
 # First row of diagonal partial sums:
 # partial_sum_diag[0][y + x] += px;
- sv.add/mr *psum+0, *psum+0, *img+0
- sv.add/mr *psum+1, *psum+1, *img+8
- sv.add/mr *psum+2, *psum+2, *img+16
- sv.add/mr *psum+3, *psum+3, *img+24
- sv.add/mr *psum+4, *psum+4, *img+32
- sv.add/mr *psum+5, *psum+5, *img+40
- sv.add/mr *psum+6, *psum+6, *img+48
- sv.add/mr *psum+7, *psum+7, *img+56
+ sv.add/mr *psum+0, *psum+0, *img+0
+ sv.add/mr *psum+1, *psum+1, *img+8
+ sv.add/mr *psum+2, *psum+2, *img+16
+ sv.add/mr *psum+3, *psum+3, *img+24
+ sv.add/mr *psum+4, *psum+4, *img+32
+ sv.add/mr *psum+5, *psum+5, *img+40
+ sv.add/mr *psum+6, *psum+6, *img+48
+ sv.add/mr *psum+7, *psum+7, *img+56
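+
+ # In C terms, the eight offset vector-adds above compute (psum15 being
+ # an illustrative name for the 15 partial_sum_diag[0] accumulators):
+ #   for (int y = 0; y < 8; y++)
+ #       for (int x = 0; x < 8; x++)
+ #           psum15[y + x] += img2d[y][x];
+ # i.e. each row y is a single 8-wide add into psum starting at offset y.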
 # Second row of diagonal partial sums:
 # partial_sum_diag[1][7 + y - x] += px;
- sv.add/mr *psum+15, *psum+15, *img+56
- sv.add/mr *psum+16, *psum+16, *img+48
- sv.add/mr *psum+17, *psum+17, *img+40
- sv.add/mr *psum+18, *psum+18, *img+32
- sv.add/mr *psum+19, *psum+19, *img+24
- sv.add/mr *psum+20, *psum+20, *img+16
- sv.add/mr *psum+21, *psum+21, *img+8
- sv.add/mr *psum+22, *psum+22, *img+0
+ sv.add/mr *psum+15, *psum+15, *img+56
+ sv.add/mr *psum+16, *psum+16, *img+48
+ sv.add/mr *psum+17, *psum+17, *img+40
+ sv.add/mr *psum+18, *psum+18, *img+32
+ sv.add/mr *psum+19, *psum+19, *img+24
+ sv.add/mr *psum+20, *psum+20, *img+16
+ sv.add/mr *psum+21, *psum+21, *img+8
+ sv.add/mr *psum+22, *psum+22, *img+0
 # these are calculated in reverse order, but since they are only used
 # in a sum, the order does not matter.
- setvl 0,0,15,0,1,1 # Set VL to 15 elements
- sv.ori *tmp, 0, 0
-
 # cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
 #             partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
 # Produce squares of all values
- sv.maddld/mr *tmp, *psum+0, *psum+0, *tmp
- # Handle the first 8 elements in order, *includes* partial_sum_diag[0][7]!
- #setvl 0,0,8,0,1,1 # Set VL to 8 elements
- #sv.mulld *tmp, *tmp, *divt
- # Handle remaining 7 elements, in reverse order
- setvl 0,0,7,0,1,1 # Set VL to 7 elements
- sv.svstep/mrr *tmp2, 6, 1
- svindex 29,0b1,7,0,0,0,0
- sv.ori *tmp, *divt, 0
- #sv.mulld *tmp, *tmp, *divt
- # Now sum those up to cost[0] element
- #setvl 0,0,15,0,1,1 # Set VL to 15 elements
- #sv.add/mr cost+0, *tmp, cost+0
+ setvl 0,0,15,0,1,1 # Set VL to 15 elements
+ sv.mulld *psum+0, *psum+0, *psum+0
+ sv.mulld *psum+0, *psum+0, *divt
+ sv.add/mr cost+0, *psum+0, cost+0
 
 # Similarly for cost[4]
 # cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
 #             partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
- #sv.maddld/mr *tmp, *psum+16, *psum+16, *tmp
- #sv.maddld/mr *tmp, *psum+24, *psum+24, *tmp
- #sv.mulld *tmp, *tmp, *divt
- #sv.add/mr cost+4, *tmp, cost+4
-
-
- # Zero psum registers for partial_sum_alt, process half of 44
- #setvl 0,0,22,0,1,1 # Set VL to 22 elements
- #sv.ori psum, 0, 0
+ sv.mulld *psum+15, *psum+15, *psum+15
+ sv.mulld *psum+15, *psum+15, *divt
+ sv.add/mr cost+4, *psum+15, cost+4
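+
+ # Written out in C, the mirrored 15-entry divt turns each of these costs
+ # into a single loop (d0 standing in for partial_sum_diag[0]):
+ #   for (int n = 0; n < 15; n++)
+ #       cost[0] += d0[n] * d0[n] * divt[n];
+ # which equals the reference sum over n < 7 of
+ #   (d0[n]*d0[n] + d0[14-n]*d0[14-n]) * div_table[n]
+ # plus 105 * d0[7] * d0[7] for the middle element.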
 
 # First row of alt partial sums:
 # partial_sum_alt [0][y + (x >> 1)] += px;
 # These are essentially calculated the following way:
 # horiz axis: x, vert axis: y, quantity of y + (x>>1):
- #
- # |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
- # | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |
- # | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |
- # | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |
- # | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |
- # | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |
- # | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |
- # | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |
- # | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |
 #
 # We calculate this in a similar manner to the diagonal
 # partial sums, but first we have to do pair-wise addition
- # on all the elements of the img matrix:
- #setvl 0,0,64,0,1,1 # Set VL to 64 elements
- #svstep 2
- #sv.add *img, *img, *img+1
+ # on all the elements of the img matrix, compressing the rows
+ # to half size in the process
+ #
+ # |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+ # | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |      | 0 | 0 | 1 | 2 | 3 |
+ # | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |      | 1 |   | 1 | 2 | 3 | 4 |
+ # | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |      | 2 |   | 2 | 3 | 4 | 5 |
+ # | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |  ->  | 3 |   | 3 | 4 | 5 | 6 |
+ # | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |      | 4 |   | 4 | 5 | 6 | 7 |
+ # | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |      | 5 |   | 5 | 6 | 7 | 8 |
+ # | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |      | 6 |   | 6 | 7 | 8 | 9 |
+ # | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |      | 7 |   | 7 | 8 | 9 | a |
+ #
+ setvl 0,0,16,0,1,1 # Set VL to 16 elements
+ ori pred, 0, 0b0101010101010101
+ sv.add/sm=r3 *psum+0, *img, *img+1
+ sv.add/sm=r3 *psum+16, *img+16, *img+17
+ # Copy the even-numbered registers only
+ sv.ori/sm=r3 *img+0, *psum+0, 0
+ sv.ori/sm=r3 *img+8, *psum+16, 0
+ # Process the next 32 elements
+ sv.add/sm=r3 *psum+0, *img+32, *img+33
+ sv.add/sm=r3 *psum+16, *img+48, *img+49
+ # Copy their sums (again even-numbered registers only)
+ sv.ori/sm=r3 *img+16, *psum+0, 0
+ sv.ori/sm=r3 *img+24, *psum+16, 0
+
+ # clear registers to hold the values
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.ori *psum_alt, 0, 0
+
+ setvl 0,0,4,0,1,1 # Set VL to 4 elements
+ sv.add *psum_alt+0, *psum_alt+0, *img+0
+ sv.add *psum_alt+1, *psum_alt+1, *img+4
+ sv.add *psum_alt+2, *psum_alt+2, *img+8
+ sv.add *psum_alt+3, *psum_alt+3, *img+12
+ sv.add *psum_alt+4, *psum_alt+4, *img+16
+ sv.add *psum_alt+5, *psum_alt+5, *img+20
+ sv.add *psum_alt+6, *psum_alt+6, *img+24
+ sv.add *psum_alt+7, *psum_alt+7, *img+28
+
+ # We need to reshape div_table to ease calculations:
+ # elements 3-7 will be multiplied by 105, and elements 0-2 and 8-10
+ # by 420, 210, 140 and 140, 210, 420 respectively, so
+ li divt+0, 420
+ li divt+1, 210
+ li divt+2, 140
+ setvl 0,0,5,0,1,1 # Set VL to 5 elements
+ sv.ori *divt+3, 0, 105
+ li divt+8, 140
+ li divt+9, 210
+ li divt+10, 420
+
+ # Now the following is equivalent to:
+ # for (int m = 0; m < 5; m++)
+ #     cost[1] += partial_sum_alt[0][3 + m] * partial_sum_alt[0][3 + m];
+ # cost[1] *= 105;
+ # for (int m = 0; m < 3; m++) {
+ #     const int d = div_table[2 * m + 1];
+ #     cost[1] += (partial_sum_alt[0][m] * partial_sum_alt[0][m] +
+ #                 partial_sum_alt[0][10 - m] * partial_sum_alt[0][10 - m]) * d;
+ # }
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
+ sv.mulld *psum_alt+0, *psum_alt+0, *divt
+ sv.add/mr cost+1, *psum_alt+0, cost+1
+
+ # Next row of partial_sum_alts:
+ # partial_sum_alt [1][3 + y - (x >> 1)] += px;
+ #
+ # |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+ # | 0 | 3 | 3 | 2 | 2 | 1 | 1 | 0 | 0 |      | 0 |   | 3 | 2 | 1 | 0 |
+ # | 1 | 4 | 4 | 3 | 3 | 2 | 2 | 1 | 1 |      | 1 |   | 4 | 3 | 2 | 1 |
+ # | 2 | 5 | 5 | 4 | 4 | 3 | 3 | 2 | 2 |      | 2 |   | 5 | 4 | 3 | 2 |
+ # | 3 | 6 | 6 | 5 | 5 | 4 | 4 | 3 | 3 |  ->  | 3 |   | 6 | 5 | 4 | 3 |
+ # | 4 | 7 | 7 | 6 | 6 | 5 | 5 | 4 | 4 |      | 4 |   | 7 | 6 | 5 | 4 |
+ # | 5 | 8 | 8 | 7 | 7 | 6 | 6 | 5 | 5 |      | 5 |   | 8 | 7 | 6 | 5 |
+ # | 6 | 9 | 9 | 8 | 8 | 7 | 7 | 6 | 6 |      | 6 |   | 9 | 8 | 7 | 6 |
+ # | 7 | a | a | 9 | 9 | 8 | 8 | 7 | 7 |      | 7 | a | 9 | 8 | 7 |
+
+ setvl 0,0,32,0,1,1 # clear everything
+ sv.ori *96, 0, 0
+
+ # Same method, but unfortunately now we have to load img again.
+ # With elwidth and subvl we could pack the data to avoid any loads whatsoever.
+ # Load the 8x8 image elements (halfwords) from ptr_copy in groups of 8 with stride
+ mr ptr_copy, ptr_orig
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
+ add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
+ sv.lha *img + 8, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 16, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 24, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 32, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 40, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 48, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 56, 0(ptr_copy)
+
+ setvl 0,0,64,0,1,1 # Set VL to 64 elements
+ sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
+ sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
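+
+ # The predicated adds below repeat the pair-wise row compression; as a
+ # C sketch (compressed/img2d are illustrative names):
+ #   for (int y = 0; y < 8; y++)
+ #       for (int x = 0; x < 4; x++)
+ #           compressed[y][x] = img2d[y][2*x] + img2d[y][2*x + 1];
+ # Note that the accumulation that follows stores partial_sum_alt[1] in
+ # reverse order (slot 10 - i); since the reshaped divt is palindromic,
+ # the final dot product is unaffected.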
+ setvl 0,0,16,0,1,1 # Set VL to 16 elements
+ ori pred, 0, 0b0101010101010101
+ sv.add/sm=r3 *psum+0, *img, *img+1
+ sv.add/sm=r3 *psum+16, *img+16, *img+17
+ # Copy the even-numbered registers only
+ sv.ori/sm=r3 *img+0, *psum+0, 0
+ sv.ori/sm=r3 *img+8, *psum+16, 0
+ # Process the next 32 elements
+ sv.add/sm=r3 *psum+0, *img+32, *img+33
+ sv.add/sm=r3 *psum+16, *img+48, *img+49
+ # Copy their sums (again even-numbered registers only)
+ sv.ori/sm=r3 *img+16, *psum+0, 0
+ sv.ori/sm=r3 *img+24, *psum+16, 0
+
+ # clear registers to hold the values
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.ori *psum_alt, 0, 0
+
+ setvl 0,0,4,0,1,1 # Set VL to 4 elements
+ sv.add *psum_alt+7, *psum_alt+7, *img+0
+ sv.add *psum_alt+6, *psum_alt+6, *img+4
+ sv.add *psum_alt+5, *psum_alt+5, *img+8
+ sv.add *psum_alt+4, *psum_alt+4, *img+12
+ sv.add *psum_alt+3, *psum_alt+3, *img+16
+ sv.add *psum_alt+2, *psum_alt+2, *img+20
+ sv.add *psum_alt+1, *psum_alt+1, *img+24
+ sv.add *psum_alt+0, *psum_alt+0, *img+28
+
+ # Now the following is equivalent to:
+ # for (int m = 0; m < 5; m++)
+ #     cost[3] += partial_sum_alt[1][3 + m] * partial_sum_alt[1][3 + m];
+ # cost[3] *= 105;
+ # for (int m = 0; m < 3; m++) {
+ #     const int d = div_table[2 * m + 1];
+ #     cost[3] += (partial_sum_alt[1][m] * partial_sum_alt[1][m] +
+ #                 partial_sum_alt[1][10 - m] * partial_sum_alt[1][10 - m]) * d;
+ # }
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
+ sv.mulld *psum_alt+0, *psum_alt+0, *divt
+ sv.add/mr cost+3, *psum_alt+0, cost+3
+
+ #setvl 0,0,64,0,1,1 # clear everything
+ #sv.ori *img, 0, 0
+ #setvl 0,0,32,0,1,1 # clear everything
+ #sv.ori *96, 0, 0
+
+ # Next row of partial_sum_alts:
+ # partial_sum_alt [2][3 - (y >> 1) + x] += px;
+ #
+ # |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+ # | 0 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 0 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 1 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 2 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 3 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 4 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 5 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+
+ # We calculate this in a similar manner to the diagonal partial sums,
+ # but first we have to do pair-wise addition, this time across rows
+ # on all the elements of the img matrix, compressing the columns
+ # to half size in the process.
+
+ # Similar method, but unfortunately now we have to load img again.
+ # With elwidth and subvl we could pack the data to avoid any loads whatsoever.
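+
+ # As a C sketch, the accumulation performed after the reload below is
+ # (psum/img2d are illustrative names; y2 = y >> 1):
+ #   for (int y2 = 0; y2 < 4; y2++)
+ #       for (int x = 0; x < 8; x++)
+ #           psum[(3 - y2) + x] += img2d[2*y2][x] + img2d[2*y2 + 1][x];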
+ # Load the 8x8 image elements (halfwords) from ptr_copy in groups of 8 with stride
+ mr ptr_copy, ptr_orig
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ sv.lha *img, 0(ptr_copy) # Load 8 halfwords from (ptr_copy)
+ add ptr_copy, ptr_copy, stride # Advance ptr_copy by stride
+ sv.lha *img + 8, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 16, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 24, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 32, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 40, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 48, 0(ptr_copy)
+ add ptr_copy, ptr_copy, stride
+ sv.lha *img + 56, 0(ptr_copy)
+
+ setvl 0,0,64,0,1,1 # Set VL to 64 elements
+ sv.sraw *img, *img, bd # img[x] >> bitdepth_min_8
+ sv.addi *img, *img, -128 # px = (img[x] >> bitdepth_min_8) - 128
+
+ # clear registers to hold the values
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.ori *psum, 0, 0
+
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ # sum rows 0 & 1, index +3
+ sv.add *psum+3, *psum+3, *img+0
+ sv.add *psum+3, *psum+3, *img+8
+ # sum rows 2 & 3, index +2
+ sv.add *psum+2, *psum+2, *img+16
+ sv.add *psum+2, *psum+2, *img+24
+ # sum rows 4 & 5, index +1
+ sv.add *psum+1, *psum+1, *img+32
+ sv.add *psum+1, *psum+1, *img+40
+ # sum rows 6 & 7, index +0
+ sv.add *psum+0, *psum+0, *img+48
+ sv.add *psum+0, *psum+0, *img+56
+
+ # Now the following is equivalent to:
+ # for (int m = 0; m < 5; m++)
+ #     cost[5] += partial_sum_alt[2][3 + m] * partial_sum_alt[2][3 + m];
+ # cost[5] *= 105;
+ # for (int m = 0; m < 3; m++) {
+ #     const int d = div_table[2 * m + 1];
+ #     cost[5] += (partial_sum_alt[2][m] * partial_sum_alt[2][m] +
+ #                 partial_sum_alt[2][10 - m] * partial_sum_alt[2][10 - m]) * d;
+ # }
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.mulld *psum+0, *psum+0, *psum+0
+ sv.mulld *psum+0, *psum+0, *divt
+ sv.add/mr cost+5, *psum+0, cost+5
+
+ # Next row of partial_sum_alts:
+ # partial_sum_alt [3][(y >> 1) + x] += px;
+ #
+ # |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+ # | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 2 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 3 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 4 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 5 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 6 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 6 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+ # | 7 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 7 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
- #setvl 0,0,8,0,1,1 # Set VL to 8 elements
- #sv.add *psum+0, *psum+0, *img+0
- #sv.add *psum+0, *psum+0, *img+1
- #sv.add *psum+1, *psum+1, *img+8
- #sv.add *psum+1, *psum+1, *img+9
+ # This calculation is similar to the previous one, and since we have
+ # enough registers available, we don't have to reload img.
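+
+ # As a C sketch, the stage below computes (psum/img2d illustrative;
+ # y2 = y >> 1):
+ #   for (int y2 = 0; y2 < 4; y2++)
+ #       for (int x = 0; x < 8; x++)
+ #           psum[y2 + x] += img2d[2*y2][x] + img2d[2*y2 + 1][x];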
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.ori *psum, 0, 0
- #setvl 0,0,10,0,1,1 # Set VL to 2 elements
- #sv.add/mr *psum, *psum, *psum+1
-#
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ # sum rows 0 & 1, index +0
+ sv.add *psum+0, *psum+0, *img+0
+ sv.add *psum+0, *psum+0, *img+8
+ # sum rows 2 & 3, index +1
+ sv.add *psum+1, *psum+1, *img+16
+ sv.add *psum+1, *psum+1, *img+24
+ # sum rows 4 & 5, index +2
+ sv.add *psum+2, *psum+2, *img+32
+ sv.add *psum+2, *psum+2, *img+40
+ # sum rows 6 & 7, index +3
+ sv.add *psum+3, *psum+3, *img+48
+ sv.add *psum+3, *psum+3, *img+56
+
+ # Now the following is equivalent to:
+ # for (int m = 0; m < 5; m++)
+ #     cost[7] += partial_sum_alt[3][3 + m] * partial_sum_alt[3][3 + m];
+ # cost[7] *= 105;
+ # for (int m = 0; m < 3; m++) {
+ #     const int d = div_table[2 * m + 1];
+ #     cost[7] += (partial_sum_alt[3][m] * partial_sum_alt[3][m] +
+ #                 partial_sum_alt[3][10 - m] * partial_sum_alt[3][10 - m]) * d;
+ # }
+ setvl 0,0,11,0,1,1 # Set VL to 11 elements
+ sv.mulld *psum+0, *psum+0, *psum+0
+ sv.mulld *psum+0, *psum+0, *divt
+ sv.add/mr cost+7, *psum+0, cost+7
+
+ setvl 0,0,8,0,1,1 # Set VL to 8 elements
+ #sv.maxs/mr max, *cost
+ #sv.cmp/ff=ne/VLI max, *cost, 1
+# sv.addi/m=eq retval,*,0
 blr
 .long 0
 .byte 0,0,0,0,0,0,0,0
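+
+ # Still to do: the final direction selection, sketched in the
+ # commented-out instructions above. The dav1d reference C that it
+ # needs to implement is roughly:
+ #   int best_dir = 0;
+ #   unsigned best_cost = 0;
+ #   for (int n = 0; n < 8; n++)
+ #       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
+ #   *var = (best_cost - cost[best_dir ^ 4]) >> 10;
+ #   return best_dir;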