From a65084c24742b43e79da714e5cd08f0d24a83eab Mon Sep 17 00:00:00 2001
From: Konstantinos Margaritis
Date: Fri, 14 Oct 2022 00:16:59 +0000
Subject: [PATCH] first working version

---
 .../video/av1/src/ppc/cdef_tmpl_svp64_real.s | 509 +++++++++++++-----
 1 file changed, 379 insertions(+), 130 deletions(-)

diff --git a/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s b/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
index adbe99c4..c0111407 100644
--- a/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
+++ b/media/video/av1/src/ppc/cdef_tmpl_svp64_real.s
@@ -1,17 +1,21 @@
-.set y, 1
-.set x, 2
-
 .set img_ptr, 3
 .set stride, 4
 .set var, 5
 .set bd, 6        # bitdepth_min_8
-.set cost, 7      # cost array, 8 elements
-.set divt, 14     # div_table[8]
-.set img, 24      # img array, 8x8 = 64 elements
-.set psum, 88     # We will place the results of the psums here
-.set tmp, 108     # temporary elements
-.set tmp2, 116    # temporary elements
+.set pred, 3      # predicate for last stage, reuse r3
+
+.set ptr_copy, 7  # copy of img_ptr
+.set ptr_orig, 2  # another copy of img_ptr
+
+.set max, 2       # max result
+.set retval, 3    # return value
+
+.set divt, 8      # div_table[15]
+.set cost, 24     # cost array, 8 elements
+.set img, 32      # img array, 8x8 = 64 elements
+.set psum, 96     # We will place the results of the psums here
+.set psum_alt, 64 # reuse the img registers when done with the last stage
 
 .machine libresoc
 
@@ -24,180 +28,425 @@ cdef_find_dir_svp64_real:
 .L0:
 	.cfi_startproc
-	# Load div_table[7] array
+	# Load the div_table array; originally it is
 	# div_table[7] = { 840, 420, 280, 210, 168, 140, 120 };
-	li divt+0, 840
-	li divt+1, 420
-	li divt+2, 280
-	li divt+3, 210
-	li divt+4, 168
-	li divt+5, 140
-	li divt+6, 120
-	li divt+7, 105	# Add 105 as element 8 of the divt table
-			# saves having to do special case for it
+	# However, to make calculations easier, we append the same elements
+	# in reverse, with 105 in the middle, and just set VL=15.
+	li divt+0, 840
+	li divt+1, 420
+	li divt+2, 280
+	li divt+3, 210
+	li divt+4, 168
+	li divt+5, 140
+	li divt+6, 120
+	li divt+7, 105
+	li divt+8, 120
+	li divt+9, 140
+	li divt+10, 168
+	li divt+11, 210
+	li divt+12, 280
+	li divt+13, 420
+	li divt+14, 840
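+	#
+	# For reference, a scalar C sketch of why the mirrored table works
+	# (diag and cost0 are illustrative names): divt[n] == divt[14 - n]
+	# == div_table[n] for n < 7, and divt[7] == 105, so the two-sided loop
+	#
+	#   for (int n = 0; n < 7; n++)
+	#       cost0 += (diag[n] * diag[n] +
+	#                 diag[14 - n] * diag[14 - n]) * div_table[n];
+	#   cost0 += diag[7] * diag[7] * 105;
+	#
+	# collapses into a single pass with VL=15:
+	#
+	#   for (int n = 0; n < 15; n++)
+	#       cost0 += diag[n] * diag[n] * divt[n];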
+
+	mr ptr_copy, img_ptr
+	mr ptr_orig, img_ptr
 
 .L1:
-	# Load 8x8 8-bit elements from img_ptr in groups of 8 with stride
-	setvl 0,0,8,0,1,1		# Set VL to 8 elements
-	sv.lha *img, 0(img_ptr)		# Load 8 ints from (img_ptr)
-	add img_ptr, img_ptr, stride	# Advance img_ptr by stride
-	sv.lha *img + 8, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 16, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 24, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 32, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 40, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 48, 0(img_ptr)
-	add img_ptr, img_ptr, stride
-	sv.lha *img + 56, 0(img_ptr)
-
-	setvl 0,0,64,0,1,1		# Set VL to 64 elements
-	sv.sraw *img, *img, bd		# img[x] >> bitdepth_min_8
-	sv.addi *img, *img, -128	# px = (img[x] >> bitdepth_min_8) - 128
+	# Load 8x8 16-bit elements from ptr_copy in groups of 8 with stride
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	sv.lha *img, 0(ptr_copy)	# Load 8 halfwords from (ptr_copy)
+	add ptr_copy, ptr_copy, stride	# Advance ptr_copy by stride
+	sv.lha *img + 8, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 16, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 24, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 32, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 40, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 48, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 56, 0(ptr_copy)
+
+	setvl 0,0,64,0,1,1		# Set VL to 64 elements
+	sv.sraw *img, *img, bd		# img[x] >> bitdepth_min_8
+	sv.addi *img, *img, -128	# px = (img[x] >> bitdepth_min_8) - 128
 
 	# Zero psum registers for partial_sum_hv
-	setvl 0,0,16,0,1,1		# Set VL to 16 elements
-	sv.ori *psum, 0, 0
+	setvl 0,0,16,0,1,1		# Set VL to 16 elements
+	sv.ori *psum, 0, 0
 
 	# First do the horizontal partial sums:
 	# partial_sum_hv[0][y] += px;
-	setvl 0,0,8,0,1,1		# Set VL to 8 elements
-	sv.add/mr psum+0, psum+0, *img+0
-	sv.add/mr psum+1, psum+1, *img+8
-	sv.add/mr psum+2, psum+2, *img+16
-	sv.add/mr psum+3, psum+3, *img+24
-	sv.add/mr psum+4, psum+4, *img+32
-	sv.add/mr psum+5, psum+5, *img+40
-	sv.add/mr psum+6, psum+6, *img+48
-	sv.add/mr psum+7, psum+7, *img+56
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	sv.add/mr psum+0, psum+0, *img+0
+	sv.add/mr psum+1, psum+1, *img+8
+	sv.add/mr psum+2, psum+2, *img+16
+	sv.add/mr psum+3, psum+3, *img+24
+	sv.add/mr psum+4, psum+4, *img+32
+	sv.add/mr psum+5, psum+5, *img+40
+	sv.add/mr psum+6, psum+6, *img+48
+	sv.add/mr psum+7, psum+7, *img+56
 
 	# Next the vertical partial sums:
 	# partial_sum_hv[1][x] += px;
-	sv.add/mr *psum+8, *psum+8, *img+0
-	sv.add/mr *psum+8, *psum+8, *img+8
-	sv.add/mr *psum+8, *psum+8, *img+16
-	sv.add/mr *psum+8, *psum+8, *img+24
-	sv.add/mr *psum+8, *psum+8, *img+32
-	sv.add/mr *psum+8, *psum+8, *img+40
-	sv.add/mr *psum+8, *psum+8, *img+48
-	sv.add/mr *psum+8, *psum+8, *img+56
+	sv.add/mr *psum+8, *psum+8, *img+0
+	sv.add/mr *psum+8, *psum+8, *img+8
+	sv.add/mr *psum+8, *psum+8, *img+16
+	sv.add/mr *psum+8, *psum+8, *img+24
+	sv.add/mr *psum+8, *psum+8, *img+32
+	sv.add/mr *psum+8, *psum+8, *img+40
+	sv.add/mr *psum+8, *psum+8, *img+48
+	sv.add/mr *psum+8, *psum+8, *img+56
 
 	# Zero cost registers
-	setvl 0,0,8,0,1,1		# Set VL to 8 elements
-	sv.ori *cost, 0, 0
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	sv.ori *cost, 0, 0
 
 	# cost[2] += partial_sum_hv[0][n] * partial_sum_hv[0][n];
-	sv.maddld/mr cost+2, *psum, *psum, cost+2
+	sv.maddld/mr cost+2, *psum, *psum, cost+2
 	# cost[6] += partial_sum_hv[1][n] * partial_sum_hv[1][n];
-	sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6
+	sv.maddld/mr cost+6, *psum+8, *psum+8, cost+6
 	# cost[2] *= 105
 	# cost[6] *= 105
-	mulli cost+2, cost+2, 105
-	mulli cost+6, cost+6, 105
+	mulli cost+2, cost+2, 105
+	mulli cost+6, cost+6, 105
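+
+	# For reference, a scalar C sketch of the whole partial_sum_hv stage
+	# above (px, hv0, hv1, cost2, cost6 are illustrative names):
+	#
+	#   int64_t hv0[8] = {0}, hv1[8] = {0}, cost2 = 0, cost6 = 0;
+	#   for (int y = 0; y < 8; y++)
+	#       for (int x = 0; x < 8; x++) {
+	#           hv0[y] += px[y][x];    // row reductions: sv.add/mr, scalar dest
+	#           hv1[x] += px[y][x];    // column sums: vector adds of whole rows
+	#       }
+	#   for (int n = 0; n < 8; n++) {
+	#       cost2 += hv0[n] * hv0[n]; // sv.maddld/mr
+	#       cost6 += hv1[n] * hv1[n];
+	#   }
+	#   cost2 *= 105;                 // mulli
+	#   cost6 *= 105;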
 
 	# We're done with partial_sum_hv values, we can reuse the registers
 	# for partial_sum_diag
 
 	# Zero psum registers for partial_sum_diag
-	setvl 0,0,30,0,1,1		# Set VL to 30 elements
-	sv.ori *psum, 0, 0
+	setvl 0,0,30,0,1,1		# Set VL to 30 elements
+	sv.ori *psum, 0, 0
 
 	setvl 0,0,8,0,1,1		# Set VL to 8 elements
 	# First row of diagonal partial sums:
 	# partial_sum_diag[0][y + x] += px;
-	sv.add/mr *psum+0, *psum+0, *img+0
-	sv.add/mr *psum+1, *psum+1, *img+8
-	sv.add/mr *psum+2, *psum+2, *img+16
-	sv.add/mr *psum+3, *psum+3, *img+24
-	sv.add/mr *psum+4, *psum+4, *img+32
-	sv.add/mr *psum+5, *psum+5, *img+40
-	sv.add/mr *psum+6, *psum+6, *img+48
-	sv.add/mr *psum+7, *psum+7, *img+56
+	sv.add/mr *psum+0, *psum+0, *img+0
+	sv.add/mr *psum+1, *psum+1, *img+8
+	sv.add/mr *psum+2, *psum+2, *img+16
+	sv.add/mr *psum+3, *psum+3, *img+24
+	sv.add/mr *psum+4, *psum+4, *img+32
+	sv.add/mr *psum+5, *psum+5, *img+40
+	sv.add/mr *psum+6, *psum+6, *img+48
+	sv.add/mr *psum+7, *psum+7, *img+56
 
 	# Second row of diagonal partial sums:
 	# partial_sum_diag[1][7 + y - x] += px;
-	sv.add/mr *psum+15, *psum+15, *img+56
-	sv.add/mr *psum+16, *psum+16, *img+48
-	sv.add/mr *psum+17, *psum+17, *img+40
-	sv.add/mr *psum+18, *psum+18, *img+32
-	sv.add/mr *psum+19, *psum+19, *img+24
-	sv.add/mr *psum+20, *psum+20, *img+16
-	sv.add/mr *psum+21, *psum+21, *img+8
-	sv.add/mr *psum+22, *psum+22, *img+0
+	sv.add/mr *psum+15, *psum+15, *img+56
+	sv.add/mr *psum+16, *psum+16, *img+48
+	sv.add/mr *psum+17, *psum+17, *img+40
+	sv.add/mr *psum+18, *psum+18, *img+32
+	sv.add/mr *psum+19, *psum+19, *img+24
+	sv.add/mr *psum+20, *psum+20, *img+16
+	sv.add/mr *psum+21, *psum+21, *img+8
+	sv.add/mr *psum+22, *psum+22, *img+0
 	# these were calculated correctly but in reverse order;
 	# since they're only going to be used in a sum, order is not important.
-	setvl 0,0,15,0,1,1		# Set VL to 15 elements
-	sv.ori *tmp, 0, 0
-
 	# cost[0] += (partial_sum_diag[0][n] * partial_sum_diag[0][n] +
 	#             partial_sum_diag[0][14 - n] * partial_sum_diag[0][14 - n]) * d;
 	# Produce squares of all values
-	sv.maddld/mr *tmp, *psum+0, *psum+0, *tmp
-	# Handle the first 8 elements in order, *includes* partial_sum_diag[0][7]!
-	#setvl 0,0,8,0,1,1		# Set VL to 8 elements
-	#sv.mulld *tmp, *tmp, *divt
-	# Handle remaining 7 elements, in reverse order
-	setvl 0,0,7,0,1,1		# Set VL to 7 elements
-	sv.svstep/mrr *tmp2, 6, 1
-	svindex 29,0b1,7,0,0,0,0
-	sv.ori *tmp, *divt, 0
-	#sv.mulld *tmp, *tmp, *divt
-	# Now sum those up to cost[0] element
-	#setvl 0,0,15,0,1,1		# Set VL to 15 elements
-	#sv.add/mr cost+0, *tmp, cost+0
+	setvl 0,0,15,0,1,1		# Set VL to 15 elements
+	sv.mulld *psum+0, *psum+0, *psum+0
+	sv.mulld *psum+0, *psum+0, *divt
+	sv.add/mr cost+0, *psum+0, cost+0
 
 	# Similarly for cost[4]
 	# cost[4] += (partial_sum_diag[1][n] * partial_sum_diag[1][n] +
 	#             partial_sum_diag[1][14 - n] * partial_sum_diag[1][14 - n]) * d;
-	#sv.maddld/mr *tmp, *psum+16, *psum+16, *tmp
-	#sv.maddld/mr *tmp, *psum+24, *psum+24, *tmp
-	#sv.mulld *tmp, *tmp, *divt
-	#sv.add/mr cost+4, *tmp, cost+4
-
-
-	# Zero psum registers for partial_sum_alt, process half of 44
-	#setvl 0,0,22,0,1,1		# Set VL to 22 elements
-	#sv.ori psum, 0, 0
+	sv.mulld *psum+15, *psum+15, *psum+15
+	sv.mulld *psum+15, *psum+15, *divt
+	sv.add/mr cost+4, *psum+15, cost+4
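+
+	# For reference, a scalar C sketch of the two diagonal accumulations
+	# above (diag0, diag1 are illustrative names). Row y of img lands in
+	# 8 consecutive buckets starting at bucket y, which is why each
+	# sv.add above is offset by one psum register per row (the second set
+	# is computed in reverse order, as noted above):
+	#
+	#   int64_t diag0[15] = {0}, diag1[15] = {0};
+	#   for (int y = 0; y < 8; y++)
+	#       for (int x = 0; x < 8; x++) {
+	#           diag0[y + x]     += px[y][x];
+	#           diag1[7 + y - x] += px[y][x];
+	#       }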
 
 	# First row of alt partial sums:
 	# partial_sum_alt [0][y + (x >> 1)] += px;
 	# These are essentially calculated the following way:
 	# horiz axis: x, vert axis: y, quantity of y + (x>>1):
-	#
-	# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
-	# | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |
-	# | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |
-	# | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |
-	# | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |
-	# | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |
-	# | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |
-	# | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |
-	# | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |
 	#
 	# We calculate this in a similar manner to the diagonal
 	# partial sums, but first we have to do pair-wise addition
-	# on all the elements of the img matrix:
-	#setvl 0,0,64,0,1,1		# Set VL to 64 elements
-	#svstep 2
-	#sv.add *img, *img, *img+1
+	# on all the elements of the img matrix, compressing the rows
+	# to half size in the process:
+	#
+	# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+	# | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 3 |      | 0 | 0 | 1 | 2 | 3 |
+	# | 1 | 1 | 1 | 2 | 2 | 3 | 3 | 4 | 4 |      | 1 |   | 1 | 2 | 3 | 4 |
+	# | 2 | 2 | 2 | 3 | 3 | 4 | 4 | 5 | 5 |      | 2 |   | 2 | 3 | 4 | 5 |
+	# | 3 | 3 | 3 | 4 | 4 | 5 | 5 | 6 | 6 |  ->  | 3 |   | 3 | 4 | 5 | 6 |
+	# | 4 | 4 | 4 | 5 | 5 | 6 | 6 | 7 | 7 |      | 4 |   | 4 | 5 | 6 | 7 |
+	# | 5 | 5 | 5 | 6 | 6 | 7 | 7 | 8 | 8 |      | 5 |   | 5 | 6 | 7 | 8 |
+	# | 6 | 6 | 6 | 7 | 7 | 8 | 8 | 9 | 9 |      | 6 |   | 6 | 7 | 8 | 9 |
+	# | 7 | 7 | 7 | 8 | 8 | 9 | 9 | a | a |      | 7 |   | 7 | 8 | 9 | a |
+	#
+	setvl 0,0,16,0,1,1		# Set VL to 16 elements
+	ori pred, 0, 0b0101010101010101
+	sv.add/sm=r3 *psum+0, *img, *img+1
+	sv.add/sm=r3 *psum+16, *img+16, *img+17
+	# Copy the even-numbered registers only
+	sv.ori/sm=r3 *img+0, *psum+0, 0
+	sv.ori/sm=r3 *img+8, *psum+16, 0
+	# Process the next 32 elements
+	sv.add/sm=r3 *psum+0, *img+32, *img+33
+	sv.add/sm=r3 *psum+16, *img+48, *img+49
+	# Copy their sums (again even-numbered registers only)
+	sv.ori/sm=r3 *img+16, *psum+0, 0
+	sv.ori/sm=r3 *img+24, *psum+16, 0
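+
+	# For reference, the net effect of the masked adds and copies above
+	# is a half-width image, pair-summed along each row (a scalar C
+	# sketch; half is an illustrative name):
+	#
+	#   int half[8][4];
+	#   for (int y = 0; y < 8; y++)
+	#       for (int x = 0; x < 4; x++)
+	#           half[y][x] = px[y][2 * x] + px[y][2 * x + 1];
+	#
+	# Each bucket of partial_sum_alt[0] then takes whole 4-element rows
+	# of half, which is what the VL=4 adds below implement.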
+
+	# Clear registers to hold the values
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.ori *psum_alt, 0, 0
+
+	setvl 0,0,4,0,1,1		# Set VL to 4 elements
+	sv.add *psum_alt+0, *psum_alt+0, *img+0
+	sv.add *psum_alt+1, *psum_alt+1, *img+4
+	sv.add *psum_alt+2, *psum_alt+2, *img+8
+	sv.add *psum_alt+3, *psum_alt+3, *img+12
+	sv.add *psum_alt+4, *psum_alt+4, *img+16
+	sv.add *psum_alt+5, *psum_alt+5, *img+20
+	sv.add *psum_alt+6, *psum_alt+6, *img+24
+	sv.add *psum_alt+7, *psum_alt+7, *img+28
+
+	# We need to reshape div_table to ease calculations:
+	# elements 3-7 will be multiplied by 105, elements 0-2 by
+	# 420, 210, 140 and elements 8-10 by 140, 210, 420, so:
+	li divt+0, 420
+	li divt+1, 210
+	li divt+2, 140
+	setvl 0,0,5,0,1,1		# Set VL to 5 elements
+	sv.ori *divt+3, 0, 105
+	li divt+8, 140
+	li divt+9, 210
+	li divt+10, 420
+
+	# Now the following is equivalent to:
+	# for (int m = 0; m < 5; m++)
+	#     cost[1] += partial_sum_alt[0][3 + m] * partial_sum_alt[0][3 + m];
+	# cost[1] *= 105;
+	# for (int m = 0; m < 3; m++) {
+	#     const int d = div_table[2 * m + 1];
+	#     cost[1] += (partial_sum_alt[0][m] * partial_sum_alt[0][m] +
+	#                 partial_sum_alt[0][10 - m] * partial_sum_alt[0][10 - m]) * d;
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
+	sv.mulld *psum_alt+0, *psum_alt+0, *divt
+	sv.add/mr cost+1, *psum_alt+0, cost+1
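+
+	# For reference: with the reshaped 11-entry table, the two m-loops in
+	# the comment above collapse into a single pass, just like the
+	# 15-entry diagonal table (alt0, divt2, cost1 are illustrative names):
+	#
+	#   const int divt2[11] = { 420, 210, 140, 105, 105, 105,
+	#                           105, 105, 140, 210, 420 };
+	#   int64_t cost1 = 0;
+	#   for (int n = 0; n < 11; n++)
+	#       cost1 += alt0[n] * alt0[n] * divt2[n];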
+
+	# Next row of partial_sum_alts:
+	# partial_sum_alt [1][3 + y - (x >> 1)] += px;
+	#
+	# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+	# | 0 | 3 | 3 | 2 | 2 | 1 | 1 | 0 | 0 |      | 0 |   | 3 | 2 | 1 | 0 |
+	# | 1 | 4 | 4 | 3 | 3 | 2 | 2 | 1 | 1 |      | 1 |   | 4 | 3 | 2 | 1 |
+	# | 2 | 5 | 5 | 4 | 4 | 3 | 3 | 2 | 2 |      | 2 |   | 5 | 4 | 3 | 2 |
+	# | 3 | 6 | 6 | 5 | 5 | 4 | 4 | 3 | 3 |  ->  | 3 |   | 6 | 5 | 4 | 3 |
+	# | 4 | 7 | 7 | 6 | 6 | 5 | 5 | 4 | 4 |      | 4 |   | 7 | 6 | 5 | 4 |
+	# | 5 | 8 | 8 | 7 | 7 | 6 | 6 | 5 | 5 |      | 5 |   | 8 | 7 | 6 | 5 |
+	# | 6 | 9 | 9 | 8 | 8 | 7 | 7 | 6 | 6 |      | 6 |   | 9 | 8 | 7 | 6 |
+	# | 7 | a | a | 9 | 9 | 8 | 8 | 7 | 7 |      | 7 | a | 9 | 8 | 7 |
+
+	setvl 0,0,32,0,1,1		# clear everything
+	sv.ori *96, 0, 0
+
+	# Same method; unfortunately we now have to load img again.
+	# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
+	# Load 8x8 16-bit elements from ptr_copy in groups of 8 with stride
+	mr ptr_copy, ptr_orig
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	sv.lha *img, 0(ptr_copy)	# Load 8 halfwords from (ptr_copy)
+	add ptr_copy, ptr_copy, stride	# Advance ptr_copy by stride
+	sv.lha *img + 8, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 16, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 24, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 32, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 40, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 48, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 56, 0(ptr_copy)
+
+	setvl 0,0,64,0,1,1		# Set VL to 64 elements
+	sv.sraw *img, *img, bd		# img[x] >> bitdepth_min_8
+	sv.addi *img, *img, -128	# px = (img[x] >> bitdepth_min_8) - 128
+
+	setvl 0,0,16,0,1,1		# Set VL to 16 elements
+	ori pred, 0, 0b0101010101010101
+	sv.add/sm=r3 *psum+0, *img, *img+1
+	sv.add/sm=r3 *psum+16, *img+16, *img+17
+	# Copy the even-numbered registers only
+	sv.ori/sm=r3 *img+0, *psum+0, 0
+	sv.ori/sm=r3 *img+8, *psum+16, 0
+	# Process the next 32 elements
+	sv.add/sm=r3 *psum+0, *img+32, *img+33
+	sv.add/sm=r3 *psum+16, *img+48, *img+49
+	# Copy their sums (again even-numbered registers only)
+	sv.ori/sm=r3 *img+16, *psum+0, 0
+	sv.ori/sm=r3 *img+24, *psum+16, 0
+
+	# Clear registers to hold the values
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.ori *psum_alt, 0, 0
+
+	setvl 0,0,4,0,1,1		# Set VL to 4 elements
+	sv.add *psum_alt+7, *psum_alt+7, *img+0
+	sv.add *psum_alt+6, *psum_alt+6, *img+4
+	sv.add *psum_alt+5, *psum_alt+5, *img+8
+	sv.add *psum_alt+4, *psum_alt+4, *img+12
+	sv.add *psum_alt+3, *psum_alt+3, *img+16
+	sv.add *psum_alt+2, *psum_alt+2, *img+20
+	sv.add *psum_alt+1, *psum_alt+1, *img+24
+	sv.add *psum_alt+0, *psum_alt+0, *img+28
+
+	# Now the following is equivalent to:
+	# for (int m = 0; m < 5; m++)
+	#     cost[3] += partial_sum_alt[1][3 + m] * partial_sum_alt[1][3 + m];
+	# cost[3] *= 105;
+	# for (int m = 0; m < 3; m++) {
+	#     const int d = div_table[2 * m + 1];
+	#     cost[3] += (partial_sum_alt[1][m] * partial_sum_alt[1][m] +
+	#                 partial_sum_alt[1][10 - m] * partial_sum_alt[1][10 - m]) * d;
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.mulld *psum_alt+0, *psum_alt+0, *psum_alt+0
+	sv.mulld *psum_alt+0, *psum_alt+0, *divt
+	sv.add/mr cost+3, *psum_alt+0, cost+3
+
+	#setvl 0,0,64,0,1,1		# clear everything
+	#sv.ori *img, 0, 0
+	#setvl 0,0,32,0,1,1		# clear everything
+	#sv.ori *96, 0, 0
+
+	# Next row of partial_sum_alts:
+	# partial_sum_alt [2][3 - (y >> 1) + x] += px;
+	#
+	# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+	# | 0 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 0 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 1 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 2 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 3 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 4 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 5 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 6 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 7 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+
+	# We calculate this in a similar manner to the diagonal partial sums,
+	# but first we have to do pair-wise addition, this time across rows,
+	# on all the elements of the img matrix, compressing the columns
+	# to half size in the process.
+
+	# Similar method; unfortunately we have to load img again.
+	# With elwidth and subvl we could pack the data to avoid any loads whatsoever.
+	# Load 8x8 16-bit elements from ptr_copy in groups of 8 with stride
+	mr ptr_copy, ptr_orig
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	sv.lha *img, 0(ptr_copy)	# Load 8 halfwords from (ptr_copy)
+	add ptr_copy, ptr_copy, stride	# Advance ptr_copy by stride
+	sv.lha *img + 8, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 16, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 24, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 32, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 40, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 48, 0(ptr_copy)
+	add ptr_copy, ptr_copy, stride
+	sv.lha *img + 56, 0(ptr_copy)
+
+	setvl 0,0,64,0,1,1		# Set VL to 64 elements
+	sv.sraw *img, *img, bd		# img[x] >> bitdepth_min_8
+	sv.addi *img, *img, -128	# px = (img[x] >> bitdepth_min_8) - 128
+
+	# Clear registers to hold the values
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.ori *psum, 0, 0
+
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	# sum rows 0 & 1, index +3
+	sv.add *psum+3, *psum+3, *img+0
+	sv.add *psum+3, *psum+3, *img+8
+	# sum rows 2 & 3, index +2
+	sv.add *psum+2, *psum+2, *img+16
+	sv.add *psum+2, *psum+2, *img+24
+	# sum rows 4 & 5, index +1
+	sv.add *psum+1, *psum+1, *img+32
+	sv.add *psum+1, *psum+1, *img+40
+	# sum rows 6 & 7, index +0
+	sv.add *psum+0, *psum+0, *img+48
+	sv.add *psum+0, *psum+0, *img+56
+
+	# Now the following is equivalent to:
+	# for (int m = 0; m < 5; m++)
+	#     cost[5] += partial_sum_alt[2][3 + m] * partial_sum_alt[2][3 + m];
+	# cost[5] *= 105;
+	# for (int m = 0; m < 3; m++) {
+	#     const int d = div_table[2 * m + 1];
+	#     cost[5] += (partial_sum_alt[2][m] * partial_sum_alt[2][m] +
+	#                 partial_sum_alt[2][10 - m] * partial_sum_alt[2][10 - m]) * d;
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.mulld *psum+0, *psum+0, *psum+0
+	sv.mulld *psum+0, *psum+0, *divt
+	sv.add/mr cost+5, *psum+0, cost+5
+
+	# Next row of partial_sum_alts:
+	# partial_sum_alt [3][(y >> 1) + x] += px;
+	#
+	# |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |
+	# | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 0 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |      | 1 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 2 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |      | 2 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 3 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |  ->  | 3 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 4 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 4 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 5 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |      | 5 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 6 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 6 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
+	# | 7 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | a |      | 7 |   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
-	#setvl 0,0,8,0,1,1		# Set VL to 8 elements
-	#sv.add *psum+0, *psum+0, *img+0
-	#sv.add *psum+0, *psum+0, *img+1
-	#sv.add *psum+1, *psum+1, *img+8
-	#sv.add *psum+1, *psum+1, *img+9
+	# This calculation is similar to the previous one, and since we have
+	# enough registers available, we don't have to reload img.
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.ori *psum, 0, 0
-	#setvl 0,0,10,0,1,1		# Set VL to 2 elements
-	#sv.add/mr *psum, *psum, *psum+1
-#
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	# sum rows 0 & 1, index +0
+	sv.add *psum+0, *psum+0, *img+0
+	sv.add *psum+0, *psum+0, *img+8
+	# sum rows 2 & 3, index +1
+	sv.add *psum+1, *psum+1, *img+16
+	sv.add *psum+1, *psum+1, *img+24
+	# sum rows 4 & 5, index +2
+	sv.add *psum+2, *psum+2, *img+32
+	sv.add *psum+2, *psum+2, *img+40
+	# sum rows 6 & 7, index +3
+	sv.add *psum+3, *psum+3, *img+48
+	sv.add *psum+3, *psum+3, *img+56
+
+	# Now the following is equivalent to:
+	# for (int m = 0; m < 5; m++)
+	#     cost[7] += partial_sum_alt[3][3 + m] * partial_sum_alt[3][3 + m];
+	# cost[7] *= 105;
+	# for (int m = 0; m < 3; m++) {
+	#     const int d = div_table[2 * m + 1];
+	#     cost[7] += (partial_sum_alt[3][m] * partial_sum_alt[3][m] +
+	#                 partial_sum_alt[3][10 - m] * partial_sum_alt[3][10 - m]) * d;
+	setvl 0,0,11,0,1,1		# Set VL to 11 elements
+	sv.mulld *psum+0, *psum+0, *psum+0
+	sv.mulld *psum+0, *psum+0, *divt
+	sv.add/mr cost+7, *psum+0, cost+7
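+
+	# The commented-out block below is the still-unfinished final stage.
+	# For reference, a scalar C sketch of what it is expected to compute,
+	# following the usual C implementation of cdef_find_dir (selection of
+	# the best direction, plus the variance written through var):
+	#
+	#   int best_dir = 0;
+	#   int64_t best_cost = cost[0];
+	#   for (int n = 1; n < 8; n++)
+	#       if (cost[n] > best_cost) { best_cost = cost[n]; best_dir = n; }
+	#   *var = (int)((best_cost - cost[best_dir ^ 4]) >> 10);
+	#   return best_dir;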
+
+	setvl 0,0,8,0,1,1		# Set VL to 8 elements
+	#sv.maxs/mr max, *cost
+	#sv.cmp/ff=ne/VLI max, *cost, 1
+#	sv.addi/m=eq retval, *, 0
 	blr
 	.long 0
 	.byte 0,0,0,0,0,0,0,0
-- 
2.30.2