.set ref_col, 15
.set row, 16
.set src, 20
-.set ref, 36
-.set diff, 52
-.set prod, 68
+.set ref, 24 # base of the ref vector: 4 above src, matching the VL of 4 set before the loops
+.set diff, 28 # base of the diff vector (written by sv.subf), 4 above ref
+.set prod, 32 # base of the prod vector (written by sv.mulld), 4 above diff
.machine libresoc
.file "variance_svp64_real.c"
variance_svp64_real:
.LFB0:
.cfi_startproc
- # Set sum to zero
- li sum, 0 # Set sum to zero
- li sse, 0 # Set sse to zero
- mr row, height # Set row to height
+ li sum, 0 # sum = 0: running total of the differences
+ li sse, 0 # sse = 0: running total of the squared differences
+ li row, 0 # row = 0: counts up until it equals height
sldi src_stride, src_stride, 1 # strides are for 16-bit elements
sldi ref_stride, ref_stride, 1 # we need to increase by bytes
- srdi ctr, width, 2
- mtctr ctr
+ srdi width, width, 2 # width = width / 4: groups of 4 elements per row (width itself is overwritten)
setvl 0,0,4,0,1,1 # Set VL to 4 elements
.L1: # outer loop: for (r=0; r < h; r++)
-
-.L2: # inner loop: for (c=0; c < w; c += 4)
- # Load 4 elements from src_ptr and ref_ptr, at groups of 4
mr src_col, src_ptr # Temporary variables
mr ref_col, ref_ptr
+ mr ctr, width # ctr = width, which was already divided by 4 above; reloaded at the start of every row
+ mtctr ctr # CTR = inner-loop iterations for this row. NOTE(review): width < 4 gives CTR = 0 - confirm callers never pass that
+.L2: # inner loop: for (c=0; c < w; c += 4)
+ # Load 4 elements from src_col and ref_col
sv.lha *src, 0(src_col) # Load 4 ints from (src_ptr)
sv.lha *ref, 0(ref_col) # Load 4 ints from (ref_ptr)
- addi src_col, src_col, 8 # Increment src, ref by 8 bytes
- addi ref_col, ref_col, 8
# equivalent to: for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
- sv.subf *diff, *src, *ref
+ sv.subf *diff, *ref, *src # subf subtracts its second operand from its third, so this is diff = src - ref
# equivalent to: for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
sv.mulld *prod, *diff, *diff
# equivalent to: for (i = 0; i < 4; i++) sum += diff[i];
sv.add/mr sum, *diff, sum
- # equivalent to: for (i = 0; i < 4; i++) sum += diff[i];
+ # equivalent to: for (i = 0; i < 4; i++) sse += diff[i]*diff[i];
sv.add/mr sse, *prod, sse
- bdnz .L2 # Loop until CTR is zero
+ addi src_col, src_col, 8 # Advance src_col by 8 bytes (4 x 16-bit elements)
+ addi ref_col, ref_col, 8 # Advance ref_col by the same 8 bytes
+ bdnz .L2 # Decrement CTR and loop until it is zero
+ # Row finished: step both base pointers on to the next row
add src_ptr, src_ptr, src_stride # Advance src_ptr by src_stride
add ref_ptr, ref_ptr, ref_stride # Advance ref_ptr by ref_stride
-
- subi row, row, 1 # Subtract 1 from row
- cmpwi cr1, row, 0 # Is row zero?
- bne cr1, .L1 # Go back to L1 if not done
- std sum, 0(sum_ptr) # Set (sum_ptr) to sum
- std sse, 0(sse_ptr) # Set (sum_ptr) to sum
+ addi row, row, 1 # Add 1 to row
+ cmpw cr1, row, height # Is row equal to height? NOTE(review): row is already 1 on the first compare, so height == 0 would not exit here - confirm callers pass height >= 1
+ bne cr1, .L1 # Go back to L1 if not done
+ std sum, 0(sum_ptr) # Set (sum_ptr) to sum
+ std sse, 0(sse_ptr) # Set (sse_ptr) to sse
blr
.long 0
.byte 0,0,0,0,0,3,0,0