    .file "variance_svp64_real.c"
    .globl variance_svp64_real
    .type variance_svp64_real, @function
variance_svp64_real:
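    # Symbolic register names used below (src_ptr, ref_ptr, src_stride,
    # ref_stride, width, height, sum_ptr, sse_ptr, sum, sse, row, ctr,
    # src_col, ref_col, src, ref, diff, prod) are assumed to be .set GPR
    # aliases defined earlier in the full file (elided from this listing).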
    li sum, 0 # Set sum to zero
    li sse, 0 # Set sse to zero
    li row, 0 # Set row to zero
    sldi src_stride, src_stride, 1 # Strides count 16-bit elements,
    sldi ref_stride, ref_stride, 1 # so double them to advance in bytes
    srdi width, width, 2 # Divide width by 4: we load groups of 4
    setvl 0,0,4,0,1,1 # Set VL to 4 elements
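    # setvl operand order is assumed here to be RT, RA, SVi, vf, vs, ms
    # (per the SVP64 spec): vs=1 and ms=1 set both VL and MAXVL to the
    # immediate 4, and vf=0 keeps the default horizontal-first execution.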
.L1: # outer loop: for (r = 0; r < h; r++)
    mr src_col, src_ptr # Per-row working copies of the
    mr ref_col, ref_ptr # source and reference pointers
    mr ctr, width # width now holds w/4
    mtctr ctr # Set up counter
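    # mtctr/bdnz form a Power ISA hardware loop: the bdnz below
    # decrements CTR and branches back while it is non-zero, so the
    # inner loop body runs exactly width/4 times per row.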
.L2: # inner loop: for (c = 0; c < w; c += 4)
    # Load 4 elements each from src and ref
    sv.lha *src, 0(src_col) # Load 4 sign-extended halfwords from (src_col)
    sv.lha *ref, 0(ref_col) # Load 4 sign-extended halfwords from (ref_col)
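    # SVP64 notation: the "sv." prefix vectorizes a scalar Power ISA op
    # over VL elements, and "*reg" marks a vector operand, stepping
    # through VL consecutive registers starting at reg.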
    # equivalent to: for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
    sv.subf *diff, *ref, *src
    # equivalent to: for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
    sv.mulld *prod, *diff, *diff
    # equivalent to: for (i = 0; i < 4; i++) sum += diff[i];
    sv.add/mr sum, *diff, sum
    # equivalent to: for (i = 0; i < 4; i++) sse += diff[i]*diff[i];
    sv.add/mr sse, *prod, sse
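    # The /mr suffix selects SVP64 map-reduce mode: with a scalar
    # destination that is also a source, the vector operand is folded
    # into the scalar, giving a horizontal reduction in one instruction.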
    addi src_col, src_col, 8 # Advance src_col and ref_col by
    addi ref_col, ref_col, 8 # 8 bytes (4 x 16-bit elements)
    bdnz .L2 # Loop until CTR is zero
    add src_ptr, src_ptr, src_stride # Advance src_ptr by src_stride
    add ref_ptr, ref_ptr, ref_stride # Advance ref_ptr by ref_stride
    addi row, row, 1 # Add 1 to row
    cmpw cr1, row, height # Is row equal to height?
    bne cr1, .L1 # Go back to .L1 if not done
    std sum, 0(sum_ptr) # Set (sum_ptr) to sum
    std sse, 0(sse_ptr) # Set (sse_ptr) to sse
    blr # Return to caller
    .size variance_svp64_real,.-variance_svp64_real
    .ident "GCC: (Debian 8.3.0-6) 8.3.0"
    .section .note.GNU-stack,"",@progbits
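
# For reference, a scalar C sketch of what this routine computes. This
# is a reconstruction from the register names and comments above, not
# the project's actual prototype: parameter names, types and order are
# assumptions (16-bit elements, strides in elements, 64-bit outputs to
# match the std stores above).
#
#   void variance_svp64_real(const int16_t *src_ptr, int src_stride,
#                            const int16_t *ref_ptr, int ref_stride,
#                            int width, int height,
#                            int64_t *sum_ptr, int64_t *sse_ptr)
#   {
#       int64_t sum = 0, sse = 0;
#       for (int r = 0; r < height; r++) {
#           for (int c = 0; c < width; c++) {
#               int64_t diff = src_ptr[c] - ref_ptr[c];
#               sum += diff;
#               sse += diff * diff;
#           }
#           src_ptr += src_stride;
#           ref_ptr += ref_stride;
#       }
#       *sum_ptr = sum;
#       *sse_ptr = sse;
#   }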