    .file   "variance_svp64_real.c"
    .globl  variance_svp64_real
    .type   variance_svp64_real, @function
variance_svp64_real:
    li      sum, 0                       # Set sum to zero
    li      sse, 0                       # Set sse to zero
    li      row, 0                       # Set row to zero
    sldi    src_stride, src_stride, 1    # strides are in 16-bit elements,
    sldi    ref_stride, ref_stride, 1    # so scale them to bytes
    # XXX this can go, no need to divide by 4
    srdi    width, width, 2              # we process groups of 4 elements
    # XXX this is to be moved inside (top of) the .L2 loop
    setvl   0, 0, 4, 0, 1, 1             # Set VL to 4 elements
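    # With VL = 4, each sv.* instruction below repeats its scalar operation
    # across 4 consecutive registers for every *-prefixed (vector) operand.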
.L1:                                     # outer loop: for (r = 0; r < h; r++)
    mr      src_col, src_ptr             # Temporary column pointers
    mr      ref_col, ref_ptr
    mr      ctr, width                   # Set CTR to width/4 on each row
    mtctr   ctr                          # Set up the counter
.L2:                                     # inner loop: for (c = 0; c < w; c += 4)
    # XXX setvl 30,0,4,0,1,1 # Set MAXVL=4 and r30=VL=MIN(CTR,MAXVL)
    # Load 4 elements each from (src_col) and (ref_col)
    sv.lha  *src, 0(src_col)             # Load 4 halfwords from (src_col)
    sv.lha  *ref, 0(ref_col)             # Load 4 halfwords from (ref_col)
    # equivalent to: for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
    sv.subf *diff, *ref, *src
    # equivalent to: for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
    sv.mulld *prod, *diff, *diff
    # equivalent to: for (i = 0; i < 4; i++) sum += diff[i];
    sv.add/mr sum, *diff, sum
    # equivalent to: for (i = 0; i < 4; i++) sse += diff[i] * diff[i];
    sv.add/mr sse, *prod, sse
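    # Note: /mr selects map-reduce mode: the scalar accumulator (sum, sse)
    # is fed back into each of the VL element additions rather than being
    # overwritten per element.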
    addi    src_col, src_col, 8          # Advance src_col and ref_col by
    addi    ref_col, ref_col, 8          # 4 elements (8 bytes)
    # XXX replace with "sv.bc/all 16,*0,.L2", which does "CTR -= VL"
    bdnz    .L2                          # Loop until CTR is zero
    add     src_ptr, src_ptr, src_stride # Advance src_ptr by src_stride
    add     ref_ptr, ref_ptr, ref_stride # Advance ref_ptr by ref_stride
    addi    row, row, 1                  # Add 1 to row
    cmpw    cr1, row, height             # Is row equal to height?
    bne     cr1, .L1                     # Go back to .L1 if not done
    std     sum, 0(sum_ptr)              # Store sum to (sum_ptr)
    std     sse, 0(sse_ptr)              # Store sse to (sse_ptr)
    blr                                  # Return
    .size   variance_svp64_real, .-variance_svp64_real
    .ident  "GCC: (Debian 8.3.0-6) 8.3.0"
    .section .note.GNU-stack,"",@progbits
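
For reference, here is a minimal C sketch of what the listing computes, reconstructed from the inline "equivalent to" comments. The exact prototype and argument order are assumptions; strides are taken in 16-bit elements to match the sldi scaling in the prologue.

    #include <stdint.h>

    /* Hedged reconstruction: sum of differences and sum of squared
     * differences over a height x width block of 16-bit samples.
     * Strides are in elements, mirroring the sldi byte scaling above. */
    void variance_svp64_real(const int16_t *src_ptr, int src_stride,
                             const int16_t *ref_ptr, int ref_stride,
                             int width, int height,
                             int64_t *sum_ptr, int64_t *sse_ptr)
    {
        int64_t sum = 0, sse = 0;
        for (int r = 0; r < height; r++) {
            for (int c = 0; c < width; c++) {
                int64_t diff = src_ptr[c] - ref_ptr[c];
                sum += diff;          /* matches sv.add/mr sum, *diff, sum */
                sse += diff * diff;   /* matches sv.add/mr sse, *prod, sse */
            }
            src_ptr += src_stride;    /* advance one row */
            ref_ptr += ref_stride;
        }
        *sum_ptr = sum;
        *sse_ptr = sse;
    }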