13 .file "vpx_get4x4sse_cs_svp64_real.c"
14 # vpx_get4x4sse_cs_svp64_real: sum of squared differences between a 4x4 block
15 # of 16-bit samples at src_ptr and a 4x4 block at ref_ptr. Strides are passed
16 # in units of 16-bit elements. The result is returned in r3.
17 .globl vpx_get4x4sse_cs_svp64_real
18 .type vpx_get4x4sse_cs_svp64_real, @function
19 vpx_get4x4sse_cs_svp64_real:
20 # src_ptr, src_stride, ref_ptr, ref_stride, sum, src, ref, diff and prod are
21 # symbolic register names defined outside this listing (presumably macros).
22 # src, ref, diff and prod each name the first of 16 consecutive registers.
23 li sum, 0 # sum = 0 (accumulator for the final reduction)
24 sldi src_stride, src_stride, 1 # strides are given in 16-bit elements,
25 sldi ref_stride, ref_stride, 1 # so double them to get byte offsets
26 # Load 16 elements from src_ptr and from ref_ptr, in groups of 4 (one row each)
27 setvl 0,0,4,0,1,1 # Set VL to 4 elements (one row)
28 sv.lha *src, 0(src_ptr) # src[0..3] = row 0, sign-extended halfwords
29 add src_ptr, src_ptr, src_stride # Advance src_ptr to the next row
30 sv.lha *src + 4, 0(src_ptr) # src[4..7] = row 1
31 add src_ptr, src_ptr, src_stride
32 sv.lha *src + 8, 0(src_ptr) # src[8..11] = row 2
33 add src_ptr, src_ptr, src_stride
34 sv.lha *src + 12, 0(src_ptr) # src[12..15] = row 3
35 sv.lha *ref, 0(ref_ptr) # ref[0..3] = row 0, sign-extended halfwords
36 add ref_ptr, ref_ptr, ref_stride # Advance ref_ptr to the next row
37 sv.lha *ref + 4, 0(ref_ptr) # ref[4..7] = row 1
38 add ref_ptr, ref_ptr, ref_stride
39 sv.lha *ref + 8, 0(ref_ptr) # ref[8..11] = row 2
40 add ref_ptr, ref_ptr, ref_stride
41 sv.lha *ref + 12, 0(ref_ptr) # ref[12..15] = row 3
43 # now our values are in consecutive registers and we can set VL to 16 elements
44 setvl 0,0,16,0,1,1 # Set VL to 16 so the ops below cover all 4 rows, not just row 0
45 # equivalent to: for (i = 0; i < 16; i++) diff[i] = src[i] - ref[i];
46 sv.subf *diff, *src, *ref
47 # equivalent to: for (i = 0; i < 16; i++) prod[i] = diff[i] * diff[i];
48 sv.mulld *prod, *diff, *diff
49 # equivalent to: for (i = 0; i < 16; i++) sum += prod[i];
50 sv.add/mr sum, *prod, sum # map-reduce form: scalar sum accumulates every prod[i]
51 mr 3, sum # Set r3 (return value) to sum
52 blr # Return to the caller instead of falling off the end of the function
57 .size vpx_get4x4sse_cs_svp64_real,.-vpx_get4x4sse_cs_svp64_real
58 .ident "GCC: (Debian 8.3.0-6) 8.3.0"
59 .section .note.GNU-stack,"",@progbits