pysvp64db: fix traversal
[openpower-isa.git] / media / video / libvpx / variance_svp64_real.s
# GPR aliases used by variance_svp64_real.  Each symbol is a plain
# general-purpose register number.

# r3..r10: the routine's eight inputs, read on entry.
        .set    src_ptr, 3              # address of the current source row
        .set    src_stride, 4           # source row pitch (given in 16-bit elements)
        .set    ref_ptr, 5              # address of the current reference row
        .set    ref_stride, 6           # reference row pitch (given in 16-bit elements)
        .set    width, 7                # columns per row
        .set    height, 8               # number of rows
        .set    sse_ptr, 9              # where the sum of squared differences is stored
        .set    sum_ptr, 10             # where the sum of differences is stored

# Scalar working registers.
        .set    sum, 11                 # running sum of differences
        .set    sse, 12                 # running sum of squared differences
        # NOTE(review): r13 is the thread pointer under the ELFv2 ABI that
        # this file selects with .abiversion 2 -- avoid writing to it.
        .set    ctr, 13                 # GPR set aside for staging the loop count into CTR
        # NOTE(review): r14 and above are non-volatile under ELFv2; confirm
        # that callers tolerate them being overwritten without a save.
        .set    src_col, 14             # moving source pointer within a row
        .set    ref_col, 15             # moving reference pointer within a row
        .set    row, 16                 # row counter

# First GPR of each 4-element SVP64 vector (the bases are 4 registers apart).
        .set    src, 20                 # loaded source samples
        .set    ref, 24                 # loaded reference samples
        .set    diff, 28                # src[i] - ref[i]
        .set    prod, 32                # diff[i] * diff[i]
19
# Assembler setup: Libre-SOC (SVP64) instruction set, ELFv2 ABI, code in .text.
        .machine libresoc
        .file   "variance_svp64_real.c"
        .abiversion 2
        .section        ".text"
        .align 2
        .globl  variance_svp64_real
        .type   variance_svp64_real, @function
#-----------------------------------------------------------------------
# variance_svp64_real
#
# Walks two 2-D blocks of 16-bit samples and accumulates
#       sum = SUM(src[i] - ref[i])
#       sse = SUM((src[i] - ref[i])^2)
# four columns at a time with SVP64 vector instructions.
#
# In:   src_ptr, ref_ptr        addresses of the first row of each block
#       src_stride, ref_stride  row pitch in 16-bit elements
#       width                   columns per row; handled in groups of 4,
#                               the trailing (width % 4) columns are skipped
#       height                  number of rows
#       sse_ptr, sum_ptr        addresses of the two doubleword results
# Out:  doubleword at 0(sum_ptr) = sum, doubleword at 0(sse_ptr) = sse.
#       If height <= 0 or width < 4 nothing is loaded and both results
#       are stored as 0.
# Clobbers: src_ptr, ref_ptr, src_stride, ref_stride, width, sum, sse,
#       src_col, ref_col, row, the four 4-register vectors starting at
#       src/ref/diff/prod, CTR, cr1 and VL.
# NOTE(review): src_col, ref_col, row and the vectors at r20..r31 are
#       non-volatile registers under ELFv2 and are not saved here --
#       confirm that every caller tolerates this.
#-----------------------------------------------------------------------
variance_svp64_real:
.LFB0:
        .cfi_startproc
        li      sum, 0                          # sum = 0
        li      sse, 0                          # sse = 0
        li      row, 0                          # r = 0

        # Degenerate sizes: without these checks a zero row count never
        # satisfies the "row == height" exit test, and a zero group count
        # makes bdnz wrap CTR around instead of terminating.
        cmpwi   cr1, height, 0
        ble     cr1, .Lstore                    # no rows: store zeros
        cmpdi   cr1, width, 4
        blt     cr1, .Lstore                    # no full group of 4: store zeros

        sldi    src_stride, src_stride, 1       # strides are for 16-bit elements,
        sldi    ref_stride, ref_stride, 1       # pointers advance in bytes
        # XXX this can go, no need to divide by 4
        srdi    width, width, 2                 # width = number of 4-column groups
        # XXX this to be moved inside (top of) the .Lcol_loop loop
        setvl   0,0,4,0,1,1                     # Set VL to 4 elements

.Lrow_loop:                                     # for (r = 0; r < h; r++)
        mr      src_col, src_ptr                # restart column pointers at the
        mr      ref_col, ref_ptr                # beginning of the current row
        mtctr   width                           # CTR = width/4 groups in this row
.Lcol_loop:                                     # for (c = 0; c < w; c += 4)
        # XXX setvl 30,0,4,0,1,1 # Set MAXVL=4, and r30=VL=MIN(CTR,MAXVL)
        # Load 4 sign-extended halfwords from each block.
        sv.lha  *src, 0(src_col)
        sv.lha  *ref, 0(ref_col)

        # for (i = 0; i < 4; i++) diff[i] = src[i] - ref[i];
        sv.subf *diff, *ref, *src
        # for (i = 0; i < 4; i++) prod[i] = diff[i] * diff[i];
        sv.mulld *prod, *diff, *diff
        # for (i = 0; i < 4; i++) sum += diff[i];
        sv.add/mr sum, *diff, sum
        # for (i = 0; i < 4; i++) sse += diff[i] * diff[i];
        sv.add/mr sse, *prod, sse

        addi    src_col, src_col, 8             # 4 elements * 2 bytes
        addi    ref_col, ref_col, 8
        # XXX replace with "sv.bc/all 16,*0,.Lcol_loop" which does "CTR -= VL"
        bdnz    .Lcol_loop                      # next group until CTR reaches zero

        add     src_ptr, src_ptr, src_stride    # step both blocks to the next row
        add     ref_ptr, ref_ptr, ref_stride
        addi    row, row, 1
        cmpw    cr1, row, height                # all rows done?
        bne     cr1, .Lrow_loop
.Lstore:
        std     sum, 0(sum_ptr)                 # *sum_ptr = sum
        std     sse, 0(sse_ptr)                 # *sse_ptr = sse
        blr
        # NOTE(review): data words after the return, kept verbatim --
        # presumably the compiler-emitted traceback table; confirm.
        .long 0
        .byte 0,0,0,0,0,3,0,0
        .cfi_endproc
.LFE0:
        .size   variance_svp64_real,.-variance_svp64_real
        .ident  "GCC: (Debian 8.3.0-6) 8.3.0"
        .section        .note.GNU-stack,"",@progbits