pysvp64db: fix traversal
[openpower-isa.git] / media / video / libvpx / vp8_dct4x4_real.s
# Symbolic names for the register numbers used by the routine in this file.
# in/out are used as base-address registers; ip/t/t2/t3/op are the first
# register of each vector operand (written with a leading `*` at use sites).
1 .set in, 3 # base address register for the input load: 0(in)
2 .set out, 4 # base address register for the output store: 0(out)
3 .set pitch, 5 # not referenced in the visible code
4 .set c_2217, 6 # not referenced in the visible code (2217 is used as an immediate)
5 .set c_5352, 7 # not referenced in the visible code (5352 is used as an immediate)
6 .set c_7500, 9 # not referenced in the visible code (7500 is used as an immediate)
7 .set c_12000, 11 # not referenced in the visible code (12000 is used as an immediate)
8 .set c_51000, 12 # scalar register loaded with the constant 51000
9 .set pred, 10 # predicate mask register; use sites spell it r10 (/m=r10, /dm=r10)
10 .set ip, 16 # vector: input values loaded from (in)
11 .set t, 32 # vector: a1/b1/c1/d1 intermediates
12 .set t2, 50 # vector: products by 2217
13 .set t3, 70 # vector: products by 5352
14 .set op, 90 # vector: output values stored to (out)
15
16 .machine libresoc
17 .file "vp8_dct4x4_real.c"
18 .abiversion 2
19 .section ".text"
20 .align 2
21 .globl vp8_short_fdct4x4_svp64_real
22 .type vp8_short_fdct4x4_svp64_real, @function
# vp8_short_fdct4x4_svp64_real
#
# 4x4 forward DCT (going by the symbol name) written with SVP64 vector
# instructions. The routine:
#   1. loads halfwords from 0(in) into vector ip (VL = 16),
#   2. runs a first butterfly pass over groups of four elements
#      (ip[0..3], ip[4..7], ...), leaving the results in vector op,
#   3. runs the column-wise pass over op (elements 0/4/8/12 apart),
#   4. stores vector op as halfwords to 0(out).
#
# Each step is restricted to a subset of the 16 elements by writing a bit
# mask into pred (r10) and predicating the following sv.* instructions on
# it; bit i of the mask corresponds to element i of the destination vector.
#
# Writes: pred (r10), c_51000, vectors ip, t, t2, t3, op, and the CR fields
# written by sv.cmpi. `pitch` is not read.
#
# NOTE(review): every mask is built with `ori pred, 0, mask`. The RS operand
# of ori names a GPR, so `0` here is r0 rather than a literal zero, and the
# mask is OR-ed with whatever r0 holds. This looks like it assumes r0 == 0
# on entry -- confirm against the caller / test harness.
23 vp8_short_fdct4x4_svp64_real:
24 .LFB0:
25 .cfi_startproc
26 li c_51000, 25500 # 51000 is above the signed 16-bit immediate range, so load half of it
27 sldi c_51000, c_51000, 1 # c_51000 = 51000
28 setvl 0,0,16,0,1,1 # Set VL to 16 elements
29 sv.lha *ip, 0(in) # Load the input halfwords from (in) into vector ip (VL = 16)
30
# First pass: mask bits 0,4,8,12 select one result per group of four inputs.
31 ori pred, 0, 0b0001000100010001
32 sv.add/dm=r10 *t, *ip, *ip+3 # a1 = ip[0] + ip[3]
33 sv.add/dm=r10 *t+1, *ip+1, *ip+2 # b1 = ip[1] + ip[2]
34 sv.subf/dm=r10 *t+2, *ip+2, *ip+1 # c1 = ip[1] - ip[2]
35 sv.subf/dm=r10 *t+3, *ip+3, *ip # d1 = ip[0] - ip[3]
36 sv.mulli *t, *t, 8 # a1 *= 8, b1 *= 8, c1 *= 8, d1 *= 8 (unpredicated: all of t)
37
38 sv.add/dm=r10 *op, *t, *t+1 # op[0] = a1 + b1;
39 sv.subf/dm=r10 *op+2, *t+1, *t # op[2] = a1 - b1;
40
41 # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
# Mask bits 2,3 of each group of four: the c1 and d1 slots of t.
42 ori pred, 0, 0b1100110011001100
43 sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217
44 sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352
45
# Mask bit 1 of each group of four: the op[1] slot.
46 ori pred, 0, 0b0010001000100010
47 # op[1] = (c1 * 2217 + d1 * 5352 + 14500)
48 sv.add/m=r10 *op, *t2+1, *t3+2 # c1 * 2217 + d1 * 5352
49 sv.addi/m=r10 *op, *op, 14500 # + 14500
50
# Mask bit 2 of each group of four; with the +1 offset this writes the op[3] slot.
51 ori pred, 0, 0b0100010001000100
52 # op[3] = (d1 * 2217 - c1 * 5352 + 7500)
53 sv.subf/m=r10 *op+1, *t3, *t2+1 # - c1 * 5352 + d1 * 2217
54 sv.addi/m=r10 *op+1, *op+1, 7500 # + 7500
55
# Mask bits 1,3 of each group of four: the op[1] and op[3] slots.
56 ori pred, 0, 0b1010101010101010
57 sv.rldicl/m=r10 *op, *op, 52, 12 # op[1] >>= 12, op[3] >>= 12 (rotate left 52, clear top 12 bits)
58
59 # column-wise DCT
# The inputs of this pass are the first-pass results held in op
# (called ip[] in the comments below). Mask bits 0..3: one result per column.
60 ori pred, 0, 0b0000000000001111
61 sv.add/m=r10 *t, *op, *op+12 # a1 = ip[0] + ip[12]
62 sv.add/m=r10 *t+4, *op+4, *op+8 # b1 = ip[4] + ip[8]
63 sv.subf/m=r10 *t+8, *op+8, *op+4 # c1 = ip[4] - ip[8]
64 sv.subf/m=r10 *t+12, *op+12, *op # d1 = ip[0] - ip[12]
65
66 # op[0] = (a1 + b1 + 7) >> 4
67 sv.add/m=r10 *op, *t, *t+4 # op[0] = a1 + b1
68 sv.addi/m=r10 *op, *op, 7 # op[0] += 7
69
70 # op[8] = (a1 - b1 + 7) >> 4
71 sv.subf/m=r10 *op+8, *t+4, *t # op[8] = a1 - b1
72 sv.addi/m=r10 *op+8, *op+8, 7 # op[8] += 7
73
# Mask bits 0..3 and 8..11: the op[0] and op[8] rows.
74 ori pred, 0, 0b0000111100001111
75 sv.rldicl/m=r10 *op, *op, 60, 4 # op[0] >>= 4, op[8] >>= 4 (rotate left 60, clear top 4 bits)
76
77 # Calculate c1 * 2217, c1 *5352, d1 * 2217 and d1 * 5352
# Mask bits 8..15: the c1 (t[8..11]) and d1 (t[12..15]) slots.
78 ori pred, 0, 0b1111111100000000
79 sv.mulli/m=r10 *t2, *t, 2217 # t2 has c1 * 2217, d1 * 2217
80 sv.mulli/m=r10 *t3, *t, 5352 # t3 has c1 * 5352, d1 * 5352
81
82 # op[4] = ((c1 * 2217 + d1 * 5352 + 12000)
83 ori pred, 0, 0b0000000011110000
84 sv.add/m=r10 *op, *t2+4, *t3+8 # c1 * 2217 + d1 * 5352
85 sv.addi/m=r10 *op, *op, 12000 # + 12000
86
87 # op[12] = (d1 * 2217 - c1 * 5352 + 51000)
88 ori pred, 0, 0b1111000000000000
89 sv.subf/m=r10 *op, *t3-4, *t2 # - c1 * 5352 + d1 * 2217
90 sv.add/m=r10 *op, *op, c_51000 # + 51000 (taken from a register; see the li/sldi at entry)
91
# Mask bits 4..7 and 12..15: the op[4] and op[12] rows.
92 ori pred, 0, 0b1111000011110000
93 sv.rldicl/m=r10 *op, *op, 48, 16 # op[4] >>= 16, op[12] >>= 16
94
95 # op[4] += (d1 != 0)
96 #ori pred, 0, 0b0000000011110000
97 setvl 0,0,4,0,1,1 # Set VL to 4 elements
98 sv.cmpi *cr0, 0, *t+12, 1 # compare d1 (t[12..15]). NOTE(review): the last operand (1) looks like the compare immediate, but the comment above says d1 != 0 -- confirm
99 sv.addi/m=ne *op+4, *op+4, 1 # op[4..7] += 1 where the compare result is "ne"
100
101 # store to buffer
102 setvl 0,0,16,0,1,1 # Set VL to 16 elements
103 sv.sth *op, 0(out) # Store vector op as halfwords to (out)
104 blr
105 .long 0
106 .byte 0,0,0,0,128,1,0,1
107 .cfi_endproc
108 .LFE0:
109 .size vp8_short_fdct4x4_svp64_real,.-vp8_short_fdct4x4_svp64_real
110 .ident "GCC: (Debian 8.3.0-6) 8.3.0"
111 .section .note.GNU-stack,"",@progbits