pysvp64db: fix traversal
[openpower-isa.git] / media / audio / mp3 / mp3_1_imdct36_float_basicsv.s
# ffmpeg, LGPL 2.1 or later
#
# Some instructions could be saved by using fmac (sv.fmadds, sv.fnmsubs),
# but the accuracy of the fused operations is so high that they produce
# different results. This demo therefore uses fmuls followed by
# fmsub/fmadd in map-reduce mode.
# Also note: the FP registers are overwritten, not saved on the stack yet.
# At some point 128 registers will be available, meaning that an EABI
# will be defined where there will be plenty of temporaries and no need
# to store 24 FP regs on the stack.
10
# ints
# Pointer arguments, held in GPRs r3..r6.
.set out, 3
.set buf, 4
.set in, 5
.set win, 6

# Scratch index alias; no use of it is visible in this file.
.set i, 7
# GPR that carries the predicate mask for the m=pred sv.* instruction.
.set pred, %r30

# floats
# Register numbers used as vector bases by the sv.* FP instructions:
# vin is where in[0] is loaded, vin1 = vin + 1, vin2 = vin + 3.
.set vin, 8
.set vin1, 9
.set vin2, 11
24
# Assembler setup: Libre-SOC machine, ABI version 2.
    .machine libresoc
    .text
    .abiversion 2
    .file "imdct36_standalone.c"

# 4-byte single-precision constants, stored as raw IEEE-754 bit patterns
# in a mergeable read-only section.
    .section .rodata.cst4,"aM",@progbits,4
    .p2align 2                          # -- Begin function imdct36
.LC_zero:
    .long 0x00000000                    # float 0
.LC_2_0:
    .long 0x40000000                    # float 2
.LC_0_5:
    .long 0x3f000000                    # float 0.5
.LCPI0_2:
    .long 0x3f708fb2                    # float 0.939692616
.LCPI0_3:
    .long 0xbe31d0d4                    # float -0.173648179
.LCPI0_4:
    .long 0xbf441b7d                    # float -0.766044437
.LCPI0_5:
    .long 0xbf5db3d7                    # float -0.866025388
.LCPI0_6:
    .long 0x3f7c1c5c                    # float 0.984807729
.LCPI0_7:
    .long 0xbeaf1d44                    # float -0.342020154
.LCPI0_8:
    .long 0x3f5db3d7                    # float 0.866025388
.LCPI0_9:
    .long 0xbf248dbb                    # float -0.642787635
# ---------------------------------------------------------------------
# imdct36 -- work-in-progress SVP64 version of the first imdct36 stages
#
# ABI:      .abiversion 2; r2 is set up as the TOC pointer from r12 at
#           the global entry point.
# In:       r3 = out, r4 = buf, r5 = in, r6 = win (see the .set aliases)
# Effect:   loads 18 floats from (in) into f8.., runs the two in-place
#           accumulation passes, stores 18 floats to (out), computes
#           t1..t5 for j = 0,1 and finally stores f32..f49 to (out).
# Clobbers: r9, f8-f26, f29 and the t1..t5 registers in f32-f45.
#           r30 is saved on entry and restored before returning.
#           FP registers are NOT saved/restored (f14-f31 are
#           callee-saved in the ELFv2 ABI) -- known limitation, see the
#           note at the top of the file.
# Stack:    stores below r1 (offsets -16 .. -48) without moving r1.
# ---------------------------------------------------------------------
    .text
    .globl imdct36
    .p2align 4
    .type imdct36,@function
imdct36:                                # @imdct36
.Lfunc_begin0:
.Lfunc_gep0:
    addis 2, 12, .TOC.-.Lfunc_gep0@ha
    addi 2, 2, .TOC.-.Lfunc_gep0@l
.Lfunc_lep0:
    .localentry imdct36, .Lfunc_lep0-.Lfunc_gep0
# %bb.0:
    std 30, -16(1)                      # 8-byte Folded Spill
    # The four pointer arguments are parked below the stack pointer;
    # nothing in this function reads them back.
    std 3, -24(1)
    std 4, -32(1)
    std 5, -40(1)
    std 6, -48(1)

.loop1:
    setvl 0,0,18,0,1,1 # Set VL to 18 elements
    # Load 18 floats from (in)
    sv.lfs *vin, 0(in)
    # equivalent to: for (i = 17; i >= 1; i--) in[i] += in[i-1];
    # NOTE(review): with VL=18 the highest element also writes f26, one
    # register past in[17] (f25) -- confirm whether VL=17 was intended.
    sv.fadds/mrr *vin1, *vin1, *vin
    # SETVL to 16 as the next loop is from 1-17 floats to (out)
    setvl 0,0,16,0,1,1
    li 30, 0
    ori 30, 30, 0xaaaa # Predicate mask 0b1010101010101010
    # equivalent to: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2];
    # NOTE(review): assuming predicate bit n enables element n, mask
    # 0xaaaa selects destinations f12, f14, .. f26 (in[4], in[6], ..),
    # not the odd in[] indices of the C loop -- confirm the mask/bases.
    sv.fadds/mrr/m=pred *vin2, *vin2, *vin1
    # Use SETVL again as we want to store 18 floats to (out)
    setvl 0,0,18,0,1,1
    sv.stfs *vin, 0(out)

    # Load the 2.0f constant into f29; it is the divisor used by the
    # SHR() steps below. This used to be a commented-out
    # "fmvis 29, 0x4000", which left f29 uninitialised. The value is
    # read from .LC_2_0 through the TOC instead; r9 is scratch.
    addis 9, 2, .LC_2_0@toc@ha
    lfs 29, .LC_2_0@toc@l(9)

    # Use SETVL 2 for the next loop and calculate first the temporary
    # variables t1..t5, equivalent to:
    # for (j = 0; j < 2; j++) {
    #     in1 = in + j;
    #     t1 = in1[2*0] - in1[2*6];
    #     t2 = in1[2*4] + in1[2*8] - in1[2*2];
    #     t3 = in1[2*0] + SHR(in1[2*6],1);
    #     t4 = t1 - SHR(t2, 1);
    #     t5 = t1 + t2;
    # }
    # t1 -> f32-f34
    # t2 -> f35-f37
    # t3 -> f38-f40
    # t4 -> f41-f43
    # t5 -> f44-f46
    # (with VL=2 only the first two registers of each group are written)
    # The values of the 'in' array are already in registers f8-f25.
    setvl 0,0,2,0,1,1
    # t1
    sv.fsubs *32, *8, *20
    # t2
    sv.fadds *35, *16, *24
    sv.fsubs *35, *35, *12
    # t3, SHR(a,1) = a * 1.0f/(1 << 1) = a / 2, i.e. divide by f29 (2.0)
    sv.fdivs *38, *20, 29
    sv.fadds *38, *38, *8
    # t4, SHR(t2,1) is again a divide by f29 (2.0)
    sv.fdivs *41, *35, 29
    sv.fsubs *41, *32, *41
    # t5
    sv.fadds *44, *32, *35

    # Use SETVL again as we want to store 18 floats (f32..f49) to (out)
    setvl 0,0,18,0,1,1
    sv.stfs *32, 0(3)
    # r30 is callee-saved and was overwritten with the predicate mask:
    # reload the value spilled on entry before returning.
    ld 30, -16(1)                       # 8-byte Folded Reload
    blr
    .long 0
    .quad 0
.Lfunc_end0:
    .size imdct36, .Lfunc_end0-.Lfunc_begin0
                                        # -- End function
# icos36h: table of nine single-precision values (36 bytes), stored as
# raw IEEE-754 bit patterns; the last entry is zero.
    .type icos36h,@object               # @icos36h
    .section .rodata,"a",@progbits
    .p2align 2
icos36h:
    .long 0x3e807d2b                    # float 0.250954956
    .long 0x3e8483ee                    # float 0.258819044
    .long 0x3e8d3b7d                    # float 0.275844485
    .long 0x3e9c4257                    # float 0.305193633
    .long 0x3eb504f3                    # float 0.353553385
    .long 0x3edf2944                    # float 0.435861707
    .long 0x3e976fd9                    # float 0.295775205
    .long 0x3ef746ea                    # float 0.482962906
    .long 0x00000000                    # float 0
    .size icos36h, 36
144
# icos36: table of nine single-precision values (36 bytes), stored as
# raw IEEE-754 bit patterns. Emitted into whichever section is current
# at this point (no .section directive of its own).
    .type icos36,@object                # @icos36
    .p2align 2
icos36:
    .long 0x3f007d2b                    # float 0.501909912
    .long 0x3f0483ee                    # float 0.517638087
    .long 0x3f0d3b7d                    # float 0.551688969
    .long 0x3f1c4257                    # float 0.610387265
    .long 0x3f3504f3                    # float 0.707106769
    .long 0x3f5f2944                    # float 0.871723413
    .long 0x3f976fd9                    # float 1.18310082
    .long 0x3ff746ea                    # float 1.93185163
    .long 0x40b79454                    # float 5.73685646
    .size icos36, 36
158
159
160 .ident "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"
161 .section ".note.GNU-stack","",@progbits
162 # .addrsig
163 # .addrsig_sym imdct36
164 # .addrsig_sym icos36h
165 # .addrsig_sym icos36