1 # # ffmpeg lgpl 2.1 or later
3 # some instructions could be saved by using fmac (sv.fmadds, sv.fnmsubs)
4 # but the accuracy is so high it produces different results. this
5 # demo therefore uses fmuls followed by fmsub/fmadd in map-reduce mode
6 # also note, the FP registers are overwritten, not saved on stack yet.
7 # at some point 128 registers will be available, meaning that an EABI
8 # will be defined where there will be plenty of temporaries and no need
9 # to store 24 FP regs on the stack.
28 .file "imdct36_standalone.c"
# Literal pool for imdct36: single-precision constants, one 32-bit word each.
# Section flags "aM" with entity size 4 = allocatable, mergeable 4-byte
# constants; .p2align 2 aligns the pool to 4 bytes.
# The per-constant labels are not visible in this listing, so which
# instruction loads which word cannot be determined from here.
# Apart from 2.0 and 0.5, the magnitudes numerically match sin/cos of
# multiples of 10 degrees (0.9397 = cos 20, 0.1736 = sin 10, 0.7660 = cos 40,
# 0.8660 = cos 30, 0.9848 = cos 10, 0.3420 = sin 20, 0.6428 = sin 40).
29 .section .rodata.cst4,"aM",@progbits,4
30 .p2align 2 # -- Begin function imdct36
34 .long 0x40000000 # float 2
36 .long 1056964608 # float 0.5
38 .long 1064341426 # float 0.939692616
40 .long 3190935764 # float -0.173648179
42 .long 3208911741 # float -0.766044437
44 .long 3210589143 # float -0.866025388
46 .long 1065098332 # float 0.984807729
48 .long 3199147332 # float -0.342020154
50 .long 1063105495 # float 0.866025388
52 .long 3206843835 # float -0.642787635
# imdct36 -- SVP64 (vectorised PowerPC) version of the 36-point IMDCT.
# Only part of the routine is visible in this listing: the prefix-add passes
# over the input vector and the computation of the temporaries t1..t4.
# vin, vin1, vin2 and pred are symbolic register names defined outside this
# view. NOTE(review): from the loop equivalences quoted below, vin1/vin2
# presumably alias the input vector shifted by one/two elements -- confirm
# against their definitions.
# Clobbers r30 (spilled below; the matching reload is not visible here) and,
# per the file header, FP registers that are not saved on the stack.
56 .type imdct36,@function
# Global entry point: r2 = r12 + (.TOC. - .Lfunc_gep0), i.e. derive the TOC
# pointer from the function address passed in r12.
60 addis 2, 12, .TOC.-.Lfunc_gep0@ha
61 addi 2, 2, .TOC.-.Lfunc_gep0@l
63 .localentry imdct36, .Lfunc_lep0-.Lfunc_gep0
# r30 is stored at -16(r1) before it is modified below.
65 std 30, -16(1) # 8-byte Folded Spill
72 setvl 0,0,18,0,1,1 # Set VL to 18 elements
73 # Load 18 floats from (in)
75 # equivalent to: for (i = 17; i >= 1; i--) in[i] += in[i-1];
76 sv.fadds/mrr *vin1, *vin1, *vin
77 # SETVL to 16 as the next loop is from 1-17 floats to (out)
# 0xaaaa sets bits 1,3,...,15 (counting from the LSB), selecting every other
# element for the predicated add below.
# NOTE(review): ori ORs the mask into r30's existing contents; no instruction
# clearing r30 first is visible in this listing -- confirm r30 is zeroed (or
# that only the low 16 bits matter) before this point. Also confirm that
# 'pred' refers to r30.
80 ori 30, 30, 0xaaaa # Predicate mask 0b1010101010101010
81 # equivalent to: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2];
82 sv.fadds/mrr/m=pred *vin2, *vin2, *vin1
83 # Use SETVL again as we want to store 18 floats to (out)
87 # Load 2.0f constant in register 29, will be needed for SHR macro
90 # Use SETVL 2 for the next loop and calculate first the temporary variables, t1,t2,t3,t4
92 # for (j = 0; j < 2; j++) {
94 # t1 = in1[2*0] - in1[2*6];
95 # t2 = in1[2*4] + in1[2*8] - in1[2*2];
96 # t3 = in1[2*0] + SHR(in1[2*6],1);
97 # t4 = t1 - SHR(t2, 1);
105 # Similarly, the values of 'in' array are already in registers 8-26
# Register mapping implied by the instructions below: FP reg (8 + 2k) holds
# in1[2k], so *8 = in1[2*0], *12 = in1[2*2], *16 = in1[2*4], *20 = in1[2*6],
# *24 = in1[2*8]. Results: *32 = t1, *35 = t2, *38 = t3, *41 = t4.
# NOTE(review): the setvl that establishes VL for this section is not visible
# in this listing; the comment above says VL = 2 (one lane per j).
108 sv.fsubs *32, *8, *20 # t1 = in1[2*0] - in1[2*6]
110 sv.fadds *35, *16, *24 # in1[2*4] + in1[2*8]
111 sv.fsubs *35, *35, *12 # t2 = (in1[2*4] + in1[2*8]) - in1[2*2]
112 # t3, SHR(a,1) = a * 1.0f/(1 << (1)) = a / 2 essentially fdiv a, a, 2.0
113 sv.fdivs *38, *20, 29 # in1[2*6] / 2.0 (scalar FP reg 29)
114 sv.fadds *38, *38, *8 # t3 = in1[2*6]/2 + in1[2*0]
115 # t4, essentially fdiv *41, *35, 29
116 sv.fdivs *41, *35, 29 # t2 / 2.0
117 sv.fsubs *41, *32, *41 # t4 = t1 - t2/2
# *44 = *32 + *35, i.e. t1 + t2 under the mapping above; where this value is
# stored is not visible in this listing.
119 sv.fadds *44, *32, *35
121 # Use SETVL again as we want to store 18 floats to (out)
128 .size imdct36, .Lfunc_end0-.Lfunc_begin0
# icos36h: read-only table of single-precision words in .rodata (eight entries
# are visible in this listing; the label and .size lines are not).
# Comparing the annotated values with the icos36 table in this file:
# entries 0-5 are icos36[i] / 2, entries 6-7 are icos36[i] / 4
# (e.g. 0.250954956 = 0.501909912 / 2, 0.295775205 = 1.18310082 / 4).
130 .type icos36h,@object # @icos36h
131 .section .rodata,"a",@progbits
134 .long 1048608043 # float 0.250954956
135 .long 1048871918 # float 0.258819044
136 .long 1049443197 # float 0.275844485
137 .long 1050427991 # float 0.305193633
138 .long 1052050675 # float 0.353553385
139 .long 1054812484 # float 0.435861707
140 .long 1050111961 # float 0.295775205
141 .long 1056392938 # float 0.482962906
# icos36: table of nine single-precision words (the label and .size lines are
# not visible in this listing).
# Numerically the entries match 0.5 / cos(pi * (2*i + 1) / 36) for i = 0..8
# (e.g. i = 4 gives 0.5 / cos(45 deg) = 0.707106769).
145 .type icos36,@object # @icos36
148 .long 1056996651 # float 0.501909912
149 .long 1057260526 # float 0.517638087
150 .long 1057831805 # float 0.551688969
151 .long 1058816599 # float 0.610387265
152 .long 1060439283 # float 0.707106769
153 .long 1063201092 # float 0.871723413
154 .long 1066889177 # float 1.18310082
155 .long 1073170154 # float 1.93185163
156 .long 1085772884 # float 5.73685646
160 .ident "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"
161 .section ".note.GNU-stack","",@progbits
163 # .addrsig_sym imdct36
164 # .addrsig_sym icos36h
165 # .addrsig_sym icos36