-# ffmpeg lgpl 2.1 or later
+# ffmpeg lgpl 2.1 or later
+#
+# Some instructions could be saved by using fmac (sv.fmadds, sv.fnmsubs),
+# but the accuracy is so high that it produces different results. This
+# demo therefore uses fmuls followed by fmsub/fmadd in map-reduce mode.
+# Also note that the FP registers are overwritten, not saved on the stack yet.
+# At some point 128 registers will be available, meaning that an EABI
+# will be defined where there will be plenty of temporaries and no need
+# to store 24 FP regs on the stack.
- .file "mpegaudiodsp_float.c"
- .machine power9
+# Symbolic register numbers used by imdct36.
+# ints
+# out/buf/in/win are GPR numbers r3-r6 (presumably the four pointer
+# arguments -- confirm against the C prototype). In the visible code only
+# `in` (sv.lfs base) and `out` (sv.stfs base) are referenced.
+.set out, 3
+.set buf, 4
+.set in, 5
+.set win, 6
+
+# i is not referenced by any instruction in the visible code.
+.set i, 7
+# NOTE: despite the "ints" heading, vin/vin1/vin2 are used as FP register
+# numbers: they are the vector base operands of sv.lfs / sv.fadds / sv.stfs.
+.set vin, 8
+.set vin1, 9
+.set vin2, 11
+# pred is the GPR named as predicate mask in sv.fadds/mrr/m=pred.
+.set pred, 30
+
+# floats
+
+ .machine libresoc
+ .text
.abiversion 2
- .section ".text"
- .section ".toc","aw"
- .align 3
-.LCTOC0:
- .tc .LCTOC1[TC],.LCTOC1
- .section ".toc1","aw"
- .align 3
-.LCTOC1 = .+32768
-.LC0:
- .quad 0x3f000000
-.LC1:
- .quad 0x3f708fb2
-.LC2:
- .quad 0xbe31d0d4
-.LC3:
- .quad 0xbf441b7d
-.LC4:
- .quad 0xbf5db3d7
-.LC5:
- .quad 0x3f7c1c5c
-.LC6:
- .quad 0xbeaf1d44
-.LC7:
- .quad 0x3f5db3d7
-.LC8:
- .quad 0xbf248dbb
-.LC9:
- .quad 0x3f007d2b
-.LC10:
- .quad 0x40b79454
-.LC11:
- .quad 0x3f0483ee
-.LC12:
- .quad 0x3ff746ea
-.LC13:
- .quad 0x3f0d3b7d
-.LC14:
- .quad 0x3f976fd9
-.LC15:
- .quad 0x3f1c4257
-.LC16:
- .quad 0x3f5f2944
-.LC17:
- .quad 0x3f3504f3
- .section ".text"
- .align 2
- .p2align 4,,15
- .globl imdct36
- .type imdct36, @function
-imdct36:
-.LCF0:
-0: addis 2,12,.TOC.-.LCF0@ha
- addi 2,2,.TOC.-.LCF0@l
- .localentry imdct36,.-imdct36
- stfd 15,-136(1)
- stfd 16,-128(1)
- stfd 17,-120(1)
- stfd 18,-112(1)
- stfd 19,-104(1)
- stfd 20,-96(1)
- stfd 21,-88(1)
- stfd 22,-80(1)
- stfd 23,-72(1)
- stfd 24,-64(1)
- stfd 25,-56(1)
- stfd 26,-48(1)
- stfd 27,-40(1)
- stfd 28,-32(1)
- stfd 29,-24(1)
- stfd 30,-16(1)
- stfd 31,-8(1)
- std 30,-152(1)
- lfs 11,60(5)
- lfs 12,52(5)
- lfs 27,48(5)
- lfs 1,40(5)
- lfs 13,32(5)
- lfs 2,24(5)
- lfs 19,64(5)
- lfs 30,56(5)
- lfs 29,44(5)
- lfs 3,36(5)
- lfs 6,28(5)
- lfs 8,20(5)
- lfs 31,16(5)
- lfs 10,68(5)
- lfs 5,12(5)
- lfs 9,4(5)
- lfs 4,0(5)
- lfs 18,8(5)
- fadds 28,12,27
- ld 30,.LCTOC0@toc(2)
- fadds 7,11,30
- fadds 0,29,1
- fadds 30,30,12
- fadds 29,27,29
- fadds 12,6,2
- fadds 6,13,6
- fadds 16,8,31
- fadds 8,2,8
- fadds 10,19,10
- fadds 19,19,11
- fadds 11,3,13
- fadds 31,31,5
- fadds 24,4,9
- fadds 3,1,3
- lfs 1,.LC0-.LCTOC1(30)
- fadds 17,5,18
- fadds 18,18,9
- lfs 27,.LC5-.LCTOC1(30)
- stfs 6,32(5)
- lfs 5,.LC6-.LCTOC1(30)
- stfs 29,48(5)
- lfs 13,.LC1-.LCTOC1(30)
- stfs 30,56(5)
- stfs 8,24(5)
- fadds 10,10,7
- fadds 7,7,28
- fadds 28,28,0
- fadds 0,0,11
- fadds 11,11,12
- fadds 12,12,16
- fadds 2,6,31
- fsubs 23,6,19
- fadds 16,16,17
- fadds 17,24,17
- fadds 6,19,6
- fsubs 26,3,30
- fadds 21,3,18
- stfs 19,64(5)
- stfs 31,16(5)
- stfs 24,4(5)
- stfs 3,40(5)
- fadds 3,30,3
- stfs 18,8(5)
- fadds 30,30,18
- fsubs 9,0,7
- stfs 28,52(5)
- stfs 10,68(5)
- stfs 0,44(5)
- fsubs 22,11,10
- fmuls 2,2,13
- stfs 11,36(5)
- stfs 12,28(5)
- fadds 20,17,0
- fadds 25,11,16
- fsubs 6,6,31
- fadds 31,19,31
- fmuls 19,28,1
- fmuls 21,21,27
- fsubs 28,24,28
- fadds 11,10,11
- fsubs 3,3,18
- stfs 16,20(5)
- fadds 10,10,16
- fadds 0,7,0
- fmuls 9,9,5
- fmuls 5,26,5
- lfs 26,.LC2-.LCTOC1(30)
- lfs 15,.LC9-.LCTOC1(30)
- stfs 7,60(5)
- stfs 17,12(5)
- fmuls 20,20,27
- lfs 27,.LC7-.LCTOC1(30)
- fmuls 13,25,13
- fadds 24,19,24
- fmuls 25,29,1
- fsubs 29,4,29
- fsubs 11,11,16
- lfs 16,.LC10-.LCTOC1(30)
- fadds 7,17,7
- fsubs 0,0,17
- fadds 18,21,5
- fmuls 22,22,26
- fmuls 23,23,26
- fadds 19,20,9
- fmuls 12,12,27
- fadds 25,25,4
- fadds 4,24,13
- fmuls 8,8,27
- fsubs 13,24,13
- fadds 19,19,12
- fadds 4,4,22
- fadds 26,25,2
- fadds 18,18,8
- fsubs 2,25,2
- fadds 27,4,19
- fsubs 4,4,19
- fmuls 19,11,1
- fmuls 1,6,1
- fadds 26,26,23
- fadds 6,6,29
- fadds 11,11,28
- fsubs 1,29,1
- fmuls 4,4,16
- lfs 29,.LC8-.LCTOC1(30)
- lfs 16,.LC4-.LCTOC1(30)
- fadds 17,26,18
- fmuls 27,27,15
- fsubs 26,26,18
- fsubs 28,28,19
- lfs 18,.LC3-.LCTOC1(30)
- fmuls 30,30,29
- fmuls 7,7,29
- lfs 29,36(6)
- fmuls 0,0,16
- fsubs 19,17,27
- fmuls 3,3,16
- fadds 27,27,17
- fmuls 31,31,18
- fmuls 10,10,18
- fsubs 17,26,4
- fadds 4,4,26
- fsubs 18,5,30
- fsubs 5,30,5
- fadds 15,28,0
- fsubs 28,28,0
- lfs 0,144(4)
- fmuls 16,19,29
- fadds 26,1,3
- fadds 29,21,30
- fsubs 2,2,31
- fsubs 3,1,3
- fadds 31,25,31
- fsubs 1,9,7
- fsubs 9,7,9
- fsubs 13,13,10
- fadds 24,24,10
- fadds 7,20,7
- fsubs 21,18,8
- fadds 16,16,0
- fsubs 10,29,8
- fadds 0,5,8
- fsubs 8,8,29
- fsubs 23,31,23
- fadds 30,9,12
- fsubs 1,1,12
- fsubs 22,24,22
- fsubs 9,7,12
- fsubs 12,12,7
- fadds 21,21,2
- lfs 5,.LC12-.LCTOC1(30)
- stfs 16,1152(3)
- fadds 29,0,2
- fadds 1,1,13
- fadds 30,30,13
- fadds 13,10,23
- fadds 8,8,23
- fadds 7,9,22
- fadds 12,12,22
- fmuls 28,28,5
- lfs 10,32(6)
- lfs 0,128(4)
- fmuls 19,19,10
- fadds 19,19,0
- stfs 19,1024(3)
- lfs 0,116(6)
- lfs 10,272(4)
- fmuls 0,0,27
- stfs 0,144(4)
- lfs 0,112(6)
- fmuls 27,0,27
- stfs 27,128(4)
- lfs 9,68(6)
- fmuls 9,17,9
- fadds 9,9,10
- stfs 9,2176(3)
- lfs 9,.LC11-.LCTOC1(30)
- lfs 10,0(6)
- lfs 0,0(4)
- fmuls 9,15,9
- fmuls 17,17,10
- fsubs 2,26,9
- fadds 9,9,26
- lfs 10,.LC13-.LCTOC1(30)
- fadds 17,17,0
- fmuls 10,1,10
- stfs 17,0(3)
- fsubs 1,21,10
- fadds 10,10,21
- lfs 0,148(6)
- lfs 27,160(4)
- fmuls 0,0,4
- stfs 0,272(4)
- lfs 0,80(6)
- fmuls 4,0,4
- lfs 0,.LC14-.LCTOC1(30)
- stfs 4,0(4)
- fsubs 4,3,28
- fadds 3,28,3
- fmuls 30,30,0
- lfs 31,40(6)
- fsubs 5,29,30
- fadds 0,30,29
- fmuls 31,2,31
- fadds 31,31,27
- stfs 31,1280(3)
- lfs 30,28(6)
- lfs 31,112(4)
- fmuls 2,2,30
- fadds 2,2,31
- stfs 2,896(3)
- lfs 31,120(6)
- lfs 30,256(4)
- fmuls 31,31,9
- stfs 31,160(4)
- lfs 31,108(6)
- fmuls 9,31,9
- stfs 9,112(4)
- lfs 2,64(6)
- fmuls 2,4,2
- fadds 2,2,30
- stfs 2,2048(3)
- lfs 2,4(6)
- lfs 9,16(4)
- fmuls 4,4,2
- fadds 4,4,9
- stfs 4,128(3)
- lfs 9,144(6)
- lfs 2,176(4)
- fmuls 9,9,3
- stfs 9,256(4)
- lfs 9,84(6)
- fmuls 3,9,3
- stfs 3,16(4)
- lfs 4,44(6)
- fmuls 4,1,4
- fadds 4,4,2
- lfs 2,.LC15-.LCTOC1(30)
- stfs 4,1408(3)
- lfs 9,24(6)
- lfs 4,96(4)
- fmuls 9,1,9
- fadds 9,9,4
- lfs 4,.LC16-.LCTOC1(30)
- stfs 9,768(3)
- fmuls 12,12,4
- lfs 9,124(6)
- lfs 3,240(4)
- fsubs 4,8,12
- fadds 8,8,12
- fmuls 9,9,10
- stfs 9,176(4)
- lfs 9,104(6)
- fmuls 9,9,10
- fmuls 10,7,2
- stfs 9,96(4)
- fsubs 9,13,10
- fadds 10,13,10
- lfs 7,60(6)
- fmuls 7,5,7
- fadds 7,7,3
- stfs 7,1920(3)
- lfs 7,8(6)
- lfs 12,32(4)
- fmuls 5,5,7
- fadds 5,5,12
- stfs 5,256(3)
- lfs 12,140(6)
- lfs 5,192(4)
- fmuls 12,12,0
- stfs 12,240(4)
- lfs 12,88(6)
- fmuls 0,12,0
- stfs 0,32(4)
- lfs 7,48(6)
- fmuls 7,9,7
- fadds 7,7,5
- stfs 7,1536(3)
- lfs 12,20(6)
- lfs 0,80(4)
- fmuls 9,9,12
- fadds 9,9,0
- stfs 9,640(3)
- lfs 0,128(6)
- lfs 12,224(4)
- fmuls 0,0,10
- stfs 0,192(4)
- lfs 0,100(6)
- fmuls 10,0,10
- stfs 10,80(4)
- lfs 10,56(6)
- fmuls 10,4,10
- fadds 10,10,12
- stfs 10,1792(3)
- lfs 0,12(6)
- lfs 12,48(4)
- fmuls 0,4,0
- fadds 0,0,12
- lfs 12,.LC17-.LCTOC1(30)
- stfs 0,384(3)
- fmuls 11,11,12
- lfs 0,136(6)
- lfs 10,208(4)
- fmuls 0,0,8
- stfs 0,224(4)
- lfs 0,92(6)
- fmuls 8,0,8
- fsubs 0,6,11
- fadds 6,11,6
- stfs 8,48(4)
- lfs 12,52(6)
- fmuls 12,0,12
- fadds 12,12,10
- stfs 12,1664(3)
- lfs 11,16(6)
- lfs 12,64(4)
- fmuls 0,0,11
- fadds 0,0,12
- stfs 0,512(3)
- lfs 0,132(6)
- fmuls 0,0,6
- stfs 0,208(4)
- lfs 0,96(6)
- fmuls 6,0,6
- stfs 6,64(4)
- ld 30,-152(1)
- lfd 15,-136(1)
- lfd 16,-128(1)
- lfd 17,-120(1)
- lfd 18,-112(1)
- lfd 19,-104(1)
- lfd 20,-96(1)
- lfd 21,-88(1)
- lfd 22,-80(1)
- lfd 23,-72(1)
- lfd 24,-64(1)
- lfd 25,-56(1)
- lfd 26,-48(1)
- lfd 27,-40(1)
- lfd 28,-32(1)
- lfd 29,-24(1)
- lfd 30,-16(1)
- lfd 31,-8(1)
+ .file "imdct36_standalone.c"
+# Literal pool: 4-byte constants in a mergeable read-only section
+# (flags "aM", entity size 4). Each .long is the raw bit pattern of the
+# single-precision value given in its trailing comment; some are written
+# in hex and some in decimal.
+ .section .rodata.cst4,"aM",@progbits,4
+ .p2align 2 # -- Begin function imdct36
+.LC_zero:
+ .long 0 # float 0
+.LC_2_0:
+ .long 0x40000000 # float 2
+.LC_0_5:
+ .long 1056964608 # float 0.5
+# .LCPI0_2 .. .LCPI0_9: eight further single-precision constants
+# (values per the trailing comments).
+.LCPI0_2:
+ .long 1064341426 # float 0.939692616
+.LCPI0_3:
+ .long 3190935764 # float -0.173648179
+.LCPI0_4:
+ .long 3208911741 # float -0.766044437
+.LCPI0_5:
+ .long 3210589143 # float -0.866025388
+.LCPI0_6:
+ .long 1065098332 # float 0.984807729
+.LCPI0_7:
+ .long 3199147332 # float -0.342020154
+.LCPI0_8:
+ .long 1063105495 # float 0.866025388
+.LCPI0_9:
+ .long 3206843835 # float -0.642787635
+ .text
+ .globl imdct36
+ .p2align 4
+ .type imdct36,@function
+# imdct36 -- work-in-progress SVP64 port of the first stages of the
+# 36-point IMDCT.
+#
+# ABI:      ELFv2 (.abiversion 2), .machine libresoc
+# In:       r3 = out, r4 = buf, r5 = in, r6 = win (see the .set aliases);
+#           only `in` is read and only `out` is written so far
+# Clobbers: r10 (scratch), VL (setvl), FP registers 8-26, 29 and the
+#           temporaries in 32-45; FP registers are NOT saved (see the
+#           note in the file header). r30 is saved and restored.
+imdct36: # @imdct36
+.Lfunc_begin0:
+.Lfunc_gep0:
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+ .localentry imdct36, .Lfunc_lep0-.Lfunc_gep0
+# %bb.0:
+ std 30, -16(1) # 8-byte Folded Spill (r30 is reused as predicate mask)
+ # The four argument registers are parked below the stack pointer;
+ # nothing in this function reloads them yet.
+ std 3, -24(1)
+ std 4, -32(1)
+ std 5, -40(1)
+ std 6, -48(1)
+
+.loop1:
+ setvl 0,0,18,0,1,1 # Set VL to 18 elements
+ # Load 18 floats from (in) into FP registers 8-25
+ sv.lfs *vin, 0(in)
+ # intended equivalent: for (i = 17; i >= 1; i--) in[i] += in[i-1];
+ # NOTE(review): with VL=18 the reverse-order add also performs
+ # f26 += f25, one element past the 18 loaded values. f26 is never
+ # stored, but VL=17 looks like what the C loop needs -- confirm.
+ sv.fadds/mrr *vin1, *vin1, *vin
+ # Set VL to 16 for the predicated stride-2 accumulation
+ setvl 0,0,16,0,1,1
+ li 30, 0
+ ori 30, 30, 0xaaaa # Predicate mask 0b1010101010101010 in r30 (= pred)
+ # intended equivalent: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2];
+ # NOTE(review): if predicate bit n selects element n, mask 0xaaaa with
+ # base vin2 (f11) updates f12,f14,...,f26, i.e. in[4],in[6],..., not
+ # the odd indices of the C loop (0x5555 would select f11,f13,...,f25).
+ # Verify against the simulator before relying on this stage.
+ sv.fadds/mrr/m=pred *vin2, *vin2, *vin1
+ # Use SETVL again as we want to store 18 floats to (out)
+ setvl 0,0,18,0,1,1
+ sv.stfs *vin, 0(out)
+
+ # Load the 2.0f constant into FP register 29; it is the divisor of the
+ # SHR(x,1) divisions below. It is read from the .LC_2_0 literal through
+ # the TOC pointer in r2 (established at the global entry point above),
+ # using volatile r10 as scratch. These are plain scalar instructions.
+ # (single-instruction alternative once the assembler accepts it:
+ #  fmvis 29, 0x4000)
+ addis 10, 2, .LC_2_0@toc@ha
+ lfs 29, .LC_2_0@toc@l(10)
+
+ # Use SETVL 2 for the next loop and calculate first the temporary variables, t1..t5
+ # equivalent to:
+ # for (j = 0; j < 2; j++) {
+ # in1 = in + j;
+ # t1 = in1[2*0] - in1[2*6];
+ # t2 = in1[2*4] + in1[2*8] - in1[2*2];
+ # t3 = in1[2*0] + SHR(in1[2*6],1);
+ # t4 = t1 - SHR(t2, 1);
+ # t5 = t1 + t2;
+ # }
+ # Each temporary has a 3-register slot; with VL=2 only the first two
+ # registers of each slot are written:
+ # t1 -> r32-r34
+ # t2 -> r35-r37
+ # t3 -> r38-r40
+ # t4 -> r41-r43
+ # t5 -> r44-r46
+ # The values of the 'in' array are already in FP registers 8-25,
+ # so in[k] is register 8+k (e.g. in1[2*6] is 20.v)
+ setvl 0,0,2,0,1,1
+ # t1
+ sv.fsubs 32.v, 8.v, 20.v
+ # t2
+ sv.fadds 35.v, 16.v, 24.v
+ sv.fsubs 35.v, 35.v, 12.v
+ # t3, SHR(a,b) = a * 1.0f/(1 << (1)) = a / 2 essentially fdiv a, a, 2.0
+ sv.fdivs 38.v, 20.v, 29
+ sv.fadds 38.v, 38.v, 8.v
+ # t4, essentially fdiv 41.v, 35.v, 29
+ sv.fdivs 41.v, 35.v, 29
+ sv.fsubs 41.v, 32.v, 41.v
+ # t5
+ sv.fadds 44.v, 32.v, 35.v
+
+ # Use SETVL again as we want to store 18 floats to (out)
+ # NOTE(review): this stores FP registers 32-49 over the data written by
+ # the earlier sv.stfs; 34, 37, 40, 43 and 46-49 are not written above.
+ # Presumably temporary output for inspecting t1..t5 -- confirm.
+ setvl 0,0,18,0,1,1
+ sv.stfs 32.v, 0(3)
+ ld 30, -16(1) # restore callee-saved r30 spilled at entry
 blr
- .long 0
- .byte 0,0,2,0,17,2,0,0
- .size imdct36,.-imdct36
- .ident "GCC: (GNU) 10.3.0"
- .section .note.GNU-stack,"",@progbits
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size imdct36, .Lfunc_end0-.Lfunc_begin0
+ # -- End function
+# icos36h: nine 32-bit words (36 bytes) of single-precision data, stored
+# as raw bit patterns. Going by the trailing comments, entries 0-5 are
+# half and entries 6-7 a quarter of the matching icos36 entries, and
+# entry 8 is 0. No instruction in the visible imdct36 body references
+# this table yet.
+ .type icos36h,@object # @icos36h
+ .section .rodata,"a",@progbits
+ .p2align 2
+icos36h:
+ .long 1048608043 # float 0.250954956
+ .long 1048871918 # float 0.258819044
+ .long 1049443197 # float 0.275844485
+ .long 1050427991 # float 0.305193633
+ .long 1052050675 # float 0.353553385
+ .long 1054812484 # float 0.435861707
+ .long 1050111961 # float 0.295775205
+ .long 1056392938 # float 0.482962906
+ .long 0 # float 0
+ .size icos36h, 36
+
+# icos36: nine 32-bit words (36 bytes) of single-precision data, stored
+# as raw bit patterns (values per the trailing comments). Also not
+# referenced by the visible imdct36 body yet.
+ .type icos36,@object # @icos36
+ .p2align 2
+icos36:
+ .long 1056996651 # float 0.501909912
+ .long 1057260526 # float 0.517638087
+ .long 1057831805 # float 0.551688969
+ .long 1058816599 # float 0.610387265
+ .long 1060439283 # float 0.707106769
+ .long 1063201092 # float 0.871723413
+ .long 1066889177 # float 1.18310082
+ .long 1073170154 # float 1.93185163
+ .long 1085772884 # float 5.73685646
+ .size icos36, 36
+
+
+ .ident "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"
+ .section ".note.GNU-stack","",@progbits
+# .addrsig
+# .addrsig_sym imdct36
+# .addrsig_sym icos36h
+# .addrsig_sym icos36