From 7be5c9291465ad4594dbd16d94980b94f631023b Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 8 Aug 2022 20:52:49 +0000 Subject: [PATCH] WIP: SVP64 version --- media/audio/mp3/mp3_1_imdct36_float.s | 604 +++++++------------------- 1 file changed, 162 insertions(+), 442 deletions(-) diff --git a/media/audio/mp3/mp3_1_imdct36_float.s b/media/audio/mp3/mp3_1_imdct36_float.s index bf1dac5c..8fff01d0 100644 --- a/media/audio/mp3/mp3_1_imdct36_float.s +++ b/media/audio/mp3/mp3_1_imdct36_float.s @@ -1,445 +1,165 @@ -# ffmpeg lgpl 2.1 or later +# # ffmpeg lgpl 2.1 or later +# +# some instructions could be saved by using fmac (sv.fmadds, sv.fnmsubs) +# but a fused multiply-add rounds only once, so its results differ from the reference. this +# demo therefore uses fmuls followed by fsubs/fadds in map-reduce mode. +# also note, the FP registers are overwritten, not saved on stack yet. +# at some point 128 registers will be available, meaning that an EABI +# will be defined where there will be plenty of temporaries and no need +# to store 24 FP regs on the stack. 
- .file "mpegaudiodsp_float.c" - .machine power9 +# ints +.set out, 3 +.set buf, 4 +.set in, 5 +.set win, 6 + +.set i, 7 +.set vin, 8 +.set vin1, 9 +.set vin2, 11 +.set pred, 30 + +# floats + + .machine libresoc + .text .abiversion 2 - .section ".text" - .section ".toc","aw" - .align 3 -.LCTOC0: - .tc .LCTOC1[TC],.LCTOC1 - .section ".toc1","aw" - .align 3 -.LCTOC1 = .+32768 -.LC0: - .quad 0x3f000000 -.LC1: - .quad 0x3f708fb2 -.LC2: - .quad 0xbe31d0d4 -.LC3: - .quad 0xbf441b7d -.LC4: - .quad 0xbf5db3d7 -.LC5: - .quad 0x3f7c1c5c -.LC6: - .quad 0xbeaf1d44 -.LC7: - .quad 0x3f5db3d7 -.LC8: - .quad 0xbf248dbb -.LC9: - .quad 0x3f007d2b -.LC10: - .quad 0x40b79454 -.LC11: - .quad 0x3f0483ee -.LC12: - .quad 0x3ff746ea -.LC13: - .quad 0x3f0d3b7d -.LC14: - .quad 0x3f976fd9 -.LC15: - .quad 0x3f1c4257 -.LC16: - .quad 0x3f5f2944 -.LC17: - .quad 0x3f3504f3 - .section ".text" - .align 2 - .p2align 4,,15 - .globl imdct36 - .type imdct36, @function -imdct36: -.LCF0: -0: addis 2,12,.TOC.-.LCF0@ha - addi 2,2,.TOC.-.LCF0@l - .localentry imdct36,.-imdct36 - stfd 15,-136(1) - stfd 16,-128(1) - stfd 17,-120(1) - stfd 18,-112(1) - stfd 19,-104(1) - stfd 20,-96(1) - stfd 21,-88(1) - stfd 22,-80(1) - stfd 23,-72(1) - stfd 24,-64(1) - stfd 25,-56(1) - stfd 26,-48(1) - stfd 27,-40(1) - stfd 28,-32(1) - stfd 29,-24(1) - stfd 30,-16(1) - stfd 31,-8(1) - std 30,-152(1) - lfs 11,60(5) - lfs 12,52(5) - lfs 27,48(5) - lfs 1,40(5) - lfs 13,32(5) - lfs 2,24(5) - lfs 19,64(5) - lfs 30,56(5) - lfs 29,44(5) - lfs 3,36(5) - lfs 6,28(5) - lfs 8,20(5) - lfs 31,16(5) - lfs 10,68(5) - lfs 5,12(5) - lfs 9,4(5) - lfs 4,0(5) - lfs 18,8(5) - fadds 28,12,27 - ld 30,.LCTOC0@toc(2) - fadds 7,11,30 - fadds 0,29,1 - fadds 30,30,12 - fadds 29,27,29 - fadds 12,6,2 - fadds 6,13,6 - fadds 16,8,31 - fadds 8,2,8 - fadds 10,19,10 - fadds 19,19,11 - fadds 11,3,13 - fadds 31,31,5 - fadds 24,4,9 - fadds 3,1,3 - lfs 1,.LC0-.LCTOC1(30) - fadds 17,5,18 - fadds 18,18,9 - lfs 27,.LC5-.LCTOC1(30) - stfs 6,32(5) - lfs 
5,.LC6-.LCTOC1(30) - stfs 29,48(5) - lfs 13,.LC1-.LCTOC1(30) - stfs 30,56(5) - stfs 8,24(5) - fadds 10,10,7 - fadds 7,7,28 - fadds 28,28,0 - fadds 0,0,11 - fadds 11,11,12 - fadds 12,12,16 - fadds 2,6,31 - fsubs 23,6,19 - fadds 16,16,17 - fadds 17,24,17 - fadds 6,19,6 - fsubs 26,3,30 - fadds 21,3,18 - stfs 19,64(5) - stfs 31,16(5) - stfs 24,4(5) - stfs 3,40(5) - fadds 3,30,3 - stfs 18,8(5) - fadds 30,30,18 - fsubs 9,0,7 - stfs 28,52(5) - stfs 10,68(5) - stfs 0,44(5) - fsubs 22,11,10 - fmuls 2,2,13 - stfs 11,36(5) - stfs 12,28(5) - fadds 20,17,0 - fadds 25,11,16 - fsubs 6,6,31 - fadds 31,19,31 - fmuls 19,28,1 - fmuls 21,21,27 - fsubs 28,24,28 - fadds 11,10,11 - fsubs 3,3,18 - stfs 16,20(5) - fadds 10,10,16 - fadds 0,7,0 - fmuls 9,9,5 - fmuls 5,26,5 - lfs 26,.LC2-.LCTOC1(30) - lfs 15,.LC9-.LCTOC1(30) - stfs 7,60(5) - stfs 17,12(5) - fmuls 20,20,27 - lfs 27,.LC7-.LCTOC1(30) - fmuls 13,25,13 - fadds 24,19,24 - fmuls 25,29,1 - fsubs 29,4,29 - fsubs 11,11,16 - lfs 16,.LC10-.LCTOC1(30) - fadds 7,17,7 - fsubs 0,0,17 - fadds 18,21,5 - fmuls 22,22,26 - fmuls 23,23,26 - fadds 19,20,9 - fmuls 12,12,27 - fadds 25,25,4 - fadds 4,24,13 - fmuls 8,8,27 - fsubs 13,24,13 - fadds 19,19,12 - fadds 4,4,22 - fadds 26,25,2 - fadds 18,18,8 - fsubs 2,25,2 - fadds 27,4,19 - fsubs 4,4,19 - fmuls 19,11,1 - fmuls 1,6,1 - fadds 26,26,23 - fadds 6,6,29 - fadds 11,11,28 - fsubs 1,29,1 - fmuls 4,4,16 - lfs 29,.LC8-.LCTOC1(30) - lfs 16,.LC4-.LCTOC1(30) - fadds 17,26,18 - fmuls 27,27,15 - fsubs 26,26,18 - fsubs 28,28,19 - lfs 18,.LC3-.LCTOC1(30) - fmuls 30,30,29 - fmuls 7,7,29 - lfs 29,36(6) - fmuls 0,0,16 - fsubs 19,17,27 - fmuls 3,3,16 - fadds 27,27,17 - fmuls 31,31,18 - fmuls 10,10,18 - fsubs 17,26,4 - fadds 4,4,26 - fsubs 18,5,30 - fsubs 5,30,5 - fadds 15,28,0 - fsubs 28,28,0 - lfs 0,144(4) - fmuls 16,19,29 - fadds 26,1,3 - fadds 29,21,30 - fsubs 2,2,31 - fsubs 3,1,3 - fadds 31,25,31 - fsubs 1,9,7 - fsubs 9,7,9 - fsubs 13,13,10 - fadds 24,24,10 - fadds 7,20,7 - fsubs 21,18,8 - fadds 16,16,0 - 
fsubs 10,29,8 - fadds 0,5,8 - fsubs 8,8,29 - fsubs 23,31,23 - fadds 30,9,12 - fsubs 1,1,12 - fsubs 22,24,22 - fsubs 9,7,12 - fsubs 12,12,7 - fadds 21,21,2 - lfs 5,.LC12-.LCTOC1(30) - stfs 16,1152(3) - fadds 29,0,2 - fadds 1,1,13 - fadds 30,30,13 - fadds 13,10,23 - fadds 8,8,23 - fadds 7,9,22 - fadds 12,12,22 - fmuls 28,28,5 - lfs 10,32(6) - lfs 0,128(4) - fmuls 19,19,10 - fadds 19,19,0 - stfs 19,1024(3) - lfs 0,116(6) - lfs 10,272(4) - fmuls 0,0,27 - stfs 0,144(4) - lfs 0,112(6) - fmuls 27,0,27 - stfs 27,128(4) - lfs 9,68(6) - fmuls 9,17,9 - fadds 9,9,10 - stfs 9,2176(3) - lfs 9,.LC11-.LCTOC1(30) - lfs 10,0(6) - lfs 0,0(4) - fmuls 9,15,9 - fmuls 17,17,10 - fsubs 2,26,9 - fadds 9,9,26 - lfs 10,.LC13-.LCTOC1(30) - fadds 17,17,0 - fmuls 10,1,10 - stfs 17,0(3) - fsubs 1,21,10 - fadds 10,10,21 - lfs 0,148(6) - lfs 27,160(4) - fmuls 0,0,4 - stfs 0,272(4) - lfs 0,80(6) - fmuls 4,0,4 - lfs 0,.LC14-.LCTOC1(30) - stfs 4,0(4) - fsubs 4,3,28 - fadds 3,28,3 - fmuls 30,30,0 - lfs 31,40(6) - fsubs 5,29,30 - fadds 0,30,29 - fmuls 31,2,31 - fadds 31,31,27 - stfs 31,1280(3) - lfs 30,28(6) - lfs 31,112(4) - fmuls 2,2,30 - fadds 2,2,31 - stfs 2,896(3) - lfs 31,120(6) - lfs 30,256(4) - fmuls 31,31,9 - stfs 31,160(4) - lfs 31,108(6) - fmuls 9,31,9 - stfs 9,112(4) - lfs 2,64(6) - fmuls 2,4,2 - fadds 2,2,30 - stfs 2,2048(3) - lfs 2,4(6) - lfs 9,16(4) - fmuls 4,4,2 - fadds 4,4,9 - stfs 4,128(3) - lfs 9,144(6) - lfs 2,176(4) - fmuls 9,9,3 - stfs 9,256(4) - lfs 9,84(6) - fmuls 3,9,3 - stfs 3,16(4) - lfs 4,44(6) - fmuls 4,1,4 - fadds 4,4,2 - lfs 2,.LC15-.LCTOC1(30) - stfs 4,1408(3) - lfs 9,24(6) - lfs 4,96(4) - fmuls 9,1,9 - fadds 9,9,4 - lfs 4,.LC16-.LCTOC1(30) - stfs 9,768(3) - fmuls 12,12,4 - lfs 9,124(6) - lfs 3,240(4) - fsubs 4,8,12 - fadds 8,8,12 - fmuls 9,9,10 - stfs 9,176(4) - lfs 9,104(6) - fmuls 9,9,10 - fmuls 10,7,2 - stfs 9,96(4) - fsubs 9,13,10 - fadds 10,13,10 - lfs 7,60(6) - fmuls 7,5,7 - fadds 7,7,3 - stfs 7,1920(3) - lfs 7,8(6) - lfs 12,32(4) - fmuls 5,5,7 - fadds 5,5,12 - 
stfs 5,256(3) - lfs 12,140(6) - lfs 5,192(4) - fmuls 12,12,0 - stfs 12,240(4) - lfs 12,88(6) - fmuls 0,12,0 - stfs 0,32(4) - lfs 7,48(6) - fmuls 7,9,7 - fadds 7,7,5 - stfs 7,1536(3) - lfs 12,20(6) - lfs 0,80(4) - fmuls 9,9,12 - fadds 9,9,0 - stfs 9,640(3) - lfs 0,128(6) - lfs 12,224(4) - fmuls 0,0,10 - stfs 0,192(4) - lfs 0,100(6) - fmuls 10,0,10 - stfs 10,80(4) - lfs 10,56(6) - fmuls 10,4,10 - fadds 10,10,12 - stfs 10,1792(3) - lfs 0,12(6) - lfs 12,48(4) - fmuls 0,4,0 - fadds 0,0,12 - lfs 12,.LC17-.LCTOC1(30) - stfs 0,384(3) - fmuls 11,11,12 - lfs 0,136(6) - lfs 10,208(4) - fmuls 0,0,8 - stfs 0,224(4) - lfs 0,92(6) - fmuls 8,0,8 - fsubs 0,6,11 - fadds 6,11,6 - stfs 8,48(4) - lfs 12,52(6) - fmuls 12,0,12 - fadds 12,12,10 - stfs 12,1664(3) - lfs 11,16(6) - lfs 12,64(4) - fmuls 0,0,11 - fadds 0,0,12 - stfs 0,512(3) - lfs 0,132(6) - fmuls 0,0,6 - stfs 0,208(4) - lfs 0,96(6) - fmuls 6,0,6 - stfs 6,64(4) - ld 30,-152(1) - lfd 15,-136(1) - lfd 16,-128(1) - lfd 17,-120(1) - lfd 18,-112(1) - lfd 19,-104(1) - lfd 20,-96(1) - lfd 21,-88(1) - lfd 22,-80(1) - lfd 23,-72(1) - lfd 24,-64(1) - lfd 25,-56(1) - lfd 26,-48(1) - lfd 27,-40(1) - lfd 28,-32(1) - lfd 29,-24(1) - lfd 30,-16(1) - lfd 31,-8(1)
+    .file "imdct36_standalone.c"
+    .section .rodata.cst4,"aM",@progbits,4
+    .p2align 2                      # -- Begin function imdct36
+.LC_zero:
+    .long 0                         # float 0
+.LC_2_0:
+    .long 0x40000000                # float 2
+.LC_0_5:
+    .long 1056964608                # float 0.5
+.LCPI0_2:
+    .long 1064341426                # float 0.939692616
+.LCPI0_3:
+    .long 3190935764                # float -0.173648179
+.LCPI0_4:
+    .long 3208911741                # float -0.766044437
+.LCPI0_5:
+    .long 3210589143                # float -0.866025388
+.LCPI0_6:
+    .long 1065098332                # float 0.984807729
+.LCPI0_7:
+    .long 3199147332                # float -0.342020154
+.LCPI0_8:
+    .long 1063105495                # float 0.866025388
+.LCPI0_9:
+    .long 3206843835                # float -0.642787635
+    .text
+    .globl imdct36
+    .p2align 4
+    .type imdct36,@function
+# -----------------------------------------------------------------------
+# imdct36 -- SVP64 version, work in progress
+# ABI:   OpenPOWER ELFv2 (.abiversion 2)
+# In:    r3 = out, r4 = buf, r5 = in, r6 = win (the .set aliases at the
+#        top of the file); only in and out are dereferenced so far.
+# Out:   18 floats stored at out[0..17] (see the notes at each store).
+# Clobb: r10 (scratch for the TOC load), r30 (predicate mask, saved and
+#        restored), VL, f8-f25 (in[0..17]), f29 (2.0f) and temporaries
+#        within f32-f45.  FP registers are NOT saved (see file header).
+# Stack: uses the area below r1 without moving the stack pointer.
+# -----------------------------------------------------------------------
+imdct36:                            # @imdct36
+.Lfunc_begin0:
+.Lfunc_gep0:
+    addis 2, 12, .TOC.-.Lfunc_gep0@ha
+    addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+    .localentry imdct36, .Lfunc_lep0-.Lfunc_gep0
+# %bb.0:
+    std 30, -16(1)                  # 8-byte Folded Spill (r30 = pred, reloaded before blr)
+    # Argument spills; nothing in this WIP reloads them yet.
+    std 3, -24(1)
+    std 4, -32(1)
+    std 5, -40(1)
+    std 6, -48(1)
+
+    # f29 = 2.0f, the divisor used by the SHR() steps below.  It must be
+    # loaded before the sv.fdivs instructions; .LC_2_0 lives in the
+    # constant pool above and is reached through the TOC pointer (r2).
+    # Alternative once the toolchain supports it: fmvis 29, 0x4000
+    addis 10, 2, .LC_2_0@toc@ha
+    lfs 29, .LC_2_0@toc@l(10)
+
+.loop1:
+    setvl 0,0,18,0,1,1              # Set VL to 18 elements
+    # Load 18 floats from (in) into f8-f25
+    sv.lfs *vin, 0(in)
+    # equivalent to: for (i = 17; i >= 1; i--) in[i] += in[i-1];
+    # That is 17 iterations: with the destination starting at *vin1
+    # (in[1]) a VL of 18 would also compute f26 += f25, past in[17].
+    setvl 0,0,17,0,1,1
+    sv.fadds/mrr *vin1, *vin1, *vin
+    # VL = 16 for the predicated pass over *vin2 (in[3]) / *vin1 (in[1])
+    setvl 0,0,16,0,1,1
+    # Predicate mask 0b0101010101010101: element e updates in[e+3], so
+    # the odd indices 3,5,...,17 are the EVEN elements 0,2,...,14
+    # (predicate bit e controls element e).
+    li 30, 0x5555
+    # equivalent to: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2];
+    sv.fadds/mrr/m=pred *vin2, *vin2, *vin1
+    # Use SETVL again as we want to store 18 floats to (out)
+    setvl 0,0,18,0,1,1
+    sv.stfs *vin, 0(out)
+
+    # Use SETVL 2 for the next loop and calculate first the temporary variables, t1..t5
+    # equivalent to:
+    # for (j = 0; j < 2; j++) {
+    #     in1 = in + j;
+    #     t1 = in1[2*0] - in1[2*6];
+    #     t2 = in1[2*4] + in1[2*8] - in1[2*2];
+    #     t3 = in1[2*0] + SHR(in1[2*6],1);
+    #     t4 = t1 - SHR(t2, 1);
+    #     t5 = t1 + t2;
+    # }
+    # With VL = 2 each temporary occupies two FP registers (j = 0, 1):
+    # t1 -> f32-f33
+    # t2 -> f35-f36
+    # t3 -> f38-f39
+    # t4 -> f41-f42
+    # t5 -> f44-f45
+    # The values of the 'in' array are already in FP registers 8-25,
+    # so in1[2*k] is register 8+2*k (+j).
+    setvl 0,0,2,0,1,1
+    # t1
+    sv.fsubs 32.v, 8.v, 20.v
+    # t2
+    sv.fadds 35.v, 16.v, 24.v
+    sv.fsubs 35.v, 35.v, 12.v
+    # t3, SHR(a,1) = a * 1.0f/(1 << 1) = a / 2, i.e. fdiv a, a, 2.0 (f29)
+    sv.fdivs 38.v, 20.v, 29
+    sv.fadds 38.v, 38.v, 8.v
+    # t4, essentially fdiv 41.v, 35.v, 29
+    sv.fdivs 41.v, 35.v, 29
+    sv.fsubs 41.v, 32.v, 41.v
+    # t5
+    sv.fadds 44.v, 32.v, 35.v
+
+    # Use SETVL again as we want to store 18 floats to (out)
+    # NOTE(review): this stores f32-f49 over the in[] values written to
+    # (out) above, and f34/f37/f40/f43/f46-f49 are never written by this
+    # function -- presumably a temporary debug dump; confirm.
+    setvl 0,0,18,0,1,1
+    sv.stfs 32.v, 0(3)
+    # r30 is non-volatile in the ELFv2 ABI: reload the value spilled on entry.
+    ld 30, -16(1)                   # 8-byte Folded Reload
     blr
-    .long 0
-    .byte 0,0,2,0,17,2,0,0
-    .size imdct36,.-imdct36
-    .ident "GCC: (GNU) 10.3.0"
-    .section .note.GNU-stack,"",@progbits
+    .long 0
+    .quad 0
+.Lfunc_end0:
+    .size imdct36, .Lfunc_end0-.Lfunc_begin0
+                                    # -- End function
+    .type icos36h,@object           # @icos36h
+    .section .rodata,"a",@progbits
+    .p2align 2
+icos36h:
+    .long 1048608043                # float 0.250954956
+    .long 1048871918                # float 0.258819044
+    .long 1049443197                # float 0.275844485
+    .long 1050427991                # float 0.305193633
+    .long 1052050675                # float 0.353553385
+    .long 1054812484                # float 0.435861707
+    .long 1050111961                # float 0.295775205
+    .long 1056392938                # float 0.482962906
+    .long 0                         # float 0
+    .size icos36h, 36
+
+    .type icos36,@object            # @icos36
+    .p2align 2
+icos36:
+    .long 1056996651                # float 0.501909912
+    .long 1057260526                # float 0.517638087
+    .long 1057831805                # float 0.551688969
+    .long 1058816599                # float 0.610387265
+    .long 1060439283                # float 0.707106769
+    .long 1063201092                # float 0.871723413
+    .long 1066889177                # float 1.18310082
+    .long 1073170154                # float 1.93185163
+    .long 1085772884                # float 5.73685646
+    .size icos36, 36
+
+
+    .ident "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"
+    .section ".note.GNU-stack","",@progbits
+# .addrsig
+# .addrsig_sym imdct36
+# .addrsig_sym icos36h
+# .addrsig_sym icos36
-- 
2.30.2