media/audio/mp3/mp3_1_imdct36_float_basicsv.s

# ffmpeg: LGPL 2.1 or later
#
# Some instructions could be saved by using fused multiply-add
# (sv.fmadds, sv.fnmsubs), but the fused forms round only once, so the
# extra accuracy produces results that differ from the reference.  This
# demo therefore uses fmuls followed by fmsub/fmadd in map-reduce mode.
# Also note: the FP registers are overwritten and are not yet saved on
# the stack.  At some point 128 registers will be available, meaning that
# an EABI will be defined with plenty of temporaries and no need to
# store 24 FP regs on the stack.
  10
  11 # ints
  12 .set out, 3
  13 .set buf, 4
  14 .set in, 5
  15 .set win, 6
  16
  17 .set i, 7
  18 .set vin, 8
  19 .set vin1, 9
  20 .set vin2, 11
  21 .set pred, %r30
  22
  23 # floats
  24
  25         .machine libresoc
  26         .text
  27         .abiversion 2
  28         .file   "imdct36_standalone.c"
  29         .section        .rodata.cst4,"aM",@progbits,4
  30         .p2align        2                       # -- Begin function imdct36
  31 .LC_zero:
  32         .long   0                               # float 0
  33 .LC_2_0:
  34         .long   0x40000000                      # float 2
  35 .LC_0_5:
  36         .long   1056964608                      # float 0.5
  37 .LCPI0_2:
  38         .long   1064341426                      # float 0.939692616
  39 .LCPI0_3:
  40         .long   3190935764                      # float -0.173648179
  41 .LCPI0_4:
  42         .long   3208911741                      # float -0.766044437
  43 .LCPI0_5:
  44         .long   3210589143                      # float -0.866025388
  45 .LCPI0_6:
  46         .long   1065098332                      # float 0.984807729
  47 .LCPI0_7:
  48         .long   3199147332                      # float -0.342020154
  49 .LCPI0_8:
  50         .long   1063105495                      # float 0.866025388
  51 .LCPI0_9:
  52         .long   3206843835                      # float -0.642787635
  53         .text
  54         .globl  imdct36
  55         .p2align        4
  56         .type   imdct36,@function
  57 imdct36:                                        # @imdct36
  58 .Lfunc_begin0:
  59 .Lfunc_gep0:
  60         addis 2, 12, .TOC.-.Lfunc_gep0@ha
  61         addi 2, 2, .TOC.-.Lfunc_gep0@l
  62 .Lfunc_lep0:
  63         .localentry     imdct36, .Lfunc_lep0-.Lfunc_gep0
  64 # %bb.0:
  65         std 30, -16(1)                          # 8-byte Folded Spill
  66         std 3, -24(1)
  67         std 4, -32(1)
  68         std 5, -40(1)
  69         std 6, -48(1)
  70
  71 .loop1:
  72         setvl 0,0,18,0,1,1                      # Set VL to 18 elements
  73         # Load 18 floats from (in)
  74         sv.lfs *vin, 0(in)
  75         # equivalent to: for (i = 17; i >= 1; i--) in[i] += in[i-1];
  76         sv.fadds/mrr *vin1, *vin1, *vin
  77         # SETVL to 16 as the next loop is from 1-17 floats to (out)
  78         setvl 0,0,16,0,1,1
  79         li 30, 0
  80         ori 30, 30, 0xaaaa                      # Predicate mask 0b1010101010101010
  81         # equivalent to: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2];
  82         sv.fadds/mrr/m=pred *vin2, *vin2, *vin1
  83         # Use SETVL again as we want to store 18 floats to (out)
  84         setvl 0,0,18,0,1,1
  85         sv.stfs *vin, 0(out)
  86
  87         # Load 2.0f constant in register 29, will be needed for SHR macro
  88         # fmvis 29, 0x4000
  89
  90         # Use SETVL 2 for the next loop and calculate first the temporary variables, t1,t2,t3
  91         # equivalent to:
  92         # for (j = 0; j < 2; j++) {
  93         #   in1 = in + j;
  94         #   t1 = in1[2*0] - in1[2*6];
  95         #   t2 = in1[2*4] + in1[2*8] - in1[2*2];
  96         #   t3 = in1[2*8] + SHR(in1[2*6],1);
  97         #   t4 = t1 - SHR(t2, 1);
  98         #   t5 = t1 + t2;
  99         # }
 100         # t1 -> r32-r34
 101         # t2 -> r35-r37
 102         # t3 -> r38-r40
 103         # t4 -> r41-r43
 104         # t5 -> r44-r46
 105         # Similarly, the values of 'in' array are already in registers 8-26
 106         setvl 0,0,2,0,1,1
 107         # t1
 108         sv.fsubs *32, *8, *20
 109         # t2
 110         sv.fadds *35, *16, *24
 111         sv.fsubs *35, *35, *12
 112         # t3, SHR(a,b) = a * 1.0f/(1 << (1)) = a / 2 essentially fdiv a, a, 2.0
 113         sv.fdivs *38, *20, 29
 114         sv.fadds *38, *38, *8
 115         # t4, essentially fdiv *41, *35, 29
 116         sv.fdivs *41, *35, 29
 117         sv.fsubs *41, *32, *41
 118         # t5
 119         sv.fadds *44, *32, *35
 120
 121         # Use SETVL again as we want to store 18 floats to (out)
 122         setvl 0,0,18,0,1,1
 123         sv.stfs *32, 0(3)
 124         blr
 125         .long   0
 126         .quad   0
 127 .Lfunc_end0:
 128         .size   imdct36, .Lfunc_end0-.Lfunc_begin0
 129                                         # -- End function
 130         .type   icos36h,@object         # @icos36h
 131         .section        .rodata,"a",@progbits
 132         .p2align        2
 133 icos36h:
 134         .long   1048608043              # float 0.250954956
 135         .long   1048871918              # float 0.258819044
 136         .long   1049443197              # float 0.275844485
 137         .long   1050427991              # float 0.305193633
 138         .long   1052050675              # float 0.353553385
 139         .long   1054812484              # float 0.435861707
 140         .long   1050111961              # float 0.295775205
 141         .long   1056392938              # float 0.482962906
 142         .long   0                       # float 0
 143         .size   icos36h, 36
 144
 145         .type   icos36,@object          # @icos36
 146         .p2align        2
 147 icos36:
 148         .long   1056996651              # float 0.501909912
 149         .long   1057260526              # float 0.517638087
 150         .long   1057831805              # float 0.551688969
 151         .long   1058816599              # float 0.610387265
 152         .long   1060439283              # float 0.707106769
 153         .long   1063201092              # float 0.871723413
 154         .long   1066889177              # float 1.18310082
 155         .long   1073170154              # float 1.93185163
 156         .long   1085772884              # float 5.73685646
 157         .size   icos36, 36
 158
 159
 160         .ident  "clang version 7.0.1-8+deb10u2 (tags/RELEASE_701/final)"
 161         .section        ".note.GNU-stack","",@progbits
 162 #       .addrsig
 163 #       .addrsig_sym imdct36
 164 #       .addrsig_sym icos36h
 165 #       .addrsig_sym icos36