1 # ffmpeg lgpl 2.1 or later
3 # some instructions could be saved by using fmac (sv.fmadds, sv.fnmsubs)
4 # but the accuracy is so high it produces different results. this
5 # demo therefore uses sv.fmuls followed by sv.fadds/sv.fsubs in map-reduce mode
6 # also note, the FP registers are overwritten, not saved on stack yet.
7 # at some point 128 registers will be available, meaning that an EABI
8 # will be defined where there will be plenty of temporaries and no need
9 # to store 24 FP regs on the stack.
21 # SV ints, so we don't have to play with the stack
23 # for now... TODO, add 128 regs to simulator
# ff_mpadsp_apply_window_float_sv
# Windowing routine written with Simple-V (sv.*) prefixed instructions; the
# C statements it implements are quoted in the comments next to each step.
# NOTE(review): this listing is not contiguous. The symbolic register names
# (incr, win, win2, sum, sum2, p, buf, out, out2, i, fv0..fv2), the .LCF0
# label, the loop label/branch and the function return are not among the
# lines shown here -- confirm details against the full file.
# Raw register numbers used below: r5 is read as the dither_state pointer;
# r9 is the address of the word loaded to "zero" sum/sum2 (presumably a
# zero constant set up elsewhere -- TODO confirm).
40 .globl ff_mpadsp_apply_window_float_sv
41 .type ff_mpadsp_apply_window_float_sv, @function
42 ff_mpadsp_apply_window_float_sv:
# r2 = r12 + (.TOC. - .LCF0): derive the TOC pointer from r12
# (presumably the ELFv2 global-entry convention -- confirm)
44 addis 2,12,.TOC.-.LCF0@ha
45 addi 2,2,.TOC.-.LCF0@l
50 # samples2 = samples + 31 * incr;
51 slwi incr, incr, 2 # incr *= 4, sizeof float (element count -> byte stride)
56 setvl 0, 0, 8, 1, 1, 0# setvli MVL=8, VL=8
57 addi win2, win, 124 # w2 = window + 31 (31 floats = 124 bytes)
# NOTE(review): lfiwax loads the word as an integer, without converting it
# to floating point; this only matches the C assignment when the stored
# value is 0 -- confirm.
59 lfiwax sum, 0, 5 # sum = *dither_state (word at the address in r5)
60 addi p, buf, 64 # p = synth_buf+16 (16 floats = 64 bytes)
62 # SUM8(MACS, sum, w, p)
# /els loads use an immediate of 256 bytes, i.e. 64 floats between
# consecutive elements (presumably w[k*64], p[k*64] for k=0..7 with VL=8)
63 sv.lfs/els *fv0, 256(win)
64 sv.lfs/els *fv1, 256(p)
65 # TOO ACCURATE! hilarious sv.fmadds/mr sum, *fv0, *fv1, sum
# element-wise products, then map-reduce (/mr) adds them into scalar sum
66 sv.fmuls *fv0, *fv0, *fv1
67 sv.fadds/mr sum, *fv0, sum
69 addi p, buf, 192 # p = synth_buf + 48; (48 floats = 192 bytes)
70 addi win, win, 128 # w = w + 32 (temporary; undone below)
71 # SUM8(MLSS, sum, w + 32, p)
72 sv.lfs/els *fv0, 256(win)
73 sv.lfs/els *fv1, 256(p)
74 # TOO ACCURATE! hilarious sv.fnmsubs/mr sum, *fv0, *fv1, sum
# element-wise products, then map-reduce (/mr) subtracts them from sum
75 sv.fmuls *fv0, *fv0, *fv1
76 sv.fsubs/mr sum, sum, *fv0
77 addi win, win, -128 # w = w - 32 (restore w after the w + 32 access)
79 stfs sum, 0(out) # *samples = sum (stores the value, not its address)
80 add out, out, incr # samples += incr
81 addi win, win, 4 # w++
# i advances by 4 per iteration (see addi i, i, 4 below), i.e. it is the
# loop index j expressed in bytes
86 li i, 4 # loop starts at 1: (for j=1;j<16;j++)
88 lfiwax sum, 0, 9 # zero it: loads the word at the address in r9 (assumed 0 -- TODO confirm)
89 lfiwax sum2, 0, 9 # zero it: same load, into sum2
91 # p = synth_buf + 16 + j
95 # SUM8P2(sum, MACS, sum2, MLSS, w, w2, p)
# one strided load of p is shared by both accumulations
96 sv.lfs/els *fv0, 256(p)
97 sv.lfs/els *fv1, 256(win)
98 sv.lfs/els *fv2, 256(win2)
99 # TOO ACCURATE! hilarious sv.fmadds/mr sum, *fv0, *fv1, sum
100 # TOO ACCURATE! hilarious sv.fnmsubs/mr sum2, *fv0, *fv2, sum2
# sum += p*w (products go to fv1 so fv0 still holds p for the second part)
101 sv.fmuls *fv1, *fv0, *fv1
102 sv.fadds/mr sum, sum, *fv1
# sum2 -= p*w2 (fv0 is overwritten here; p is no longer needed)
103 sv.fmuls *fv0, *fv0, *fv2
104 sv.fsubs/mr sum2, sum2, *fv0
106 # p = synth_buf + 48 - j
114 # SUM8P2(sum, MLSS, sum2, MLSS, w + 32, w2 + 32, p)
# NOTE(review): the instructions that apply the + 32 offsets to w and w2
# are not visible in this listing -- confirm
115 sv.lfs/els *fv0, 256(p)
116 sv.lfs/els *fv1, 256(win)
117 sv.lfs/els *fv2, 256(win2)
118 # TOO ACCURATE! hilarious sv.fnmsubs/mr sum, *fv0, *fv1, sum
119 # TOO ACCURATE! hilarious sv.fnmsubs/mr sum2, *fv0, *fv2, sum2
# sum -= p*w
120 sv.fmuls *fv1, *fv0, *fv1
121 sv.fsubs/mr sum, sum, *fv1
# sum2 -= p*w2
122 sv.fmuls *fv0, *fv0, *fv2
123 sv.fsubs/mr sum2, sum2, *fv0
# w2 -= 32 floats (presumably undoing the w2 + 32 offset used above)
127 addi win2, win2, -128
130 add out, out, incr # samples += incr
# subf computes out2 = out2 - incr
132 subf out2, incr, out2 # samples2 -= incr
134 addi i, i, 4 # for-loop j=1..15 (i is j in bytes)
135 addi win, win, 4 # w++
136 addi win2, win2, -4 # w2--
139 addi p, buf, 128 # p = synth_buf + 32 (32 floats = 128 bytes)
140 addi win, win, 128 # w += 32
141 lfiwax sum, 0, 9 # zero it: loads the word at the address in r9 (assumed 0 -- TODO confirm)
142 # SUM8(MLSS, sum, w + 32, p)
143 sv.lfs/els *fv0, 256(win)
144 sv.lfs/els *fv1, 256(p)
145 # TOO ACCURATE! hilarious sv.fnmsubs/mr sum, *fv0, *fv1, sum
# element-wise products, then map-reduce (/mr) subtracts them from sum
146 sv.fmuls *fv0, *fv0, *fv1
147 sv.fsubs/mr sum, sum, *fv0
153 .size ff_mpadsp_apply_window_float_sv,.-ff_mpadsp_apply_window_float_sv