f541c81b196d5c7e7de9dbaf8b0a49c12a7fa842
[gcc.git] / libgcc / config / sh / lib1funcs-Os-4-200.S
1 /* Copyright (C) 2006-2014 Free Software Foundation, Inc.
2
3 This file is free software; you can redistribute it and/or modify it
4 under the terms of the GNU General Public License as published by the
5 Free Software Foundation; either version 3, or (at your option) any
6 later version.
7
8 This file is distributed in the hope that it will be useful, but
9 WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 General Public License for more details.
12
13 Under Section 7 of GPL version 3, you are granted additional
14 permissions described in the GCC Runtime Library Exception, version
15 3.1, as published by the Free Software Foundation.
16
17 You should have received a copy of the GNU General Public License and
18 a copy of the GCC Runtime Library Exception along with this program;
19 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
20 <http://www.gnu.org/licenses/>. */
21
22 /* Moderately Space-optimized libgcc routines for the Renesas SH /
23 STMicroelectronics ST40 CPUs.
24 Contributed by J"orn Rennecke joern.rennecke@st.com. */
25
26 #include "lib1funcs.h"
27
28 #if !__SHMEDIA__
29 #ifdef L_udivsi3_i4i
30
31 /* 88 bytes; sh4-200 cycle counts:
32 divisor >= 2G: 11 cycles
33 dividend < 2G: 48 cycles
34 dividend >= 2G, divisor != 1: 54 cycles
35 dividend >= 2G, divisor == 1: 22 cycles */
36 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
37 !! udivsi3_i4i (FPU variant): unsigned r4 / r5 via a double-precision fdiv; args in r4 and r5, result in r0, clobber r1
38
39 .global GLOBAL(udivsi3_i4i)
40 FUNC(GLOBAL(udivsi3_i4i))
41 GLOBAL(udivsi3_i4i):
42 mova L1,r0 ! r0 = address of the constant table at L1
43 cmp/pz r5 ! T = 1 if the divisor is below 2^31 (non-negative as signed)
44 sts fpscr,r1 ! r1 = FPSCR on entry
45 lds.l @r0+,fpscr ! FPSCR = first word of L1; r0 now points at the 2^32 double
46 sts.l fpul,@-r15 ! push FPUL on entry
47 bf LOCAL(huge_divisor) ! divisor >= 2^31: quotient can only be 0 or 1
48 mov.l r1,@-r15 ! push the entry FPSCR (only reached when the branch falls through)
49 lds r4,fpul ! FPUL = dividend
50 cmp/pz r4 ! T = 1 if the dividend is below 2^31
51 #ifdef FMOVD_WORKS
52 fmov.d dr0,@-r15 ! save dr0
53 float fpul,dr0 ! dr0 = dividend converted as a signed integer
54 fmov.d dr2,@-r15 ! save dr2
55 bt LOCAL(dividend_adjusted) ! dividend below 2^31: the signed conversion is already the right value
56 mov #1,r1
57 fmov.d @r0,dr2 ! dr2 = 4294967296.0
58 cmp/eq r1,r5 ! T = 1 if divisor == 1
59 bt LOCAL(div_by_1) ! divisor == 1: skip both the 2^32 fixup and the fdiv
60 fadd dr2,dr0 ! add 2^32 to turn the negative signed value into the unsigned one
61 LOCAL(dividend_adjusted):
62 lds r5,fpul
63 float fpul,dr2 ! dr2 = divisor (below 2^31 on this path, so the signed conversion is exact)
64 fdiv dr2,dr0 ! dr0 = dividend / divisor
65 LOCAL(div_by_1): ! when reached by the bt, dr0 still holds the dividend as a negative signed value; NOTE(review): presumably ftrc then gives back the dividend bit pattern, which dividend + 2^32 could not do in a signed 32-bit result - confirm against the SH-4 manual
66 fmov.d @r15+,dr2 ! restore dr2
67 ftrc dr0,fpul ! FPUL = dr0 truncated to a 32-bit integer
68 fmov.d @r15+,dr0 ! restore dr0
69 #else /* !FMOVD_WORKS */
70 fmov.s DR01,@-r15 ! first of two 32-bit pushes; NOTE(review): presumably these save the two halves of dr0, the register macros come from lib1funcs.h - confirm
71 mov #1,r1
72 fmov.s DR00,@-r15
73 float fpul,dr0 ! dr0 = dividend converted as a signed integer
74 fmov.s DR21,@-r15 ! first of two 32-bit pushes; NOTE(review): presumably the two halves of dr2 - confirm
75 bt/s LOCAL(dividend_adjusted) ! delayed branch on T from the cmp/pz r4 above
76 fmov.s DR20,@-r15 ! (delay slot) executes whether or not the branch is taken
77 cmp/eq r1,r5 ! T = 1 if divisor == 1
78 bt LOCAL(div_by_1) ! divisor == 1: skip both the 2^32 fixup and the fdiv
79 fmov.s @r0+,DR20 ! load the 2^32 constant from L1, 32 bits at a time
80 fmov.s @r0,DR21
81 fadd dr2,dr0 ! add 2^32 to turn the negative signed value into the unsigned one
82 LOCAL(dividend_adjusted):
83 lds r5,fpul
84 float fpul,dr2 ! dr2 = divisor (below 2^31 on this path, so the signed conversion is exact)
85 fdiv dr2,dr0 ! dr0 = dividend / divisor
86 LOCAL(div_by_1):
87 fmov.s @r15+,DR20 ! pops mirror the pushes above in reverse order
88 fmov.s @r15+,DR21
89 ftrc dr0,fpul ! FPUL = dr0 truncated to a 32-bit integer
90 fmov.s @r15+,DR00
91 fmov.s @r15+,DR01
92 #endif /* !FMOVD_WORKS */
93 lds.l @r15+,fpscr ! restore the entry FPSCR
94 sts fpul,r0 ! r0 = quotient
95 rts
96 lds.l @r15+,fpul ! (delay slot) restore the entry FPUL
97
98 #ifdef FMOVD_WORKS
99 .p2align 3 ! make double below 8 byte aligned.
100 #endif
101 LOCAL(huge_divisor):
102 lds r1,fpscr ! restore FPSCR from r1 (it was not pushed on this path)
103 add #4,r15 ! drop the pushed FPUL word; FPUL itself was not changed
104 cmp/hs r5,r4 ! T = 1 if dividend >= divisor (unsigned)
105 rts
106 movt r0 ! (delay slot) r0 = T, i.e. quotient 1 or 0
107
108 .p2align 2
109 L1:
110 #ifndef FMOVD_WORKS
111 .long 0x80000 ! FPSCR value with bit 19 set; NOTE(review): presumably PR (double precision) - confirm
112 #else
113 .long 0x180000 ! FPSCR value with bits 19 and 20 set; NOTE(review): presumably PR plus SZ (64-bit fmov) - confirm
114 #endif
115 .double 4294967296 ! 2^32, the fixup constant added for dividends >= 2^31
116
117 ENDFUNC(GLOBAL(udivsi3_i4i))
118 #elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
119
120 #if 0
121 /* With 36 bytes, the following would probably be the most compact
122 implementation, but with 139 cycles on an sh4-200, it is extremely slow. */
123 GLOBAL(udivsi3_i4i): ! disabled (inside if 0) compact one-bit-per-iteration unsigned divide of r4 by r5
124 mov.l r2,@-r15 ! save r2
125 mov #0,r1 ! r1 = 0; r1 is the register div1 steps on below
126 div0u ! set up unsigned step division
127 mov r1,r2 ! r2 = 0
128 mov.l r3,@-r15 ! save r3
129 mov r1,r3 ! r3 = 0, the constant compared against r2 below
130 sett ! T = 1; the first rotcr moves this 1 into bit 31 of r2
131 mov r4,r0 ! r0 = dividend
132 LOCAL(loop):
133 rotcr r2 ! rotate T into bit 31 of r2 and bit 0 of r2 into T
134 ;
135 bt/s LOCAL(end) ! leave once the 1 placed by sett has rotated out of bit 0
136 cmp/gt r2,r3 ! (delay slot) T = (0 > r2) signed, i.e. bit 31 of r2
137 rotcl r0 ! shift T into bit 0 of r0 and bit 31 of r0 out into T
138 bra LOCAL(loop)
139 div1 r5,r1 ! (delay slot) one division step of r1 by r5
140 LOCAL(end):
141 rotcl r0 ! shift the last T bit into r0
142 mov.l @r15+,r3 ! restore r3
143 rts
144 mov.l @r15+,r2 ! (delay slot) restore r2
145 #endif /* 0 */
146
147 /* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
148 sh4-200 run times:
149 udiv small divisor: 55 cycles
150 udiv large divisor: 52 cycles
151 sdiv small divisor, positive result: 59 cycles
152 sdiv large divisor, positive result: 56 cycles
153 sdiv small divisor, negative result: 65 cycles (*)
154 sdiv large divisor, negative result: 62 cycles (*)
155 (*): r2 is restored in the delay slot of the final jmp @r2 and has a
156 lingering latency of two more cycles. */
157 .balign 4
158 .global GLOBAL(udivsi3_i4i)
159 FUNC(GLOBAL(udivsi3_i4i))
160 FUNC(GLOBAL(sdivsi3_i4i))
161 GLOBAL(udivsi3_i4i): ! unsigned r4 / r5 using div1 steps; result in r0, r4 and r5 are restored, r1 is clobbered
162 sts pr,r1 ! r1 = return address; PR is overwritten by the bsr calls below
163 mov.l r4,@-r15 ! save dividend
164 extu.w r5,r0 ! r0 = low 16 bits of divisor
165 cmp/eq r5,r0 ! T = 1 if the divisor fits in 16 bits
166 swap.w r4,r0 ! r0 = dividend with its 16-bit halves swapped
167 shlr16 r4 ! r4 = high 16 bits of dividend
168 bf/s LOCAL(large_divisor) ! divisor needs more than 16 bits
169 div0u ! (delay slot) set up unsigned step division
170 mov.l r5,@-r15 ! save divisor
171 shll16 r5 ! r5 = divisor << 16
172 LOCAL(sdiv_small_divisor): ! also entered from sdivsi3_i4i after equivalent setup
173 div1 r5,r4 ! first group of 16 div1 steps: (1 + 1 + 6) twice
174 bsr LOCAL(div6)
175 div1 r5,r4 ! (delay slot)
176 div1 r5,r4
177 bsr LOCAL(div6)
178 div1 r5,r4 ! (delay slot)
179 xtrct r4,r0 ! r0 = (r4 << 16) | (r0 >> 16)
180 xtrct r0,r4 ! r4 = (r0 << 16) | (r4 >> 16)
181 bsr LOCAL(div7) ! second group of 16 div1 steps: 7 + 1 + 1 + 7
182 swap.w r4,r4 ! (delay slot) swap the 16-bit halves of r4
183 div1 r5,r4
184 bsr LOCAL(div7)
185 div1 r5,r4 ! (delay slot)
186 xtrct r4,r0 ! r0 = (r4 << 16) | (r0 >> 16)
187 mov.l @r15+,r5 ! restore divisor
188 swap.w r0,r0 ! swap the 16-bit halves of r0
189 mov.l @r15+,r4 ! restore dividend
190 jmp @r1 ! r1 holds the return address here, or another continuation set up by sdivsi3_i4i
191 rotcl r0 ! (delay slot) shift the final T bit into r0
192 LOCAL(div7): ! helper: 7 div1 steps (one here, then falls through into div6)
193 div1 r5,r4
194 LOCAL(div6): ! helper: 6 div1 steps, the last one in the rts delay slot
195 div1 r5,r4; div1 r5,r4; div1 r5,r4
196 div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
197
198 LOCAL(divx3): ! helper: 3 pairs of rotcl r0 / div1, the last div1 in the rts delay slot
199 rotcl r0
200 div1 r5,r4
201 rotcl r0
202 div1 r5,r4
203 rotcl r0
204 rts
205 div1 r5,r4 ! (delay slot)
206
207 LOCAL(large_divisor):
208 mov.l r5,@-r15 ! save divisor
209 LOCAL(sdiv_large_divisor): ! also entered from sdivsi3_i4i after equivalent setup
210 xor r4,r0 ! clear the low half of r0 (it equals r4 here), leaving the low 16 dividend bits in the high half
211 .rept 4 ! 4 x (1 + 3) = 16 pairs of rotcl r0 / div1
212 rotcl r0
213 bsr LOCAL(divx3)
214 div1 r5,r4 ! (delay slot)
215 .endr
216 mov.l @r15+,r5 ! restore divisor
217 mov.l @r15+,r4 ! restore dividend
218 jmp @r1 ! r1 holds the return address here, or another continuation set up by sdivsi3_i4i
219 rotcl r0 ! (delay slot) shift the final T bit into r0
220 ENDFUNC(GLOBAL(udivsi3_i4i))
221
222 .global GLOBAL(sdivsi3_i4i)
223 GLOBAL(sdivsi3_i4i): ! signed r4 / r5: negate negative operands, reuse the unsigned divide code, negate the quotient if the signs differ
224 mov.l r4,@-r15 ! save dividend
225 cmp/pz r5 ! T = 1 if divisor >= 0
226 mov.l r5,@-r15 ! save divisor
227 bt/s LOCAL(pos_divisor)
228 cmp/pz r4 ! (delay slot) T = 1 if dividend >= 0
229 neg r5,r5 ! divisor was negative: r5 = -divisor
230 extu.w r5,r0 ! r0 = low 16 bits of r5
231 bt/s LOCAL(neg_result) ! dividend >= 0 with divisor < 0
232 cmp/eq r5,r0 ! (delay slot) T = 1 if r5 fits in 16 bits
233 neg r4,r4 ! both operands negative: r4 = -dividend
234 LOCAL(pos_result):
235 swap.w r4,r0 ! r0 = r4 with its 16-bit halves swapped
236 bra LOCAL(sdiv_check_divisor)
237 sts pr,r1 ! (delay slot) r1 = return address, so the shared code returns directly
238 LOCAL(pos_divisor):
239 extu.w r5,r0 ! r0 = low 16 bits of r5
240 bt/s LOCAL(pos_result) ! both operands non-negative
241 cmp/eq r5,r0 ! (delay slot) T = 1 if r5 fits in 16 bits
242 neg r4,r4 ! dividend negative, divisor positive: r4 = -dividend
243 LOCAL(neg_result):
244 mova LOCAL(negate_result),r0
245 ;
246 mov r0,r1 ! r1 = address of negate_result, reached by the jmp @r1 in the shared divide code
247 swap.w r4,r0 ! r0 = r4 with its 16-bit halves swapped
248 lds r2,macl ! stash r2 in MACL (MACL is overwritten on this path)
249 sts pr,r2 ! r2 = return address
250 LOCAL(sdiv_check_divisor):
251 shlr16 r4 ! r4 = high 16 bits of r4
252 bf/s LOCAL(sdiv_large_divisor) ! T still comes from the cmp/eq r5,r0 above
253 div0u ! (delay slot) set up unsigned step division
254 bra LOCAL(sdiv_small_divisor)
255 shll16 r5 ! (delay slot) r5 = r5 << 16
256 .balign 4
257 LOCAL(negate_result):
258 neg r0,r0 ! negate the quotient computed by the unsigned code
259 jmp @r2 ! return to the caller
260 sts macl,r2 ! (delay slot) restore r2 from MACL
261 ENDFUNC(GLOBAL(sdivsi3_i4i))
262 #endif /* !__SH_FPU_DOUBLE__ */
263 #endif /* L_udivsi3_i4i */
264
265 #ifdef L_sdivsi3_i4i
266 #if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
267 /* 48 bytes, 45 cycles on sh4-200 */
268 !! sdivsi3_i4i (FPU variant): signed r4 / r5 via a double-precision fdiv; args in r4 and r5, result in r0, clobber r1
269
270 .global GLOBAL(sdivsi3_i4i)
271 FUNC(GLOBAL(sdivsi3_i4i))
272 GLOBAL(sdivsi3_i4i):
273 sts.l fpscr,@-r15 ! push FPSCR on entry
274 sts fpul,r1 ! r1 = FPUL on entry
275 mova L1,r0 ! r0 = address of the FPSCR constant at L1
276 lds.l @r0+,fpscr ! FPSCR = word at L1
277 lds r4,fpul ! FPUL = dividend
278 #ifdef FMOVD_WORKS
279 fmov.d dr0,@-r15 ! save dr0
280 float fpul,dr0 ! dr0 = dividend converted as a signed integer
281 lds r5,fpul ! FPUL = divisor
282 fmov.d dr2,@-r15 ! save dr2
283 #else
284 fmov.s DR01,@-r15 ! first of two 32-bit pushes; NOTE(review): presumably these save the two halves of dr0, the register macros come from lib1funcs.h - confirm
285 fmov.s DR00,@-r15
286 float fpul,dr0 ! dr0 = dividend converted as a signed integer
287 lds r5,fpul ! FPUL = divisor
288 fmov.s DR21,@-r15 ! first of two 32-bit pushes; NOTE(review): presumably the two halves of dr2 - confirm
289 fmov.s DR20,@-r15
290 #endif
291 float fpul,dr2 ! dr2 = divisor converted as a signed integer
292 fdiv dr2,dr0 ! dr0 = dividend / divisor
293 #ifdef FMOVD_WORKS
294 fmov.d @r15+,dr2 ! restore dr2
295 #else
296 fmov.s @r15+,DR20 ! pops mirror the pushes above in reverse order
297 fmov.s @r15+,DR21
298 #endif
299 ftrc dr0,fpul ! FPUL = dr0 truncated to a 32-bit integer
300 #ifdef FMOVD_WORKS
301 fmov.d @r15+,dr0 ! restore dr0
302 #else
303 fmov.s @r15+,DR00 ! pops mirror the pushes above in reverse order
304 fmov.s @r15+,DR01
305 #endif
306 lds.l @r15+,fpscr ! restore the entry FPSCR
307 sts fpul,r0 ! r0 = quotient
308 rts
309 lds r1,fpul ! (delay slot) restore the entry FPUL from r1
310
311 .p2align 2
312 L1:
313 #ifndef FMOVD_WORKS
314 .long 0x80000 ! FPSCR value with bit 19 set; NOTE(review): presumably PR (double precision) - confirm
315 #else
316 .long 0x180000 ! FPSCR value with bits 19 and 20 set; NOTE(review): presumably PR plus SZ (64-bit fmov) - confirm
317 #endif
318
319 ENDFUNC(GLOBAL(sdivsi3_i4i))
320 #endif /* __SH_FPU_DOUBLE__ */
321 #endif /* L_sdivsi3_i4i */
322 #endif /* !__SHMEDIA__ */