[gcc.git] / gcc / config / i386 / i386.c
1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to
19 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "real.h"
32 #include "insn-config.h"
33 #include "conditions.h"
34 #include "output.h"
35 #include "insn-codes.h"
36 #include "insn-attr.h"
37 #include "flags.h"
38 #include "except.h"
39 #include "function.h"
40 #include "recog.h"
41 #include "expr.h"
42 #include "optabs.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "langhooks.h"
49 #include "cgraph.h"
50 #include "tree-gimple.h"
51 #include "dwarf2.h"
52 #include "tm-constrs.h"
53 #include "params.h"
54
55 #ifndef CHECK_STACK_LIMIT
56 #define CHECK_STACK_LIMIT (-1)
57 #endif
58
59 /* Return index of given mode in mult and division cost tables. */
60 #define MODE_INDEX(mode) \
61 ((mode) == QImode ? 0 \
62 : (mode) == HImode ? 1 \
63 : (mode) == SImode ? 2 \
64 : (mode) == DImode ? 3 \
65 : 4)
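   /* A rough sketch of how MODE_INDEX is meant to be used: it maps
      QImode/HImode/SImode/DImode to slots 0-3 of the per-mode cost arrays
      below, with slot 4 ("other") catching the remaining modes, so a cost
      lookup looks roughly like
          ix86_cost->mult_init[MODE_INDEX (mode)]
      (mult_init being the name assumed here for the "cost of starting
      multiply" array in struct processor_costs).  */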
66
67 /* Processor costs (relative to an add) */
68 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
69 #define COSTS_N_BYTES(N) ((N) * 2)
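   /* Worked example under the assumption stated above (COSTS_N_INSNS (N) == (N) * 4):
      COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so in the size table below a
      two-byte instruction such as an add is weighted like one "average" insn
      in the speed-tuned tables.  */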
70
71 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
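   /* How to read the stringop entries at the end of each cost table below
      (a sketch of the assumed layout of struct stringop_algs in i386.h):
      each table ends with a memcpy descriptor and a memset descriptor, each
      holding one variant for 32bit and one for 64bit code.  A variant names
      the algorithm to use when the block size is unknown, followed by
      {max_size, algorithm} pairs for known sizes, with -1 meaning "any larger
      size".  DUMMY_STRINGOP_ALGS fills the 64bit slot for CPUs that are never
      tuned for in 64bit mode.  */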
72
73 static const
74 struct processor_costs size_cost = { /* costs for tuning for size */
75 COSTS_N_BYTES (2), /* cost of an add instruction */
76 COSTS_N_BYTES (3), /* cost of a lea instruction */
77 COSTS_N_BYTES (2), /* variable shift costs */
78 COSTS_N_BYTES (3), /* constant shift costs */
79 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
80 COSTS_N_BYTES (3), /* HI */
81 COSTS_N_BYTES (3), /* SI */
82 COSTS_N_BYTES (3), /* DI */
83 COSTS_N_BYTES (5)}, /* other */
84 0, /* cost of multiply per each bit set */
85 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
86 COSTS_N_BYTES (3), /* HI */
87 COSTS_N_BYTES (3), /* SI */
88 COSTS_N_BYTES (3), /* DI */
89 COSTS_N_BYTES (5)}, /* other */
90 COSTS_N_BYTES (3), /* cost of movsx */
91 COSTS_N_BYTES (3), /* cost of movzx */
92 0, /* "large" insn */
93 2, /* MOVE_RATIO */
94 2, /* cost for loading QImode using movzbl */
95 {2, 2, 2}, /* cost of loading integer registers
96 in QImode, HImode and SImode.
97 Relative to reg-reg move (2). */
98 {2, 2, 2}, /* cost of storing integer registers */
99 2, /* cost of reg,reg fld/fst */
100 {2, 2, 2}, /* cost of loading fp registers
101 in SFmode, DFmode and XFmode */
102 {2, 2, 2}, /* cost of storing fp registers
103 in SFmode, DFmode and XFmode */
104 3, /* cost of moving MMX register */
105 {3, 3}, /* cost of loading MMX registers
106 in SImode and DImode */
107 {3, 3}, /* cost of storing MMX registers
108 in SImode and DImode */
109 3, /* cost of moving SSE register */
110 {3, 3, 3}, /* cost of loading SSE registers
111 in SImode, DImode and TImode */
112 {3, 3, 3}, /* cost of storing SSE registers
113 in SImode, DImode and TImode */
114 3, /* MMX or SSE register to integer */
115 0, /* size of prefetch block */
116 0, /* number of parallel prefetches */
117 2, /* Branch cost */
118 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
119 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
120 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
121 COSTS_N_BYTES (2), /* cost of FABS instruction. */
122 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
123 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
124 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
126 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
127 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
128 };
129
130 /* Processor costs (relative to an add) */
131 static const
132 struct processor_costs i386_cost = { /* 386 specific costs */
133 COSTS_N_INSNS (1), /* cost of an add instruction */
134 COSTS_N_INSNS (1), /* cost of a lea instruction */
135 COSTS_N_INSNS (3), /* variable shift costs */
136 COSTS_N_INSNS (2), /* constant shift costs */
137 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
138 COSTS_N_INSNS (6), /* HI */
139 COSTS_N_INSNS (6), /* SI */
140 COSTS_N_INSNS (6), /* DI */
141 COSTS_N_INSNS (6)}, /* other */
142 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
143 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
144 COSTS_N_INSNS (23), /* HI */
145 COSTS_N_INSNS (23), /* SI */
146 COSTS_N_INSNS (23), /* DI */
147 COSTS_N_INSNS (23)}, /* other */
148 COSTS_N_INSNS (3), /* cost of movsx */
149 COSTS_N_INSNS (2), /* cost of movzx */
150 15, /* "large" insn */
151 3, /* MOVE_RATIO */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, /* cost of moving SSE register */
168 {4, 8, 16}, /* cost of loading SSE registers
169 in SImode, DImode and TImode */
170 {4, 8, 16}, /* cost of storing SSE registers
171 in SImode, DImode and TImode */
172 3, /* MMX or SSE register to integer */
173 0, /* size of prefetch block */
174 0, /* number of parallel prefetches */
175 1, /* Branch cost */
176 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
177 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
178 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
179 COSTS_N_INSNS (22), /* cost of FABS instruction. */
180 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
181 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
182 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
183 DUMMY_STRINGOP_ALGS},
184 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
185 DUMMY_STRINGOP_ALGS},
186 };
187
188 static const
189 struct processor_costs i486_cost = { /* 486 specific costs */
190 COSTS_N_INSNS (1), /* cost of an add instruction */
191 COSTS_N_INSNS (1), /* cost of a lea instruction */
192 COSTS_N_INSNS (3), /* variable shift costs */
193 COSTS_N_INSNS (2), /* constant shift costs */
194 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
195 COSTS_N_INSNS (12), /* HI */
196 COSTS_N_INSNS (12), /* SI */
197 COSTS_N_INSNS (12), /* DI */
198 COSTS_N_INSNS (12)}, /* other */
199 1, /* cost of multiply per each bit set */
200 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
201 COSTS_N_INSNS (40), /* HI */
202 COSTS_N_INSNS (40), /* SI */
203 COSTS_N_INSNS (40), /* DI */
204 COSTS_N_INSNS (40)}, /* other */
205 COSTS_N_INSNS (3), /* cost of movsx */
206 COSTS_N_INSNS (2), /* cost of movzx */
207 15, /* "large" insn */
208 3, /* MOVE_RATIO */
209 4, /* cost for loading QImode using movzbl */
210 {2, 4, 2}, /* cost of loading integer registers
211 in QImode, HImode and SImode.
212 Relative to reg-reg move (2). */
213 {2, 4, 2}, /* cost of storing integer registers */
214 2, /* cost of reg,reg fld/fst */
215 {8, 8, 8}, /* cost of loading fp registers
216 in SFmode, DFmode and XFmode */
217 {8, 8, 8}, /* cost of storing fp registers
218 in SFmode, DFmode and XFmode */
219 2, /* cost of moving MMX register */
220 {4, 8}, /* cost of loading MMX registers
221 in SImode and DImode */
222 {4, 8}, /* cost of storing MMX registers
223 in SImode and DImode */
224 2, /* cost of moving SSE register */
225 {4, 8, 16}, /* cost of loading SSE registers
226 in SImode, DImode and TImode */
227 {4, 8, 16}, /* cost of storing SSE registers
228 in SImode, DImode and TImode */
229 3, /* MMX or SSE register to integer */
230 0, /* size of prefetch block */
231 0, /* number of parallel prefetches */
232 1, /* Branch cost */
233 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
234 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
235 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
236 COSTS_N_INSNS (3), /* cost of FABS instruction. */
237 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
238 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
239 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
240 DUMMY_STRINGOP_ALGS},
241 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
242 DUMMY_STRINGOP_ALGS}
243 };
244
245 static const
246 struct processor_costs pentium_cost = {
247 COSTS_N_INSNS (1), /* cost of an add instruction */
248 COSTS_N_INSNS (1), /* cost of a lea instruction */
249 COSTS_N_INSNS (4), /* variable shift costs */
250 COSTS_N_INSNS (1), /* constant shift costs */
251 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
252 COSTS_N_INSNS (11), /* HI */
253 COSTS_N_INSNS (11), /* SI */
254 COSTS_N_INSNS (11), /* DI */
255 COSTS_N_INSNS (11)}, /* other */
256 0, /* cost of multiply per each bit set */
257 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
258 COSTS_N_INSNS (25), /* HI */
259 COSTS_N_INSNS (25), /* SI */
260 COSTS_N_INSNS (25), /* DI */
261 COSTS_N_INSNS (25)}, /* other */
262 COSTS_N_INSNS (3), /* cost of movsx */
263 COSTS_N_INSNS (2), /* cost of movzx */
264 8, /* "large" insn */
265 6, /* MOVE_RATIO */
266 6, /* cost for loading QImode using movzbl */
267 {2, 4, 2}, /* cost of loading integer registers
268 in QImode, HImode and SImode.
269 Relative to reg-reg move (2). */
270 {2, 4, 2}, /* cost of storing integer registers */
271 2, /* cost of reg,reg fld/fst */
272 {2, 2, 6}, /* cost of loading fp registers
273 in SFmode, DFmode and XFmode */
274 {4, 4, 6}, /* cost of storing fp registers
275 in SFmode, DFmode and XFmode */
276 8, /* cost of moving MMX register */
277 {8, 8}, /* cost of loading MMX registers
278 in SImode and DImode */
279 {8, 8}, /* cost of storing MMX registers
280 in SImode and DImode */
281 2, /* cost of moving SSE register */
282 {4, 8, 16}, /* cost of loading SSE registers
283 in SImode, DImode and TImode */
284 {4, 8, 16}, /* cost of storing SSE registers
285 in SImode, DImode and TImode */
286 3, /* MMX or SSE register to integer */
287 0, /* size of prefetch block */
288 0, /* number of parallel prefetches */
289 2, /* Branch cost */
290 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
291 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
292 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
293 COSTS_N_INSNS (1), /* cost of FABS instruction. */
294 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
295 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
296 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
297 DUMMY_STRINGOP_ALGS},
298 {{libcall, {{-1, rep_prefix_4_byte}}},
299 DUMMY_STRINGOP_ALGS}
300 };
301
302 static const
303 struct processor_costs pentiumpro_cost = {
304 COSTS_N_INSNS (1), /* cost of an add instruction */
305 COSTS_N_INSNS (1), /* cost of a lea instruction */
306 COSTS_N_INSNS (1), /* variable shift costs */
307 COSTS_N_INSNS (1), /* constant shift costs */
308 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
309 COSTS_N_INSNS (4), /* HI */
310 COSTS_N_INSNS (4), /* SI */
311 COSTS_N_INSNS (4), /* DI */
312 COSTS_N_INSNS (4)}, /* other */
313 0, /* cost of multiply per each bit set */
314 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
315 COSTS_N_INSNS (17), /* HI */
316 COSTS_N_INSNS (17), /* SI */
317 COSTS_N_INSNS (17), /* DI */
318 COSTS_N_INSNS (17)}, /* other */
319 COSTS_N_INSNS (1), /* cost of movsx */
320 COSTS_N_INSNS (1), /* cost of movzx */
321 8, /* "large" insn */
322 6, /* MOVE_RATIO */
323 2, /* cost for loading QImode using movzbl */
324 {4, 4, 4}, /* cost of loading integer registers
325 in QImode, HImode and SImode.
326 Relative to reg-reg move (2). */
327 {2, 2, 2}, /* cost of storing integer registers */
328 2, /* cost of reg,reg fld/fst */
329 {2, 2, 6}, /* cost of loading fp registers
330 in SFmode, DFmode and XFmode */
331 {4, 4, 6}, /* cost of storing fp registers
332 in SFmode, DFmode and XFmode */
333 2, /* cost of moving MMX register */
334 {2, 2}, /* cost of loading MMX registers
335 in SImode and DImode */
336 {2, 2}, /* cost of storing MMX registers
337 in SImode and DImode */
338 2, /* cost of moving SSE register */
339 {2, 2, 8}, /* cost of loading SSE registers
340 in SImode, DImode and TImode */
341 {2, 2, 8}, /* cost of storing SSE registers
342 in SImode, DImode and TImode */
343 3, /* MMX or SSE register to integer */
344 32, /* size of prefetch block */
345 6, /* number of parallel prefetches */
346 2, /* Branch cost */
347 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
348 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
349 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
350 COSTS_N_INSNS (2), /* cost of FABS instruction. */
351 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
352 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
 353   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure
 354      the alignment).  For small blocks an inline loop is still a noticeable win; for bigger
 355      blocks either rep movsl or rep movsb is the way to go.  Rep movsb apparently has a
 356      more expensive startup time in the CPU, but after 4K the difference is down in the noise.
 357    */
358 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
359 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
360 DUMMY_STRINGOP_ALGS},
361 {{rep_prefix_4_byte, {{1024, unrolled_loop},
362 {8192, rep_prefix_4_byte}, {-1, libcall}}},
363 DUMMY_STRINGOP_ALGS}
364 };
365
366 static const
367 struct processor_costs geode_cost = {
368 COSTS_N_INSNS (1), /* cost of an add instruction */
369 COSTS_N_INSNS (1), /* cost of a lea instruction */
370 COSTS_N_INSNS (2), /* variable shift costs */
371 COSTS_N_INSNS (1), /* constant shift costs */
372 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
373 COSTS_N_INSNS (4), /* HI */
374 COSTS_N_INSNS (7), /* SI */
375 COSTS_N_INSNS (7), /* DI */
376 COSTS_N_INSNS (7)}, /* other */
377 0, /* cost of multiply per each bit set */
378 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
379 COSTS_N_INSNS (23), /* HI */
380 COSTS_N_INSNS (39), /* SI */
381 COSTS_N_INSNS (39), /* DI */
382 COSTS_N_INSNS (39)}, /* other */
383 COSTS_N_INSNS (1), /* cost of movsx */
384 COSTS_N_INSNS (1), /* cost of movzx */
385 8, /* "large" insn */
386 4, /* MOVE_RATIO */
387 1, /* cost for loading QImode using movzbl */
388 {1, 1, 1}, /* cost of loading integer registers
389 in QImode, HImode and SImode.
390 Relative to reg-reg move (2). */
391 {1, 1, 1}, /* cost of storing integer registers */
392 1, /* cost of reg,reg fld/fst */
393 {1, 1, 1}, /* cost of loading fp registers
394 in SFmode, DFmode and XFmode */
395 {4, 6, 6}, /* cost of storing fp registers
396 in SFmode, DFmode and XFmode */
397
398 1, /* cost of moving MMX register */
399 {1, 1}, /* cost of loading MMX registers
400 in SImode and DImode */
401 {1, 1}, /* cost of storing MMX registers
402 in SImode and DImode */
403 1, /* cost of moving SSE register */
404 {1, 1, 1}, /* cost of loading SSE registers
405 in SImode, DImode and TImode */
406 {1, 1, 1}, /* cost of storing SSE registers
407 in SImode, DImode and TImode */
408 1, /* MMX or SSE register to integer */
409 32, /* size of prefetch block */
410 1, /* number of parallel prefetches */
411 1, /* Branch cost */
412 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
413 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
414 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
415 COSTS_N_INSNS (1), /* cost of FABS instruction. */
416 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
417 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
418 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
419 DUMMY_STRINGOP_ALGS},
420 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
421 DUMMY_STRINGOP_ALGS}
422 };
423
424 static const
425 struct processor_costs k6_cost = {
426 COSTS_N_INSNS (1), /* cost of an add instruction */
427 COSTS_N_INSNS (2), /* cost of a lea instruction */
428 COSTS_N_INSNS (1), /* variable shift costs */
429 COSTS_N_INSNS (1), /* constant shift costs */
430 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
431 COSTS_N_INSNS (3), /* HI */
432 COSTS_N_INSNS (3), /* SI */
433 COSTS_N_INSNS (3), /* DI */
434 COSTS_N_INSNS (3)}, /* other */
435 0, /* cost of multiply per each bit set */
436 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
437 COSTS_N_INSNS (18), /* HI */
438 COSTS_N_INSNS (18), /* SI */
439 COSTS_N_INSNS (18), /* DI */
440 COSTS_N_INSNS (18)}, /* other */
441 COSTS_N_INSNS (2), /* cost of movsx */
442 COSTS_N_INSNS (2), /* cost of movzx */
443 8, /* "large" insn */
444 4, /* MOVE_RATIO */
445 3, /* cost for loading QImode using movzbl */
446 {4, 5, 4}, /* cost of loading integer registers
447 in QImode, HImode and SImode.
448 Relative to reg-reg move (2). */
449 {2, 3, 2}, /* cost of storing integer registers */
450 4, /* cost of reg,reg fld/fst */
451 {6, 6, 6}, /* cost of loading fp registers
452 in SFmode, DFmode and XFmode */
453 {4, 4, 4}, /* cost of storing fp registers
454 in SFmode, DFmode and XFmode */
455 2, /* cost of moving MMX register */
456 {2, 2}, /* cost of loading MMX registers
457 in SImode and DImode */
458 {2, 2}, /* cost of storing MMX registers
459 in SImode and DImode */
460 2, /* cost of moving SSE register */
461 {2, 2, 8}, /* cost of loading SSE registers
462 in SImode, DImode and TImode */
463 {2, 2, 8}, /* cost of storing SSE registers
464 in SImode, DImode and TImode */
465 6, /* MMX or SSE register to integer */
466 32, /* size of prefetch block */
467 1, /* number of parallel prefetches */
468 1, /* Branch cost */
469 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
470 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
471 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
472 COSTS_N_INSNS (2), /* cost of FABS instruction. */
473 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
474 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
475 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
476 DUMMY_STRINGOP_ALGS},
477 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
478 DUMMY_STRINGOP_ALGS}
479 };
480
481 static const
482 struct processor_costs athlon_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (2), /* cost of a lea instruction */
485 COSTS_N_INSNS (1), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (5), /* HI */
489 COSTS_N_INSNS (5), /* SI */
490 COSTS_N_INSNS (5), /* DI */
491 COSTS_N_INSNS (5)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (26), /* HI */
495 COSTS_N_INSNS (42), /* SI */
496 COSTS_N_INSNS (74), /* DI */
497 COSTS_N_INSNS (74)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
501 9, /* MOVE_RATIO */
502 4, /* cost for loading QImode using movzbl */
503 {3, 4, 3}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {3, 4, 3}, /* cost of storing integer registers */
507 4, /* cost of reg,reg fld/fst */
508 {4, 4, 12}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {6, 6, 8}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
512 2, /* cost of moving MMX register */
513 {4, 4}, /* cost of loading MMX registers
514 in SImode and DImode */
515 {4, 4}, /* cost of storing MMX registers
516 in SImode and DImode */
517 2, /* cost of moving SSE register */
518 {4, 4, 6}, /* cost of loading SSE registers
519 in SImode, DImode and TImode */
520 {4, 4, 5}, /* cost of storing SSE registers
521 in SImode, DImode and TImode */
522 5, /* MMX or SSE register to integer */
523 64, /* size of prefetch block */
524 6, /* number of parallel prefetches */
525 5, /* Branch cost */
526 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
527 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
528 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
529 COSTS_N_INSNS (2), /* cost of FABS instruction. */
530 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
531 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 532   /* For some reason, Athlon deals better with the REP prefix (relative to loops)
 533      than K8 does.  Alignment becomes important after 8 bytes for memcpy and
 534      128 bytes for memset.  */
535 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
536 DUMMY_STRINGOP_ALGS},
537 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
538 DUMMY_STRINGOP_ALGS}
539 };
540
541 static const
542 struct processor_costs k8_cost = {
543 COSTS_N_INSNS (1), /* cost of an add instruction */
544 COSTS_N_INSNS (2), /* cost of a lea instruction */
545 COSTS_N_INSNS (1), /* variable shift costs */
546 COSTS_N_INSNS (1), /* constant shift costs */
547 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
548 COSTS_N_INSNS (4), /* HI */
549 COSTS_N_INSNS (3), /* SI */
550 COSTS_N_INSNS (4), /* DI */
551 COSTS_N_INSNS (5)}, /* other */
552 0, /* cost of multiply per each bit set */
553 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
554 COSTS_N_INSNS (26), /* HI */
555 COSTS_N_INSNS (42), /* SI */
556 COSTS_N_INSNS (74), /* DI */
557 COSTS_N_INSNS (74)}, /* other */
558 COSTS_N_INSNS (1), /* cost of movsx */
559 COSTS_N_INSNS (1), /* cost of movzx */
560 8, /* "large" insn */
561 9, /* MOVE_RATIO */
562 4, /* cost for loading QImode using movzbl */
563 {3, 4, 3}, /* cost of loading integer registers
564 in QImode, HImode and SImode.
565 Relative to reg-reg move (2). */
566 {3, 4, 3}, /* cost of storing integer registers */
567 4, /* cost of reg,reg fld/fst */
568 {4, 4, 12}, /* cost of loading fp registers
569 in SFmode, DFmode and XFmode */
570 {6, 6, 8}, /* cost of storing fp registers
571 in SFmode, DFmode and XFmode */
572 2, /* cost of moving MMX register */
573 {3, 3}, /* cost of loading MMX registers
574 in SImode and DImode */
575 {4, 4}, /* cost of storing MMX registers
576 in SImode and DImode */
577 2, /* cost of moving SSE register */
578 {4, 3, 6}, /* cost of loading SSE registers
579 in SImode, DImode and TImode */
580 {4, 4, 5}, /* cost of storing SSE registers
581 in SImode, DImode and TImode */
582 5, /* MMX or SSE register to integer */
583 64, /* size of prefetch block */
 584   /* New AMD processors never drop prefetches; if they cannot be performed
 585      immediately, they are queued.  We set the number of simultaneous prefetches
 586      to a large constant to reflect this (it is probably not a good idea to leave
 587      the number of prefetches completely unlimited, as their execution also takes
 588      some time).  */
589 100, /* number of parallel prefetches */
590 5, /* Branch cost */
591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
 597   /* K8 has an optimized REP instruction for medium-sized blocks, but for very small
 598      blocks it is better to use a loop.  For large blocks, a libcall can do
 599      nontemporal accesses and beat the inline expansion considerably.  */
600 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
601 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
602 {{libcall, {{8, loop}, {24, unrolled_loop},
603 {2048, rep_prefix_4_byte}, {-1, libcall}}},
604 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
605 };
606
607 struct processor_costs amdfam10_cost = {
608 COSTS_N_INSNS (1), /* cost of an add instruction */
609 COSTS_N_INSNS (2), /* cost of a lea instruction */
610 COSTS_N_INSNS (1), /* variable shift costs */
611 COSTS_N_INSNS (1), /* constant shift costs */
612 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
613 COSTS_N_INSNS (4), /* HI */
614 COSTS_N_INSNS (3), /* SI */
615 COSTS_N_INSNS (4), /* DI */
616 COSTS_N_INSNS (5)}, /* other */
617 0, /* cost of multiply per each bit set */
618 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
619 COSTS_N_INSNS (35), /* HI */
620 COSTS_N_INSNS (51), /* SI */
621 COSTS_N_INSNS (83), /* DI */
622 COSTS_N_INSNS (83)}, /* other */
623 COSTS_N_INSNS (1), /* cost of movsx */
624 COSTS_N_INSNS (1), /* cost of movzx */
625 8, /* "large" insn */
626 9, /* MOVE_RATIO */
627 4, /* cost for loading QImode using movzbl */
628 {3, 4, 3}, /* cost of loading integer registers
629 in QImode, HImode and SImode.
630 Relative to reg-reg move (2). */
631 {3, 4, 3}, /* cost of storing integer registers */
632 4, /* cost of reg,reg fld/fst */
633 {4, 4, 12}, /* cost of loading fp registers
634 in SFmode, DFmode and XFmode */
635 {6, 6, 8}, /* cost of storing fp registers
636 in SFmode, DFmode and XFmode */
637 2, /* cost of moving MMX register */
638 {3, 3}, /* cost of loading MMX registers
639 in SImode and DImode */
640 {4, 4}, /* cost of storing MMX registers
641 in SImode and DImode */
642 2, /* cost of moving SSE register */
643 {4, 4, 3}, /* cost of loading SSE registers
644 in SImode, DImode and TImode */
645 {4, 4, 5}, /* cost of storing SSE registers
646 in SImode, DImode and TImode */
647 3, /* MMX or SSE register to integer */
648 /* On K8
649 MOVD reg64, xmmreg Double FSTORE 4
650 MOVD reg32, xmmreg Double FSTORE 4
651 On AMDFAM10
652 MOVD reg64, xmmreg Double FADD 3
653 1/1 1/1
654 MOVD reg32, xmmreg Double FADD 3
655 1/1 1/1 */
656 64, /* size of prefetch block */
 657   /* New AMD processors never drop prefetches; if they cannot be performed
 658      immediately, they are queued.  We set the number of simultaneous prefetches
 659      to a large constant to reflect this (it is probably not a good idea to leave
 660      the number of prefetches completely unlimited, as their execution also takes
 661      some time).  */
662 100, /* number of parallel prefetches */
663 5, /* Branch cost */
664 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
665 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
666 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
667 COSTS_N_INSNS (2), /* cost of FABS instruction. */
668 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
669 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
670
 671   /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
 672      very small blocks it is better to use a loop.  For large blocks, a libcall can
 673      do nontemporal accesses and beat the inline expansion considerably.  */
674 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
675 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
676 {{libcall, {{8, loop}, {24, unrolled_loop},
677 {2048, rep_prefix_4_byte}, {-1, libcall}}},
678 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
679 };
680
681 static const
682 struct processor_costs pentium4_cost = {
683 COSTS_N_INSNS (1), /* cost of an add instruction */
684 COSTS_N_INSNS (3), /* cost of a lea instruction */
685 COSTS_N_INSNS (4), /* variable shift costs */
686 COSTS_N_INSNS (4), /* constant shift costs */
687 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
688 COSTS_N_INSNS (15), /* HI */
689 COSTS_N_INSNS (15), /* SI */
690 COSTS_N_INSNS (15), /* DI */
691 COSTS_N_INSNS (15)}, /* other */
692 0, /* cost of multiply per each bit set */
693 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
694 COSTS_N_INSNS (56), /* HI */
695 COSTS_N_INSNS (56), /* SI */
696 COSTS_N_INSNS (56), /* DI */
697 COSTS_N_INSNS (56)}, /* other */
698 COSTS_N_INSNS (1), /* cost of movsx */
699 COSTS_N_INSNS (1), /* cost of movzx */
700 16, /* "large" insn */
701 6, /* MOVE_RATIO */
702 2, /* cost for loading QImode using movzbl */
703 {4, 5, 4}, /* cost of loading integer registers
704 in QImode, HImode and SImode.
705 Relative to reg-reg move (2). */
706 {2, 3, 2}, /* cost of storing integer registers */
707 2, /* cost of reg,reg fld/fst */
708 {2, 2, 6}, /* cost of loading fp registers
709 in SFmode, DFmode and XFmode */
710 {4, 4, 6}, /* cost of storing fp registers
711 in SFmode, DFmode and XFmode */
712 2, /* cost of moving MMX register */
713 {2, 2}, /* cost of loading MMX registers
714 in SImode and DImode */
715 {2, 2}, /* cost of storing MMX registers
716 in SImode and DImode */
717 12, /* cost of moving SSE register */
718 {12, 12, 12}, /* cost of loading SSE registers
719 in SImode, DImode and TImode */
720 {2, 2, 8}, /* cost of storing SSE registers
721 in SImode, DImode and TImode */
722 10, /* MMX or SSE register to integer */
723 64, /* size of prefetch block */
724 6, /* number of parallel prefetches */
725 2, /* Branch cost */
726 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
727 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
728 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
729 COSTS_N_INSNS (2), /* cost of FABS instruction. */
730 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
731 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
732 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
733 DUMMY_STRINGOP_ALGS},
734 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
735 {-1, libcall}}},
736 DUMMY_STRINGOP_ALGS},
737 };
738
739 static const
740 struct processor_costs nocona_cost = {
741 COSTS_N_INSNS (1), /* cost of an add instruction */
742 COSTS_N_INSNS (1), /* cost of a lea instruction */
743 COSTS_N_INSNS (1), /* variable shift costs */
744 COSTS_N_INSNS (1), /* constant shift costs */
745 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
746 COSTS_N_INSNS (10), /* HI */
747 COSTS_N_INSNS (10), /* SI */
748 COSTS_N_INSNS (10), /* DI */
749 COSTS_N_INSNS (10)}, /* other */
750 0, /* cost of multiply per each bit set */
751 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
752 COSTS_N_INSNS (66), /* HI */
753 COSTS_N_INSNS (66), /* SI */
754 COSTS_N_INSNS (66), /* DI */
755 COSTS_N_INSNS (66)}, /* other */
756 COSTS_N_INSNS (1), /* cost of movsx */
757 COSTS_N_INSNS (1), /* cost of movzx */
758 16, /* "large" insn */
759 17, /* MOVE_RATIO */
760 4, /* cost for loading QImode using movzbl */
761 {4, 4, 4}, /* cost of loading integer registers
762 in QImode, HImode and SImode.
763 Relative to reg-reg move (2). */
764 {4, 4, 4}, /* cost of storing integer registers */
765 3, /* cost of reg,reg fld/fst */
766 {12, 12, 12}, /* cost of loading fp registers
767 in SFmode, DFmode and XFmode */
768 {4, 4, 4}, /* cost of storing fp registers
769 in SFmode, DFmode and XFmode */
770 6, /* cost of moving MMX register */
771 {12, 12}, /* cost of loading MMX registers
772 in SImode and DImode */
773 {12, 12}, /* cost of storing MMX registers
774 in SImode and DImode */
775 6, /* cost of moving SSE register */
776 {12, 12, 12}, /* cost of loading SSE registers
777 in SImode, DImode and TImode */
778 {12, 12, 12}, /* cost of storing SSE registers
779 in SImode, DImode and TImode */
780 8, /* MMX or SSE register to integer */
781 128, /* size of prefetch block */
782 8, /* number of parallel prefetches */
783 1, /* Branch cost */
784 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
785 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
786 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
787 COSTS_N_INSNS (3), /* cost of FABS instruction. */
788 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
789 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
790 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
791 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
792 {100000, unrolled_loop}, {-1, libcall}}}},
793 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
794 {-1, libcall}}},
795 {libcall, {{24, loop}, {64, unrolled_loop},
796 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
797 };
798
799 static const
800 struct processor_costs core2_cost = {
801 COSTS_N_INSNS (1), /* cost of an add instruction */
802 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
803 COSTS_N_INSNS (1), /* variable shift costs */
804 COSTS_N_INSNS (1), /* constant shift costs */
805 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
806 COSTS_N_INSNS (3), /* HI */
807 COSTS_N_INSNS (3), /* SI */
808 COSTS_N_INSNS (3), /* DI */
809 COSTS_N_INSNS (3)}, /* other */
810 0, /* cost of multiply per each bit set */
811 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
812 COSTS_N_INSNS (22), /* HI */
813 COSTS_N_INSNS (22), /* SI */
814 COSTS_N_INSNS (22), /* DI */
815 COSTS_N_INSNS (22)}, /* other */
816 COSTS_N_INSNS (1), /* cost of movsx */
817 COSTS_N_INSNS (1), /* cost of movzx */
818 8, /* "large" insn */
819 16, /* MOVE_RATIO */
820 2, /* cost for loading QImode using movzbl */
821 {6, 6, 6}, /* cost of loading integer registers
822 in QImode, HImode and SImode.
823 Relative to reg-reg move (2). */
824 {4, 4, 4}, /* cost of storing integer registers */
825 2, /* cost of reg,reg fld/fst */
826 {6, 6, 6}, /* cost of loading fp registers
827 in SFmode, DFmode and XFmode */
 828   {4, 4, 4},				/* cost of storing fp registers in SFmode, DFmode and XFmode */
829 2, /* cost of moving MMX register */
830 {6, 6}, /* cost of loading MMX registers
831 in SImode and DImode */
832 {4, 4}, /* cost of storing MMX registers
833 in SImode and DImode */
834 2, /* cost of moving SSE register */
835 {6, 6, 6}, /* cost of loading SSE registers
836 in SImode, DImode and TImode */
837 {4, 4, 4}, /* cost of storing SSE registers
838 in SImode, DImode and TImode */
839 2, /* MMX or SSE register to integer */
840 128, /* size of prefetch block */
841 8, /* number of parallel prefetches */
842 3, /* Branch cost */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (32), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (1), /* cost of FABS instruction. */
847 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
849 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
850 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
851 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
852 {{libcall, {{8, loop}, {15, unrolled_loop},
853 {2048, rep_prefix_4_byte}, {-1, libcall}}},
854 {libcall, {{24, loop}, {32, unrolled_loop},
855 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
856 };
857
858 /* Generic64 should produce code tuned for Nocona and K8. */
859 static const
860 struct processor_costs generic64_cost = {
861 COSTS_N_INSNS (1), /* cost of an add instruction */
 862   /* On all chips taken into consideration lea takes 2 cycles or more.  With
 863      this cost, however, our current implementation of synth_mult results in
 864      the use of unnecessary temporary registers, causing regressions on several
 865      SPECfp benchmarks.  */
866 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
867 COSTS_N_INSNS (1), /* variable shift costs */
868 COSTS_N_INSNS (1), /* constant shift costs */
869 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
870 COSTS_N_INSNS (4), /* HI */
871 COSTS_N_INSNS (3), /* SI */
872 COSTS_N_INSNS (4), /* DI */
873 COSTS_N_INSNS (2)}, /* other */
874 0, /* cost of multiply per each bit set */
875 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
876 COSTS_N_INSNS (26), /* HI */
877 COSTS_N_INSNS (42), /* SI */
878 COSTS_N_INSNS (74), /* DI */
879 COSTS_N_INSNS (74)}, /* other */
880 COSTS_N_INSNS (1), /* cost of movsx */
881 COSTS_N_INSNS (1), /* cost of movzx */
882 8, /* "large" insn */
883 17, /* MOVE_RATIO */
884 4, /* cost for loading QImode using movzbl */
885 {4, 4, 4}, /* cost of loading integer registers
886 in QImode, HImode and SImode.
887 Relative to reg-reg move (2). */
888 {4, 4, 4}, /* cost of storing integer registers */
889 4, /* cost of reg,reg fld/fst */
890 {12, 12, 12}, /* cost of loading fp registers
891 in SFmode, DFmode and XFmode */
892 {6, 6, 8}, /* cost of storing fp registers
893 in SFmode, DFmode and XFmode */
894 2, /* cost of moving MMX register */
895 {8, 8}, /* cost of loading MMX registers
896 in SImode and DImode */
897 {8, 8}, /* cost of storing MMX registers
898 in SImode and DImode */
899 2, /* cost of moving SSE register */
900 {8, 8, 8}, /* cost of loading SSE registers
901 in SImode, DImode and TImode */
902 {8, 8, 8}, /* cost of storing SSE registers
903 in SImode, DImode and TImode */
904 5, /* MMX or SSE register to integer */
905 64, /* size of prefetch block */
906 6, /* number of parallel prefetches */
 907   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this value
 908      is increased to the perhaps more appropriate value of 5.  */
909 3, /* Branch cost */
910 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
911 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
912 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
913 COSTS_N_INSNS (8), /* cost of FABS instruction. */
914 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
915 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
916 {DUMMY_STRINGOP_ALGS,
917 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
918 {DUMMY_STRINGOP_ALGS,
919 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
920 };
921
922 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
923 static const
924 struct processor_costs generic32_cost = {
925 COSTS_N_INSNS (1), /* cost of an add instruction */
926 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
927 COSTS_N_INSNS (1), /* variable shift costs */
928 COSTS_N_INSNS (1), /* constant shift costs */
929 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
930 COSTS_N_INSNS (4), /* HI */
931 COSTS_N_INSNS (3), /* SI */
932 COSTS_N_INSNS (4), /* DI */
933 COSTS_N_INSNS (2)}, /* other */
934 0, /* cost of multiply per each bit set */
935 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
936 COSTS_N_INSNS (26), /* HI */
937 COSTS_N_INSNS (42), /* SI */
938 COSTS_N_INSNS (74), /* DI */
939 COSTS_N_INSNS (74)}, /* other */
940 COSTS_N_INSNS (1), /* cost of movsx */
941 COSTS_N_INSNS (1), /* cost of movzx */
942 8, /* "large" insn */
943 17, /* MOVE_RATIO */
944 4, /* cost for loading QImode using movzbl */
945 {4, 4, 4}, /* cost of loading integer registers
946 in QImode, HImode and SImode.
947 Relative to reg-reg move (2). */
948 {4, 4, 4}, /* cost of storing integer registers */
949 4, /* cost of reg,reg fld/fst */
950 {12, 12, 12}, /* cost of loading fp registers
951 in SFmode, DFmode and XFmode */
952 {6, 6, 8}, /* cost of storing fp registers
953 in SFmode, DFmode and XFmode */
954 2, /* cost of moving MMX register */
955 {8, 8}, /* cost of loading MMX registers
956 in SImode and DImode */
957 {8, 8}, /* cost of storing MMX registers
958 in SImode and DImode */
959 2, /* cost of moving SSE register */
960 {8, 8, 8}, /* cost of loading SSE registers
961 in SImode, DImode and TImode */
962 {8, 8, 8}, /* cost of storing SSE registers
963 in SImode, DImode and TImode */
964 5, /* MMX or SSE register to integer */
965 64, /* size of prefetch block */
966 6, /* number of parallel prefetches */
967 3, /* Branch cost */
968 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
969 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
970 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
971 COSTS_N_INSNS (8), /* cost of FABS instruction. */
972 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
973 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
974 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
975 DUMMY_STRINGOP_ALGS},
976 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
977 DUMMY_STRINGOP_ALGS},
978 };
979
980 const struct processor_costs *ix86_cost = &pentium_cost;
981
982 /* Processor feature/optimization bitmasks. */
983 #define m_386 (1<<PROCESSOR_I386)
984 #define m_486 (1<<PROCESSOR_I486)
985 #define m_PENT (1<<PROCESSOR_PENTIUM)
986 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
987 #define m_GEODE (1<<PROCESSOR_GEODE)
988 #define m_K6_GEODE (m_K6 | m_GEODE)
989 #define m_K6 (1<<PROCESSOR_K6)
990 #define m_ATHLON (1<<PROCESSOR_ATHLON)
991 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
992 #define m_K8 (1<<PROCESSOR_K8)
993 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
994 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
995 #define m_NOCONA (1<<PROCESSOR_NOCONA)
996 #define m_CORE2 (1<<PROCESSOR_CORE2)
997 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
998 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
999 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1000 #define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
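
/* Sketch of how these masks are consumed (assumed usage; the actual TARGET_*
   wrappers live in i386.h): each x86_* tunable below is a bitmask over
   PROCESSOR_* values and is tested against the CPU currently being tuned for,
   roughly

       if (x86_use_leave & (1 << ix86_tune))
         ... emit leave in the epilogue ...
*/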
1001
1002 /* Generic instruction choice should be common subset of supported CPUs
1003 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1004
 1005 /* Leave does not affect Nocona SPEC2000 results negatively, so enabling it for
 1006    Generic64 seems like a good code size tradeoff.  We can't enable it for 32bit
 1007    generic because it does not work well with PPro based chips.  */
1008 const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
1009 | m_GENERIC64;
1010 const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1011 | m_NOCONA | m_CORE2 | m_GENERIC;
1012 const int x86_zero_extend_with_and = m_486 | m_PENT;
1013 /* Enable to zero extend integer registers to avoid partial dependencies */
1014 const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1015 | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
1016 const int x86_double_with_add = ~m_386;
1017 const int x86_use_bit_test = m_386;
1018 const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
1019 | m_K6 | m_CORE2 | m_GENERIC;
1020 const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1021 | m_NOCONA;
1022 const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
1023 const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
1024 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
 1025 /* Branch hints were put into the P4 based on simulation results.  But
 1026    after the P4 was made, no performance benefit was observed with
 1027    branch hints; they also increase the code size.  As a result,
 1028    icc never generates branch hints.  */
1029 const int x86_branch_hints = 0;
1030 const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
1031 /*m_GENERIC | m_ATHLON_K8 ? */
 1032 /* We probably ought to watch for partial register stalls in the Generic32
 1033    compilation setting as well.  However, in the current implementation the
 1034    partial register stalls are not eliminated very well - they can
 1035    be introduced via subregs synthesized by combine and can happen
 1036    in caller/callee saving sequences.
 1037    Because this option pays back little on PPro based chips and conflicts
 1038    with the partial register dependencies used by Athlon/P4 based chips, it is better
 1039    to leave it off for generic32 for now.  */
1040 const int x86_partial_reg_stall = m_PPRO;
1041 const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
1042 const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
1043 const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
1044 | m_CORE2 | m_GENERIC);
1045 const int x86_use_mov0 = m_K6;
1046 const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
1047 const int x86_read_modify_write = ~m_PENT;
1048 const int x86_read_modify = ~(m_PENT | m_PPRO);
1049 const int x86_split_long_moves = m_PPRO;
1050 const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
1051 | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
1052 /* m_PENT4 ? */
1053 const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
1054 const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
1055 const int x86_qimode_math = ~(0);
1056 const int x86_promote_qi_regs = 0;
1057 /* On PPro this flag is meant to avoid partial register stalls. Just like
1058 the x86_partial_reg_stall this option might be considered for Generic32
1059 if our scheme for avoiding partial stalls was more effective. */
1060 const int x86_himode_math = ~(m_PPRO);
1061 const int x86_promote_hi_regs = m_PPRO;
1062 /* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
1063 const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1064 | m_CORE2 | m_GENERIC;
1065 const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
1066 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
1067 const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
1068 | m_CORE2 | m_GENERIC;
1069 const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
1070 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
1071 /* Enable if integer moves are preferred for DFmode copies */
1072 const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1073 | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
1074 const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1075 | m_CORE2 | m_GENERIC;
1076 const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
1077 | m_CORE2 | m_GENERIC;
1078 /* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
1079 for outgoing arguments will be computed and placed into the variable
1080 `current_function_outgoing_args_size'. No space will be pushed onto the stack
1081 for each call; instead, the function prologue should increase the stack frame
1082 size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
1083 not proper. */
1084 const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
1085 | m_NOCONA | m_PPRO | m_CORE2
1086 | m_GENERIC;
1087 const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
1088 const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
1089 const int x86_shift1 = ~m_486;
1090 const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
1091 | m_ATHLON_K8_AMDFAM10 | m_PENT4
1092 | m_NOCONA | m_CORE2 | m_GENERIC;
 1093 /* In the Generic model we have a conflict here between PPro/Pentium4 based chips
 1094    that treat 128bit SSE registers as single units and K8 based chips that
 1095    divide SSE registers into two 64bit halves.
 1096    x86_sse_partial_reg_dependency promotes all store destinations to 128bit
 1097    to allow register renaming on 128bit SSE units, but usually results in one
 1098    extra microop on 64bit SSE units.  Experimental results show that disabling
 1099    this option on P4 brings an over 20% SPECfp regression, while enabling it on
 1100    K8 brings roughly a 2.4% regression that can be partly masked by careful scheduling
 1101    of moves.  */
1102 const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1103 | m_GENERIC | m_AMDFAM10;
 1104 /* Set for machines where the type and dependencies are resolved on SSE
 1105    register parts instead of whole registers, so we may maintain just the
 1106    lower part of scalar values in the proper format, leaving the upper part
 1107    undefined.  */
1108 const int x86_sse_split_regs = m_ATHLON_K8;
1109 /* Code generation for scalar reg-reg moves of single and double precision data:
1110 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
1111 movaps reg, reg
1112 else
1113 movss reg, reg
1114 if (x86_sse_partial_reg_dependency == true)
1115 movapd reg, reg
1116 else
1117 movsd reg, reg
1118
1119 Code generation for scalar loads of double precision data:
1120 if (x86_sse_split_regs == true)
1121 movlpd mem, reg (gas syntax)
1122 else
1123 movsd mem, reg
1124
1125 Code generation for unaligned packed loads of single precision data
1126 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
1127 if (x86_sse_unaligned_move_optimal)
1128 movups mem, reg
1129
1130 if (x86_sse_partial_reg_dependency == true)
1131 {
1132 xorps reg, reg
1133 movlps mem, reg
1134 movhps mem+8, reg
1135 }
1136 else
1137 {
1138 movlps mem, reg
1139 movhps mem+8, reg
1140 }
1141
1142 Code generation for unaligned packed loads of double precision data
1143 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
1144 if (x86_sse_unaligned_move_optimal)
1145 movupd mem, reg
1146
1147 if (x86_sse_split_regs == true)
1148 {
1149 movlpd mem, reg
1150 movhpd mem+8, reg
1151 }
1152 else
1153 {
1154 movsd mem, reg
1155 movhpd mem+8, reg
1156 }
1157 */
1158 const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
1159 const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
1160 const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
1161 const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
1162 const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
1163
1164 const int x86_inter_unit_moves = ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC);
1165
1166 const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
1167 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
1168 /* Some CPU cores are not able to predict more than 4 branch instructions in
1169 the 16 byte window. */
1170 const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
1171 | m_NOCONA | m_CORE2 | m_GENERIC;
1172 const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
1173 | m_CORE2 | m_GENERIC;
1174 const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
1175 /* Compare and exchange was added for 80486. */
1176 const int x86_cmpxchg = ~m_386;
 1177 /* Compare and exchange 8 bytes was added for the Pentium.  */
1178 const int x86_cmpxchg8b = ~(m_386 | m_486);
1179 /* Exchange and add was added for 80486. */
1180 const int x86_xadd = ~m_386;
1181 /* Byteswap was added for 80486. */
1182 const int x86_bswap = ~m_386;
1183 const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
1184
1185 static enum stringop_alg stringop_alg = no_stringop;
1186
1187 /* In case the average insn count for single function invocation is
1188 lower than this constant, emit fast (but longer) prologue and
1189 epilogue code. */
1190 #define FAST_PROLOGUE_INSN_COUNT 20
1191
1192 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
1193 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1194 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1195 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1196
1197 /* Array of the smallest class containing reg number REGNO, indexed by
1198 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1199
1200 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1201 {
1202 /* ax, dx, cx, bx */
1203 AREG, DREG, CREG, BREG,
1204 /* si, di, bp, sp */
1205 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1206 /* FP registers */
1207 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1208 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1209 /* arg pointer */
1210 NON_Q_REGS,
1211 /* flags, fpsr, fpcr, frame */
1212 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1213 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1214 SSE_REGS, SSE_REGS,
1215 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1216 MMX_REGS, MMX_REGS,
1217 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1218 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1219 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1220 SSE_REGS, SSE_REGS,
1221 };
1222
1223 /* The "default" register map used in 32bit mode. */
1224
1225 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1226 {
1227 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1228 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1229 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1230 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1231 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1232 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1233 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1234 };
1235
1236 static int const x86_64_int_parameter_registers[6] =
1237 {
1238 5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1239 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1240 };
1241
1242 static int const x86_64_int_return_registers[4] =
1243 {
 1244   0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1245 };
1246
1247 /* The "default" register map used in 64bit mode. */
1248 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1249 {
1250 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1251 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1252 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1253 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1254 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1255 8,9,10,11,12,13,14,15, /* extended integer registers */
1256 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1257 };
1258
1259 /* Define the register numbers to be used in Dwarf debugging information.
1260 The SVR4 reference port C compiler uses the following register numbers
1261 in its Dwarf output code:
1262 0 for %eax (gcc regno = 0)
1263 1 for %ecx (gcc regno = 2)
1264 2 for %edx (gcc regno = 1)
1265 3 for %ebx (gcc regno = 3)
1266 4 for %esp (gcc regno = 7)
1267 5 for %ebp (gcc regno = 6)
1268 6 for %esi (gcc regno = 4)
1269 7 for %edi (gcc regno = 5)
1270 The following three DWARF register numbers are never generated by
1271 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1272 believes these numbers have these meanings.
1273 8 for %eip (no gcc equivalent)
1274 9 for %eflags (gcc regno = 17)
1275 10 for %trapno (no gcc equivalent)
1276 It is not at all clear how we should number the FP stack registers
1277 for the x86 architecture. If the version of SDB on x86/svr4 were
1278 a bit less brain dead with respect to floating-point then we would
1279 have a precedent to follow with respect to DWARF register numbers
1280 for x86 FP registers, but the SDB on x86/svr4 is so completely
1281 broken with respect to FP registers that it is hardly worth thinking
1282 of it as something to strive for compatibility with.
1283 The version of x86/svr4 SDB I have at the moment does (partially)
1284 seem to believe that DWARF register number 11 is associated with
1285 the x86 register %st(0), but that's about all. Higher DWARF
1286 register numbers don't seem to be associated with anything in
1287 particular, and even for DWARF regno 11, SDB only seems to under-
1288 stand that it should say that a variable lives in %st(0) (when
1289 asked via an `=' command) if we said it was in DWARF regno 11,
1290 but SDB still prints garbage when asked for the value of the
1291 variable in question (via a `/' command).
1292 (Also note that the labels SDB prints for various FP stack regs
1293 when doing an `x' command are all wrong.)
1294 Note that these problems generally don't affect the native SVR4
1295 C compiler because it doesn't allow the use of -O with -g and
1296 because when it is *not* optimizing, it allocates a memory
1297 location for each floating-point variable, and the memory
1298 location is what gets described in the DWARF AT_location
1299 attribute for the variable in question.
1300 Regardless of the severe mental illness of the x86/svr4 SDB, we
1301 do something sensible here and we use the following DWARF
1302 register numbers. Note that these are all stack-top-relative
1303 numbers.
1304 11 for %st(0) (gcc regno = 8)
1305 12 for %st(1) (gcc regno = 9)
1306 13 for %st(2) (gcc regno = 10)
1307 14 for %st(3) (gcc regno = 11)
1308 15 for %st(4) (gcc regno = 12)
1309 16 for %st(5) (gcc regno = 13)
1310 17 for %st(6) (gcc regno = 14)
1311 18 for %st(7) (gcc regno = 15)
1312 */
1313 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1314 {
1315 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1316 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1317 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1318 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1319 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1320 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1321 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1322 };
1323
1324 /* Test and compare insns in i386.md store the information needed to
1325 generate branch and scc insns here. */
1326
1327 rtx ix86_compare_op0 = NULL_RTX;
1328 rtx ix86_compare_op1 = NULL_RTX;
1329 rtx ix86_compare_emitted = NULL_RTX;
1330
1331 /* Size of the register save area. */
1332 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
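/* Worked example (illustrative only): with the usual 64-bit values
   REGPARM_MAX == 6, UNITS_PER_WORD == 8 and SSE_REGPARM_MAX == 8, this
   evaluates to 6*8 + 8*16 == 176 bytes, the size of the psABI register
   save area that va_start spills the argument registers into.  */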
1333
1334 /* Define the structure for the machine field in struct function. */
1335
1336 struct stack_local_entry GTY(())
1337 {
1338 unsigned short mode;
1339 unsigned short n;
1340 rtx rtl;
1341 struct stack_local_entry *next;
1342 };
1343
1344 /* Structure describing stack frame layout.
1345 Stack grows downward:
1346
1347 [arguments]
1348 <- ARG_POINTER
1349 saved pc
1350
1351 saved frame pointer if frame_pointer_needed
1352 <- HARD_FRAME_POINTER
1353 [saved regs]
1354
1355 [padding1] \
1356 )
1357 [va_arg registers] (
1358 > to_allocate <- FRAME_POINTER
1359 [frame] (
1360 )
1361 [padding2] /
1362 */
1363 struct ix86_frame
1364 {
1365 int nregs;
1366 int padding1;
1367 int va_arg_size;
1368 HOST_WIDE_INT frame;
1369 int padding2;
1370 int outgoing_arguments_size;
1371 int red_zone_size;
1372
1373 HOST_WIDE_INT to_allocate;
1374 /* The offsets relative to ARG_POINTER. */
1375 HOST_WIDE_INT frame_pointer_offset;
1376 HOST_WIDE_INT hard_frame_pointer_offset;
1377 HOST_WIDE_INT stack_pointer_offset;
1378
1379 /* When save_regs_using_mov is set, emit prologue using
1380 move instead of push instructions. */
1381 bool save_regs_using_mov;
1382 };
1383
1384 /* Code model option. */
1385 enum cmodel ix86_cmodel;
1386 /* Asm dialect. */
1387 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1388 /* TLS dialects. */
1389 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1390
1391 /* Which unit we are generating floating point math for. */
1392 enum fpmath_unit ix86_fpmath;
1393
1394 /* Which cpu are we scheduling for. */
1395 enum processor_type ix86_tune;
1396 /* Which instruction set architecture to use. */
1397 enum processor_type ix86_arch;
1398
1399 /* true if sse prefetch instruction is not NOOP. */
1400 int x86_prefetch_sse;
1401
1402 /* true if cmpxchg16b is supported. */
1403 int x86_cmpxchg16b;
1404
1405 /* ix86_regparm_string as a number */
1406 static int ix86_regparm;
1407
1408 /* -mstackrealign option */
1409 extern int ix86_force_align_arg_pointer;
1410 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1411
1412 /* Preferred alignment for stack boundary in bits. */
1413 unsigned int ix86_preferred_stack_boundary;
1414
1415 /* Values 1-5: see jump.c */
1416 int ix86_branch_cost;
1417
1418 /* Variables which are this size or smaller are put in the data/bss
1419 or ldata/lbss sections. */
1420
1421 int ix86_section_threshold = 65536;
1422
1423 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1424 char internal_label_prefix[16];
1425 int internal_label_prefix_len;
1426 \f
1427 static bool ix86_handle_option (size_t, const char *, int);
1428 static void output_pic_addr_const (FILE *, rtx, int);
1429 static void put_condition_code (enum rtx_code, enum machine_mode,
1430 int, int, FILE *);
1431 static const char *get_some_local_dynamic_name (void);
1432 static int get_some_local_dynamic_name_1 (rtx *, void *);
1433 static rtx ix86_expand_int_compare (enum rtx_code, rtx, rtx);
1434 static enum rtx_code ix86_prepare_fp_compare_args (enum rtx_code, rtx *,
1435 rtx *);
1436 static bool ix86_fixed_condition_code_regs (unsigned int *, unsigned int *);
1437 static enum machine_mode ix86_cc_modes_compatible (enum machine_mode,
1438 enum machine_mode);
1439 static rtx get_thread_pointer (int);
1440 static rtx legitimize_tls_address (rtx, enum tls_model, int);
1441 static void get_pc_thunk_name (char [32], unsigned int);
1442 static rtx gen_push (rtx);
1443 static int ix86_flags_dependent (rtx, rtx, enum attr_type);
1444 static int ix86_agi_dependent (rtx, rtx, enum attr_type);
1445 static struct machine_function * ix86_init_machine_status (void);
1446 static int ix86_split_to_parts (rtx, rtx *, enum machine_mode);
1447 static int ix86_nsaved_regs (void);
1448 static void ix86_emit_save_regs (void);
1449 static void ix86_emit_save_regs_using_mov (rtx, HOST_WIDE_INT);
1450 static void ix86_emit_restore_regs_using_mov (rtx, HOST_WIDE_INT, int);
1451 static void ix86_output_function_epilogue (FILE *, HOST_WIDE_INT);
1452 static HOST_WIDE_INT ix86_GOT_alias_set (void);
1453 static void ix86_adjust_counter (rtx, HOST_WIDE_INT);
1454 static void ix86_expand_strlensi_unroll_1 (rtx, rtx, rtx);
1455 static int ix86_issue_rate (void);
1456 static int ix86_adjust_cost (rtx, rtx, rtx, int);
1457 static int ia32_multipass_dfa_lookahead (void);
1458 static void ix86_init_mmx_sse_builtins (void);
1459 static rtx x86_this_parameter (tree);
1460 static void x86_output_mi_thunk (FILE *, tree, HOST_WIDE_INT,
1461 HOST_WIDE_INT, tree);
1462 static bool x86_can_output_mi_thunk (tree, HOST_WIDE_INT, HOST_WIDE_INT, tree);
1463 static void x86_file_start (void);
1464 static void ix86_reorg (void);
1465 static bool ix86_expand_carry_flag_compare (enum rtx_code, rtx, rtx, rtx*);
1466 static tree ix86_build_builtin_va_list (void);
1467 static void ix86_setup_incoming_varargs (CUMULATIVE_ARGS *, enum machine_mode,
1468 tree, int *, int);
1469 static tree ix86_gimplify_va_arg (tree, tree, tree *, tree *);
1470 static bool ix86_scalar_mode_supported_p (enum machine_mode);
1471 static bool ix86_vector_mode_supported_p (enum machine_mode);
1472
1473 static int ix86_address_cost (rtx);
1474 static bool ix86_cannot_force_const_mem (rtx);
1475 static rtx ix86_delegitimize_address (rtx);
1476
1477 static void i386_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
1478
1479 struct builtin_description;
1480 static rtx ix86_expand_sse_comi (const struct builtin_description *,
1481 tree, rtx);
1482 static rtx ix86_expand_sse_compare (const struct builtin_description *,
1483 tree, rtx);
1484 static rtx ix86_expand_unop1_builtin (enum insn_code, tree, rtx);
1485 static rtx ix86_expand_unop_builtin (enum insn_code, tree, rtx, int);
1486 static rtx ix86_expand_binop_builtin (enum insn_code, tree, rtx);
1487 static rtx ix86_expand_store_builtin (enum insn_code, tree);
1488 static rtx safe_vector_operand (rtx, enum machine_mode);
1489 static rtx ix86_expand_fp_compare (enum rtx_code, rtx, rtx, rtx, rtx *, rtx *);
1490 static int ix86_fp_comparison_arithmetics_cost (enum rtx_code code);
1491 static int ix86_fp_comparison_fcomi_cost (enum rtx_code code);
1492 static int ix86_fp_comparison_sahf_cost (enum rtx_code code);
1493 static int ix86_fp_comparison_cost (enum rtx_code code);
1494 static unsigned int ix86_select_alt_pic_regnum (void);
1495 static int ix86_save_reg (unsigned int, int);
1496 static void ix86_compute_frame_layout (struct ix86_frame *);
1497 static int ix86_comp_type_attributes (tree, tree);
1498 static int ix86_function_regparm (tree, tree);
1499 const struct attribute_spec ix86_attribute_table[];
1500 static bool ix86_function_ok_for_sibcall (tree, tree);
1501 static tree ix86_handle_cconv_attribute (tree *, tree, tree, int, bool *);
1502 static int ix86_value_regno (enum machine_mode, tree, tree);
1503 static bool contains_128bit_aligned_vector_p (tree);
1504 static rtx ix86_struct_value_rtx (tree, int);
1505 static bool ix86_ms_bitfield_layout_p (tree);
1506 static tree ix86_handle_struct_attribute (tree *, tree, tree, int, bool *);
1507 static int extended_reg_mentioned_1 (rtx *, void *);
1508 static bool ix86_rtx_costs (rtx, int, int, int *);
1509 static int min_insn_size (rtx);
1510 static tree ix86_md_asm_clobbers (tree outputs, tree inputs, tree clobbers);
1511 static bool ix86_must_pass_in_stack (enum machine_mode mode, tree type);
1512 static bool ix86_pass_by_reference (CUMULATIVE_ARGS *, enum machine_mode,
1513 tree, bool);
1514 static void ix86_init_builtins (void);
1515 static rtx ix86_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
1516 static tree ix86_builtin_vectorized_function (enum built_in_function, tree, tree);
1517 static const char *ix86_mangle_fundamental_type (tree);
1518 static tree ix86_stack_protect_fail (void);
1519 static rtx ix86_internal_arg_pointer (void);
1520 static void ix86_dwarf_handle_frame_unspec (const char *, rtx, int);
1521
1522 /* This function is only used on Solaris. */
1523 static void i386_solaris_elf_named_section (const char *, unsigned int, tree)
1524 ATTRIBUTE_UNUSED;
1525
1526 /* Register class used for passing a given 64-bit part of the argument.
1527 These represent the classes documented by the psABI, with the exception
1528 of the SSESF and SSEDF classes, which are basically the SSE class except
1529 that gcc uses an SF or DFmode move instead of DImode to avoid reformatting penalties.
1530
1531 Similarly, we play games with INTEGERSI_CLASS to use cheaper SImode moves
1532 whenever possible (the upper half does contain padding).
1533 */
1534 enum x86_64_reg_class
1535 {
1536 X86_64_NO_CLASS,
1537 X86_64_INTEGER_CLASS,
1538 X86_64_INTEGERSI_CLASS,
1539 X86_64_SSE_CLASS,
1540 X86_64_SSESF_CLASS,
1541 X86_64_SSEDF_CLASS,
1542 X86_64_SSEUP_CLASS,
1543 X86_64_X87_CLASS,
1544 X86_64_X87UP_CLASS,
1545 X86_64_COMPLEX_X87_CLASS,
1546 X86_64_MEMORY_CLASS
1547 };
1548 static const char * const x86_64_reg_class_name[] = {
1549 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1550 "sseup", "x87", "x87up", "cplx87", "no"
1551 };
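/* Classification example (a sketch assuming the usual psABI rules, not a
   statement about every corner case): a 16-byte struct such as

     struct s { double d; int i; };

   would classify its first eightbyte as X86_64_SSEDF_CLASS and its second
   as X86_64_INTEGERSI_CLASS, so 'd' is passed in an SSE register and 'i'
   in a general-purpose register, using DFmode and SImode moves as the
   comment above describes.  */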
1552
1553 #define MAX_CLASSES 4
1554
1555 /* Table of constants used by fldpi, fldln2, etc.... */
1556 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1557 static bool ext_80387_constants_init = 0;
1558 static void init_ext_80387_constants (void);
1559 static bool ix86_in_large_data_p (tree) ATTRIBUTE_UNUSED;
1560 static void ix86_encode_section_info (tree, rtx, int) ATTRIBUTE_UNUSED;
1561 static void x86_64_elf_unique_section (tree decl, int reloc) ATTRIBUTE_UNUSED;
1562 static section *x86_64_elf_select_section (tree decl, int reloc,
1563 unsigned HOST_WIDE_INT align)
1564 ATTRIBUTE_UNUSED;
1565 \f
1566 /* Initialize the GCC target structure. */
1567 #undef TARGET_ATTRIBUTE_TABLE
1568 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
1569 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
1570 # undef TARGET_MERGE_DECL_ATTRIBUTES
1571 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
1572 #endif
1573
1574 #undef TARGET_COMP_TYPE_ATTRIBUTES
1575 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
1576
1577 #undef TARGET_INIT_BUILTINS
1578 #define TARGET_INIT_BUILTINS ix86_init_builtins
1579 #undef TARGET_EXPAND_BUILTIN
1580 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
1581 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
1582 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
1583
1584 #undef TARGET_ASM_FUNCTION_EPILOGUE
1585 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
1586
1587 #undef TARGET_ENCODE_SECTION_INFO
1588 #ifndef SUBTARGET_ENCODE_SECTION_INFO
1589 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
1590 #else
1591 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
1592 #endif
1593
1594 #undef TARGET_ASM_OPEN_PAREN
1595 #define TARGET_ASM_OPEN_PAREN ""
1596 #undef TARGET_ASM_CLOSE_PAREN
1597 #define TARGET_ASM_CLOSE_PAREN ""
1598
1599 #undef TARGET_ASM_ALIGNED_HI_OP
1600 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
1601 #undef TARGET_ASM_ALIGNED_SI_OP
1602 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
1603 #ifdef ASM_QUAD
1604 #undef TARGET_ASM_ALIGNED_DI_OP
1605 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
1606 #endif
1607
1608 #undef TARGET_ASM_UNALIGNED_HI_OP
1609 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
1610 #undef TARGET_ASM_UNALIGNED_SI_OP
1611 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
1612 #undef TARGET_ASM_UNALIGNED_DI_OP
1613 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
1614
1615 #undef TARGET_SCHED_ADJUST_COST
1616 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
1617 #undef TARGET_SCHED_ISSUE_RATE
1618 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
1619 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
1620 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
1621 ia32_multipass_dfa_lookahead
1622
1623 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
1624 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
1625
1626 #ifdef HAVE_AS_TLS
1627 #undef TARGET_HAVE_TLS
1628 #define TARGET_HAVE_TLS true
1629 #endif
1630 #undef TARGET_CANNOT_FORCE_CONST_MEM
1631 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
1632 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
1633 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_rtx_true
1634
1635 #undef TARGET_DELEGITIMIZE_ADDRESS
1636 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
1637
1638 #undef TARGET_MS_BITFIELD_LAYOUT_P
1639 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
1640
1641 #if TARGET_MACHO
1642 #undef TARGET_BINDS_LOCAL_P
1643 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
1644 #endif
1645
1646 #undef TARGET_ASM_OUTPUT_MI_THUNK
1647 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
1648 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
1649 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
1650
1651 #undef TARGET_ASM_FILE_START
1652 #define TARGET_ASM_FILE_START x86_file_start
1653
1654 #undef TARGET_DEFAULT_TARGET_FLAGS
1655 #define TARGET_DEFAULT_TARGET_FLAGS \
1656 (TARGET_DEFAULT \
1657 | TARGET_64BIT_DEFAULT \
1658 | TARGET_SUBTARGET_DEFAULT \
1659 | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT)
1660
1661 #undef TARGET_HANDLE_OPTION
1662 #define TARGET_HANDLE_OPTION ix86_handle_option
1663
1664 #undef TARGET_RTX_COSTS
1665 #define TARGET_RTX_COSTS ix86_rtx_costs
1666 #undef TARGET_ADDRESS_COST
1667 #define TARGET_ADDRESS_COST ix86_address_cost
1668
1669 #undef TARGET_FIXED_CONDITION_CODE_REGS
1670 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
1671 #undef TARGET_CC_MODES_COMPATIBLE
1672 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
1673
1674 #undef TARGET_MACHINE_DEPENDENT_REORG
1675 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
1676
1677 #undef TARGET_BUILD_BUILTIN_VA_LIST
1678 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
1679
1680 #undef TARGET_MD_ASM_CLOBBERS
1681 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
1682
1683 #undef TARGET_PROMOTE_PROTOTYPES
1684 #define TARGET_PROMOTE_PROTOTYPES hook_bool_tree_true
1685 #undef TARGET_STRUCT_VALUE_RTX
1686 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
1687 #undef TARGET_SETUP_INCOMING_VARARGS
1688 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
1689 #undef TARGET_MUST_PASS_IN_STACK
1690 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
1691 #undef TARGET_PASS_BY_REFERENCE
1692 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
1693 #undef TARGET_INTERNAL_ARG_POINTER
1694 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
1695 #undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
1696 #define TARGET_DWARF_HANDLE_FRAME_UNSPEC ix86_dwarf_handle_frame_unspec
1697
1698 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
1699 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
1700
1701 #undef TARGET_SCALAR_MODE_SUPPORTED_P
1702 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
1703
1704 #undef TARGET_VECTOR_MODE_SUPPORTED_P
1705 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
1706
1707 #ifdef HAVE_AS_TLS
1708 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
1709 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
1710 #endif
1711
1712 #ifdef SUBTARGET_INSERT_ATTRIBUTES
1713 #undef TARGET_INSERT_ATTRIBUTES
1714 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
1715 #endif
1716
1717 #undef TARGET_MANGLE_FUNDAMENTAL_TYPE
1718 #define TARGET_MANGLE_FUNDAMENTAL_TYPE ix86_mangle_fundamental_type
1719
1720 #undef TARGET_STACK_PROTECT_FAIL
1721 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
1722
1723 #undef TARGET_FUNCTION_VALUE
1724 #define TARGET_FUNCTION_VALUE ix86_function_value
1725
1726 struct gcc_target targetm = TARGET_INITIALIZER;
1727
1728 \f
1729 /* The svr4 ABI for the i386 says that records and unions are returned
1730 in memory. */
1731 #ifndef DEFAULT_PCC_STRUCT_RETURN
1732 #define DEFAULT_PCC_STRUCT_RETURN 1
1733 #endif
1734
1735 /* Implement TARGET_HANDLE_OPTION. */
1736
1737 static bool
1738 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1739 {
1740 switch (code)
1741 {
1742 case OPT_m3dnow:
1743 if (!value)
1744 {
1745 target_flags &= ~MASK_3DNOW_A;
1746 target_flags_explicit |= MASK_3DNOW_A;
1747 }
1748 return true;
1749
1750 case OPT_mmmx:
1751 if (!value)
1752 {
1753 target_flags &= ~(MASK_3DNOW | MASK_3DNOW_A);
1754 target_flags_explicit |= MASK_3DNOW | MASK_3DNOW_A;
1755 }
1756 return true;
1757
1758 case OPT_msse:
1759 if (!value)
1760 {
1761 target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSE4A);
1762 target_flags_explicit |= MASK_SSE2 | MASK_SSE3 | MASK_SSE4A;
1763 }
1764 return true;
1765
1766 case OPT_msse2:
1767 if (!value)
1768 {
1769 target_flags &= ~(MASK_SSE3 | MASK_SSE4A);
1770 target_flags_explicit |= MASK_SSE3 | MASK_SSE4A;
1771 }
1772 return true;
1773
1774 case OPT_msse3:
1775 if (!value)
1776 {
1777 target_flags &= ~MASK_SSE4A;
1778 target_flags_explicit |= MASK_SSE4A;
1779 }
1780 return true;
1781
1782 default:
1783 return true;
1784 }
1785 }
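/* Illustrative effect of the mask clearing above: because each -mno-FOO
   also clears the masks of the extensions that imply FOO, a command line
   such as "-msse3 -mno-sse" (hypothetical example) ends up with none of
   the SSE family masks set once the later -mno-sse is processed, before
   the implication chain in override_options ever runs.  */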
1786
1787 /* Sometimes certain combinations of command options do not make
1788 sense on a particular target machine. You can define a macro
1789 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1790 defined, is executed once just after all the command options have
1791 been parsed.
1792
1793 Don't use this macro to turn on various extra optimizations for
1794 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
1795
1796 void
1797 override_options (void)
1798 {
1799 int i;
1800 int ix86_tune_defaulted = 0;
1801
1802 /* Comes from final.c -- no real reason to change it. */
1803 #define MAX_CODE_ALIGN 16
1804
1805 static struct ptt
1806 {
1807 const struct processor_costs *cost; /* Processor costs */
1808 const int target_enable; /* Target flags to enable. */
1809 const int target_disable; /* Target flags to disable. */
1810 const int align_loop; /* Default alignments. */
1811 const int align_loop_max_skip;
1812 const int align_jump;
1813 const int align_jump_max_skip;
1814 const int align_func;
1815 }
1816 const processor_target_table[PROCESSOR_max] =
1817 {
1818 {&i386_cost, 0, 0, 4, 3, 4, 3, 4},
1819 {&i486_cost, 0, 0, 16, 15, 16, 15, 16},
1820 {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},
1821 {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},
1822 {&geode_cost, 0, 0, 0, 0, 0, 0, 0},
1823 {&k6_cost, 0, 0, 32, 7, 32, 7, 32},
1824 {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
1825 {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
1826 {&k8_cost, 0, 0, 16, 7, 16, 7, 16},
1827 {&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
1828 {&core2_cost, 0, 0, 16, 7, 16, 7, 16},
1829 {&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
1830 {&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
1831 {&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
1832 };
1833
1834 static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
1835 static struct pta
1836 {
1837 const char *const name; /* processor name or nickname. */
1838 const enum processor_type processor;
1839 const enum pta_flags
1840 {
1841 PTA_SSE = 1,
1842 PTA_SSE2 = 2,
1843 PTA_SSE3 = 4,
1844 PTA_MMX = 8,
1845 PTA_PREFETCH_SSE = 16,
1846 PTA_3DNOW = 32,
1847 PTA_3DNOW_A = 64,
1848 PTA_64BIT = 128,
1849 PTA_SSSE3 = 256,
1850 PTA_CX16 = 512,
1851 PTA_POPCNT = 1024,
1852 PTA_ABM = 2048,
1853 PTA_SSE4A = 4096
1854 } flags;
1855 }
1856 const processor_alias_table[] =
1857 {
1858 {"i386", PROCESSOR_I386, 0},
1859 {"i486", PROCESSOR_I486, 0},
1860 {"i586", PROCESSOR_PENTIUM, 0},
1861 {"pentium", PROCESSOR_PENTIUM, 0},
1862 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
1863 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
1864 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1865 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1866 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_PREFETCH_SSE | PTA_SSE},
1867 {"i686", PROCESSOR_PENTIUMPRO, 0},
1868 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
1869 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
1870 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1871 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1872 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE | PTA_SSE2},
1873 {"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1874 | PTA_MMX | PTA_PREFETCH_SSE},
1875 {"pentium4m", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1876 | PTA_MMX | PTA_PREFETCH_SSE},
1877 {"prescott", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3
1878 | PTA_MMX | PTA_PREFETCH_SSE},
1879 {"nocona", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_64BIT
1880 | PTA_MMX | PTA_PREFETCH_SSE | PTA_CX16},
1881 {"core2", PROCESSOR_CORE2, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
1882 | PTA_64BIT | PTA_MMX
1883 | PTA_PREFETCH_SSE | PTA_CX16},
1884 {"geode", PROCESSOR_GEODE, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1885 | PTA_3DNOW_A},
1886 {"k6", PROCESSOR_K6, PTA_MMX},
1887 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1888 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1889 {"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1890 | PTA_3DNOW_A},
1891 {"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE
1892 | PTA_3DNOW | PTA_3DNOW_A},
1893 {"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1894 | PTA_3DNOW_A | PTA_SSE},
1895 {"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1896 | PTA_3DNOW_A | PTA_SSE},
1897 {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1898 | PTA_3DNOW_A | PTA_SSE},
1899 {"x86-64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_64BIT
1900 | PTA_SSE | PTA_SSE2 },
1901 {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1902 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1903 {"opteron", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1904 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1905 {"athlon64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1906 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1907 {"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1908 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2},
1909 {"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1910 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1911 | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1912 | PTA_ABM | PTA_SSE4A | PTA_CX16},
1913 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
1914 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
1915 };
1916
1917 int const pta_size = ARRAY_SIZE (processor_alias_table);
1918
1919 #ifdef SUBTARGET_OVERRIDE_OPTIONS
1920 SUBTARGET_OVERRIDE_OPTIONS;
1921 #endif
1922
1923 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
1924 SUBSUBTARGET_OVERRIDE_OPTIONS;
1925 #endif
1926
1927 /* PIC is the default for 64-bit Mach-O (Darwin); make it explicit here. */
1928 if (TARGET_MACHO && TARGET_64BIT)
1929 flag_pic = 2;
1930
1931 /* Set the default values for switches whose default depends on TARGET_64BIT
1932 in case they weren't overwritten by command line options. */
1933 if (TARGET_64BIT)
1934 {
1935 /* Mach-O doesn't support omitting the frame pointer for now. */
1936 if (flag_omit_frame_pointer == 2)
1937 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
1938 if (flag_asynchronous_unwind_tables == 2)
1939 flag_asynchronous_unwind_tables = 1;
1940 if (flag_pcc_struct_return == 2)
1941 flag_pcc_struct_return = 0;
1942 }
1943 else
1944 {
1945 if (flag_omit_frame_pointer == 2)
1946 flag_omit_frame_pointer = 0;
1947 if (flag_asynchronous_unwind_tables == 2)
1948 flag_asynchronous_unwind_tables = 0;
1949 if (flag_pcc_struct_return == 2)
1950 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
1951 }
1952
1953 /* Need to check -mtune=generic first. */
1954 if (ix86_tune_string)
1955 {
1956 if (!strcmp (ix86_tune_string, "generic")
1957 || !strcmp (ix86_tune_string, "i686")
1958 /* As special support for cross compilers we read -mtune=native
1959 as -mtune=generic. With native compilers we won't see the
1960 -mtune=native, as it was changed by the driver. */
1961 || !strcmp (ix86_tune_string, "native"))
1962 {
1963 if (TARGET_64BIT)
1964 ix86_tune_string = "generic64";
1965 else
1966 ix86_tune_string = "generic32";
1967 }
1968 else if (!strncmp (ix86_tune_string, "generic", 7))
1969 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1970 }
1971 else
1972 {
1973 if (ix86_arch_string)
1974 ix86_tune_string = ix86_arch_string;
1975 if (!ix86_tune_string)
1976 {
1977 ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
1978 ix86_tune_defaulted = 1;
1979 }
1980
1981 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
1982 need to use a sensible tune option. */
1983 if (!strcmp (ix86_tune_string, "generic")
1984 || !strcmp (ix86_tune_string, "x86-64")
1985 || !strcmp (ix86_tune_string, "i686"))
1986 {
1987 if (TARGET_64BIT)
1988 ix86_tune_string = "generic64";
1989 else
1990 ix86_tune_string = "generic32";
1991 }
1992 }
1993 if (ix86_stringop_string)
1994 {
1995 if (!strcmp (ix86_stringop_string, "rep_byte"))
1996 stringop_alg = rep_prefix_1_byte;
1997 else if (!strcmp (ix86_stringop_string, "libcall"))
1998 stringop_alg = libcall;
1999 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2000 stringop_alg = rep_prefix_4_byte;
2001 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
2002 stringop_alg = rep_prefix_8_byte;
2003 else if (!strcmp (ix86_stringop_string, "byte_loop"))
2004 stringop_alg = loop_1_byte;
2005 else if (!strcmp (ix86_stringop_string, "loop"))
2006 stringop_alg = loop;
2007 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2008 stringop_alg = unrolled_loop;
2009 else
2010 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
2011 }
2012 if (!strcmp (ix86_tune_string, "x86-64"))
2013 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
2014 "-mtune=generic instead as appropriate.");
2015
2016 if (!ix86_arch_string)
2017 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
2018 if (!strcmp (ix86_arch_string, "generic"))
2019 error ("generic CPU can be used only for -mtune= switch");
2020 if (!strncmp (ix86_arch_string, "generic", 7))
2021 error ("bad value (%s) for -march= switch", ix86_arch_string);
2022
2023 if (ix86_cmodel_string != 0)
2024 {
2025 if (!strcmp (ix86_cmodel_string, "small"))
2026 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2027 else if (!strcmp (ix86_cmodel_string, "medium"))
2028 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2029 else if (flag_pic)
2030 sorry ("code model %s not supported in PIC mode", ix86_cmodel_string);
2031 else if (!strcmp (ix86_cmodel_string, "32"))
2032 ix86_cmodel = CM_32;
2033 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2034 ix86_cmodel = CM_KERNEL;
2035 else if (!strcmp (ix86_cmodel_string, "large") && !flag_pic)
2036 ix86_cmodel = CM_LARGE;
2037 else
2038 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2039 }
2040 else
2041 {
2042 ix86_cmodel = CM_32;
2043 if (TARGET_64BIT)
2044 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2045 }
2046 if (ix86_asm_string != 0)
2047 {
2048 if (! TARGET_MACHO
2049 && !strcmp (ix86_asm_string, "intel"))
2050 ix86_asm_dialect = ASM_INTEL;
2051 else if (!strcmp (ix86_asm_string, "att"))
2052 ix86_asm_dialect = ASM_ATT;
2053 else
2054 error ("bad value (%s) for -masm= switch", ix86_asm_string);
2055 }
2056 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2057 error ("code model %qs not supported in the %s bit mode",
2058 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2059 if (ix86_cmodel == CM_LARGE)
2060 sorry ("code model %<large%> not supported yet");
2061 if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0))
2062 sorry ("%i-bit mode not compiled in",
2063 (target_flags & MASK_64BIT) ? 64 : 32);
2064
2065 for (i = 0; i < pta_size; i++)
2066 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2067 {
2068 ix86_arch = processor_alias_table[i].processor;
2069 /* Default cpu tuning to the architecture. */
2070 ix86_tune = ix86_arch;
2071 if (processor_alias_table[i].flags & PTA_MMX
2072 && !(target_flags_explicit & MASK_MMX))
2073 target_flags |= MASK_MMX;
2074 if (processor_alias_table[i].flags & PTA_3DNOW
2075 && !(target_flags_explicit & MASK_3DNOW))
2076 target_flags |= MASK_3DNOW;
2077 if (processor_alias_table[i].flags & PTA_3DNOW_A
2078 && !(target_flags_explicit & MASK_3DNOW_A))
2079 target_flags |= MASK_3DNOW_A;
2080 if (processor_alias_table[i].flags & PTA_SSE
2081 && !(target_flags_explicit & MASK_SSE))
2082 target_flags |= MASK_SSE;
2083 if (processor_alias_table[i].flags & PTA_SSE2
2084 && !(target_flags_explicit & MASK_SSE2))
2085 target_flags |= MASK_SSE2;
2086 if (processor_alias_table[i].flags & PTA_SSE3
2087 && !(target_flags_explicit & MASK_SSE3))
2088 target_flags |= MASK_SSE3;
2089 if (processor_alias_table[i].flags & PTA_SSSE3
2090 && !(target_flags_explicit & MASK_SSSE3))
2091 target_flags |= MASK_SSSE3;
2092 if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
2093 x86_prefetch_sse = true;
2094 if (processor_alias_table[i].flags & PTA_CX16)
2095 x86_cmpxchg16b = true;
2096 if (processor_alias_table[i].flags & PTA_POPCNT
2097 && !(target_flags_explicit & MASK_POPCNT))
2098 target_flags |= MASK_POPCNT;
2099 if (processor_alias_table[i].flags & PTA_ABM
2100 && !(target_flags_explicit & MASK_ABM))
2101 target_flags |= MASK_ABM;
2102 if (processor_alias_table[i].flags & PTA_SSE4A
2103 && !(target_flags_explicit & MASK_SSE4A))
2104 target_flags |= MASK_SSE4A;
2105 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2106 error ("CPU you selected does not support x86-64 "
2107 "instruction set");
2108 break;
2109 }
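  /* For illustration: with -march=k8 the table entry above carries
     PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
     | PTA_SSE2 | PTA_64BIT, so the loop enables MASK_MMX, MASK_3DNOW,
     MASK_3DNOW_A, MASK_SSE and MASK_SSE2 (unless the user set them
     explicitly) and turns on x86_prefetch_sse.  */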
2110
2111 if (i == pta_size)
2112 error ("bad value (%s) for -march= switch", ix86_arch_string);
2113
2114 for (i = 0; i < pta_size; i++)
2115 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2116 {
2117 ix86_tune = processor_alias_table[i].processor;
2118 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2119 {
2120 if (ix86_tune_defaulted)
2121 {
2122 ix86_tune_string = "x86-64";
2123 for (i = 0; i < pta_size; i++)
2124 if (! strcmp (ix86_tune_string,
2125 processor_alias_table[i].name))
2126 break;
2127 ix86_tune = processor_alias_table[i].processor;
2128 }
2129 else
2130 error ("CPU you selected does not support x86-64 "
2131 "instruction set");
2132 }
2133 /* Intel CPUs have always interpreted SSE prefetch instructions as
2134 NOPs; so, we can enable SSE prefetch instructions even when
2135 -mtune (rather than -march) points us to a processor that has them.
2136 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2137 higher processors. */
2138 if (TARGET_CMOVE && (processor_alias_table[i].flags & PTA_PREFETCH_SSE))
2139 x86_prefetch_sse = true;
2140 break;
2141 }
2142 if (i == pta_size)
2143 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2144
2145 if (optimize_size)
2146 ix86_cost = &size_cost;
2147 else
2148 ix86_cost = processor_target_table[ix86_tune].cost;
2149 target_flags |= processor_target_table[ix86_tune].target_enable;
2150 target_flags &= ~processor_target_table[ix86_tune].target_disable;
2151
2152 /* Arrange to set up i386_stack_locals for all functions. */
2153 init_machine_status = ix86_init_machine_status;
2154
2155 /* Validate -mregparm= value. */
2156 if (ix86_regparm_string)
2157 {
2158 i = atoi (ix86_regparm_string);
2159 if (i < 0 || i > REGPARM_MAX)
2160 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2161 else
2162 ix86_regparm = i;
2163 }
2164 else
2165 if (TARGET_64BIT)
2166 ix86_regparm = REGPARM_MAX;
2167
2168 /* If the user has provided any of the -malign-* options,
2169 warn and use that value only if -falign-* is not set.
2170 Remove this code in GCC 3.2 or later. */
2171 if (ix86_align_loops_string)
2172 {
2173 warning (0, "-malign-loops is obsolete, use -falign-loops");
2174 if (align_loops == 0)
2175 {
2176 i = atoi (ix86_align_loops_string);
2177 if (i < 0 || i > MAX_CODE_ALIGN)
2178 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2179 else
2180 align_loops = 1 << i;
2181 }
2182 }
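  /* Worked example: the obsolete -malign-loops=4 stores 1 << 4, i.e. a
     16-byte alignment, which has the same effect as -falign-loops=16.  */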
2183
2184 if (ix86_align_jumps_string)
2185 {
2186 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2187 if (align_jumps == 0)
2188 {
2189 i = atoi (ix86_align_jumps_string);
2190 if (i < 0 || i > MAX_CODE_ALIGN)
2191 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2192 else
2193 align_jumps = 1 << i;
2194 }
2195 }
2196
2197 if (ix86_align_funcs_string)
2198 {
2199 warning (0, "-malign-functions is obsolete, use -falign-functions");
2200 if (align_functions == 0)
2201 {
2202 i = atoi (ix86_align_funcs_string);
2203 if (i < 0 || i > MAX_CODE_ALIGN)
2204 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2205 else
2206 align_functions = 1 << i;
2207 }
2208 }
2209
2210 /* Default align_* from the processor table. */
2211 if (align_loops == 0)
2212 {
2213 align_loops = processor_target_table[ix86_tune].align_loop;
2214 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2215 }
2216 if (align_jumps == 0)
2217 {
2218 align_jumps = processor_target_table[ix86_tune].align_jump;
2219 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2220 }
2221 if (align_functions == 0)
2222 {
2223 align_functions = processor_target_table[ix86_tune].align_func;
2224 }
2225
2226 /* Validate -mbranch-cost= value, or provide default. */
2227 ix86_branch_cost = ix86_cost->branch_cost;
2228 if (ix86_branch_cost_string)
2229 {
2230 i = atoi (ix86_branch_cost_string);
2231 if (i < 0 || i > 5)
2232 error ("-mbranch-cost=%d is not between 0 and 5", i);
2233 else
2234 ix86_branch_cost = i;
2235 }
2236 if (ix86_section_threshold_string)
2237 {
2238 i = atoi (ix86_section_threshold_string);
2239 if (i < 0)
2240 error ("-mlarge-data-threshold=%d is negative", i);
2241 else
2242 ix86_section_threshold = i;
2243 }
2244
2245 if (ix86_tls_dialect_string)
2246 {
2247 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2248 ix86_tls_dialect = TLS_DIALECT_GNU;
2249 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2250 ix86_tls_dialect = TLS_DIALECT_GNU2;
2251 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2252 ix86_tls_dialect = TLS_DIALECT_SUN;
2253 else
2254 error ("bad value (%s) for -mtls-dialect= switch",
2255 ix86_tls_dialect_string);
2256 }
2257
2258 /* Keep nonleaf frame pointers. */
2259 if (flag_omit_frame_pointer)
2260 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2261 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2262 flag_omit_frame_pointer = 1;
2263
2264 /* If we're doing fast math, we don't care about comparison order
2265 wrt NaNs. This lets us use a shorter comparison sequence. */
2266 if (flag_finite_math_only)
2267 target_flags &= ~MASK_IEEE_FP;
2268
2269 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2270 since the insns won't need emulation. */
2271 if (x86_arch_always_fancy_math_387 & (1 << ix86_arch))
2272 target_flags &= ~MASK_NO_FANCY_MATH_387;
2273
2274 /* Likewise, if the target doesn't have a 387, or we've specified
2275 software floating point, don't use 387 inline intrinsics. */
2276 if (!TARGET_80387)
2277 target_flags |= MASK_NO_FANCY_MATH_387;
2278
2279 /* Turn on SSE3 builtins for -mssse3. */
2280 if (TARGET_SSSE3)
2281 target_flags |= MASK_SSE3;
2282
2283 /* Turn on SSE3 builtins for -msse4a. */
2284 if (TARGET_SSE4A)
2285 target_flags |= MASK_SSE3;
2286
2287 /* Turn on SSE2 builtins for -msse3. */
2288 if (TARGET_SSE3)
2289 target_flags |= MASK_SSE2;
2290
2291 /* Turn on SSE builtins for -msse2. */
2292 if (TARGET_SSE2)
2293 target_flags |= MASK_SSE;
2294
2295 /* Turn on MMX builtins for -msse. */
2296 if (TARGET_SSE)
2297 {
2298 target_flags |= MASK_MMX & ~target_flags_explicit;
2299 x86_prefetch_sse = true;
2300 }
2301
2302 /* Turn on MMX builtins for 3Dnow. */
2303 if (TARGET_3DNOW)
2304 target_flags |= MASK_MMX;
2305
2306 /* Turn on POPCNT builtins for -mabm. */
2307 if (TARGET_ABM)
2308 target_flags |= MASK_POPCNT;
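  /* For illustration, these implications chain downward: a plain -msse3
     therefore ends up with MASK_SSE3, MASK_SSE2, MASK_SSE and (unless
     explicitly disabled) MASK_MMX all set.  */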
2309
2310 if (TARGET_64BIT)
2311 {
2312 if (TARGET_ALIGN_DOUBLE)
2313 error ("-malign-double makes no sense in the 64bit mode");
2314 if (TARGET_RTD)
2315 error ("-mrtd calling convention not supported in the 64bit mode");
2316
2317 /* Enable by default the SSE and MMX builtins. Do allow the user to
2318 explicitly disable any of these. In particular, disabling SSE and
2319 MMX for kernel code is extremely useful. */
2320 target_flags
2321 |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | MASK_128BIT_LONG_DOUBLE)
2322 & ~target_flags_explicit);
2323 }
2324 else
2325 {
2326 /* The i386 ABI does not specify a red zone. It still makes sense to use it
2327 when the programmer takes care to keep the stack from being destroyed. */
2328 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2329 target_flags |= MASK_NO_RED_ZONE;
2330 }
2331
2332 /* Validate -mpreferred-stack-boundary= value, or provide default.
2333 The default of 128 bits is for the Pentium III's SSE __m128. We can't
2334 make it depend on optimize_size, since otherwise we couldn't mix object
2335 files compiled with -Os and -On. */
2336 ix86_preferred_stack_boundary = 128;
2337 if (ix86_preferred_stack_boundary_string)
2338 {
2339 i = atoi (ix86_preferred_stack_boundary_string);
2340 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2341 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2342 TARGET_64BIT ? 4 : 2);
2343 else
2344 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2345 }
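  /* Worked example: -mpreferred-stack-boundary=4 gives
     (1 << 4) * BITS_PER_UNIT == 128 bits, i.e. the default 16-byte
     stack alignment.  */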
2346
2347 /* Accept -msseregparm only if at least SSE support is enabled. */
2348 if (TARGET_SSEREGPARM
2349 && ! TARGET_SSE)
2350 error ("-msseregparm used without SSE enabled");
2351
2352 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2353
2354 if (ix86_fpmath_string != 0)
2355 {
2356 if (! strcmp (ix86_fpmath_string, "387"))
2357 ix86_fpmath = FPMATH_387;
2358 else if (! strcmp (ix86_fpmath_string, "sse"))
2359 {
2360 if (!TARGET_SSE)
2361 {
2362 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2363 ix86_fpmath = FPMATH_387;
2364 }
2365 else
2366 ix86_fpmath = FPMATH_SSE;
2367 }
2368 else if (! strcmp (ix86_fpmath_string, "387,sse")
2369 || ! strcmp (ix86_fpmath_string, "sse,387"))
2370 {
2371 if (!TARGET_SSE)
2372 {
2373 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2374 ix86_fpmath = FPMATH_387;
2375 }
2376 else if (!TARGET_80387)
2377 {
2378 warning (0, "387 instruction set disabled, using SSE arithmetics");
2379 ix86_fpmath = FPMATH_SSE;
2380 }
2381 else
2382 ix86_fpmath = FPMATH_SSE | FPMATH_387;
2383 }
2384 else
2385 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2386 }
2387
2388 /* If the i387 is disabled, then do not return values in it. */
2389 if (!TARGET_80387)
2390 target_flags &= ~MASK_FLOAT_RETURNS;
2391
2392 if ((x86_accumulate_outgoing_args & TUNEMASK)
2393 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2394 && !optimize_size)
2395 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2396
2397 /* ??? Unwind info is not correct around the CFG unless either a frame
2398 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2399 unwind info generation to be aware of the CFG and propagating states
2400 around edges. */
2401 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2402 || flag_exceptions || flag_non_call_exceptions)
2403 && flag_omit_frame_pointer
2404 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2405 {
2406 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2407 warning (0, "unwind tables currently require either a frame pointer "
2408 "or -maccumulate-outgoing-args for correctness");
2409 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2410 }
2411
2412 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2413 {
2414 char *p;
2415 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2416 p = strchr (internal_label_prefix, 'X');
2417 internal_label_prefix_len = p - internal_label_prefix;
2418 *p = '\0';
2419 }
2420
2421 /* When no scheduling description is available, disable the scheduler pass
2422 so it won't slow down compilation or make x87 code slower. */
2423 if (!TARGET_SCHEDULE)
2424 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2425
2426 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2427 set_param_value ("simultaneous-prefetches",
2428 ix86_cost->simultaneous_prefetches);
2429 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2430 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2431 }
2432 \f
2433 /* Switch to the appropriate section for output of DECL.
2434 DECL is either a `VAR_DECL' node or a constant of some sort.
2435 RELOC indicates whether forming the initial value of DECL requires
2436 link-time relocations. */
2437
2438 static section *
2439 x86_64_elf_select_section (tree decl, int reloc,
2440 unsigned HOST_WIDE_INT align)
2441 {
2442 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2443 && ix86_in_large_data_p (decl))
2444 {
2445 const char *sname = NULL;
2446 unsigned int flags = SECTION_WRITE;
2447 switch (categorize_decl_for_section (decl, reloc, flag_pic))
2448 {
2449 case SECCAT_DATA:
2450 sname = ".ldata";
2451 break;
2452 case SECCAT_DATA_REL:
2453 sname = ".ldata.rel";
2454 break;
2455 case SECCAT_DATA_REL_LOCAL:
2456 sname = ".ldata.rel.local";
2457 break;
2458 case SECCAT_DATA_REL_RO:
2459 sname = ".ldata.rel.ro";
2460 break;
2461 case SECCAT_DATA_REL_RO_LOCAL:
2462 sname = ".ldata.rel.ro.local";
2463 break;
2464 case SECCAT_BSS:
2465 sname = ".lbss";
2466 flags |= SECTION_BSS;
2467 break;
2468 case SECCAT_RODATA:
2469 case SECCAT_RODATA_MERGE_STR:
2470 case SECCAT_RODATA_MERGE_STR_INIT:
2471 case SECCAT_RODATA_MERGE_CONST:
2472 sname = ".lrodata";
2473 flags = 0;
2474 break;
2475 case SECCAT_SRODATA:
2476 case SECCAT_SDATA:
2477 case SECCAT_SBSS:
2478 gcc_unreachable ();
2479 case SECCAT_TEXT:
2480 case SECCAT_TDATA:
2481 case SECCAT_TBSS:
2482 /* We don't split these for the medium model. Place them into
2483 default sections and hope for the best. */
2484 break;
2485 }
2486 if (sname)
2487 {
2488 /* We might get called with string constants, but get_named_section
2489 doesn't like them as they are not DECLs. Also, we need to set
2490 flags in that case. */
2491 if (!DECL_P (decl))
2492 return get_section (sname, flags, NULL);
2493 return get_named_section (decl, sname, reloc);
2494 }
2495 }
2496 return default_elf_select_section (decl, reloc, align);
2497 }
2498
2499 /* Build up a unique section name, expressed as a
2500 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2501 RELOC indicates whether the initial value of EXP requires
2502 link-time relocations. */
2503
2504 static void
2505 x86_64_elf_unique_section (tree decl, int reloc)
2506 {
2507 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2508 && ix86_in_large_data_p (decl))
2509 {
2510 const char *prefix = NULL;
2511 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2512 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2513
2514 switch (categorize_decl_for_section (decl, reloc, flag_pic))
2515 {
2516 case SECCAT_DATA:
2517 case SECCAT_DATA_REL:
2518 case SECCAT_DATA_REL_LOCAL:
2519 case SECCAT_DATA_REL_RO:
2520 case SECCAT_DATA_REL_RO_LOCAL:
2521 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2522 break;
2523 case SECCAT_BSS:
2524 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2525 break;
2526 case SECCAT_RODATA:
2527 case SECCAT_RODATA_MERGE_STR:
2528 case SECCAT_RODATA_MERGE_STR_INIT:
2529 case SECCAT_RODATA_MERGE_CONST:
2530 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2531 break;
2532 case SECCAT_SRODATA:
2533 case SECCAT_SDATA:
2534 case SECCAT_SBSS:
2535 gcc_unreachable ();
2536 case SECCAT_TEXT:
2537 case SECCAT_TDATA:
2538 case SECCAT_TBSS:
2539 /* We don't split these for the medium model. Place them into
2540 default sections and hope for the best. */
2541 break;
2542 }
2543 if (prefix)
2544 {
2545 const char *name;
2546 size_t nlen, plen;
2547 char *string;
2548 plen = strlen (prefix);
2549
2550 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2551 name = targetm.strip_name_encoding (name);
2552 nlen = strlen (name);
2553
2554 string = alloca (nlen + plen + 1);
2555 memcpy (string, prefix, plen);
2556 memcpy (string + plen, name, nlen + 1);
2557
2558 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2559 return;
2560 }
2561 }
2562 default_unique_section (decl, reloc);
2563 }
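/* Naming example (hypothetical identifier): a large-data variable
   "big_table" categorized as SECCAT_DATA gets the section name
   ".ldata.big_table", or ".gnu.linkonce.ld.big_table" when it is
   one-only and COMDAT groups are unavailable.  */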
2564
2565 #ifdef COMMON_ASM_OP
2566 /* This says how to output assembler code to declare an
2567 uninitialized external linkage data object.
2568
2569 For the medium model on x86-64 we need to use the .largecomm directive
2570 for large objects. */
2571 void
2572 x86_elf_aligned_common (FILE *file,
2573 const char *name, unsigned HOST_WIDE_INT size,
2574 int align)
2575 {
2576 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2577 && size > (unsigned int)ix86_section_threshold)
2578 fprintf (file, ".largecomm\t");
2579 else
2580 fprintf (file, "%s", COMMON_ASM_OP);
2581 assemble_name (file, name);
2582 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2583 size, align / BITS_PER_UNIT);
2584 }
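/* Example of the output (hypothetical symbol): for a 400000-byte object
   "big_array" aligned to 256 bits under -mcmodel=medium, this emits
   something like

     .largecomm	big_array,400000,32

   since the alignment is printed in bytes.  */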
2585 #endif
2586 /* Utility function for targets to use in implementing
2587 ASM_OUTPUT_ALIGNED_BSS. */
2588
2589 void
2590 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2591 const char *name, unsigned HOST_WIDE_INT size,
2592 int align)
2593 {
2594 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2595 && size > (unsigned int)ix86_section_threshold)
2596 switch_to_section (get_named_section (decl, ".lbss", 0));
2597 else
2598 switch_to_section (bss_section);
2599 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2600 #ifdef ASM_DECLARE_OBJECT_NAME
2601 last_assemble_variable_decl = decl;
2602 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2603 #else
2604 /* Standard thing is just output label for the object. */
2605 ASM_OUTPUT_LABEL (file, name);
2606 #endif /* ASM_DECLARE_OBJECT_NAME */
2607 ASM_OUTPUT_SKIP (file, size ? size : 1);
2608 }
2609 \f
2610 void
2611 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2612 {
2613 /* For -O2 and beyond, turn off -fschedule-insns by default. It tends to
2614 make the register-pressure problem even worse. */
2615 #ifdef INSN_SCHEDULING
2616 if (level > 1)
2617 flag_schedule_insns = 0;
2618 #endif
2619
2620 if (TARGET_MACHO)
2621 /* The Darwin libraries never set errno, so we might as well
2622 avoid calling them when that's the only reason we would. */
2623 flag_errno_math = 0;
2624
2625 /* The default values of these switches depend on TARGET_64BIT, which is
2626 not known at this moment. Mark these values with 2 and let the user
2627 override them. If no command line option specifies them, we will set
2628 the defaults in override_options. */
2629 if (optimize >= 1)
2630 flag_omit_frame_pointer = 2;
2631 flag_pcc_struct_return = 2;
2632 flag_asynchronous_unwind_tables = 2;
2633 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2634 SUBTARGET_OPTIMIZATION_OPTIONS;
2635 #endif
2636 }
2637 \f
2638 /* Table of valid machine attributes. */
2639 const struct attribute_spec ix86_attribute_table[] =
2640 {
2641 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
2642 /* Stdcall attribute says callee is responsible for popping arguments
2643 if they are not variable. */
2644 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2645 /* Fastcall attribute says callee is responsible for popping arguments
2646 if they are not variable. */
2647 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2648 /* Cdecl attribute says the callee is a normal C declaration */
2649 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2650 /* Regparm attribute specifies how many integer arguments are to be
2651 passed in registers. */
2652 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute },
2653 /* Sseregparm attribute says we are using x86_64 calling conventions
2654 for FP arguments. */
2655 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute },
2656 /* force_align_arg_pointer says this function realigns the stack at entry. */
2657 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
2658 false, true, true, ix86_handle_cconv_attribute },
2659 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
2660 { "dllimport", 0, 0, false, false, false, handle_dll_attribute },
2661 { "dllexport", 0, 0, false, false, false, handle_dll_attribute },
2662 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute },
2663 #endif
2664 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute },
2665 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute },
2666 #ifdef SUBTARGET_ATTRIBUTE_TABLE
2667 SUBTARGET_ATTRIBUTE_TABLE,
2668 #endif
2669 { NULL, 0, 0, false, false, false, NULL }
2670 };
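/* Usage sketch (hypothetical declarations) showing how user code requests
   these conventions; the handlers below diagnose invalid combinations:

     int  __attribute__((regparm(3))) f (int a, int b, int c);
     int  __attribute__((fastcall))   g (int a, int b);
     void __attribute__((force_align_arg_pointer)) h (void);  */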
2671
2672 /* Decide whether we can make a sibling call to a function. DECL is the
2673 declaration of the function being targeted by the call and EXP is the
2674 CALL_EXPR representing the call. */
2675
2676 static bool
2677 ix86_function_ok_for_sibcall (tree decl, tree exp)
2678 {
2679 tree func;
2680 rtx a, b;
2681
2682 /* If we are generating position-independent code, we cannot sibcall
2683 optimize any indirect call, or a direct call to a global function,
2684 as the PLT requires %ebx be live. */
2685 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
2686 return false;
2687
2688 if (decl)
2689 func = decl;
2690 else
2691 {
2692 func = TREE_TYPE (TREE_OPERAND (exp, 0));
2693 if (POINTER_TYPE_P (func))
2694 func = TREE_TYPE (func);
2695 }
2696
2697 /* Check that the return value locations are the same. For example,
2698 if we are returning floats on the 80387 register stack, we cannot
2699 make a sibcall from a function that doesn't return a float to a
2700 function that does or, conversely, from a function that does return
2701 a float to a function that doesn't; the necessary stack adjustment
2702 would not be executed. This is also the place we notice
2703 differences in the return value ABI. Note that it is ok for one
2704 of the functions to have void return type as long as the return
2705 value of the other is passed in a register. */
2706 a = ix86_function_value (TREE_TYPE (exp), func, false);
2707 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
2708 cfun->decl, false);
2709 if (STACK_REG_P (a) || STACK_REG_P (b))
2710 {
2711 if (!rtx_equal_p (a, b))
2712 return false;
2713 }
2714 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2715 ;
2716 else if (!rtx_equal_p (a, b))
2717 return false;
2718
2719 /* If this call is indirect, we'll need to be able to use a call-clobbered
2720 register for the address of the target function. Make sure that all
2721 such registers are not used for passing parameters. */
2722 if (!decl && !TARGET_64BIT)
2723 {
2724 tree type;
2725
2726 /* We're looking at the CALL_EXPR, we need the type of the function. */
2727 type = TREE_OPERAND (exp, 0); /* pointer expression */
2728 type = TREE_TYPE (type); /* pointer type */
2729 type = TREE_TYPE (type); /* function type */
2730
2731 if (ix86_function_regparm (type, NULL) >= 3)
2732 {
2733 /* ??? Need to count the actual number of registers to be used,
2734 not the possible number of registers. Fix later. */
2735 return false;
2736 }
2737 }
2738
2739 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
2740 /* Dllimport'd functions are also called indirectly. */
2741 if (decl && DECL_DLLIMPORT_P (decl)
2742 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
2743 return false;
2744 #endif
2745
2746 /* If we force-aligned the stack, then sibcalling would unalign the
2747 stack, which may break the called function. */
2748 if (cfun->machine->force_align_arg_pointer)
2749 return false;
2750
2751 /* Otherwise okay. That also includes certain types of indirect calls. */
2752 return true;
2753 }
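/* Two illustrative rejections, both following directly from the checks
   above: under -m32 -fpic an indirect call (or a call to a non-local
   function) is not sibcall-optimized because the PLT needs %ebx live, and
   a caller returning int cannot sibcall a callee returning float on the
   80387 stack because %st(0) would be left unpopped.  */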
2754
2755 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
2756 calling convention attributes;
2757 arguments as in struct attribute_spec.handler. */
2758
2759 static tree
2760 ix86_handle_cconv_attribute (tree *node, tree name,
2761 tree args,
2762 int flags ATTRIBUTE_UNUSED,
2763 bool *no_add_attrs)
2764 {
2765 if (TREE_CODE (*node) != FUNCTION_TYPE
2766 && TREE_CODE (*node) != METHOD_TYPE
2767 && TREE_CODE (*node) != FIELD_DECL
2768 && TREE_CODE (*node) != TYPE_DECL)
2769 {
2770 warning (OPT_Wattributes, "%qs attribute only applies to functions",
2771 IDENTIFIER_POINTER (name));
2772 *no_add_attrs = true;
2773 return NULL_TREE;
2774 }
2775
2776 /* Can combine regparm with all attributes but fastcall. */
2777 if (is_attribute_p ("regparm", name))
2778 {
2779 tree cst;
2780
2781 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2782 {
2783 error ("fastcall and regparm attributes are not compatible");
2784 }
2785
2786 cst = TREE_VALUE (args);
2787 if (TREE_CODE (cst) != INTEGER_CST)
2788 {
2789 warning (OPT_Wattributes,
2790 "%qs attribute requires an integer constant argument",
2791 IDENTIFIER_POINTER (name));
2792 *no_add_attrs = true;
2793 }
2794 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
2795 {
2796 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
2797 IDENTIFIER_POINTER (name), REGPARM_MAX);
2798 *no_add_attrs = true;
2799 }
2800
2801 if (!TARGET_64BIT
2802 && lookup_attribute (ix86_force_align_arg_pointer_string,
2803 TYPE_ATTRIBUTES (*node))
2804 && compare_tree_int (cst, REGPARM_MAX-1))
2805 {
2806 error ("%s functions limited to %d register parameters",
2807 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
2808 }
2809
2810 return NULL_TREE;
2811 }
2812
2813 if (TARGET_64BIT)
2814 {
2815 warning (OPT_Wattributes, "%qs attribute ignored",
2816 IDENTIFIER_POINTER (name));
2817 *no_add_attrs = true;
2818 return NULL_TREE;
2819 }
2820
2821 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
2822 if (is_attribute_p ("fastcall", name))
2823 {
2824 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2825 {
2826 error ("fastcall and cdecl attributes are not compatible");
2827 }
2828 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2829 {
2830 error ("fastcall and stdcall attributes are not compatible");
2831 }
2832 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
2833 {
2834 error ("fastcall and regparm attributes are not compatible");
2835 }
2836 }
2837
2838 /* Can combine stdcall with fastcall (redundant), regparm and
2839 sseregparm. */
2840 else if (is_attribute_p ("stdcall", name))
2841 {
2842 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2843 {
2844 error ("stdcall and cdecl attributes are not compatible");
2845 }
2846 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2847 {
2848 error ("stdcall and fastcall attributes are not compatible");
2849 }
2850 }
2851
2852 /* Can combine cdecl with regparm and sseregparm. */
2853 else if (is_attribute_p ("cdecl", name))
2854 {
2855 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2856 {
2857 error ("stdcall and cdecl attributes are not compatible");
2858 }
2859 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2860 {
2861 error ("fastcall and cdecl attributes are not compatible");
2862 }
2863 }
2864
2865 /* Can combine sseregparm with all attributes. */
2866
2867 return NULL_TREE;
2868 }
2869
2870 /* Return 0 if the attributes for two types are incompatible, 1 if they
2871 are compatible, and 2 if they are nearly compatible (which causes a
2872 warning to be generated). */
2873
2874 static int
2875 ix86_comp_type_attributes (tree type1, tree type2)
2876 {
2877 /* Check for mismatch of non-default calling convention. */
2878 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
2879
2880 if (TREE_CODE (type1) != FUNCTION_TYPE)
2881 return 1;
2882
2883 /* Check for mismatched fastcall/regparm types. */
2884 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
2885 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
2886 || (ix86_function_regparm (type1, NULL)
2887 != ix86_function_regparm (type2, NULL)))
2888 return 0;
2889
2890 /* Check for mismatched sseregparm types. */
2891 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
2892 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
2893 return 0;
2894
2895 /* Check for mismatched return types (cdecl vs stdcall). */
2896 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
2897 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
2898 return 0;
2899
2900 return 1;
2901 }
2902 \f
2903 /* Return the regparm value for a function with the indicated TYPE and DECL.
2904 DECL may be NULL when calling function indirectly
2905 or considering a libcall. */
2906
2907 static int
2908 ix86_function_regparm (tree type, tree decl)
2909 {
2910 tree attr;
2911 int regparm = ix86_regparm;
2912 bool user_convention = false;
2913
2914 if (!TARGET_64BIT)
2915 {
2916 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
2917 if (attr)
2918 {
2919 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
2920 user_convention = true;
2921 }
2922
2923 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
2924 {
2925 regparm = 2;
2926 user_convention = true;
2927 }
2928
2929 /* Use register calling convention for local functions when possible. */
2930 if (!TARGET_64BIT && !user_convention && decl
2931 && flag_unit_at_a_time && !profile_flag)
2932 {
2933 struct cgraph_local_info *i = cgraph_local_info (decl);
2934 if (i && i->local)
2935 {
2936 int local_regparm, globals = 0, regno;
2937
2938 /* Make sure no regparm register is taken by a global register
2939 variable. */
2940 for (local_regparm = 0; local_regparm < 3; local_regparm++)
2941 if (global_regs[local_regparm])
2942 break;
2943 /* We can't use regparm(3) for nested functions as these use the
2944 static chain pointer in the third argument register. */
2945 if (local_regparm == 3
2946 && decl_function_context (decl)
2947 && !DECL_NO_STATIC_CHAIN (decl))
2948 local_regparm = 2;
2949 /* If the function realigns its stack pointer, the
2950 prologue will clobber %ecx. If we've already
2951 generated code for the callee, the callee's
2952 DECL_STRUCT_FUNCTION is gone, so we fall back to
2953 scanning the attributes for the self-realigning
2954 property. */
2955 if ((DECL_STRUCT_FUNCTION (decl)
2956 && DECL_STRUCT_FUNCTION (decl)->machine->force_align_arg_pointer)
2957 || (!DECL_STRUCT_FUNCTION (decl)
2958 && lookup_attribute (ix86_force_align_arg_pointer_string,
2959 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
2960 local_regparm = 2;
2961 /* Each global register variable increases register pressure,
2962 so the more global reg vars there are, the less useful the
2963 regparm optimization is, unless requested by the user explicitly. */
2964 for (regno = 0; regno < 6; regno++)
2965 if (global_regs[regno])
2966 globals++;
2967 local_regparm
2968 = globals < local_regparm ? local_regparm - globals : 0;
2969
2970 if (local_regparm > regparm)
2971 regparm = local_regparm;
2972 }
2973 }
2974 }
2975 return regparm;
2976 }
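/* Illustrative example (a hypothetical declaration, not from this file):
   with a prototype such as

     int __attribute__((regparm(3))) add3 (int a, int b, int c);

   the routine above returns 3; a fastcall type yields 2; and a purely local
   static function compiled with -funit-at-a-time may be promoted
   automatically, with the count reduced again for nested functions,
   self-realigning callees or global register variables.  */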
2977
2978 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
2979 DFmode (2) arguments in SSE registers for a function with the
2980 indicated TYPE and DECL. DECL may be NULL when calling function
2981 indirectly or considering a libcall. Otherwise return 0. */
2982
2983 static int
2984 ix86_function_sseregparm (tree type, tree decl)
2985 {
2986 /* Use SSE registers to pass SFmode and DFmode arguments if requested
2987 by the sseregparm attribute. */
2988 if (TARGET_SSEREGPARM
2989 || (type
2990 && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
2991 {
2992 if (!TARGET_SSE)
2993 {
2994 if (decl)
2995 error ("Calling %qD with attribute sseregparm without "
2996 "SSE/SSE2 enabled", decl);
2997 else
2998 error ("Calling %qT with attribute sseregparm without "
2999 "SSE/SSE2 enabled", type);
3000 return 0;
3001 }
3002
3003 return 2;
3004 }
3005
3006 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
3007 (and DFmode for SSE2) arguments in SSE registers,
3008 even for 32-bit targets. */
3009 if (!TARGET_64BIT && decl
3010 && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
3011 {
3012 struct cgraph_local_info *i = cgraph_local_info (decl);
3013 if (i && i->local)
3014 return TARGET_SSE2 ? 2 : 1;
3015 }
3016
3017 return 0;
3018 }
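/* Illustrative example (a hypothetical declaration, not from this file):

     double __attribute__((sseregparm)) scale (double x);

   makes this routine return 2 when SSE is enabled, so SFmode and DFmode
   arguments travel in SSE registers; without -msse the declaration is
   diagnosed instead and 0 is returned.  */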
3019
3020 /* Return true if EAX is live at the start of the function. Used by
3021 ix86_expand_prologue to determine if we need special help before
3022 calling allocate_stack_worker. */
3023
3024 static bool
3025 ix86_eax_live_at_start_p (void)
3026 {
3027 /* Cheat. Don't bother working forward from ix86_function_regparm
3028 to the function type to whether an actual argument is located in
3029 eax. Instead just look at cfg info, which is still close enough
3030 to correct at this point. This gives false positives for broken
3031 functions that might use uninitialized data that happens to be
3032 allocated in eax, but who cares? */
3033 return REGNO_REG_SET_P (ENTRY_BLOCK_PTR->il.rtl->global_live_at_end, 0);
3034 }
3035
3036 /* Value is the number of bytes of arguments automatically
3037 popped when returning from a subroutine call.
3038 FUNDECL is the declaration node of the function (as a tree),
3039 FUNTYPE is the data type of the function (as a tree),
3040 or for a library call it is an identifier node for the subroutine name.
3041 SIZE is the number of bytes of arguments passed on the stack.
3042
3043 On the 80386, the RTD insn may be used to pop them if the number
3044 of args is fixed, but if the number is variable then the caller
3045 must pop them all. RTD can't be used for library calls now
3046 because the library is compiled with the Unix compiler.
3047 Use of RTD is a selectable option, since it is incompatible with
3048 standard Unix calling sequences. If the option is not selected,
3049 the caller must always pop the args.
3050
3051 The attribute stdcall is equivalent to RTD on a per module basis. */
3052
3053 int
3054 ix86_return_pops_args (tree fundecl, tree funtype, int size)
3055 {
3056 int rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE);
3057
3058 /* Cdecl functions override -mrtd, and never pop the stack. */
3059 if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype))) {
3060
3061 /* Stdcall and fastcall functions will pop the stack if not
3062 variable args. */
3063 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
3064 || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
3065 rtd = 1;
3066
3067 if (rtd
3068 && (TYPE_ARG_TYPES (funtype) == NULL_TREE
3069 || (TREE_VALUE (tree_last (TYPE_ARG_TYPES (funtype)))
3070 == void_type_node)))
3071 return size;
3072 }
3073
3074 /* Lose any fake structure return argument if it is passed on the stack. */
3075 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
3076 && !TARGET_64BIT
3077 && !KEEP_AGGREGATE_RETURN_POINTER)
3078 {
3079 int nregs = ix86_function_regparm (funtype, fundecl);
3080
3081 if (!nregs)
3082 return GET_MODE_SIZE (Pmode);
3083 }
3084
3085 return 0;
3086 }
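/* Illustrative example (a hypothetical declaration, not from this file): for

     void __attribute__((stdcall)) callback (int a, int b);

   ix86_return_pops_args returns 8, so the callee pops its two 4-byte
   arguments itself; give the same function a trailing ellipsis and the
   argument list is no longer fixed, so 0 is returned and the caller pops
   the arguments.  */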
3087 \f
3088 /* Argument support functions. */
3089
3090 /* Return true when register may be used to pass function parameters. */
3091 bool
3092 ix86_function_arg_regno_p (int regno)
3093 {
3094 int i;
3095 if (!TARGET_64BIT)
3096 {
3097 if (TARGET_MACHO)
3098 return (regno < REGPARM_MAX
3099 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
3100 else
3101 return (regno < REGPARM_MAX
3102 || (TARGET_MMX && MMX_REGNO_P (regno)
3103 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
3104 || (TARGET_SSE && SSE_REGNO_P (regno)
3105 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
3106 }
3107
3108 if (TARGET_MACHO)
3109 {
3110 if (SSE_REGNO_P (regno) && TARGET_SSE)
3111 return true;
3112 }
3113 else
3114 {
3115 if (TARGET_SSE && SSE_REGNO_P (regno)
3116 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
3117 return true;
3118 }
3119 /* RAX is used as hidden argument to va_arg functions. */
3120 if (!regno)
3121 return true;
3122 for (i = 0; i < REGPARM_MAX; i++)
3123 if (regno == x86_64_int_parameter_registers[i])
3124 return true;
3125 return false;
3126 }
3127
3128 /* Return true if we do not know how to pass TYPE solely in registers. */
3129
3130 static bool
3131 ix86_must_pass_in_stack (enum machine_mode mode, tree type)
3132 {
3133 if (must_pass_in_stack_var_size_or_pad (mode, type))
3134 return true;
3135
3136 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
3137 The layout_type routine is crafty and tries to trick us into passing
3138 currently unsupported vector types on the stack by using TImode. */
3139 return (!TARGET_64BIT && mode == TImode
3140 && type && TREE_CODE (type) != VECTOR_TYPE);
3141 }
3142
3143 /* Initialize a variable CUM of type CUMULATIVE_ARGS
3144 for a call to a function whose data type is FNTYPE.
3145 For a library call, FNTYPE is 0. */
3146
3147 void
3148 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
3149 tree fntype, /* tree ptr for function decl */
3150 rtx libname, /* SYMBOL_REF of library name or 0 */
3151 tree fndecl)
3152 {
3153 static CUMULATIVE_ARGS zero_cum;
3154 tree param, next_param;
3155
3156 if (TARGET_DEBUG_ARG)
3157 {
3158 fprintf (stderr, "\ninit_cumulative_args (");
3159 if (fntype)
3160 fprintf (stderr, "fntype code = %s, ret code = %s",
3161 tree_code_name[(int) TREE_CODE (fntype)],
3162 tree_code_name[(int) TREE_CODE (TREE_TYPE (fntype))]);
3163 else
3164 fprintf (stderr, "no fntype");
3165
3166 if (libname)
3167 fprintf (stderr, ", libname = %s", XSTR (libname, 0));
3168 }
3169
3170 *cum = zero_cum;
3171
3172 /* Set up the number of registers to use for passing arguments. */
3173 cum->nregs = ix86_regparm;
3174 if (TARGET_SSE)
3175 cum->sse_nregs = SSE_REGPARM_MAX;
3176 if (TARGET_MMX)
3177 cum->mmx_nregs = MMX_REGPARM_MAX;
3178 cum->warn_sse = true;
3179 cum->warn_mmx = true;
3180 cum->maybe_vaarg = false;
3181
3182 /* Use ecx and edx registers if function has fastcall attribute,
3183 else look for regparm information. */
3184 if (fntype && !TARGET_64BIT)
3185 {
3186 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
3187 {
3188 cum->nregs = 2;
3189 cum->fastcall = 1;
3190 }
3191 else
3192 cum->nregs = ix86_function_regparm (fntype, fndecl);
3193 }
3194
3195 /* Set up the number of SSE registers used for passing SFmode
3196 and DFmode arguments. Warn for mismatching ABI. */
3197 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
3198
3199 /* Determine if this function has variable arguments. This is
3200 indicated by the last argument being 'void_type_node' if there
3201 are no variable arguments. If there are variable arguments, then
3202 we won't pass anything in registers in 32-bit mode. */
3203
3204 if (cum->nregs || cum->mmx_nregs || cum->sse_nregs)
3205 {
3206 for (param = (fntype) ? TYPE_ARG_TYPES (fntype) : 0;
3207 param != 0; param = next_param)
3208 {
3209 next_param = TREE_CHAIN (param);
3210 if (next_param == 0 && TREE_VALUE (param) != void_type_node)
3211 {
3212 if (!TARGET_64BIT)
3213 {
3214 cum->nregs = 0;
3215 cum->sse_nregs = 0;
3216 cum->mmx_nregs = 0;
3217 cum->warn_sse = 0;
3218 cum->warn_mmx = 0;
3219 cum->fastcall = 0;
3220 cum->float_in_sse = 0;
3221 }
3222 cum->maybe_vaarg = true;
3223 }
3224 }
3225 }
3226 if ((!fntype && !libname)
3227 || (fntype && !TYPE_ARG_TYPES (fntype)))
3228 cum->maybe_vaarg = true;
3229
3230 if (TARGET_DEBUG_ARG)
3231 fprintf (stderr, ", nregs=%d )\n", cum->nregs);
3232
3233 return;
3234 }
3235
3236 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
3237 But in the case of vector types, it is some vector mode.
3238
3239 When we have only some of our vector isa extensions enabled, then there
3240 are some modes for which vector_mode_supported_p is false. For these
3241 modes, the generic vector support in gcc will choose some non-vector mode
3242 in order to implement the type. By computing the natural mode, we'll
3243 select the proper ABI location for the operand and not depend on whatever
3244 the middle-end decides to do with these vector types. */
3245
3246 static enum machine_mode
3247 type_natural_mode (tree type)
3248 {
3249 enum machine_mode mode = TYPE_MODE (type);
3250
3251 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
3252 {
3253 HOST_WIDE_INT size = int_size_in_bytes (type);
3254 if ((size == 8 || size == 16)
3255 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
3256 && TYPE_VECTOR_SUBPARTS (type) > 1)
3257 {
3258 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
3259
3260 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
3261 mode = MIN_MODE_VECTOR_FLOAT;
3262 else
3263 mode = MIN_MODE_VECTOR_INT;
3264
3265 /* Get the mode which has this inner mode and number of units. */
3266 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
3267 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
3268 && GET_MODE_INNER (mode) == innermode)
3269 return mode;
3270
3271 gcc_unreachable ();
3272 }
3273 }
3274
3275 return mode;
3276 }
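/* Illustrative example (a hypothetical type, not from this file): for

     typedef int v4si __attribute__((vector_size (16)));

   type_natural_mode returns V4SImode even when SSE is disabled and the
   middle-end has therefore given the type a non-vector mode, so the ABI
   location of such an argument does not depend on which ISA extensions
   happen to be enabled.  */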
3277
3278 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
3279 this may not agree with the mode that the type system has chosen for the
3280 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
3281 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
3282
3283 static rtx
3284 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
3285 unsigned int regno)
3286 {
3287 rtx tmp;
3288
3289 if (orig_mode != BLKmode)
3290 tmp = gen_rtx_REG (orig_mode, regno);
3291 else
3292 {
3293 tmp = gen_rtx_REG (mode, regno);
3294 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
3295 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
3296 }
3297
3298 return tmp;
3299 }
3300
3301 /* x86-64 register passing implementation. See the x86-64 ABI for details. The
3302 goal of this code is to classify each 8 bytes of the incoming argument by
3303 register class and assign registers accordingly. */
3304
3305 /* Return the union class of CLASS1 and CLASS2.
3306 See the x86-64 PS ABI for details. */
3307
3308 static enum x86_64_reg_class
3309 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
3310 {
3311 /* Rule #1: If both classes are equal, this is the resulting class. */
3312 if (class1 == class2)
3313 return class1;
3314
3315 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
3316 the other class. */
3317 if (class1 == X86_64_NO_CLASS)
3318 return class2;
3319 if (class2 == X86_64_NO_CLASS)
3320 return class1;
3321
3322 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
3323 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
3324 return X86_64_MEMORY_CLASS;
3325
3326 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
3327 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
3328 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
3329 return X86_64_INTEGERSI_CLASS;
3330 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
3331 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
3332 return X86_64_INTEGER_CLASS;
3333
3334 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
3335 MEMORY is used. */
3336 if (class1 == X86_64_X87_CLASS
3337 || class1 == X86_64_X87UP_CLASS
3338 || class1 == X86_64_COMPLEX_X87_CLASS
3339 || class2 == X86_64_X87_CLASS
3340 || class2 == X86_64_X87UP_CLASS
3341 || class2 == X86_64_COMPLEX_X87_CLASS)
3342 return X86_64_MEMORY_CLASS;
3343
3344 /* Rule #6: Otherwise class SSE is used. */
3345 return X86_64_SSE_CLASS;
3346 }
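/* A worked example of the rules above (illustrative sketch): INTEGERSI
   merged with SSESF gives INTEGERSI (rule #4), INTEGER merged with SSE
   gives INTEGER (rule #4), X87 merged with SSE gives MEMORY (rule #5), and
   SSESF merged with SSEDF falls through to rule #6 and gives SSE.  */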
3347
3348 /* Classify the argument of type TYPE and mode MODE.
3349 CLASSES will be filled by the register class used to pass each word
3350 of the operand. The number of words is returned. In case the parameter
3351 should be passed in memory, 0 is returned. As a special case for zero
3352 sized containers, classes[0] will be NO_CLASS and 1 is returned.
3353
3354 BIT_OFFSET is used internally for handling records and specifies the
3355 offset in bits modulo 256 to avoid overflow cases.
3356
3357 See the x86-64 PS ABI for details.
3358 */
3359
3360 static int
3361 classify_argument (enum machine_mode mode, tree type,
3362 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
3363 {
3364 HOST_WIDE_INT bytes =
3365 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3366 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3367
3368 /* Variable sized entities are always passed/returned in memory. */
3369 if (bytes < 0)
3370 return 0;
3371
3372 if (mode != VOIDmode
3373 && targetm.calls.must_pass_in_stack (mode, type))
3374 return 0;
3375
3376 if (type && AGGREGATE_TYPE_P (type))
3377 {
3378 int i;
3379 tree field;
3380 enum x86_64_reg_class subclasses[MAX_CLASSES];
3381
3382 /* On x86-64 we pass structures larger than 16 bytes on the stack. */
3383 if (bytes > 16)
3384 return 0;
3385
3386 for (i = 0; i < words; i++)
3387 classes[i] = X86_64_NO_CLASS;
3388
3389 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
3390 signal the memory class, so handle it as a special case. */
3391 if (!words)
3392 {
3393 classes[0] = X86_64_NO_CLASS;
3394 return 1;
3395 }
3396
3397 /* Classify each field of record and merge classes. */
3398 switch (TREE_CODE (type))
3399 {
3400 case RECORD_TYPE:
3401 /* And now merge the fields of structure. */
3402 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3403 {
3404 if (TREE_CODE (field) == FIELD_DECL)
3405 {
3406 int num;
3407
3408 if (TREE_TYPE (field) == error_mark_node)
3409 continue;
3410
3411 /* Bitfields are always classified as integer. Handle them
3412 early, since later code would consider them to be
3413 misaligned integers. */
3414 if (DECL_BIT_FIELD (field))
3415 {
3416 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3417 i < ((int_bit_position (field) + (bit_offset % 64))
3418 + tree_low_cst (DECL_SIZE (field), 0)
3419 + 63) / 8 / 8; i++)
3420 classes[i] =
3421 merge_classes (X86_64_INTEGER_CLASS,
3422 classes[i]);
3423 }
3424 else
3425 {
3426 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3427 TREE_TYPE (field), subclasses,
3428 (int_bit_position (field)
3429 + bit_offset) % 256);
3430 if (!num)
3431 return 0;
3432 for (i = 0; i < num; i++)
3433 {
3434 int pos =
3435 (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3436 classes[i + pos] =
3437 merge_classes (subclasses[i], classes[i + pos]);
3438 }
3439 }
3440 }
3441 }
3442 break;
3443
3444 case ARRAY_TYPE:
3445 /* Arrays are handled as small records. */
3446 {
3447 int num;
3448 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
3449 TREE_TYPE (type), subclasses, bit_offset);
3450 if (!num)
3451 return 0;
3452
3453 /* The partial classes are now full classes. */
3454 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
3455 subclasses[0] = X86_64_SSE_CLASS;
3456 if (subclasses[0] == X86_64_INTEGERSI_CLASS && bytes != 4)
3457 subclasses[0] = X86_64_INTEGER_CLASS;
3458
3459 for (i = 0; i < words; i++)
3460 classes[i] = subclasses[i % num];
3461
3462 break;
3463 }
3464 case UNION_TYPE:
3465 case QUAL_UNION_TYPE:
3466 /* Unions are similar to RECORD_TYPE but offset is always 0.
3467 */
3468 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3469 {
3470 if (TREE_CODE (field) == FIELD_DECL)
3471 {
3472 int num;
3473
3474 if (TREE_TYPE (field) == error_mark_node)
3475 continue;
3476
3477 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3478 TREE_TYPE (field), subclasses,
3479 bit_offset);
3480 if (!num)
3481 return 0;
3482 for (i = 0; i < num; i++)
3483 classes[i] = merge_classes (subclasses[i], classes[i]);
3484 }
3485 }
3486 break;
3487
3488 default:
3489 gcc_unreachable ();
3490 }
3491
3492 /* Final merger cleanup. */
3493 for (i = 0; i < words; i++)
3494 {
3495 /* If one class is MEMORY, everything should be passed in
3496 memory. */
3497 if (classes[i] == X86_64_MEMORY_CLASS)
3498 return 0;
3499
3500 /* The X86_64_SSEUP_CLASS should be always preceded by
3501 X86_64_SSE_CLASS. */
3502 if (classes[i] == X86_64_SSEUP_CLASS
3503 && (i == 0 || classes[i - 1] != X86_64_SSE_CLASS))
3504 classes[i] = X86_64_SSE_CLASS;
3505
3506 /* X86_64_X87UP_CLASS should be preceded by X86_64_X87_CLASS. */
3507 if (classes[i] == X86_64_X87UP_CLASS
3508 && (i == 0 || classes[i - 1] != X86_64_X87_CLASS))
3509 classes[i] = X86_64_SSE_CLASS;
3510 }
3511 return words;
3512 }
3513
3514 /* Compute the alignment needed. We align all types to their natural boundaries,
3515 with the exception of XFmode, which is aligned to 64 bits. */
3516 if (mode != VOIDmode && mode != BLKmode)
3517 {
3518 int mode_alignment = GET_MODE_BITSIZE (mode);
3519
3520 if (mode == XFmode)
3521 mode_alignment = 128;
3522 else if (mode == XCmode)
3523 mode_alignment = 256;
3524 if (COMPLEX_MODE_P (mode))
3525 mode_alignment /= 2;
3526 /* Misaligned fields are always returned in memory. */
3527 if (bit_offset % mode_alignment)
3528 return 0;
3529 }
3530
3531 /* for V1xx modes, just use the base mode */
3532 if (VECTOR_MODE_P (mode)
3533 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
3534 mode = GET_MODE_INNER (mode);
3535
3536 /* Classification of atomic types. */
3537 switch (mode)
3538 {
3539 case SDmode:
3540 case DDmode:
3541 classes[0] = X86_64_SSE_CLASS;
3542 return 1;
3543 case TDmode:
3544 classes[0] = X86_64_SSE_CLASS;
3545 classes[1] = X86_64_SSEUP_CLASS;
3546 return 2;
3547 case DImode:
3548 case SImode:
3549 case HImode:
3550 case QImode:
3551 case CSImode:
3552 case CHImode:
3553 case CQImode:
3554 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3555 classes[0] = X86_64_INTEGERSI_CLASS;
3556 else
3557 classes[0] = X86_64_INTEGER_CLASS;
3558 return 1;
3559 case CDImode:
3560 case TImode:
3561 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
3562 return 2;
3563 case CTImode:
3564 return 0;
3565 case SFmode:
3566 if (!(bit_offset % 64))
3567 classes[0] = X86_64_SSESF_CLASS;
3568 else
3569 classes[0] = X86_64_SSE_CLASS;
3570 return 1;
3571 case DFmode:
3572 classes[0] = X86_64_SSEDF_CLASS;
3573 return 1;
3574 case XFmode:
3575 classes[0] = X86_64_X87_CLASS;
3576 classes[1] = X86_64_X87UP_CLASS;
3577 return 2;
3578 case TFmode:
3579 classes[0] = X86_64_SSE_CLASS;
3580 classes[1] = X86_64_SSEUP_CLASS;
3581 return 2;
3582 case SCmode:
3583 classes[0] = X86_64_SSE_CLASS;
3584 return 1;
3585 case DCmode:
3586 classes[0] = X86_64_SSEDF_CLASS;
3587 classes[1] = X86_64_SSEDF_CLASS;
3588 return 2;
3589 case XCmode:
3590 classes[0] = X86_64_COMPLEX_X87_CLASS;
3591 return 1;
3592 case TCmode:
3593 /* This mode is larger than 16 bytes. */
3594 return 0;
3595 case V4SFmode:
3596 case V4SImode:
3597 case V16QImode:
3598 case V8HImode:
3599 case V2DFmode:
3600 case V2DImode:
3601 classes[0] = X86_64_SSE_CLASS;
3602 classes[1] = X86_64_SSEUP_CLASS;
3603 return 2;
3604 case V2SFmode:
3605 case V2SImode:
3606 case V4HImode:
3607 case V8QImode:
3608 classes[0] = X86_64_SSE_CLASS;
3609 return 1;
3610 case BLKmode:
3611 case VOIDmode:
3612 return 0;
3613 default:
3614 gcc_assert (VECTOR_MODE_P (mode));
3615
3616 if (bytes > 16)
3617 return 0;
3618
3619 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
3620
3621 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3622 classes[0] = X86_64_INTEGERSI_CLASS;
3623 else
3624 classes[0] = X86_64_INTEGER_CLASS;
3625 classes[1] = X86_64_INTEGER_CLASS;
3626 return 1 + (bytes > 8);
3627 }
3628 }
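/* A worked example of the classification above (hypothetical type, for
   illustration only): for

     struct s { double d; int i; };      16 bytes on x86-64

   the first eightbyte is classified SSEDF from the double and the second
   eightbyte INTEGER from the int, so classify_argument returns 2 and the
   struct is passed in one SSE register and one integer register rather
   than in memory.  */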
3629
3630 /* Examine the argument and set the number of registers required in each
3631 class. Return 0 iff the parameter should be passed in memory. */
3632 static int
3633 examine_argument (enum machine_mode mode, tree type, int in_return,
3634 int *int_nregs, int *sse_nregs)
3635 {
3636 enum x86_64_reg_class class[MAX_CLASSES];
3637 int n = classify_argument (mode, type, class, 0);
3638
3639 *int_nregs = 0;
3640 *sse_nregs = 0;
3641 if (!n)
3642 return 0;
3643 for (n--; n >= 0; n--)
3644 switch (class[n])
3645 {
3646 case X86_64_INTEGER_CLASS:
3647 case X86_64_INTEGERSI_CLASS:
3648 (*int_nregs)++;
3649 break;
3650 case X86_64_SSE_CLASS:
3651 case X86_64_SSESF_CLASS:
3652 case X86_64_SSEDF_CLASS:
3653 (*sse_nregs)++;
3654 break;
3655 case X86_64_NO_CLASS:
3656 case X86_64_SSEUP_CLASS:
3657 break;
3658 case X86_64_X87_CLASS:
3659 case X86_64_X87UP_CLASS:
3660 if (!in_return)
3661 return 0;
3662 break;
3663 case X86_64_COMPLEX_X87_CLASS:
3664 return in_return ? 2 : 0;
3665 case X86_64_MEMORY_CLASS:
3666 gcc_unreachable ();
3667 }
3668 return 1;
3669 }
3670
3671 /* Construct container for the argument used by GCC interface. See
3672 FUNCTION_ARG for the detailed description. */
3673
3674 static rtx
3675 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
3676 tree type, int in_return, int nintregs, int nsseregs,
3677 const int *intreg, int sse_regno)
3678 {
3679 /* The following variables hold the static issued_error state. */
3680 static bool issued_sse_arg_error;
3681 static bool issued_sse_ret_error;
3682 static bool issued_x87_ret_error;
3683
3684 enum machine_mode tmpmode;
3685 int bytes =
3686 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3687 enum x86_64_reg_class class[MAX_CLASSES];
3688 int n;
3689 int i;
3690 int nexps = 0;
3691 int needed_sseregs, needed_intregs;
3692 rtx exp[MAX_CLASSES];
3693 rtx ret;
3694
3695 n = classify_argument (mode, type, class, 0);
3696 if (TARGET_DEBUG_ARG)
3697 {
3698 if (!n)
3699 fprintf (stderr, "Memory class\n");
3700 else
3701 {
3702 fprintf (stderr, "Classes:");
3703 for (i = 0; i < n; i++)
3704 {
3705 fprintf (stderr, " %s", x86_64_reg_class_name[class[i]]);
3706 }
3707 fprintf (stderr, "\n");
3708 }
3709 }
3710 if (!n)
3711 return NULL;
3712 if (!examine_argument (mode, type, in_return, &needed_intregs,
3713 &needed_sseregs))
3714 return NULL;
3715 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
3716 return NULL;
3717
3718 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
3719 some less clueful developer tries to use floating-point anyway. */
3720 if (needed_sseregs && !TARGET_SSE)
3721 {
3722 if (in_return)
3723 {
3724 if (!issued_sse_ret_error)
3725 {
3726 error ("SSE register return with SSE disabled");
3727 issued_sse_ret_error = true;
3728 }
3729 }
3730 else if (!issued_sse_arg_error)
3731 {
3732 error ("SSE register argument with SSE disabled");
3733 issued_sse_arg_error = true;
3734 }
3735 return NULL;
3736 }
3737
3738 /* Likewise, error if the ABI requires us to return values in the
3739 x87 registers and the user specified -mno-80387. */
3740 if (!TARGET_80387 && in_return)
3741 for (i = 0; i < n; i++)
3742 if (class[i] == X86_64_X87_CLASS
3743 || class[i] == X86_64_X87UP_CLASS
3744 || class[i] == X86_64_COMPLEX_X87_CLASS)
3745 {
3746 if (!issued_x87_ret_error)
3747 {
3748 error ("x87 register return with x87 disabled");
3749 issued_x87_ret_error = true;
3750 }
3751 return NULL;
3752 }
3753
3754 /* First construct simple cases. Avoid SCmode, since we want to use
3755 single register to pass this type. */
3756 if (n == 1 && mode != SCmode)
3757 switch (class[0])
3758 {
3759 case X86_64_INTEGER_CLASS:
3760 case X86_64_INTEGERSI_CLASS:
3761 return gen_rtx_REG (mode, intreg[0]);
3762 case X86_64_SSE_CLASS:
3763 case X86_64_SSESF_CLASS:
3764 case X86_64_SSEDF_CLASS:
3765 return gen_reg_or_parallel (mode, orig_mode, SSE_REGNO (sse_regno));
3766 case X86_64_X87_CLASS:
3767 case X86_64_COMPLEX_X87_CLASS:
3768 return gen_rtx_REG (mode, FIRST_STACK_REG);
3769 case X86_64_NO_CLASS:
3770 /* Zero sized array, struct or class. */
3771 return NULL;
3772 default:
3773 gcc_unreachable ();
3774 }
3775 if (n == 2 && class[0] == X86_64_SSE_CLASS && class[1] == X86_64_SSEUP_CLASS
3776 && mode != BLKmode)
3777 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
3778 if (n == 2
3779 && class[0] == X86_64_X87_CLASS && class[1] == X86_64_X87UP_CLASS)
3780 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
3781 if (n == 2 && class[0] == X86_64_INTEGER_CLASS
3782 && class[1] == X86_64_INTEGER_CLASS
3783 && (mode == CDImode || mode == TImode || mode == TFmode)
3784 && intreg[0] + 1 == intreg[1])
3785 return gen_rtx_REG (mode, intreg[0]);
3786
3787 /* Otherwise figure out the entries of the PARALLEL. */
3788 for (i = 0; i < n; i++)
3789 {
3790 switch (class[i])
3791 {
3792 case X86_64_NO_CLASS:
3793 break;
3794 case X86_64_INTEGER_CLASS:
3795 case X86_64_INTEGERSI_CLASS:
3796 /* Merge TImodes on aligned occasions here too. */
3797 if (i * 8 + 8 > bytes)
3798 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
3799 else if (class[i] == X86_64_INTEGERSI_CLASS)
3800 tmpmode = SImode;
3801 else
3802 tmpmode = DImode;
3803 /* We've requested 24 bytes for which we don't have a mode. Use DImode. */
3804 if (tmpmode == BLKmode)
3805 tmpmode = DImode;
3806 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3807 gen_rtx_REG (tmpmode, *intreg),
3808 GEN_INT (i*8));
3809 intreg++;
3810 break;
3811 case X86_64_SSESF_CLASS:
3812 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3813 gen_rtx_REG (SFmode,
3814 SSE_REGNO (sse_regno)),
3815 GEN_INT (i*8));
3816 sse_regno++;
3817 break;
3818 case X86_64_SSEDF_CLASS:
3819 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3820 gen_rtx_REG (DFmode,
3821 SSE_REGNO (sse_regno)),
3822 GEN_INT (i*8));
3823 sse_regno++;
3824 break;
3825 case X86_64_SSE_CLASS:
3826 if (i < n - 1 && class[i + 1] == X86_64_SSEUP_CLASS)
3827 tmpmode = TImode;
3828 else
3829 tmpmode = DImode;
3830 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3831 gen_rtx_REG (tmpmode,
3832 SSE_REGNO (sse_regno)),
3833 GEN_INT (i*8));
3834 if (tmpmode == TImode)
3835 i++;
3836 sse_regno++;
3837 break;
3838 default:
3839 gcc_unreachable ();
3840 }
3841 }
3842
3843 /* Empty aligned struct, union or class. */
3844 if (nexps == 0)
3845 return NULL;
3846
3847 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
3848 for (i = 0; i < nexps; i++)
3849 XVECEXP (ret, 0, i) = exp [i];
3850 return ret;
3851 }
3852
3853 /* Update the data in CUM to advance over an argument
3854 of mode MODE and data type TYPE.
3855 (TYPE is null for libcalls where that information may not be available.) */
3856
3857 void
3858 function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3859 tree type, int named)
3860 {
3861 int bytes =
3862 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3863 int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3864
3865 if (type)
3866 mode = type_natural_mode (type);
3867
3868 if (TARGET_DEBUG_ARG)
3869 fprintf (stderr, "function_adv (sz=%d, wds=%2d, nregs=%d, ssenregs=%d, "
3870 "mode=%s, named=%d)\n\n",
3871 words, cum->words, cum->nregs, cum->sse_nregs,
3872 GET_MODE_NAME (mode), named);
3873
3874 if (TARGET_64BIT)
3875 {
3876 int int_nregs, sse_nregs;
3877 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs))
3878 cum->words += words;
3879 else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
3880 {
3881 cum->nregs -= int_nregs;
3882 cum->sse_nregs -= sse_nregs;
3883 cum->regno += int_nregs;
3884 cum->sse_regno += sse_nregs;
3885 }
3886 else
3887 cum->words += words;
3888 }
3889 else
3890 {
3891 switch (mode)
3892 {
3893 default:
3894 break;
3895
3896 case BLKmode:
3897 if (bytes < 0)
3898 break;
3899 /* FALLTHRU */
3900
3901 case DImode:
3902 case SImode:
3903 case HImode:
3904 case QImode:
3905 cum->words += words;
3906 cum->nregs -= words;
3907 cum->regno += words;
3908
3909 if (cum->nregs <= 0)
3910 {
3911 cum->nregs = 0;
3912 cum->regno = 0;
3913 }
3914 break;
3915
3916 case DFmode:
3917 if (cum->float_in_sse < 2)
3918 break;
3919 case SFmode:
3920 if (cum->float_in_sse < 1)
3921 break;
3922 /* FALLTHRU */
3923
3924 case TImode:
3925 case V16QImode:
3926 case V8HImode:
3927 case V4SImode:
3928 case V2DImode:
3929 case V4SFmode:
3930 case V2DFmode:
3931 if (!type || !AGGREGATE_TYPE_P (type))
3932 {
3933 cum->sse_words += words;
3934 cum->sse_nregs -= 1;
3935 cum->sse_regno += 1;
3936 if (cum->sse_nregs <= 0)
3937 {
3938 cum->sse_nregs = 0;
3939 cum->sse_regno = 0;
3940 }
3941 }
3942 break;
3943
3944 case V8QImode:
3945 case V4HImode:
3946 case V2SImode:
3947 case V2SFmode:
3948 if (!type || !AGGREGATE_TYPE_P (type))
3949 {
3950 cum->mmx_words += words;
3951 cum->mmx_nregs -= 1;
3952 cum->mmx_regno += 1;
3953 if (cum->mmx_nregs <= 0)
3954 {
3955 cum->mmx_nregs = 0;
3956 cum->mmx_regno = 0;
3957 }
3958 }
3959 break;
3960 }
3961 }
3962 }
3963
3964 /* Define where to put the arguments to a function.
3965 Value is zero to push the argument on the stack,
3966 or a hard register in which to store the argument.
3967
3968 MODE is the argument's machine mode.
3969 TYPE is the data type of the argument (as a tree).
3970 This is null for libcalls where that information may
3971 not be available.
3972 CUM is a variable of type CUMULATIVE_ARGS which gives info about
3973 the preceding args and about the function being called.
3974 NAMED is nonzero if this argument is a named parameter
3975 (otherwise it is an extra parameter matching an ellipsis). */
3976
3977 rtx
3978 function_arg (CUMULATIVE_ARGS *cum, enum machine_mode orig_mode,
3979 tree type, int named)
3980 {
3981 enum machine_mode mode = orig_mode;
3982 rtx ret = NULL_RTX;
3983 int bytes =
3984 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3985 int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3986 static bool warnedsse, warnedmmx;
3987
3988 /* To simplify the code below, represent vector types with a vector mode
3989 even if MMX/SSE are not active. */
3990 if (type && TREE_CODE (type) == VECTOR_TYPE)
3991 mode = type_natural_mode (type);
3992
3993 /* Handle a hidden AL argument containing number of registers for varargs
3994 x86-64 functions. For i386 ABI just return constm1_rtx to avoid
3995 any AL settings. */
3996 if (mode == VOIDmode)
3997 {
3998 if (TARGET_64BIT)
3999 return GEN_INT (cum->maybe_vaarg
4000 ? (cum->sse_nregs < 0
4001 ? SSE_REGPARM_MAX
4002 : cum->sse_regno)
4003 : -1);
4004 else
4005 return constm1_rtx;
4006 }
4007 if (TARGET_64BIT)
4008 ret = construct_container (mode, orig_mode, type, 0, cum->nregs,
4009 cum->sse_nregs,
4010 &x86_64_int_parameter_registers [cum->regno],
4011 cum->sse_regno);
4012 else
4013 switch (mode)
4014 {
4015 /* For now, pass fp/complex values on the stack. */
4016 default:
4017 break;
4018
4019 case BLKmode:
4020 if (bytes < 0)
4021 break;
4022 /* FALLTHRU */
4023 case DImode:
4024 case SImode:
4025 case HImode:
4026 case QImode:
4027 if (words <= cum->nregs)
4028 {
4029 int regno = cum->regno;
4030
4031 /* Fastcall allocates the first two DWORD (SImode) or
4032 smaller arguments to ECX and EDX. */
4033 if (cum->fastcall)
4034 {
4035 if (mode == BLKmode || mode == DImode)
4036 break;
4037
4038 /* ECX, not EAX, is the first allocated register. */
4039 if (regno == 0)
4040 regno = 2;
4041 }
4042 ret = gen_rtx_REG (mode, regno);
4043 }
4044 break;
4045 case DFmode:
4046 if (cum->float_in_sse < 2)
4047 break;
4048 case SFmode:
4049 if (cum->float_in_sse < 1)
4050 break;
4051 /* FALLTHRU */
4052 case TImode:
4053 case V16QImode:
4054 case V8HImode:
4055 case V4SImode:
4056 case V2DImode:
4057 case V4SFmode:
4058 case V2DFmode:
4059 if (!type || !AGGREGATE_TYPE_P (type))
4060 {
4061 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
4062 {
4063 warnedsse = true;
4064 warning (0, "SSE vector argument without SSE enabled "
4065 "changes the ABI");
4066 }
4067 if (cum->sse_nregs)
4068 ret = gen_reg_or_parallel (mode, orig_mode,
4069 cum->sse_regno + FIRST_SSE_REG);
4070 }
4071 break;
4072 case V8QImode:
4073 case V4HImode:
4074 case V2SImode:
4075 case V2SFmode:
4076 if (!type || !AGGREGATE_TYPE_P (type))
4077 {
4078 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
4079 {
4080 warnedmmx = true;
4081 warning (0, "MMX vector argument without MMX enabled "
4082 "changes the ABI");
4083 }
4084 if (cum->mmx_nregs)
4085 ret = gen_reg_or_parallel (mode, orig_mode,
4086 cum->mmx_regno + FIRST_MMX_REG);
4087 }
4088 break;
4089 }
4090
4091 if (TARGET_DEBUG_ARG)
4092 {
4093 fprintf (stderr,
4094 "function_arg (size=%d, wds=%2d, nregs=%d, mode=%4s, named=%d, ",
4095 words, cum->words, cum->nregs, GET_MODE_NAME (mode), named);
4096
4097 if (ret)
4098 print_simple_rtl (stderr, ret);
4099 else
4100 fprintf (stderr, ", stack");
4101
4102 fprintf (stderr, " )\n");
4103 }
4104
4105 return ret;
4106 }
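/* Illustrative example (a hypothetical declaration, not from this file): for

     void __attribute__((fastcall)) f (int a, int b, int c);

   the code above places A in %ecx, B in %edx, and leaves C for the stack
   once the two fastcall registers are exhausted; a DImode or BLKmode
   argument falls through to the stack even when registers remain.  */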
4107
4108 /* A C expression that indicates when an argument must be passed by
4109 reference. If nonzero for an argument, a copy of that argument is
4110 made in memory and a pointer to the argument is passed instead of
4111 the argument itself. The pointer is passed in whatever way is
4112 appropriate for passing a pointer to that type. */
4113
4114 static bool
4115 ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
4116 enum machine_mode mode ATTRIBUTE_UNUSED,
4117 tree type, bool named ATTRIBUTE_UNUSED)
4118 {
4119 if (!TARGET_64BIT)
4120 return 0;
4121
4122 if (type && int_size_in_bytes (type) == -1)
4123 {
4124 if (TARGET_DEBUG_ARG)
4125 fprintf (stderr, "function_arg_pass_by_reference\n");
4126 return 1;
4127 }
4128
4129 return 0;
4130 }
4131
4132 /* Return true when TYPE should be 128bit aligned for 32bit argument passing
4133 ABI. Only called if TARGET_SSE. */
4134 static bool
4135 contains_128bit_aligned_vector_p (tree type)
4136 {
4137 enum machine_mode mode = TYPE_MODE (type);
4138 if (SSE_REG_MODE_P (mode)
4139 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
4140 return true;
4141 if (TYPE_ALIGN (type) < 128)
4142 return false;
4143
4144 if (AGGREGATE_TYPE_P (type))
4145 {
4146 /* Walk the aggregates recursively. */
4147 switch (TREE_CODE (type))
4148 {
4149 case RECORD_TYPE:
4150 case UNION_TYPE:
4151 case QUAL_UNION_TYPE:
4152 {
4153 tree field;
4154
4155 /* Walk all the structure fields. */
4156 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
4157 {
4158 if (TREE_CODE (field) == FIELD_DECL
4159 && contains_128bit_aligned_vector_p (TREE_TYPE (field)))
4160 return true;
4161 }
4162 break;
4163 }
4164
4165 case ARRAY_TYPE:
4166 /* Just for use if some languages pass arrays by value. */
4167 if (contains_128bit_aligned_vector_p (TREE_TYPE (type)))
4168 return true;
4169 break;
4170
4171 default:
4172 gcc_unreachable ();
4173 }
4174 }
4175 return false;
4176 }
4177
4178 /* Gives the alignment boundary, in bits, of an argument with the
4179 specified mode and type. */
4180
4181 int
4182 ix86_function_arg_boundary (enum machine_mode mode, tree type)
4183 {
4184 int align;
4185 if (type)
4186 align = TYPE_ALIGN (type);
4187 else
4188 align = GET_MODE_ALIGNMENT (mode);
4189 if (align < PARM_BOUNDARY)
4190 align = PARM_BOUNDARY;
4191 if (!TARGET_64BIT)
4192 {
4193 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
4194 make an exception for SSE modes since these require 128bit
4195 alignment.
4196
4197 The handling here differs from field_alignment. ICC aligns MMX
4198 arguments to 4 byte boundaries, while structure fields are aligned
4199 to 8 byte boundaries. */
4200 if (!TARGET_SSE)
4201 align = PARM_BOUNDARY;
4202 else if (!type)
4203 {
4204 if (!SSE_REG_MODE_P (mode))
4205 align = PARM_BOUNDARY;
4206 }
4207 else
4208 {
4209 if (!contains_128bit_aligned_vector_p (type))
4210 align = PARM_BOUNDARY;
4211 }
4212 }
4213 if (align > 128)
4214 align = 128;
4215 return align;
4216 }
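/* Illustrative example (hypothetical types, for illustration only): on a
   32-bit target with SSE enabled,

     struct v { __m128 m; };      reported boundary: 128 bits
     struct s { int a, b; };      reported boundary: PARM_BOUNDARY

   because only types that contain a 128-bit aligned vector keep their
   larger alignment for argument passing.  */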
4217
4218 /* Return true if N is a possible register number of function value. */
4219 bool
4220 ix86_function_value_regno_p (int regno)
4221 {
4222 if (TARGET_MACHO)
4223 {
4224 if (!TARGET_64BIT)
4225 {
4226 return ((regno) == 0
4227 || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)
4228 || ((regno) == FIRST_SSE_REG && TARGET_SSE));
4229 }
4230 return ((regno) == 0 || (regno) == FIRST_FLOAT_REG
4231 || ((regno) == FIRST_SSE_REG && TARGET_SSE)
4232 || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387));
4233 }
4234 else
4235 {
4236 if (regno == 0
4237 || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)
4238 || (regno == FIRST_SSE_REG && TARGET_SSE))
4239 return true;
4240
4241 if (!TARGET_64BIT
4242 && (regno == FIRST_MMX_REG && TARGET_MMX))
4243 return true;
4244
4245 return false;
4246 }
4247 }
4248
4249 /* Define how to find the value returned by a function.
4250 VALTYPE is the data type of the value (as a tree).
4251 If the precise function being called is known, FUNC is its FUNCTION_DECL;
4252 otherwise, FUNC is 0. */
4253 rtx
4254 ix86_function_value (tree valtype, tree fntype_or_decl,
4255 bool outgoing ATTRIBUTE_UNUSED)
4256 {
4257 enum machine_mode natmode = type_natural_mode (valtype);
4258
4259 if (TARGET_64BIT)
4260 {
4261 rtx ret = construct_container (natmode, TYPE_MODE (valtype), valtype,
4262 1, REGPARM_MAX, SSE_REGPARM_MAX,
4263 x86_64_int_return_registers, 0);
4264 /* For zero sized structures, construct_container returns NULL, but we
4265 need to keep the rest of the compiler happy by returning a meaningful value. */
4266 if (!ret)
4267 ret = gen_rtx_REG (TYPE_MODE (valtype), 0);
4268 return ret;
4269 }
4270 else
4271 {
4272 tree fn = NULL_TREE, fntype;
4273 if (fntype_or_decl
4274 && DECL_P (fntype_or_decl))
4275 fn = fntype_or_decl;
4276 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
4277 return gen_rtx_REG (TYPE_MODE (valtype),
4278 ix86_value_regno (natmode, fn, fntype));
4279 }
4280 }
4281
4282 /* Return true iff type is returned in memory. */
4283 int
4284 ix86_return_in_memory (tree type)
4285 {
4286 int needed_intregs, needed_sseregs, size;
4287 enum machine_mode mode = type_natural_mode (type);
4288
4289 if (TARGET_64BIT)
4290 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
4291
4292 if (mode == BLKmode)
4293 return 1;
4294
4295 size = int_size_in_bytes (type);
4296
4297 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
4298 return 0;
4299
4300 if (VECTOR_MODE_P (mode) || mode == TImode)
4301 {
4302 /* User-created vectors small enough to fit in EAX. */
4303 if (size < 8)
4304 return 0;
4305
4306 /* MMX/3dNow values are returned in MM0,
4307 except when it doesn't exist. */
4308 if (size == 8)
4309 return (TARGET_MMX ? 0 : 1);
4310
4311 /* SSE values are returned in XMM0, except when it doesn't exist. */
4312 if (size == 16)
4313 return (TARGET_SSE ? 0 : 1);
4314 }
4315
4316 if (mode == XFmode)
4317 return 0;
4318
4319 if (mode == TDmode)
4320 return 1;
4321
4322 if (size > 12)
4323 return 1;
4324 return 0;
4325 }
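/* Notes on the tests above (illustrative sketch, hypothetical type): a
   16-byte vector such as

     typedef float v4sf __attribute__((vector_size (16)));

   is returned in %xmm0 when SSE is enabled and in memory otherwise, an
   8-byte vector goes in %mm0 only with MMX, XFmode long doubles are always
   returned in registers, and any other type larger than 12 bytes is
   returned in memory.  */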
4326
4327 /* When returning SSE vector types, we have a choice of either
4328 (1) being abi incompatible with a -march switch, or
4329 (2) generating an error.
4330 Given no good solution, I think the safest thing is one warning.
4331 The user won't be able to use -Werror, but....
4332
4333 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
4334 called in response to actually generating a caller or callee that
4335 uses such a type. As opposed to RETURN_IN_MEMORY, which is called
4336 via aggregate_value_p for general type probing from tree-ssa. */
4337
4338 static rtx
4339 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
4340 {
4341 static bool warnedsse, warnedmmx;
4342
4343 if (type)
4344 {
4345 /* Look at the return type of the function, not the function type. */
4346 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
4347
4348 if (!TARGET_SSE && !warnedsse)
4349 {
4350 if (mode == TImode
4351 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4352 {
4353 warnedsse = true;
4354 warning (0, "SSE vector return without SSE enabled "
4355 "changes the ABI");
4356 }
4357 }
4358
4359 if (!TARGET_MMX && !warnedmmx)
4360 {
4361 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4362 {
4363 warnedmmx = true;
4364 warning (0, "MMX vector return without MMX enabled "
4365 "changes the ABI");
4366 }
4367 }
4368 }
4369
4370 return NULL;
4371 }
4372
4373 /* Define how to find the value returned by a library function
4374 assuming the value has mode MODE. */
4375 rtx
4376 ix86_libcall_value (enum machine_mode mode)
4377 {
4378 if (TARGET_64BIT)
4379 {
4380 switch (mode)
4381 {
4382 case SFmode:
4383 case SCmode:
4384 case DFmode:
4385 case DCmode:
4386 case TFmode:
4387 case SDmode:
4388 case DDmode:
4389 case TDmode:
4390 return gen_rtx_REG (mode, FIRST_SSE_REG);
4391 case XFmode:
4392 case XCmode:
4393 return gen_rtx_REG (mode, FIRST_FLOAT_REG);
4394 case TCmode:
4395 return NULL;
4396 default:
4397 return gen_rtx_REG (mode, 0);
4398 }
4399 }
4400 else
4401 return gen_rtx_REG (mode, ix86_value_regno (mode, NULL, NULL));
4402 }
4403
4404 /* Given a mode, return the register to use for a return value. */
4405
4406 static int
4407 ix86_value_regno (enum machine_mode mode, tree func, tree fntype)
4408 {
4409 gcc_assert (!TARGET_64BIT);
4410
4411 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
4412 we normally prevent this case when mmx is not available. However
4413 some ABIs may require the result to be returned like DImode. */
4414 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4415 return TARGET_MMX ? FIRST_MMX_REG : 0;
4416
4417 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
4418 we prevent this case when sse is not available. However some ABIs
4419 may require the result to be returned like integer TImode. */
4420 if (mode == TImode || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4421 return TARGET_SSE ? FIRST_SSE_REG : 0;
4422
4423 /* Decimal floating point values can go in %eax, unlike other float modes. */
4424 if (DECIMAL_FLOAT_MODE_P (mode))
4425 return 0;
4426
4427 /* Most things go in %eax, except (unless -mno-fp-ret-in-387) fp values. */
4428 if (!SCALAR_FLOAT_MODE_P (mode) || !TARGET_FLOAT_RETURNS_IN_80387)
4429 return 0;
4430
4431 /* Floating point return values in %st(0), except for local functions when
4432 SSE math is enabled or for functions with sseregparm attribute. */
4433 if ((func || fntype)
4434 && (mode == SFmode || mode == DFmode))
4435 {
4436 int sse_level = ix86_function_sseregparm (fntype, func);
4437 if ((sse_level >= 1 && mode == SFmode)
4438 || (sse_level == 2 && mode == DFmode))
4439 return FIRST_SSE_REG;
4440 }
4441
4442 return FIRST_FLOAT_REG;
4443 }
4444 \f
4445 /* Create the va_list data type. */
4446
4447 static tree
4448 ix86_build_builtin_va_list (void)
4449 {
4450 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
4451
4452 /* For i386 we use plain pointer to argument area. */
4453 if (!TARGET_64BIT)
4454 return build_pointer_type (char_type_node);
4455
4456 record = (*lang_hooks.types.make_type) (RECORD_TYPE);
4457 type_decl = build_decl (TYPE_DECL, get_identifier ("__va_list_tag"), record);
4458
4459 f_gpr = build_decl (FIELD_DECL, get_identifier ("gp_offset"),
4460 unsigned_type_node);
4461 f_fpr = build_decl (FIELD_DECL, get_identifier ("fp_offset"),
4462 unsigned_type_node);
4463 f_ovf = build_decl (FIELD_DECL, get_identifier ("overflow_arg_area"),
4464 ptr_type_node);
4465 f_sav = build_decl (FIELD_DECL, get_identifier ("reg_save_area"),
4466 ptr_type_node);
4467
4468 va_list_gpr_counter_field = f_gpr;
4469 va_list_fpr_counter_field = f_fpr;
4470
4471 DECL_FIELD_CONTEXT (f_gpr) = record;
4472 DECL_FIELD_CONTEXT (f_fpr) = record;
4473 DECL_FIELD_CONTEXT (f_ovf) = record;
4474 DECL_FIELD_CONTEXT (f_sav) = record;
4475
4476 TREE_CHAIN (record) = type_decl;
4477 TYPE_NAME (record) = type_decl;
4478 TYPE_FIELDS (record) = f_gpr;
4479 TREE_CHAIN (f_gpr) = f_fpr;
4480 TREE_CHAIN (f_fpr) = f_ovf;
4481 TREE_CHAIN (f_ovf) = f_sav;
4482
4483 layout_type (record);
4484
4485 /* The correct type is an array type of one element. */
4486 return build_array_type (record, build_index_type (size_zero_node));
4487 }
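/* A sketch of the 64-bit va_list record built above, as it would appear in
   C (field names follow the build_decl calls; the one-element array wrapper
   is what makes va_list decay to a pointer when passed):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __builtin_va_list[1];
*/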
4488
4489 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
4490
4491 static void
4492 ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
4493 tree type, int *pretend_size ATTRIBUTE_UNUSED,
4494 int no_rtl)
4495 {
4496 CUMULATIVE_ARGS next_cum;
4497 rtx save_area = NULL_RTX, mem;
4498 rtx label;
4499 rtx label_ref;
4500 rtx tmp_reg;
4501 rtx nsse_reg;
4502 int set;
4503 tree fntype;
4504 int stdarg_p;
4505 int i;
4506
4507 if (!TARGET_64BIT)
4508 return;
4509
4510 if (! cfun->va_list_gpr_size && ! cfun->va_list_fpr_size)
4511 return;
4512
4513 /* Indicate to allocate space on the stack for varargs save area. */
4514 ix86_save_varrargs_registers = 1;
4515
4516 cfun->stack_alignment_needed = 128;
4517
4518 fntype = TREE_TYPE (current_function_decl);
4519 stdarg_p = (TYPE_ARG_TYPES (fntype) != 0
4520 && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype)))
4521 != void_type_node));
4522
4523 /* For varargs, we do not want to skip the dummy va_dcl argument.
4524 For stdargs, we do want to skip the last named argument. */
4525 next_cum = *cum;
4526 if (stdarg_p)
4527 function_arg_advance (&next_cum, mode, type, 1);
4528
4529 if (!no_rtl)
4530 save_area = frame_pointer_rtx;
4531
4532 set = get_varargs_alias_set ();
4533
4534 for (i = next_cum.regno;
4535 i < ix86_regparm
4536 && i < next_cum.regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
4537 i++)
4538 {
4539 mem = gen_rtx_MEM (Pmode,
4540 plus_constant (save_area, i * UNITS_PER_WORD));
4541 MEM_NOTRAP_P (mem) = 1;
4542 set_mem_alias_set (mem, set);
4543 emit_move_insn (mem, gen_rtx_REG (Pmode,
4544 x86_64_int_parameter_registers[i]));
4545 }
4546
4547 if (next_cum.sse_nregs && cfun->va_list_fpr_size)
4548 {
4549 /* Now emit code to save SSE registers. The AX parameter contains the number
4550 of SSE parameter registers used to call this function. We use the
4551 sse_prologue_save insn template, which produces a computed jump across
4552 the SSE saves. We need some preparation work to get this working. */
4553
4554 label = gen_label_rtx ();
4555 label_ref = gen_rtx_LABEL_REF (Pmode, label);
4556
4557 /* Compute address to jump to :
4558 label - 5*eax + nnamed_sse_arguments*5 */
4559 tmp_reg = gen_reg_rtx (Pmode);
4560 nsse_reg = gen_reg_rtx (Pmode);
4561 emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, 0)));
4562 emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4563 gen_rtx_MULT (Pmode, nsse_reg,
4564 GEN_INT (4))));
4565 if (next_cum.sse_regno)
4566 emit_move_insn
4567 (nsse_reg,
4568 gen_rtx_CONST (DImode,
4569 gen_rtx_PLUS (DImode,
4570 label_ref,
4571 GEN_INT (next_cum.sse_regno * 4))));
4572 else
4573 emit_move_insn (nsse_reg, label_ref);
4574 emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
4575
4576 /* Compute the address of the memory block we save into. We always use a
4577 pointer pointing 127 bytes after the first byte to store; this is needed to
4578 keep the instruction size limited to 4 bytes. */
4579 tmp_reg = gen_reg_rtx (Pmode);
4580 emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4581 plus_constant (save_area,
4582 8 * REGPARM_MAX + 127)));
4583 mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
4584 MEM_NOTRAP_P (mem) = 1;
4585 set_mem_alias_set (mem, set);
4586 set_mem_align (mem, BITS_PER_WORD);
4587
4588 /* And finally do the dirty job! */
4589 emit_insn (gen_sse_prologue_save (mem, nsse_reg,
4590 GEN_INT (next_cum.sse_regno), label));
4591 }
4592
4593 }
4594
4595 /* Implement va_start. */
4596
4597 void
4598 ix86_va_start (tree valist, rtx nextarg)
4599 {
4600 HOST_WIDE_INT words, n_gpr, n_fpr;
4601 tree f_gpr, f_fpr, f_ovf, f_sav;
4602 tree gpr, fpr, ovf, sav, t;
4603 tree type;
4604
4605 /* Only 64bit target needs something special. */
4606 if (!TARGET_64BIT)
4607 {
4608 std_expand_builtin_va_start (valist, nextarg);
4609 return;
4610 }
4611
4612 f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4613 f_fpr = TREE_CHAIN (f_gpr);
4614 f_ovf = TREE_CHAIN (f_fpr);
4615 f_sav = TREE_CHAIN (f_ovf);
4616
4617 valist = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (valist)), valist);
4618 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4619 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4620 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4621 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4622
4623 /* Count number of gp and fp argument registers used. */
4624 words = current_function_args_info.words;
4625 n_gpr = current_function_args_info.regno;
4626 n_fpr = current_function_args_info.sse_regno;
4627
4628 if (TARGET_DEBUG_ARG)
4629 fprintf (stderr, "va_start: words = %d, n_gpr = %d, n_fpr = %d\n",
4630 (int) words, (int) n_gpr, (int) n_fpr);
4631
4632 if (cfun->va_list_gpr_size)
4633 {
4634 type = TREE_TYPE (gpr);
4635 t = build2 (GIMPLE_MODIFY_STMT, type, gpr,
4636 build_int_cst (type, n_gpr * 8));
4637 TREE_SIDE_EFFECTS (t) = 1;
4638 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4639 }
4640
4641 if (cfun->va_list_fpr_size)
4642 {
4643 type = TREE_TYPE (fpr);
4644 t = build2 (GIMPLE_MODIFY_STMT, type, fpr,
4645 build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX));
4646 TREE_SIDE_EFFECTS (t) = 1;
4647 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4648 }
4649
4650 /* Find the overflow area. */
4651 type = TREE_TYPE (ovf);
4652 t = make_tree (type, virtual_incoming_args_rtx);
4653 if (words != 0)
4654 t = build2 (PLUS_EXPR, type, t,
4655 build_int_cst (type, words * UNITS_PER_WORD));
4656 t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t);
4657 TREE_SIDE_EFFECTS (t) = 1;
4658 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4659
4660 if (cfun->va_list_gpr_size || cfun->va_list_fpr_size)
4661 {
4662 /* Find the register save area.
4663 The prologue of the function saves it right above the stack frame. */
4664 type = TREE_TYPE (sav);
4665 t = make_tree (type, frame_pointer_rtx);
4666 t = build2 (GIMPLE_MODIFY_STMT, type, sav, t);
4667 TREE_SIDE_EFFECTS (t) = 1;
4668 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4669 }
4670 }
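/* A worked note derived from the statements above (illustrative only): a
   variadic function whose named arguments consumed two integer registers
   and one SSE register starts with gp_offset == 16,
   fp_offset == 8*REGPARM_MAX + 16, overflow_arg_area pointing just past the
   named stack words, and reg_save_area at the frame pointer where the
   prologue dumped the registers.  */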
4671
4672 /* Implement va_arg. */
4673
4674 tree
4675 ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p)
4676 {
4677 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
4678 tree f_gpr, f_fpr, f_ovf, f_sav;
4679 tree gpr, fpr, ovf, sav, t;
4680 int size, rsize;
4681 tree lab_false, lab_over = NULL_TREE;
4682 tree addr, t2;
4683 rtx container;
4684 int indirect_p = 0;
4685 tree ptrtype;
4686 enum machine_mode nat_mode;
4687
4688 /* Only 64bit target needs something special. */
4689 if (!TARGET_64BIT)
4690 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
4691
4692 f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4693 f_fpr = TREE_CHAIN (f_gpr);
4694 f_ovf = TREE_CHAIN (f_fpr);
4695 f_sav = TREE_CHAIN (f_ovf);
4696
4697 valist = build_va_arg_indirect_ref (valist);
4698 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4699 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4700 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4701 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4702
4703 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
4704 if (indirect_p)
4705 type = build_pointer_type (type);
4706 size = int_size_in_bytes (type);
4707 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4708
4709 nat_mode = type_natural_mode (type);
4710 container = construct_container (nat_mode, TYPE_MODE (type), type, 0,
4711 REGPARM_MAX, SSE_REGPARM_MAX, intreg, 0);
4712
4713 /* Pull the value out of the saved registers. */
4714
4715 addr = create_tmp_var (ptr_type_node, "addr");
4716 DECL_POINTER_ALIAS_SET (addr) = get_varargs_alias_set ();
4717
4718 if (container)
4719 {
4720 int needed_intregs, needed_sseregs;
4721 bool need_temp;
4722 tree int_addr, sse_addr;
4723
4724 lab_false = create_artificial_label ();
4725 lab_over = create_artificial_label ();
4726
4727 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
4728
4729 need_temp = (!REG_P (container)
4730 && ((needed_intregs && TYPE_ALIGN (type) > 64)
4731 || TYPE_ALIGN (type) > 128));
4732
4733 /* In case we are passing a structure, verify that it is a consecutive block
4734 on the register save area. If not, we need to do moves. */
4735 if (!need_temp && !REG_P (container))
4736 {
4737 /* Verify that all registers are strictly consecutive */
4738 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
4739 {
4740 int i;
4741
4742 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4743 {
4744 rtx slot = XVECEXP (container, 0, i);
4745 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
4746 || INTVAL (XEXP (slot, 1)) != i * 16)
4747 need_temp = 1;
4748 }
4749 }
4750 else
4751 {
4752 int i;
4753
4754 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4755 {
4756 rtx slot = XVECEXP (container, 0, i);
4757 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
4758 || INTVAL (XEXP (slot, 1)) != i * 8)
4759 need_temp = 1;
4760 }
4761 }
4762 }
4763 if (!need_temp)
4764 {
4765 int_addr = addr;
4766 sse_addr = addr;
4767 }
4768 else
4769 {
4770 int_addr = create_tmp_var (ptr_type_node, "int_addr");
4771 DECL_POINTER_ALIAS_SET (int_addr) = get_varargs_alias_set ();
4772 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
4773 DECL_POINTER_ALIAS_SET (sse_addr) = get_varargs_alias_set ();
4774 }
4775
4776 /* First ensure that we fit completely in registers. */
4777 if (needed_intregs)
4778 {
4779 t = build_int_cst (TREE_TYPE (gpr),
4780 (REGPARM_MAX - needed_intregs + 1) * 8);
4781 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
4782 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4783 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4784 gimplify_and_add (t, pre_p);
4785 }
4786 if (needed_sseregs)
4787 {
4788 t = build_int_cst (TREE_TYPE (fpr),
4789 (SSE_REGPARM_MAX - needed_sseregs + 1) * 16
4790 + REGPARM_MAX * 8);
4791 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
4792 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4793 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4794 gimplify_and_add (t, pre_p);
4795 }
4796
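       /* For illustration: with REGPARM_MAX == 6 on x86-64, an argument that
	  needs two integer registers takes the register path only while
	  gp_offset < (6 - 2 + 1) * 8 == 40, i.e. while at least two of the six
	  8-byte GPR slots in the save area are still unused; otherwise we
	  branch to lab_false and fetch the argument from the overflow area.  */
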
4797 /* Compute index to start of area used for integer regs. */
4798 if (needed_intregs)
4799 {
4800 /* int_addr = gpr + sav; */
4801 t = fold_convert (ptr_type_node, gpr);
4802 t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4803 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t);
4804 gimplify_and_add (t, pre_p);
4805 }
4806 if (needed_sseregs)
4807 {
4808 /* sse_addr = fpr + sav; */
4809 t = fold_convert (ptr_type_node, fpr);
4810 t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4811 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t);
4812 gimplify_and_add (t, pre_p);
4813 }
4814 if (need_temp)
4815 {
4816 int i;
4817 tree temp = create_tmp_var (type, "va_arg_tmp");
4818
4819 /* addr = &temp; */
4820 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
4821 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4822 gimplify_and_add (t, pre_p);
4823
4824 for (i = 0; i < XVECLEN (container, 0); i++)
4825 {
4826 rtx slot = XVECEXP (container, 0, i);
4827 rtx reg = XEXP (slot, 0);
4828 enum machine_mode mode = GET_MODE (reg);
4829 tree piece_type = lang_hooks.types.type_for_mode (mode, 1);
4830 tree addr_type = build_pointer_type (piece_type);
4831 tree src_addr, src;
4832 int src_offset;
4833 tree dest_addr, dest;
4834
4835 if (SSE_REGNO_P (REGNO (reg)))
4836 {
4837 src_addr = sse_addr;
4838 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
4839 }
4840 else
4841 {
4842 src_addr = int_addr;
4843 src_offset = REGNO (reg) * 8;
4844 }
4845 src_addr = fold_convert (addr_type, src_addr);
4846 src_addr = fold (build2 (PLUS_EXPR, addr_type, src_addr,
4847 size_int (src_offset)));
4848 src = build_va_arg_indirect_ref (src_addr);
4849
4850 dest_addr = fold_convert (addr_type, addr);
4851 dest_addr = fold (build2 (PLUS_EXPR, addr_type, dest_addr,
4852 size_int (INTVAL (XEXP (slot, 1)))));
4853 dest = build_va_arg_indirect_ref (dest_addr);
4854
4855 t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src);
4856 gimplify_and_add (t, pre_p);
4857 }
4858 }
4859
4860 if (needed_intregs)
4861 {
4862 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
4863 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
4864 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t);
4865 gimplify_and_add (t, pre_p);
4866 }
4867 if (needed_sseregs)
4868 {
4869 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
4870 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
4871 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t);
4872 gimplify_and_add (t, pre_p);
4873 }
4874
4875 t = build1 (GOTO_EXPR, void_type_node, lab_over);
4876 gimplify_and_add (t, pre_p);
4877
4878 t = build1 (LABEL_EXPR, void_type_node, lab_false);
4879 append_to_statement_list (t, pre_p);
4880 }
4881
4882 /* ... otherwise out of the overflow area. */
4883
4884 /* Care for on-stack alignment if needed. */
4885 if (FUNCTION_ARG_BOUNDARY (VOIDmode, type) <= 64
4886 || integer_zerop (TYPE_SIZE (type)))
4887 t = ovf;
4888 else
4889 {
4890 HOST_WIDE_INT align = FUNCTION_ARG_BOUNDARY (VOIDmode, type) / 8;
4891 t = build2 (PLUS_EXPR, TREE_TYPE (ovf), ovf,
4892 build_int_cst (TREE_TYPE (ovf), align - 1));
4893 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
4894 build_int_cst (TREE_TYPE (t), -align));
4895 }
4896 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
4897
4898 t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4899 gimplify_and_add (t2, pre_p);
4900
4901 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
4902 build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD));
4903 t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (ovf), ovf, t);
4904 gimplify_and_add (t, pre_p);
4905
4906 if (container)
4907 {
4908 t = build1 (LABEL_EXPR, void_type_node, lab_over);
4909 append_to_statement_list (t, pre_p);
4910 }
4911
4912 ptrtype = build_pointer_type (type);
4913 addr = fold_convert (ptrtype, addr);
4914
4915 if (indirect_p)
4916 addr = build_va_arg_indirect_ref (addr);
4917 return build_va_arg_indirect_ref (addr);
4918 }
4919 \f
4920 /* Return nonzero if OPNUM's MEM should be matched
4921 in movabs* patterns. */
4922
4923 int
4924 ix86_check_movabs (rtx insn, int opnum)
4925 {
4926 rtx set, mem;
4927
4928 set = PATTERN (insn);
4929 if (GET_CODE (set) == PARALLEL)
4930 set = XVECEXP (set, 0, 0);
4931 gcc_assert (GET_CODE (set) == SET);
4932 mem = XEXP (set, opnum);
4933 while (GET_CODE (mem) == SUBREG)
4934 mem = SUBREG_REG (mem);
4935 gcc_assert (MEM_P (mem));
4936 return (volatile_ok || !MEM_VOLATILE_P (mem));
4937 }
4938 \f
4939 /* Initialize the table of extra 80387 mathematical constants. */
4940
4941 static void
4942 init_ext_80387_constants (void)
4943 {
4944 static const char * cst[5] =
4945 {
4946 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
4947 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
4948 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
4949 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
4950 "3.1415926535897932385128089594061862044", /* 4: fldpi */
4951 };
4952 int i;
4953
4954 for (i = 0; i < 5; i++)
4955 {
4956 real_from_string (&ext_80387_constants_table[i], cst[i]);
4957 /* Ensure each constant is rounded to XFmode precision. */
4958 real_convert (&ext_80387_constants_table[i],
4959 XFmode, &ext_80387_constants_table[i]);
4960 }
4961
4962 ext_80387_constants_init = 1;
4963 }
4964
4965 /* Return a nonzero code for the special 80387 instruction that can load
4966    the constant X, 0 if there is none, or -1 if X is not a float CONST_DOUBLE.  */
4967
4968 int
4969 standard_80387_constant_p (rtx x)
4970 {
4971 REAL_VALUE_TYPE r;
4972
4973 if (GET_CODE (x) != CONST_DOUBLE || !FLOAT_MODE_P (GET_MODE (x)))
4974 return -1;
4975
4976 if (x == CONST0_RTX (GET_MODE (x)))
4977 return 1;
4978 if (x == CONST1_RTX (GET_MODE (x)))
4979 return 2;
4980
4981 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4982
4983 /* For XFmode constants, try to find a special 80387 instruction when
4984 optimizing for size or on those CPUs that benefit from them. */
4985 if (GET_MODE (x) == XFmode
4986 && (optimize_size || x86_ext_80387_constants & TUNEMASK))
4987 {
4988 int i;
4989
4990 if (! ext_80387_constants_init)
4991 init_ext_80387_constants ();
4992
4993 for (i = 0; i < 5; i++)
4994 if (real_identical (&r, &ext_80387_constants_table[i]))
4995 return i + 3;
4996 }
4997
4998 /* Load of the constant -0.0 or -1.0 will be split as
4999 fldz;fchs or fld1;fchs sequence. */
5000 if (real_isnegzero (&r))
5001 return 8;
5002 if (real_identical (&r, &dconstm1))
5003 return 9;
5004
5005 return 0;
5006 }
5007
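/* For reference, the nonzero return codes above follow this convention
   (see also standard_80387_constant_opcode below):
     1 = 0.0 (fldz), 2 = 1.0 (fld1), 3 = log10(2) (fldlg2),
     4 = ln(2) (fldln2), 5 = log2(e) (fldl2e), 6 = log2(10) (fldl2t),
     7 = pi (fldpi), 8 = -0.0 and 9 = -1.0 (split into fldz/fld1 + fchs).  */
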
5008 /* Return the opcode of the special instruction to be used to load
5009 the constant X. */
5010
5011 const char *
5012 standard_80387_constant_opcode (rtx x)
5013 {
5014 switch (standard_80387_constant_p (x))
5015 {
5016 case 1:
5017 return "fldz";
5018 case 2:
5019 return "fld1";
5020 case 3:
5021 return "fldlg2";
5022 case 4:
5023 return "fldln2";
5024 case 5:
5025 return "fldl2e";
5026 case 6:
5027 return "fldl2t";
5028 case 7:
5029 return "fldpi";
5030 case 8:
5031 case 9:
5032 return "#";
5033 default:
5034 gcc_unreachable ();
5035 }
5036 }
5037
5038 /* Return the CONST_DOUBLE representing the 80387 constant that is
5039 loaded by the specified special instruction. The argument IDX
5040 matches the return value from standard_80387_constant_p. */
5041
5042 rtx
5043 standard_80387_constant_rtx (int idx)
5044 {
5045 int i;
5046
5047 if (! ext_80387_constants_init)
5048 init_ext_80387_constants ();
5049
5050 switch (idx)
5051 {
5052 case 3:
5053 case 4:
5054 case 5:
5055 case 6:
5056 case 7:
5057 i = idx - 3;
5058 break;
5059
5060 default:
5061 gcc_unreachable ();
5062 }
5063
5064 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
5065 XFmode);
5066 }
5067
5068 /* Return 1 if MODE is a valid mode for SSE.  */
5069 static int
5070 standard_sse_mode_p (enum machine_mode mode)
5071 {
5072 switch (mode)
5073 {
5074 case V16QImode:
5075 case V8HImode:
5076 case V4SImode:
5077 case V2DImode:
5078 case V4SFmode:
5079 case V2DFmode:
5080 return 1;
5081
5082 default:
5083 return 0;
5084 }
5085 }
5086
5087 /* Return nonzero if X is an FP constant we can load into an SSE register
5088    without using memory: 1 for zero, 2 for an all-ones vector when SSE2 is available.  */
5089 int
5090 standard_sse_constant_p (rtx x)
5091 {
5092 enum machine_mode mode = GET_MODE (x);
5093
5094 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
5095 return 1;
5096 if (vector_all_ones_operand (x, mode)
5097 && standard_sse_mode_p (mode))
5098 return TARGET_SSE2 ? 2 : -1;
5099
5100 return 0;
5101 }
5102
5103 /* Return the opcode of the special instruction to be used to load
5104 the constant X. */
5105
5106 const char *
5107 standard_sse_constant_opcode (rtx insn, rtx x)
5108 {
5109 switch (standard_sse_constant_p (x))
5110 {
5111 case 1:
5112 if (get_attr_mode (insn) == MODE_V4SF)
5113 return "xorps\t%0, %0";
5114 else if (get_attr_mode (insn) == MODE_V2DF)
5115 return "xorpd\t%0, %0";
5116 else
5117 return "pxor\t%0, %0";
5118 case 2:
5119 return "pcmpeqd\t%0, %0";
5120 }
5121 gcc_unreachable ();
5122 }
5123
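/* The idioms above rely on familiar register self-operations: XORing a
   register with itself (xorps/xorpd/pxor) produces all-zero bits, and
   comparing a register with itself for equality (pcmpeqd) sets every
   element to all ones, so neither constant needs a memory load.  */
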
5124 /* Returns 1 if OP contains a symbol reference */
5125
5126 int
5127 symbolic_reference_mentioned_p (rtx op)
5128 {
5129 const char *fmt;
5130 int i;
5131
5132 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
5133 return 1;
5134
5135 fmt = GET_RTX_FORMAT (GET_CODE (op));
5136 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
5137 {
5138 if (fmt[i] == 'E')
5139 {
5140 int j;
5141
5142 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
5143 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
5144 return 1;
5145 }
5146
5147 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
5148 return 1;
5149 }
5150
5151 return 0;
5152 }
5153
5154 /* Return 1 if it is appropriate to emit `ret' instructions in the
5155 body of a function. Do this only if the epilogue is simple, needing a
5156 couple of insns. Prior to reloading, we can't tell how many registers
5157 must be saved, so return 0 then. Return 0 if there is no frame
5158 marker to de-allocate. */
5159
5160 int
5161 ix86_can_use_return_insn_p (void)
5162 {
5163 struct ix86_frame frame;
5164
5165 if (! reload_completed || frame_pointer_needed)
5166 return 0;
5167
5168   /* Don't allow popping more than 32768 bytes of arguments, since that's
5169      all we handle with a single `ret' instruction.  */
5170 if (current_function_pops_args
5171 && current_function_args_size >= 32768)
5172 return 0;
5173
5174 ix86_compute_frame_layout (&frame);
5175 return frame.to_allocate == 0 && frame.nregs == 0;
5176 }
5177 \f
5178 /* Value should be nonzero if functions must have frame pointers.
5179 Zero means the frame pointer need not be set up (and parms may
5180 be accessed via the stack pointer) in functions that seem suitable. */
5181
5182 int
5183 ix86_frame_pointer_required (void)
5184 {
5185 /* If we accessed previous frames, then the generated code expects
5186 to be able to access the saved ebp value in our frame. */
5187 if (cfun->machine->accesses_prev_frame)
5188 return 1;
5189
5190   /* Several x86 OSes need a frame pointer for other reasons,
5191      usually pertaining to setjmp.  */
5192 if (SUBTARGET_FRAME_POINTER_REQUIRED)
5193 return 1;
5194
5195   /* In override_options, TARGET_OMIT_LEAF_FRAME_POINTER turns off
5196      the frame pointer by default.  Turn it back on now if this is not
5197      a leaf function.  */
5198 if (TARGET_OMIT_LEAF_FRAME_POINTER
5199 && (!current_function_is_leaf
5200 || ix86_current_function_calls_tls_descriptor))
5201 return 1;
5202
5203 if (current_function_profile)
5204 return 1;
5205
5206 return 0;
5207 }
5208
5209 /* Record that the current function accesses previous call frames. */
5210
5211 void
5212 ix86_setup_frame_addresses (void)
5213 {
5214 cfun->machine->accesses_prev_frame = 1;
5215 }
5216 \f
5217 #if (defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)) || TARGET_MACHO
5218 # define USE_HIDDEN_LINKONCE 1
5219 #else
5220 # define USE_HIDDEN_LINKONCE 0
5221 #endif
5222
5223 static int pic_labels_used;
5224
5225 /* Fills in the label name that should be used for a pc thunk for
5226 the given register. */
5227
5228 static void
5229 get_pc_thunk_name (char name[32], unsigned int regno)
5230 {
5231 gcc_assert (!TARGET_64BIT);
5232
5233 if (USE_HIDDEN_LINKONCE)
5234 sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]);
5235 else
5236 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
5237 }
5238
5239
5240 /* Emit the pc thunks used by -fpic: each one loads its register with
5241    the return address of the caller and then returns.  */
5242
5243 void
5244 ix86_file_end (void)
5245 {
5246 rtx xops[2];
5247 int regno;
5248
5249 for (regno = 0; regno < 8; ++regno)
5250 {
5251 char name[32];
5252
5253 if (! ((pic_labels_used >> regno) & 1))
5254 continue;
5255
5256 get_pc_thunk_name (name, regno);
5257
5258 #if TARGET_MACHO
5259 if (TARGET_MACHO)
5260 {
5261 switch_to_section (darwin_sections[text_coal_section]);
5262 fputs ("\t.weak_definition\t", asm_out_file);
5263 assemble_name (asm_out_file, name);
5264 fputs ("\n\t.private_extern\t", asm_out_file);
5265 assemble_name (asm_out_file, name);
5266 fputs ("\n", asm_out_file);
5267 ASM_OUTPUT_LABEL (asm_out_file, name);
5268 }
5269 else
5270 #endif
5271 if (USE_HIDDEN_LINKONCE)
5272 {
5273 tree decl;
5274
5275 decl = build_decl (FUNCTION_DECL, get_identifier (name),
5276 error_mark_node);
5277 TREE_PUBLIC (decl) = 1;
5278 TREE_STATIC (decl) = 1;
5279 DECL_ONE_ONLY (decl) = 1;
5280
5281 (*targetm.asm_out.unique_section) (decl, 0);
5282 switch_to_section (get_named_section (decl, NULL, 0));
5283
5284 (*targetm.asm_out.globalize_label) (asm_out_file, name);
5285 fputs ("\t.hidden\t", asm_out_file);
5286 assemble_name (asm_out_file, name);
5287 fputc ('\n', asm_out_file);
5288 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
5289 }
5290 else
5291 {
5292 switch_to_section (text_section);
5293 ASM_OUTPUT_LABEL (asm_out_file, name);
5294 }
5295
5296 xops[0] = gen_rtx_REG (SImode, regno);
5297 xops[1] = gen_rtx_MEM (SImode, stack_pointer_rtx);
5298 output_asm_insn ("mov{l}\t{%1, %0|%0, %1}", xops);
5299 output_asm_insn ("ret", xops);
5300 }
5301
5302 if (NEED_INDICATE_EXEC_STACK)
5303 file_end_indicate_exec_stack ();
5304 }
5305
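/* For reference, the thunk emitted above for, say, %ebx with
   USE_HIDDEN_LINKONCE looks roughly like this (section and visibility
   directives vary by target):

	__i686.get_pc_thunk.bx:
		movl    (%esp), %ebx
		ret

   i.e. it copies the caller's return address, which is the address of
   the instruction following the call, into the requested register.  */
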
5306 /* Emit code for the SET_GOT patterns. */
5307
5308 const char *
5309 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
5310 {
5311 rtx xops[3];
5312
5313 xops[0] = dest;
5314 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
5315
5316 if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
5317 {
5318 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
5319
5320 if (!flag_pic)
5321 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
5322 else
5323 output_asm_insn ("call\t%a2", xops);
5324
5325 #if TARGET_MACHO
5326 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
5327 is what will be referenced by the Mach-O PIC subsystem. */
5328 if (!label)
5329 ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5330 #endif
5331
5332 (*targetm.asm_out.internal_label) (asm_out_file, "L",
5333 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
5334
5335 if (flag_pic)
5336 output_asm_insn ("pop{l}\t%0", xops);
5337 }
5338 else
5339 {
5340 char name[32];
5341 get_pc_thunk_name (name, REGNO (dest));
5342 pic_labels_used |= 1 << REGNO (dest);
5343
5344 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
5345 xops[2] = gen_rtx_MEM (QImode, xops[2]);
5346 output_asm_insn ("call\t%X2", xops);
5347 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
5348 is what will be referenced by the Mach-O PIC subsystem. */
5349 #if TARGET_MACHO
5350 if (!label)
5351 ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5352 else
5353 targetm.asm_out.internal_label (asm_out_file, "L",
5354 CODE_LABEL_NUMBER (label));
5355 #endif
5356 }
5357
5358 if (TARGET_MACHO)
5359 return "";
5360
5361 if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
5362 output_asm_insn ("add{l}\t{%1, %0|%0, %1}", xops);
5363 else
5364 output_asm_insn ("add{l}\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
5365
5366 return "";
5367 }
5368
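/* For illustration, the two -fpic sequences produced above are roughly:

     without TARGET_DEEP_BRANCH_PREDICTION:
	     call    .L2
     .L2:    popl    %ebx
	     addl    $_GLOBAL_OFFSET_TABLE_+[.-.L2], %ebx

     with TARGET_DEEP_BRANCH_PREDICTION:
	     call    __i686.get_pc_thunk.bx
	     addl    $_GLOBAL_OFFSET_TABLE_, %ebx

   Either way the chosen register ends up pointing at the GOT.  */
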
5369 /* Generate a "push" pattern for input ARG.  */
5370
5371 static rtx
5372 gen_push (rtx arg)
5373 {
5374 return gen_rtx_SET (VOIDmode,
5375 gen_rtx_MEM (Pmode,
5376 gen_rtx_PRE_DEC (Pmode,
5377 stack_pointer_rtx)),
5378 arg);
5379 }
5380
5381 /* Return the number of an unused call-clobbered register if one is
5382    available for the entire function, or INVALID_REGNUM otherwise.  */
5383
5384 static unsigned int
5385 ix86_select_alt_pic_regnum (void)
5386 {
5387 if (current_function_is_leaf && !current_function_profile
5388 && !ix86_current_function_calls_tls_descriptor)
5389 {
5390 int i;
5391 for (i = 2; i >= 0; --i)
5392 if (!regs_ever_live[i])
5393 return i;
5394 }
5395
5396 return INVALID_REGNUM;
5397 }
5398
5399 /* Return 1 if we need to save REGNO. */
5400 static int
5401 ix86_save_reg (unsigned int regno, int maybe_eh_return)
5402 {
5403 if (pic_offset_table_rtx
5404 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
5405 && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5406 || current_function_profile
5407 || current_function_calls_eh_return
5408 || current_function_uses_const_pool))
5409 {
5410 if (ix86_select_alt_pic_regnum () != INVALID_REGNUM)
5411 return 0;
5412 return 1;
5413 }
5414
5415 if (current_function_calls_eh_return && maybe_eh_return)
5416 {
5417 unsigned i;
5418 for (i = 0; ; i++)
5419 {
5420 unsigned test = EH_RETURN_DATA_REGNO (i);
5421 if (test == INVALID_REGNUM)
5422 break;
5423 if (test == regno)
5424 return 1;
5425 }
5426 }
5427
5428 if (cfun->machine->force_align_arg_pointer
5429 && regno == REGNO (cfun->machine->force_align_arg_pointer))
5430 return 1;
5431
5432 return (regs_ever_live[regno]
5433 && !call_used_regs[regno]
5434 && !fixed_regs[regno]
5435 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
5436 }
5437
5438 /* Return number of registers to be saved on the stack. */
5439
5440 static int
5441 ix86_nsaved_regs (void)
5442 {
5443 int nregs = 0;
5444 int regno;
5445
5446 for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
5447 if (ix86_save_reg (regno, true))
5448 nregs++;
5449 return nregs;
5450 }
5451
5452 /* Return the offset between two registers, one to be eliminated, and the other
5453 its replacement, at the start of a routine. */
5454
5455 HOST_WIDE_INT
5456 ix86_initial_elimination_offset (int from, int to)
5457 {
5458 struct ix86_frame frame;
5459 ix86_compute_frame_layout (&frame);
5460
5461 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5462 return frame.hard_frame_pointer_offset;
5463 else if (from == FRAME_POINTER_REGNUM
5464 && to == HARD_FRAME_POINTER_REGNUM)
5465 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
5466 else
5467 {
5468 gcc_assert (to == STACK_POINTER_REGNUM);
5469
5470 if (from == ARG_POINTER_REGNUM)
5471 return frame.stack_pointer_offset;
5472
5473 gcc_assert (from == FRAME_POINTER_REGNUM);
5474 return frame.stack_pointer_offset - frame.frame_pointer_offset;
5475 }
5476 }
5477
5478 /* Fill in the ix86_frame structure describing the frame of the current function.  */
5479
5480 static void
5481 ix86_compute_frame_layout (struct ix86_frame *frame)
5482 {
5483 HOST_WIDE_INT total_size;
5484 unsigned int stack_alignment_needed;
5485 HOST_WIDE_INT offset;
5486 unsigned int preferred_alignment;
5487 HOST_WIDE_INT size = get_frame_size ();
5488
5489 frame->nregs = ix86_nsaved_regs ();
5490 total_size = size;
5491
5492 stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT;
5493 preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT;
5494
5495   /* During reload the number of registers saved can change.  Recompute
5496      the value as needed.  Do not recompute when the number of registers
5497      did not change, as reload calls this function multiple times and does
5498      not expect the decision to change within a single iteration.  */
5499 if (!optimize_size
5500 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
5501 {
5502 int count = frame->nregs;
5503
5504 cfun->machine->use_fast_prologue_epilogue_nregs = count;
5505       /* The fast prologue uses moves instead of pushes to save registers.  This
5506          is significantly longer, but it also executes faster, as modern hardware
5507          can execute the moves in parallel but cannot do that for push/pop.
5508 
5509 	 Be careful about choosing which prologue to emit: when a function takes
5510 	 many instructions to execute, we may as well use the slow version, and
5511 	 likewise when the function is known to be outside a hot spot (known
5512 	 only with profile feedback).  Weight the size of the function by the
5513 	 number of registers to save, as it is cheap to use one or two push
5514 	 instructions but very slow to use many of them.  */
5515 if (count)
5516 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
5517 if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
5518 || (flag_branch_probabilities
5519 && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
5520 cfun->machine->use_fast_prologue_epilogue = false;
5521 else
5522 cfun->machine->use_fast_prologue_epilogue
5523 = !expensive_function_p (count);
5524 }
5525 if (TARGET_PROLOGUE_USING_MOVE
5526 && cfun->machine->use_fast_prologue_epilogue)
5527 frame->save_regs_using_mov = true;
5528 else
5529 frame->save_regs_using_mov = false;
5530
5531
5532 /* Skip return address and saved base pointer. */
5533 offset = frame_pointer_needed ? UNITS_PER_WORD * 2 : UNITS_PER_WORD;
5534
5535 frame->hard_frame_pointer_offset = offset;
5536
5537   /* Do some sanity checking of stack_alignment_needed and
5538      preferred_alignment, since the i386 port is the only one using these
5539      features, and they may break easily.  */
5540
5541 gcc_assert (!size || stack_alignment_needed);
5542 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
5543 gcc_assert (preferred_alignment <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5544 gcc_assert (stack_alignment_needed
5545 <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5546
5547 if (stack_alignment_needed < STACK_BOUNDARY / BITS_PER_UNIT)
5548 stack_alignment_needed = STACK_BOUNDARY / BITS_PER_UNIT;
5549
5550 /* Register save area */
5551 offset += frame->nregs * UNITS_PER_WORD;
5552
5553 /* Va-arg area */
5554 if (ix86_save_varrargs_registers)
5555 {
5556 offset += X86_64_VARARGS_SIZE;
5557 frame->va_arg_size = X86_64_VARARGS_SIZE;
5558 }
5559 else
5560 frame->va_arg_size = 0;
5561
5562 /* Align start of frame for local function. */
5563 frame->padding1 = ((offset + stack_alignment_needed - 1)
5564 & -stack_alignment_needed) - offset;
5565
5566 offset += frame->padding1;
5567
5568 /* Frame pointer points here. */
5569 frame->frame_pointer_offset = offset;
5570
5571 offset += size;
5572
5573   /* Add the outgoing arguments area.  It can be skipped if we eliminated
5574      all the function calls as dead code.
5575      Skipping is however impossible when the function calls alloca, since the
5576      alloca expander assumes that the last current_function_outgoing_args_size
5577      bytes of the stack frame are unused.  */
5578 if (ACCUMULATE_OUTGOING_ARGS
5579 && (!current_function_is_leaf || current_function_calls_alloca
5580 || ix86_current_function_calls_tls_descriptor))
5581 {
5582 offset += current_function_outgoing_args_size;
5583 frame->outgoing_arguments_size = current_function_outgoing_args_size;
5584 }
5585 else
5586 frame->outgoing_arguments_size = 0;
5587
5588 /* Align stack boundary. Only needed if we're calling another function
5589 or using alloca. */
5590 if (!current_function_is_leaf || current_function_calls_alloca
5591 || ix86_current_function_calls_tls_descriptor)
5592 frame->padding2 = ((offset + preferred_alignment - 1)
5593 & -preferred_alignment) - offset;
5594 else
5595 frame->padding2 = 0;
5596
5597 offset += frame->padding2;
5598
5599 /* We've reached end of stack frame. */
5600 frame->stack_pointer_offset = offset;
5601
5602 /* Size prologue needs to allocate. */
5603 frame->to_allocate =
5604 (size + frame->padding1 + frame->padding2
5605 + frame->outgoing_arguments_size + frame->va_arg_size);
5606
5607 if ((!frame->to_allocate && frame->nregs <= 1)
5608 || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
5609 frame->save_regs_using_mov = false;
5610
5611 if (TARGET_RED_ZONE && current_function_sp_is_unchanging
5612 && current_function_is_leaf
5613 && !ix86_current_function_calls_tls_descriptor)
5614 {
5615 frame->red_zone_size = frame->to_allocate;
5616 if (frame->save_regs_using_mov)
5617 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
5618 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
5619 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
5620 }
5621 else
5622 frame->red_zone_size = 0;
5623 frame->to_allocate -= frame->red_zone_size;
5624 frame->stack_pointer_offset -= frame->red_zone_size;
5625 #if 0
5626 fprintf (stderr, "\n");
5627 fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
5628 fprintf (stderr, "size: %ld\n", (long)size);
5629 fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
5630 fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
5631 fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
5632 fprintf (stderr, "padding2: %ld\n", (long)frame->padding2);
5633 fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate);
5634 fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size);
5635 fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset);
5636 fprintf (stderr, "hard_frame_pointer_offset: %ld\n",
5637 (long)frame->hard_frame_pointer_offset);
5638 fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset);
5639 fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf);
5640 fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca);
5641 fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor);
5642 #endif
5643 }
5644
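/* For reference, the frame computed above is laid out roughly like this
   (higher addresses at the top, offsets measured downwards from the
   incoming argument pointer):

	return address
	saved frame pointer (if frame_pointer_needed)  <- hard_frame_pointer_offset
	saved registers (nregs words)
	va-arg register save area (if any)
	padding1 (aligns locals to stack_alignment_needed)
						       <- frame_pointer_offset
	local variables (get_frame_size () bytes)
	outgoing argument area (if accumulated)
	padding2 (aligns to preferred_alignment)
						       <- stack_pointer_offset

   to_allocate is everything below the saved registers, less whatever the
   x86-64 red zone can absorb in leaf functions.  */
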
5645 /* Emit code to save registers in the prologue. */
5646
5647 static void
5648 ix86_emit_save_regs (void)
5649 {
5650 unsigned int regno;
5651 rtx insn;
5652
5653 for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
5654 if (ix86_save_reg (regno, true))
5655 {
5656 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
5657 RTX_FRAME_RELATED_P (insn) = 1;
5658 }
5659 }
5660
5661 /* Emit code to save registers using MOV insns.  The first register
5662    is saved at POINTER + OFFSET.  */
5663 static void
5664 ix86_emit_save_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
5665 {
5666 unsigned int regno;
5667 rtx insn;
5668
5669 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
5670 if (ix86_save_reg (regno, true))
5671 {
5672 insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
5673 Pmode, offset),
5674 gen_rtx_REG (Pmode, regno));
5675 RTX_FRAME_RELATED_P (insn) = 1;
5676 offset += UNITS_PER_WORD;
5677 }
5678 }
5679
5680 /* Expand prologue or epilogue stack adjustment.
5681    The pattern exists to put a dependency on all ebp-based memory accesses.
5682    STYLE should be negative if instructions should be marked as frame related,
5683    zero if the %r11 register is live and cannot be freely used, and positive
5684    otherwise.  */
5685
5686 static void
5687 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style)
5688 {
5689 rtx insn;
5690
5691 if (! TARGET_64BIT)
5692 insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
5693 else if (x86_64_immediate_operand (offset, DImode))
5694 insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
5695 else
5696 {
5697 rtx r11;
5698 /* r11 is used by indirect sibcall return as well, set before the
5699 epilogue and used after the epilogue. ATM indirect sibcall
5700 shouldn't be used together with huge frame sizes in one
5701 function because of the frame_size check in sibcall.c. */
5702 gcc_assert (style);
5703 r11 = gen_rtx_REG (DImode, R11_REG);
5704 insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
5705 if (style < 0)
5706 RTX_FRAME_RELATED_P (insn) = 1;
5707 insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
5708 offset));
5709 }
5710 if (style < 0)
5711 RTX_FRAME_RELATED_P (insn) = 1;
5712 }
5713
5714 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
5715
5716 static rtx
5717 ix86_internal_arg_pointer (void)
5718 {
5719 bool has_force_align_arg_pointer =
5720 (0 != lookup_attribute (ix86_force_align_arg_pointer_string,
5721 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))));
5722 if ((FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN
5723 && DECL_NAME (current_function_decl)
5724 && MAIN_NAME_P (DECL_NAME (current_function_decl))
5725 && DECL_FILE_SCOPE_P (current_function_decl))
5726 || ix86_force_align_arg_pointer
5727 || has_force_align_arg_pointer)
5728 {
5729 /* Nested functions can't realign the stack due to a register
5730 conflict. */
5731 if (DECL_CONTEXT (current_function_decl)
5732 && TREE_CODE (DECL_CONTEXT (current_function_decl)) == FUNCTION_DECL)
5733 {
5734 if (ix86_force_align_arg_pointer)
5735 warning (0, "-mstackrealign ignored for nested functions");
5736 if (has_force_align_arg_pointer)
5737 error ("%s not supported for nested functions",
5738 ix86_force_align_arg_pointer_string);
5739 return virtual_incoming_args_rtx;
5740 }
5741 cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, 2);
5742 return copy_to_reg (cfun->machine->force_align_arg_pointer);
5743 }
5744 else
5745 return virtual_incoming_args_rtx;
5746 }
5747
5748 /* Handle the TARGET_DWARF_HANDLE_FRAME_UNSPEC hook.
5749 This is called from dwarf2out.c to emit call frame instructions
5750 for frame-related insns containing UNSPECs and UNSPEC_VOLATILEs. */
5751 static void
5752 ix86_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index)
5753 {
5754 rtx unspec = SET_SRC (pattern);
5755 gcc_assert (GET_CODE (unspec) == UNSPEC);
5756
5757 switch (index)
5758 {
5759 case UNSPEC_REG_SAVE:
5760 dwarf2out_reg_save_reg (label, XVECEXP (unspec, 0, 0),
5761 SET_DEST (pattern));
5762 break;
5763 case UNSPEC_DEF_CFA:
5764 dwarf2out_def_cfa (label, REGNO (SET_DEST (pattern)),
5765 INTVAL (XVECEXP (unspec, 0, 0)));
5766 break;
5767 default:
5768 gcc_unreachable ();
5769 }
5770 }
5771
5772 /* Expand the prologue into a bunch of separate insns. */
5773
5774 void
5775 ix86_expand_prologue (void)
5776 {
5777 rtx insn;
5778 bool pic_reg_used;
5779 struct ix86_frame frame;
5780 HOST_WIDE_INT allocate;
5781
5782 ix86_compute_frame_layout (&frame);
5783
5784 if (cfun->machine->force_align_arg_pointer)
5785 {
5786 rtx x, y;
5787
5788 /* Grab the argument pointer. */
5789 x = plus_constant (stack_pointer_rtx, 4);
5790 y = cfun->machine->force_align_arg_pointer;
5791 insn = emit_insn (gen_rtx_SET (VOIDmode, y, x));
5792 RTX_FRAME_RELATED_P (insn) = 1;
5793
5794       /* The unwind info consists of two parts: install the fafp as the cfa,
5795          and record the fafp as the "save register" of the stack pointer.
5796          The latter is there so that the unwinder can see where it should
5797          restore the stack pointer across the `and' insn that aligns the stack.  */
5798 x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx), UNSPEC_DEF_CFA);
5799 x = gen_rtx_SET (VOIDmode, y, x);
5800 RTX_FRAME_RELATED_P (x) = 1;
5801 y = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, stack_pointer_rtx),
5802 UNSPEC_REG_SAVE);
5803 y = gen_rtx_SET (VOIDmode, cfun->machine->force_align_arg_pointer, y);
5804 RTX_FRAME_RELATED_P (y) = 1;
5805 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y));
5806 x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5807 REG_NOTES (insn) = x;
5808
5809 /* Align the stack. */
5810 emit_insn (gen_andsi3 (stack_pointer_rtx, stack_pointer_rtx,
5811 GEN_INT (-16)));
5812
5813 /* And here we cheat like madmen with the unwind info. We force the
5814 cfa register back to sp+4, which is exactly what it was at the
5815 start of the function. Re-pushing the return address results in
5816 the return at the same spot relative to the cfa, and thus is
5817 correct wrt the unwind info. */
5818 x = cfun->machine->force_align_arg_pointer;
5819 x = gen_frame_mem (Pmode, plus_constant (x, -4));
5820 insn = emit_insn (gen_push (x));
5821 RTX_FRAME_RELATED_P (insn) = 1;
5822
5823 x = GEN_INT (4);
5824 x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, x), UNSPEC_DEF_CFA);
5825 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
5826 x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5827 REG_NOTES (insn) = x;
5828 }
5829
5830 /* Note: AT&T enter does NOT have reversed args. Enter is probably
5831 slower on all targets. Also sdb doesn't like it. */
5832
5833 if (frame_pointer_needed)
5834 {
5835 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
5836 RTX_FRAME_RELATED_P (insn) = 1;
5837
5838 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
5839 RTX_FRAME_RELATED_P (insn) = 1;
5840 }
5841
5842 allocate = frame.to_allocate;
5843
5844 if (!frame.save_regs_using_mov)
5845 ix86_emit_save_regs ();
5846 else
5847 allocate += frame.nregs * UNITS_PER_WORD;
5848
5849   /* When using the red zone we may start saving registers before allocating
5850      the stack frame, saving one cycle of the prologue.  */
5851 if (TARGET_RED_ZONE && frame.save_regs_using_mov)
5852 ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx
5853 : stack_pointer_rtx,
5854 -frame.nregs * UNITS_PER_WORD);
5855
5856 if (allocate == 0)
5857 ;
5858 else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
5859 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
5860 GEN_INT (-allocate), -1);
5861 else
5862 {
5863 /* Only valid for Win32. */
5864 rtx eax = gen_rtx_REG (SImode, 0);
5865 bool eax_live = ix86_eax_live_at_start_p ();
5866 rtx t;
5867
5868 gcc_assert (!TARGET_64BIT);
5869
5870 if (eax_live)
5871 {
5872 emit_insn (gen_push (eax));
5873 allocate -= 4;
5874 }
5875
5876 emit_move_insn (eax, GEN_INT (allocate));
5877
5878 insn = emit_insn (gen_allocate_stack_worker (eax));
5879 RTX_FRAME_RELATED_P (insn) = 1;
5880 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-allocate));
5881 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
5882 REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR,
5883 t, REG_NOTES (insn));
5884
5885 if (eax_live)
5886 {
5887 if (frame_pointer_needed)
5888 t = plus_constant (hard_frame_pointer_rtx,
5889 allocate
5890 - frame.to_allocate
5891 - frame.nregs * UNITS_PER_WORD);
5892 else
5893 t = plus_constant (stack_pointer_rtx, allocate);
5894 emit_move_insn (eax, gen_rtx_MEM (SImode, t));
5895 }
5896 }
5897
5898 if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
5899 {
5900 if (!frame_pointer_needed || !frame.to_allocate)
5901 ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
5902 else
5903 ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
5904 -frame.nregs * UNITS_PER_WORD);
5905 }
5906
5907 pic_reg_used = false;
5908 if (pic_offset_table_rtx
5909 && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5910 || current_function_profile))
5911 {
5912 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
5913
5914 if (alt_pic_reg_used != INVALID_REGNUM)
5915 REGNO (pic_offset_table_rtx) = alt_pic_reg_used;
5916
5917 pic_reg_used = true;
5918 }
5919
5920 if (pic_reg_used)
5921 {
5922 if (TARGET_64BIT)
5923 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
5924 else
5925 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
5926
5927 /* Even with accurate pre-reload life analysis, we can wind up
5928 deleting all references to the pic register after reload.
5929 Consider if cross-jumping unifies two sides of a branch
5930 controlled by a comparison vs the only read from a global.
5931 In which case, allow the set_got to be deleted, though we're
5932 too late to do anything about the ebx save in the prologue. */
5933 REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL);
5934 }
5935
5936   /* Prevent function calls from being scheduled before the call to mcount.
5937      In the pic_reg_used case, make sure that the got load isn't deleted.  */
5938 if (current_function_profile)
5939 emit_insn (gen_blockage (pic_reg_used ? pic_offset_table_rtx : const0_rtx));
5940 }
5941
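/* Taken together, a typical ia32 prologue expanded by the code above
   (frame pointer needed, %edi and %esi saved with pushes) is roughly:

	pushl   %ebp
	movl    %esp, %ebp
	pushl   %edi
	pushl   %esi
	subl    $N, %esp          ; N == frame.to_allocate

   with the subtraction replaced by a call to the stack-probing allocator
   when TARGET_STACK_PROBE is set and the allocation is large, and with a
   set_got sequence appended when the PIC register is needed.  */
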
5942 /* Emit code to restore saved registers using MOV insns. First register
5943 is restored from POINTER + OFFSET. */
5944 static void
5945 ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
5946 int maybe_eh_return)
5947 {
5948 int regno;
5949 rtx base_address = gen_rtx_MEM (Pmode, pointer);
5950
5951 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
5952 if (ix86_save_reg (regno, maybe_eh_return))
5953 {
5954 	/* Ensure that adjust_address won't be forced to produce a pointer
5955 	   outside the range allowed by the x86-64 instruction set.  */
5956 if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
5957 {
5958 rtx r11;
5959
5960 r11 = gen_rtx_REG (DImode, R11_REG);
5961 emit_move_insn (r11, GEN_INT (offset));
5962 emit_insn (gen_adddi3 (r11, r11, pointer));
5963 base_address = gen_rtx_MEM (Pmode, r11);
5964 offset = 0;
5965 }
5966 emit_move_insn (gen_rtx_REG (Pmode, regno),
5967 adjust_address (base_address, Pmode, offset));
5968 offset += UNITS_PER_WORD;
5969 }
5970 }
5971
5972 /* Restore function stack, frame, and registers. */
5973
5974 void
5975 ix86_expand_epilogue (int style)
5976 {
5977 int regno;
5978 int sp_valid = !frame_pointer_needed || current_function_sp_is_unchanging;
5979 struct ix86_frame frame;
5980 HOST_WIDE_INT offset;
5981
5982 ix86_compute_frame_layout (&frame);
5983
5984 /* Calculate start of saved registers relative to ebp. Special care
5985 must be taken for the normal return case of a function using
5986 eh_return: the eax and edx registers are marked as saved, but not
5987 restored along this path. */
5988 offset = frame.nregs;
5989 if (current_function_calls_eh_return && style != 2)
5990 offset -= 2;
5991 offset *= -UNITS_PER_WORD;
5992
5993   /* If we're only restoring one register and sp is not valid then
5994      use a move instruction to restore the register, since it's
5995      less work than reloading sp and popping the register.
5996 
5997      The default code results in a stack adjustment using an add/lea instruction,
5998      while this code results in a LEAVE instruction (or discrete equivalent),
5999      so it is profitable in some other cases as well, especially when there
6000      are no registers to restore.  We also use this code when TARGET_USE_LEAVE
6001      and there is exactly one register to pop.  This heuristic may need some
6002      tuning in future.  */
6003 if ((!sp_valid && frame.nregs <= 1)
6004 || (TARGET_EPILOGUE_USING_MOVE
6005 && cfun->machine->use_fast_prologue_epilogue
6006 && (frame.nregs > 1 || frame.to_allocate))
6007 || (frame_pointer_needed && !frame.nregs && frame.to_allocate)
6008 || (frame_pointer_needed && TARGET_USE_LEAVE
6009 && cfun->machine->use_fast_prologue_epilogue
6010 && frame.nregs == 1)
6011 || current_function_calls_eh_return)
6012 {
6013       /* Restore registers.  We can use ebp or esp to address the memory
6014          locations.  If both are available, default to ebp, since offsets
6015          are known to be small.  The only exception is esp pointing directly to
6016          the end of the block of saved registers, where we may simplify the
6017          addressing mode.  */
6018
6019 if (!frame_pointer_needed || (sp_valid && !frame.to_allocate))
6020 ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
6021 frame.to_allocate, style == 2);
6022 else
6023 ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
6024 offset, style == 2);
6025
6026 /* eh_return epilogues need %ecx added to the stack pointer. */
6027 if (style == 2)
6028 {
6029 rtx tmp, sa = EH_RETURN_STACKADJ_RTX;
6030
6031 if (frame_pointer_needed)
6032 {
6033 tmp = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
6034 tmp = plus_constant (tmp, UNITS_PER_WORD);
6035 emit_insn (gen_rtx_SET (VOIDmode, sa, tmp));
6036
6037 tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
6038 emit_move_insn (hard_frame_pointer_rtx, tmp);
6039
6040 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
6041 const0_rtx, style);
6042 }
6043 else
6044 {
6045 tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
6046 tmp = plus_constant (tmp, (frame.to_allocate
6047 + frame.nregs * UNITS_PER_WORD));
6048 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
6049 }
6050 }
6051 else if (!frame_pointer_needed)
6052 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6053 GEN_INT (frame.to_allocate
6054 + frame.nregs * UNITS_PER_WORD),
6055 style);
6056 /* If not an i386, mov & pop is faster than "leave". */
6057 else if (TARGET_USE_LEAVE || optimize_size
6058 || !cfun->machine->use_fast_prologue_epilogue)
6059 emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6060 else
6061 {
6062 pro_epilogue_adjust_stack (stack_pointer_rtx,
6063 hard_frame_pointer_rtx,
6064 const0_rtx, style);
6065 if (TARGET_64BIT)
6066 emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6067 else
6068 emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6069 }
6070 }
6071 else
6072 {
6073 /* First step is to deallocate the stack frame so that we can
6074 pop the registers. */
6075 if (!sp_valid)
6076 {
6077 gcc_assert (frame_pointer_needed);
6078 pro_epilogue_adjust_stack (stack_pointer_rtx,
6079 hard_frame_pointer_rtx,
6080 GEN_INT (offset), style);
6081 }
6082 else if (frame.to_allocate)
6083 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6084 GEN_INT (frame.to_allocate), style);
6085
6086 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
6087 if (ix86_save_reg (regno, false))
6088 {
6089 if (TARGET_64BIT)
6090 emit_insn (gen_popdi1 (gen_rtx_REG (Pmode, regno)));
6091 else
6092 emit_insn (gen_popsi1 (gen_rtx_REG (Pmode, regno)));
6093 }
6094 if (frame_pointer_needed)
6095 {
6096 /* Leave results in shorter dependency chains on CPUs that are
6097 able to grok it fast. */
6098 if (TARGET_USE_LEAVE)
6099 emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6100 else if (TARGET_64BIT)
6101 emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6102 else
6103 emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6104 }
6105 }
6106
6107 if (cfun->machine->force_align_arg_pointer)
6108 {
6109 emit_insn (gen_addsi3 (stack_pointer_rtx,
6110 cfun->machine->force_align_arg_pointer,
6111 GEN_INT (-4)));
6112 }
6113
6114 /* Sibcall epilogues don't want a return instruction. */
6115 if (style == 0)
6116 return;
6117
6118 if (current_function_pops_args && current_function_args_size)
6119 {
6120 rtx popc = GEN_INT (current_function_pops_args);
6121
6122 /* i386 can only pop 64K bytes. If asked to pop more, pop
6123 return address, do explicit add, and jump indirectly to the
6124 caller. */
6125
6126 if (current_function_pops_args >= 65536)
6127 {
6128 rtx ecx = gen_rtx_REG (SImode, 2);
6129
6130 /* There is no "pascal" calling convention in 64bit ABI. */
6131 gcc_assert (!TARGET_64BIT);
6132
6133 emit_insn (gen_popsi1 (ecx));
6134 emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, popc));
6135 emit_jump_insn (gen_return_indirect_internal (ecx));
6136 }
6137 else
6138 emit_jump_insn (gen_return_pop_internal (popc));
6139 }
6140 else
6141 emit_jump_insn (gen_return_internal ());
6142 }
6143
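/* The matching epilogue expanded above is usually either the "leave" form

	leave                      ; movl %ebp, %esp; popl %ebp
	ret

   or the add/pop form

	addl    $N, %esp
	popl    %esi
	popl    %edi
	popl    %ebp
	ret     [$M]               ; $M only when the callee pops its arguments

   depending on the heuristics at the top of the function.  */
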
6144 /* Undo modifications of global state made while compiling the function (e.g. the PIC register number).  */
6145
6146 static void
6147 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
6148 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
6149 {
6150 if (pic_offset_table_rtx)
6151 REGNO (pic_offset_table_rtx) = REAL_PIC_OFFSET_TABLE_REGNUM;
6152 #if TARGET_MACHO
6153 /* Mach-O doesn't support labels at the end of objects, so if
6154 it looks like we might want one, insert a NOP. */
6155 {
6156 rtx insn = get_last_insn ();
6157 while (insn
6158 && NOTE_P (insn)
6159 && NOTE_LINE_NUMBER (insn) != NOTE_INSN_DELETED_LABEL)
6160 insn = PREV_INSN (insn);
6161 if (insn
6162 && (LABEL_P (insn)
6163 || (NOTE_P (insn)
6164 && NOTE_LINE_NUMBER (insn) == NOTE_INSN_DELETED_LABEL)))
6165 fputs ("\tnop\n", file);
6166 }
6167 #endif
6168
6169 }
6170 \f
6171 /* Extract the parts of an RTL expression that is a valid memory address
6172    for an instruction.  Return 0 if the structure of the address is
6173    grossly off.  Return -1 if the address contains ASHIFT, so it is not
6174    strictly valid but can still be used to compute the length of an lea instruction.  */
6175
6176 int
6177 ix86_decompose_address (rtx addr, struct ix86_address *out)
6178 {
6179 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
6180 rtx base_reg, index_reg;
6181 HOST_WIDE_INT scale = 1;
6182 rtx scale_rtx = NULL_RTX;
6183 int retval = 1;
6184 enum ix86_address_seg seg = SEG_DEFAULT;
6185
6186 if (REG_P (addr) || GET_CODE (addr) == SUBREG)
6187 base = addr;
6188 else if (GET_CODE (addr) == PLUS)
6189 {
6190 rtx addends[4], op;
6191 int n = 0, i;
6192
6193 op = addr;
6194 do
6195 {
6196 if (n >= 4)
6197 return 0;
6198 addends[n++] = XEXP (op, 1);
6199 op = XEXP (op, 0);
6200 }
6201 while (GET_CODE (op) == PLUS);
6202 if (n >= 4)
6203 return 0;
6204 addends[n] = op;
6205
6206 for (i = n; i >= 0; --i)
6207 {
6208 op = addends[i];
6209 switch (GET_CODE (op))
6210 {
6211 case MULT:
6212 if (index)
6213 return 0;
6214 index = XEXP (op, 0);
6215 scale_rtx = XEXP (op, 1);
6216 break;
6217
6218 case UNSPEC:
6219 if (XINT (op, 1) == UNSPEC_TP
6220 && TARGET_TLS_DIRECT_SEG_REFS
6221 && seg == SEG_DEFAULT)
6222 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
6223 else
6224 return 0;
6225 break;
6226
6227 case REG:
6228 case SUBREG:
6229 if (!base)
6230 base = op;
6231 else if (!index)
6232 index = op;
6233 else
6234 return 0;
6235 break;
6236
6237 case CONST:
6238 case CONST_INT:
6239 case SYMBOL_REF:
6240 case LABEL_REF:
6241 if (disp)
6242 return 0;
6243 disp = op;
6244 break;
6245
6246 default:
6247 return 0;
6248 }
6249 }
6250 }
6251 else if (GET_CODE (addr) == MULT)
6252 {
6253 index = XEXP (addr, 0); /* index*scale */
6254 scale_rtx = XEXP (addr, 1);
6255 }
6256 else if (GET_CODE (addr) == ASHIFT)
6257 {
6258 rtx tmp;
6259
6260 /* We're called for lea too, which implements ashift on occasion. */
6261 index = XEXP (addr, 0);
6262 tmp = XEXP (addr, 1);
6263 if (!CONST_INT_P (tmp))
6264 return 0;
6265 scale = INTVAL (tmp);
6266 if ((unsigned HOST_WIDE_INT) scale > 3)
6267 return 0;
6268 scale = 1 << scale;
6269 retval = -1;
6270 }
6271 else
6272 disp = addr; /* displacement */
6273
6274 /* Extract the integral value of scale. */
6275 if (scale_rtx)
6276 {
6277 if (!CONST_INT_P (scale_rtx))
6278 return 0;
6279 scale = INTVAL (scale_rtx);
6280 }
6281
6282 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
6283 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
6284
6285   /* Allow the arg pointer and stack pointer as the index if there is no scaling.  */
6286 if (base_reg && index_reg && scale == 1
6287 && (index_reg == arg_pointer_rtx
6288 || index_reg == frame_pointer_rtx
6289 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
6290 {
6291 rtx tmp;
6292 tmp = base, base = index, index = tmp;
6293 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
6294 }
6295
6296 /* Special case: %ebp cannot be encoded as a base without a displacement. */
6297 if ((base_reg == hard_frame_pointer_rtx
6298 || base_reg == frame_pointer_rtx
6299 || base_reg == arg_pointer_rtx) && !disp)
6300 disp = const0_rtx;
6301
6302 /* Special case: on K6, [%esi] makes the instruction vector decoded.
6303 Avoid this by transforming to [%esi+0]. */
6304 if (ix86_tune == PROCESSOR_K6 && !optimize_size
6305 && base_reg && !index_reg && !disp
6306 && REG_P (base_reg)
6307 && REGNO_REG_CLASS (REGNO (base_reg)) == SIREG)
6308 disp = const0_rtx;
6309
6310 /* Special case: encode reg+reg instead of reg*2. */
6311 if (!base && index && scale && scale == 2)
6312 base = index, base_reg = index_reg, scale = 1;
6313
6314 /* Special case: scaling cannot be encoded without base or displacement. */
6315 if (!base && !disp && index && scale != 1)
6316 disp = const0_rtx;
6317
6318 out->base = base;
6319 out->index = index;
6320 out->disp = disp;
6321 out->scale = scale;
6322 out->seg = seg;
6323
6324 return retval;
6325 }
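
/* For illustration, a typical array-element address such as
   12(%ebx,%esi,4), i.e.

	(plus (plus (reg %ebx) (mult (reg %esi) (const_int 4)))
	      (const_int 12))

   decomposes into out->base = %ebx, out->index = %esi, out->scale = 4,
   out->disp = 12 and out->seg = SEG_DEFAULT, with a return value of 1;
   the ASHIFT form produced for lea yields -1 instead.  */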
6326 \f
6327 /* Return the cost of the memory address X.
6328    For i386, it is better to use a complex address than to let gcc copy
6329    the address into a reg and make a new pseudo.  But not if the address
6330    requires two regs - that would mean more pseudos with longer
6331    lifetimes.  */
6332 static int
6333 ix86_address_cost (rtx x)
6334 {
6335 struct ix86_address parts;
6336 int cost = 1;
6337 int ok = ix86_decompose_address (x, &parts);
6338
6339 gcc_assert (ok);
6340
6341 if (parts.base && GET_CODE (parts.base) == SUBREG)
6342 parts.base = SUBREG_REG (parts.base);
6343 if (parts.index && GET_CODE (parts.index) == SUBREG)
6344 parts.index = SUBREG_REG (parts.index);
6345
6346 /* More complex memory references are better. */
6347 if (parts.disp && parts.disp != const0_rtx)
6348 cost--;
6349 if (parts.seg != SEG_DEFAULT)
6350 cost--;
6351
6352 /* Attempt to minimize number of registers in the address. */
6353 if ((parts.base
6354 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
6355 || (parts.index
6356 && (!REG_P (parts.index)
6357 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
6358 cost++;
6359
6360 if (parts.base
6361 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
6362 && parts.index
6363 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
6364 && parts.base != parts.index)
6365 cost++;
6366
6367   /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
6368      since its predecode logic can't detect the length of such instructions
6369      and they degenerate to vector decoding.  Increase the cost of such
6370      addresses here.  The penalty is at least 2 cycles.  It may be worthwhile
6371      to split such addresses or even to refuse them entirely.
6372 
6373      The following addressing modes are affected:
6374       [base+scale*index]
6375       [scale*index+disp]
6376       [base+index]
6377 
6378      The first and last case may be avoidable by explicitly coding the zero
6379      into the memory address, but I don't have an AMD-K6 machine handy to
6380      check this theory.  */
6381
6382 if (TARGET_K6
6383 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
6384 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
6385 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
6386 cost += 10;
6387
6388 return cost;
6389 }
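
/* As a small worked example (with hard registers already in place): a
   plain (reg %ebx) address costs 1, while 12(%ebx,%esi,4) costs 0 - the
   nonzero displacement subtracts one and no pseudo registers are
   involved - so complex addresses are deliberately made to look cheaper
   than materializing them in a register first.  */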
6390 \f
6391 /* If X is a machine specific address (i.e. a symbol or label being
6392 referenced as a displacement from the GOT implemented using an
6393 UNSPEC), then return the base term. Otherwise return X. */
6394
6395 rtx
6396 ix86_find_base_term (rtx x)
6397 {
6398 rtx term;
6399
6400 if (TARGET_64BIT)
6401 {
6402 if (GET_CODE (x) != CONST)
6403 return x;
6404 term = XEXP (x, 0);
6405 if (GET_CODE (term) == PLUS
6406 && (CONST_INT_P (XEXP (term, 1))
6407 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
6408 term = XEXP (term, 0);
6409 if (GET_CODE (term) != UNSPEC
6410 || XINT (term, 1) != UNSPEC_GOTPCREL)
6411 return x;
6412
6413 term = XVECEXP (term, 0, 0);
6414
6415 if (GET_CODE (term) != SYMBOL_REF
6416 && GET_CODE (term) != LABEL_REF)
6417 return x;
6418
6419 return term;
6420 }
6421
6422 term = ix86_delegitimize_address (x);
6423
6424 if (GET_CODE (term) != SYMBOL_REF
6425 && GET_CODE (term) != LABEL_REF)
6426 return x;
6427
6428 return term;
6429 }
6430
6431 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
6432    this is used to form addresses of local data when -fPIC is in
6433    use.  */
6434
6435 static bool
6436 darwin_local_data_pic (rtx disp)
6437 {
6438 if (GET_CODE (disp) == MINUS)
6439 {
6440 if (GET_CODE (XEXP (disp, 0)) == LABEL_REF
6441 || GET_CODE (XEXP (disp, 0)) == SYMBOL_REF)
6442 if (GET_CODE (XEXP (disp, 1)) == SYMBOL_REF)
6443 {
6444 const char *sym_name = XSTR (XEXP (disp, 1), 0);
6445 if (! strcmp (sym_name, "<pic base>"))
6446 return true;
6447 }
6448 }
6449
6450 return false;
6451 }
6452 \f
6453 /* Determine if a given RTX is a valid constant. We already know this
6454 satisfies CONSTANT_P. */
6455
6456 bool
6457 legitimate_constant_p (rtx x)
6458 {
6459 switch (GET_CODE (x))
6460 {
6461 case CONST:
6462 x = XEXP (x, 0);
6463
6464 if (GET_CODE (x) == PLUS)
6465 {
6466 if (!CONST_INT_P (XEXP (x, 1)))
6467 return false;
6468 x = XEXP (x, 0);
6469 }
6470
6471 if (TARGET_MACHO && darwin_local_data_pic (x))
6472 return true;
6473
6474 /* Only some unspecs are valid as "constants". */
6475 if (GET_CODE (x) == UNSPEC)
6476 switch (XINT (x, 1))
6477 {
6478 case UNSPEC_GOTOFF:
6479 return TARGET_64BIT;
6480 case UNSPEC_TPOFF:
6481 case UNSPEC_NTPOFF:
6482 x = XVECEXP (x, 0, 0);
6483 return (GET_CODE (x) == SYMBOL_REF
6484 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6485 case UNSPEC_DTPOFF:
6486 x = XVECEXP (x, 0, 0);
6487 return (GET_CODE (x) == SYMBOL_REF
6488 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
6489 default:
6490 return false;
6491 }
6492
6493 /* We must have drilled down to a symbol. */
6494 if (GET_CODE (x) == LABEL_REF)
6495 return true;
6496 if (GET_CODE (x) != SYMBOL_REF)
6497 return false;
6498 /* FALLTHRU */
6499
6500 case SYMBOL_REF:
6501 /* TLS symbols are never valid. */
6502 if (SYMBOL_REF_TLS_MODEL (x))
6503 return false;
6504 break;
6505
6506 case CONST_DOUBLE:
6507 if (GET_MODE (x) == TImode
6508 && x != CONST0_RTX (TImode)
6509 && !TARGET_64BIT)
6510 return false;
6511 break;
6512
6513 case CONST_VECTOR:
6514 if (x == CONST0_RTX (GET_MODE (x)))
6515 return true;
6516 return false;
6517
6518 default:
6519 break;
6520 }
6521
6522 /* Otherwise we handle everything else in the move patterns. */
6523 return true;
6524 }
6525
6526 /* Determine if it's legal to put X into the constant pool. This
6527 is not possible for the address of thread-local symbols, which
6528 is checked above. */
6529
6530 static bool
6531 ix86_cannot_force_const_mem (rtx x)
6532 {
6533 /* We can always put integral constants and vectors in memory. */
6534 switch (GET_CODE (x))
6535 {
6536 case CONST_INT:
6537 case CONST_DOUBLE:
6538 case CONST_VECTOR:
6539 return false;
6540
6541 default:
6542 break;
6543 }
6544 return !legitimate_constant_p (x);
6545 }
6546
6547 /* Determine if a given RTX is a valid constant address. */
6548
6549 bool
6550 constant_address_p (rtx x)
6551 {
6552 return CONSTANT_P (x) && legitimate_address_p (Pmode, x, 1);
6553 }
6554
6555 /* Nonzero if the constant value X is a legitimate general operand
6556 when generating PIC code. It is given that flag_pic is on and
6557 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
6558
6559 bool
6560 legitimate_pic_operand_p (rtx x)
6561 {
6562 rtx inner;
6563
6564 switch (GET_CODE (x))
6565 {
6566 case CONST:
6567 inner = XEXP (x, 0);
6568 if (GET_CODE (inner) == PLUS
6569 && CONST_INT_P (XEXP (inner, 1)))
6570 inner = XEXP (inner, 0);
6571
6572 /* Only some unspecs are valid as "constants". */
6573 if (GET_CODE (inner) == UNSPEC)
6574 switch (XINT (inner, 1))
6575 {
6576 case UNSPEC_GOTOFF:
6577 return TARGET_64BIT;
6578 case UNSPEC_TPOFF:
6579 x = XVECEXP (inner, 0, 0);
6580 return (GET_CODE (x) == SYMBOL_REF
6581 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6582 default:
6583 return false;
6584 }
6585 /* FALLTHRU */
6586
6587 case SYMBOL_REF:
6588 case LABEL_REF:
6589 return legitimate_pic_address_disp_p (x);
6590
6591 default:
6592 return true;
6593 }
6594 }
6595
6596 /* Determine if a given CONST RTX is a valid memory displacement
6597 in PIC mode. */
6598
6599 int
6600 legitimate_pic_address_disp_p (rtx disp)
6601 {
6602 bool saw_plus;
6603
6604 /* In 64bit mode we can allow direct addresses of symbols and labels
6605 when they are not dynamic symbols. */
6606 if (TARGET_64BIT)
6607 {
6608 rtx op0 = disp, op1;
6609
6610 switch (GET_CODE (disp))
6611 {
6612 case LABEL_REF:
6613 return true;
6614
6615 case CONST:
6616 if (GET_CODE (XEXP (disp, 0)) != PLUS)
6617 break;
6618 op0 = XEXP (XEXP (disp, 0), 0);
6619 op1 = XEXP (XEXP (disp, 0), 1);
6620 if (!CONST_INT_P (op1)
6621 || INTVAL (op1) >= 16*1024*1024
6622 || INTVAL (op1) < -16*1024*1024)
6623 break;
6624 if (GET_CODE (op0) == LABEL_REF)
6625 return true;
6626 if (GET_CODE (op0) != SYMBOL_REF)
6627 break;
6628 /* FALLTHRU */
6629
6630 case SYMBOL_REF:
6631 /* TLS references should always be enclosed in UNSPEC. */
6632 if (SYMBOL_REF_TLS_MODEL (op0))
6633 return false;
6634 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0))
6635 return true;
6636 break;
6637
6638 default:
6639 break;
6640 }
6641 }
6642 if (GET_CODE (disp) != CONST)
6643 return 0;
6644 disp = XEXP (disp, 0);
6645
6646 if (TARGET_64BIT)
6647 {
6648       /* It is unsafe to allow PLUS expressions here; that would break the limit
6649          on the allowed distance of GOT references.  We should not need them anyway.  */
6650 if (GET_CODE (disp) != UNSPEC
6651 || (XINT (disp, 1) != UNSPEC_GOTPCREL
6652 && XINT (disp, 1) != UNSPEC_GOTOFF))
6653 return 0;
6654
6655 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
6656 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
6657 return 0;
6658 return 1;
6659 }
6660
6661 saw_plus = false;
6662 if (GET_CODE (disp) == PLUS)
6663 {
6664 if (!CONST_INT_P (XEXP (disp, 1)))
6665 return 0;
6666 disp = XEXP (disp, 0);
6667 saw_plus = true;
6668 }
6669
6670 if (TARGET_MACHO && darwin_local_data_pic (disp))
6671 return 1;
6672
6673 if (GET_CODE (disp) != UNSPEC)
6674 return 0;
6675
6676 switch (XINT (disp, 1))
6677 {
6678 case UNSPEC_GOT:
6679 if (saw_plus)
6680 return false;
6681 return GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF;
6682 case UNSPEC_GOTOFF:
6683 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
6684 While the ABI also specifies a 32bit relocation, we don't produce it
6685 in the small PIC model at all. */
6686 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
6687 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
6688 && !TARGET_64BIT)
6689 return local_symbolic_operand (XVECEXP (disp, 0, 0), Pmode);
6690 return false;
6691 case UNSPEC_GOTTPOFF:
6692 case UNSPEC_GOTNTPOFF:
6693 case UNSPEC_INDNTPOFF:
6694 if (saw_plus)
6695 return false;
6696 disp = XVECEXP (disp, 0, 0);
6697 return (GET_CODE (disp) == SYMBOL_REF
6698 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
6699 case UNSPEC_NTPOFF:
6700 disp = XVECEXP (disp, 0, 0);
6701 return (GET_CODE (disp) == SYMBOL_REF
6702 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
6703 case UNSPEC_DTPOFF:
6704 disp = XVECEXP (disp, 0, 0);
6705 return (GET_CODE (disp) == SYMBOL_REF
6706 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
6707 }
6708
6709 return 0;
6710 }
6711
6712 /* GO_IF_LEGITIMATE_ADDRESS recognizes an RTL expression that is a valid
6713 memory address for an instruction. The MODE argument is the machine mode
6714 for the MEM expression that wants to use this address.
6715
6716 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
6717 convert common non-canonical forms to canonical form so that they will
6718 be recognized. */
6719
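/* For illustration, the most general form this accepts is
   base + index*scale + disp -- e.g. (plus (plus (reg) (mult (reg) (const_int 4)))
   (const_int 16)) -- where scale must be 1, 2, 4 or 8 and disp must be a
   legitimate constant (or, with flag_pic, a valid PIC construct).  */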
6720 int
6721 legitimate_address_p (enum machine_mode mode, rtx addr, int strict)
6722 {
6723 struct ix86_address parts;
6724 rtx base, index, disp;
6725 HOST_WIDE_INT scale;
6726 const char *reason = NULL;
6727 rtx reason_rtx = NULL_RTX;
6728
6729 if (TARGET_DEBUG_ADDR)
6730 {
6731 fprintf (stderr,
6732 "\n======\nGO_IF_LEGITIMATE_ADDRESS, mode = %s, strict = %d\n",
6733 GET_MODE_NAME (mode), strict);
6734 debug_rtx (addr);
6735 }
6736
6737 if (ix86_decompose_address (addr, &parts) <= 0)
6738 {
6739 reason = "decomposition failed";
6740 goto report_error;
6741 }
6742
6743 base = parts.base;
6744 index = parts.index;
6745 disp = parts.disp;
6746 scale = parts.scale;
6747
6748 /* Validate base register.
6749
6750 Don't allow SUBREGs that span more than a word here. It can lead to spill
6751 failures when the base is one word out of a two word structure, which is
6752 represented internally as a DImode int. */
6753
6754 if (base)
6755 {
6756 rtx reg;
6757 reason_rtx = base;
6758
6759 if (REG_P (base))
6760 reg = base;
6761 else if (GET_CODE (base) == SUBREG
6762 && REG_P (SUBREG_REG (base))
6763 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base)))
6764 <= UNITS_PER_WORD)
6765 reg = SUBREG_REG (base);
6766 else
6767 {
6768 reason = "base is not a register";
6769 goto report_error;
6770 }
6771
6772 if (GET_MODE (base) != Pmode)
6773 {
6774 reason = "base is not in Pmode";
6775 goto report_error;
6776 }
6777
6778 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
6779 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
6780 {
6781 reason = "base is not valid";
6782 goto report_error;
6783 }
6784 }
6785
6786 /* Validate index register.
6787
6788 Don't allow SUBREGs that span more than a word here -- same as above. */
6789
6790 if (index)
6791 {
6792 rtx reg;
6793 reason_rtx = index;
6794
6795 if (REG_P (index))
6796 reg = index;
6797 else if (GET_CODE (index) == SUBREG
6798 && REG_P (SUBREG_REG (index))
6799 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index)))
6800 <= UNITS_PER_WORD)
6801 reg = SUBREG_REG (index);
6802 else
6803 {
6804 reason = "index is not a register";
6805 goto report_error;
6806 }
6807
6808 if (GET_MODE (index) != Pmode)
6809 {
6810 reason = "index is not in Pmode";
6811 goto report_error;
6812 }
6813
6814 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
6815 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
6816 {
6817 reason = "index is not valid";
6818 goto report_error;
6819 }
6820 }
6821
6822 /* Validate scale factor. */
6823 if (scale != 1)
6824 {
6825 reason_rtx = GEN_INT (scale);
6826 if (!index)
6827 {
6828 reason = "scale without index";
6829 goto report_error;
6830 }
6831
6832 if (scale != 2 && scale != 4 && scale != 8)
6833 {
6834 reason = "scale is not a valid multiplier";
6835 goto report_error;
6836 }
6837 }
6838
6839 /* Validate displacement. */
6840 if (disp)
6841 {
6842 reason_rtx = disp;
6843
6844 if (GET_CODE (disp) == CONST
6845 && GET_CODE (XEXP (disp, 0)) == UNSPEC)
6846 switch (XINT (XEXP (disp, 0), 1))
6847 {
6848 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
6849 used. While the ABI also specifies 32bit relocations, we don't produce
6850 them at all and use IP-relative addressing instead. */
6851 case UNSPEC_GOT:
6852 case UNSPEC_GOTOFF:
6853 gcc_assert (flag_pic);
6854 if (!TARGET_64BIT)
6855 goto is_legitimate_pic;
6856 reason = "64bit address unspec";
6857 goto report_error;
6858
6859 case UNSPEC_GOTPCREL:
6860 gcc_assert (flag_pic);
6861 goto is_legitimate_pic;
6862
6863 case UNSPEC_GOTTPOFF:
6864 case UNSPEC_GOTNTPOFF:
6865 case UNSPEC_INDNTPOFF:
6866 case UNSPEC_NTPOFF:
6867 case UNSPEC_DTPOFF:
6868 break;
6869
6870 default:
6871 reason = "invalid address unspec";
6872 goto report_error;
6873 }
6874
6875 else if (SYMBOLIC_CONST (disp)
6876 && (flag_pic
6877 || (TARGET_MACHO
6878 #if TARGET_MACHO
6879 && MACHOPIC_INDIRECT
6880 && !machopic_operand_p (disp)
6881 #endif
6882 )))
6883 {
6884
6885 is_legitimate_pic:
6886 if (TARGET_64BIT && (index || base))
6887 {
6888 /* foo@dtpoff(%rX) is ok. */
6889 if (GET_CODE (disp) != CONST
6890 || GET_CODE (XEXP (disp, 0)) != PLUS
6891 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
6892 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
6893 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
6894 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
6895 {
6896 reason = "non-constant pic memory reference";
6897 goto report_error;
6898 }
6899 }
6900 else if (! legitimate_pic_address_disp_p (disp))
6901 {
6902 reason = "displacement is an invalid pic construct";
6903 goto report_error;
6904 }
6905
6906 /* This code used to verify that a symbolic pic displacement
6907 includes the pic_offset_table_rtx register.
6908
6909 While this is a good idea, these constructs may unfortunately
6910 be created by the "adds using lea" optimization for incorrect
6911 code like:
6912
6913 int a;
6914 int foo(int i)
6915 {
6916 return *(&a+i);
6917 }
6918
6919 This code is nonsensical, but it results in addressing the
6920 GOT table with a pic_offset_table_rtx base. We can't
6921 easily refuse it, since it gets matched by the
6922 "addsi3" pattern, which is later split to an lea when the
6923 output register differs from the input. While this
6924 could be handled by a separate addsi pattern for this case
6925 that never results in an lea, disabling this test seems to be
6926 the easier and correct fix for the crash. */
6927 }
6928 else if (GET_CODE (disp) != LABEL_REF
6929 && !CONST_INT_P (disp)
6930 && (GET_CODE (disp) != CONST
6931 || !legitimate_constant_p (disp))
6932 && (GET_CODE (disp) != SYMBOL_REF
6933 || !legitimate_constant_p (disp)))
6934 {
6935 reason = "displacement is not constant";
6936 goto report_error;
6937 }
6938 else if (TARGET_64BIT
6939 && !x86_64_immediate_operand (disp, VOIDmode))
6940 {
6941 reason = "displacement is out of range";
6942 goto report_error;
6943 }
6944 }
6945
6946 /* Everything looks valid. */
6947 if (TARGET_DEBUG_ADDR)
6948 fprintf (stderr, "Success.\n");
6949 return TRUE;
6950
6951 report_error:
6952 if (TARGET_DEBUG_ADDR)
6953 {
6954 fprintf (stderr, "Error: %s\n", reason);
6955 debug_rtx (reason_rtx);
6956 }
6957 return FALSE;
6958 }
6959 \f
6960 /* Return a unique alias set for the GOT. */
6961
6962 static HOST_WIDE_INT
6963 ix86_GOT_alias_set (void)
6964 {
6965 static HOST_WIDE_INT set = -1;
6966 if (set == -1)
6967 set = new_alias_set ();
6968 return set;
6969 }
6970
6971 /* Return a legitimate reference for ORIG (an address) using the
6972 register REG. If REG is 0, a new pseudo is generated.
6973
6974 There are two types of references that must be handled:
6975
6976 1. Global data references must load the address from the GOT, via
6977 the PIC reg. An insn is emitted to do this load, and the reg is
6978 returned.
6979
6980 2. Static data references, constant pool addresses, and code labels
6981 compute the address as an offset from the GOT, whose base is in
6982 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
6983 differentiate them from global data objects. The returned
6984 address is the PIC reg + an unspec constant.
6985
6986 GO_IF_LEGITIMATE_ADDRESS rejects symbolic references unless the PIC
6987 reg also appears in the address. */
6988
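/* As a rough sketch of the two cases above: a global symbol typically ends up
   as (mem (plus pic_reg (const (unspec [sym] UNSPEC_GOT)))), i.e. a load from
   the GOT, while a local symbol ends up as
   (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF))), a plain offset from
   the PIC register.  */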
6989 static rtx
6990 legitimize_pic_address (rtx orig, rtx reg)
6991 {
6992 rtx addr = orig;
6993 rtx new = orig;
6994 rtx base;
6995
6996 #if TARGET_MACHO
6997 if (TARGET_MACHO && !TARGET_64BIT)
6998 {
6999 if (reg == 0)
7000 reg = gen_reg_rtx (Pmode);
7001 /* Use the generic Mach-O PIC machinery. */
7002 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
7003 }
7004 #endif
7005
7006 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
7007 new = addr;
7008 else if (TARGET_64BIT
7009 && ix86_cmodel != CM_SMALL_PIC
7010 && local_symbolic_operand (addr, Pmode))
7011 {
7012 rtx tmpreg;
7013 /* This symbol may be referenced via a displacement from the PIC
7014 base address (@GOTOFF). */
7015
7016 if (reload_in_progress)
7017 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7018 if (GET_CODE (addr) == CONST)
7019 addr = XEXP (addr, 0);
7020 if (GET_CODE (addr) == PLUS)
7021 {
7022 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF);
7023 new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7024 }
7025 else
7026 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7027 new = gen_rtx_CONST (Pmode, new);
7028 if (!reg)
7029 tmpreg = gen_reg_rtx (Pmode);
7030 else
7031 tmpreg = reg;
7032 emit_move_insn (tmpreg, new);
7033
7034 if (reg != 0)
7035 {
7036 new = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
7037 tmpreg, 1, OPTAB_DIRECT);
7038 new = reg;
7039 }
7040 else new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
7041 }
7042 else if (!TARGET_64BIT && local_symbolic_operand (addr, Pmode))
7043 {
7044 /* This symbol may be referenced via a displacement from the PIC
7045 base address (@GOTOFF). */
7046
7047 if (reload_in_progress)
7048 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7049 if (GET_CODE (addr) == CONST)
7050 addr = XEXP (addr, 0);
7051 if (GET_CODE (addr) == PLUS)
7052 {
7053 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF);
7054 new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7055 }
7056 else
7057 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7058 new = gen_rtx_CONST (Pmode, new);
7059 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7060
7061 if (reg != 0)
7062 {
7063 emit_move_insn (reg, new);
7064 new = reg;
7065 }
7066 }
7067 else if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
7068 {
7069 if (TARGET_64BIT)
7070 {
7071 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
7072 new = gen_rtx_CONST (Pmode, new);
7073 new = gen_const_mem (Pmode, new);
7074 set_mem_alias_set (new, ix86_GOT_alias_set ());
7075
7076 if (reg == 0)
7077 reg = gen_reg_rtx (Pmode);
7078 /* Use gen_movsi directly, otherwise the address is loaded
7079 into a register for CSE. We don't want to CSE these addresses;
7080 instead we CSE the addresses loaded from the GOT table, so skip this. */
7081 emit_insn (gen_movsi (reg, new));
7082 new = reg;
7083 }
7084 else
7085 {
7086 /* This symbol must be referenced via a load from the
7087 Global Offset Table (@GOT). */
7088
7089 if (reload_in_progress)
7090 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7091 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
7092 new = gen_rtx_CONST (Pmode, new);
7093 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7094 new = gen_const_mem (Pmode, new);
7095 set_mem_alias_set (new, ix86_GOT_alias_set ());
7096
7097 if (reg == 0)
7098 reg = gen_reg_rtx (Pmode);
7099 emit_move_insn (reg, new);
7100 new = reg;
7101 }
7102 }
7103 else
7104 {
7105 if (CONST_INT_P (addr)
7106 && !x86_64_immediate_operand (addr, VOIDmode))
7107 {
7108 if (reg)
7109 {
7110 emit_move_insn (reg, addr);
7111 new = reg;
7112 }
7113 else
7114 new = force_reg (Pmode, addr);
7115 }
7116 else if (GET_CODE (addr) == CONST)
7117 {
7118 addr = XEXP (addr, 0);
7119
7120 /* We must match stuff we generated earlier. Assume the only
7121 unspecs that can get here are ours. Not that we could do
7122 anything with them anyway.... */
7123 if (GET_CODE (addr) == UNSPEC
7124 || (GET_CODE (addr) == PLUS
7125 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
7126 return orig;
7127 gcc_assert (GET_CODE (addr) == PLUS);
7128 }
7129 if (GET_CODE (addr) == PLUS)
7130 {
7131 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
7132
7133 /* Check first to see if this is a constant offset from a @GOTOFF
7134 symbol reference. */
7135 if (local_symbolic_operand (op0, Pmode)
7136 && CONST_INT_P (op1))
7137 {
7138 if (!TARGET_64BIT)
7139 {
7140 if (reload_in_progress)
7141 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7142 new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
7143 UNSPEC_GOTOFF);
7144 new = gen_rtx_PLUS (Pmode, new, op1);
7145 new = gen_rtx_CONST (Pmode, new);
7146 new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7147
7148 if (reg != 0)
7149 {
7150 emit_move_insn (reg, new);
7151 new = reg;
7152 }
7153 }
7154 else
7155 {
7156 if (INTVAL (op1) < -16*1024*1024
7157 || INTVAL (op1) >= 16*1024*1024)
7158 {
7159 if (!x86_64_immediate_operand (op1, Pmode))
7160 op1 = force_reg (Pmode, op1);
7161 new = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
7162 }
7163 }
7164 }
7165 else
7166 {
7167 base = legitimize_pic_address (XEXP (addr, 0), reg);
7168 new = legitimize_pic_address (XEXP (addr, 1),
7169 base == reg ? NULL_RTX : reg);
7170
7171 if (CONST_INT_P (new))
7172 new = plus_constant (base, INTVAL (new));
7173 else
7174 {
7175 if (GET_CODE (new) == PLUS && CONSTANT_P (XEXP (new, 1)))
7176 {
7177 base = gen_rtx_PLUS (Pmode, base, XEXP (new, 0));
7178 new = XEXP (new, 1);
7179 }
7180 new = gen_rtx_PLUS (Pmode, base, new);
7181 }
7182 }
7183 }
7184 }
7185 return new;
7186 }
7187 \f
7188 /* Load the thread pointer. If TO_REG is true, force it into a register. */
7189
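/* A brief note, assuming the usual GNU/Linux TLS conventions: the thread
   pointer lives in a segment register (%gs in 32-bit mode, %fs in 64-bit
   mode), and the UNSPEC_TP built here is matched by move patterns in i386.md
   that emit the corresponding segment-relative access.  */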
7190 static rtx
7191 get_thread_pointer (int to_reg)
7192 {
7193 rtx tp, reg, insn;
7194
7195 tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
7196 if (!to_reg)
7197 return tp;
7198
7199 reg = gen_reg_rtx (Pmode);
7200 insn = gen_rtx_SET (VOIDmode, reg, tp);
7201 insn = emit_insn (insn);
7202
7203 return reg;
7204 }
7205
7206 /* A subroutine of legitimize_address and ix86_expand_move. FOR_MOV is
7207 false if we expect this to be used for a memory address and true if
7208 we expect to load the address into a register. */
7209
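/* A rough summary of the cases below: the global- and local-dynamic models
   call the tls_*_dynamic patterns (ultimately __tls_get_addr), initial-exec
   loads the TP-relative offset from the GOT (@GOTTPOFF and friends), and
   local-exec simply adds a link-time constant (@TPOFF/@NTPOFF) to the
   thread pointer.  */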
7210 static rtx
7211 legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
7212 {
7213 rtx dest, base, off, pic, tp;
7214 int type;
7215
7216 switch (model)
7217 {
7218 case TLS_MODEL_GLOBAL_DYNAMIC:
7219 dest = gen_reg_rtx (Pmode);
7220 tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7221
7222 if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7223 {
7224 rtx rax = gen_rtx_REG (Pmode, 0), insns;
7225
7226 start_sequence ();
7227 emit_call_insn (gen_tls_global_dynamic_64 (rax, x));
7228 insns = get_insns ();
7229 end_sequence ();
7230
7231 emit_libcall_block (insns, dest, rax, x);
7232 }
7233 else if (TARGET_64BIT && TARGET_GNU2_TLS)
7234 emit_insn (gen_tls_global_dynamic_64 (dest, x));
7235 else
7236 emit_insn (gen_tls_global_dynamic_32 (dest, x));
7237
7238 if (TARGET_GNU2_TLS)
7239 {
7240 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
7241
7242 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7243 }
7244 break;
7245
7246 case TLS_MODEL_LOCAL_DYNAMIC:
7247 base = gen_reg_rtx (Pmode);
7248 tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7249
7250 if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7251 {
7252 rtx rax = gen_rtx_REG (Pmode, 0), insns, note;
7253
7254 start_sequence ();
7255 emit_call_insn (gen_tls_local_dynamic_base_64 (rax));
7256 insns = get_insns ();
7257 end_sequence ();
7258
7259 note = gen_rtx_EXPR_LIST (VOIDmode, const0_rtx, NULL);
7260 note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note);
7261 emit_libcall_block (insns, base, rax, note);
7262 }
7263 else if (TARGET_64BIT && TARGET_GNU2_TLS)
7264 emit_insn (gen_tls_local_dynamic_base_64 (base));
7265 else
7266 emit_insn (gen_tls_local_dynamic_base_32 (base));
7267
7268 if (TARGET_GNU2_TLS)
7269 {
7270 rtx x = ix86_tls_module_base ();
7271
7272 set_unique_reg_note (get_last_insn (), REG_EQUIV,
7273 gen_rtx_MINUS (Pmode, x, tp));
7274 }
7275
7276 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
7277 off = gen_rtx_CONST (Pmode, off);
7278
7279 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
7280
7281 if (TARGET_GNU2_TLS)
7282 {
7283 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
7284
7285 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7286 }
7287
7288 break;
7289
7290 case TLS_MODEL_INITIAL_EXEC:
7291 if (TARGET_64BIT)
7292 {
7293 pic = NULL;
7294 type = UNSPEC_GOTNTPOFF;
7295 }
7296 else if (flag_pic)
7297 {
7298 if (reload_in_progress)
7299 regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7300 pic = pic_offset_table_rtx;
7301 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
7302 }
7303 else if (!TARGET_ANY_GNU_TLS)
7304 {
7305 pic = gen_reg_rtx (Pmode);
7306 emit_insn (gen_set_got (pic));
7307 type = UNSPEC_GOTTPOFF;
7308 }
7309 else
7310 {
7311 pic = NULL;
7312 type = UNSPEC_INDNTPOFF;
7313 }
7314
7315 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
7316 off = gen_rtx_CONST (Pmode, off);
7317 if (pic)
7318 off = gen_rtx_PLUS (Pmode, pic, off);
7319 off = gen_const_mem (Pmode, off);
7320 set_mem_alias_set (off, ix86_GOT_alias_set ());
7321
7322 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7323 {
7324 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7325 off = force_reg (Pmode, off);
7326 return gen_rtx_PLUS (Pmode, base, off);
7327 }
7328 else
7329 {
7330 base = get_thread_pointer (true);
7331 dest = gen_reg_rtx (Pmode);
7332 emit_insn (gen_subsi3 (dest, base, off));
7333 }
7334 break;
7335
7336 case TLS_MODEL_LOCAL_EXEC:
7337 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
7338 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7339 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
7340 off = gen_rtx_CONST (Pmode, off);
7341
7342 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7343 {
7344 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7345 return gen_rtx_PLUS (Pmode, base, off);
7346 }
7347 else
7348 {
7349 base = get_thread_pointer (true);
7350 dest = gen_reg_rtx (Pmode);
7351 emit_insn (gen_subsi3 (dest, base, off));
7352 }
7353 break;
7354
7355 default:
7356 gcc_unreachable ();
7357 }
7358
7359 return dest;
7360 }
7361
7362 /* Try machine-dependent ways of modifying an illegitimate address
7363 to be legitimate. If we find one, return the new, valid address.
7364 This macro is used in only one place: `memory_address' in explow.c.
7365
7366 OLDX is the address as it was before break_out_memory_refs was called.
7367 In some cases it is useful to look at this to decide what needs to be done.
7368
7369 MODE and WIN are passed so that this macro can use
7370 GO_IF_LEGITIMATE_ADDRESS.
7371
7372 It is always safe for this macro to do nothing. It exists to recognize
7373 opportunities to optimize the output.
7374
7375 For the 80386, we handle X+REG by loading X into a register R and
7376 using R+REG. R will go in a general reg and indexing will be used.
7377 However, if REG is a broken-out memory address or multiplication,
7378 nothing needs to be done because REG can certainly go in a general reg.
7379
7380 When -fpic is used, special handling is needed for symbolic references.
7381 See comments by legitimize_pic_address in i386.c for details. */
7382
7383 rtx
7384 legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode)
7385 {
7386 int changed = 0;
7387 unsigned log;
7388
7389 if (TARGET_DEBUG_ADDR)
7390 {
7391 fprintf (stderr, "\n==========\nLEGITIMIZE_ADDRESS, mode = %s\n",
7392 GET_MODE_NAME (mode));
7393 debug_rtx (x);
7394 }
7395
7396 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
7397 if (log)
7398 return legitimize_tls_address (x, log, false);
7399 if (GET_CODE (x) == CONST
7400 && GET_CODE (XEXP (x, 0)) == PLUS
7401 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
7402 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
7403 {
7404 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), log, false);
7405 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
7406 }
7407
7408 if (flag_pic && SYMBOLIC_CONST (x))
7409 return legitimize_pic_address (x, 0);
7410
7411 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
7412 if (GET_CODE (x) == ASHIFT
7413 && CONST_INT_P (XEXP (x, 1))
7414 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
7415 {
7416 changed = 1;
7417 log = INTVAL (XEXP (x, 1));
7418 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
7419 GEN_INT (1 << log));
7420 }
7421
7422 if (GET_CODE (x) == PLUS)
7423 {
7424 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
7425
7426 if (GET_CODE (XEXP (x, 0)) == ASHIFT
7427 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7428 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
7429 {
7430 changed = 1;
7431 log = INTVAL (XEXP (XEXP (x, 0), 1));
7432 XEXP (x, 0) = gen_rtx_MULT (Pmode,
7433 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
7434 GEN_INT (1 << log));
7435 }
7436
7437 if (GET_CODE (XEXP (x, 1)) == ASHIFT
7438 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
7439 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
7440 {
7441 changed = 1;
7442 log = INTVAL (XEXP (XEXP (x, 1), 1));
7443 XEXP (x, 1) = gen_rtx_MULT (Pmode,
7444 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
7445 GEN_INT (1 << log));
7446 }
7447
7448 /* Put multiply first if it isn't already. */
7449 if (GET_CODE (XEXP (x, 1)) == MULT)
7450 {
7451 rtx tmp = XEXP (x, 0);
7452 XEXP (x, 0) = XEXP (x, 1);
7453 XEXP (x, 1) = tmp;
7454 changed = 1;
7455 }
7456
7457 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
7458 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
7459 created by virtual register instantiation, register elimination, and
7460 similar optimizations. */
7461 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
7462 {
7463 changed = 1;
7464 x = gen_rtx_PLUS (Pmode,
7465 gen_rtx_PLUS (Pmode, XEXP (x, 0),
7466 XEXP (XEXP (x, 1), 0)),
7467 XEXP (XEXP (x, 1), 1));
7468 }
7469
7470 /* Canonicalize
7471 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
7472 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
7473 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
7474 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7475 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
7476 && CONSTANT_P (XEXP (x, 1)))
7477 {
7478 rtx constant;
7479 rtx other = NULL_RTX;
7480
7481 if (CONST_INT_P (XEXP (x, 1)))
7482 {
7483 constant = XEXP (x, 1);
7484 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
7485 }
7486 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
7487 {
7488 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
7489 other = XEXP (x, 1);
7490 }
7491 else
7492 constant = 0;
7493
7494 if (constant)
7495 {
7496 changed = 1;
7497 x = gen_rtx_PLUS (Pmode,
7498 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
7499 XEXP (XEXP (XEXP (x, 0), 1), 0)),
7500 plus_constant (other, INTVAL (constant)));
7501 }
7502 }
7503
7504 if (changed && legitimate_address_p (mode, x, FALSE))
7505 return x;
7506
7507 if (GET_CODE (XEXP (x, 0)) == MULT)
7508 {
7509 changed = 1;
7510 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
7511 }
7512
7513 if (GET_CODE (XEXP (x, 1)) == MULT)
7514 {
7515 changed = 1;
7516 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
7517 }
7518
7519 if (changed
7520 && REG_P (XEXP (x, 1))
7521 && REG_P (XEXP (x, 0)))
7522 return x;
7523
7524 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
7525 {
7526 changed = 1;
7527 x = legitimize_pic_address (x, 0);
7528 }
7529
7530 if (changed && legitimate_address_p (mode, x, FALSE))
7531 return x;
7532
7533 if (REG_P (XEXP (x, 0)))
7534 {
7535 rtx temp = gen_reg_rtx (Pmode);
7536 rtx val = force_operand (XEXP (x, 1), temp);
7537 if (val != temp)
7538 emit_move_insn (temp, val);
7539
7540 XEXP (x, 1) = temp;
7541 return x;
7542 }
7543
7544 else if (REG_P (XEXP (x, 1)))
7545 {
7546 rtx temp = gen_reg_rtx (Pmode);
7547 rtx val = force_operand (XEXP (x, 0), temp);
7548 if (val != temp)
7549 emit_move_insn (temp, val);
7550
7551 XEXP (x, 0) = temp;
7552 return x;
7553 }
7554 }
7555
7556 return x;
7557 }
7558 \f
7559 /* Print an integer constant expression in assembler syntax. Addition
7560 and subtraction are the only arithmetic that may appear in these
7561 expressions. FILE is the stdio stream to write to, X is the rtx, and
7562 CODE is the operand print code from the output string. */
7563
7564 static void
7565 output_pic_addr_const (FILE *file, rtx x, int code)
7566 {
7567 char buf[256];
7568
7569 switch (GET_CODE (x))
7570 {
7571 case PC:
7572 gcc_assert (flag_pic);
7573 putc ('.', file);
7574 break;
7575
7576 case SYMBOL_REF:
7577 output_addr_const (file, x);
7578 if (!TARGET_MACHO && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
7579 fputs ("@PLT", file);
7580 break;
7581
7582 case LABEL_REF:
7583 x = XEXP (x, 0);
7584 /* FALLTHRU */
7585 case CODE_LABEL:
7586 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
7587 assemble_name (asm_out_file, buf);
7588 break;
7589
7590 case CONST_INT:
7591 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7592 break;
7593
7594 case CONST:
7595 /* This used to output parentheses around the expression,
7596 but that does not work on the 386 (either ATT or BSD assembler). */
7597 output_pic_addr_const (file, XEXP (x, 0), code);
7598 break;
7599
7600 case CONST_DOUBLE:
7601 if (GET_MODE (x) == VOIDmode)
7602 {
7603 /* We can use %d if the number is <32 bits and positive. */
7604 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
7605 fprintf (file, "0x%lx%08lx",
7606 (unsigned long) CONST_DOUBLE_HIGH (x),
7607 (unsigned long) CONST_DOUBLE_LOW (x));
7608 else
7609 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
7610 }
7611 else
7612 /* We can't handle floating point constants;
7613 PRINT_OPERAND must handle them. */
7614 output_operand_lossage ("floating constant misused");
7615 break;
7616
7617 case PLUS:
7618 /* Some assemblers need integer constants to appear first. */
7619 if (CONST_INT_P (XEXP (x, 0)))
7620 {
7621 output_pic_addr_const (file, XEXP (x, 0), code);
7622 putc ('+', file);
7623 output_pic_addr_const (file, XEXP (x, 1), code);
7624 }
7625 else
7626 {
7627 gcc_assert (CONST_INT_P (XEXP (x, 1)));
7628 output_pic_addr_const (file, XEXP (x, 1), code);
7629 putc ('+', file);
7630 output_pic_addr_const (file, XEXP (x, 0), code);
7631 }
7632 break;
7633
7634 case MINUS:
7635 if (!TARGET_MACHO)
7636 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
7637 output_pic_addr_const (file, XEXP (x, 0), code);
7638 putc ('-', file);
7639 output_pic_addr_const (file, XEXP (x, 1), code);
7640 if (!TARGET_MACHO)
7641 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
7642 break;
7643
7644 case UNSPEC:
7645 gcc_assert (XVECLEN (x, 0) == 1);
7646 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
7647 switch (XINT (x, 1))
7648 {
7649 case UNSPEC_GOT:
7650 fputs ("@GOT", file);
7651 break;
7652 case UNSPEC_GOTOFF:
7653 fputs ("@GOTOFF", file);
7654 break;
7655 case UNSPEC_GOTPCREL:
7656 fputs ("@GOTPCREL(%rip)", file);
7657 break;
7658 case UNSPEC_GOTTPOFF:
7659 /* FIXME: This might be @TPOFF in Sun ld too. */
7660 fputs ("@GOTTPOFF", file);
7661 break;
7662 case UNSPEC_TPOFF:
7663 fputs ("@TPOFF", file);
7664 break;
7665 case UNSPEC_NTPOFF:
7666 if (TARGET_64BIT)
7667 fputs ("@TPOFF", file);
7668 else
7669 fputs ("@NTPOFF", file);
7670 break;
7671 case UNSPEC_DTPOFF:
7672 fputs ("@DTPOFF", file);
7673 break;
7674 case UNSPEC_GOTNTPOFF:
7675 if (TARGET_64BIT)
7676 fputs ("@GOTTPOFF(%rip)", file);
7677 else
7678 fputs ("@GOTNTPOFF", file);
7679 break;
7680 case UNSPEC_INDNTPOFF:
7681 fputs ("@INDNTPOFF", file);
7682 break;
7683 default:
7684 output_operand_lossage ("invalid UNSPEC as operand");
7685 break;
7686 }
7687 break;
7688
7689 default:
7690 output_operand_lossage ("invalid expression as operand");
7691 }
7692 }
7693
7694 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
7695 We need to emit DTP-relative relocations. */
7696
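/* For example, for a 4-byte entry this emits something like
   ".long sym@DTPOFF", and for an 8-byte entry ".long sym@DTPOFF, 0",
   assuming ASM_LONG expands to a .long directive as it does on typical
   ELF targets.  */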
7697 static void
7698 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
7699 {
7700 fputs (ASM_LONG, file);
7701 output_addr_const (file, x);
7702 fputs ("@DTPOFF", file);
7703 switch (size)
7704 {
7705 case 4:
7706 break;
7707 case 8:
7708 fputs (", 0", file);
7709 break;
7710 default:
7711 gcc_unreachable ();
7712 }
7713 }
7714
7715 /* In the name of slightly smaller debug output, and to cater to
7716 general assembler lossage, recognize PIC+GOTOFF and turn it back
7717 into a direct symbol reference.
7718
7719 On Darwin, this is necessary to avoid a crash, because Darwin
7720 has a different PIC label for each routine but the DWARF debugging
7721 information is not associated with any particular routine, so it's
7722 necessary to remove references to the PIC label from RTL stored by
7723 the DWARF output code. */
7724
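/* Schematically, something like
   (plus pic_reg (const (unspec [sym] UNSPEC_GOTOFF)))
   is turned back into plain sym here, plus any register or constant addend
   that was attached to the original address.  */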
7725 static rtx
7726 ix86_delegitimize_address (rtx orig_x)
7727 {
7728 rtx x = orig_x;
7729 /* reg_addend is NULL or a multiple of some register. */
7730 rtx reg_addend = NULL_RTX;
7731 /* const_addend is NULL or a const_int. */
7732 rtx const_addend = NULL_RTX;
7733 /* This is the result, or NULL. */
7734 rtx result = NULL_RTX;
7735
7736 if (MEM_P (x))
7737 x = XEXP (x, 0);
7738
7739 if (TARGET_64BIT)
7740 {
7741 if (GET_CODE (x) != CONST
7742 || GET_CODE (XEXP (x, 0)) != UNSPEC
7743 || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
7744 || !MEM_P (orig_x))
7745 return orig_x;
7746 return XVECEXP (XEXP (x, 0), 0, 0);
7747 }
7748
7749 if (GET_CODE (x) != PLUS
7750 || GET_CODE (XEXP (x, 1)) != CONST)
7751 return orig_x;
7752
7753 if (REG_P (XEXP (x, 0))
7754 && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM)
7755 /* %ebx + GOT/GOTOFF */
7756 ;
7757 else if (GET_CODE (XEXP (x, 0)) == PLUS)
7758 {
7759 /* %ebx + %reg * scale + GOT/GOTOFF */
7760 reg_addend = XEXP (x, 0);
7761 if (REG_P (XEXP (reg_addend, 0))
7762 && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM)
7763 reg_addend = XEXP (reg_addend, 1);
7764 else if (REG_P (XEXP (reg_addend, 1))
7765 && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM)
7766 reg_addend = XEXP (reg_addend, 0);
7767 else
7768 return orig_x;
7769 if (!REG_P (reg_addend)
7770 && GET_CODE (reg_addend) != MULT
7771 && GET_CODE (reg_addend) != ASHIFT)
7772 return orig_x;
7773 }
7774 else
7775 return orig_x;
7776
7777 x = XEXP (XEXP (x, 1), 0);
7778 if (GET_CODE (x) == PLUS
7779 && CONST_INT_P (XEXP (x, 1)))
7780 {
7781 const_addend = XEXP (x, 1);
7782 x = XEXP (x, 0);
7783 }
7784
7785 if (GET_CODE (x) == UNSPEC
7786 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
7787 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
7788 result = XVECEXP (x, 0, 0);
7789
7790 if (TARGET_MACHO && darwin_local_data_pic (x)
7791 && !MEM_P (orig_x))
7792 result = XEXP (x, 0);
7793
7794 if (! result)
7795 return orig_x;
7796
7797 if (const_addend)
7798 result = gen_rtx_PLUS (Pmode, result, const_addend);
7799 if (reg_addend)
7800 result = gen_rtx_PLUS (Pmode, reg_addend, result);
7801 return result;
7802 }
7803 \f
7804 static void
7805 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
7806 int fp, FILE *file)
7807 {
7808 const char *suffix;
7809
7810 if (mode == CCFPmode || mode == CCFPUmode)
7811 {
7812 enum rtx_code second_code, bypass_code;
7813 ix86_fp_comparison_codes (code, &bypass_code, &code, &second_code);
7814 gcc_assert (bypass_code == UNKNOWN && second_code == UNKNOWN);
7815 code = ix86_fp_compare_code_to_integer (code);
7816 mode = CCmode;
7817 }
7818 if (reverse)
7819 code = reverse_condition (code);
7820
7821 switch (code)
7822 {
7823 case EQ:
7824 suffix = "e";
7825 break;
7826 case NE:
7827 suffix = "ne";
7828 break;
7829 case GT:
7830 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
7831 suffix = "g";
7832 break;
7833 case GTU:
7834 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
7835 Those same assemblers have the same but opposite lossage on cmov. */
7836 gcc_assert (mode == CCmode);
7837 suffix = fp ? "nbe" : "a";
7838 break;
7839 case LT:
7840 switch (mode)
7841 {
7842 case CCNOmode:
7843 case CCGOCmode:
7844 suffix = "s";
7845 break;
7846
7847 case CCmode:
7848 case CCGCmode:
7849 suffix = "l";
7850 break;
7851
7852 default:
7853 gcc_unreachable ();
7854 }
7855 break;
7856 case LTU:
7857 gcc_assert (mode == CCmode);
7858 suffix = "b";
7859 break;
7860 case GE:
7861 switch (mode)
7862 {
7863 case CCNOmode:
7864 case CCGOCmode:
7865 suffix = "ns";
7866 break;
7867
7868 case CCmode:
7869 case CCGCmode:
7870 suffix = "ge";
7871 break;
7872
7873 default:
7874 gcc_unreachable ();
7875 }
7876 break;
7877 case GEU:
7878 /* ??? As above. */
7879 gcc_assert (mode == CCmode);
7880 suffix = fp ? "nb" : "ae";
7881 break;
7882 case LE:
7883 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
7884 suffix = "le";
7885 break;
7886 case LEU:
7887 gcc_assert (mode == CCmode);
7888 suffix = "be";
7889 break;
7890 case UNORDERED:
7891 suffix = fp ? "u" : "p";
7892 break;
7893 case ORDERED:
7894 suffix = fp ? "nu" : "np";
7895 break;
7896 default:
7897 gcc_unreachable ();
7898 }
7899 fputs (suffix, file);
7900 }
7901
7902 /* Print the name of register X to FILE based on its machine mode and number.
7903 If CODE is 'w', pretend the mode is HImode.
7904 If CODE is 'b', pretend the mode is QImode.
7905 If CODE is 'k', pretend the mode is SImode.
7906 If CODE is 'q', pretend the mode is DImode.
7907 If CODE is 'h', pretend the reg is the 'high' byte register.
7908 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. */
7909
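/* Internally the code letter is normalized to an operand size below:
   1 = QImode byte, 2 = HImode word, 4 = SImode dword, 8 = DImode qword,
   with 0 standing for a "high" byte register and 3 for an x87 stack name;
   anything else falls back to the size of the operand's mode.  */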
7910 void
7911 print_reg (rtx x, int code, FILE *file)
7912 {
7913 gcc_assert (REGNO (x) != ARG_POINTER_REGNUM
7914 && REGNO (x) != FRAME_POINTER_REGNUM
7915 && REGNO (x) != FLAGS_REG
7916 && REGNO (x) != FPSR_REG
7917 && REGNO (x) != FPCR_REG);
7918
7919 if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0)
7920 putc ('%', file);
7921
7922 if (code == 'w' || MMX_REG_P (x))
7923 code = 2;
7924 else if (code == 'b')
7925 code = 1;
7926 else if (code == 'k')
7927 code = 4;
7928 else if (code == 'q')
7929 code = 8;
7930 else if (code == 'y')
7931 code = 3;
7932 else if (code == 'h')
7933 code = 0;
7934 else
7935 code = GET_MODE_SIZE (GET_MODE (x));
7936
7937 /* Irritatingly, the AMD extended registers use a different naming
7938 convention from the normal registers. */
7939 if (REX_INT_REG_P (x))
7940 {
7941 gcc_assert (TARGET_64BIT);
7942 switch (code)
7943 {
7944 case 0:
7945 error ("extended registers have no high halves");
7946 break;
7947 case 1:
7948 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
7949 break;
7950 case 2:
7951 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
7952 break;
7953 case 4:
7954 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
7955 break;
7956 case 8:
7957 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
7958 break;
7959 default:
7960 error ("unsupported operand size for extended register");
7961 break;
7962 }
7963 return;
7964 }
7965 switch (code)
7966 {
7967 case 3:
7968 if (STACK_TOP_P (x))
7969 {
7970 fputs ("st(0)", file);
7971 break;
7972 }
7973 /* FALLTHRU */
7974 case 8:
7975 case 4:
7976 case 12:
7977 if (! ANY_FP_REG_P (x))
7978 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
7979 /* FALLTHRU */
7980 case 16:
7981 case 2:
7982 normal:
7983 fputs (hi_reg_name[REGNO (x)], file);
7984 break;
7985 case 1:
7986 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
7987 goto normal;
7988 fputs (qi_reg_name[REGNO (x)], file);
7989 break;
7990 case 0:
7991 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
7992 goto normal;
7993 fputs (qi_high_reg_name[REGNO (x)], file);
7994 break;
7995 default:
7996 gcc_unreachable ();
7997 }
7998 }
7999
8000 /* Locate some local-dynamic symbol still in use by this function
8001 so that we can print its name in some tls_local_dynamic_base
8002 pattern. */
8003
8004 static const char *
8005 get_some_local_dynamic_name (void)
8006 {
8007 rtx insn;
8008
8009 if (cfun->machine->some_ld_name)
8010 return cfun->machine->some_ld_name;
8011
8012 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
8013 if (INSN_P (insn)
8014 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
8015 return cfun->machine->some_ld_name;
8016
8017 gcc_unreachable ();
8018 }
8019
8020 static int
8021 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
8022 {
8023 rtx x = *px;
8024
8025 if (GET_CODE (x) == SYMBOL_REF
8026 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
8027 {
8028 cfun->machine->some_ld_name = XSTR (x, 0);
8029 return 1;
8030 }
8031
8032 return 0;
8033 }
8034
8035 /* Meaning of CODE:
8036 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
8037 C -- print opcode suffix for set/cmov insn.
8038 c -- like C, but print reversed condition
8039 F,f -- likewise, but for floating-point.
8040 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
8041 otherwise nothing
8042 R -- print the prefix for register names.
8043 z -- print the opcode suffix for the size of the current operand.
8044 * -- print a star (in certain assembler syntax)
8045 A -- print an absolute memory reference.
8046 w -- print the operand as if it's a "word" (HImode) even if it isn't.
8047 s -- print a shift double count, followed by the assembler's argument
8048 delimiter.
8049 b -- print the QImode name of the register for the indicated operand.
8050 %b0 would print %al if operands[0] is reg 0.
8051 w -- likewise, print the HImode name of the register.
8052 k -- likewise, print the SImode name of the register.
8053 q -- likewise, print the DImode name of the register.
8054 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
8055 y -- print "st(0)" instead of "st" as a register.
8056 D -- print condition for SSE cmp instruction.
8057 P -- if PIC, print an @PLT suffix.
8058 X -- don't print any sort of PIC '@' suffix for a symbol.
8059 & -- print some in-use local-dynamic symbol name.
8060 H -- print a memory address offset by 8; used for sse high-parts
8061 */
8062
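/* For example, if operands[0] is the ax register, "%b0" prints %al,
   "%w0" prints %ax, "%k0" prints %eax, "%q0" prints %rax and "%h0" prints
   %ah (shown here in AT&T syntax).  */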
8063 void
8064 print_operand (FILE *file, rtx x, int code)
8065 {
8066 if (code)
8067 {
8068 switch (code)
8069 {
8070 case '*':
8071 if (ASSEMBLER_DIALECT == ASM_ATT)
8072 putc ('*', file);
8073 return;
8074
8075 case '&':
8076 assemble_name (file, get_some_local_dynamic_name ());
8077 return;
8078
8079 case 'A':
8080 switch (ASSEMBLER_DIALECT)
8081 {
8082 case ASM_ATT:
8083 putc ('*', file);
8084 break;
8085
8086 case ASM_INTEL:
8087 /* Intel syntax. For absolute addresses, registers should not
8088 be surrounded by brackets. */
8089 if (!REG_P (x))
8090 {
8091 putc ('[', file);
8092 PRINT_OPERAND (file, x, 0);
8093 putc (']', file);
8094 return;
8095 }
8096 break;
8097
8098 default:
8099 gcc_unreachable ();
8100 }
8101
8102 PRINT_OPERAND (file, x, 0);
8103 return;
8104
8105
8106 case 'L':
8107 if (ASSEMBLER_DIALECT == ASM_ATT)
8108 putc ('l', file);
8109 return;
8110
8111 case 'W':
8112 if (ASSEMBLER_DIALECT == ASM_ATT)
8113 putc ('w', file);
8114 return;
8115
8116 case 'B':
8117 if (ASSEMBLER_DIALECT == ASM_ATT)
8118 putc ('b', file);
8119 return;
8120
8121 case 'Q':
8122 if (ASSEMBLER_DIALECT == ASM_ATT)
8123 putc ('l', file);
8124 return;
8125
8126 case 'S':
8127 if (ASSEMBLER_DIALECT == ASM_ATT)
8128 putc ('s', file);
8129 return;
8130
8131 case 'T':
8132 if (ASSEMBLER_DIALECT == ASM_ATT)
8133 putc ('t', file);
8134 return;
8135
8136 case 'z':
8137 /* 387 opcodes don't get size suffixes if the operands are
8138 registers. */
8139 if (STACK_REG_P (x))
8140 return;
8141
8142 /* Likewise if using Intel opcodes. */
8143 if (ASSEMBLER_DIALECT == ASM_INTEL)
8144 return;
8145
8146 /* Derive the opcode suffix from the size of the operand. */
8147 switch (GET_MODE_SIZE (GET_MODE (x)))
8148 {
8149 case 1:
8150 putc ('b', file);
8151 return;
8152
8153 case 2:
8154 #ifdef HAVE_GAS_FILDS_FISTS
8155 putc ('s', file);
8156 #endif
8157 return;
8158
8159 case 4:
8160 if (GET_MODE (x) == SFmode)
8161 {
8162 putc ('s', file);
8163 return;
8164 }
8165 else
8166 putc ('l', file);
8167 return;
8168
8169 case 12:
8170 case 16:
8171 putc ('t', file);
8172 return;
8173
8174 case 8:
8175 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
8176 {
8177 #ifdef GAS_MNEMONICS
8178 putc ('q', file);
8179 #else
8180 putc ('l', file);
8181 putc ('l', file);
8182 #endif
8183 }
8184 else
8185 putc ('l', file);
8186 return;
8187
8188 default:
8189 gcc_unreachable ();
8190 }
8191
8192 case 'b':
8193 case 'w':
8194 case 'k':
8195 case 'q':
8196 case 'h':
8197 case 'y':
8198 case 'X':
8199 case 'P':
8200 break;
8201
8202 case 's':
8203 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
8204 {
8205 PRINT_OPERAND (file, x, 0);
8206 putc (',', file);
8207 }
8208 return;
8209
8210 case 'D':
8211 /* A little bit of braindamage here. The SSE compare instructions
8212 use completely different names for the comparisons than the
8213 fp conditional moves do. */
8214 switch (GET_CODE (x))
8215 {
8216 case EQ:
8217 case UNEQ:
8218 fputs ("eq", file);
8219 break;
8220 case LT:
8221 case UNLT:
8222 fputs ("lt", file);
8223 break;
8224 case LE:
8225 case UNLE:
8226 fputs ("le", file);
8227 break;
8228 case UNORDERED:
8229 fputs ("unord", file);
8230 break;
8231 case NE:
8232 case LTGT:
8233 fputs ("neq", file);
8234 break;
8235 case UNGE:
8236 case GE:
8237 fputs ("nlt", file);
8238 break;
8239 case UNGT:
8240 case GT:
8241 fputs ("nle", file);
8242 break;
8243 case ORDERED:
8244 fputs ("ord", file);
8245 break;
8246 default:
8247 gcc_unreachable ();
8248 }
8249 return;
8250 case 'O':
8251 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8252 if (ASSEMBLER_DIALECT == ASM_ATT)
8253 {
8254 switch (GET_MODE (x))
8255 {
8256 case HImode: putc ('w', file); break;
8257 case SImode:
8258 case SFmode: putc ('l', file); break;
8259 case DImode:
8260 case DFmode: putc ('q', file); break;
8261 default: gcc_unreachable ();
8262 }
8263 putc ('.', file);
8264 }
8265 #endif
8266 return;
8267 case 'C':
8268 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
8269 return;
8270 case 'F':
8271 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8272 if (ASSEMBLER_DIALECT == ASM_ATT)
8273 putc ('.', file);
8274 #endif
8275 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
8276 return;
8277
8278 /* Like above, but reverse condition */
8279 case 'c':
8280 /* Check to see if argument to %c is really a constant
8281 and not a condition code which needs to be reversed. */
8282 if (!COMPARISON_P (x))
8283 {
8284 output_operand_lossage ("operand is neither a constant nor a condition code, invalid operand code 'c'");
8285 return;
8286 }
8287 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
8288 return;
8289 case 'f':
8290 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8291 if (ASSEMBLER_DIALECT == ASM_ATT)
8292 putc ('.', file);
8293 #endif
8294 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
8295 return;
8296
8297 case 'H':
8298 /* It doesn't actually matter what mode we use here, as we're
8299 only going to use this for printing. */
8300 x = adjust_address_nv (x, DImode, 8);
8301 break;
8302
8303 case '+':
8304 {
8305 rtx x;
8306
8307 if (!optimize || optimize_size || !TARGET_BRANCH_PREDICTION_HINTS)
8308 return;
8309
8310 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
8311 if (x)
8312 {
8313 int pred_val = INTVAL (XEXP (x, 0));
8314
8315 if (pred_val < REG_BR_PROB_BASE * 45 / 100
8316 || pred_val > REG_BR_PROB_BASE * 55 / 100)
8317 {
8318 int taken = pred_val > REG_BR_PROB_BASE / 2;
8319 int cputaken = final_forward_branch_p (current_output_insn) == 0;
8320
8321 /* Emit hints only when the default branch prediction
8322 heuristics would fail. */
8323 if (taken != cputaken)
8324 {
8325 /* We use 3e (DS) prefix for taken branches and
8326 2e (CS) prefix for not taken branches. */
8327 if (taken)
8328 fputs ("ds ; ", file);
8329 else
8330 fputs ("cs ; ", file);
8331 }
8332 }
8333 }
8334 return;
8335 }
8336 default:
8337 output_operand_lossage ("invalid operand code '%c'", code);
8338 }
8339 }
8340
8341 if (REG_P (x))
8342 print_reg (x, code, file);
8343
8344 else if (MEM_P (x))
8345 {
8346 /* No `byte ptr' prefix for call instructions. */
8347 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
8348 {
8349 const char * size;
8350 switch (GET_MODE_SIZE (GET_MODE (x)))
8351 {
8352 case 1: size = "BYTE"; break;
8353 case 2: size = "WORD"; break;
8354 case 4: size = "DWORD"; break;
8355 case 8: size = "QWORD"; break;
8356 case 12: size = "XWORD"; break;
8357 case 16: size = "XMMWORD"; break;
8358 default:
8359 gcc_unreachable ();
8360 }
8361
8362 /* Check for explicit size override (codes 'b', 'w' and 'k') */
8363 if (code == 'b')
8364 size = "BYTE";
8365 else if (code == 'w')
8366 size = "WORD";
8367 else if (code == 'k')
8368 size = "DWORD";
8369
8370 fputs (size, file);
8371 fputs (" PTR ", file);
8372 }
8373
8374 x = XEXP (x, 0);
8375 /* Avoid (%rip) for call operands. */
8376 if (CONSTANT_ADDRESS_P (x) && code == 'P'
8377 && !CONST_INT_P (x))
8378 output_addr_const (file, x);
8379 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
8380 output_operand_lossage ("invalid constraints for operand");
8381 else
8382 output_address (x);
8383 }
8384
8385 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
8386 {
8387 REAL_VALUE_TYPE r;
8388 long l;
8389
8390 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8391 REAL_VALUE_TO_TARGET_SINGLE (r, l);
8392
8393 if (ASSEMBLER_DIALECT == ASM_ATT)
8394 putc ('$', file);
8395 fprintf (file, "0x%08lx", l);
8396 }
8397
8398 /* These float cases don't actually occur as immediate operands. */
8399 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
8400 {
8401 char dstr[30];
8402
8403 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8404 fprintf (file, "%s", dstr);
8405 }
8406
8407 else if (GET_CODE (x) == CONST_DOUBLE
8408 && GET_MODE (x) == XFmode)
8409 {
8410 char dstr[30];
8411
8412 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8413 fprintf (file, "%s", dstr);
8414 }
8415
8416 else
8417 {
8418 /* We have patterns that allow zero sets of memory, for instance.
8419 In 64-bit mode, we should probably support all 8-byte vectors,
8420 since we can in fact encode that into an immediate. */
8421 if (GET_CODE (x) == CONST_VECTOR)
8422 {
8423 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
8424 x = const0_rtx;
8425 }
8426
8427 if (code != 'P')
8428 {
8429 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
8430 {
8431 if (ASSEMBLER_DIALECT == ASM_ATT)
8432 putc ('$', file);
8433 }
8434 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
8435 || GET_CODE (x) == LABEL_REF)
8436 {
8437 if (ASSEMBLER_DIALECT == ASM_ATT)
8438 putc ('$', file);
8439 else
8440 fputs ("OFFSET FLAT:", file);
8441 }
8442 }
8443 if (CONST_INT_P (x))
8444 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8445 else if (flag_pic)
8446 output_pic_addr_const (file, x, code);
8447 else
8448 output_addr_const (file, x);
8449 }
8450 }
8451 \f
8452 /* Print a memory operand whose address is ADDR. */
8453
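/* Roughly, the same address is printed as "disp(base,index,scale)" in AT&T
   syntax and as "[base+index*scale+disp]" in Intel syntax; the code below
   also handles segment overrides and the 64-bit RIP-relative special case.  */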
8454 void
8455 print_operand_address (FILE *file, rtx addr)
8456 {
8457 struct ix86_address parts;
8458 rtx base, index, disp;
8459 int scale;
8460 int ok = ix86_decompose_address (addr, &parts);
8461
8462 gcc_assert (ok);
8463
8464 base = parts.base;
8465 index = parts.index;
8466 disp = parts.disp;
8467 scale = parts.scale;
8468
8469 switch (parts.seg)
8470 {
8471 case SEG_DEFAULT:
8472 break;
8473 case SEG_FS:
8474 case SEG_GS:
8475 if (USER_LABEL_PREFIX[0] == 0)
8476 putc ('%', file);
8477 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
8478 break;
8479 default:
8480 gcc_unreachable ();
8481 }
8482
8483 if (!base && !index)
8484 {
8485 /* A displacement-only address requires special attention. */
8486
8487 if (CONST_INT_P (disp))
8488 {
8489 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
8490 {
8491 if (USER_LABEL_PREFIX[0] == 0)
8492 putc ('%', file);
8493 fputs ("ds:", file);
8494 }
8495 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
8496 }
8497 else if (flag_pic)
8498 output_pic_addr_const (file, disp, 0);
8499 else
8500 output_addr_const (file, disp);
8501
8502 /* Use one byte shorter RIP relative addressing for 64bit mode. */
8503 if (TARGET_64BIT)
8504 {
8505 if (GET_CODE (disp) == CONST
8506 && GET_CODE (XEXP (disp, 0)) == PLUS
8507 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8508 disp = XEXP (XEXP (disp, 0), 0);
8509 if (GET_CODE (disp) == LABEL_REF
8510 || (GET_CODE (disp) == SYMBOL_REF
8511 && SYMBOL_REF_TLS_MODEL (disp) == 0))
8512 fputs ("(%rip)", file);
8513 }
8514 }
8515 else
8516 {
8517 if (ASSEMBLER_DIALECT == ASM_ATT)
8518 {
8519 if (disp)
8520 {
8521 if (flag_pic)
8522 output_pic_addr_const (file, disp, 0);
8523 else if (GET_CODE (disp) == LABEL_REF)
8524 output_asm_label (disp);
8525 else
8526 output_addr_const (file, disp);
8527 }
8528
8529 putc ('(', file);
8530 if (base)
8531 print_reg (base, 0, file);
8532 if (index)
8533 {
8534 putc (',', file);
8535 print_reg (index, 0, file);
8536 if (scale != 1)
8537 fprintf (file, ",%d", scale);
8538 }
8539 putc (')', file);
8540 }
8541 else
8542 {
8543 rtx offset = NULL_RTX;
8544
8545 if (disp)
8546 {
8547 /* Pull out the offset of a symbol; print any symbol itself. */
8548 if (GET_CODE (disp) == CONST
8549 && GET_CODE (XEXP (disp, 0)) == PLUS
8550 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8551 {
8552 offset = XEXP (XEXP (disp, 0), 1);
8553 disp = gen_rtx_CONST (VOIDmode,
8554 XEXP (XEXP (disp, 0), 0));
8555 }
8556
8557 if (flag_pic)
8558 output_pic_addr_const (file, disp, 0);
8559 else if (GET_CODE (disp) == LABEL_REF)
8560 output_asm_label (disp);
8561 else if (CONST_INT_P (disp))
8562 offset = disp;
8563 else
8564 output_addr_const (file, disp);
8565 }
8566
8567 putc ('[', file);
8568 if (base)
8569 {
8570 print_reg (base, 0, file);
8571 if (offset)
8572 {
8573 if (INTVAL (offset) >= 0)
8574 putc ('+', file);
8575 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8576 }
8577 }
8578 else if (offset)
8579 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8580 else
8581 putc ('0', file);
8582
8583 if (index)
8584 {
8585 putc ('+', file);
8586 print_reg (index, 0, file);
8587 if (scale != 1)
8588 fprintf (file, "*%d", scale);
8589 }
8590 putc (']', file);
8591 }
8592 }
8593 }
8594
8595 bool
8596 output_addr_const_extra (FILE *file, rtx x)
8597 {
8598 rtx op;
8599
8600 if (GET_CODE (x) != UNSPEC)
8601 return false;
8602
8603 op = XVECEXP (x, 0, 0);
8604 switch (XINT (x, 1))
8605 {
8606 case UNSPEC_GOTTPOFF:
8607 output_addr_const (file, op);
8608 /* FIXME: This might be @TPOFF in Sun ld. */
8609 fputs ("@GOTTPOFF", file);
8610 break;
8611 case UNSPEC_TPOFF:
8612 output_addr_const (file, op);
8613 fputs ("@TPOFF", file);
8614 break;
8615 case UNSPEC_NTPOFF:
8616 output_addr_const (file, op);
8617 if (TARGET_64BIT)
8618 fputs ("@TPOFF", file);
8619 else
8620 fputs ("@NTPOFF", file);
8621 break;
8622 case UNSPEC_DTPOFF:
8623 output_addr_const (file, op);
8624 fputs ("@DTPOFF", file);
8625 break;
8626 case UNSPEC_GOTNTPOFF:
8627 output_addr_const (file, op);
8628 if (TARGET_64BIT)
8629 fputs ("@GOTTPOFF(%rip)", file);
8630 else
8631 fputs ("@GOTNTPOFF", file);
8632 break;
8633 case UNSPEC_INDNTPOFF:
8634 output_addr_const (file, op);
8635 fputs ("@INDNTPOFF", file);
8636 break;
8637
8638 default:
8639 return false;
8640 }
8641
8642 return true;
8643 }
8644 \f
8645 /* Split one or more DImode RTL references into pairs of SImode
8646 references. The RTL can be REG, offsettable MEM, integer constant, or
8647 CONST_DOUBLE. "operands" is a pointer to an array of DImode RTL to
8648 split and "num" is its length. lo_half and hi_half are output arrays
8649 that parallel "operands". */
8650
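/* As a small example, a DImode register operand splits into SImode subregs at
   byte offsets 0 and 4, and an offsettable DImode MEM splits into two SImode
   MEMs at those offsets; the low word comes first on this little-endian
   target.  */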
8651 void
8652 split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8653 {
8654 while (num--)
8655 {
8656 rtx op = operands[num];
8657
8658 /* simplify_subreg refuses to split volatile memory addresses,
8659 but we still have to handle them. */
8660 if (MEM_P (op))
8661 {
8662 lo_half[num] = adjust_address (op, SImode, 0);
8663 hi_half[num] = adjust_address (op, SImode, 4);
8664 }
8665 else
8666 {
8667 lo_half[num] = simplify_gen_subreg (SImode, op,
8668 GET_MODE (op) == VOIDmode
8669 ? DImode : GET_MODE (op), 0);
8670 hi_half[num] = simplify_gen_subreg (SImode, op,
8671 GET_MODE (op) == VOIDmode
8672 ? DImode : GET_MODE (op), 4);
8673 }
8674 }
8675 }
8676 /* Split one or more TImode RTL references into pairs of DImode
8677 references. The RTL can be REG, offsettable MEM, integer constant, or
8678 CONST_DOUBLE. "operands" is a pointer to an array of TImode RTL to
8679 split and "num" is its length. lo_half and hi_half are output arrays
8680 that parallel "operands". */
8681
8682 void
8683 split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8684 {
8685 while (num--)
8686 {
8687 rtx op = operands[num];
8688
8689 /* simplify_subreg refuses to split volatile memory addresses, but we
8690 still have to handle them. */
8691 if (MEM_P (op))
8692 {
8693 lo_half[num] = adjust_address (op, DImode, 0);
8694 hi_half[num] = adjust_address (op, DImode, 8);
8695 }
8696 else
8697 {
8698 lo_half[num] = simplify_gen_subreg (DImode, op, TImode, 0);
8699 hi_half[num] = simplify_gen_subreg (DImode, op, TImode, 8);
8700 }
8701 }
8702 }
8703 \f
8704 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
8705 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
8706 is the expression of the binary operation. The output may either be
8707 emitted here, or returned to the caller, like all output_* functions.
8708
8709 There is no guarantee that the operands are the same mode, as they
8710 might be within FLOAT or FLOAT_EXTEND expressions. */
8711
8712 #ifndef SYSV386_COMPAT
8713 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
8714 wants to fix the assemblers because that causes incompatibility
8715 with gcc. No-one wants to fix gcc because that causes
8716 incompatibility with assemblers... You can use the option of
8717 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
8718 #define SYSV386_COMPAT 1
8719 #endif
8720
8721 const char *
8722 output_387_binary_op (rtx insn, rtx *operands)
8723 {
8724 static char buf[30];
8725 const char *p;
8726 const char *ssep;
8727 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
8728
8729 #ifdef ENABLE_CHECKING
8730 /* Even if we do not want to check the inputs, this documents the input
8731 constraints, which helps in understanding the following code. */
8732 if (STACK_REG_P (operands[0])
8733 && ((REG_P (operands[1])
8734 && REGNO (operands[0]) == REGNO (operands[1])
8735 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
8736 || (REG_P (operands[2])
8737 && REGNO (operands[0]) == REGNO (operands[2])
8738 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
8739 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
8740 ; /* ok */
8741 else
8742 gcc_assert (is_sse);
8743 #endif
8744
8745 switch (GET_CODE (operands[3]))
8746 {
8747 case PLUS:
8748 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8749 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8750 p = "fiadd";
8751 else
8752 p = "fadd";
8753 ssep = "add";
8754 break;
8755
8756 case MINUS:
8757 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8758 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8759 p = "fisub";
8760 else
8761 p = "fsub";
8762 ssep = "sub";
8763 break;
8764
8765 case MULT:
8766 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8767 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8768 p = "fimul";
8769 else
8770 p = "fmul";
8771 ssep = "mul";
8772 break;
8773
8774 case DIV:
8775 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8776 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8777 p = "fidiv";
8778 else
8779 p = "fdiv";
8780 ssep = "div";
8781 break;
8782
8783 default:
8784 gcc_unreachable ();
8785 }
8786
8787 if (is_sse)
8788 {
8789 strcpy (buf, ssep);
8790 if (GET_MODE (operands[0]) == SFmode)
8791 strcat (buf, "ss\t{%2, %0|%0, %2}");
8792 else
8793 strcat (buf, "sd\t{%2, %0|%0, %2}");
8794 return buf;
8795 }
8796 strcpy (buf, p);
8797
8798 switch (GET_CODE (operands[3]))
8799 {
8800 case MULT:
8801 case PLUS:
8802 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
8803 {
8804 rtx temp = operands[2];
8805 operands[2] = operands[1];
8806 operands[1] = temp;
8807 }
8808
8809       /* We know operands[0] == operands[1].  */
8810
8811 if (MEM_P (operands[2]))
8812 {
8813 p = "%z2\t%2";
8814 break;
8815 }
8816
8817 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
8818 {
8819 if (STACK_TOP_P (operands[0]))
8820 /* How is it that we are storing to a dead operand[2]?
8821 Well, presumably operands[1] is dead too. We can't
8822 store the result to st(0) as st(0) gets popped on this
8823 instruction. Instead store to operands[2] (which I
8824 think has to be st(1)). st(1) will be popped later.
8825 gcc <= 2.8.1 didn't have this check and generated
8826 assembly code that the Unixware assembler rejected. */
8827 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
8828 else
8829 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
8830 break;
8831 }
8832
8833 if (STACK_TOP_P (operands[0]))
8834 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
8835 else
8836 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
8837 break;
8838
8839 case MINUS:
8840 case DIV:
8841 if (MEM_P (operands[1]))
8842 {
8843 p = "r%z1\t%1";
8844 break;
8845 }
8846
8847 if (MEM_P (operands[2]))
8848 {
8849 p = "%z2\t%2";
8850 break;
8851 }
8852
8853 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
8854 {
8855 #if SYSV386_COMPAT
8856 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
8857 derived assemblers, confusingly reverse the direction of
8858 the operation for fsub{r} and fdiv{r} when the
8859 destination register is not st(0). The Intel assembler
8860 doesn't have this brain damage. Read !SYSV386_COMPAT to
8861 figure out what the hardware really does. */
8862 if (STACK_TOP_P (operands[0]))
8863 p = "{p\t%0, %2|rp\t%2, %0}";
8864 else
8865 p = "{rp\t%2, %0|p\t%0, %2}";
8866 #else
8867 if (STACK_TOP_P (operands[0]))
8868 /* As above for fmul/fadd, we can't store to st(0). */
8869 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
8870 else
8871 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
8872 #endif
8873 break;
8874 }
8875
8876 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
8877 {
8878 #if SYSV386_COMPAT
8879 if (STACK_TOP_P (operands[0]))
8880 p = "{rp\t%0, %1|p\t%1, %0}";
8881 else
8882 p = "{p\t%1, %0|rp\t%0, %1}";
8883 #else
8884 if (STACK_TOP_P (operands[0]))
8885 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
8886 else
8887 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
8888 #endif
8889 break;
8890 }
8891
8892 if (STACK_TOP_P (operands[0]))
8893 {
8894 if (STACK_TOP_P (operands[1]))
8895 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
8896 else
8897 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
8898 break;
8899 }
8900 else if (STACK_TOP_P (operands[1]))
8901 {
8902 #if SYSV386_COMPAT
8903 p = "{\t%1, %0|r\t%0, %1}";
8904 #else
8905 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
8906 #endif
8907 }
8908 else
8909 {
8910 #if SYSV386_COMPAT
8911 p = "{r\t%2, %0|\t%0, %2}";
8912 #else
8913 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
8914 #endif
8915 }
8916 break;
8917
8918 default:
8919 gcc_unreachable ();
8920 }
8921
8922 strcat (buf, p);
8923 return buf;
8924 }
8925
8926 /* Return needed mode for entity in optimize_mode_switching pass. */
8927
8928 int
8929 ix86_mode_needed (int entity, rtx insn)
8930 {
8931 enum attr_i387_cw mode;
8932
8933   /* The mode UNINITIALIZED is used to store the control word after a
8934      function call or ASM pattern.  The mode ANY specifies that the function
8935      has no requirements on the control word and makes no changes to the
8936      bits we are interested in.  */
8937
8938 if (CALL_P (insn)
8939 || (NONJUMP_INSN_P (insn)
8940 && (asm_noperands (PATTERN (insn)) >= 0
8941 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
8942 return I387_CW_UNINITIALIZED;
8943
8944 if (recog_memoized (insn) < 0)
8945 return I387_CW_ANY;
8946
8947 mode = get_attr_i387_cw (insn);
8948
8949 switch (entity)
8950 {
8951 case I387_TRUNC:
8952 if (mode == I387_CW_TRUNC)
8953 return mode;
8954 break;
8955
8956 case I387_FLOOR:
8957 if (mode == I387_CW_FLOOR)
8958 return mode;
8959 break;
8960
8961 case I387_CEIL:
8962 if (mode == I387_CW_CEIL)
8963 return mode;
8964 break;
8965
8966 case I387_MASK_PM:
8967 if (mode == I387_CW_MASK_PM)
8968 return mode;
8969 break;
8970
8971 default:
8972 gcc_unreachable ();
8973 }
8974
8975 return I387_CW_ANY;
8976 }
8977
8978 /* Output code to initialize control word copies used by trunc?f?i and
8979 rounding patterns. CURRENT_MODE is set to current control word,
8980 while NEW_MODE is set to new control word. */
8981
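/* Editorial note, for illustration only: bits 10-11 of the x87 control
   word select the rounding mode, so the I387_CW_TRUNC case in the first
   switch below simply ORs in 0x0c00 (RC = 11b, round toward zero), while
   I387_CW_FLOOR first clears those bits and then sets 0x0400 (RC = 01b,
   round down).  */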
8982 void
8983 emit_i387_cw_initialization (int mode)
8984 {
8985 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
8986 rtx new_mode;
8987
8988 int slot;
8989
8990 rtx reg = gen_reg_rtx (HImode);
8991
8992 emit_insn (gen_x86_fnstcw_1 (stored_mode));
8993 emit_move_insn (reg, copy_rtx (stored_mode));
8994
8995 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL || optimize_size)
8996 {
8997 switch (mode)
8998 {
8999 case I387_CW_TRUNC:
9000 /* round toward zero (truncate) */
9001 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
9002 slot = SLOT_CW_TRUNC;
9003 break;
9004
9005 case I387_CW_FLOOR:
9006 /* round down toward -oo */
9007 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9008 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
9009 slot = SLOT_CW_FLOOR;
9010 break;
9011
9012 case I387_CW_CEIL:
9013 /* round up toward +oo */
9014 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9015 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
9016 slot = SLOT_CW_CEIL;
9017 break;
9018
9019 case I387_CW_MASK_PM:
9020 /* mask precision exception for nearbyint() */
9021 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9022 slot = SLOT_CW_MASK_PM;
9023 break;
9024
9025 default:
9026 gcc_unreachable ();
9027 }
9028 }
9029 else
9030 {
9031 switch (mode)
9032 {
9033 case I387_CW_TRUNC:
9034 /* round toward zero (truncate) */
9035 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
9036 slot = SLOT_CW_TRUNC;
9037 break;
9038
9039 case I387_CW_FLOOR:
9040 /* round down toward -oo */
9041 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
9042 slot = SLOT_CW_FLOOR;
9043 break;
9044
9045 case I387_CW_CEIL:
9046 /* round up toward +oo */
9047 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
9048 slot = SLOT_CW_CEIL;
9049 break;
9050
9051 case I387_CW_MASK_PM:
9052 /* mask precision exception for nearbyint() */
9053 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9054 slot = SLOT_CW_MASK_PM;
9055 break;
9056
9057 default:
9058 gcc_unreachable ();
9059 }
9060 }
9061
9062 gcc_assert (slot < MAX_386_STACK_LOCALS);
9063
9064 new_mode = assign_386_stack_local (HImode, slot);
9065 emit_move_insn (new_mode, reg);
9066 }
9067
9068 /* Output code for INSN to convert a float to a signed int. OPERANDS
9069 are the insn operands. The output may be [HSD]Imode and the input
9070 operand may be [SDX]Fmode. */
9071
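/* Editorial illustration, not part of the original source: for a DImode
   result whose stack top stays live and a non-default rounding mode, the
   routine below emits roughly

	fld	%st(0)
	fldcw	<new control word slot>
	fistpll	<destination>
	fldcw	<saved control word slot>

   i.e. the value is duplicated first because only the popping store
   exists for 64-bit integers.  */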
9072 const char *
9073 output_fix_trunc (rtx insn, rtx *operands, int fisttp)
9074 {
9075 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9076 int dimode_p = GET_MODE (operands[0]) == DImode;
9077 int round_mode = get_attr_i387_cw (insn);
9078
9079 /* Jump through a hoop or two for DImode, since the hardware has no
9080 non-popping instruction. We used to do this a different way, but
9081 that was somewhat fragile and broke with post-reload splitters. */
9082 if ((dimode_p || fisttp) && !stack_top_dies)
9083 output_asm_insn ("fld\t%y1", operands);
9084
9085 gcc_assert (STACK_TOP_P (operands[1]));
9086 gcc_assert (MEM_P (operands[0]));
9087
9088 if (fisttp)
9089 output_asm_insn ("fisttp%z0\t%0", operands);
9090 else
9091 {
9092 if (round_mode != I387_CW_ANY)
9093 output_asm_insn ("fldcw\t%3", operands);
9094 if (stack_top_dies || dimode_p)
9095 output_asm_insn ("fistp%z0\t%0", operands);
9096 else
9097 output_asm_insn ("fist%z0\t%0", operands);
9098 if (round_mode != I387_CW_ANY)
9099 output_asm_insn ("fldcw\t%2", operands);
9100 }
9101
9102 return "";
9103 }
9104
9105 /* Output code for x87 ffreep insn. The OPNO argument, which may only
9106 have the values zero or one, indicates the ffreep insn's operand
9107 from the OPERANDS array. */
9108
9109 static const char *
9110 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
9111 {
9112 if (TARGET_USE_FFREEP)
9113 #if HAVE_AS_IX86_FFREEP
9114 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
9115 #else
9116 {
9117 static char retval[] = ".word\t0xc_df";
9118 int regno = REGNO (operands[opno]);
9119
9120 gcc_assert (FP_REGNO_P (regno));
9121
9122 retval[9] = '0' + (regno - FIRST_STACK_REG);
9123 return retval;
9124 }
9125 #endif
9126
9127 return opno ? "fstp\t%y1" : "fstp\t%y0";
9128 }
9129
9130
9131 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
9132 should be used. UNORDERED_P is true when fucom should be used. */
9133
9134 const char *
9135 output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p)
9136 {
9137 int stack_top_dies;
9138 rtx cmp_op0, cmp_op1;
9139 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
9140
9141 if (eflags_p)
9142 {
9143 cmp_op0 = operands[0];
9144 cmp_op1 = operands[1];
9145 }
9146 else
9147 {
9148 cmp_op0 = operands[1];
9149 cmp_op1 = operands[2];
9150 }
9151
9152 if (is_sse)
9153 {
9154 if (GET_MODE (operands[0]) == SFmode)
9155 if (unordered_p)
9156 return "ucomiss\t{%1, %0|%0, %1}";
9157 else
9158 return "comiss\t{%1, %0|%0, %1}";
9159 else
9160 if (unordered_p)
9161 return "ucomisd\t{%1, %0|%0, %1}";
9162 else
9163 return "comisd\t{%1, %0|%0, %1}";
9164 }
9165
9166 gcc_assert (STACK_TOP_P (cmp_op0));
9167
9168 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9169
9170 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
9171 {
9172 if (stack_top_dies)
9173 {
9174 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
9175 return output_387_ffreep (operands, 1);
9176 }
9177 else
9178 return "ftst\n\tfnstsw\t%0";
9179 }
9180
9181 if (STACK_REG_P (cmp_op1)
9182 && stack_top_dies
9183 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
9184 && REGNO (cmp_op1) != FIRST_STACK_REG)
9185 {
9186       /* If both the top of the 387 stack and the other operand (also a
9187 	 stack register) die, then this must be a `fcompp' float
9188 	 compare.  */
9189
9190 if (eflags_p)
9191 {
9192 /* There is no double popping fcomi variant. Fortunately,
9193 eflags is immune from the fstp's cc clobbering. */
9194 if (unordered_p)
9195 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
9196 else
9197 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
9198 return output_387_ffreep (operands, 0);
9199 }
9200 else
9201 {
9202 if (unordered_p)
9203 return "fucompp\n\tfnstsw\t%0";
9204 else
9205 return "fcompp\n\tfnstsw\t%0";
9206 }
9207 }
9208 else
9209 {
9210 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
9211
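      /* Editorial illustration: a plain fcomi compare (eflags_p = 1, an FP
	 operand, ordered, stack top stays live) encodes as mask = 8 and
	 selects "fcomi\t{%y1, %0|%0, %y1}" from the table below.  */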
9212 static const char * const alt[16] =
9213 {
9214 "fcom%z2\t%y2\n\tfnstsw\t%0",
9215 "fcomp%z2\t%y2\n\tfnstsw\t%0",
9216 "fucom%z2\t%y2\n\tfnstsw\t%0",
9217 "fucomp%z2\t%y2\n\tfnstsw\t%0",
9218
9219 "ficom%z2\t%y2\n\tfnstsw\t%0",
9220 "ficomp%z2\t%y2\n\tfnstsw\t%0",
9221 NULL,
9222 NULL,
9223
9224 "fcomi\t{%y1, %0|%0, %y1}",
9225 "fcomip\t{%y1, %0|%0, %y1}",
9226 "fucomi\t{%y1, %0|%0, %y1}",
9227 "fucomip\t{%y1, %0|%0, %y1}",
9228
9229 NULL,
9230 NULL,
9231 NULL,
9232 NULL
9233 };
9234
9235 int mask;
9236 const char *ret;
9237
9238 mask = eflags_p << 3;
9239 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
9240 mask |= unordered_p << 1;
9241 mask |= stack_top_dies;
9242
9243 gcc_assert (mask < 16);
9244 ret = alt[mask];
9245 gcc_assert (ret);
9246
9247 return ret;
9248 }
9249 }
9250
9251 void
9252 ix86_output_addr_vec_elt (FILE *file, int value)
9253 {
9254 const char *directive = ASM_LONG;
9255
9256 #ifdef ASM_QUAD
9257 if (TARGET_64BIT)
9258 directive = ASM_QUAD;
9259 #else
9260 gcc_assert (!TARGET_64BIT);
9261 #endif
9262
9263 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
9264 }
9265
9266 void
9267 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
9268 {
9269 if (TARGET_64BIT)
9270 fprintf (file, "%s%s%d-%s%d\n",
9271 ASM_LONG, LPREFIX, value, LPREFIX, rel);
9272 else if (HAVE_AS_GOTOFF_IN_DATA)
9273 fprintf (file, "%s%s%d@GOTOFF\n", ASM_LONG, LPREFIX, value);
9274 #if TARGET_MACHO
9275 else if (TARGET_MACHO)
9276 {
9277 fprintf (file, "%s%s%d-", ASM_LONG, LPREFIX, value);
9278 machopic_output_function_base_name (file);
9279 fprintf(file, "\n");
9280 }
9281 #endif
9282 else
9283 asm_fprintf (file, "%s%U%s+[.-%s%d]\n",
9284 ASM_LONG, GOT_SYMBOL_NAME, LPREFIX, value);
9285 }
9286 \f
9287 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
9288 for the target. */
9289
9290 void
9291 ix86_expand_clear (rtx dest)
9292 {
9293 rtx tmp;
9294
9295 /* We play register width games, which are only valid after reload. */
9296 gcc_assert (reload_completed);
9297
9298 /* Avoid HImode and its attendant prefix byte. */
9299 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
9300 dest = gen_rtx_REG (SImode, REGNO (dest));
9301
9302 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
9303
9304 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
9305 if (reload_completed && (!TARGET_USE_MOV0 || optimize_size))
9306 {
9307 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, 17));
9308 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
9309 }
9310
9311 emit_insn (tmp);
9312 }
9313
9314 /* X is an unchanging MEM. If it is a constant pool reference, return
9315 the constant pool rtx, else NULL. */
9316
9317 rtx
9318 maybe_get_pool_constant (rtx x)
9319 {
9320 x = ix86_delegitimize_address (XEXP (x, 0));
9321
9322 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
9323 return get_pool_constant (x);
9324
9325 return NULL_RTX;
9326 }
9327
9328 void
9329 ix86_expand_move (enum machine_mode mode, rtx operands[])
9330 {
9331 int strict = (reload_in_progress || reload_completed);
9332 rtx op0, op1;
9333 enum tls_model model;
9334
9335 op0 = operands[0];
9336 op1 = operands[1];
9337
9338 if (GET_CODE (op1) == SYMBOL_REF)
9339 {
9340 model = SYMBOL_REF_TLS_MODEL (op1);
9341 if (model)
9342 {
9343 op1 = legitimize_tls_address (op1, model, true);
9344 op1 = force_operand (op1, op0);
9345 if (op1 == op0)
9346 return;
9347 }
9348 }
9349 else if (GET_CODE (op1) == CONST
9350 && GET_CODE (XEXP (op1, 0)) == PLUS
9351 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
9352 {
9353 model = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (op1, 0), 0));
9354 if (model)
9355 {
9356 rtx addend = XEXP (XEXP (op1, 0), 1);
9357 op1 = legitimize_tls_address (XEXP (XEXP (op1, 0), 0), model, true);
9358 op1 = force_operand (op1, NULL);
9359 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
9360 op0, 1, OPTAB_DIRECT);
9361 if (op1 == op0)
9362 return;
9363 }
9364 }
9365
9366 if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode))
9367 {
9368 if (TARGET_MACHO && !TARGET_64BIT)
9369 {
9370 #if TARGET_MACHO
9371 if (MACHOPIC_PURE)
9372 {
9373 rtx temp = ((reload_in_progress
9374 || ((op0 && REG_P (op0))
9375 && mode == Pmode))
9376 ? op0 : gen_reg_rtx (Pmode));
9377 op1 = machopic_indirect_data_reference (op1, temp);
9378 op1 = machopic_legitimize_pic_address (op1, mode,
9379 temp == op1 ? 0 : temp);
9380 }
9381 else if (MACHOPIC_INDIRECT)
9382 op1 = machopic_indirect_data_reference (op1, 0);
9383 if (op0 == op1)
9384 return;
9385 #endif
9386 }
9387 else
9388 {
9389 if (MEM_P (op0))
9390 op1 = force_reg (Pmode, op1);
9391 else
9392 op1 = legitimize_address (op1, op1, Pmode);
9393 }
9394 }
9395 else
9396 {
9397 if (MEM_P (op0)
9398 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
9399 || !push_operand (op0, mode))
9400 && MEM_P (op1))
9401 op1 = force_reg (mode, op1);
9402
9403 if (push_operand (op0, mode)
9404 && ! general_no_elim_operand (op1, mode))
9405 op1 = copy_to_mode_reg (mode, op1);
9406
9407 /* Force large constants in 64bit compilation into register
9408 to get them CSEed. */
9409 if (TARGET_64BIT && mode == DImode
9410 && immediate_operand (op1, mode)
9411 && !x86_64_zext_immediate_operand (op1, VOIDmode)
9412 && !register_operand (op0, mode)
9413 && optimize && !reload_completed && !reload_in_progress)
9414 op1 = copy_to_mode_reg (mode, op1);
9415
9416 if (FLOAT_MODE_P (mode))
9417 {
9418 /* If we are loading a floating point constant to a register,
9419 force the value to memory now, since we'll get better code
9420 out the back end. */
9421
9422 if (strict)
9423 ;
9424 else if (GET_CODE (op1) == CONST_DOUBLE)
9425 {
9426 op1 = validize_mem (force_const_mem (mode, op1));
9427 if (!register_operand (op0, mode))
9428 {
9429 rtx temp = gen_reg_rtx (mode);
9430 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
9431 emit_move_insn (op0, temp);
9432 return;
9433 }
9434 }
9435 }
9436 }
9437
9438 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9439 }
9440
9441 void
9442 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
9443 {
9444 rtx op0 = operands[0], op1 = operands[1];
9445
9446 /* Force constants other than zero into memory. We do not know how
9447 the instructions used to build constants modify the upper 64 bits
9448      of the register; once we have that information we may be able
9449 to handle some of them more efficiently. */
9450 if ((reload_in_progress | reload_completed) == 0
9451 && register_operand (op0, mode)
9452 && CONSTANT_P (op1)
9453 && standard_sse_constant_p (op1) <= 0)
9454 op1 = validize_mem (force_const_mem (mode, op1));
9455
9456 /* Make operand1 a register if it isn't already. */
9457 if (!no_new_pseudos
9458 && !register_operand (op0, mode)
9459 && !register_operand (op1, mode))
9460 {
9461 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
9462 return;
9463 }
9464
9465 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9466 }
9467
9468 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
9469 straight to ix86_expand_vector_move. */
9470
9471 void
9472 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
9473 {
9474 rtx op0, op1, m;
9475
9476 op0 = operands[0];
9477 op1 = operands[1];
9478
9479 if (MEM_P (op1))
9480 {
9481 /* If we're optimizing for size, movups is the smallest. */
9482 if (optimize_size)
9483 {
9484 op0 = gen_lowpart (V4SFmode, op0);
9485 op1 = gen_lowpart (V4SFmode, op1);
9486 emit_insn (gen_sse_movups (op0, op1));
9487 return;
9488 }
9489
9490 /* ??? If we have typed data, then it would appear that using
9491 movdqu is the only way to get unaligned data loaded with
9492 integer type. */
9493 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9494 {
9495 op0 = gen_lowpart (V16QImode, op0);
9496 op1 = gen_lowpart (V16QImode, op1);
9497 emit_insn (gen_sse2_movdqu (op0, op1));
9498 return;
9499 }
9500
9501 if (TARGET_SSE2 && mode == V2DFmode)
9502 {
9503 rtx zero;
9504
9505 if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9506 {
9507 op0 = gen_lowpart (V2DFmode, op0);
9508 op1 = gen_lowpart (V2DFmode, op1);
9509 emit_insn (gen_sse2_movupd (op0, op1));
9510 return;
9511 }
9512
9513 /* When SSE registers are split into halves, we can avoid
9514 writing to the top half twice. */
9515 if (TARGET_SSE_SPLIT_REGS)
9516 {
9517 emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9518 zero = op0;
9519 }
9520 else
9521 {
9522 /* ??? Not sure about the best option for the Intel chips.
9523 The following would seem to satisfy; the register is
9524 entirely cleared, breaking the dependency chain. We
9525 then store to the upper half, with a dependency depth
9526 of one. A rumor has it that Intel recommends two movsd
9527 followed by an unpacklpd, but this is unconfirmed. And
9528 given that the dependency depth of the unpacklpd would
9529 still be one, I'm not sure why this would be better. */
9530 zero = CONST0_RTX (V2DFmode);
9531 }
9532
9533 m = adjust_address (op1, DFmode, 0);
9534 emit_insn (gen_sse2_loadlpd (op0, zero, m));
9535 m = adjust_address (op1, DFmode, 8);
9536 emit_insn (gen_sse2_loadhpd (op0, op0, m));
9537 }
9538 else
9539 {
9540 if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9541 {
9542 op0 = gen_lowpart (V4SFmode, op0);
9543 op1 = gen_lowpart (V4SFmode, op1);
9544 emit_insn (gen_sse_movups (op0, op1));
9545 return;
9546 }
9547
9548 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
9549 emit_move_insn (op0, CONST0_RTX (mode));
9550 else
9551 emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9552
9553 if (mode != V4SFmode)
9554 op0 = gen_lowpart (V4SFmode, op0);
9555 m = adjust_address (op1, V2SFmode, 0);
9556 emit_insn (gen_sse_loadlps (op0, op0, m));
9557 m = adjust_address (op1, V2SFmode, 8);
9558 emit_insn (gen_sse_loadhps (op0, op0, m));
9559 }
9560 }
9561 else if (MEM_P (op0))
9562 {
9563 /* If we're optimizing for size, movups is the smallest. */
9564 if (optimize_size)
9565 {
9566 op0 = gen_lowpart (V4SFmode, op0);
9567 op1 = gen_lowpart (V4SFmode, op1);
9568 emit_insn (gen_sse_movups (op0, op1));
9569 return;
9570 }
9571
9572 /* ??? Similar to above, only less clear because of quote
9573 typeless stores unquote. */
9574 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
9575 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9576 {
9577 op0 = gen_lowpart (V16QImode, op0);
9578 op1 = gen_lowpart (V16QImode, op1);
9579 emit_insn (gen_sse2_movdqu (op0, op1));
9580 return;
9581 }
9582
9583 if (TARGET_SSE2 && mode == V2DFmode)
9584 {
9585 m = adjust_address (op0, DFmode, 0);
9586 emit_insn (gen_sse2_storelpd (m, op1));
9587 m = adjust_address (op0, DFmode, 8);
9588 emit_insn (gen_sse2_storehpd (m, op1));
9589 }
9590 else
9591 {
9592 if (mode != V4SFmode)
9593 op1 = gen_lowpart (V4SFmode, op1);
9594 m = adjust_address (op0, V2SFmode, 0);
9595 emit_insn (gen_sse_storelps (m, op1));
9596 m = adjust_address (op0, V2SFmode, 8);
9597 emit_insn (gen_sse_storehps (m, op1));
9598 }
9599 }
9600 else
9601 gcc_unreachable ();
9602 }
9603
9604 /* Expand a push in MODE. This is some mode for which we do not support
9605 proper push instructions, at least from the registers that we expect
9606 the value to live in. */
9607
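/* Editorial sketch, not part of the original source: for a 16-byte mode
   on ia32 this expands to roughly

	sub	$16, %esp
	<ordinary move of X to (%esp)>

   i.e. an explicit stack-pointer adjustment followed by a plain store.  */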
9608 void
9609 ix86_expand_push (enum machine_mode mode, rtx x)
9610 {
9611 rtx tmp;
9612
9613 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
9614 GEN_INT (-GET_MODE_SIZE (mode)),
9615 stack_pointer_rtx, 1, OPTAB_DIRECT);
9616 if (tmp != stack_pointer_rtx)
9617 emit_move_insn (stack_pointer_rtx, tmp);
9618
9619 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
9620 emit_move_insn (tmp, x);
9621 }
9622
9623 /* Helper function of ix86_fixup_binary_operands to canonicalize
9624 operand order. Returns true if the operands should be swapped. */
9625
9626 static bool
9627 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
9628 rtx operands[])
9629 {
9630 rtx dst = operands[0];
9631 rtx src1 = operands[1];
9632 rtx src2 = operands[2];
9633
9634 /* If the operation is not commutative, we can't do anything. */
9635 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
9636 return false;
9637
9638 /* Highest priority is that src1 should match dst. */
9639 if (rtx_equal_p (dst, src1))
9640 return false;
9641 if (rtx_equal_p (dst, src2))
9642 return true;
9643
9644 /* Next highest priority is that immediate constants come second. */
9645 if (immediate_operand (src2, mode))
9646 return false;
9647 if (immediate_operand (src1, mode))
9648 return true;
9649
9650 /* Lowest priority is that memory references should come second. */
9651 if (MEM_P (src2))
9652 return false;
9653 if (MEM_P (src1))
9654 return true;
9655
9656 return false;
9657 }
9658
9659
9660 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
9661 destination to use for the operation. If different from the true
9662 destination in operands[0], a copy operation will be required. */
9663
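/* Editorial illustration, not part of the original source: for a
   commutative PLUS with hypothetical operands { (reg 60), (mem A), (reg 60) },
   the swap helper above reorders the sources so that operands[1] matches
   the destination; the routine below then only needs to force a source
   into a register when both sources are in memory or operands[1] is a
   non-matching memory or a constant.  */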
9664 rtx
9665 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
9666 rtx operands[])
9667 {
9668 rtx dst = operands[0];
9669 rtx src1 = operands[1];
9670 rtx src2 = operands[2];
9671
9672 /* Canonicalize operand order. */
9673 if (ix86_swap_binary_operands_p (code, mode, operands))
9674 {
9675 rtx temp = src1;
9676 src1 = src2;
9677 src2 = temp;
9678 }
9679
9680 /* Both source operands cannot be in memory. */
9681 if (MEM_P (src1) && MEM_P (src2))
9682 {
9683 /* Optimization: Only read from memory once. */
9684 if (rtx_equal_p (src1, src2))
9685 {
9686 src2 = force_reg (mode, src2);
9687 src1 = src2;
9688 }
9689 else
9690 src2 = force_reg (mode, src2);
9691 }
9692
9693 /* If the destination is memory, and we do not have matching source
9694 operands, do things in registers. */
9695 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
9696 dst = gen_reg_rtx (mode);
9697
9698 /* Source 1 cannot be a constant. */
9699 if (CONSTANT_P (src1))
9700 src1 = force_reg (mode, src1);
9701
9702 /* Source 1 cannot be a non-matching memory. */
9703 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
9704 src1 = force_reg (mode, src1);
9705
9706 operands[1] = src1;
9707 operands[2] = src2;
9708 return dst;
9709 }
9710
9711 /* Similarly, but assume that the destination has already been
9712 set up properly. */
9713
9714 void
9715 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
9716 enum machine_mode mode, rtx operands[])
9717 {
9718 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
9719 gcc_assert (dst == operands[0]);
9720 }
9721
9722 /* Attempt to expand a binary operator. Make the expansion closer to the
9723    actual machine than just general_operand, which will allow 3 separate
9724    memory references (one output, two inputs) in a single insn.  */
9725
9726 void
9727 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
9728 rtx operands[])
9729 {
9730 rtx src1, src2, dst, op, clob;
9731
9732 dst = ix86_fixup_binary_operands (code, mode, operands);
9733 src1 = operands[1];
9734 src2 = operands[2];
9735
9736 /* Emit the instruction. */
9737
9738 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
9739 if (reload_in_progress)
9740 {
9741 /* Reload doesn't know about the flags register, and doesn't know that
9742 it doesn't want to clobber it. We can only do this with PLUS. */
9743 gcc_assert (code == PLUS);
9744 emit_insn (op);
9745 }
9746 else
9747 {
9748 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9749 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
9750 }
9751
9752 /* Fix up the destination if needed. */
9753 if (dst != operands[0])
9754 emit_move_insn (operands[0], dst);
9755 }
9756
9757 /* Return TRUE or FALSE depending on whether the binary operator meets the
9758 appropriate constraints. */
9759
9760 int
9761 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
9762 rtx operands[3])
9763 {
9764 rtx dst = operands[0];
9765 rtx src1 = operands[1];
9766 rtx src2 = operands[2];
9767
9768 /* Both source operands cannot be in memory. */
9769 if (MEM_P (src1) && MEM_P (src2))
9770 return 0;
9771
9772 /* Canonicalize operand order for commutative operators. */
9773 if (ix86_swap_binary_operands_p (code, mode, operands))
9774 {
9775 rtx temp = src1;
9776 src1 = src2;
9777 src2 = temp;
9778 }
9779
9780 /* If the destination is memory, we must have a matching source operand. */
9781 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
9782 return 0;
9783
9784 /* Source 1 cannot be a constant. */
9785 if (CONSTANT_P (src1))
9786 return 0;
9787
9788 /* Source 1 cannot be a non-matching memory. */
9789 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
9790 return 0;
9791
9792 return 1;
9793 }
9794
9795 /* Attempt to expand a unary operator. Make the expansion closer to the
9796    actual machine than just general_operand, which will allow 2 separate
9797 memory references (one output, one input) in a single insn. */
9798
9799 void
9800 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
9801 rtx operands[])
9802 {
9803 int matching_memory;
9804 rtx src, dst, op, clob;
9805
9806 dst = operands[0];
9807 src = operands[1];
9808
9809 /* If the destination is memory, and we do not have matching source
9810 operands, do things in registers. */
9811 matching_memory = 0;
9812 if (MEM_P (dst))
9813 {
9814 if (rtx_equal_p (dst, src))
9815 matching_memory = 1;
9816 else
9817 dst = gen_reg_rtx (mode);
9818 }
9819
9820 /* When source operand is memory, destination must match. */
9821 if (MEM_P (src) && !matching_memory)
9822 src = force_reg (mode, src);
9823
9824 /* Emit the instruction. */
9825
9826 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
9827 if (reload_in_progress || code == NOT)
9828 {
9829 /* Reload doesn't know about the flags register, and doesn't know that
9830 it doesn't want to clobber it. */
9831 gcc_assert (code == NOT);
9832 emit_insn (op);
9833 }
9834 else
9835 {
9836 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9837 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
9838 }
9839
9840 /* Fix up the destination if needed. */
9841 if (dst != operands[0])
9842 emit_move_insn (operands[0], dst);
9843 }
9844
9845 /* Return TRUE or FALSE depending on whether the unary operator meets the
9846 appropriate constraints. */
9847
9848 int
9849 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
9850 enum machine_mode mode ATTRIBUTE_UNUSED,
9851 rtx operands[2] ATTRIBUTE_UNUSED)
9852 {
9853 /* If one of operands is memory, source and destination must match. */
9854 if ((MEM_P (operands[0])
9855 || MEM_P (operands[1]))
9856 && ! rtx_equal_p (operands[0], operands[1]))
9857 return FALSE;
9858 return TRUE;
9859 }
9860
9861 /* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders.
9862 Create a mask for the sign bit in MODE for an SSE register. If VECT is
9863 true, then replicate the mask for all elements of the vector register.
9864 If INVERT is true, then create a mask excluding the sign bit. */
9865
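/* Editorial example, not part of the original source: for DFmode with
   VECT and INVERT both false the result is the V2DFmode constant whose
   low element has only bit 63 set (i.e. -0.0) and whose high element is
   zero; with INVERT true the low element carries the complementary bit
   pattern instead.  */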
9866 rtx
9867 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
9868 {
9869 enum machine_mode vec_mode;
9870 HOST_WIDE_INT hi, lo;
9871 int shift = 63;
9872 rtvec v;
9873 rtx mask;
9874
9875 /* Find the sign bit, sign extended to 2*HWI. */
9876 if (mode == SFmode)
9877 lo = 0x80000000, hi = lo < 0;
9878 else if (HOST_BITS_PER_WIDE_INT >= 64)
9879 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
9880 else
9881 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
9882
9883 if (invert)
9884 lo = ~lo, hi = ~hi;
9885
9886 /* Force this value into the low part of a fp vector constant. */
9887 mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode);
9888 mask = gen_lowpart (mode, mask);
9889
9890 if (mode == SFmode)
9891 {
9892 if (vect)
9893 v = gen_rtvec (4, mask, mask, mask, mask);
9894 else
9895 v = gen_rtvec (4, mask, CONST0_RTX (SFmode),
9896 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
9897 vec_mode = V4SFmode;
9898 }
9899 else
9900 {
9901 if (vect)
9902 v = gen_rtvec (2, mask, mask);
9903 else
9904 v = gen_rtvec (2, mask, CONST0_RTX (DFmode));
9905 vec_mode = V2DFmode;
9906 }
9907
9908 return force_reg (vec_mode, gen_rtx_CONST_VECTOR (vec_mode, v));
9909 }
9910
9911 /* Generate code for floating point ABS or NEG. */
9912
9913 void
9914 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
9915 rtx operands[])
9916 {
9917 rtx mask, set, use, clob, dst, src;
9918 bool matching_memory;
9919 bool use_sse = false;
9920 bool vector_mode = VECTOR_MODE_P (mode);
9921 enum machine_mode elt_mode = mode;
9922
9923 if (vector_mode)
9924 {
9925 elt_mode = GET_MODE_INNER (mode);
9926 use_sse = true;
9927 }
9928 else if (TARGET_SSE_MATH)
9929 use_sse = SSE_FLOAT_MODE_P (mode);
9930
9931 /* NEG and ABS performed with SSE use bitwise mask operations.
9932 Create the appropriate mask now. */
9933 if (use_sse)
9934 mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
9935 else
9936 mask = NULL_RTX;
9937
9938 dst = operands[0];
9939 src = operands[1];
9940
9941 /* If the destination is memory, and we don't have matching source
9942 operands or we're using the x87, do things in registers. */
9943 matching_memory = false;
9944 if (MEM_P (dst))
9945 {
9946 if (use_sse && rtx_equal_p (dst, src))
9947 matching_memory = true;
9948 else
9949 dst = gen_reg_rtx (mode);
9950 }
9951 if (MEM_P (src) && !matching_memory)
9952 src = force_reg (mode, src);
9953
9954 if (vector_mode)
9955 {
9956 set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask);
9957 set = gen_rtx_SET (VOIDmode, dst, set);
9958 emit_insn (set);
9959 }
9960 else
9961 {
9962 set = gen_rtx_fmt_e (code, mode, src);
9963 set = gen_rtx_SET (VOIDmode, dst, set);
9964 if (mask)
9965 {
9966 use = gen_rtx_USE (VOIDmode, mask);
9967 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9968 emit_insn (gen_rtx_PARALLEL (VOIDmode,
9969 gen_rtvec (3, set, use, clob)));
9970 }
9971 else
9972 emit_insn (set);
9973 }
9974
9975 if (dst != operands[0])
9976 emit_move_insn (operands[0], dst);
9977 }
9978
9979 /* Expand a copysign operation. Special case operand 0 being a constant. */
9980
9981 void
9982 ix86_expand_copysign (rtx operands[])
9983 {
9984 enum machine_mode mode, vmode;
9985 rtx dest, op0, op1, mask, nmask;
9986
9987 dest = operands[0];
9988 op0 = operands[1];
9989 op1 = operands[2];
9990
9991 mode = GET_MODE (dest);
9992 vmode = mode == SFmode ? V4SFmode : V2DFmode;
9993
9994 if (GET_CODE (op0) == CONST_DOUBLE)
9995 {
9996 rtvec v;
9997
9998 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
9999 op0 = simplify_unary_operation (ABS, mode, op0, mode);
10000
10001 if (op0 == CONST0_RTX (mode))
10002 op0 = CONST0_RTX (vmode);
10003 else
10004 {
10005 if (mode == SFmode)
10006 v = gen_rtvec (4, op0, CONST0_RTX (SFmode),
10007 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
10008 else
10009 v = gen_rtvec (2, op0, CONST0_RTX (DFmode));
10010 op0 = force_reg (vmode, gen_rtx_CONST_VECTOR (vmode, v));
10011 }
10012
10013 mask = ix86_build_signbit_mask (mode, 0, 0);
10014
10015 if (mode == SFmode)
10016 emit_insn (gen_copysignsf3_const (dest, op0, op1, mask));
10017 else
10018 emit_insn (gen_copysigndf3_const (dest, op0, op1, mask));
10019 }
10020 else
10021 {
10022 nmask = ix86_build_signbit_mask (mode, 0, 1);
10023 mask = ix86_build_signbit_mask (mode, 0, 0);
10024
10025 if (mode == SFmode)
10026 emit_insn (gen_copysignsf3_var (dest, NULL, op0, op1, nmask, mask));
10027 else
10028 emit_insn (gen_copysigndf3_var (dest, NULL, op0, op1, nmask, mask));
10029 }
10030 }
10031
10032 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
10033 be a constant, and so has already been expanded into a vector constant. */
10034
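/* Editorial sketch, not part of the original source: assuming the insn
   pattern ties the destination to operands[2] (the sign source), the
   expansion below computes

	dest = (op1 & sign-bit mask) | |op0|

   where |op0| is the magnitude constant already folded into vector form,
   so only an AND and (for a nonzero magnitude) an IOR are emitted.  */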
10035 void
10036 ix86_split_copysign_const (rtx operands[])
10037 {
10038 enum machine_mode mode, vmode;
10039 rtx dest, op0, op1, mask, x;
10040
10041 dest = operands[0];
10042 op0 = operands[1];
10043 op1 = operands[2];
10044 mask = operands[3];
10045
10046 mode = GET_MODE (dest);
10047 vmode = GET_MODE (mask);
10048
10049 dest = simplify_gen_subreg (vmode, dest, mode, 0);
10050 x = gen_rtx_AND (vmode, dest, mask);
10051 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10052
10053 if (op0 != CONST0_RTX (vmode))
10054 {
10055 x = gen_rtx_IOR (vmode, dest, op0);
10056 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10057 }
10058 }
10059
10060 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
10061 so we have to do two masks. */
10062
10063 void
10064 ix86_split_copysign_var (rtx operands[])
10065 {
10066 enum machine_mode mode, vmode;
10067 rtx dest, scratch, op0, op1, mask, nmask, x;
10068
10069 dest = operands[0];
10070 scratch = operands[1];
10071 op0 = operands[2];
10072 op1 = operands[3];
10073 nmask = operands[4];
10074 mask = operands[5];
10075
10076 mode = GET_MODE (dest);
10077 vmode = GET_MODE (mask);
10078
10079 if (rtx_equal_p (op0, op1))
10080 {
10081 /* Shouldn't happen often (it's useless, obviously), but when it does
10082 we'd generate incorrect code if we continue below. */
10083 emit_move_insn (dest, op0);
10084 return;
10085 }
10086
10087 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
10088 {
10089 gcc_assert (REGNO (op1) == REGNO (scratch));
10090
10091 x = gen_rtx_AND (vmode, scratch, mask);
10092 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10093
10094 dest = mask;
10095 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10096 x = gen_rtx_NOT (vmode, dest);
10097 x = gen_rtx_AND (vmode, x, op0);
10098 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10099 }
10100 else
10101 {
10102 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
10103 {
10104 x = gen_rtx_AND (vmode, scratch, mask);
10105 }
10106 else /* alternative 2,4 */
10107 {
10108 gcc_assert (REGNO (mask) == REGNO (scratch));
10109 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
10110 x = gen_rtx_AND (vmode, scratch, op1);
10111 }
10112 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10113
10114 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
10115 {
10116 dest = simplify_gen_subreg (vmode, op0, mode, 0);
10117 x = gen_rtx_AND (vmode, dest, nmask);
10118 }
10119 else /* alternative 3,4 */
10120 {
10121 gcc_assert (REGNO (nmask) == REGNO (dest));
10122 dest = nmask;
10123 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10124 x = gen_rtx_AND (vmode, dest, op0);
10125 }
10126 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10127 }
10128
10129 x = gen_rtx_IOR (vmode, dest, scratch);
10130 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10131 }
10132
10133 /* Return TRUE or FALSE depending on whether the first SET in INSN
10134 has source and destination with matching CC modes, and that the
10135 CC mode is at least as constrained as REQ_MODE. */
10136
10137 int
10138 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
10139 {
10140 rtx set;
10141 enum machine_mode set_mode;
10142
10143 set = PATTERN (insn);
10144 if (GET_CODE (set) == PARALLEL)
10145 set = XVECEXP (set, 0, 0);
10146 gcc_assert (GET_CODE (set) == SET);
10147 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
10148
10149 set_mode = GET_MODE (SET_DEST (set));
10150 switch (set_mode)
10151 {
10152 case CCNOmode:
10153 if (req_mode != CCNOmode
10154 && (req_mode != CCmode
10155 || XEXP (SET_SRC (set), 1) != const0_rtx))
10156 return 0;
10157 break;
10158 case CCmode:
10159 if (req_mode == CCGCmode)
10160 return 0;
10161 /* FALLTHRU */
10162 case CCGCmode:
10163 if (req_mode == CCGOCmode || req_mode == CCNOmode)
10164 return 0;
10165 /* FALLTHRU */
10166 case CCGOCmode:
10167 if (req_mode == CCZmode)
10168 return 0;
10169 /* FALLTHRU */
10170 case CCZmode:
10171 break;
10172
10173 default:
10174 gcc_unreachable ();
10175 }
10176
10177 return (GET_MODE (SET_SRC (set)) == set_mode);
10178 }
10179
10180 /* Generate insn patterns to do an integer compare of OPERANDS. */
10181
10182 static rtx
10183 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
10184 {
10185 enum machine_mode cmpmode;
10186 rtx tmp, flags;
10187
10188 cmpmode = SELECT_CC_MODE (code, op0, op1);
10189 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
10190
10191 /* This is very simple, but making the interface the same as in the
10192 FP case makes the rest of the code easier. */
10193 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
10194 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
10195
10196 /* Return the test that should be put into the flags user, i.e.
10197 the bcc, scc, or cmov instruction. */
10198 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
10199 }
10200
10201 /* Figure out whether to use ordered or unordered fp comparisons.
10202 Return the appropriate mode to use. */
10203
10204 enum machine_mode
10205 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
10206 {
10207 /* ??? In order to make all comparisons reversible, we do all comparisons
10208 non-trapping when compiling for IEEE. Once gcc is able to distinguish
10209      all forms of trapping and nontrapping comparisons, we can make inequality
10210 comparisons trapping again, since it results in better code when using
10211 FCOM based compares. */
10212 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
10213 }
10214
10215 enum machine_mode
10216 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
10217 {
10218 if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
10219 return ix86_fp_compare_mode (code);
10220 switch (code)
10221 {
10222 /* Only zero flag is needed. */
10223 case EQ: /* ZF=0 */
10224 case NE: /* ZF!=0 */
10225 return CCZmode;
10226 /* Codes needing carry flag. */
10227 case GEU: /* CF=0 */
10228 case GTU: /* CF=0 & ZF=0 */
10229 case LTU: /* CF=1 */
10230 case LEU: /* CF=1 | ZF=1 */
10231 return CCmode;
10232 /* Codes possibly doable only with sign flag when
10233 comparing against zero. */
10234 case GE: /* SF=OF or SF=0 */
10235 case LT: /* SF<>OF or SF=1 */
10236 if (op1 == const0_rtx)
10237 return CCGOCmode;
10238 else
10239 /* For other cases Carry flag is not required. */
10240 return CCGCmode;
10241 /* Codes doable only with sign flag when comparing
10242 against zero, but we miss jump instruction for it
10243 so we need to use relational tests against overflow
10244 that thus needs to be zero. */
10245 case GT: /* ZF=0 & SF=OF */
10246 case LE: /* ZF=1 | SF<>OF */
10247 if (op1 == const0_rtx)
10248 return CCNOmode;
10249 else
10250 return CCGCmode;
10251       /* The strcmp pattern does (use flags), and combine may ask us for the
10252 	 proper mode.  */
10253 case USE:
10254 return CCmode;
10255 default:
10256 gcc_unreachable ();
10257 }
10258 }
10259
10260 /* Return the fixed registers used for condition codes. */
10261
10262 static bool
10263 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10264 {
10265 *p1 = FLAGS_REG;
10266 *p2 = FPSR_REG;
10267 return true;
10268 }
10269
10270 /* If two condition code modes are compatible, return a condition code
10271 mode which is compatible with both. Otherwise, return
10272 VOIDmode. */
10273
10274 static enum machine_mode
10275 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
10276 {
10277 if (m1 == m2)
10278 return m1;
10279
10280 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
10281 return VOIDmode;
10282
10283 if ((m1 == CCGCmode && m2 == CCGOCmode)
10284 || (m1 == CCGOCmode && m2 == CCGCmode))
10285 return CCGCmode;
10286
10287 switch (m1)
10288 {
10289 default:
10290 gcc_unreachable ();
10291
10292 case CCmode:
10293 case CCGCmode:
10294 case CCGOCmode:
10295 case CCNOmode:
10296 case CCZmode:
10297 switch (m2)
10298 {
10299 default:
10300 return VOIDmode;
10301
10302 case CCmode:
10303 case CCGCmode:
10304 case CCGOCmode:
10305 case CCNOmode:
10306 case CCZmode:
10307 return CCmode;
10308 }
10309
10310 case CCFPmode:
10311 case CCFPUmode:
10312 /* These are only compatible with themselves, which we already
10313 checked above. */
10314 return VOIDmode;
10315 }
10316 }
10317
10318 /* Return true if we should use an FCOMI instruction for this fp comparison. */
10319
10320 int
10321 ix86_use_fcomi_compare (enum rtx_code code ATTRIBUTE_UNUSED)
10322 {
10323 enum rtx_code swapped_code = swap_condition (code);
10324 return ((ix86_fp_comparison_cost (code) == ix86_fp_comparison_fcomi_cost (code))
10325 || (ix86_fp_comparison_cost (swapped_code)
10326 == ix86_fp_comparison_fcomi_cost (swapped_code)));
10327 }
10328
10329 /* Swap, force into registers, or otherwise massage the two operands
10330 to a fp comparison. The operands are updated in place; the new
10331 comparison code is returned. */
10332
10333 static enum rtx_code
10334 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
10335 {
10336 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
10337 rtx op0 = *pop0, op1 = *pop1;
10338 enum machine_mode op_mode = GET_MODE (op0);
10339 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
10340
10341 /* All of the unordered compare instructions only work on registers.
10342 The same is true of the fcomi compare instructions. The XFmode
10343 compare instructions require registers except when comparing
10344 against zero or when converting operand 1 from fixed point to
10345 floating point. */
10346
10347 if (!is_sse
10348 && (fpcmp_mode == CCFPUmode
10349 || (op_mode == XFmode
10350 && ! (standard_80387_constant_p (op0) == 1
10351 || standard_80387_constant_p (op1) == 1)
10352 && GET_CODE (op1) != FLOAT)
10353 || ix86_use_fcomi_compare (code)))
10354 {
10355 op0 = force_reg (op_mode, op0);
10356 op1 = force_reg (op_mode, op1);
10357 }
10358 else
10359 {
10360 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
10361 things around if they appear profitable, otherwise force op0
10362 into a register. */
10363
10364 if (standard_80387_constant_p (op0) == 0
10365 || (MEM_P (op0)
10366 && ! (standard_80387_constant_p (op1) == 0
10367 || MEM_P (op1))))
10368 {
10369 rtx tmp;
10370 tmp = op0, op0 = op1, op1 = tmp;
10371 code = swap_condition (code);
10372 }
10373
10374 if (!REG_P (op0))
10375 op0 = force_reg (op_mode, op0);
10376
10377 if (CONSTANT_P (op1))
10378 {
10379 int tmp = standard_80387_constant_p (op1);
10380 if (tmp == 0)
10381 op1 = validize_mem (force_const_mem (op_mode, op1));
10382 else if (tmp == 1)
10383 {
10384 if (TARGET_CMOVE)
10385 op1 = force_reg (op_mode, op1);
10386 }
10387 else
10388 op1 = force_reg (op_mode, op1);
10389 }
10390 }
10391
10392 /* Try to rearrange the comparison to make it cheaper. */
10393 if (ix86_fp_comparison_cost (code)
10394 > ix86_fp_comparison_cost (swap_condition (code))
10395 && (REG_P (op1) || !no_new_pseudos))
10396 {
10397 rtx tmp;
10398 tmp = op0, op0 = op1, op1 = tmp;
10399 code = swap_condition (code);
10400 if (!REG_P (op0))
10401 op0 = force_reg (op_mode, op0);
10402 }
10403
10404 *pop0 = op0;
10405 *pop1 = op1;
10406 return code;
10407 }
10408
10409 /* Convert a comparison code we use to represent an FP comparison to the
10410    integer code that will result in a proper branch.  Return UNKNOWN if no
10411    such code is available.  */
10412
10413 enum rtx_code
10414 ix86_fp_compare_code_to_integer (enum rtx_code code)
10415 {
10416 switch (code)
10417 {
10418 case GT:
10419 return GTU;
10420 case GE:
10421 return GEU;
10422 case ORDERED:
10423 case UNORDERED:
10424 return code;
10425 break;
10426 case UNEQ:
10427 return EQ;
10428 break;
10429 case UNLT:
10430 return LTU;
10431 break;
10432 case UNLE:
10433 return LEU;
10434 break;
10435 case LTGT:
10436 return NE;
10437 break;
10438 default:
10439 return UNKNOWN;
10440 }
10441 }
10442
10443 /* Split comparison code CODE into comparisons we can do using branch
10444    instructions.  BYPASS_CODE is the comparison code for a branch that will
10445    branch around FIRST_CODE and SECOND_CODE.  If one of the branches is
10446    not required, its value is set to UNKNOWN.
10447 We never require more than two branches. */
10448
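/* Editorial illustration, not part of the original source: with
   TARGET_IEEE_FP, splitting EQ yields *first_code = UNEQ and
   *bypass_code = UNORDERED, i.e. branch around the equality test when
   the operands are unordered and otherwise branch on ZF via UNEQ.  */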
10449 void
10450 ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *bypass_code,
10451 enum rtx_code *first_code,
10452 enum rtx_code *second_code)
10453 {
10454 *first_code = code;
10455 *bypass_code = UNKNOWN;
10456 *second_code = UNKNOWN;
10457
10458 /* The fcomi comparison sets flags as follows:
10459
10460 cmp ZF PF CF
10461 > 0 0 0
10462 < 0 0 1
10463 = 1 0 0
10464 un 1 1 1 */
10465
10466 switch (code)
10467 {
10468 case GT: /* GTU - CF=0 & ZF=0 */
10469 case GE: /* GEU - CF=0 */
10470 case ORDERED: /* PF=0 */
10471 case UNORDERED: /* PF=1 */
10472 case UNEQ: /* EQ - ZF=1 */
10473 case UNLT: /* LTU - CF=1 */
10474 case UNLE: /* LEU - CF=1 | ZF=1 */
10475 case LTGT: /* EQ - ZF=0 */
10476 break;
10477 case LT: /* LTU - CF=1 - fails on unordered */
10478 *first_code = UNLT;
10479 *bypass_code = UNORDERED;
10480 break;
10481 case LE: /* LEU - CF=1 | ZF=1 - fails on unordered */
10482 *first_code = UNLE;
10483 *bypass_code = UNORDERED;
10484 break;
10485 case EQ: /* EQ - ZF=1 - fails on unordered */
10486 *first_code = UNEQ;
10487 *bypass_code = UNORDERED;
10488 break;
10489 case NE: /* NE - ZF=0 - fails on unordered */
10490 *first_code = LTGT;
10491 *second_code = UNORDERED;
10492 break;
10493 case UNGE: /* GEU - CF=0 - fails on unordered */
10494 *first_code = GE;
10495 *second_code = UNORDERED;
10496 break;
10497 case UNGT: /* GTU - CF=0 & ZF=0 - fails on unordered */
10498 *first_code = GT;
10499 *second_code = UNORDERED;
10500 break;
10501 default:
10502 gcc_unreachable ();
10503 }
10504 if (!TARGET_IEEE_FP)
10505 {
10506 *second_code = UNKNOWN;
10507 *bypass_code = UNKNOWN;
10508 }
10509 }
10510
10511 /* Return the cost of a comparison done with fcom + arithmetic operations on AX.
10512    All the following functions use the number of instructions as a cost metric.
10513    In the future this should be tweaked to compute bytes for optimize_size and
10514    take into account the performance of various instructions on various CPUs.  */
10515 static int
10516 ix86_fp_comparison_arithmetics_cost (enum rtx_code code)
10517 {
10518 if (!TARGET_IEEE_FP)
10519 return 4;
10520 /* The cost of code output by ix86_expand_fp_compare. */
10521 switch (code)
10522 {
10523 case UNLE:
10524 case UNLT:
10525 case LTGT:
10526 case GT:
10527 case GE:
10528 case UNORDERED:
10529 case ORDERED:
10530 case UNEQ:
10531 return 4;
10532 break;
10533 case LT:
10534 case NE:
10535 case EQ:
10536 case UNGE:
10537 return 5;
10538 break;
10539 case LE:
10540 case UNGT:
10541 return 6;
10542 break;
10543 default:
10544 gcc_unreachable ();
10545 }
10546 }
10547
10548 /* Return cost of comparison done using fcomi operation.
10549 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10550 static int
10551 ix86_fp_comparison_fcomi_cost (enum rtx_code code)
10552 {
10553 enum rtx_code bypass_code, first_code, second_code;
10554 /* Return arbitrarily high cost when instruction is not supported - this
10555 prevents gcc from using it. */
10556 if (!TARGET_CMOVE)
10557 return 1024;
10558 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10559 return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 2;
10560 }
10561
10562 /* Return cost of comparison done using sahf operation.
10563 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10564 static int
10565 ix86_fp_comparison_sahf_cost (enum rtx_code code)
10566 {
10567 enum rtx_code bypass_code, first_code, second_code;
10568 /* Return arbitrarily high cost when instruction is not preferred - this
10569    prevents gcc from using it.  */
10570 if (!TARGET_USE_SAHF && !optimize_size)
10571 return 1024;
10572 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10573 return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 3;
10574 }
10575
10576 /* Compute cost of the comparison done using any method.
10577 See ix86_fp_comparison_arithmetics_cost for the metrics. */
10578 static int
10579 ix86_fp_comparison_cost (enum rtx_code code)
10580 {
10581 int fcomi_cost, sahf_cost, arithmetics_cost = 1024;
10582 int min;
10583
10584 fcomi_cost = ix86_fp_comparison_fcomi_cost (code);
10585 sahf_cost = ix86_fp_comparison_sahf_cost (code);
10586
10587 min = arithmetics_cost = ix86_fp_comparison_arithmetics_cost (code);
10588 if (min > sahf_cost)
10589 min = sahf_cost;
10590 if (min > fcomi_cost)
10591 min = fcomi_cost;
10592 return min;
10593 }
10594
10595 /* Generate insn patterns to do a floating point compare of OPERANDS. */
10596
10597 static rtx
10598 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch,
10599 rtx *second_test, rtx *bypass_test)
10600 {
10601 enum machine_mode fpcmp_mode, intcmp_mode;
10602 rtx tmp, tmp2;
10603 int cost = ix86_fp_comparison_cost (code);
10604 enum rtx_code bypass_code, first_code, second_code;
10605
10606 fpcmp_mode = ix86_fp_compare_mode (code);
10607 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
10608
10609 if (second_test)
10610 *second_test = NULL_RTX;
10611 if (bypass_test)
10612 *bypass_test = NULL_RTX;
10613
10614 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10615
10616 /* Do fcomi/sahf based test when profitable. */
10617 if ((bypass_code == UNKNOWN || bypass_test)
10618 && (second_code == UNKNOWN || second_test)
10619 && ix86_fp_comparison_arithmetics_cost (code) > cost)
10620 {
10621 if (TARGET_CMOVE)
10622 {
10623 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10624 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
10625 tmp);
10626 emit_insn (tmp);
10627 }
10628 else
10629 {
10630 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10631 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
10632 if (!scratch)
10633 scratch = gen_reg_rtx (HImode);
10634 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
10635 emit_insn (gen_x86_sahf_1 (scratch));
10636 }
10637
10638 /* The FP codes work out to act like unsigned. */
10639 intcmp_mode = fpcmp_mode;
10640 code = first_code;
10641 if (bypass_code != UNKNOWN)
10642 *bypass_test = gen_rtx_fmt_ee (bypass_code, VOIDmode,
10643 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10644 const0_rtx);
10645 if (second_code != UNKNOWN)
10646 *second_test = gen_rtx_fmt_ee (second_code, VOIDmode,
10647 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10648 const0_rtx);
10649 }
10650 else
10651 {
10652 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
10653 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
10654 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
10655 if (!scratch)
10656 scratch = gen_reg_rtx (HImode);
10657 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
10658
10659 /* In the unordered case, we have to check C2 for NaN's, which
10660 doesn't happen to work out to anything nice combination-wise.
10661 So do some bit twiddling on the value we've got in AH to come
10662 up with an appropriate set of condition codes. */
10663
10664 intcmp_mode = CCNOmode;
10665 switch (code)
10666 {
10667 case GT:
10668 case UNGT:
10669 if (code == GT || !TARGET_IEEE_FP)
10670 {
10671 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
10672 code = EQ;
10673 }
10674 else
10675 {
10676 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10677 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
10678 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
10679 intcmp_mode = CCmode;
10680 code = GEU;
10681 }
10682 break;
10683 case LT:
10684 case UNLT:
10685 if (code == LT && TARGET_IEEE_FP)
10686 {
10687 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10688 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x01)));
10689 intcmp_mode = CCmode;
10690 code = EQ;
10691 }
10692 else
10693 {
10694 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x01)));
10695 code = NE;
10696 }
10697 break;
10698 case GE:
10699 case UNGE:
10700 if (code == GE || !TARGET_IEEE_FP)
10701 {
10702 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
10703 code = EQ;
10704 }
10705 else
10706 {
10707 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10708 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
10709 GEN_INT (0x01)));
10710 code = NE;
10711 }
10712 break;
10713 case LE:
10714 case UNLE:
10715 if (code == LE && TARGET_IEEE_FP)
10716 {
10717 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10718 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
10719 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
10720 intcmp_mode = CCmode;
10721 code = LTU;
10722 }
10723 else
10724 {
10725 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
10726 code = NE;
10727 }
10728 break;
10729 case EQ:
10730 case UNEQ:
10731 if (code == EQ && TARGET_IEEE_FP)
10732 {
10733 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10734 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
10735 intcmp_mode = CCmode;
10736 code = EQ;
10737 }
10738 else
10739 {
10740 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
10741 code = NE;
10742 break;
10743 }
10744 break;
10745 case NE:
10746 case LTGT:
10747 if (code == NE && TARGET_IEEE_FP)
10748 {
10749 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
10750 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
10751 GEN_INT (0x40)));
10752 code = NE;
10753 }
10754 else
10755 {
10756 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
10757 code = EQ;
10758 }
10759 break;
10760
10761 case UNORDERED:
10762 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
10763 code = NE;
10764 break;
10765 case ORDERED:
10766 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
10767 code = EQ;
10768 break;
10769
10770 default:
10771 gcc_unreachable ();
10772 }
10773 }
10774
10775 /* Return the test that should be put into the flags user, i.e.
10776 the bcc, scc, or cmov instruction. */
10777 return gen_rtx_fmt_ee (code, VOIDmode,
10778 gen_rtx_REG (intcmp_mode, FLAGS_REG),
10779 const0_rtx);
10780 }
10781
10782 rtx
10783 ix86_expand_compare (enum rtx_code code, rtx *second_test, rtx *bypass_test)
10784 {
10785 rtx op0, op1, ret;
10786 op0 = ix86_compare_op0;
10787 op1 = ix86_compare_op1;
10788
10789 if (second_test)
10790 *second_test = NULL_RTX;
10791 if (bypass_test)
10792 *bypass_test = NULL_RTX;
10793
10794 if (ix86_compare_emitted)
10795 {
10796 ret = gen_rtx_fmt_ee (code, VOIDmode, ix86_compare_emitted, const0_rtx);
10797 ix86_compare_emitted = NULL_RTX;
10798 }
10799 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
10800 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
10801 second_test, bypass_test);
10802 else
10803 ret = ix86_expand_int_compare (code, op0, op1);
10804
10805 return ret;
10806 }
10807
10808 /* Return true if the CODE will result in nontrivial jump sequence. */
10809 bool
10810 ix86_fp_jump_nontrivial_p (enum rtx_code code)
10811 {
10812 enum rtx_code bypass_code, first_code, second_code;
10813 if (!TARGET_CMOVE)
10814 return true;
10815 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10816 return bypass_code != UNKNOWN || second_code != UNKNOWN;
10817 }
10818
10819 void
10820 ix86_expand_branch (enum rtx_code code, rtx label)
10821 {
10822 rtx tmp;
10823
10824 /* If we have emitted a compare insn, go straight to simple.
10825 ix86_expand_compare won't emit anything if ix86_compare_emitted
10826 is non-NULL. */
10827 if (ix86_compare_emitted)
10828 goto simple;
10829
10830 switch (GET_MODE (ix86_compare_op0))
10831 {
10832 case QImode:
10833 case HImode:
10834 case SImode:
10835 simple:
10836 tmp = ix86_expand_compare (code, NULL, NULL);
10837 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10838 gen_rtx_LABEL_REF (VOIDmode, label),
10839 pc_rtx);
10840 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
10841 return;
10842
10843 case SFmode:
10844 case DFmode:
10845 case XFmode:
10846 {
10847 rtvec vec;
10848 int use_fcomi;
10849 enum rtx_code bypass_code, first_code, second_code;
10850
10851 code = ix86_prepare_fp_compare_args (code, &ix86_compare_op0,
10852 &ix86_compare_op1);
10853
10854 ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10855
10856 /* Check whether we will use the natural sequence with one jump. If
10857 so, we can expand the jump early. Otherwise delay expansion by
10858 creating a compound insn so as not to confuse the optimizers. */
10859 if (bypass_code == UNKNOWN && second_code == UNKNOWN
10860 && TARGET_CMOVE)
10861 {
10862 ix86_split_fp_branch (code, ix86_compare_op0, ix86_compare_op1,
10863 gen_rtx_LABEL_REF (VOIDmode, label),
10864 pc_rtx, NULL_RTX, NULL_RTX);
10865 }
10866 else
10867 {
10868 tmp = gen_rtx_fmt_ee (code, VOIDmode,
10869 ix86_compare_op0, ix86_compare_op1);
10870 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
10871 gen_rtx_LABEL_REF (VOIDmode, label),
10872 pc_rtx);
10873 tmp = gen_rtx_SET (VOIDmode, pc_rtx, tmp);
10874
10875 use_fcomi = ix86_use_fcomi_compare (code);
10876 vec = rtvec_alloc (3 + !use_fcomi);
10877 RTVEC_ELT (vec, 0) = tmp;
10878 RTVEC_ELT (vec, 1)
10879 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 18));
10880 RTVEC_ELT (vec, 2)
10881 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 17));
10882 if (! use_fcomi)
10883 RTVEC_ELT (vec, 3)
10884 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (HImode));
10885
10886 emit_jump_insn (gen_rtx_PARALLEL (VOIDmode, vec));
10887 }
10888 return;
10889 }
10890
10891 case DImode:
10892 if (TARGET_64BIT)
10893 goto simple;
10894 case TImode:
10895 /* Expand DImode branch into multiple compare+branch. */
10896 {
10897 rtx lo[2], hi[2], label2;
10898 enum rtx_code code1, code2, code3;
10899 enum machine_mode submode;
10900
10901 if (CONSTANT_P (ix86_compare_op0) && ! CONSTANT_P (ix86_compare_op1))
10902 {
10903 tmp = ix86_compare_op0;
10904 ix86_compare_op0 = ix86_compare_op1;
10905 ix86_compare_op1 = tmp;
10906 code = swap_condition (code);
10907 }
10908 if (GET_MODE (ix86_compare_op0) == DImode)
10909 {
10910 split_di (&ix86_compare_op0, 1, lo+0, hi+0);
10911 split_di (&ix86_compare_op1, 1, lo+1, hi+1);
10912 submode = SImode;
10913 }
10914 else
10915 {
10916 split_ti (&ix86_compare_op0, 1, lo+0, hi+0);
10917 split_ti (&ix86_compare_op1, 1, lo+1, hi+1);
10918 submode = DImode;
10919 }
10920
10921 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
10922 avoid two branches. This costs one extra insn, so disable when
10923 optimizing for size. */
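/* The combined value is zero exactly when both halves compare equal, so
the original EQ/NE test becomes a single word-sized compare of the IOR
result against zero, handled by the recursive call below. */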
10924
10925 if ((code == EQ || code == NE)
10926 && (!optimize_size
10927 || hi[1] == const0_rtx || lo[1] == const0_rtx))
10928 {
10929 rtx xor0, xor1;
10930
10931 xor1 = hi[0];
10932 if (hi[1] != const0_rtx)
10933 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
10934 NULL_RTX, 0, OPTAB_WIDEN);
10935
10936 xor0 = lo[0];
10937 if (lo[1] != const0_rtx)
10938 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
10939 NULL_RTX, 0, OPTAB_WIDEN);
10940
10941 tmp = expand_binop (submode, ior_optab, xor1, xor0,
10942 NULL_RTX, 0, OPTAB_WIDEN);
10943
10944 ix86_compare_op0 = tmp;
10945 ix86_compare_op1 = const0_rtx;
10946 ix86_expand_branch (code, label);
10947 return;
10948 }
10949
10950 /* Otherwise, if we are doing less-than or greater-or-equal-than,
10951 op1 is a constant and the low word is zero, then we can just
10952 examine the high word. */
10953
10954 if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx)
10955 switch (code)
10956 {
10957 case LT: case LTU: case GE: case GEU:
10958 ix86_compare_op0 = hi[0];
10959 ix86_compare_op1 = hi[1];
10960 ix86_expand_branch (code, label);
10961 return;
10962 default:
10963 break;
10964 }
10965
10966 /* Otherwise, we need two or three jumps. */
10967
10968 label2 = gen_label_rtx ();
10969
10970 code1 = code;
10971 code2 = swap_condition (code);
10972 code3 = unsigned_condition (code);
10973
10974 switch (code)
10975 {
10976 case LT: case GT: case LTU: case GTU:
10977 break;
10978
10979 case LE: code1 = LT; code2 = GT; break;
10980 case GE: code1 = GT; code2 = LT; break;
10981 case LEU: code1 = LTU; code2 = GTU; break;
10982 case GEU: code1 = GTU; code2 = LTU; break;
10983
10984 case EQ: code1 = UNKNOWN; code2 = NE; break;
10985 case NE: code2 = UNKNOWN; break;
10986
10987 default:
10988 gcc_unreachable ();
10989 }
10990
10991 /*
10992 * a < b =>
10993 * if (hi(a) < hi(b)) goto true;
10994 * if (hi(a) > hi(b)) goto false;
10995 * if (lo(a) < lo(b)) goto true;
10996 * false:
10997 */
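/* For example, a <= b uses code1 = LT, code2 = GT and code3 = LEU:
branch to the target when hi(a) < hi(b), skip to label2 when
hi(a) > hi(b), and otherwise decide on the unsigned comparison of
the low halves. */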
10998
10999 ix86_compare_op0 = hi[0];
11000 ix86_compare_op1 = hi[1];
11001
11002 if (code1 != UNKNOWN)
11003 ix86_expand_branch (code1, label);
11004 if (code2 != UNKNOWN)
11005 ix86_expand_branch (code2, label2);
11006
11007 ix86_compare_op0 = lo[0];
11008 ix86_compare_op1 = lo[1];
11009 ix86_expand_branch (code3, label);
11010
11011 if (code2 != UNKNOWN)
11012 emit_label (label2);
11013 return;
11014 }
11015
11016 default:
11017 gcc_unreachable ();
11018 }
11019 }
11020
11021 /* Split branch based on floating point condition. */
11022 void
11023 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
11024 rtx target1, rtx target2, rtx tmp, rtx pushed)
11025 {
11026 rtx second, bypass;
11027 rtx label = NULL_RTX;
11028 rtx condition;
11029 int bypass_probability = -1, second_probability = -1, probability = -1;
11030 rtx i;
11031
11032 if (target2 != pc_rtx)
11033 {
11034 rtx tmp = target2;
11035 code = reverse_condition_maybe_unordered (code);
11036 target2 = target1;
11037 target1 = tmp;
11038 }
11039
11040 condition = ix86_expand_fp_compare (code, op1, op2,
11041 tmp, &second, &bypass);
11042
11043 /* Remove pushed operand from stack. */
11044 if (pushed)
11045 ix86_free_from_memory (GET_MODE (pushed));
11046
11047 if (split_branch_probability >= 0)
11048 {
11049 /* Distribute the probabilities across the jumps.
11050 Assume that BYPASS and SECOND always test
11051 for UNORDERED.
11052 probability = split_branch_probability;
11053
11054 /* A value of 1 is low enough that the probability need not
11055 be updated. Later we may run some experiments and see
11056 whether unordered values are more frequent in practice.
11057 if (bypass)
11058 bypass_probability = 1;
11059 if (second)
11060 second_probability = 1;
11061 }
11062 if (bypass != NULL_RTX)
11063 {
11064 label = gen_label_rtx ();
11065 i = emit_jump_insn (gen_rtx_SET
11066 (VOIDmode, pc_rtx,
11067 gen_rtx_IF_THEN_ELSE (VOIDmode,
11068 bypass,
11069 gen_rtx_LABEL_REF (VOIDmode,
11070 label),
11071 pc_rtx)));
11072 if (bypass_probability >= 0)
11073 REG_NOTES (i)
11074 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11075 GEN_INT (bypass_probability),
11076 REG_NOTES (i));
11077 }
11078 i = emit_jump_insn (gen_rtx_SET
11079 (VOIDmode, pc_rtx,
11080 gen_rtx_IF_THEN_ELSE (VOIDmode,
11081 condition, target1, target2)));
11082 if (probability >= 0)
11083 REG_NOTES (i)
11084 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11085 GEN_INT (probability),
11086 REG_NOTES (i));
11087 if (second != NULL_RTX)
11088 {
11089 i = emit_jump_insn (gen_rtx_SET
11090 (VOIDmode, pc_rtx,
11091 gen_rtx_IF_THEN_ELSE (VOIDmode, second, target1,
11092 target2)));
11093 if (second_probability >= 0)
11094 REG_NOTES (i)
11095 = gen_rtx_EXPR_LIST (REG_BR_PROB,
11096 GEN_INT (second_probability),
11097 REG_NOTES (i));
11098 }
11099 if (label != NULL_RTX)
11100 emit_label (label);
11101 }
11102
11103 int
11104 ix86_expand_setcc (enum rtx_code code, rtx dest)
11105 {
11106 rtx ret, tmp, tmpreg, equiv;
11107 rtx second_test, bypass_test;
11108
11109 if (GET_MODE (ix86_compare_op0) == (TARGET_64BIT ? TImode : DImode))
11110 return 0; /* FAIL */
11111
11112 gcc_assert (GET_MODE (dest) == QImode);
11113
11114 ret = ix86_expand_compare (code, &second_test, &bypass_test);
11115 PUT_MODE (ret, QImode);
11116
11117 tmp = dest;
11118 tmpreg = dest;
11119
11120 emit_insn (gen_rtx_SET (VOIDmode, tmp, ret));
11121 if (bypass_test || second_test)
11122 {
11123 rtx test = second_test;
11124 int bypass = 0;
11125 rtx tmp2 = gen_reg_rtx (QImode);
11126 if (bypass_test)
11127 {
11128 gcc_assert (!second_test);
11129 test = bypass_test;
11130 bypass = 1;
11131 PUT_CODE (test, reverse_condition_maybe_unordered (GET_CODE (test)));
11132 }
11133 PUT_MODE (test, QImode);
11134 emit_insn (gen_rtx_SET (VOIDmode, tmp2, test));
11135
11136 if (bypass)
11137 emit_insn (gen_andqi3 (tmp, tmpreg, tmp2));
11138 else
11139 emit_insn (gen_iorqi3 (tmp, tmpreg, tmp2));
11140 }
11141
11142 /* Attach a REG_EQUAL note describing the comparison result. */
11143 if (ix86_compare_op0 && ix86_compare_op1)
11144 {
11145 equiv = simplify_gen_relational (code, QImode,
11146 GET_MODE (ix86_compare_op0),
11147 ix86_compare_op0, ix86_compare_op1);
11148 set_unique_reg_note (get_last_insn (), REG_EQUAL, equiv);
11149 }
11150
11151 return 1; /* DONE */
11152 }
11153
11154 /* Expand comparison setting or clearing carry flag. Return true when
11155 successful and set pop for the operation. */
11156 static bool
11157 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
11158 {
11159 enum machine_mode mode =
11160 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
11161
11162 /* Do not handle DImode compares that go through the special path. Also we
11163 can't deal with FP compares yet. It would be possible to add this. */
11164 if (mode == (TARGET_64BIT ? TImode : DImode))
11165 return false;
11166 if (FLOAT_MODE_P (mode))
11167 {
11168 rtx second_test = NULL, bypass_test = NULL;
11169 rtx compare_op, compare_seq;
11170
11171 /* Shortcut: the following common codes never translate into carry flag compares. */
11172 if (code == EQ || code == NE || code == UNEQ || code == LTGT
11173 || code == ORDERED || code == UNORDERED)
11174 return false;
11175
11176 /* These comparisons require the zero flag; swap the operands so they won't. */
11177 if ((code == GT || code == UNLE || code == LE || code == UNGT)
11178 && !TARGET_IEEE_FP)
11179 {
11180 rtx tmp = op0;
11181 op0 = op1;
11182 op1 = tmp;
11183 code = swap_condition (code);
11184 }
11185
11186 /* Try to expand the comparison and verify that we end up with a carry-flag
11187 based comparison. This fails to be true only when we decide to expand the
11188 comparison using arithmetic, which is not a common scenario. */
11189 start_sequence ();
11190 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
11191 &second_test, &bypass_test);
11192 compare_seq = get_insns ();
11193 end_sequence ();
11194
11195 if (second_test || bypass_test)
11196 return false;
11197 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11198 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11199 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
11200 else
11201 code = GET_CODE (compare_op);
11202 if (code != LTU && code != GEU)
11203 return false;
11204 emit_insn (compare_seq);
11205 *pop = compare_op;
11206 return true;
11207 }
11208 if (!INTEGRAL_MODE_P (mode))
11209 return false;
11210 switch (code)
11211 {
11212 case LTU:
11213 case GEU:
11214 break;
11215
11216 /* Convert a==0 into (unsigned)a<1. */
11217 case EQ:
11218 case NE:
11219 if (op1 != const0_rtx)
11220 return false;
11221 op1 = const1_rtx;
11222 code = (code == EQ ? LTU : GEU);
11223 break;
11224
11225 /* Convert a>b into b<a or a>=b+1. */
11226 case GTU:
11227 case LEU:
11228 if (CONST_INT_P (op1))
11229 {
11230 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
11231 /* Bail out on overflow. We could still swap the operands, but that
11232 would force loading the constant into a register. */
11233 if (op1 == const0_rtx
11234 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
11235 return false;
11236 code = (code == GTU ? GEU : LTU);
11237 }
11238 else
11239 {
11240 rtx tmp = op1;
11241 op1 = op0;
11242 op0 = tmp;
11243 code = (code == GTU ? LTU : GEU);
11244 }
11245 break;
11246
11247 /* Convert a>=0 into (unsigned)a<0x80000000. */
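/* For signed A, A < 0 holds exactly when (unsigned) A >= 0x80000000,
so LT maps to GEU and GE to LTU against the sign-bit constant. */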
11248 case LT:
11249 case GE:
11250 if (mode == DImode || op1 != const0_rtx)
11251 return false;
11252 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11253 code = (code == LT ? GEU : LTU);
11254 break;
11255 case LE:
11256 case GT:
11257 if (mode == DImode || op1 != constm1_rtx)
11258 return false;
11259 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11260 code = (code == LE ? GEU : LTU);
11261 break;
11262
11263 default:
11264 return false;
11265 }
11266 /* Swapping the operands may cause a constant to appear as the first operand. */
11267 if (!nonimmediate_operand (op0, VOIDmode))
11268 {
11269 if (no_new_pseudos)
11270 return false;
11271 op0 = force_reg (mode, op0);
11272 }
11273 ix86_compare_op0 = op0;
11274 ix86_compare_op1 = op1;
11275 *pop = ix86_expand_compare (code, NULL, NULL);
11276 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
11277 return true;
11278 }
11279
11280 int
11281 ix86_expand_int_movcc (rtx operands[])
11282 {
11283 enum rtx_code code = GET_CODE (operands[1]), compare_code;
11284 rtx compare_seq, compare_op;
11285 rtx second_test, bypass_test;
11286 enum machine_mode mode = GET_MODE (operands[0]);
11287 bool sign_bit_compare_p = false;
11288
11289 start_sequence ();
11290 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
11291 compare_seq = get_insns ();
11292 end_sequence ();
11293
11294 compare_code = GET_CODE (compare_op);
11295
11296 if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT))
11297 || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE)))
11298 sign_bit_compare_p = true;
11299
11300 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
11301 HImode insns, we'd be swallowed in word prefix ops. */
11302
11303 if ((mode != HImode || TARGET_FAST_PREFIX)
11304 && (mode != (TARGET_64BIT ? TImode : DImode))
11305 && CONST_INT_P (operands[2])
11306 && CONST_INT_P (operands[3]))
11307 {
11308 rtx out = operands[0];
11309 HOST_WIDE_INT ct = INTVAL (operands[2]);
11310 HOST_WIDE_INT cf = INTVAL (operands[3]);
11311 HOST_WIDE_INT diff;
11312
11313 diff = ct - cf;
11314 /* Sign bit compares are better done using shifts than by using
11315 sbb. */
11316 if (sign_bit_compare_p
11317 || ix86_expand_carry_flag_compare (code, ix86_compare_op0,
11318 ix86_compare_op1, &compare_op))
11319 {
11320 /* Detect overlap between destination and compare sources. */
11321 rtx tmp = out;
11322
11323 if (!sign_bit_compare_p)
11324 {
11325 bool fpcmp = false;
11326
11327 compare_code = GET_CODE (compare_op);
11328
11329 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11330 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11331 {
11332 fpcmp = true;
11333 compare_code = ix86_fp_compare_code_to_integer (compare_code);
11334 }
11335
11336 /* To simplify rest of code, restrict to the GEU case. */
11337 if (compare_code == LTU)
11338 {
11339 HOST_WIDE_INT tmp = ct;
11340 ct = cf;
11341 cf = tmp;
11342 compare_code = reverse_condition (compare_code);
11343 code = reverse_condition (code);
11344 }
11345 else
11346 {
11347 if (fpcmp)
11348 PUT_CODE (compare_op,
11349 reverse_condition_maybe_unordered
11350 (GET_CODE (compare_op)));
11351 else
11352 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
11353 }
11354 diff = ct - cf;
11355
11356 if (reg_overlap_mentioned_p (out, ix86_compare_op0)
11357 || reg_overlap_mentioned_p (out, ix86_compare_op1))
11358 tmp = gen_reg_rtx (mode);
11359
11360 if (mode == DImode)
11361 emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp, compare_op));
11362 else
11363 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), compare_op));
11364 }
11365 else
11366 {
11367 if (code == GT || code == GE)
11368 code = reverse_condition (code);
11369 else
11370 {
11371 HOST_WIDE_INT tmp = ct;
11372 ct = cf;
11373 cf = tmp;
11374 diff = ct - cf;
11375 }
11376 tmp = emit_store_flag (tmp, code, ix86_compare_op0,
11377 ix86_compare_op1, VOIDmode, 0, -1);
11378 }
11379
11380 if (diff == 1)
11381 {
11382 /*
11383 * cmpl op0,op1
11384 * sbbl dest,dest
11385 * [addl dest, ct]
11386 *
11387 * Size 5 - 8.
11388 */
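/* The trick relied on here: after the compare, sbb of a register with
itself computes reg - reg - CF, i.e. -1 when the carry is set and 0
when it is clear, which already yields the two constants once the
optional add of ct is applied. */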
11389 if (ct)
11390 tmp = expand_simple_binop (mode, PLUS,
11391 tmp, GEN_INT (ct),
11392 copy_rtx (tmp), 1, OPTAB_DIRECT);
11393 }
11394 else if (cf == -1)
11395 {
11396 /*
11397 * cmpl op0,op1
11398 * sbbl dest,dest
11399 * orl $ct, dest
11400 *
11401 * Size 8.
11402 */
11403 tmp = expand_simple_binop (mode, IOR,
11404 tmp, GEN_INT (ct),
11405 copy_rtx (tmp), 1, OPTAB_DIRECT);
11406 }
11407 else if (diff == -1 && ct)
11408 {
11409 /*
11410 * cmpl op0,op1
11411 * sbbl dest,dest
11412 * notl dest
11413 * [addl dest, cf]
11414 *
11415 * Size 8 - 11.
11416 */
11417 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11418 if (cf)
11419 tmp = expand_simple_binop (mode, PLUS,
11420 copy_rtx (tmp), GEN_INT (cf),
11421 copy_rtx (tmp), 1, OPTAB_DIRECT);
11422 }
11423 else
11424 {
11425 /*
11426 * cmpl op0,op1
11427 * sbbl dest,dest
11428 * [notl dest]
11429 * andl cf - ct, dest
11430 * [addl dest, ct]
11431 *
11432 * Size 8 - 11.
11433 */
11434
11435 if (cf == 0)
11436 {
11437 cf = ct;
11438 ct = 0;
11439 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11440 }
11441
11442 tmp = expand_simple_binop (mode, AND,
11443 copy_rtx (tmp),
11444 gen_int_mode (cf - ct, mode),
11445 copy_rtx (tmp), 1, OPTAB_DIRECT);
11446 if (ct)
11447 tmp = expand_simple_binop (mode, PLUS,
11448 copy_rtx (tmp), GEN_INT (ct),
11449 copy_rtx (tmp), 1, OPTAB_DIRECT);
11450 }
11451
11452 if (!rtx_equal_p (tmp, out))
11453 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
11454
11455 return 1; /* DONE */
11456 }
11457
11458 if (diff < 0)
11459 {
11460 HOST_WIDE_INT tmp;
11461 tmp = ct, ct = cf, cf = tmp;
11462 diff = -diff;
11463 if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0)))
11464 {
11465 /* We may be reversing an unordered compare to a normal compare, which
11466 is not valid in general (we may convert a non-trapping condition
11467 to a trapping one); however, on i386 we currently emit all
11468 comparisons unordered. */
11469 compare_code = reverse_condition_maybe_unordered (compare_code);
11470 code = reverse_condition_maybe_unordered (code);
11471 }
11472 else
11473 {
11474 compare_code = reverse_condition (compare_code);
11475 code = reverse_condition (code);
11476 }
11477 }
11478
11479 compare_code = UNKNOWN;
11480 if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT
11481 && CONST_INT_P (ix86_compare_op1))
11482 {
11483 if (ix86_compare_op1 == const0_rtx
11484 && (code == LT || code == GE))
11485 compare_code = code;
11486 else if (ix86_compare_op1 == constm1_rtx)
11487 {
11488 if (code == LE)
11489 compare_code = LT;
11490 else if (code == GT)
11491 compare_code = GE;
11492 }
11493 }
11494
11495 /* Optimize dest = (op0 < 0) ? -1 : cf. */
11496 if (compare_code != UNKNOWN
11497 && GET_MODE (ix86_compare_op0) == GET_MODE (out)
11498 && (cf == -1 || ct == -1))
11499 {
11500 /* If the lea code below could be used, only optimize
11501 if it results in a 2-insn sequence. */
11502
11503 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
11504 || diff == 3 || diff == 5 || diff == 9)
11505 || (compare_code == LT && ct == -1)
11506 || (compare_code == GE && cf == -1))
11507 {
11508 /*
11509 * notl op1 (if necessary)
11510 * sarl $31, op1
11511 * orl cf, op1
11512 */
11513 if (ct != -1)
11514 {
11515 cf = ct;
11516 ct = -1;
11517 code = reverse_condition (code);
11518 }
11519
11520 out = emit_store_flag (out, code, ix86_compare_op0,
11521 ix86_compare_op1, VOIDmode, 0, -1);
11522
11523 out = expand_simple_binop (mode, IOR,
11524 out, GEN_INT (cf),
11525 out, 1, OPTAB_DIRECT);
11526 if (out != operands[0])
11527 emit_move_insn (operands[0], out);
11528
11529 return 1; /* DONE */
11530 }
11531 }
11532
11533
11534 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
11535 || diff == 3 || diff == 5 || diff == 9)
11536 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
11537 && (mode != DImode
11538 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
11539 {
11540 /*
11541 * xorl dest,dest
11542 * cmpl op1,op2
11543 * setcc dest
11544 * lea cf(dest*(ct-cf)),dest
11545 *
11546 * Size 14.
11547 *
11548 * This also catches the degenerate setcc-only case.
11549 */
11550
11551 rtx tmp;
11552 int nops;
11553
11554 out = emit_store_flag (out, code, ix86_compare_op0,
11555 ix86_compare_op1, VOIDmode, 0, 1);
11556
11557 nops = 0;
11558 /* On x86_64 the lea instruction operates on Pmode, so we need
11559 to get the arithmetic done in the proper mode to match. */
11560 if (diff == 1)
11561 tmp = copy_rtx (out);
11562 else
11563 {
11564 rtx out1;
11565 out1 = copy_rtx (out);
11566 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
11567 nops++;
11568 if (diff & 1)
11569 {
11570 tmp = gen_rtx_PLUS (mode, tmp, out1);
11571 nops++;
11572 }
11573 }
11574 if (cf != 0)
11575 {
11576 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
11577 nops++;
11578 }
11579 if (!rtx_equal_p (tmp, out))
11580 {
11581 if (nops == 1)
11582 out = force_operand (tmp, copy_rtx (out));
11583 else
11584 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
11585 }
11586 if (!rtx_equal_p (out, operands[0]))
11587 emit_move_insn (operands[0], copy_rtx (out));
11588
11589 return 1; /* DONE */
11590 }
11591
11592 /*
11593 * General case: Jumpful:
11594 * xorl dest,dest cmpl op1, op2
11595 * cmpl op1, op2 movl ct, dest
11596 * setcc dest jcc 1f
11597 * decl dest movl cf, dest
11598 * andl (cf-ct),dest 1:
11599 * addl ct,dest
11600 *
11601 * Size 20. Size 14.
11602 *
11603 * This is reasonably steep, but branch mispredict costs are
11604 * high on modern cpus, so consider failing only if optimizing
11605 * for space.
11606 */
11607
11608 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
11609 && BRANCH_COST >= 2)
11610 {
11611 if (cf == 0)
11612 {
11613 cf = ct;
11614 ct = 0;
11615 if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0)))
11616 /* We may be reversing an unordered compare to a normal compare,
11617 which is not valid in general (we may convert a non-trapping
11618 condition to a trapping one); however, on i386 we currently
11619 emit all comparisons unordered. */
11620 code = reverse_condition_maybe_unordered (code);
11621 else
11622 {
11623 code = reverse_condition (code);
11624 if (compare_code != UNKNOWN)
11625 compare_code = reverse_condition (compare_code);
11626 }
11627 }
11628
11629 if (compare_code != UNKNOWN)
11630 {
11631 /* notl op1 (if needed)
11632 sarl $31, op1
11633 andl (cf-ct), op1
11634 addl ct, op1
11635
11636 For x < 0 (resp. x <= -1) there will be no notl,
11637 so if possible swap the constants to get rid of the
11638 complement.
11639 True/false will be -1/0 while code below (store flag
11640 followed by decrement) is 0/-1, so the constants need
11641 to be exchanged once more. */
11642
11643 if (compare_code == GE || !cf)
11644 {
11645 code = reverse_condition (code);
11646 compare_code = LT;
11647 }
11648 else
11649 {
11650 HOST_WIDE_INT tmp = cf;
11651 cf = ct;
11652 ct = tmp;
11653 }
11654
11655 out = emit_store_flag (out, code, ix86_compare_op0,
11656 ix86_compare_op1, VOIDmode, 0, -1);
11657 }
11658 else
11659 {
11660 out = emit_store_flag (out, code, ix86_compare_op0,
11661 ix86_compare_op1, VOIDmode, 0, 1);
11662
11663 out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx,
11664 copy_rtx (out), 1, OPTAB_DIRECT);
11665 }
11666
11667 out = expand_simple_binop (mode, AND, copy_rtx (out),
11668 gen_int_mode (cf - ct, mode),
11669 copy_rtx (out), 1, OPTAB_DIRECT);
11670 if (ct)
11671 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
11672 copy_rtx (out), 1, OPTAB_DIRECT);
11673 if (!rtx_equal_p (out, operands[0]))
11674 emit_move_insn (operands[0], copy_rtx (out));
11675
11676 return 1; /* DONE */
11677 }
11678 }
11679
11680 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
11681 {
11682 /* Try a few things more with specific constants and a variable. */
11683
11684 optab op;
11685 rtx var, orig_out, out, tmp;
11686
11687 if (BRANCH_COST <= 2)
11688 return 0; /* FAIL */
11689
11690 /* If one of the two operands is an interesting constant, load a
11691 constant with the code above and mask the variable in with a logical operation. */
11692
11693 if (CONST_INT_P (operands[2]))
11694 {
11695 var = operands[3];
11696 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
11697 operands[3] = constm1_rtx, op = and_optab;
11698 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
11699 operands[3] = const0_rtx, op = ior_optab;
11700 else
11701 return 0; /* FAIL */
11702 }
11703 else if (CONST_INT_P (operands[3]))
11704 {
11705 var = operands[2];
11706 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
11707 operands[2] = constm1_rtx, op = and_optab;
11708 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
11709 operands[2] = const0_rtx, op = ior_optab;
11710 else
11711 return 0; /* FAIL */
11712 }
11713 else
11714 return 0; /* FAIL */
11715
11716 orig_out = operands[0];
11717 tmp = gen_reg_rtx (mode);
11718 operands[0] = tmp;
11719
11720 /* Recurse to get the constant loaded. */
11721 if (ix86_expand_int_movcc (operands) == 0)
11722 return 0; /* FAIL */
11723
11724 /* Mask in the interesting variable. */
11725 out = expand_binop (mode, op, var, tmp, orig_out, 0,
11726 OPTAB_WIDEN);
11727 if (!rtx_equal_p (out, orig_out))
11728 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
11729
11730 return 1; /* DONE */
11731 }
11732
11733 /*
11734 * For comparison with above,
11735 *
11736 * movl cf,dest
11737 * movl ct,tmp
11738 * cmpl op1,op2
11739 * cmovcc tmp,dest
11740 *
11741 * Size 15.
11742 */
11743
11744 if (! nonimmediate_operand (operands[2], mode))
11745 operands[2] = force_reg (mode, operands[2]);
11746 if (! nonimmediate_operand (operands[3], mode))
11747 operands[3] = force_reg (mode, operands[3]);
11748
11749 if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
11750 {
11751 rtx tmp = gen_reg_rtx (mode);
11752 emit_move_insn (tmp, operands[3]);
11753 operands[3] = tmp;
11754 }
11755 if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
11756 {
11757 rtx tmp = gen_reg_rtx (mode);
11758 emit_move_insn (tmp, operands[2]);
11759 operands[2] = tmp;
11760 }
11761
11762 if (! register_operand (operands[2], VOIDmode)
11763 && (mode == QImode
11764 || ! register_operand (operands[3], VOIDmode)))
11765 operands[2] = force_reg (mode, operands[2]);
11766
11767 if (mode == QImode
11768 && ! register_operand (operands[3], VOIDmode))
11769 operands[3] = force_reg (mode, operands[3]);
11770
11771 emit_insn (compare_seq);
11772 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
11773 gen_rtx_IF_THEN_ELSE (mode,
11774 compare_op, operands[2],
11775 operands[3])));
11776 if (bypass_test)
11777 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
11778 gen_rtx_IF_THEN_ELSE (mode,
11779 bypass_test,
11780 copy_rtx (operands[3]),
11781 copy_rtx (operands[0]))));
11782 if (second_test)
11783 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
11784 gen_rtx_IF_THEN_ELSE (mode,
11785 second_test,
11786 copy_rtx (operands[2]),
11787 copy_rtx (operands[0]))));
11788
11789 return 1; /* DONE */
11790 }
11791
11792 /* Swap, force into registers, or otherwise massage the two operands
11793 to an sse comparison with a mask result. Thus we differ a bit from
11794 ix86_prepare_fp_compare_args which expects to produce a flags result.
11795
11796 The DEST operand exists to help determine whether to commute commutative
11797 operators. The POP0/POP1 operands are updated in place. The new
11798 comparison code is returned, or UNKNOWN if not implementable. */
11799
11800 static enum rtx_code
11801 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
11802 rtx *pop0, rtx *pop1)
11803 {
11804 rtx tmp;
11805
11806 switch (code)
11807 {
11808 case LTGT:
11809 case UNEQ:
11810 /* We have no LTGT as an operator. We could implement it with
11811 NE & ORDERED, but this requires an extra temporary. It's
11812 not clear that it's worth it. */
11813 return UNKNOWN;
11814
11815 case LT:
11816 case LE:
11817 case UNGT:
11818 case UNGE:
11819 /* These are supported directly. */
11820 break;
11821
11822 case EQ:
11823 case NE:
11824 case UNORDERED:
11825 case ORDERED:
11826 /* For commutative operators, try to canonicalize the destination
11827 operand to be first in the comparison - this helps reload to
11828 avoid extra moves. */
11829 if (!dest || !rtx_equal_p (dest, *pop1))
11830 break;
11831 /* FALLTHRU */
11832
11833 case GE:
11834 case GT:
11835 case UNLE:
11836 case UNLT:
11837 /* These are not supported directly. Swap the comparison operands
11838 to transform into something that is supported. */
11839 tmp = *pop0;
11840 *pop0 = *pop1;
11841 *pop1 = tmp;
11842 code = swap_condition (code);
11843 break;
11844
11845 default:
11846 gcc_unreachable ();
11847 }
11848
11849 return code;
11850 }
11851
11852 /* Detect conditional moves that exactly match min/max operational
11853 semantics. Note that this is IEEE safe, as long as we don't
11854 interchange the operands.
11855
11856 Returns FALSE if this conditional move doesn't match a MIN/MAX,
11857 and TRUE if the operation is successful and instructions are emitted. */
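/* LT with the arms equal to the compare operands maps directly onto
min (a < b ? a : b) or max (a < b ? b : a); UNGE is handled by
swapping the arms, since !(a UNGE b) is a < b. */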
11858
11859 static bool
11860 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
11861 rtx cmp_op1, rtx if_true, rtx if_false)
11862 {
11863 enum machine_mode mode;
11864 bool is_min;
11865 rtx tmp;
11866
11867 if (code == LT)
11868 ;
11869 else if (code == UNGE)
11870 {
11871 tmp = if_true;
11872 if_true = if_false;
11873 if_false = tmp;
11874 }
11875 else
11876 return false;
11877
11878 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
11879 is_min = true;
11880 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
11881 is_min = false;
11882 else
11883 return false;
11884
11885 mode = GET_MODE (dest);
11886
11887 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
11888 but MODE may be a vector mode and thus not appropriate. */
11889 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
11890 {
11891 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
11892 rtvec v;
11893
11894 if_true = force_reg (mode, if_true);
11895 v = gen_rtvec (2, if_true, if_false);
11896 tmp = gen_rtx_UNSPEC (mode, v, u);
11897 }
11898 else
11899 {
11900 code = is_min ? SMIN : SMAX;
11901 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
11902 }
11903
11904 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
11905 return true;
11906 }
11907
11908 /* Expand an sse vector comparison. Return the register with the result. */
11909
11910 static rtx
11911 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
11912 rtx op_true, rtx op_false)
11913 {
11914 enum machine_mode mode = GET_MODE (dest);
11915 rtx x;
11916
11917 cmp_op0 = force_reg (mode, cmp_op0);
11918 if (!nonimmediate_operand (cmp_op1, mode))
11919 cmp_op1 = force_reg (mode, cmp_op1);
11920
11921 if (optimize
11922 || reg_overlap_mentioned_p (dest, op_true)
11923 || reg_overlap_mentioned_p (dest, op_false))
11924 dest = gen_reg_rtx (mode);
11925
11926 x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1);
11927 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11928
11929 return dest;
11930 }
11931
11932 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
11933 operations. This is used for both scalar and vector conditional moves. */
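/* CMP is expected to be an all-ones/all-zeros mask per element, as
produced by the SSE compare instructions, so the general form is
dest = (cmp & op_true) | (~cmp & op_false); the special cases below
simply drop the half that is known to be zero. */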
11934
11935 static void
11936 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
11937 {
11938 enum machine_mode mode = GET_MODE (dest);
11939 rtx t2, t3, x;
11940
11941 if (op_false == CONST0_RTX (mode))
11942 {
11943 op_true = force_reg (mode, op_true);
11944 x = gen_rtx_AND (mode, cmp, op_true);
11945 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11946 }
11947 else if (op_true == CONST0_RTX (mode))
11948 {
11949 op_false = force_reg (mode, op_false);
11950 x = gen_rtx_NOT (mode, cmp);
11951 x = gen_rtx_AND (mode, x, op_false);
11952 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11953 }
11954 else
11955 {
11956 op_true = force_reg (mode, op_true);
11957 op_false = force_reg (mode, op_false);
11958
11959 t2 = gen_reg_rtx (mode);
11960 if (optimize)
11961 t3 = gen_reg_rtx (mode);
11962 else
11963 t3 = dest;
11964
11965 x = gen_rtx_AND (mode, op_true, cmp);
11966 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
11967
11968 x = gen_rtx_NOT (mode, cmp);
11969 x = gen_rtx_AND (mode, x, op_false);
11970 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
11971
11972 x = gen_rtx_IOR (mode, t3, t2);
11973 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
11974 }
11975 }
11976
11977 /* Expand a floating-point conditional move. Return true if successful. */
11978
11979 int
11980 ix86_expand_fp_movcc (rtx operands[])
11981 {
11982 enum machine_mode mode = GET_MODE (operands[0]);
11983 enum rtx_code code = GET_CODE (operands[1]);
11984 rtx tmp, compare_op, second_test, bypass_test;
11985
11986 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
11987 {
11988 enum machine_mode cmode;
11989
11990 /* Since we've no cmove for sse registers, don't force bad register
11991 allocation just to gain access to it. Deny movcc when the
11992 comparison mode doesn't match the move mode. */
11993 cmode = GET_MODE (ix86_compare_op0);
11994 if (cmode == VOIDmode)
11995 cmode = GET_MODE (ix86_compare_op1);
11996 if (cmode != mode)
11997 return 0;
11998
11999 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12000 &ix86_compare_op0,
12001 &ix86_compare_op1);
12002 if (code == UNKNOWN)
12003 return 0;
12004
12005 if (ix86_expand_sse_fp_minmax (operands[0], code, ix86_compare_op0,
12006 ix86_compare_op1, operands[2],
12007 operands[3]))
12008 return 1;
12009
12010 tmp = ix86_expand_sse_cmp (operands[0], code, ix86_compare_op0,
12011 ix86_compare_op1, operands[2], operands[3]);
12012 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
12013 return 1;
12014 }
12015
12016 /* The floating point conditional move instructions don't directly
12017 support conditions resulting from a signed integer comparison. */
12018
12019 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
12020
12024 if (!fcmov_comparison_operator (compare_op, VOIDmode))
12025 {
12026 gcc_assert (!second_test && !bypass_test);
12027 tmp = gen_reg_rtx (QImode);
12028 ix86_expand_setcc (code, tmp);
12029 code = NE;
12030 ix86_compare_op0 = tmp;
12031 ix86_compare_op1 = const0_rtx;
12032 compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
12033 }
12034 if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
12035 {
12036 tmp = gen_reg_rtx (mode);
12037 emit_move_insn (tmp, operands[3]);
12038 operands[3] = tmp;
12039 }
12040 if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
12041 {
12042 tmp = gen_reg_rtx (mode);
12043 emit_move_insn (tmp, operands[2]);
12044 operands[2] = tmp;
12045 }
12046
12047 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12048 gen_rtx_IF_THEN_ELSE (mode, compare_op,
12049 operands[2], operands[3])));
12050 if (bypass_test)
12051 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12052 gen_rtx_IF_THEN_ELSE (mode, bypass_test,
12053 operands[3], operands[0])));
12054 if (second_test)
12055 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12056 gen_rtx_IF_THEN_ELSE (mode, second_test,
12057 operands[2], operands[0])));
12058
12059 return 1;
12060 }
12061
12062 /* Expand a floating-point vector conditional move; a vcond operation
12063 rather than a movcc operation. */
12064
12065 bool
12066 ix86_expand_fp_vcond (rtx operands[])
12067 {
12068 enum rtx_code code = GET_CODE (operands[3]);
12069 rtx cmp;
12070
12071 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12072 &operands[4], &operands[5]);
12073 if (code == UNKNOWN)
12074 return false;
12075
12076 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
12077 operands[5], operands[1], operands[2]))
12078 return true;
12079
12080 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
12081 operands[1], operands[2]);
12082 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
12083 return true;
12084 }
12085
12086 /* Expand a signed integral vector conditional move. */
12087
12088 bool
12089 ix86_expand_int_vcond (rtx operands[])
12090 {
12091 enum machine_mode mode = GET_MODE (operands[0]);
12092 enum rtx_code code = GET_CODE (operands[3]);
12093 bool negate = false;
12094 rtx x, cop0, cop1;
12095
12096 cop0 = operands[4];
12097 cop1 = operands[5];
12098
12099 /* Canonicalize the comparison to EQ, GT, GTU. */
12100 switch (code)
12101 {
12102 case EQ:
12103 case GT:
12104 case GTU:
12105 break;
12106
12107 case NE:
12108 case LE:
12109 case LEU:
12110 code = reverse_condition (code);
12111 negate = true;
12112 break;
12113
12114 case GE:
12115 case GEU:
12116 code = reverse_condition (code);
12117 negate = true;
12118 /* FALLTHRU */
12119
12120 case LT:
12121 case LTU:
12122 code = swap_condition (code);
12123 x = cop0, cop0 = cop1, cop1 = x;
12124 break;
12125
12126 default:
12127 gcc_unreachable ();
12128 }
12129
12130 /* Unsigned parallel compare is not supported by the hardware. Play some
12131 tricks to turn this into a signed comparison against 0. */
12132 if (code == GTU)
12133 {
12134 cop0 = force_reg (mode, cop0);
12135
12136 switch (mode)
12137 {
12138 case V4SImode:
12139 {
12140 rtx t1, t2, mask;
12141
12142 /* Perform a parallel modulo subtraction. */
12143 t1 = gen_reg_rtx (mode);
12144 emit_insn (gen_subv4si3 (t1, cop0, cop1));
12145
12146 /* Extract the original sign bit of op0. */
12147 mask = GEN_INT (-0x80000000);
12148 mask = gen_rtx_CONST_VECTOR (mode,
12149 gen_rtvec (4, mask, mask, mask, mask));
12150 mask = force_reg (mode, mask);
12151 t2 = gen_reg_rtx (mode);
12152 emit_insn (gen_andv4si3 (t2, cop0, mask));
12153
12154 /* XOR it back into the result of the subtraction. This results
12155 in the sign bit set iff we saw unsigned underflow. */
12156 x = gen_reg_rtx (mode);
12157 emit_insn (gen_xorv4si3 (x, t1, t2));
12158
12159 code = GT;
12160 }
12161 break;
12162
12163 case V16QImode:
12164 case V8HImode:
12165 /* Perform a parallel unsigned saturating subtraction. */
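/* The saturating difference is nonzero exactly when cop0 > cop1
unsigned, so the GTU test becomes an EQ test against zero with the
sense of the selection inverted via NEGATE. */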
12166 x = gen_reg_rtx (mode);
12167 emit_insn (gen_rtx_SET (VOIDmode, x,
12168 gen_rtx_US_MINUS (mode, cop0, cop1)));
12169
12170 code = EQ;
12171 negate = !negate;
12172 break;
12173
12174 default:
12175 gcc_unreachable ();
12176 }
12177
12178 cop0 = x;
12179 cop1 = CONST0_RTX (mode);
12180 }
12181
12182 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
12183 operands[1+negate], operands[2-negate]);
12184
12185 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
12186 operands[2-negate]);
12187 return true;
12188 }
12189
12190 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
12191 true if we should do zero extension, else sign extension. HIGH_P is
12192 true if we want the N/2 high elements, else the low elements. */
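/* The widening is done by interleaving the source with a fill vector:
all zeros for zero extension, or the mask 0 > operands[1] (all ones
in the negative lanes) for sign extension. */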
12193
12194 void
12195 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
12196 {
12197 enum machine_mode imode = GET_MODE (operands[1]);
12198 rtx (*unpack)(rtx, rtx, rtx);
12199 rtx se, dest;
12200
12201 switch (imode)
12202 {
12203 case V16QImode:
12204 if (high_p)
12205 unpack = gen_vec_interleave_highv16qi;
12206 else
12207 unpack = gen_vec_interleave_lowv16qi;
12208 break;
12209 case V8HImode:
12210 if (high_p)
12211 unpack = gen_vec_interleave_highv8hi;
12212 else
12213 unpack = gen_vec_interleave_lowv8hi;
12214 break;
12215 case V4SImode:
12216 if (high_p)
12217 unpack = gen_vec_interleave_highv4si;
12218 else
12219 unpack = gen_vec_interleave_lowv4si;
12220 break;
12221 default:
12222 gcc_unreachable ();
12223 }
12224
12225 dest = gen_lowpart (imode, operands[0]);
12226
12227 if (unsigned_p)
12228 se = force_reg (imode, CONST0_RTX (imode));
12229 else
12230 se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
12231 operands[1], pc_rtx, pc_rtx);
12232
12233 emit_insn (unpack (dest, operands[1], se));
12234 }
12235
12236 /* Expand conditional increment or decrement using adc/sbb instructions.
12237 The default case using setcc followed by the conditional move can be
12238 done by generic code. */
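/* When the comparison can be carried out as a carry-flag test, the
conditional +/-1 collapses into a single adc or sbb of a 0 or -1
constant, chosen below from the direction of the comparison and the
sign of operands[3]. */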
12239 int
12240 ix86_expand_int_addcc (rtx operands[])
12241 {
12242 enum rtx_code code = GET_CODE (operands[1]);
12243 rtx compare_op;
12244 rtx val = const0_rtx;
12245 bool fpcmp = false;
12246 enum machine_mode mode = GET_MODE (operands[0]);
12247
12248 if (operands[3] != const1_rtx
12249 && operands[3] != constm1_rtx)
12250 return 0;
12251 if (!ix86_expand_carry_flag_compare (code, ix86_compare_op0,
12252 ix86_compare_op1, &compare_op))
12253 return 0;
12254 code = GET_CODE (compare_op);
12255
12256 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
12257 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
12258 {
12259 fpcmp = true;
12260 code = ix86_fp_compare_code_to_integer (code);
12261 }
12262
12263 if (code != LTU)
12264 {
12265 val = constm1_rtx;
12266 if (fpcmp)
12267 PUT_CODE (compare_op,
12268 reverse_condition_maybe_unordered
12269 (GET_CODE (compare_op)));
12270 else
12271 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
12272 }
12273 PUT_MODE (compare_op, mode);
12274
12275 /* Construct either adc or sbb insn. */
12276 if ((code == LTU) == (operands[3] == constm1_rtx))
12277 {
12278 switch (GET_MODE (operands[0]))
12279 {
12280 case QImode:
12281 emit_insn (gen_subqi3_carry (operands[0], operands[2], val, compare_op));
12282 break;
12283 case HImode:
12284 emit_insn (gen_subhi3_carry (operands[0], operands[2], val, compare_op));
12285 break;
12286 case SImode:
12287 emit_insn (gen_subsi3_carry (operands[0], operands[2], val, compare_op));
12288 break;
12289 case DImode:
12290 emit_insn (gen_subdi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12291 break;
12292 default:
12293 gcc_unreachable ();
12294 }
12295 }
12296 else
12297 {
12298 switch (GET_MODE (operands[0]))
12299 {
12300 case QImode:
12301 emit_insn (gen_addqi3_carry (operands[0], operands[2], val, compare_op));
12302 break;
12303 case HImode:
12304 emit_insn (gen_addhi3_carry (operands[0], operands[2], val, compare_op));
12305 break;
12306 case SImode:
12307 emit_insn (gen_addsi3_carry (operands[0], operands[2], val, compare_op));
12308 break;
12309 case DImode:
12310 emit_insn (gen_adddi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12311 break;
12312 default:
12313 gcc_unreachable ();
12314 }
12315 }
12316 return 1; /* DONE */
12317 }
12318
12319
12320 /* Split operands 0 and 1 into SImode parts. Similar to split_di, but
12321 works for floating point parameters and non-offsettable memories.
12322 For pushes, it returns just stack offsets; the values will be saved
12323 in the right order. At most three parts are generated. */
12324
12325 static int
12326 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
12327 {
12328 int size;
12329
12330 if (!TARGET_64BIT)
12331 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
12332 else
12333 size = (GET_MODE_SIZE (mode) + 4) / 8;
12334
12335 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
12336 gcc_assert (size >= 2 && size <= 3);
12337
12338 /* Optimize constant pool reference to immediates. This is used by fp
12339 moves, which force all constants to memory to allow combining. */
12340 if (MEM_P (operand) && MEM_READONLY_P (operand))
12341 {
12342 rtx tmp = maybe_get_pool_constant (operand);
12343 if (tmp)
12344 operand = tmp;
12345 }
12346
12347 if (MEM_P (operand) && !offsettable_memref_p (operand))
12348 {
12349 /* The only non-offsettable memories we handle are pushes. */
12350 int ok = push_operand (operand, VOIDmode);
12351
12352 gcc_assert (ok);
12353
12354 operand = copy_rtx (operand);
12355 PUT_MODE (operand, Pmode);
12356 parts[0] = parts[1] = parts[2] = operand;
12357 return size;
12358 }
12359
12360 if (GET_CODE (operand) == CONST_VECTOR)
12361 {
12362 enum machine_mode imode = int_mode_for_mode (mode);
12363 /* Caution: if we looked through a constant pool memory above,
12364 the operand may actually have a different mode now. That's
12365 ok, since we want to pun this all the way back to an integer. */
12366 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
12367 gcc_assert (operand != NULL);
12368 mode = imode;
12369 }
12370
12371 if (!TARGET_64BIT)
12372 {
12373 if (mode == DImode)
12374 split_di (&operand, 1, &parts[0], &parts[1]);
12375 else
12376 {
12377 if (REG_P (operand))
12378 {
12379 gcc_assert (reload_completed);
12380 parts[0] = gen_rtx_REG (SImode, REGNO (operand) + 0);
12381 parts[1] = gen_rtx_REG (SImode, REGNO (operand) + 1);
12382 if (size == 3)
12383 parts[2] = gen_rtx_REG (SImode, REGNO (operand) + 2);
12384 }
12385 else if (offsettable_memref_p (operand))
12386 {
12387 operand = adjust_address (operand, SImode, 0);
12388 parts[0] = operand;
12389 parts[1] = adjust_address (operand, SImode, 4);
12390 if (size == 3)
12391 parts[2] = adjust_address (operand, SImode, 8);
12392 }
12393 else if (GET_CODE (operand) == CONST_DOUBLE)
12394 {
12395 REAL_VALUE_TYPE r;
12396 long l[4];
12397
12398 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12399 switch (mode)
12400 {
12401 case XFmode:
12402 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
12403 parts[2] = gen_int_mode (l[2], SImode);
12404 break;
12405 case DFmode:
12406 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
12407 break;
12408 default:
12409 gcc_unreachable ();
12410 }
12411 parts[1] = gen_int_mode (l[1], SImode);
12412 parts[0] = gen_int_mode (l[0], SImode);
12413 }
12414 else
12415 gcc_unreachable ();
12416 }
12417 }
12418 else
12419 {
12420 if (mode == TImode)
12421 split_ti (&operand, 1, &parts[0], &parts[1]);
12422 if (mode == XFmode || mode == TFmode)
12423 {
12424 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
12425 if (REG_P (operand))
12426 {
12427 gcc_assert (reload_completed);
12428 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
12429 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
12430 }
12431 else if (offsettable_memref_p (operand))
12432 {
12433 operand = adjust_address (operand, DImode, 0);
12434 parts[0] = operand;
12435 parts[1] = adjust_address (operand, upper_mode, 8);
12436 }
12437 else if (GET_CODE (operand) == CONST_DOUBLE)
12438 {
12439 REAL_VALUE_TYPE r;
12440 long l[4];
12441
12442 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12443 real_to_target (l, &r, mode);
12444
12445 /* Do not use shift by 32 to avoid warning on 32bit systems. */
12446 if (HOST_BITS_PER_WIDE_INT >= 64)
12447 parts[0]
12448 = gen_int_mode
12449 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
12450 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
12451 DImode);
12452 else
12453 parts[0] = immed_double_const (l[0], l[1], DImode);
12454
12455 if (upper_mode == SImode)
12456 parts[1] = gen_int_mode (l[2], SImode);
12457 else if (HOST_BITS_PER_WIDE_INT >= 64)
12458 parts[1]
12459 = gen_int_mode
12460 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
12461 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
12462 DImode);
12463 else
12464 parts[1] = immed_double_const (l[2], l[3], DImode);
12465 }
12466 else
12467 gcc_unreachable ();
12468 }
12469 }
12470
12471 return size;
12472 }
12473
12474 /* Emit insns to perform a move or push of DI, DF, and XF values.
12475 Operands 2-4 contain the input values in the correct order;
12476 operands 5-7 contain the output values. */
12478
12479 void
12480 ix86_split_long_move (rtx operands[])
12481 {
12482 rtx part[2][3];
12483 int nparts;
12484 int push = 0;
12485 int collisions = 0;
12486 enum machine_mode mode = GET_MODE (operands[0]);
12487
12488 /* The DFmode expanders may ask us to move a double.
12489 For a 64-bit target this is a single move. By hiding the fact
12490 here we simplify the i386.md splitters. */
12491 if (GET_MODE_SIZE (GET_MODE (operands[0])) == 8 && TARGET_64BIT)
12492 {
12493 /* Optimize constant pool reference to immediates. This is used by
12494 fp moves, which force all constants to memory to allow combining. */
12495
12496 if (MEM_P (operands[1])
12497 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
12498 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
12499 operands[1] = get_pool_constant (XEXP (operands[1], 0));
12500 if (push_operand (operands[0], VOIDmode))
12501 {
12502 operands[0] = copy_rtx (operands[0]);
12503 PUT_MODE (operands[0], Pmode);
12504 }
12505 else
12506 operands[0] = gen_lowpart (DImode, operands[0]);
12507 operands[1] = gen_lowpart (DImode, operands[1]);
12508 emit_move_insn (operands[0], operands[1]);
12509 return;
12510 }
12511
12512 /* The only non-offsettable memory we handle is a push. */
12513 if (push_operand (operands[0], VOIDmode))
12514 push = 1;
12515 else
12516 gcc_assert (!MEM_P (operands[0])
12517 || offsettable_memref_p (operands[0]));
12518
12519 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
12520 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
12521
12522 /* When emitting a push, take care of source operands on the stack. */
12523 if (push && MEM_P (operands[1])
12524 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
12525 {
12526 if (nparts == 3)
12527 part[1][1] = change_address (part[1][1], GET_MODE (part[1][1]),
12528 XEXP (part[1][2], 0));
12529 part[1][0] = change_address (part[1][0], GET_MODE (part[1][0]),
12530 XEXP (part[1][1], 0));
12531 }
12532
12533 /* We need to do the copy in the right order in case an address register
12534 of the source overlaps the destination. */
12535 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
12536 {
12537 if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))
12538 collisions++;
12539 if (reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
12540 collisions++;
12541 if (nparts == 3
12542 && reg_overlap_mentioned_p (part[0][2], XEXP (part[1][0], 0)))
12543 collisions++;
12544
12545 /* Collision in the middle part can be handled by reordering. */
12546 if (collisions == 1 && nparts == 3
12547 && reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
12548 {
12549 rtx tmp;
12550 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
12551 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
12552 }
12553
12554 /* If there are more collisions, we can't handle them by reordering.
12555 Do an lea to the last part and use only one colliding move. */
12556 else if (collisions > 1)
12557 {
12558 rtx base;
12559
12560 collisions = 1;
12561
12562 base = part[0][nparts - 1];
12563
12564 /* Handle the case when the last part isn't valid for lea.
12565 Happens in 64-bit mode storing the 12-byte XFmode. */
12566 if (GET_MODE (base) != Pmode)
12567 base = gen_rtx_REG (Pmode, REGNO (base));
12568
12569 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
12570 part[1][0] = replace_equiv_address (part[1][0], base);
12571 part[1][1] = replace_equiv_address (part[1][1],
12572 plus_constant (base, UNITS_PER_WORD));
12573 if (nparts == 3)
12574 part[1][2] = replace_equiv_address (part[1][2],
12575 plus_constant (base, 8));
12576 }
12577 }
12578
12579 if (push)
12580 {
12581 if (!TARGET_64BIT)
12582 {
12583 if (nparts == 3)
12584 {
12585 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
12586 emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-4)));
12587 emit_move_insn (part[0][2], part[1][2]);
12588 }
12589 }
12590 else
12591 {
12592 /* In 64-bit mode we don't have a 32-bit push available. If this is a
12593 register, that is OK - we will just use the larger counterpart. We also
12594 retype the memory - this comes from an attempt to avoid the REX prefix
12595 on moving the second half of a TFmode value. */
12596 if (GET_MODE (part[1][1]) == SImode)
12597 {
12598 switch (GET_CODE (part[1][1]))
12599 {
12600 case MEM:
12601 part[1][1] = adjust_address (part[1][1], DImode, 0);
12602 break;
12603
12604 case REG:
12605 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
12606 break;
12607
12608 default:
12609 gcc_unreachable ();
12610 }
12611
12612 if (GET_MODE (part[1][0]) == SImode)
12613 part[1][0] = part[1][1];
12614 }
12615 }
12616 emit_move_insn (part[0][1], part[1][1]);
12617 emit_move_insn (part[0][0], part[1][0]);
12618 return;
12619 }
12620
12621 /* Choose the correct order so as not to overwrite the source before it is copied. */
12622 if ((REG_P (part[0][0])
12623 && REG_P (part[1][1])
12624 && (REGNO (part[0][0]) == REGNO (part[1][1])
12625 || (nparts == 3
12626 && REGNO (part[0][0]) == REGNO (part[1][2]))))
12627 || (collisions > 0
12628 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
12629 {
12630 if (nparts == 3)
12631 {
12632 operands[2] = part[0][2];
12633 operands[3] = part[0][1];
12634 operands[4] = part[0][0];
12635 operands[5] = part[1][2];
12636 operands[6] = part[1][1];
12637 operands[7] = part[1][0];
12638 }
12639 else
12640 {
12641 operands[2] = part[0][1];
12642 operands[3] = part[0][0];
12643 operands[5] = part[1][1];
12644 operands[6] = part[1][0];
12645 }
12646 }
12647 else
12648 {
12649 if (nparts == 3)
12650 {
12651 operands[2] = part[0][0];
12652 operands[3] = part[0][1];
12653 operands[4] = part[0][2];
12654 operands[5] = part[1][0];
12655 operands[6] = part[1][1];
12656 operands[7] = part[1][2];
12657 }
12658 else
12659 {
12660 operands[2] = part[0][0];
12661 operands[3] = part[0][1];
12662 operands[5] = part[1][0];
12663 operands[6] = part[1][1];
12664 }
12665 }
12666
12667 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
12668 if (optimize_size)
12669 {
12670 if (CONST_INT_P (operands[5])
12671 && operands[5] != const0_rtx
12672 && REG_P (operands[2]))
12673 {
12674 if (CONST_INT_P (operands[6])
12675 && INTVAL (operands[6]) == INTVAL (operands[5]))
12676 operands[6] = operands[2];
12677
12678 if (nparts == 3
12679 && CONST_INT_P (operands[7])
12680 && INTVAL (operands[7]) == INTVAL (operands[5]))
12681 operands[7] = operands[2];
12682 }
12683
12684 if (nparts == 3
12685 && CONST_INT_P (operands[6])
12686 && operands[6] != const0_rtx
12687 && REG_P (operands[3])
12688 && CONST_INT_P (operands[7])
12689 && INTVAL (operands[7]) == INTVAL (operands[6]))
12690 operands[7] = operands[3];
12691 }
12692
12693 emit_move_insn (operands[2], operands[5]);
12694 emit_move_insn (operands[3], operands[6]);
12695 if (nparts == 3)
12696 emit_move_insn (operands[4], operands[7]);
12697
12698 return;
12699 }
12700
12701 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
12702 left shift by a constant, either using a single shift or
12703 a sequence of add instructions. */
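/* For example, a left shift by 2 is emitted as two self-adds when two adds
   cost no more than a constant shift and we are not optimizing for size;
   otherwise a single shift instruction is used (a shift by 1 is always a
   single add). */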
12704
12705 static void
12706 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
12707 {
12708 if (count == 1)
12709 {
12710 emit_insn ((mode == DImode
12711 ? gen_addsi3
12712 : gen_adddi3) (operand, operand, operand));
12713 }
12714 else if (!optimize_size
12715 && count * ix86_cost->add <= ix86_cost->shift_const)
12716 {
12717 int i;
12718 for (i=0; i<count; i++)
12719 {
12720 emit_insn ((mode == DImode
12721 ? gen_addsi3
12722 : gen_adddi3) (operand, operand, operand));
12723 }
12724 }
12725 else
12726 emit_insn ((mode == DImode
12727 ? gen_ashlsi3
12728 : gen_ashldi3) (operand, operand, GEN_INT (count)));
12729 }
12730
12731 void
12732 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
12733 {
12734 rtx low[2], high[2];
12735 int count;
12736 const int single_width = mode == DImode ? 32 : 64;
12737
12738 if (CONST_INT_P (operands[2]))
12739 {
12740 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12741 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12742
12743 if (count >= single_width)
12744 {
12745 emit_move_insn (high[0], low[1]);
12746 emit_move_insn (low[0], const0_rtx);
12747
12748 if (count > single_width)
12749 ix86_expand_ashl_const (high[0], count - single_width, mode);
12750 }
12751 else
12752 {
12753 if (!rtx_equal_p (operands[0], operands[1]))
12754 emit_move_insn (operands[0], operands[1]);
12755 emit_insn ((mode == DImode
12756 ? gen_x86_shld_1
12757 : gen_x86_64_shld) (high[0], low[0], GEN_INT (count)));
12758 ix86_expand_ashl_const (low[0], count, mode);
12759 }
12760 return;
12761 }
12762
12763 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12764
12765 if (operands[1] == const1_rtx)
12766 {
12767 /* Assuming we've chosen QImode-capable registers, then 1 << N
12768 can be done with two 32/64-bit shifts, no branches, no cmoves. */
12769 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
12770 {
12771 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
12772
12773 ix86_expand_clear (low[0]);
12774 ix86_expand_clear (high[0]);
12775 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (single_width)));
12776
12777 d = gen_lowpart (QImode, low[0]);
12778 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
12779 s = gen_rtx_EQ (QImode, flags, const0_rtx);
12780 emit_insn (gen_rtx_SET (VOIDmode, d, s));
12781
12782 d = gen_lowpart (QImode, high[0]);
12783 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
12784 s = gen_rtx_NE (QImode, flags, const0_rtx);
12785 emit_insn (gen_rtx_SET (VOIDmode, d, s));
12786 }
12787
12788 /* Otherwise, we can get the same results by manually performing
12789 a bit extract operation on bit 5/6, and then performing the two
12790 shifts. The two methods of getting 0/1 into low/high are exactly
12791 the same size. Avoiding the shift in the bit extract case helps
12792 pentium4 a bit; no one else seems to care much either way. */
12793 else
12794 {
12795 rtx x;
12796
12797 if (TARGET_PARTIAL_REG_STALL && !optimize_size)
12798 x = gen_rtx_ZERO_EXTEND (mode == DImode ? SImode : DImode, operands[2]);
12799 else
12800 x = gen_lowpart (mode == DImode ? SImode : DImode, operands[2]);
12801 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
12802
12803 emit_insn ((mode == DImode
12804 ? gen_lshrsi3
12805 : gen_lshrdi3) (high[0], high[0], GEN_INT (mode == DImode ? 5 : 6)));
12806 emit_insn ((mode == DImode
12807 ? gen_andsi3
12808 : gen_anddi3) (high[0], high[0], GEN_INT (1)));
12809 emit_move_insn (low[0], high[0]);
12810 emit_insn ((mode == DImode
12811 ? gen_xorsi3
12812 : gen_xordi3) (low[0], low[0], GEN_INT (1)));
12813 }
12814
12815 emit_insn ((mode == DImode
12816 ? gen_ashlsi3
12817 : gen_ashldi3) (low[0], low[0], operands[2]));
12818 emit_insn ((mode == DImode
12819 ? gen_ashlsi3
12820 : gen_ashldi3) (high[0], high[0], operands[2]));
12821 return;
12822 }
12823
12824 if (operands[1] == constm1_rtx)
12825 {
12826 /* For -1 << N, we can avoid the shld instruction, because we
12827 know that we're shifting 0...31/63 ones into a -1. */
12828 emit_move_insn (low[0], constm1_rtx);
12829 if (optimize_size)
12830 emit_move_insn (high[0], low[0]);
12831 else
12832 emit_move_insn (high[0], constm1_rtx);
12833 }
12834 else
12835 {
12836 if (!rtx_equal_p (operands[0], operands[1]))
12837 emit_move_insn (operands[0], operands[1]);
12838
12839 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12840 emit_insn ((mode == DImode
12841 ? gen_x86_shld_1
12842 : gen_x86_64_shld) (high[0], low[0], operands[2]));
12843 }
12844
12845 emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2]));
12846
12847 if (TARGET_CMOVE && scratch)
12848 {
12849 ix86_expand_clear (scratch);
12850 emit_insn ((mode == DImode
12851 ? gen_x86_shift_adj_1
12852 : gen_x86_64_shift_adj) (high[0], low[0], operands[2], scratch));
12853 }
12854 else
12855 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
12856 }
12857
12858 void
12859 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
12860 {
12861 rtx low[2], high[2];
12862 int count;
12863 const int single_width = mode == DImode ? 32 : 64;
12864
12865 if (CONST_INT_P (operands[2]))
12866 {
12867 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12868 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12869
12870 if (count == single_width * 2 - 1)
12871 {
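/* A shift count of 63 (DImode) or 127 (TImode) leaves nothing but the sign,
   so both halves of the result are filled with copies of the sign bit of
   the high input word. */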
12872 emit_move_insn (high[0], high[1]);
12873 emit_insn ((mode == DImode
12874 ? gen_ashrsi3
12875 : gen_ashrdi3) (high[0], high[0],
12876 GEN_INT (single_width - 1)));
12877 emit_move_insn (low[0], high[0]);
12878
12879 }
12880 else if (count >= single_width)
12881 {
12882 emit_move_insn (low[0], high[1]);
12883 emit_move_insn (high[0], low[0]);
12884 emit_insn ((mode == DImode
12885 ? gen_ashrsi3
12886 : gen_ashrdi3) (high[0], high[0],
12887 GEN_INT (single_width - 1)));
12888 if (count > single_width)
12889 emit_insn ((mode == DImode
12890 ? gen_ashrsi3
12891 : gen_ashrdi3) (low[0], low[0],
12892 GEN_INT (count - single_width)));
12893 }
12894 else
12895 {
12896 if (!rtx_equal_p (operands[0], operands[1]))
12897 emit_move_insn (operands[0], operands[1]);
12898 emit_insn ((mode == DImode
12899 ? gen_x86_shrd_1
12900 : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
12901 emit_insn ((mode == DImode
12902 ? gen_ashrsi3
12903 : gen_ashrdi3) (high[0], high[0], GEN_INT (count)));
12904 }
12905 }
12906 else
12907 {
12908 if (!rtx_equal_p (operands[0], operands[1]))
12909 emit_move_insn (operands[0], operands[1]);
12910
12911 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12912
12913 emit_insn ((mode == DImode
12914 ? gen_x86_shrd_1
12915 : gen_x86_64_shrd) (low[0], high[0], operands[2]));
12916 emit_insn ((mode == DImode
12917 ? gen_ashrsi3
12918 : gen_ashrdi3) (high[0], high[0], operands[2]));
12919
12920 if (TARGET_CMOVE && scratch)
12921 {
12922 emit_move_insn (scratch, high[0]);
12923 emit_insn ((mode == DImode
12924 ? gen_ashrsi3
12925 : gen_ashrdi3) (scratch, scratch,
12926 GEN_INT (single_width - 1)));
12927 emit_insn ((mode == DImode
12928 ? gen_x86_shift_adj_1
12929 : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
12930 scratch));
12931 }
12932 else
12933 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
12934 }
12935 }
12936
12937 void
12938 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
12939 {
12940 rtx low[2], high[2];
12941 int count;
12942 const int single_width = mode == DImode ? 32 : 64;
12943
12944 if (CONST_INT_P (operands[2]))
12945 {
12946 (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
12947 count = INTVAL (operands[2]) & (single_width * 2 - 1);
12948
12949 if (count >= single_width)
12950 {
12951 emit_move_insn (low[0], high[1]);
12952 ix86_expand_clear (high[0]);
12953
12954 if (count > single_width)
12955 emit_insn ((mode == DImode
12956 ? gen_lshrsi3
12957 : gen_lshrdi3) (low[0], low[0],
12958 GEN_INT (count - single_width)));
12959 }
12960 else
12961 {
12962 if (!rtx_equal_p (operands[0], operands[1]))
12963 emit_move_insn (operands[0], operands[1]);
12964 emit_insn ((mode == DImode
12965 ? gen_x86_shrd_1
12966 : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
12967 emit_insn ((mode == DImode
12968 ? gen_lshrsi3
12969 : gen_lshrdi3) (high[0], high[0], GEN_INT (count)));
12970 }
12971 }
12972 else
12973 {
12974 if (!rtx_equal_p (operands[0], operands[1]))
12975 emit_move_insn (operands[0], operands[1]);
12976
12977 (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
12978
12979 emit_insn ((mode == DImode
12980 ? gen_x86_shrd_1
12981 : gen_x86_64_shrd) (low[0], high[0], operands[2]));
12982 emit_insn ((mode == DImode
12983 ? gen_lshrsi3
12984 : gen_lshrdi3) (high[0], high[0], operands[2]));
12985
12986 /* Heh. By reversing the arguments, we can reuse this pattern. */
12987 if (TARGET_CMOVE && scratch)
12988 {
12989 ix86_expand_clear (scratch);
12990 emit_insn ((mode == DImode
12991 ? gen_x86_shift_adj_1
12992 : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
12993 scratch));
12994 }
12995 else
12996 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
12997 }
12998 }
12999
13000 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
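/* For example, predict_jump (REG_BR_PROB_BASE * 90 / 100) annotates the jump
   emitted just before the call as taken roughly 90% of the time. */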
13001 static void
13002 predict_jump (int prob)
13003 {
13004 rtx insn = get_last_insn ();
13005 gcc_assert (JUMP_P (insn));
13006 REG_NOTES (insn)
13007 = gen_rtx_EXPR_LIST (REG_BR_PROB,
13008 GEN_INT (prob),
13009 REG_NOTES (insn));
13010 }
13011
13012 /* Helper function for the string operations below. Test whether bit VALUE
13013 of VARIABLE is clear (i.e. VARIABLE is aligned to VALUE bytes); if so,
jump to the returned label. */
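/* For example, ix86_expand_aligntest (destptr, 2, false) emits
     tmp = destptr & 2;  if (tmp == 0) goto label;
   so the caller's 2-byte fixup that follows runs only when DESTPTR is
   misaligned by 2, and the returned label is placed after that fixup. */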
13014 static rtx
13015 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
13016 {
13017 rtx label = gen_label_rtx ();
13018 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
13019 if (GET_MODE (variable) == DImode)
13020 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
13021 else
13022 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
13023 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
13024 1, label);
13025 if (epilogue)
13026 predict_jump (REG_BR_PROB_BASE * 50 / 100);
13027 else
13028 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13029 return label;
13030 }
13031
13032 /* Decrease COUNTREG by VALUE. */
13033 static void
13034 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
13035 {
13036 if (GET_MODE (countreg) == DImode)
13037 emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value)));
13038 else
13039 emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value)));
13040 }
13041
13042 /* Zero-extend the possibly SImode EXP to a Pmode register. */
13043 rtx
13044 ix86_zero_extend_to_Pmode (rtx exp)
13045 {
13046 rtx r;
13047 if (GET_MODE (exp) == VOIDmode)
13048 return force_reg (Pmode, exp);
13049 if (GET_MODE (exp) == Pmode)
13050 return copy_to_mode_reg (Pmode, exp);
13051 r = gen_reg_rtx (Pmode);
13052 emit_insn (gen_zero_extendsidi2 (r, exp));
13053 return r;
13054 }
13055
13056 /* Divide COUNTREG by SCALE (a power of two). */
13057 static rtx
13058 scale_counter (rtx countreg, int scale)
13059 {
13060 rtx sc;
13061 rtx piece_size_mask;
13062
13063 if (scale == 1)
13064 return countreg;
13065 if (CONST_INT_P (countreg))
13066 return GEN_INT (INTVAL (countreg) / scale);
13067 gcc_assert (REG_P (countreg));
13068
13069 piece_size_mask = GEN_INT (scale - 1);
13070 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
13071 GEN_INT (exact_log2 (scale)),
13072 NULL, 1, OPTAB_DIRECT);
13073 return sc;
13074 }
13075
13076 /* When SRCPTR is non-NULL, output a simple loop to move the memory
13077 pointed to by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times;
13078 the overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
13079 the equivalent loop to set memory to VALUE (supposed to be in MODE).
13080
13081 The size is rounded down to a whole number of chunks moved at once.
13082 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
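/* Roughly, the emitted code has the shape:

     size = count & ~(piece_size - 1);
     iter = 0;                     (for byte-sized pieces, an early test
                                    skips straight to OUT when SIZE is zero)
   top:
     copy (or store VALUE to) UNROLL chunks of MODE at DEST + iter [, SRC + iter];
     iter += piece_size;
     if (iter < size) goto top;
     DESTPTR += iter;  SRCPTR += iter;
   out:

   with branch probabilities attached via predict_jump. */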
13083
13084
13085 static void
13086 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
13087 rtx destptr, rtx srcptr, rtx value,
13088 rtx count, enum machine_mode mode, int unroll,
13089 int expected_size)
13090 {
13091 rtx out_label, top_label, iter, tmp;
13092 enum machine_mode iter_mode;
13093 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
13094 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
13095 rtx size;
13096 rtx x_addr;
13097 rtx y_addr;
13098 int i;
13099
13100 iter_mode = GET_MODE (count);
13101 if (iter_mode == VOIDmode)
13102 iter_mode = word_mode;
13103
13104 top_label = gen_label_rtx ();
13105 out_label = gen_label_rtx ();
13106 iter = gen_reg_rtx (iter_mode);
13107
13108 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
13109 NULL, 1, OPTAB_DIRECT);
13110 /* Those two should combine. */
13111 if (piece_size == const1_rtx)
13112 {
13113 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
13114 true, out_label);
13115 predict_jump (REG_BR_PROB_BASE * 10 / 100);
13116 }
13117 emit_move_insn (iter, const0_rtx);
13118
13119 emit_label (top_label);
13120
13121 tmp = convert_modes (Pmode, iter_mode, iter, true);
13122 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
13123 destmem = change_address (destmem, mode, x_addr);
13124
13125 if (srcmem)
13126 {
13127 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
13128 srcmem = change_address (srcmem, mode, y_addr);
13129
13130 /* When unrolling for chips that reorder memory reads and writes,
13131 we could save registers by using a single temporary, and using 4
13132 temporaries is overkill in 32-bit mode; the "&& 0" below keeps this path disabled. */
13133 if (!TARGET_64BIT && 0)
13134 {
13135 for (i = 0; i < unroll; i++)
13136 {
13137 if (i)
13138 {
13139 destmem =
13140 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13141 srcmem =
13142 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
13143 }
13144 emit_move_insn (destmem, srcmem);
13145 }
13146 }
13147 else
13148 {
13149 rtx tmpreg[4];
13150 gcc_assert (unroll <= 4);
13151 for (i = 0; i < unroll; i++)
13152 {
13153 tmpreg[i] = gen_reg_rtx (mode);
13154 if (i)
13155 {
13156 srcmem =
13157 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
13158 }
13159 emit_move_insn (tmpreg[i], srcmem);
13160 }
13161 for (i = 0; i < unroll; i++)
13162 {
13163 if (i)
13164 {
13165 destmem =
13166 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13167 }
13168 emit_move_insn (destmem, tmpreg[i]);
13169 }
13170 }
13171 }
13172 else
13173 for (i = 0; i < unroll; i++)
13174 {
13175 if (i)
13176 destmem =
13177 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
13178 emit_move_insn (destmem, value);
13179 }
13180
13181 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
13182 true, OPTAB_LIB_WIDEN);
13183 if (tmp != iter)
13184 emit_move_insn (iter, tmp);
13185
13186 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
13187 true, top_label);
13188 if (expected_size != -1)
13189 {
13190 expected_size /= GET_MODE_SIZE (mode) * unroll;
13191 if (expected_size == 0)
13192 predict_jump (0);
13193 else if (expected_size > REG_BR_PROB_BASE)
13194 predict_jump (REG_BR_PROB_BASE - 1);
13195 else
13196 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
13197 }
13198 else
13199 predict_jump (REG_BR_PROB_BASE * 80 / 100);
13200 iter = ix86_zero_extend_to_Pmode (iter);
13201 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
13202 true, OPTAB_LIB_WIDEN);
13203 if (tmp != destptr)
13204 emit_move_insn (destptr, tmp);
13205 if (srcptr)
13206 {
13207 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
13208 true, OPTAB_LIB_WIDEN);
13209 if (tmp != srcptr)
13210 emit_move_insn (srcptr, tmp);
13211 }
13212 emit_label (out_label);
13213 }
13214
13215 /* Output "rep; mov" instruction.
13216 Arguments have the same meaning as for the previous function. */
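/* For example, when MODE is SImode the byte count is shifted right by 2 to
   get the number of 4-byte chunks, and DESTEXP/SRCEXP roughly describe the
   final pointer values DEST + (countreg << 2) and SRC + (countreg << 2)
   expected by the rep_mov pattern. */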
13217 static void
13218 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
13219 rtx destptr, rtx srcptr,
13220 rtx count,
13221 enum machine_mode mode)
13222 {
13223 rtx destexp;
13224 rtx srcexp;
13225 rtx countreg;
13226
13227 /* If the byte count is known and a multiple of 4, it is shorter to use SImode rep movs. */
13228 if (mode == QImode && CONST_INT_P (count)
13229 && !(INTVAL (count) & 3))
13230 mode = SImode;
13231
13232 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13233 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13234 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
13235 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
13236 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13237 if (mode != QImode)
13238 {
13239 destexp = gen_rtx_ASHIFT (Pmode, countreg,
13240 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13241 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13242 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
13243 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13244 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
13245 }
13246 else
13247 {
13248 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13249 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
13250 }
13251 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
13252 destexp, srcexp));
13253 }
13254
13255 /* Output "rep; stos" instruction.
13256 Arguments have the same meaning as for the previous function. */
13257 static void
13258 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
13259 rtx count,
13260 enum machine_mode mode)
13261 {
13262 rtx destexp;
13263 rtx countreg;
13264
13265 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13266 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13267 value = force_reg (mode, gen_lowpart (mode, value));
13268 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13269 if (mode != QImode)
13270 {
13271 destexp = gen_rtx_ASHIFT (Pmode, countreg,
13272 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13273 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13274 }
13275 else
13276 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13277 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
13278 }
13279
13280 static void
13281 emit_strmov (rtx destmem, rtx srcmem,
13282 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
13283 {
13284 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
13285 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
13286 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13287 }
13288
13289 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
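/* For a known count, the low bits select the tail moves directly; e.g. a
   residual count of 7 with max_size 8 emits a 4-byte, a 2-byte and a 1-byte
   move at offsets 0, 4 and 6 respectively. */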
13290 static void
13291 expand_movmem_epilogue (rtx destmem, rtx srcmem,
13292 rtx destptr, rtx srcptr, rtx count, int max_size)
13293 {
13294 rtx src, dest;
13295 if (CONST_INT_P (count))
13296 {
13297 HOST_WIDE_INT countval = INTVAL (count);
13298 int offset = 0;
13299
13300 if ((countval & 0x10) && max_size > 16)
13301 {
13302 if (TARGET_64BIT)
13303 {
13304 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
13305 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
13306 }
13307 else
13308 gcc_unreachable ();
13309 offset += 16;
13310 }
13311 if ((countval & 0x08) && max_size > 8)
13312 {
13313 if (TARGET_64BIT)
13314 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
13315 else
13316 {
13317 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
13318 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
13319 }
13320 offset += 8;
13321 }
13322 if ((countval & 0x04) && max_size > 4)
13323 {
13324 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
13325 offset += 4;
13326 }
13327 if ((countval & 0x02) && max_size > 2)
13328 {
13329 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
13330 offset += 2;
13331 }
13332 if ((countval & 0x01) && max_size > 1)
13333 {
13334 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
13335 offset += 1;
13336 }
13337 return;
13338 }
13339 if (max_size > 8)
13340 {
13341 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
13342 count, 1, OPTAB_DIRECT);
13343 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
13344 count, QImode, 1, 4);
13345 return;
13346 }
13347
13348 /* When single-instruction stringops are available, we can cheaply increase
13349 dest and src pointers. Otherwise we save code size by maintaining an offset
13350 (zero is readily available from the preceding rep operation) and using x86
13351 addressing modes. */
13352 if (TARGET_SINGLE_STRINGOP)
13353 {
13354 if (max_size > 4)
13355 {
13356 rtx label = ix86_expand_aligntest (count, 4, true);
13357 src = change_address (srcmem, SImode, srcptr);
13358 dest = change_address (destmem, SImode, destptr);
13359 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13360 emit_label (label);
13361 LABEL_NUSES (label) = 1;
13362 }
13363 if (max_size > 2)
13364 {
13365 rtx label = ix86_expand_aligntest (count, 2, true);
13366 src = change_address (srcmem, HImode, srcptr);
13367 dest = change_address (destmem, HImode, destptr);
13368 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13369 emit_label (label);
13370 LABEL_NUSES (label) = 1;
13371 }
13372 if (max_size > 1)
13373 {
13374 rtx label = ix86_expand_aligntest (count, 1, true);
13375 src = change_address (srcmem, QImode, srcptr);
13376 dest = change_address (destmem, QImode, destptr);
13377 emit_insn (gen_strmov (destptr, dest, srcptr, src));
13378 emit_label (label);
13379 LABEL_NUSES (label) = 1;
13380 }
13381 }
13382 else
13383 {
13384 rtx offset = force_reg (Pmode, const0_rtx);
13385 rtx tmp;
13386
13387 if (max_size > 4)
13388 {
13389 rtx label = ix86_expand_aligntest (count, 4, true);
13390 src = change_address (srcmem, SImode, srcptr);
13391 dest = change_address (destmem, SImode, destptr);
13392 emit_move_insn (dest, src);
13393 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
13394 true, OPTAB_LIB_WIDEN);
13395 if (tmp != offset)
13396 emit_move_insn (offset, tmp);
13397 emit_label (label);
13398 LABEL_NUSES (label) = 1;
13399 }
13400 if (max_size > 2)
13401 {
13402 rtx label = ix86_expand_aligntest (count, 2, true);
13403 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
13404 src = change_address (srcmem, HImode, tmp);
13405 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
13406 dest = change_address (destmem, HImode, tmp);
13407 emit_move_insn (dest, src);
13408 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
13409 true, OPTAB_LIB_WIDEN);
13410 if (tmp != offset)
13411 emit_move_insn (offset, tmp);
13412 emit_label (label);
13413 LABEL_NUSES (label) = 1;
13414 }
13415 if (max_size > 1)
13416 {
13417 rtx label = ix86_expand_aligntest (count, 1, true);
13418 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
13419 src = change_address (srcmem, QImode, tmp);
13420 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
13421 dest = change_address (destmem, QImode, tmp);
13422 emit_move_insn (dest, src);
13423 emit_label (label);
13424 LABEL_NUSES (label) = 1;
13425 }
13426 }
13427 }
13428
13429 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
13430 static void
13431 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
13432 rtx count, int max_size)
13433 {
13434 count =
13435 expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
13436 count, 1, OPTAB_DIRECT);
13437 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
13438 gen_lowpart (QImode, value), count, QImode,
13439 1, max_size / 2);
13440 }
13441
13442 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
13443 static void
13444 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
13445 {
13446 rtx dest;
13447
13448 if (CONST_INT_P (count))
13449 {
13450 HOST_WIDE_INT countval = INTVAL (count);
13451 int offset = 0;
13452
13453 if ((countval & 0x10) && max_size > 16)
13454 {
13455 if (TARGET_64BIT)
13456 {
13457 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13458 emit_insn (gen_strset (destptr, dest, value));
13459 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
13460 emit_insn (gen_strset (destptr, dest, value));
13461 }
13462 else
13463 gcc_unreachable ();
13464 offset += 16;
13465 }
13466 if ((countval & 0x08) && max_size > 8)
13467 {
13468 if (TARGET_64BIT)
13469 {
13470 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13471 emit_insn (gen_strset (destptr, dest, value));
13472 }
13473 else
13474 {
13475 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13476 emit_insn (gen_strset (destptr, dest, value));
13477 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
13478 emit_insn (gen_strset (destptr, dest, value));
13479 }
13480 offset += 8;
13481 }
13482 if ((countval & 0x04) && max_size > 4)
13483 {
13484 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13485 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
13486 offset += 4;
13487 }
13488 if ((countval & 0x02) && max_size > 2)
13489 {
13490 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
13491 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
13492 offset += 2;
13493 }
13494 if ((countval & 0x01) && max_size > 1)
13495 {
13496 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
13497 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
13498 offset += 1;
13499 }
13500 return;
13501 }
13502 if (max_size > 32)
13503 {
13504 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
13505 return;
13506 }
13507 if (max_size > 16)
13508 {
13509 rtx label = ix86_expand_aligntest (count, 16, true);
13510 if (TARGET_64BIT)
13511 {
13512 dest = change_address (destmem, DImode, destptr);
13513 emit_insn (gen_strset (destptr, dest, value));
13514 emit_insn (gen_strset (destptr, dest, value));
13515 }
13516 else
13517 {
13518 dest = change_address (destmem, SImode, destptr);
13519 emit_insn (gen_strset (destptr, dest, value));
13520 emit_insn (gen_strset (destptr, dest, value));
13521 emit_insn (gen_strset (destptr, dest, value));
13522 emit_insn (gen_strset (destptr, dest, value));
13523 }
13524 emit_label (label);
13525 LABEL_NUSES (label) = 1;
13526 }
13527 if (max_size > 8)
13528 {
13529 rtx label = ix86_expand_aligntest (count, 8, true);
13530 if (TARGET_64BIT)
13531 {
13532 dest = change_address (destmem, DImode, destptr);
13533 emit_insn (gen_strset (destptr, dest, value));
13534 }
13535 else
13536 {
13537 dest = change_address (destmem, SImode, destptr);
13538 emit_insn (gen_strset (destptr, dest, value));
13539 emit_insn (gen_strset (destptr, dest, value));
13540 }
13541 emit_label (label);
13542 LABEL_NUSES (label) = 1;
13543 }
13544 if (max_size > 4)
13545 {
13546 rtx label = ix86_expand_aligntest (count, 4, true);
13547 dest = change_address (destmem, SImode, destptr);
13548 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
13549 emit_label (label);
13550 LABEL_NUSES (label) = 1;
13551 }
13552 if (max_size > 2)
13553 {
13554 rtx label = ix86_expand_aligntest (count, 2, true);
13555 dest = change_address (destmem, HImode, destptr);
13556 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
13557 emit_label (label);
13558 LABEL_NUSES (label) = 1;
13559 }
13560 if (max_size > 1)
13561 {
13562 rtx label = ix86_expand_aligntest (count, 1, true);
13563 dest = change_address (destmem, QImode, destptr);
13564 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
13565 emit_label (label);
13566 LABEL_NUSES (label) = 1;
13567 }
13568 }
13569
13570 /* Copy enough from SRC to DEST to align DEST, known to be aligned to ALIGN,
13571 up to DESIRED_ALIGNMENT. */
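/* For example, going from ALIGN == 1 to DESIRED_ALIGNMENT == 8 emits three
   conditional fixups: copy 1 byte if bit 0 of DESTPTR is set, 2 bytes if
   bit 1 is set, then 4 bytes if bit 2 is set, decrementing COUNT accordingly. */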
13572 static void
13573 expand_movmem_prologue (rtx destmem, rtx srcmem,
13574 rtx destptr, rtx srcptr, rtx count,
13575 int align, int desired_alignment)
13576 {
13577 if (align <= 1 && desired_alignment > 1)
13578 {
13579 rtx label = ix86_expand_aligntest (destptr, 1, false);
13580 srcmem = change_address (srcmem, QImode, srcptr);
13581 destmem = change_address (destmem, QImode, destptr);
13582 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13583 ix86_adjust_counter (count, 1);
13584 emit_label (label);
13585 LABEL_NUSES (label) = 1;
13586 }
13587 if (align <= 2 && desired_alignment > 2)
13588 {
13589 rtx label = ix86_expand_aligntest (destptr, 2, false);
13590 srcmem = change_address (srcmem, HImode, srcptr);
13591 destmem = change_address (destmem, HImode, destptr);
13592 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13593 ix86_adjust_counter (count, 2);
13594 emit_label (label);
13595 LABEL_NUSES (label) = 1;
13596 }
13597 if (align <= 4 && desired_alignment > 4)
13598 {
13599 rtx label = ix86_expand_aligntest (destptr, 4, false);
13600 srcmem = change_address (srcmem, SImode, srcptr);
13601 destmem = change_address (destmem, SImode, destptr);
13602 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
13603 ix86_adjust_counter (count, 4);
13604 emit_label (label);
13605 LABEL_NUSES (label) = 1;
13606 }
13607 gcc_assert (desired_alignment <= 8);
13608 }
13609
13610 /* Store enough into DEST to align DEST, known to be aligned to ALIGN,
13611 up to DESIRED_ALIGNMENT. */
13612 static void
13613 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
13614 int align, int desired_alignment)
13615 {
13616 if (align <= 1 && desired_alignment > 1)
13617 {
13618 rtx label = ix86_expand_aligntest (destptr, 1, false);
13619 destmem = change_address (destmem, QImode, destptr);
13620 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
13621 ix86_adjust_counter (count, 1);
13622 emit_label (label);
13623 LABEL_NUSES (label) = 1;
13624 }
13625 if (align <= 2 && desired_alignment > 2)
13626 {
13627 rtx label = ix86_expand_aligntest (destptr, 2, false);
13628 destmem = change_address (destmem, HImode, destptr);
13629 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
13630 ix86_adjust_counter (count, 2);
13631 emit_label (label);
13632 LABEL_NUSES (label) = 1;
13633 }
13634 if (align <= 4 && desired_alignment > 4)
13635 {
13636 rtx label = ix86_expand_aligntest (destptr, 4, false);
13637 destmem = change_address (destmem, SImode, destptr);
13638 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
13639 ix86_adjust_counter (count, 4);
13640 emit_label (label);
13641 LABEL_NUSES (label) = 1;
13642 }
13643 gcc_assert (desired_alignment <= 8);
13644 }
13645
13646 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
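/* For instance, with a hypothetical cost table of
     {{256, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}
   an expected size of 1000 falls into the second entry and selects
   rep_prefix_4_byte (unless an explicit strategy override via stringop_alg
   is in effect). */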
13647 static enum stringop_alg
13648 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
13649 int *dynamic_check)
13650 {
13651 const struct stringop_algs * algs;
13652
13653 *dynamic_check = -1;
13654 if (memset)
13655 algs = &ix86_cost->memset[TARGET_64BIT != 0];
13656 else
13657 algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
13658 if (stringop_alg != no_stringop)
13659 return stringop_alg;
13660 /* rep; movq or rep; movl is the smallest variant. */
13661 else if (optimize_size)
13662 {
13663 if (!count || (count & 3))
13664 return rep_prefix_1_byte;
13665 else
13666 return rep_prefix_4_byte;
13667 }
13668 /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
13669 */
13670 else if (expected_size != -1 && expected_size < 4)
13671 return loop_1_byte;
13672 else if (expected_size != -1)
13673 {
13674 unsigned int i;
13675 enum stringop_alg alg = libcall;
13676 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
13677 {
13678 gcc_assert (algs->size[i].max);
13679 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
13680 {
13681 if (algs->size[i].alg != libcall)
13682 alg = algs->size[i].alg;
13683 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
13684 last non-libcall inline algorithm. */
13685 if (TARGET_INLINE_ALL_STRINGOPS)
13686 {
13687 /* When the current size is best copied by a libcall,
13688 but we are still forced to inline, run the heuristic below
13689 that will pick code for medium-sized blocks. */
13690 if (alg != libcall)
13691 return alg;
13692 break;
13693 }
13694 else
13695 return algs->size[i].alg;
13696 }
13697 }
13698 gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
13699 }
13700 /* When asked to inline the call anyway, try to pick a meaningful choice.
13701 We look for the maximal size of block that is faster to copy by hand and
13702 take blocks of at most that size, guessing that the average size will
13703 be roughly half of the block.
13704
13705 If this turns out to be bad, we might simply specify the preferred
13706 choice in ix86_costs. */
13707 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
13708 && algs->unknown_size == libcall)
13709 {
13710 int max = -1;
13711 enum stringop_alg alg;
13712 int i;
13713
13714 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
13715 if (algs->size[i].alg != libcall && algs->size[i].alg)
13716 max = algs->size[i].max;
13717 if (max == -1)
13718 max = 4096;
13719 alg = decide_alg (count, max / 2, memset, dynamic_check);
13720 gcc_assert (*dynamic_check == -1);
13721 gcc_assert (alg != libcall);
13722 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
13723 *dynamic_check = max;
13724 return alg;
13725 }
13726 return algs->unknown_size;
13727 }
13728
13729 /* Decide on alignment. We know that the operand is already aligned to ALIGN
13730 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
13731 static int
13732 decide_alignment (int align,
13733 enum stringop_alg alg,
13734 int expected_size)
13735 {
13736 int desired_align = 0;
13737 switch (alg)
13738 {
13739 case no_stringop:
13740 gcc_unreachable ();
13741 case loop:
13742 case unrolled_loop:
13743 desired_align = GET_MODE_SIZE (Pmode);
13744 break;
13745 case rep_prefix_8_byte:
13746 desired_align = 8;
13747 break;
13748 case rep_prefix_4_byte:
13749 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
13750 copying a whole cache line at once. */
13751 if (TARGET_PENTIUMPRO)
13752 desired_align = 8;
13753 else
13754 desired_align = 4;
13755 break;
13756 case rep_prefix_1_byte:
13757 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
13758 copying a whole cache line at once. */
13759 if (TARGET_PENTIUMPRO)
13760 desired_align = 8;
13761 else
13762 desired_align = 1;
13763 break;
13764 case loop_1_byte:
13765 desired_align = 1;
13766 break;
13767 case libcall:
13768 return 0;
13769 }
13770
13771 if (optimize_size)
13772 desired_align = 1;
13773 if (desired_align < align)
13774 desired_align = align;
13775 if (expected_size != -1 && expected_size < 4)
13776 desired_align = align;
13777 return desired_align;
13778 }
13779
13780 /* Return the smallest power of 2 greater than VAL. */
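/* For example, smallest_pow2_greater_than (7) == 8 and
   smallest_pow2_greater_than (8) == 16; the result is strictly greater. */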
13781 static int
13782 smallest_pow2_greater_than (int val)
13783 {
13784 int ret = 1;
13785 while (ret <= val)
13786 ret <<= 1;
13787 return ret;
13788 }
13789
13790 /* Expand string move (memcpy) operation. Use i386 string operations when
13791 profitable. ix86_expand_setmem contains similar code. The code depends upon
13792 architecture, block size and alignment, but always has the same
13793 overall structure:
13794
13795 1) Prologue guard: Conditional that jumps up to the epilogue for small
13796 blocks that can be handled by the epilogue alone. This is faster but
13797 also needed for correctness, since the prologue assumes the block is larger
13798 than the desired alignment.
13799
13800 An optional dynamic check for size and a libcall for large
13801 blocks is emitted here too, with -minline-stringops-dynamically.
13802
13803 2) Prologue: copy the first few bytes in order to get the destination
13804 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less than
13805 DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be copied.
13806 We emit either a jump tree on power-of-two sized blocks, or a byte loop.
13807
13808 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
13809 with the specified algorithm.
13810
13811 4) Epilogue: code copying the tail of the block that is too small to be
13812 handled by the main body (or up to the size guarded by the prologue guard). */
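/* As a rough sketch (labels illustrative only), for a variable-sized copy the
   emitted code looks approximately like:

     if (count < epilogue_size_needed) goto tail;
     ...alignment fixups on dst/src, decrementing count...
     ...main loop or rep stringop copying size_needed-byte chunks...
     count &= size_needed - 1;
   tail:
     ...copy the remaining count bytes with small moves... */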
13813
13814 int
13815 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
13816 rtx expected_align_exp, rtx expected_size_exp)
13817 {
13818 rtx destreg;
13819 rtx srcreg;
13820 rtx label = NULL;
13821 rtx tmp;
13822 rtx jump_around_label = NULL;
13823 HOST_WIDE_INT align = 1;
13824 unsigned HOST_WIDE_INT count = 0;
13825 HOST_WIDE_INT expected_size = -1;
13826 int size_needed = 0, epilogue_size_needed;
13827 int desired_align = 0;
13828 enum stringop_alg alg;
13829 int dynamic_check;
13830
13831 if (CONST_INT_P (align_exp))
13832 align = INTVAL (align_exp);
13833 /* i386 can do misaligned access at reasonably increased cost. */
13834 if (CONST_INT_P (expected_align_exp)
13835 && INTVAL (expected_align_exp) > align)
13836 align = INTVAL (expected_align_exp);
13837 if (CONST_INT_P (count_exp))
13838 count = expected_size = INTVAL (count_exp);
13839 if (CONST_INT_P (expected_size_exp) && count == 0)
13840 expected_size = INTVAL (expected_size_exp);
13841
13842 /* Step 0: Decide on preferred algorithm, desired alignment and
13843 size of chunks to be copied by main loop. */
13844
13845 alg = decide_alg (count, expected_size, false, &dynamic_check);
13846 desired_align = decide_alignment (align, alg, expected_size);
13847
13848 if (!TARGET_ALIGN_STRINGOPS)
13849 align = desired_align;
13850
13851 if (alg == libcall)
13852 return 0;
13853 gcc_assert (alg != no_stringop);
13854 if (!count)
13855 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
13856 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13857 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
13858 switch (alg)
13859 {
13860 case libcall:
13861 case no_stringop:
13862 gcc_unreachable ();
13863 case loop:
13864 size_needed = GET_MODE_SIZE (Pmode);
13865 break;
13866 case unrolled_loop:
13867 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
13868 break;
13869 case rep_prefix_8_byte:
13870 size_needed = 8;
13871 break;
13872 case rep_prefix_4_byte:
13873 size_needed = 4;
13874 break;
13875 case rep_prefix_1_byte:
13876 case loop_1_byte:
13877 size_needed = 1;
13878 break;
13879 }
13880
13881 epilogue_size_needed = size_needed;
13882
13883 /* Step 1: Prologue guard. */
13884
13885 /* Alignment code needs count to be in register. */
13886 if (CONST_INT_P (count_exp) && desired_align > align)
13887 {
13888 enum machine_mode mode = SImode;
13889 if (TARGET_64BIT && (count & ~0xffffffff))
13890 mode = DImode;
13891 count_exp = force_reg (mode, count_exp);
13892 }
13893 gcc_assert (desired_align >= 1 && align >= 1);
13894
13895 /* Ensure that alignment prologue won't copy past end of block. */
13896 if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
13897 && !count)
13898 {
13899 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
13900
13901 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
13902 Make sure it is power of 2. */
13903 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
13904
13905 label = gen_label_rtx ();
13906 emit_cmp_and_jump_insns (count_exp,
13907 GEN_INT (epilogue_size_needed),
13908 LTU, 0, GET_MODE (count_exp), 1, label);
13909 if (expected_size == -1 || expected_size < epilogue_size_needed)
13910 predict_jump (REG_BR_PROB_BASE * 60 / 100);
13911 else
13912 predict_jump (REG_BR_PROB_BASE * 20 / 100);
13913 }
13914 /* Emit code to decide at runtime whether a library call or inline code should
13915 be used. */
13916 if (dynamic_check != -1)
13917 {
13918 rtx hot_label = gen_label_rtx ();
13919 jump_around_label = gen_label_rtx ();
13920 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
13921 LEU, 0, GET_MODE (count_exp), 1, hot_label);
13922 predict_jump (REG_BR_PROB_BASE * 90 / 100);
13923 emit_block_move_via_libcall (dst, src, count_exp, false);
13924 emit_jump (jump_around_label);
13925 emit_label (hot_label);
13926 }
13927
13928 /* Step 2: Alignment prologue. */
13929
13930 if (desired_align > align)
13931 {
13932 /* Except for the first move in the epilogue, we no longer know
13933 the constant offset in the aliasing info. It does not seem worth
13934 the pain to maintain it for the first move, so throw away
13935 the info early. */
13936 src = change_address (src, BLKmode, srcreg);
13937 dst = change_address (dst, BLKmode, destreg);
13938 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
13939 desired_align);
13940 }
13941 if (label && size_needed == 1)
13942 {
13943 emit_label (label);
13944 LABEL_NUSES (label) = 1;
13945 label = NULL;
13946 }
13947
13948 /* Step 3: Main loop. */
13949
13950 switch (alg)
13951 {
13952 case libcall:
13953 case no_stringop:
13954 gcc_unreachable ();
13955 case loop_1_byte:
13956 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13957 count_exp, QImode, 1, expected_size);
13958 break;
13959 case loop:
13960 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13961 count_exp, Pmode, 1, expected_size);
13962 break;
13963 case unrolled_loop:
13964 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
13965 registers for 4 temporaries anyway. */
13966 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
13967 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
13968 expected_size);
13969 break;
13970 case rep_prefix_8_byte:
13971 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13972 DImode);
13973 break;
13974 case rep_prefix_4_byte:
13975 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13976 SImode);
13977 break;
13978 case rep_prefix_1_byte:
13979 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
13980 QImode);
13981 break;
13982 }
13983 /* Properly adjust the offsets of src and dest memory for aliasing. */
13984 if (CONST_INT_P (count_exp))
13985 {
13986 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
13987 (count / size_needed) * size_needed);
13988 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
13989 (count / size_needed) * size_needed);
13990 }
13991 else
13992 {
13993 src = change_address (src, BLKmode, srcreg);
13994 dst = change_address (dst, BLKmode, destreg);
13995 }
13996
13997 /* Step 4: Epilogue to copy the remaining bytes. */
13998
13999 if (label)
14000 {
14001 /* When the main loop is done, COUNT_EXP might hold original count,
14002 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
14003 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
14004 bytes. Compensate if needed. */
14005
14006 if (size_needed < epilogue_size_needed)
14007 {
14008 tmp =
14009 expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
14010 GEN_INT (size_needed - 1), count_exp, 1,
14011 OPTAB_DIRECT);
14012 if (tmp != count_exp)
14013 emit_move_insn (count_exp, tmp);
14014 }
14015 emit_label (label);
14016 LABEL_NUSES (label) = 1;
14017 }
14018
14019 if (count_exp != const0_rtx && epilogue_size_needed > 1)
14020 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
14021 epilogue_size_needed);
14022 if (jump_around_label)
14023 emit_label (jump_around_label);
14024 return 1;
14025 }
14026
14027 /* Helper function for memset. For a QImode value 0xXY produce
14028 0xXYXYXYXY of the width specified by MODE. This is essentially
14029 a * 0x01010101, but we can do slightly better than
14030 synth_mult by unwinding the sequence by hand on CPUs with
14031 slow multiply. */
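/* Worked example: for the constant byte 0xAB the value is built as
     v = 0xAB;  v |= v << 8;   -> 0xABAB
                v |= v << 16;  -> 0xABABABAB
   and, for DImode, a further 32-bit shift-and-or gives 0xABABABABABABABAB. */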
14032 static rtx
14033 promote_duplicated_reg (enum machine_mode mode, rtx val)
14034 {
14035 enum machine_mode valmode = GET_MODE (val);
14036 rtx tmp;
14037 int nops = mode == DImode ? 3 : 2;
14038
14039 gcc_assert (mode == SImode || mode == DImode);
14040 if (val == const0_rtx)
14041 return copy_to_mode_reg (mode, const0_rtx);
14042 if (CONST_INT_P (val))
14043 {
14044 HOST_WIDE_INT v = INTVAL (val) & 255;
14045
14046 v |= v << 8;
14047 v |= v << 16;
14048 if (mode == DImode)
14049 v |= (v << 16) << 16;
14050 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
14051 }
14052
14053 if (valmode == VOIDmode)
14054 valmode = QImode;
14055 if (valmode != QImode)
14056 val = gen_lowpart (QImode, val);
14057 if (mode == QImode)
14058 return val;
14059 if (!TARGET_PARTIAL_REG_STALL)
14060 nops--;
14061 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
14062 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
14063 <= (ix86_cost->shift_const + ix86_cost->add) * nops
14064 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
14065 {
14066 rtx reg = convert_modes (mode, QImode, val, true);
14067 tmp = promote_duplicated_reg (mode, const1_rtx);
14068 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
14069 OPTAB_DIRECT);
14070 }
14071 else
14072 {
14073 rtx reg = convert_modes (mode, QImode, val, true);
14074
14075 if (!TARGET_PARTIAL_REG_STALL)
14076 if (mode == SImode)
14077 emit_insn (gen_movsi_insv_1 (reg, reg));
14078 else
14079 emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
14080 else
14081 {
14082 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
14083 NULL, 1, OPTAB_DIRECT);
14084 reg =
14085 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14086 }
14087 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
14088 NULL, 1, OPTAB_DIRECT);
14089 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14090 if (mode == SImode)
14091 return reg;
14092 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
14093 NULL, 1, OPTAB_DIRECT);
14094 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14095 return reg;
14096 }
14097 }
14098
14099 /* Duplicate the value VAL using promote_duplicated_reg into the maximal size
14100 that will be needed by the main loop copying SIZE_NEEDED chunks and by the
14101 prologue getting the alignment from ALIGN to DESIRED_ALIGN. */
14102 static rtx
14103 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
14104 {
14105 rtx promoted_val;
14106
14107 if (TARGET_64BIT
14108 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
14109 promoted_val = promote_duplicated_reg (DImode, val);
14110 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
14111 promoted_val = promote_duplicated_reg (SImode, val);
14112 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
14113 promoted_val = promote_duplicated_reg (HImode, val);
14114 else
14115 promoted_val = val;
14116
14117 return promoted_val;
14118 }
14119
14120 /* Expand string set operation (memset). Use i386 string operations when
14121 profitable. See the expand_movmem comment for an explanation of the
14122 individual steps performed. */
14123 int
14124 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
14125 rtx expected_align_exp, rtx expected_size_exp)
14126 {
14127 rtx destreg;
14128 rtx label = NULL;
14129 rtx tmp;
14130 rtx jump_around_label = NULL;
14131 HOST_WIDE_INT align = 1;
14132 unsigned HOST_WIDE_INT count = 0;
14133 HOST_WIDE_INT expected_size = -1;
14134 int size_needed = 0, epilogue_size_needed;
14135 int desired_align = 0;
14136 enum stringop_alg alg;
14137 rtx promoted_val = NULL;
14138 bool force_loopy_epilogue = false;
14139 int dynamic_check;
14140
14141 if (CONST_INT_P (align_exp))
14142 align = INTVAL (align_exp);
14143 /* i386 can do misaligned access at reasonably increased cost. */
14144 if (CONST_INT_P (expected_align_exp)
14145 && INTVAL (expected_align_exp) > align)
14146 align = INTVAL (expected_align_exp);
14147 if (CONST_INT_P (count_exp))
14148 count = expected_size = INTVAL (count_exp);
14149 if (CONST_INT_P (expected_size_exp) && count == 0)
14150 expected_size = INTVAL (expected_size_exp);
14151
14152 /* Step 0: Decide on preferred algorithm, desired alignment and
14153 size of chunks to be copied by main loop. */
14154
14155 alg = decide_alg (count, expected_size, true, &dynamic_check);
14156 desired_align = decide_alignment (align, alg, expected_size);
14157
14158 if (!TARGET_ALIGN_STRINGOPS)
14159 align = desired_align;
14160
14161 if (alg == libcall)
14162 return 0;
14163 gcc_assert (alg != no_stringop);
14164 if (!count)
14165 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
14166 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14167 switch (alg)
14168 {
14169 case libcall:
14170 case no_stringop:
14171 gcc_unreachable ();
14172 case loop:
14173 size_needed = GET_MODE_SIZE (Pmode);
14174 break;
14175 case unrolled_loop:
14176 size_needed = GET_MODE_SIZE (Pmode) * 4;
14177 break;
14178 case rep_prefix_8_byte:
14179 size_needed = 8;
14180 break;
14181 case rep_prefix_4_byte:
14182 size_needed = 4;
14183 break;
14184 case rep_prefix_1_byte:
14185 case loop_1_byte:
14186 size_needed = 1;
14187 break;
14188 }
14189 epilogue_size_needed = size_needed;
14190
14191 /* Step 1: Prologue guard. */
14192
14193 /* Alignment code needs count to be in register. */
14194 if (CONST_INT_P (count_exp) && desired_align > align)
14195 {
14196 enum machine_mode mode = SImode;
14197 if (TARGET_64BIT && (count & ~0xffffffff))
14198 mode = DImode;
14199 count_exp = force_reg (mode, count_exp);
14200 }
14201 /* Do the cheap promotion to allow better CSE across the
14202 main loop and epilogue (i.e. one load of the big constant in
14203 front of all code). */
14204 if (CONST_INT_P (val_exp))
14205 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14206 desired_align, align);
14207 /* Ensure that alignment prologue won't copy past end of block. */
14208 if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
14209 && !count)
14210 {
14211 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
14212
14213 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
14214 Make sure it is power of 2. */
14215 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
14216
14217 /* To improve performance of small blocks, we jump around the VAL
14218 promoting code. This means that if the promoted VAL is not constant,
14219 we might not use it in the epilogue and have to use the byte
14220 loop variant. */
14221 if (epilogue_size_needed > 2 && !promoted_val)
14222 force_loopy_epilogue = true;
14223 label = gen_label_rtx ();
14224 emit_cmp_and_jump_insns (count_exp,
14225 GEN_INT (epilogue_size_needed),
14226 LTU, 0, GET_MODE (count_exp), 1, label);
14227 if (expected_size == -1 || expected_size <= epilogue_size_needed)
14228 predict_jump (REG_BR_PROB_BASE * 60 / 100);
14229 else
14230 predict_jump (REG_BR_PROB_BASE * 20 / 100);
14231 }
14232 if (dynamic_check != -1)
14233 {
14234 rtx hot_label = gen_label_rtx ();
14235 jump_around_label = gen_label_rtx ();
14236 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
14237 LEU, 0, GET_MODE (count_exp), 1, hot_label);
14238 predict_jump (REG_BR_PROB_BASE * 90 / 100);
14239 set_storage_via_libcall (dst, count_exp, val_exp, false);
14240 emit_jump (jump_around_label);
14241 emit_label (hot_label);
14242 }
14243
14244 /* Step 2: Alignment prologue. */
14245
14246 /* Do the expensive promotion once we branched off the small blocks. */
14247 if (!promoted_val)
14248 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14249 desired_align, align);
14250 gcc_assert (desired_align >= 1 && align >= 1);
14251
14252 if (desired_align > align)
14253 {
14254 /* Except for the first move in the epilogue, we no longer know
14255 the constant offset in the aliasing info. It does not seem worth
14256 the pain to maintain it for the first move, so throw away
14257 the info early. */
14258 dst = change_address (dst, BLKmode, destreg);
14259 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
14260 desired_align);
14261 }
14262 if (label && size_needed == 1)
14263 {
14264 emit_label (label);
14265 LABEL_NUSES (label) = 1;
14266 label = NULL;
14267 }
14268
14269 /* Step 3: Main loop. */
14270
14271 switch (alg)
14272 {
14273 case libcall:
14274 case no_stringop:
14275 gcc_unreachable ();
14276 case loop_1_byte:
14277 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14278 count_exp, QImode, 1, expected_size);
14279 break;
14280 case loop:
14281 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14282 count_exp, Pmode, 1, expected_size);
14283 break;
14284 case unrolled_loop:
14285 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14286 count_exp, Pmode, 4, expected_size);
14287 break;
14288 case rep_prefix_8_byte:
14289 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14290 DImode);
14291 break;
14292 case rep_prefix_4_byte:
14293 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14294 SImode);
14295 break;
14296 case rep_prefix_1_byte:
14297 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14298 QImode);
14299 break;
14300 }
14301 /* Properly adjust the offset of the destination memory for aliasing. */
14302 if (CONST_INT_P (count_exp))
14303 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
14304 (count / size_needed) * size_needed);
14305 else
14306 dst = change_address (dst, BLKmode, destreg);
14307
14308 /* Step 4: Epilogue to copy the remaining bytes. */
14309
14310 if (label)
14311 {
14312 /* When the main loop is done, COUNT_EXP might hold original count,
14313 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
14314 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
14315 bytes. Compensate if needed. */
14316
14317 if (size_needed < desired_align - align)
14318 {
14319 tmp =
14320 expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
14321 GEN_INT (size_needed - 1), count_exp, 1,
14322 OPTAB_DIRECT);
14323 size_needed = desired_align - align + 1;
14324 if (tmp != count_exp)
14325 emit_move_insn (count_exp, tmp);
14326 }
14327 emit_label (label);
14328 LABEL_NUSES (label) = 1;
14329 }
14330 if (count_exp != const0_rtx && epilogue_size_needed > 1)
14331 {
14332 if (force_loopy_epilogue)
14333 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
14334 size_needed);
14335 else
14336 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
14337 size_needed);
14338 }
14339 if (jump_around_label)
14340 emit_label (jump_around_label);
14341 return 1;
14342 }
14343
14344 /* Expand strlen. */
14345 int
14346 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
14347 {
14348 rtx addr, scratch1, scratch2, scratch3, scratch4;
14349
14350 /* The generic case of the strlen expander is long. Avoid expanding
14351 it unless TARGET_INLINE_ALL_STRINGOPS. */
14352
14353 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
14354 && !TARGET_INLINE_ALL_STRINGOPS
14355 && !optimize_size
14356 && (!CONST_INT_P (align) || INTVAL (align) < 4))
14357 return 0;
14358
14359 addr = force_reg (Pmode, XEXP (src, 0));
14360 scratch1 = gen_reg_rtx (Pmode);
14361
14362 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
14363 && !optimize_size)
14364 {
14365 /* Well it seems that some optimizer does not combine a call like
14366 foo(strlen(bar), strlen(bar));
14367 when the move and the subtraction are done here. It does calculate
14368 the length just once when these instructions are done inside of
14369 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
14370 often used and I use one fewer register for the lifetime of
14371 output_strlen_unroll() this is better. */
14372
14373 emit_move_insn (out, addr);
14374
14375 ix86_expand_strlensi_unroll_1 (out, src, align);
14376
14377 /* strlensi_unroll_1 returns the address of the zero at the end of
14378 the string, like memchr(), so compute the length by subtracting
14379 the start address. */
14380 if (TARGET_64BIT)
14381 emit_insn (gen_subdi3 (out, out, addr));
14382 else
14383 emit_insn (gen_subsi3 (out, out, addr));
14384 }
14385 else
14386 {
14387 rtx unspec;
14388 scratch2 = gen_reg_rtx (Pmode);
14389 scratch3 = gen_reg_rtx (Pmode);
14390 scratch4 = force_reg (Pmode, constm1_rtx);
14391
14392 emit_move_insn (scratch3, addr);
14393 eoschar = force_reg (QImode, eoschar);
14394
14395 src = replace_equiv_address_nv (src, scratch3);
14396
14397 /* If .md starts supporting :P, this can be done in .md. */
14398 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
14399 scratch4), UNSPEC_SCAS);
14400 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
14401 if (TARGET_64BIT)
14402 {
14403 emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
14404 emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
14405 }
14406 else
14407 {
14408 emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
14409 emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
14410 }
14411 }
14412 return 1;
14413 }
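/* Illustrative sketch, added for exposition and not part of the original
   file: the "repnz scasb" path above recovers the length from the count
   register.  Assuming ECX starts at -1 and the scan consumes LEN + 1 bytes
   (including the terminating zero), ECX ends at -(LEN + 2), so
   ~ECX - 1 == LEN, which is what the one_cmpl/add pair computes.  */
#if 0
static unsigned int
strlen_from_scas_count (unsigned int ecx_after_scan)
{
  /* Example: for a 5-character string, ecx_after_scan == (unsigned int) -7,
     ~ecx_after_scan == 6, and the result is 5.  */
  return ~ecx_after_scan - 1;
}
#endif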
14414
14415 /* Expand the appropriate insns for doing strlen if not just doing
14416 repnz; scasb
14417
14418 out = result, initialized with the start address
14419 align_rtx = alignment of the address.
14420    scratch = scratch register, initialized with the start address when
14421 not aligned, otherwise undefined
14422
14423 This is just the body. It needs the initializations mentioned above and
14424 some address computing at the end. These things are done in i386.md. */
14425
14426 static void
14427 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
14428 {
14429 int align;
14430 rtx tmp;
14431 rtx align_2_label = NULL_RTX;
14432 rtx align_3_label = NULL_RTX;
14433 rtx align_4_label = gen_label_rtx ();
14434 rtx end_0_label = gen_label_rtx ();
14435 rtx mem;
14436 rtx tmpreg = gen_reg_rtx (SImode);
14437 rtx scratch = gen_reg_rtx (SImode);
14438 rtx cmp;
14439
14440 align = 0;
14441 if (CONST_INT_P (align_rtx))
14442 align = INTVAL (align_rtx);
14443
14444 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
14445
14446 /* Is there a known alignment and is it less than 4? */
14447 if (align < 4)
14448 {
14449 rtx scratch1 = gen_reg_rtx (Pmode);
14450 emit_move_insn (scratch1, out);
14451 /* Is there a known alignment and is it not 2? */
14452 if (align != 2)
14453 {
14454 	  align_3_label = gen_label_rtx (); /* Label when (address & 3) == 3 */
14455 	  align_2_label = gen_label_rtx (); /* Label when (address & 3) == 2 */
14456
14457 /* Leave just the 3 lower bits. */
14458 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
14459 NULL_RTX, 0, OPTAB_WIDEN);
14460
14461 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14462 Pmode, 1, align_4_label);
14463 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
14464 Pmode, 1, align_2_label);
14465 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
14466 Pmode, 1, align_3_label);
14467 }
14468 else
14469 {
14470 /* Since the alignment is 2, we have to check 2 or 0 bytes;
14471 	     check whether the address is 4-byte aligned.  */
14472
14473 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
14474 NULL_RTX, 0, OPTAB_WIDEN);
14475
14476 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14477 Pmode, 1, align_4_label);
14478 }
14479
14480 mem = change_address (src, QImode, out);
14481
14482 /* Now compare the bytes. */
14483
14484     /* Compare the first 1..3 unaligned bytes one byte at a time.  */
14485 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
14486 QImode, 1, end_0_label);
14487
14488 /* Increment the address. */
14489 if (TARGET_64BIT)
14490 emit_insn (gen_adddi3 (out, out, const1_rtx));
14491 else
14492 emit_insn (gen_addsi3 (out, out, const1_rtx));
14493
14494     /* Not needed with an alignment of 2.  */
14495 if (align != 2)
14496 {
14497 emit_label (align_2_label);
14498
14499 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14500 end_0_label);
14501
14502 if (TARGET_64BIT)
14503 emit_insn (gen_adddi3 (out, out, const1_rtx));
14504 else
14505 emit_insn (gen_addsi3 (out, out, const1_rtx));
14506
14507 emit_label (align_3_label);
14508 }
14509
14510 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14511 end_0_label);
14512
14513 if (TARGET_64BIT)
14514 emit_insn (gen_adddi3 (out, out, const1_rtx));
14515 else
14516 emit_insn (gen_addsi3 (out, out, const1_rtx));
14517 }
14518
14519   /* Generate a loop to check 4 bytes at a time.  It is not a good idea
14520      to align this loop: doing so only enlarges the program without
14521      speeding it up.  */
14522 emit_label (align_4_label);
14523
14524 mem = change_address (src, SImode, out);
14525 emit_move_insn (scratch, mem);
14526 if (TARGET_64BIT)
14527 emit_insn (gen_adddi3 (out, out, GEN_INT (4)));
14528 else
14529 emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
14530
14531 /* This formula yields a nonzero result iff one of the bytes is zero.
14532      This saves three branches inside the loop and many cycles.  */
14533
14534 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
14535 emit_insn (gen_one_cmplsi2 (scratch, scratch));
14536 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
14537 emit_insn (gen_andsi3 (tmpreg, tmpreg,
14538 gen_int_mode (0x80808080, SImode)));
14539 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
14540 align_4_label);
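  /* Added worked example (illustration only, not in the original): with
     scratch == 0x11003344 the sequence above computes
     (0x11003344 - 0x01010101) & ~0x11003344 & 0x80808080 == 0x00800000,
     flagging the zero byte, while a value with no zero byte such as
     0x11223344 yields 0 and the loop continues.  */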
14541
14542 if (TARGET_CMOVE)
14543 {
14544 rtx reg = gen_reg_rtx (SImode);
14545 rtx reg2 = gen_reg_rtx (Pmode);
14546 emit_move_insn (reg, tmpreg);
14547 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
14548
14549 /* If zero is not in the first two bytes, move two bytes forward. */
14550 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
14551 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14552 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14553 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
14554 gen_rtx_IF_THEN_ELSE (SImode, tmp,
14555 reg,
14556 tmpreg)));
14557 /* Emit lea manually to avoid clobbering of flags. */
14558 emit_insn (gen_rtx_SET (SImode, reg2,
14559 gen_rtx_PLUS (Pmode, out, const2_rtx)));
14560
14561 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14562 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14563 emit_insn (gen_rtx_SET (VOIDmode, out,
14564 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
14565 reg2,
14566 out)));
14567
14568 }
14569 else
14570 {
14571 rtx end_2_label = gen_label_rtx ();
14572 /* Is zero in the first two bytes? */
14573
14574 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
14575 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14576 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
14577 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
14578 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
14579 pc_rtx);
14580 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
14581 JUMP_LABEL (tmp) = end_2_label;
14582
14583 /* Not in the first two. Move two bytes forward. */
14584 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
14585 if (TARGET_64BIT)
14586 emit_insn (gen_adddi3 (out, out, const2_rtx));
14587 else
14588 emit_insn (gen_addsi3 (out, out, const2_rtx));
14589
14590 emit_label (end_2_label);
14591
14592 }
14593
14594   /* Avoid a branch when fixing up the byte.  */
14595 tmpreg = gen_lowpart (QImode, tmpreg);
14596 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
14597 cmp = gen_rtx_LTU (Pmode, gen_rtx_REG (CCmode, 17), const0_rtx);
14598 if (TARGET_64BIT)
14599 emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3), cmp));
14600 else
14601 emit_insn (gen_subsi3_carry (out, out, GEN_INT (3), cmp));
14602
14603 emit_label (end_0_label);
14604 }
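/* A rough plain-C sketch of the strategy the expander above emits, added
   for illustration only (assumptions: little-endian, 32-bit words; this is
   not the exact branch structure the expander generates).  Like the emitted
   body, it returns the address of the terminating zero; the caller
   subtracts the start address to obtain the length.  */
#if 0
static const char *
strlen_unrolled_sketch (const char *s)
{
  const unsigned int *p;

  /* Prologue: scan at most 3 bytes until the pointer is 4-byte aligned.  */
  while (((unsigned long) s & 3) != 0)
    {
      if (*s == 0)
	return s;
      s++;
    }

  /* Main loop: test 4 bytes at a time with the zero-byte formula above.  */
  p = (const unsigned int *) s;
  while ((((*p - 0x01010101) & ~*p) & 0x80808080) == 0)
    p++;

  /* Epilogue: locate the exact zero byte within the last word.  */
  s = (const char *) p;
  while (*s != 0)
    s++;
  return s;
}
#endif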
14605
14606 void
14607 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
14608 rtx callarg2 ATTRIBUTE_UNUSED,
14609 rtx pop, int sibcall)
14610 {
14611 rtx use = NULL, call;
14612
14613 if (pop == const0_rtx)
14614 pop = NULL;
14615 gcc_assert (!TARGET_64BIT || !pop);
14616
14617 if (TARGET_MACHO && !TARGET_64BIT)
14618 {
14619 #if TARGET_MACHO
14620 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
14621 fnaddr = machopic_indirect_call_target (fnaddr);
14622 #endif
14623 }
14624 else
14625 {
14626 /* Static functions and indirect calls don't need the pic register. */
14627 if (! TARGET_64BIT && flag_pic
14628 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
14629 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
14630 use_reg (&use, pic_offset_table_rtx);
14631 }
14632
14633 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
14634 {
14635 rtx al = gen_rtx_REG (QImode, 0);
14636 emit_move_insn (al, callarg2);
14637 use_reg (&use, al);
14638 }
14639
14640 if (! call_insn_operand (XEXP (fnaddr, 0), Pmode))
14641 {
14642 fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
14643 fnaddr = gen_rtx_MEM (QImode, fnaddr);
14644 }
14645 if (sibcall && TARGET_64BIT
14646 && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode))
14647 {
14648 rtx addr;
14649 addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
14650 fnaddr = gen_rtx_REG (Pmode, R11_REG);
14651 emit_move_insn (fnaddr, addr);
14652 fnaddr = gen_rtx_MEM (QImode, fnaddr);
14653 }
14654
14655 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
14656 if (retval)
14657 call = gen_rtx_SET (VOIDmode, retval, call);
14658 if (pop)
14659 {
14660 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
14661 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
14662 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
14663 }
14664
14665 call = emit_call_insn (call);
14666 if (use)
14667 CALL_INSN_FUNCTION_USAGE (call) = use;
14668 }
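/* Added note (illustration, not in the original): for a 32-bit callee-pop
   call such as a stdcall function taking 8 bytes of arguments, POP is
   GEN_INT (8), so the call insn emitted above is a PARALLEL that also sets
   sp = sp + 8, modelling the callee's "ret $8".  */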
14669
14670 \f
14671 /* Clear stack slot assignments remembered from previous functions.
14672 This is called from INIT_EXPANDERS once before RTL is emitted for each
14673 function. */
14674
14675 static struct machine_function *
14676 ix86_init_machine_status (void)
14677 {
14678 struct machine_function *f;
14679
14680 f = ggc_alloc_cleared (sizeof (struct machine_function));
14681 f->use_fast_prologue_epilogue_nregs = -1;
14682 f->tls_descriptor_call_expanded_p = 0;
14683
14684 return f;
14685 }
14686
14687 /* Return a MEM corresponding to a stack slot with mode MODE.
14688 Allocate a new slot if necessary.
14689
14690 The RTL for a function can have several slots available: N is
14691 which slot to use. */
14692
14693 rtx
14694 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
14695 {
14696 struct stack_local_entry *s;
14697
14698 gcc_assert (n < MAX_386_STACK_LOCALS);
14699
14700 for (s = ix86_stack_locals; s; s = s->next)
14701 if (s->mode == mode && s->n == n)
14702 return copy_rtx (s->rtl);
14703
14704 s = (struct stack_local_entry *)
14705 ggc_alloc (sizeof (struct stack_local_entry));
14706 s->n = n;
14707 s->mode = mode;
14708 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
14709
14710 s->next = ix86_stack_locals;
14711 ix86_stack_locals = s;
14712 return s->rtl;
14713 }
14714
14715 /* Construct the SYMBOL_REF for the tls_get_addr function. */
14716
14717 static GTY(()) rtx ix86_tls_symbol;
14718 rtx
14719 ix86_tls_get_addr (void)
14720 {
14721
14722 if (!ix86_tls_symbol)
14723 {
14724 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode,
14725 (TARGET_ANY_GNU_TLS
14726 && !TARGET_64BIT)
14727 ? "___tls_get_addr"
14728 : "__tls_get_addr");
14729 }
14730
14731 return ix86_tls_symbol;
14732 }
14733
14734 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
14735
14736 static GTY(()) rtx ix86_tls_module_base_symbol;
14737 rtx
14738 ix86_tls_module_base (void)
14739 {
14740
14741 if (!ix86_tls_module_base_symbol)
14742 {
14743 ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode,
14744 "_TLS_MODULE_BASE_");
14745 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
14746 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
14747 }
14748
14749 return ix86_tls_module_base_symbol;
14750 }
14751 \f
14752 /* Calculate the length of the memory address in the instruction
14753 encoding. Does not include the one-byte modrm, opcode, or prefix. */
14754
14755 int
14756 memory_address_length (rtx addr)
14757 {
14758 struct ix86_address parts;
14759 rtx base, index, disp;
14760 int len;
14761 int ok;
14762
14763 if (GET_CODE (addr) == PRE_DEC
14764 || GET_CODE (addr) == POST_INC
14765 || GET_CODE (addr) == PRE_MODIFY
14766 || GET_CODE (addr) == POST_MODIFY)
14767 return 0;
14768
14769 ok = ix86_decompose_address (addr, &parts);
14770 gcc_assert (ok);
14771
14772 if (parts.base && GET_CODE (parts.base) == SUBREG)
14773 parts.base = SUBREG_REG (parts.base);
14774 if (parts.index && GET_CODE (parts.index) == SUBREG)
14775 parts.index = SUBREG_REG (parts.index);
14776
14777 base = parts.base;
14778 index = parts.index;
14779 disp = parts.disp;
14780 len = 0;
14781
14782 /* Rule of thumb:
14783 - esp as the base always wants an index,
14784 - ebp as the base always wants a displacement. */
14785
14786 /* Register Indirect. */
14787 if (base && !index && !disp)
14788 {
14789 /* esp (for its index) and ebp (for its displacement) need
14790 the two-byte modrm form. */
14791 if (addr == stack_pointer_rtx
14792 || addr == arg_pointer_rtx
14793 || addr == frame_pointer_rtx
14794 || addr == hard_frame_pointer_rtx)
14795 len = 1;
14796 }
14797
14798 /* Direct Addressing. */
14799 else if (disp && !base && !index)
14800 len = 4;
14801
14802 else
14803 {
14804 /* Find the length of the displacement constant. */
14805 if (disp)
14806 {
14807 if (base && satisfies_constraint_K (disp))
14808 len = 1;
14809 else
14810 len = 4;
14811 }
14812 /* ebp always wants a displacement. */
14813 else if (base == hard_frame_pointer_rtx)
14814 len = 1;
14815
14816 /* An index requires the two-byte modrm form.... */
14817 if (index
14818 /* ...like esp, which always wants an index. */
14819 || base == stack_pointer_rtx
14820 || base == arg_pointer_rtx
14821 || base == frame_pointer_rtx)
14822 len += 1;
14823 }
14824
14825 return len;
14826 }
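/* Added examples (illustration only, not in the original): under the rules
   above, (reg %eax) needs no extra bytes; (reg %esp) and (reg %ebp) need
   one (a SIB byte resp. a zero disp8); a bare symbol or constant address
   needs 4; (plus %eax 8) needs 1 (disp8) while (plus %eax 1000) needs 4
   (disp32); and any index register adds one more byte for the SIB form.  */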
14827
14828 /* Compute the default value for the "length_immediate" attribute.  When
14829    SHORTFORM is set, expect that the insn has an 8-bit immediate alternative.  */
14830 int
14831 ix86_attr_length_immediate_default (rtx insn, int shortform)
14832 {
14833 int len = 0;
14834 int i;
14835 extract_insn_cached (insn);
14836 for (i = recog_data.n_operands - 1; i >= 0; --i)
14837 if (CONSTANT_P (recog_data.operand[i]))
14838 {
14839 gcc_assert (!len);
14840 if (shortform && satisfies_constraint_K (recog_data.operand[i]))
14841 len = 1;
14842 else
14843 {
14844 switch (get_attr_mode (insn))
14845 {
14846 case MODE_QI:
14847 len+=1;
14848 break;
14849 case MODE_HI:
14850 len+=2;
14851 break;
14852 case MODE_SI:
14853 len+=4;
14854 break;
14855 	      /* Immediates for DImode instructions are encoded as 32-bit sign-extended values.  */
14856 case MODE_DI:
14857 len+=4;
14858 break;
14859 default:
14860 fatal_insn ("unknown insn mode", insn);
14861 }
14862 }
14863 }
14864 return len;
14865 }
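/* Added examples (illustration only): for an insn with an 8-bit immediate
   alternative, "addl $3, %eax" contributes 1 byte while "addl $1000, %eax"
   contributes 4; DImode immediates also contribute 4, since they are
   encoded as sign-extended 32-bit values.  */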
14866 /* Compute default value for "length_address" attribute. */
14867 int
14868 ix86_attr_length_address_default (rtx insn)
14869 {
14870 int i;
14871
14872 if (get_attr_type (insn) == TYPE_LEA)
14873 {
14874 rtx set = PATTERN (insn);
14875
14876 if (GET_CODE (set) == PARALLEL)
14877 set = XVECEXP (set, 0, 0);
14878
14879 gcc_assert (GET_CODE (set) == SET);
14880
14881 return memory_address_length (SET_SRC (set));
14882 }
14883
14884 extract_insn_cached (insn);
14885 for (i = recog_data.n_operands - 1; i >= 0; --i)
14886 if (MEM_P (recog_data.operand[i]))
14887 {
14888 return memory_address_length (XEXP (recog_data.operand[i], 0));
14889 break;
14890 }
14891 return 0;
14892 }
14893 \f
14894 /* Return the maximum number of instructions a cpu can issue. */
14895
14896 static int
14897 ix86_issue_rate (void)
14898 {
14899 switch (ix86_tune)
14900 {
14901 case PROCESSOR_PENTIUM:
14902 case PROCESSOR_K6:
14903 return 2;
14904
14905 case PROCESSOR_PENTIUMPRO:
14906 case PROCESSOR_PENTIUM4:
14907 case PROCESSOR_ATHLON:
14908 case PROCESSOR_K8:
14909 case PROCESSOR_AMDFAM10:
14910 case PROCESSOR_NOCONA:
14911 case PROCESSOR_GENERIC32:
14912 case PROCESSOR_GENERIC64:
14913 return 3;
14914
14915 case PROCESSOR_CORE2:
14916 return 4;
14917
14918 default:
14919 return 1;
14920 }
14921 }
14922
14923 /* A subroutine of ix86_adjust_cost -- return true iff INSN reads the flags
14924    set by DEP_INSN and reads nothing else that DEP_INSN sets.  */
14925
14926 static int
14927 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
14928 {
14929 rtx set, set2;
14930
14931 /* Simplify the test for uninteresting insns. */
14932 if (insn_type != TYPE_SETCC
14933 && insn_type != TYPE_ICMOV
14934 && insn_type != TYPE_FCMOV
14935 && insn_type != TYPE_IBR)
14936 return 0;
14937
14938 if ((set = single_set (dep_insn)) != 0)
14939 {
14940 set = SET_DEST (set);
14941 set2 = NULL_RTX;
14942 }
14943 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
14944 && XVECLEN (PATTERN (dep_insn), 0) == 2
14945 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
14946 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
14947 {
14948 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
14949       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
14950 }
14951 else
14952 return 0;
14953
14954 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
14955 return 0;
14956
14957 /* This test is true if the dependent insn reads the flags but
14958 not any other potentially set register. */
14959 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
14960 return 0;
14961
14962 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
14963 return 0;
14964
14965 return 1;
14966 }
14967
14968 /* A subroutine of ix86_adjust_cost -- return true iff INSN has a memory
14969 address with operands set by DEP_INSN. */
14970
14971 static int
14972 ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
14973 {
14974 rtx addr;
14975
14976 if (insn_type == TYPE_LEA
14977 && TARGET_PENTIUM)
14978 {
14979 addr = PATTERN (insn);
14980
14981 if (GET_CODE (addr) == PARALLEL)
14982 addr = XVECEXP (addr, 0, 0);
14983
14984 gcc_assert (GET_CODE (addr) == SET);
14985
14986 addr = SET_SRC (addr);
14987 }
14988 else
14989 {
14990 int i;
14991 extract_insn_cached (insn);
14992 for (i = recog_data.n_operands - 1; i >= 0; --i)
14993 if (MEM_P (recog_data.operand[i]))
14994 {
14995 addr = XEXP (recog_data.operand[i], 0);
14996 goto found;
14997 }
14998 return 0;
14999 found:;
15000 }
15001
15002 return modified_in_p (addr, dep_insn);
15003 }
15004
15005 static int
15006 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
15007 {
15008 enum attr_type insn_type, dep_insn_type;
15009 enum attr_memory memory;
15010 rtx set, set2;
15011 int dep_insn_code_number;
15012
15013 /* Anti and output dependencies have zero cost on all CPUs. */
15014 if (REG_NOTE_KIND (link) != 0)
15015 return 0;
15016
15017 dep_insn_code_number = recog_memoized (dep_insn);
15018
15019 /* If we can't recognize the insns, we can't really do anything. */
15020 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
15021 return cost;
15022
15023 insn_type = get_attr_type (insn);
15024 dep_insn_type = get_attr_type (dep_insn);
15025
15026 switch (ix86_tune)
15027 {
15028 case PROCESSOR_PENTIUM:
15029 /* Address Generation Interlock adds a cycle of latency. */
15030 if (ix86_agi_dependent (insn, dep_insn, insn_type))
15031 cost += 1;
15032
15033 /* ??? Compares pair with jump/setcc. */
15034 if (ix86_flags_dependent (insn, dep_insn, insn_type))
15035 cost = 0;
15036
15037       /* Floating point stores require the value to be ready one cycle earlier.  */
15038 if (insn_type == TYPE_FMOV
15039 && get_attr_memory (insn) == MEMORY_STORE
15040 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15041 cost += 1;
15042 break;
15043
15044 case PROCESSOR_PENTIUMPRO:
15045 memory = get_attr_memory (insn);
15046
15047 /* INT->FP conversion is expensive. */
15048 if (get_attr_fp_int_src (dep_insn))
15049 cost += 5;
15050
15051 /* There is one cycle extra latency between an FP op and a store. */
15052 if (insn_type == TYPE_FMOV
15053 && (set = single_set (dep_insn)) != NULL_RTX
15054 && (set2 = single_set (insn)) != NULL_RTX
15055 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
15056 && MEM_P (SET_DEST (set2)))
15057 cost += 1;
15058
15059       /* Model the ability of the reorder buffer to hide the latency of a load
15060 	 by executing it in parallel with the previous instruction when the
15061 	 previous instruction is not needed to compute the address.  */
15062 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15063 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15064 {
15065 	  /* Claim that moves take one cycle, as the core can issue one load
15066 	     at a time and the next load can start a cycle later.  */
15067 if (dep_insn_type == TYPE_IMOV
15068 || dep_insn_type == TYPE_FMOV)
15069 cost = 1;
15070 else if (cost > 1)
15071 cost--;
15072 }
15073 break;
15074
15075 case PROCESSOR_K6:
15076 memory = get_attr_memory (insn);
15077
15078 /* The esp dependency is resolved before the instruction is really
15079 finished. */
15080 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
15081 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
15082 return 1;
15083
15084 /* INT->FP conversion is expensive. */
15085 if (get_attr_fp_int_src (dep_insn))
15086 cost += 5;
15087
15088       /* Model the ability of the reorder buffer to hide the latency of a load
15089 	 by executing it in parallel with the previous instruction when the
15090 	 previous instruction is not needed to compute the address.  */
15091 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15092 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15093 {
15094 	  /* Claim that moves take one cycle, as the core can issue one load
15095 	     at a time and the next load can start a cycle later.  */
15096 if (dep_insn_type == TYPE_IMOV
15097 || dep_insn_type == TYPE_FMOV)
15098 cost = 1;
15099 else if (cost > 2)
15100 cost -= 2;
15101 else
15102 cost = 1;
15103 }
15104 break;
15105
15106 case PROCESSOR_ATHLON:
15107 case PROCESSOR_K8:
15108 case PROCESSOR_AMDFAM10:
15109 case PROCESSOR_GENERIC32:
15110 case PROCESSOR_GENERIC64:
15111 memory = get_attr_memory (insn);
15112
15113       /* Model the ability of the reorder buffer to hide the latency of a load
15114 	 by executing it in parallel with the previous instruction when the
15115 	 previous instruction is not needed to compute the address.  */
15116 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15117 && !ix86_agi_dependent (insn, dep_insn, insn_type))
15118 {
15119 enum attr_unit unit = get_attr_unit (insn);
15120 int loadcost = 3;
15121
15122 /* Because of the difference between the length of integer and
15123 floating unit pipeline preparation stages, the memory operands
15124 for floating point are cheaper.
15125
15126 	     ??? For Athlon the difference is most probably 2.  */
15127 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
15128 loadcost = 3;
15129 else
15130 loadcost = TARGET_ATHLON ? 2 : 0;
15131
15132 if (cost >= loadcost)
15133 cost -= loadcost;
15134 else
15135 cost = 0;
15136 }
15137
15138 default:
15139 break;
15140 }
15141
15142 return cost;
15143 }
15144
15145 /* How many alternative schedules to try. This should be as wide as the
15146 scheduling freedom in the DFA, but no wider. Making this value too
15147    large results in extra work for the scheduler.  */
15148
15149 static int
15150 ia32_multipass_dfa_lookahead (void)
15151 {
15152 if (ix86_tune == PROCESSOR_PENTIUM)
15153 return 2;
15154
15155 if (ix86_tune == PROCESSOR_PENTIUMPRO
15156 || ix86_tune == PROCESSOR_K6)
15157 return 1;
15158
15159 else
15160 return 0;
15161 }
15162
15163 \f
15164 /* Compute the alignment given to a constant that is being placed in memory.
15165 EXP is the constant and ALIGN is the alignment that the object would
15166 ordinarily have.
15167 The value of this function is used instead of that alignment to align
15168 the object. */
15169
15170 int
15171 ix86_constant_alignment (tree exp, int align)
15172 {
15173 if (TREE_CODE (exp) == REAL_CST)
15174 {
15175 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
15176 return 64;
15177 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
15178 return 128;
15179 }
15180 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
15181 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
15182 return BITS_PER_WORD;
15183
15184 return align;
15185 }
15186
15187 /* Compute the alignment for a static variable.
15188 TYPE is the data type, and ALIGN is the alignment that
15189 the object would ordinarily have. The value of this function is used
15190 instead of that alignment to align the object. */
15191
15192 int
15193 ix86_data_alignment (tree type, int align)
15194 {
15195 int max_align = optimize_size ? BITS_PER_WORD : 256;
15196
15197 if (AGGREGATE_TYPE_P (type)
15198 && TYPE_SIZE (type)
15199 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15200 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
15201 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
15202 && align < max_align)
15203 align = max_align;
15204
15205   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
15206      to a 16-byte boundary.  */
15207 if (TARGET_64BIT)
15208 {
15209 if (AGGREGATE_TYPE_P (type)
15210 && TYPE_SIZE (type)
15211 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15212 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
15213 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
15214 return 128;
15215 }
15216
15217 if (TREE_CODE (type) == ARRAY_TYPE)
15218 {
15219 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
15220 return 64;
15221 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
15222 return 128;
15223 }
15224 else if (TREE_CODE (type) == COMPLEX_TYPE)
15225 {
15226
15227 if (TYPE_MODE (type) == DCmode && align < 64)
15228 return 64;
15229 if (TYPE_MODE (type) == XCmode && align < 128)
15230 return 128;
15231 }
15232 else if ((TREE_CODE (type) == RECORD_TYPE
15233 || TREE_CODE (type) == UNION_TYPE
15234 || TREE_CODE (type) == QUAL_UNION_TYPE)
15235 && TYPE_FIELDS (type))
15236 {
15237 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
15238 return 64;
15239 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
15240 return 128;
15241 }
15242 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
15243 || TREE_CODE (type) == INTEGER_TYPE)
15244 {
15245 if (TYPE_MODE (type) == DFmode && align < 64)
15246 return 64;
15247 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
15248 return 128;
15249 }
15250
15251 return align;
15252 }
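/* Added example (illustration only, not in the original): under the rules
   above, a file-scope "double d[32]" (2048 bits) is raised to 256-bit
   (32-byte) alignment when not optimizing for size, even though the
   element type only requires 64-bit alignment.  */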
15253
15254 /* Compute the alignment for a local variable.
15255 TYPE is the data type, and ALIGN is the alignment that
15256 the object would ordinarily have. The value of this macro is used
15257 instead of that alignment to align the object. */
15258
15259 int
15260 ix86_local_alignment (tree type, int align)
15261 {
15262   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
15263      to a 16-byte boundary.  */
15264 if (TARGET_64BIT)
15265 {
15266 if (AGGREGATE_TYPE_P (type)
15267 && TYPE_SIZE (type)
15268 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15269 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
15270 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
15271 return 128;
15272 }
15273 if (TREE_CODE (type) == ARRAY_TYPE)
15274 {
15275 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
15276 return 64;
15277 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
15278 return 128;
15279 }
15280 else if (TREE_CODE (type) == COMPLEX_TYPE)
15281 {
15282 if (TYPE_MODE (type) == DCmode && align < 64)
15283 return 64;
15284 if (TYPE_MODE (type) == XCmode && align < 128)
15285 return 128;
15286 }
15287 else if ((TREE_CODE (type) == RECORD_TYPE
15288 || TREE_CODE (type) == UNION_TYPE
15289 || TREE_CODE (type) == QUAL_UNION_TYPE)
15290 && TYPE_FIELDS (type))
15291 {
15292 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
15293 return 64;
15294 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
15295 return 128;
15296 }
15297 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
15298 || TREE_CODE (type) == INTEGER_TYPE)
15299 {
15300
15301 if (TYPE_MODE (type) == DFmode && align < 64)
15302 return 64;
15303 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
15304 return 128;
15305 }
15306 return align;
15307 }
15308 \f
15309 /* Emit RTL insns to initialize the variable parts of a trampoline.
15310 FNADDR is an RTX for the address of the function's pure code.
15311 CXT is an RTX for the static chain value for the function. */
15312 void
15313 x86_initialize_trampoline (rtx tramp, rtx fnaddr, rtx cxt)
15314 {
15315 if (!TARGET_64BIT)
15316 {
15317 /* Compute offset from the end of the jmp to the target function. */
15318 rtx disp = expand_binop (SImode, sub_optab, fnaddr,
15319 plus_constant (tramp, 10),
15320 NULL_RTX, 1, OPTAB_DIRECT);
15321 emit_move_insn (gen_rtx_MEM (QImode, tramp),
15322 gen_int_mode (0xb9, QImode));
15323 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 1)), cxt);
15324 emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, 5)),
15325 gen_int_mode (0xe9, QImode));
15326 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 6)), disp);
15327 }
15328 else
15329 {
15330 int offset = 0;
15331 /* Try to load address using shorter movl instead of movabs.
15332 	 We may want to support movq for kernel mode, but the kernel does not use
15333 trampolines at the moment. */
15334 if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
15335 {
15336 fnaddr = copy_to_mode_reg (DImode, fnaddr);
15337 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15338 gen_int_mode (0xbb41, HImode));
15339 emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, offset + 2)),
15340 gen_lowpart (SImode, fnaddr));
15341 offset += 6;
15342 }
15343 else
15344 {
15345 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15346 gen_int_mode (0xbb49, HImode));
15347 emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15348 fnaddr);
15349 offset += 10;
15350 }
15351 /* Load static chain using movabs to r10. */
15352 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15353 gen_int_mode (0xba49, HImode));
15354 emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15355 cxt);
15356 offset += 10;
15357       /* Jump to r11.  */
15358 emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15359 gen_int_mode (0xff49, HImode));
15360 emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, offset+2)),
15361 gen_int_mode (0xe3, QImode));
15362 offset += 3;
15363 gcc_assert (offset <= TRAMPOLINE_SIZE);
15364 }
15365
15366 #ifdef ENABLE_EXECUTE_STACK
15367 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
15368 LCT_NORMAL, VOIDmode, 1, tramp, Pmode);
15369 #endif
15370 }
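/* Added illustration (not part of the original): the bytes stored above
   form, for the 32-bit case,
       b9 <cxt32>        movl   $cxt, %ecx
       e9 <rel32>        jmp    fnaddr
   and, for the 64-bit case (with the movabs variant),
       49 bb <imm64>     movabs $fnaddr, %r11
       49 ba <imm64>     movabs $cxt, %r10
       49 ff e3          jmp    *%r11  */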
15371 \f
15372 /* Codes for all the SSE/MMX builtins. */
15373 enum ix86_builtins
15374 {
15375 IX86_BUILTIN_ADDPS,
15376 IX86_BUILTIN_ADDSS,
15377 IX86_BUILTIN_DIVPS,
15378 IX86_BUILTIN_DIVSS,
15379 IX86_BUILTIN_MULPS,
15380 IX86_BUILTIN_MULSS,
15381 IX86_BUILTIN_SUBPS,
15382 IX86_BUILTIN_SUBSS,
15383
15384 IX86_BUILTIN_CMPEQPS,
15385 IX86_BUILTIN_CMPLTPS,
15386 IX86_BUILTIN_CMPLEPS,
15387 IX86_BUILTIN_CMPGTPS,
15388 IX86_BUILTIN_CMPGEPS,
15389 IX86_BUILTIN_CMPNEQPS,
15390 IX86_BUILTIN_CMPNLTPS,
15391 IX86_BUILTIN_CMPNLEPS,
15392 IX86_BUILTIN_CMPNGTPS,
15393 IX86_BUILTIN_CMPNGEPS,
15394 IX86_BUILTIN_CMPORDPS,
15395 IX86_BUILTIN_CMPUNORDPS,
15396 IX86_BUILTIN_CMPEQSS,
15397 IX86_BUILTIN_CMPLTSS,
15398 IX86_BUILTIN_CMPLESS,
15399 IX86_BUILTIN_CMPNEQSS,
15400 IX86_BUILTIN_CMPNLTSS,
15401 IX86_BUILTIN_CMPNLESS,
15402 IX86_BUILTIN_CMPNGTSS,
15403 IX86_BUILTIN_CMPNGESS,
15404 IX86_BUILTIN_CMPORDSS,
15405 IX86_BUILTIN_CMPUNORDSS,
15406
15407 IX86_BUILTIN_COMIEQSS,
15408 IX86_BUILTIN_COMILTSS,
15409 IX86_BUILTIN_COMILESS,
15410 IX86_BUILTIN_COMIGTSS,
15411 IX86_BUILTIN_COMIGESS,
15412 IX86_BUILTIN_COMINEQSS,
15413 IX86_BUILTIN_UCOMIEQSS,
15414 IX86_BUILTIN_UCOMILTSS,
15415 IX86_BUILTIN_UCOMILESS,
15416 IX86_BUILTIN_UCOMIGTSS,
15417 IX86_BUILTIN_UCOMIGESS,
15418 IX86_BUILTIN_UCOMINEQSS,
15419
15420 IX86_BUILTIN_CVTPI2PS,
15421 IX86_BUILTIN_CVTPS2PI,
15422 IX86_BUILTIN_CVTSI2SS,
15423 IX86_BUILTIN_CVTSI642SS,
15424 IX86_BUILTIN_CVTSS2SI,
15425 IX86_BUILTIN_CVTSS2SI64,
15426 IX86_BUILTIN_CVTTPS2PI,
15427 IX86_BUILTIN_CVTTSS2SI,
15428 IX86_BUILTIN_CVTTSS2SI64,
15429
15430 IX86_BUILTIN_MAXPS,
15431 IX86_BUILTIN_MAXSS,
15432 IX86_BUILTIN_MINPS,
15433 IX86_BUILTIN_MINSS,
15434
15435 IX86_BUILTIN_LOADUPS,
15436 IX86_BUILTIN_STOREUPS,
15437 IX86_BUILTIN_MOVSS,
15438
15439 IX86_BUILTIN_MOVHLPS,
15440 IX86_BUILTIN_MOVLHPS,
15441 IX86_BUILTIN_LOADHPS,
15442 IX86_BUILTIN_LOADLPS,
15443 IX86_BUILTIN_STOREHPS,
15444 IX86_BUILTIN_STORELPS,
15445
15446 IX86_BUILTIN_MASKMOVQ,
15447 IX86_BUILTIN_MOVMSKPS,
15448 IX86_BUILTIN_PMOVMSKB,
15449
15450 IX86_BUILTIN_MOVNTPS,
15451 IX86_BUILTIN_MOVNTQ,
15452
15453 IX86_BUILTIN_LOADDQU,
15454 IX86_BUILTIN_STOREDQU,
15455
15456 IX86_BUILTIN_PACKSSWB,
15457 IX86_BUILTIN_PACKSSDW,
15458 IX86_BUILTIN_PACKUSWB,
15459
15460 IX86_BUILTIN_PADDB,
15461 IX86_BUILTIN_PADDW,
15462 IX86_BUILTIN_PADDD,
15463 IX86_BUILTIN_PADDQ,
15464 IX86_BUILTIN_PADDSB,
15465 IX86_BUILTIN_PADDSW,
15466 IX86_BUILTIN_PADDUSB,
15467 IX86_BUILTIN_PADDUSW,
15468 IX86_BUILTIN_PSUBB,
15469 IX86_BUILTIN_PSUBW,
15470 IX86_BUILTIN_PSUBD,
15471 IX86_BUILTIN_PSUBQ,
15472 IX86_BUILTIN_PSUBSB,
15473 IX86_BUILTIN_PSUBSW,
15474 IX86_BUILTIN_PSUBUSB,
15475 IX86_BUILTIN_PSUBUSW,
15476
15477 IX86_BUILTIN_PAND,
15478 IX86_BUILTIN_PANDN,
15479 IX86_BUILTIN_POR,
15480 IX86_BUILTIN_PXOR,
15481
15482 IX86_BUILTIN_PAVGB,
15483 IX86_BUILTIN_PAVGW,
15484
15485 IX86_BUILTIN_PCMPEQB,
15486 IX86_BUILTIN_PCMPEQW,
15487 IX86_BUILTIN_PCMPEQD,
15488 IX86_BUILTIN_PCMPGTB,
15489 IX86_BUILTIN_PCMPGTW,
15490 IX86_BUILTIN_PCMPGTD,
15491
15492 IX86_BUILTIN_PMADDWD,
15493
15494 IX86_BUILTIN_PMAXSW,
15495 IX86_BUILTIN_PMAXUB,
15496 IX86_BUILTIN_PMINSW,
15497 IX86_BUILTIN_PMINUB,
15498
15499 IX86_BUILTIN_PMULHUW,
15500 IX86_BUILTIN_PMULHW,
15501 IX86_BUILTIN_PMULLW,
15502
15503 IX86_BUILTIN_PSADBW,
15504 IX86_BUILTIN_PSHUFW,
15505
15506 IX86_BUILTIN_PSLLW,
15507 IX86_BUILTIN_PSLLD,
15508 IX86_BUILTIN_PSLLQ,
15509 IX86_BUILTIN_PSRAW,
15510 IX86_BUILTIN_PSRAD,
15511 IX86_BUILTIN_PSRLW,
15512 IX86_BUILTIN_PSRLD,
15513 IX86_BUILTIN_PSRLQ,
15514 IX86_BUILTIN_PSLLWI,
15515 IX86_BUILTIN_PSLLDI,
15516 IX86_BUILTIN_PSLLQI,
15517 IX86_BUILTIN_PSRAWI,
15518 IX86_BUILTIN_PSRADI,
15519 IX86_BUILTIN_PSRLWI,
15520 IX86_BUILTIN_PSRLDI,
15521 IX86_BUILTIN_PSRLQI,
15522
15523 IX86_BUILTIN_PUNPCKHBW,
15524 IX86_BUILTIN_PUNPCKHWD,
15525 IX86_BUILTIN_PUNPCKHDQ,
15526 IX86_BUILTIN_PUNPCKLBW,
15527 IX86_BUILTIN_PUNPCKLWD,
15528 IX86_BUILTIN_PUNPCKLDQ,
15529
15530 IX86_BUILTIN_SHUFPS,
15531
15532 IX86_BUILTIN_RCPPS,
15533 IX86_BUILTIN_RCPSS,
15534 IX86_BUILTIN_RSQRTPS,
15535 IX86_BUILTIN_RSQRTSS,
15536 IX86_BUILTIN_SQRTPS,
15537 IX86_BUILTIN_SQRTSS,
15538
15539 IX86_BUILTIN_UNPCKHPS,
15540 IX86_BUILTIN_UNPCKLPS,
15541
15542 IX86_BUILTIN_ANDPS,
15543 IX86_BUILTIN_ANDNPS,
15544 IX86_BUILTIN_ORPS,
15545 IX86_BUILTIN_XORPS,
15546
15547 IX86_BUILTIN_EMMS,
15548 IX86_BUILTIN_LDMXCSR,
15549 IX86_BUILTIN_STMXCSR,
15550 IX86_BUILTIN_SFENCE,
15551
15552 /* 3DNow! Original */
15553 IX86_BUILTIN_FEMMS,
15554 IX86_BUILTIN_PAVGUSB,
15555 IX86_BUILTIN_PF2ID,
15556 IX86_BUILTIN_PFACC,
15557 IX86_BUILTIN_PFADD,
15558 IX86_BUILTIN_PFCMPEQ,
15559 IX86_BUILTIN_PFCMPGE,
15560 IX86_BUILTIN_PFCMPGT,
15561 IX86_BUILTIN_PFMAX,
15562 IX86_BUILTIN_PFMIN,
15563 IX86_BUILTIN_PFMUL,
15564 IX86_BUILTIN_PFRCP,
15565 IX86_BUILTIN_PFRCPIT1,
15566 IX86_BUILTIN_PFRCPIT2,
15567 IX86_BUILTIN_PFRSQIT1,
15568 IX86_BUILTIN_PFRSQRT,
15569 IX86_BUILTIN_PFSUB,
15570 IX86_BUILTIN_PFSUBR,
15571 IX86_BUILTIN_PI2FD,
15572 IX86_BUILTIN_PMULHRW,
15573
15574 /* 3DNow! Athlon Extensions */
15575 IX86_BUILTIN_PF2IW,
15576 IX86_BUILTIN_PFNACC,
15577 IX86_BUILTIN_PFPNACC,
15578 IX86_BUILTIN_PI2FW,
15579 IX86_BUILTIN_PSWAPDSI,
15580 IX86_BUILTIN_PSWAPDSF,
15581
15582 /* SSE2 */
15583 IX86_BUILTIN_ADDPD,
15584 IX86_BUILTIN_ADDSD,
15585 IX86_BUILTIN_DIVPD,
15586 IX86_BUILTIN_DIVSD,
15587 IX86_BUILTIN_MULPD,
15588 IX86_BUILTIN_MULSD,
15589 IX86_BUILTIN_SUBPD,
15590 IX86_BUILTIN_SUBSD,
15591
15592 IX86_BUILTIN_CMPEQPD,
15593 IX86_BUILTIN_CMPLTPD,
15594 IX86_BUILTIN_CMPLEPD,
15595 IX86_BUILTIN_CMPGTPD,
15596 IX86_BUILTIN_CMPGEPD,
15597 IX86_BUILTIN_CMPNEQPD,
15598 IX86_BUILTIN_CMPNLTPD,
15599 IX86_BUILTIN_CMPNLEPD,
15600 IX86_BUILTIN_CMPNGTPD,
15601 IX86_BUILTIN_CMPNGEPD,
15602 IX86_BUILTIN_CMPORDPD,
15603 IX86_BUILTIN_CMPUNORDPD,
15604 IX86_BUILTIN_CMPNEPD,
15605 IX86_BUILTIN_CMPEQSD,
15606 IX86_BUILTIN_CMPLTSD,
15607 IX86_BUILTIN_CMPLESD,
15608 IX86_BUILTIN_CMPNEQSD,
15609 IX86_BUILTIN_CMPNLTSD,
15610 IX86_BUILTIN_CMPNLESD,
15611 IX86_BUILTIN_CMPORDSD,
15612 IX86_BUILTIN_CMPUNORDSD,
15613 IX86_BUILTIN_CMPNESD,
15614
15615 IX86_BUILTIN_COMIEQSD,
15616 IX86_BUILTIN_COMILTSD,
15617 IX86_BUILTIN_COMILESD,
15618 IX86_BUILTIN_COMIGTSD,
15619 IX86_BUILTIN_COMIGESD,
15620 IX86_BUILTIN_COMINEQSD,
15621 IX86_BUILTIN_UCOMIEQSD,
15622 IX86_BUILTIN_UCOMILTSD,
15623 IX86_BUILTIN_UCOMILESD,
15624 IX86_BUILTIN_UCOMIGTSD,
15625 IX86_BUILTIN_UCOMIGESD,
15626 IX86_BUILTIN_UCOMINEQSD,
15627
15628 IX86_BUILTIN_MAXPD,
15629 IX86_BUILTIN_MAXSD,
15630 IX86_BUILTIN_MINPD,
15631 IX86_BUILTIN_MINSD,
15632
15633 IX86_BUILTIN_ANDPD,
15634 IX86_BUILTIN_ANDNPD,
15635 IX86_BUILTIN_ORPD,
15636 IX86_BUILTIN_XORPD,
15637
15638 IX86_BUILTIN_SQRTPD,
15639 IX86_BUILTIN_SQRTSD,
15640
15641 IX86_BUILTIN_UNPCKHPD,
15642 IX86_BUILTIN_UNPCKLPD,
15643
15644 IX86_BUILTIN_SHUFPD,
15645
15646 IX86_BUILTIN_LOADUPD,
15647 IX86_BUILTIN_STOREUPD,
15648 IX86_BUILTIN_MOVSD,
15649
15650 IX86_BUILTIN_LOADHPD,
15651 IX86_BUILTIN_LOADLPD,
15652
15653 IX86_BUILTIN_CVTDQ2PD,
15654 IX86_BUILTIN_CVTDQ2PS,
15655
15656 IX86_BUILTIN_CVTPD2DQ,
15657 IX86_BUILTIN_CVTPD2PI,
15658 IX86_BUILTIN_CVTPD2PS,
15659 IX86_BUILTIN_CVTTPD2DQ,
15660 IX86_BUILTIN_CVTTPD2PI,
15661
15662 IX86_BUILTIN_CVTPI2PD,
15663 IX86_BUILTIN_CVTSI2SD,
15664 IX86_BUILTIN_CVTSI642SD,
15665
15666 IX86_BUILTIN_CVTSD2SI,
15667 IX86_BUILTIN_CVTSD2SI64,
15668 IX86_BUILTIN_CVTSD2SS,
15669 IX86_BUILTIN_CVTSS2SD,
15670 IX86_BUILTIN_CVTTSD2SI,
15671 IX86_BUILTIN_CVTTSD2SI64,
15672
15673 IX86_BUILTIN_CVTPS2DQ,
15674 IX86_BUILTIN_CVTPS2PD,
15675 IX86_BUILTIN_CVTTPS2DQ,
15676
15677 IX86_BUILTIN_MOVNTI,
15678 IX86_BUILTIN_MOVNTPD,
15679 IX86_BUILTIN_MOVNTDQ,
15680
15681 /* SSE2 MMX */
15682 IX86_BUILTIN_MASKMOVDQU,
15683 IX86_BUILTIN_MOVMSKPD,
15684 IX86_BUILTIN_PMOVMSKB128,
15685
15686 IX86_BUILTIN_PACKSSWB128,
15687 IX86_BUILTIN_PACKSSDW128,
15688 IX86_BUILTIN_PACKUSWB128,
15689
15690 IX86_BUILTIN_PADDB128,
15691 IX86_BUILTIN_PADDW128,
15692 IX86_BUILTIN_PADDD128,
15693 IX86_BUILTIN_PADDQ128,
15694 IX86_BUILTIN_PADDSB128,
15695 IX86_BUILTIN_PADDSW128,
15696 IX86_BUILTIN_PADDUSB128,
15697 IX86_BUILTIN_PADDUSW128,
15698 IX86_BUILTIN_PSUBB128,
15699 IX86_BUILTIN_PSUBW128,
15700 IX86_BUILTIN_PSUBD128,
15701 IX86_BUILTIN_PSUBQ128,
15702 IX86_BUILTIN_PSUBSB128,
15703 IX86_BUILTIN_PSUBSW128,
15704 IX86_BUILTIN_PSUBUSB128,
15705 IX86_BUILTIN_PSUBUSW128,
15706
15707 IX86_BUILTIN_PAND128,
15708 IX86_BUILTIN_PANDN128,
15709 IX86_BUILTIN_POR128,
15710 IX86_BUILTIN_PXOR128,
15711
15712 IX86_BUILTIN_PAVGB128,
15713 IX86_BUILTIN_PAVGW128,
15714
15715 IX86_BUILTIN_PCMPEQB128,
15716 IX86_BUILTIN_PCMPEQW128,
15717 IX86_BUILTIN_PCMPEQD128,
15718 IX86_BUILTIN_PCMPGTB128,
15719 IX86_BUILTIN_PCMPGTW128,
15720 IX86_BUILTIN_PCMPGTD128,
15721
15722 IX86_BUILTIN_PMADDWD128,
15723
15724 IX86_BUILTIN_PMAXSW128,
15725 IX86_BUILTIN_PMAXUB128,
15726 IX86_BUILTIN_PMINSW128,
15727 IX86_BUILTIN_PMINUB128,
15728
15729 IX86_BUILTIN_PMULUDQ,
15730 IX86_BUILTIN_PMULUDQ128,
15731 IX86_BUILTIN_PMULHUW128,
15732 IX86_BUILTIN_PMULHW128,
15733 IX86_BUILTIN_PMULLW128,
15734
15735 IX86_BUILTIN_PSADBW128,
15736 IX86_BUILTIN_PSHUFHW,
15737 IX86_BUILTIN_PSHUFLW,
15738 IX86_BUILTIN_PSHUFD,
15739
15740 IX86_BUILTIN_PSLLW128,
15741 IX86_BUILTIN_PSLLD128,
15742 IX86_BUILTIN_PSLLQ128,
15743 IX86_BUILTIN_PSRAW128,
15744 IX86_BUILTIN_PSRAD128,
15745 IX86_BUILTIN_PSRLW128,
15746 IX86_BUILTIN_PSRLD128,
15747 IX86_BUILTIN_PSRLQ128,
15748 IX86_BUILTIN_PSLLDQI128,
15749 IX86_BUILTIN_PSLLWI128,
15750 IX86_BUILTIN_PSLLDI128,
15751 IX86_BUILTIN_PSLLQI128,
15752 IX86_BUILTIN_PSRAWI128,
15753 IX86_BUILTIN_PSRADI128,
15754 IX86_BUILTIN_PSRLDQI128,
15755 IX86_BUILTIN_PSRLWI128,
15756 IX86_BUILTIN_PSRLDI128,
15757 IX86_BUILTIN_PSRLQI128,
15758
15759 IX86_BUILTIN_PUNPCKHBW128,
15760 IX86_BUILTIN_PUNPCKHWD128,
15761 IX86_BUILTIN_PUNPCKHDQ128,
15762 IX86_BUILTIN_PUNPCKHQDQ128,
15763 IX86_BUILTIN_PUNPCKLBW128,
15764 IX86_BUILTIN_PUNPCKLWD128,
15765 IX86_BUILTIN_PUNPCKLDQ128,
15766 IX86_BUILTIN_PUNPCKLQDQ128,
15767
15768 IX86_BUILTIN_CLFLUSH,
15769 IX86_BUILTIN_MFENCE,
15770 IX86_BUILTIN_LFENCE,
15771
15772 /* Prescott New Instructions. */
15773 IX86_BUILTIN_ADDSUBPS,
15774 IX86_BUILTIN_HADDPS,
15775 IX86_BUILTIN_HSUBPS,
15776 IX86_BUILTIN_MOVSHDUP,
15777 IX86_BUILTIN_MOVSLDUP,
15778 IX86_BUILTIN_ADDSUBPD,
15779 IX86_BUILTIN_HADDPD,
15780 IX86_BUILTIN_HSUBPD,
15781 IX86_BUILTIN_LDDQU,
15782
15783 IX86_BUILTIN_MONITOR,
15784 IX86_BUILTIN_MWAIT,
15785
15786 /* SSSE3. */
15787 IX86_BUILTIN_PHADDW,
15788 IX86_BUILTIN_PHADDD,
15789 IX86_BUILTIN_PHADDSW,
15790 IX86_BUILTIN_PHSUBW,
15791 IX86_BUILTIN_PHSUBD,
15792 IX86_BUILTIN_PHSUBSW,
15793 IX86_BUILTIN_PMADDUBSW,
15794 IX86_BUILTIN_PMULHRSW,
15795 IX86_BUILTIN_PSHUFB,
15796 IX86_BUILTIN_PSIGNB,
15797 IX86_BUILTIN_PSIGNW,
15798 IX86_BUILTIN_PSIGND,
15799 IX86_BUILTIN_PALIGNR,
15800 IX86_BUILTIN_PABSB,
15801 IX86_BUILTIN_PABSW,
15802 IX86_BUILTIN_PABSD,
15803
15804 IX86_BUILTIN_PHADDW128,
15805 IX86_BUILTIN_PHADDD128,
15806 IX86_BUILTIN_PHADDSW128,
15807 IX86_BUILTIN_PHSUBW128,
15808 IX86_BUILTIN_PHSUBD128,
15809 IX86_BUILTIN_PHSUBSW128,
15810 IX86_BUILTIN_PMADDUBSW128,
15811 IX86_BUILTIN_PMULHRSW128,
15812 IX86_BUILTIN_PSHUFB128,
15813 IX86_BUILTIN_PSIGNB128,
15814 IX86_BUILTIN_PSIGNW128,
15815 IX86_BUILTIN_PSIGND128,
15816 IX86_BUILTIN_PALIGNR128,
15817 IX86_BUILTIN_PABSB128,
15818 IX86_BUILTIN_PABSW128,
15819 IX86_BUILTIN_PABSD128,
15820
15821 /* AMDFAM10 - SSE4A New Instructions. */
15822 IX86_BUILTIN_MOVNTSD,
15823 IX86_BUILTIN_MOVNTSS,
15824 IX86_BUILTIN_EXTRQI,
15825 IX86_BUILTIN_EXTRQ,
15826 IX86_BUILTIN_INSERTQI,
15827 IX86_BUILTIN_INSERTQ,
15828
15829 IX86_BUILTIN_VEC_INIT_V2SI,
15830 IX86_BUILTIN_VEC_INIT_V4HI,
15831 IX86_BUILTIN_VEC_INIT_V8QI,
15832 IX86_BUILTIN_VEC_EXT_V2DF,
15833 IX86_BUILTIN_VEC_EXT_V2DI,
15834 IX86_BUILTIN_VEC_EXT_V4SF,
15835 IX86_BUILTIN_VEC_EXT_V4SI,
15836 IX86_BUILTIN_VEC_EXT_V8HI,
15837 IX86_BUILTIN_VEC_EXT_V2SI,
15838 IX86_BUILTIN_VEC_EXT_V4HI,
15839 IX86_BUILTIN_VEC_SET_V8HI,
15840 IX86_BUILTIN_VEC_SET_V4HI,
15841
15842 IX86_BUILTIN_MAX
15843 };
15844
15845 /* Table for the ix86 builtin decls. */
15846 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
15847
15848 /* Add an ix86 target builtin function with CODE, NAME and TYPE.  Do so
15849  * only if the target_flags include one of MASK.  Stores the function decl
15850  * in the ix86_builtins array.
15851  * Returns the function decl, or NULL_TREE if the builtin was not added.  */
15852
15853 static inline tree
15854 def_builtin (int mask, const char *name, tree type, enum ix86_builtins code)
15855 {
15856 tree decl = NULL_TREE;
15857
15858 if (mask & target_flags
15859 && (!(mask & MASK_64BIT) || TARGET_64BIT))
15860 {
15861 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
15862 NULL, NULL_TREE);
15863 ix86_builtins[(int) code] = decl;
15864 }
15865
15866 return decl;
15867 }
15868
15869 /* Like def_builtin, but also marks the function decl "const". */
15870
15871 static inline tree
15872 def_builtin_const (int mask, const char *name, tree type,
15873 enum ix86_builtins code)
15874 {
15875 tree decl = def_builtin (mask, name, type, code);
15876 if (decl)
15877 TREE_READONLY (decl) = 1;
15878 return decl;
15879 }
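/* Added usage sketch (hypothetical, for illustration only; the type name
   below is made up and the real registration calls appear later in this
   file):
     def_builtin_const (MASK_SSE, "__builtin_ia32_addps",
			v4sf_binary_fn_type, IX86_BUILTIN_ADDPS);
   registers the builtin only when MASK_SSE is in target_flags (and, had
   MASK_64BIT been included, only for TARGET_64BIT), and additionally marks
   the decl TREE_READONLY.  */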
15880
15881 /* Bits for builtin_description.flag. */
15882
15883 /* Set when we don't support the comparison natively, and should
15884    swap the comparison operands in order to support it.  */
15885 #define BUILTIN_DESC_SWAP_OPERANDS 1
15886
15887 struct builtin_description
15888 {
15889 const unsigned int mask;
15890 const enum insn_code icode;
15891 const char *const name;
15892 const enum ix86_builtins code;
15893 const enum rtx_code comparison;
15894 const unsigned int flag;
15895 };
15896
15897 static const struct builtin_description bdesc_comi[] =
15898 {
15899 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
15900 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
15901 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
15902 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
15903 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
15904 { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
15905 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
15906 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
15907 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
15908 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
15909 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
15910 { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
15911 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
15912 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
15913 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
15914 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
15915 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
15916 { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
15917 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
15918 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
15919 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
15920 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
15921 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
15922 { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
15923 };
15924
15925 static const struct builtin_description bdesc_2arg[] =
15926 {
15927 /* SSE */
15928 { MASK_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, 0, 0 },
15929 { MASK_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, 0, 0 },
15930 { MASK_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, 0, 0 },
15931 { MASK_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, 0, 0 },
15932 { MASK_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, 0, 0 },
15933 { MASK_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, 0, 0 },
15934 { MASK_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, 0, 0 },
15935 { MASK_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, 0, 0 },
15936
15937 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, 0 },
15938 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, 0 },
15939 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, 0 },
15940 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT,
15941 BUILTIN_DESC_SWAP_OPERANDS },
15942 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE,
15943 BUILTIN_DESC_SWAP_OPERANDS },
15944 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, 0 },
15945 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, 0 },
15946 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, 0 },
15947 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, 0 },
15948 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE,
15949 BUILTIN_DESC_SWAP_OPERANDS },
15950 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT,
15951 BUILTIN_DESC_SWAP_OPERANDS },
15952 { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, 0 },
15953 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, 0 },
15954 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, 0 },
15955 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, 0 },
15956 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, 0 },
15957 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, 0 },
15958 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, 0 },
15959 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, 0 },
15960 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE,
15961 BUILTIN_DESC_SWAP_OPERANDS },
15962 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT,
15963 BUILTIN_DESC_SWAP_OPERANDS },
15964 { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, UNORDERED, 0 },
15965
15966 { MASK_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, 0, 0 },
15967 { MASK_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, 0, 0 },
15968 { MASK_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 },
15969 { MASK_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 },
15970
15971 { MASK_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },
15972 { MASK_SSE, CODE_FOR_sse_nandv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 },
15973 { MASK_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 },
15974 { MASK_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 },
15975
15976 { MASK_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 },
15977 { MASK_SSE, CODE_FOR_sse_movhlps, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 },
15978 { MASK_SSE, CODE_FOR_sse_movlhps, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 },
15979 { MASK_SSE, CODE_FOR_sse_unpckhps, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, 0, 0 },
15980 { MASK_SSE, CODE_FOR_sse_unpcklps, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, 0, 0 },
15981
15982 /* MMX */
15983 { MASK_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, 0, 0 },
15984 { MASK_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, 0, 0 },
15985 { MASK_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, 0, 0 },
15986 { MASK_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, 0, 0 },
15987 { MASK_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, 0, 0 },
15988 { MASK_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, 0, 0 },
15989 { MASK_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, 0, 0 },
15990 { MASK_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, 0, 0 },
15991
15992 { MASK_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, 0, 0 },
15993 { MASK_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, 0, 0 },
15994 { MASK_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, 0, 0 },
15995 { MASK_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, 0, 0 },
15996 { MASK_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, 0, 0 },
15997 { MASK_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, 0, 0 },
15998 { MASK_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, 0, 0 },
15999 { MASK_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, 0, 0 },
16000
16001 { MASK_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, 0, 0 },
16002 { MASK_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, 0, 0 },
16003 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, 0, 0 },
16004
16005 { MASK_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, 0, 0 },
16006 { MASK_MMX, CODE_FOR_mmx_nandv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, 0, 0 },
16007 { MASK_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, 0, 0 },
16008 { MASK_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, 0, 0 },
16009
16010 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, 0, 0 },
16011 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, 0, 0 },
16012
16013 { MASK_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, 0, 0 },
16014 { MASK_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, 0, 0 },
16015 { MASK_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, 0, 0 },
16016 { MASK_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, 0, 0 },
16017 { MASK_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, 0, 0 },
16018 { MASK_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, 0, 0 },
16019
16020 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, 0, 0 },
16021 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, 0, 0 },
16022 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, 0, 0 },
16023 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, 0, 0 },
16024
16025 { MASK_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, 0, 0 },
16026 { MASK_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, 0, 0 },
16027 { MASK_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, 0, 0 },
16028 { MASK_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, 0, 0 },
16029 { MASK_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, 0, 0 },
16030 { MASK_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, 0, 0 },
16031
16032 /* Special. */
16033 { MASK_MMX, CODE_FOR_mmx_packsswb, 0, IX86_BUILTIN_PACKSSWB, 0, 0 },
16034 { MASK_MMX, CODE_FOR_mmx_packssdw, 0, IX86_BUILTIN_PACKSSDW, 0, 0 },
16035 { MASK_MMX, CODE_FOR_mmx_packuswb, 0, IX86_BUILTIN_PACKUSWB, 0, 0 },
16036
16037 { MASK_SSE, CODE_FOR_sse_cvtpi2ps, 0, IX86_BUILTIN_CVTPI2PS, 0, 0 },
16038 { MASK_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, 0, 0 },
16039 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, 0, 0 },
16040
16041 { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, 0, 0 },
16042 { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, 0, 0 },
16043 { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, 0, 0 },
16044 { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, 0, 0 },
16045 { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, 0, 0 },
16046 { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, 0, 0 },
16047
16048 { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, 0, 0 },
16049 { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, 0, 0 },
16050 { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, 0, 0 },
16051 { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, 0, 0 },
16052 { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, 0, 0 },
16053 { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, 0, 0 },
16054
16055 { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, 0, 0 },
16056 { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, 0, 0 },
16057 { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, 0, 0 },
16058 { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, 0, 0 },
16059
16060 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, 0, 0 },
16061 { MASK_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, 0, 0 },
16062
16063 /* SSE2 */
16064 { MASK_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, 0, 0 },
16065 { MASK_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, 0, 0 },
16066 { MASK_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, 0, 0 },
16067 { MASK_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, 0, 0 },
16068 { MASK_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, 0, 0 },
16069 { MASK_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, 0, 0 },
16070 { MASK_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, 0, 0 },
16071 { MASK_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, 0, 0 },
16072
16073 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, 0 },
16074 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, 0 },
16075 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, 0 },
16076 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT,
16077 BUILTIN_DESC_SWAP_OPERANDS },
16078 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE,
16079 BUILTIN_DESC_SWAP_OPERANDS },
16080 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, 0 },
16081 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, 0 },
16082 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, 0 },
16083 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, 0 },
16084 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE,
16085 BUILTIN_DESC_SWAP_OPERANDS },
16086 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT,
16087 BUILTIN_DESC_SWAP_OPERANDS },
16088 { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, 0 },
16089 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, 0 },
16090 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, 0 },
16091 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, 0 },
16092 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, 0 },
16093 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, 0 },
16094 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, 0 },
16095 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, 0 },
16096 { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, 0 },
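/* There are no hardware compares for GT/GE (or their negations); the
   entries above encode them as the LT/LE/UNGE/UNGT compares with
   BUILTIN_DESC_SWAP_OPERANDS set, and ix86_expand_sse_compare below
   swaps the two arguments before emitting the insn.  */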
16097
16098 { MASK_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, 0, 0 },
16099 { MASK_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, 0, 0 },
16100 { MASK_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 },
16101 { MASK_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 },
16102
16103 { MASK_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
16104 { MASK_SSE2, CODE_FOR_sse2_nandv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
16105 { MASK_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
16106 { MASK_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
16107
16108 { MASK_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 },
16109 { MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 },
16110 { MASK_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, 0, 0 },
16111
16112 /* SSE2 MMX */
16113 { MASK_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, 0, 0 },
16114 { MASK_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, 0, 0 },
16115 { MASK_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, 0, 0 },
16116 { MASK_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, 0, 0 },
16117 { MASK_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, 0, 0 },
16118 { MASK_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, 0, 0 },
16119 { MASK_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, 0, 0 },
16120 { MASK_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, 0, 0 },
16121
16122 { MASK_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, 0, 0 },
16123 { MASK_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, 0, 0 },
16124 { MASK_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, 0, 0 },
16125 { MASK_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, 0, 0 },
16126 { MASK_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, 0, 0 },
16127 { MASK_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, 0, 0 },
16128 { MASK_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, 0, 0 },
16129 { MASK_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, 0, 0 },
16130
16131 { MASK_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, 0, 0 },
16132 { MASK_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 },
16133
16134 { MASK_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 },
16135 { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 },
16136 { MASK_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, 0, 0 },
16137 { MASK_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, 0, 0 },
16138
16139 { MASK_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, 0, 0 },
16140 { MASK_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, 0, 0 },
16141
16142 { MASK_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, 0, 0 },
16143 { MASK_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, 0, 0 },
16144 { MASK_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, 0, 0 },
16145 { MASK_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, 0, 0 },
16146 { MASK_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, 0, 0 },
16147 { MASK_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, 0, 0 },
16148
16149 { MASK_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, 0, 0 },
16150 { MASK_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, 0, 0 },
16151 { MASK_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, 0, 0 },
16152 { MASK_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, 0, 0 },
16153
16154 { MASK_SSE2, CODE_FOR_sse2_punpckhbw, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, 0, 0 },
16155 { MASK_SSE2, CODE_FOR_sse2_punpckhwd, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, 0, 0 },
16156 { MASK_SSE2, CODE_FOR_sse2_punpckhdq, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, 0, 0 },
16157 { MASK_SSE2, CODE_FOR_sse2_punpckhqdq, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, 0, 0 },
16158 { MASK_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, 0, 0 },
16159 { MASK_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, 0, 0 },
16160 { MASK_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, 0, 0 },
16161 { MASK_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, 0, 0 },
16162
16163 { MASK_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, 0, 0 },
16164 { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 },
16165 { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 },
16166
16167 { MASK_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 },
16168 { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 },
16169
16170 { MASK_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, 0, 0 },
16171 { MASK_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, 0, 0 },
16172
16173 { MASK_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, 0, 0 },
16174 { MASK_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, 0, 0 },
16175 { MASK_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, 0, 0 },
16176
16177 { MASK_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, 0, 0 },
16178 { MASK_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, 0, 0 },
16179 { MASK_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, 0, 0 },
16180
16181 { MASK_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, 0, 0 },
16182 { MASK_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, 0, 0 },
16183
16184 { MASK_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, 0, 0 },
16185
16186 { MASK_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, 0, 0 },
16187 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsi2sdq, 0, IX86_BUILTIN_CVTSI642SD, 0, 0 },
16188 { MASK_SSE2, CODE_FOR_sse2_cvtsd2ss, 0, IX86_BUILTIN_CVTSD2SS, 0, 0 },
16189 { MASK_SSE2, CODE_FOR_sse2_cvtss2sd, 0, IX86_BUILTIN_CVTSS2SD, 0, 0 },
16190
16191 /* SSE3 MMX */
16192 { MASK_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, 0, 0 },
16193 { MASK_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, 0, 0 },
16194 { MASK_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, 0, 0 },
16195 { MASK_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, 0, 0 },
16196 { MASK_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, 0, 0 },
16197 { MASK_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, 0, 0 },
16198
16199 /* SSSE3 */
16200 { MASK_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, 0, 0 },
16201 { MASK_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 0, 0 },
16202 { MASK_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, 0, 0 },
16203 { MASK_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, 0, 0 },
16204 { MASK_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, 0, 0 },
16205 { MASK_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, 0, 0 },
16206 { MASK_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, 0, 0 },
16207 { MASK_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, 0, 0 },
16208 { MASK_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, 0, 0 },
16209 { MASK_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, 0, 0 },
16210 { MASK_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, 0, 0 },
16211 { MASK_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, 0, 0 },
16212 { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv8hi3, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, 0, 0 },
16213 { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv4hi3, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, 0, 0 },
16214 { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, 0, 0 },
16215 { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, 0, 0 },
16216 { MASK_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, 0, 0 },
16217 { MASK_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, 0, 0 },
16218 { MASK_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, 0, 0 },
16219 { MASK_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, 0, 0 },
16220 { MASK_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, 0, 0 },
16221 { MASK_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, 0, 0 },
16222 { MASK_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, 0, 0 },
16223 { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 }
16224 };
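/* Entries with a null name are skipped by the generic registration loop
   in ix86_init_mmx_sse_builtins; they are given hand-written prototypes
   by explicit def_builtin calls there instead (e.g. __builtin_ia32_packsswb
   with v8qi_ftype_v4hi_v4hi).  */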
16225
16226 static const struct builtin_description bdesc_1arg[] =
16227 {
16228 { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, 0, 0 },
16229 { MASK_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, 0, 0 },
16230
16231 { MASK_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, 0, 0 },
16232 { MASK_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, 0, 0 },
16233 { MASK_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, 0, 0 },
16234
16235 { MASK_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, 0, 0 },
16236 { MASK_SSE, CODE_FOR_sse_cvtss2si, 0, IX86_BUILTIN_CVTSS2SI, 0, 0 },
16237 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtss2siq, 0, IX86_BUILTIN_CVTSS2SI64, 0, 0 },
16238 { MASK_SSE, CODE_FOR_sse_cvttps2pi, 0, IX86_BUILTIN_CVTTPS2PI, 0, 0 },
16239 { MASK_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, 0, 0 },
16240 { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, 0, 0 },
16241
16242 { MASK_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, 0, 0 },
16243 { MASK_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, 0, 0 },
16244
16245 { MASK_SSE2, CODE_FOR_sqrtv2df2, 0, IX86_BUILTIN_SQRTPD, 0, 0 },
16246
16247 { MASK_SSE2, CODE_FOR_sse2_cvtdq2pd, 0, IX86_BUILTIN_CVTDQ2PD, 0, 0 },
16248 { MASK_SSE2, CODE_FOR_sse2_cvtdq2ps, 0, IX86_BUILTIN_CVTDQ2PS, 0, 0 },
16249
16250 { MASK_SSE2, CODE_FOR_sse2_cvtpd2dq, 0, IX86_BUILTIN_CVTPD2DQ, 0, 0 },
16251 { MASK_SSE2, CODE_FOR_sse2_cvtpd2pi, 0, IX86_BUILTIN_CVTPD2PI, 0, 0 },
16252 { MASK_SSE2, CODE_FOR_sse2_cvtpd2ps, 0, IX86_BUILTIN_CVTPD2PS, 0, 0 },
16253 { MASK_SSE2, CODE_FOR_sse2_cvttpd2dq, 0, IX86_BUILTIN_CVTTPD2DQ, 0, 0 },
16254 { MASK_SSE2, CODE_FOR_sse2_cvttpd2pi, 0, IX86_BUILTIN_CVTTPD2PI, 0, 0 },
16255
16256 { MASK_SSE2, CODE_FOR_sse2_cvtpi2pd, 0, IX86_BUILTIN_CVTPI2PD, 0, 0 },
16257
16258 { MASK_SSE2, CODE_FOR_sse2_cvtsd2si, 0, IX86_BUILTIN_CVTSD2SI, 0, 0 },
16259 { MASK_SSE2, CODE_FOR_sse2_cvttsd2si, 0, IX86_BUILTIN_CVTTSD2SI, 0, 0 },
16260 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsd2siq, 0, IX86_BUILTIN_CVTSD2SI64, 0, 0 },
16261 { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvttsd2siq, 0, IX86_BUILTIN_CVTTSD2SI64, 0, 0 },
16262
16263 { MASK_SSE2, CODE_FOR_sse2_cvtps2dq, 0, IX86_BUILTIN_CVTPS2DQ, 0, 0 },
16264 { MASK_SSE2, CODE_FOR_sse2_cvtps2pd, 0, IX86_BUILTIN_CVTPS2PD, 0, 0 },
16265 { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 },
16266
16267 /* SSE3 */
16268 { MASK_SSE3, CODE_FOR_sse3_movshdup, 0, IX86_BUILTIN_MOVSHDUP, 0, 0 },
16269 { MASK_SSE3, CODE_FOR_sse3_movsldup, 0, IX86_BUILTIN_MOVSLDUP, 0, 0 },
16270
16271 /* SSSE3 */
16272 { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 },
16273 { MASK_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, 0, 0 },
16274 { MASK_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, 0, 0 },
16275 { MASK_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, 0, 0 },
16276 { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, 0, 0 },
16277 { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 },
16278 };
16279
16280 static void
16281 ix86_init_builtins (void)
16282 {
16283 if (TARGET_MMX)
16284 ix86_init_mmx_sse_builtins ();
16285 }
16286
16287 /* Set up all the MMX/SSE builtins. This is not called if TARGET_MMX
16288 is zero. Otherwise, if TARGET_SSE is not set, only define the MMX
16289 builtins. */
16290 static void
16291 ix86_init_mmx_sse_builtins (void)
16292 {
16293 const struct builtin_description * d;
16294 size_t i;
16295
16296 tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode);
16297 tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode);
16298 tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode);
16299 tree V2DI_type_node
16300 = build_vector_type_for_mode (long_long_integer_type_node, V2DImode);
16301 tree V2DF_type_node = build_vector_type_for_mode (double_type_node, V2DFmode);
16302 tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode);
16303 tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode);
16304 tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode);
16305 tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode);
16306 tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode);
16307
16308 tree pchar_type_node = build_pointer_type (char_type_node);
16309 tree pcchar_type_node = build_pointer_type (
16310 build_type_variant (char_type_node, 1, 0));
16311 tree pfloat_type_node = build_pointer_type (float_type_node);
16312 tree pcfloat_type_node = build_pointer_type (
16313 build_type_variant (float_type_node, 1, 0));
16314 tree pv2si_type_node = build_pointer_type (V2SI_type_node);
16315 tree pv2di_type_node = build_pointer_type (V2DI_type_node);
16316 tree pdi_type_node = build_pointer_type (long_long_unsigned_type_node);
16317
16318 /* Comparisons. */
16319 tree int_ftype_v4sf_v4sf
16320 = build_function_type_list (integer_type_node,
16321 V4SF_type_node, V4SF_type_node, NULL_TREE);
16322 tree v4si_ftype_v4sf_v4sf
16323 = build_function_type_list (V4SI_type_node,
16324 V4SF_type_node, V4SF_type_node, NULL_TREE);
16325 /* MMX/SSE/integer conversions. */
16326 tree int_ftype_v4sf
16327 = build_function_type_list (integer_type_node,
16328 V4SF_type_node, NULL_TREE);
16329 tree int64_ftype_v4sf
16330 = build_function_type_list (long_long_integer_type_node,
16331 V4SF_type_node, NULL_TREE);
16332 tree int_ftype_v8qi
16333 = build_function_type_list (integer_type_node, V8QI_type_node, NULL_TREE);
16334 tree v4sf_ftype_v4sf_int
16335 = build_function_type_list (V4SF_type_node,
16336 V4SF_type_node, integer_type_node, NULL_TREE);
16337 tree v4sf_ftype_v4sf_int64
16338 = build_function_type_list (V4SF_type_node,
16339 V4SF_type_node, long_long_integer_type_node,
16340 NULL_TREE);
16341 tree v4sf_ftype_v4sf_v2si
16342 = build_function_type_list (V4SF_type_node,
16343 V4SF_type_node, V2SI_type_node, NULL_TREE);
16344
16345 /* Miscellaneous. */
16346 tree v8qi_ftype_v4hi_v4hi
16347 = build_function_type_list (V8QI_type_node,
16348 V4HI_type_node, V4HI_type_node, NULL_TREE);
16349 tree v4hi_ftype_v2si_v2si
16350 = build_function_type_list (V4HI_type_node,
16351 V2SI_type_node, V2SI_type_node, NULL_TREE);
16352 tree v4sf_ftype_v4sf_v4sf_int
16353 = build_function_type_list (V4SF_type_node,
16354 V4SF_type_node, V4SF_type_node,
16355 integer_type_node, NULL_TREE);
16356 tree v2si_ftype_v4hi_v4hi
16357 = build_function_type_list (V2SI_type_node,
16358 V4HI_type_node, V4HI_type_node, NULL_TREE);
16359 tree v4hi_ftype_v4hi_int
16360 = build_function_type_list (V4HI_type_node,
16361 V4HI_type_node, integer_type_node, NULL_TREE);
16362 tree v4hi_ftype_v4hi_di
16363 = build_function_type_list (V4HI_type_node,
16364 V4HI_type_node, long_long_unsigned_type_node,
16365 NULL_TREE);
16366 tree v2si_ftype_v2si_di
16367 = build_function_type_list (V2SI_type_node,
16368 V2SI_type_node, long_long_unsigned_type_node,
16369 NULL_TREE);
16370 tree void_ftype_void
16371 = build_function_type (void_type_node, void_list_node);
16372 tree void_ftype_unsigned
16373 = build_function_type_list (void_type_node, unsigned_type_node, NULL_TREE);
16374 tree void_ftype_unsigned_unsigned
16375 = build_function_type_list (void_type_node, unsigned_type_node,
16376 unsigned_type_node, NULL_TREE);
16377 tree void_ftype_pcvoid_unsigned_unsigned
16378 = build_function_type_list (void_type_node, const_ptr_type_node,
16379 unsigned_type_node, unsigned_type_node,
16380 NULL_TREE);
16381 tree unsigned_ftype_void
16382 = build_function_type (unsigned_type_node, void_list_node);
16383 tree v2si_ftype_v4sf
16384 = build_function_type_list (V2SI_type_node, V4SF_type_node, NULL_TREE);
16385 /* Loads/stores. */
16386 tree void_ftype_v8qi_v8qi_pchar
16387 = build_function_type_list (void_type_node,
16388 V8QI_type_node, V8QI_type_node,
16389 pchar_type_node, NULL_TREE);
16390 tree v4sf_ftype_pcfloat
16391 = build_function_type_list (V4SF_type_node, pcfloat_type_node, NULL_TREE);
16392 /* @@@ the type is bogus */
16393 tree v4sf_ftype_v4sf_pv2si
16394 = build_function_type_list (V4SF_type_node,
16395 V4SF_type_node, pv2si_type_node, NULL_TREE);
16396 tree void_ftype_pv2si_v4sf
16397 = build_function_type_list (void_type_node,
16398 pv2si_type_node, V4SF_type_node, NULL_TREE);
16399 tree void_ftype_pfloat_v4sf
16400 = build_function_type_list (void_type_node,
16401 pfloat_type_node, V4SF_type_node, NULL_TREE);
16402 tree void_ftype_pdi_di
16403 = build_function_type_list (void_type_node,
16404 pdi_type_node, long_long_unsigned_type_node,
16405 NULL_TREE);
16406 tree void_ftype_pv2di_v2di
16407 = build_function_type_list (void_type_node,
16408 pv2di_type_node, V2DI_type_node, NULL_TREE);
16409 /* Normal vector unops. */
16410 tree v4sf_ftype_v4sf
16411 = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
16412 tree v16qi_ftype_v16qi
16413 = build_function_type_list (V16QI_type_node, V16QI_type_node, NULL_TREE);
16414 tree v8hi_ftype_v8hi
16415 = build_function_type_list (V8HI_type_node, V8HI_type_node, NULL_TREE);
16416 tree v4si_ftype_v4si
16417 = build_function_type_list (V4SI_type_node, V4SI_type_node, NULL_TREE);
16418 tree v8qi_ftype_v8qi
16419 = build_function_type_list (V8QI_type_node, V8QI_type_node, NULL_TREE);
16420 tree v4hi_ftype_v4hi
16421 = build_function_type_list (V4HI_type_node, V4HI_type_node, NULL_TREE);
16422
16423 /* Normal vector binops. */
16424 tree v4sf_ftype_v4sf_v4sf
16425 = build_function_type_list (V4SF_type_node,
16426 V4SF_type_node, V4SF_type_node, NULL_TREE);
16427 tree v8qi_ftype_v8qi_v8qi
16428 = build_function_type_list (V8QI_type_node,
16429 V8QI_type_node, V8QI_type_node, NULL_TREE);
16430 tree v4hi_ftype_v4hi_v4hi
16431 = build_function_type_list (V4HI_type_node,
16432 V4HI_type_node, V4HI_type_node, NULL_TREE);
16433 tree v2si_ftype_v2si_v2si
16434 = build_function_type_list (V2SI_type_node,
16435 V2SI_type_node, V2SI_type_node, NULL_TREE);
16436 tree di_ftype_di_di
16437 = build_function_type_list (long_long_unsigned_type_node,
16438 long_long_unsigned_type_node,
16439 long_long_unsigned_type_node, NULL_TREE);
16440
16441 tree di_ftype_di_di_int
16442 = build_function_type_list (long_long_unsigned_type_node,
16443 long_long_unsigned_type_node,
16444 long_long_unsigned_type_node,
16445 integer_type_node, NULL_TREE);
16446
16447 tree v2si_ftype_v2sf
16448 = build_function_type_list (V2SI_type_node, V2SF_type_node, NULL_TREE);
16449 tree v2sf_ftype_v2si
16450 = build_function_type_list (V2SF_type_node, V2SI_type_node, NULL_TREE);
16451 tree v2si_ftype_v2si
16452 = build_function_type_list (V2SI_type_node, V2SI_type_node, NULL_TREE);
16453 tree v2sf_ftype_v2sf
16454 = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
16455 tree v2sf_ftype_v2sf_v2sf
16456 = build_function_type_list (V2SF_type_node,
16457 V2SF_type_node, V2SF_type_node, NULL_TREE);
16458 tree v2si_ftype_v2sf_v2sf
16459 = build_function_type_list (V2SI_type_node,
16460 V2SF_type_node, V2SF_type_node, NULL_TREE);
16461 tree pint_type_node = build_pointer_type (integer_type_node);
16462 tree pdouble_type_node = build_pointer_type (double_type_node);
16463 tree pcdouble_type_node = build_pointer_type (
16464 build_type_variant (double_type_node, 1, 0));
16465 tree int_ftype_v2df_v2df
16466 = build_function_type_list (integer_type_node,
16467 V2DF_type_node, V2DF_type_node, NULL_TREE);
16468
16469 tree void_ftype_pcvoid
16470 = build_function_type_list (void_type_node, const_ptr_type_node, NULL_TREE);
16471 tree v4sf_ftype_v4si
16472 = build_function_type_list (V4SF_type_node, V4SI_type_node, NULL_TREE);
16473 tree v4si_ftype_v4sf
16474 = build_function_type_list (V4SI_type_node, V4SF_type_node, NULL_TREE);
16475 tree v2df_ftype_v4si
16476 = build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
16477 tree v4si_ftype_v2df
16478 = build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
16479 tree v2si_ftype_v2df
16480 = build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
16481 tree v4sf_ftype_v2df
16482 = build_function_type_list (V4SF_type_node, V2DF_type_node, NULL_TREE);
16483 tree v2df_ftype_v2si
16484 = build_function_type_list (V2DF_type_node, V2SI_type_node, NULL_TREE);
16485 tree v2df_ftype_v4sf
16486 = build_function_type_list (V2DF_type_node, V4SF_type_node, NULL_TREE);
16487 tree int_ftype_v2df
16488 = build_function_type_list (integer_type_node, V2DF_type_node, NULL_TREE);
16489 tree int64_ftype_v2df
16490 = build_function_type_list (long_long_integer_type_node,
16491 V2DF_type_node, NULL_TREE);
16492 tree v2df_ftype_v2df_int
16493 = build_function_type_list (V2DF_type_node,
16494 V2DF_type_node, integer_type_node, NULL_TREE);
16495 tree v2df_ftype_v2df_int64
16496 = build_function_type_list (V2DF_type_node,
16497 V2DF_type_node, long_long_integer_type_node,
16498 NULL_TREE);
16499 tree v4sf_ftype_v4sf_v2df
16500 = build_function_type_list (V4SF_type_node,
16501 V4SF_type_node, V2DF_type_node, NULL_TREE);
16502 tree v2df_ftype_v2df_v4sf
16503 = build_function_type_list (V2DF_type_node,
16504 V2DF_type_node, V4SF_type_node, NULL_TREE);
16505 tree v2df_ftype_v2df_v2df_int
16506 = build_function_type_list (V2DF_type_node,
16507 V2DF_type_node, V2DF_type_node,
16508 integer_type_node,
16509 NULL_TREE);
16510 tree v2df_ftype_v2df_pcdouble
16511 = build_function_type_list (V2DF_type_node,
16512 V2DF_type_node, pcdouble_type_node, NULL_TREE);
16513 tree void_ftype_pdouble_v2df
16514 = build_function_type_list (void_type_node,
16515 pdouble_type_node, V2DF_type_node, NULL_TREE);
16516 tree void_ftype_pint_int
16517 = build_function_type_list (void_type_node,
16518 pint_type_node, integer_type_node, NULL_TREE);
16519 tree void_ftype_v16qi_v16qi_pchar
16520 = build_function_type_list (void_type_node,
16521 V16QI_type_node, V16QI_type_node,
16522 pchar_type_node, NULL_TREE);
16523 tree v2df_ftype_pcdouble
16524 = build_function_type_list (V2DF_type_node, pcdouble_type_node, NULL_TREE);
16525 tree v2df_ftype_v2df_v2df
16526 = build_function_type_list (V2DF_type_node,
16527 V2DF_type_node, V2DF_type_node, NULL_TREE);
16528 tree v16qi_ftype_v16qi_v16qi
16529 = build_function_type_list (V16QI_type_node,
16530 V16QI_type_node, V16QI_type_node, NULL_TREE);
16531 tree v8hi_ftype_v8hi_v8hi
16532 = build_function_type_list (V8HI_type_node,
16533 V8HI_type_node, V8HI_type_node, NULL_TREE);
16534 tree v4si_ftype_v4si_v4si
16535 = build_function_type_list (V4SI_type_node,
16536 V4SI_type_node, V4SI_type_node, NULL_TREE);
16537 tree v2di_ftype_v2di_v2di
16538 = build_function_type_list (V2DI_type_node,
16539 V2DI_type_node, V2DI_type_node, NULL_TREE);
16540 tree v2di_ftype_v2df_v2df
16541 = build_function_type_list (V2DI_type_node,
16542 V2DF_type_node, V2DF_type_node, NULL_TREE);
16543 tree v2df_ftype_v2df
16544 = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
16545 tree v2di_ftype_v2di_int
16546 = build_function_type_list (V2DI_type_node,
16547 V2DI_type_node, integer_type_node, NULL_TREE);
16548 tree v2di_ftype_v2di_v2di_int
16549 = build_function_type_list (V2DI_type_node, V2DI_type_node,
16550 V2DI_type_node, integer_type_node, NULL_TREE);
16551 tree v4si_ftype_v4si_int
16552 = build_function_type_list (V4SI_type_node,
16553 V4SI_type_node, integer_type_node, NULL_TREE);
16554 tree v8hi_ftype_v8hi_int
16555 = build_function_type_list (V8HI_type_node,
16556 V8HI_type_node, integer_type_node, NULL_TREE);
16557 tree v8hi_ftype_v8hi_v2di
16558 = build_function_type_list (V8HI_type_node,
16559 V8HI_type_node, V2DI_type_node, NULL_TREE);
16560 tree v4si_ftype_v4si_v2di
16561 = build_function_type_list (V4SI_type_node,
16562 V4SI_type_node, V2DI_type_node, NULL_TREE);
16563 tree v4si_ftype_v8hi_v8hi
16564 = build_function_type_list (V4SI_type_node,
16565 V8HI_type_node, V8HI_type_node, NULL_TREE);
16566 tree di_ftype_v8qi_v8qi
16567 = build_function_type_list (long_long_unsigned_type_node,
16568 V8QI_type_node, V8QI_type_node, NULL_TREE);
16569 tree di_ftype_v2si_v2si
16570 = build_function_type_list (long_long_unsigned_type_node,
16571 V2SI_type_node, V2SI_type_node, NULL_TREE);
16572 tree v2di_ftype_v16qi_v16qi
16573 = build_function_type_list (V2DI_type_node,
16574 V16QI_type_node, V16QI_type_node, NULL_TREE);
16575 tree v2di_ftype_v4si_v4si
16576 = build_function_type_list (V2DI_type_node,
16577 V4SI_type_node, V4SI_type_node, NULL_TREE);
16578 tree int_ftype_v16qi
16579 = build_function_type_list (integer_type_node, V16QI_type_node, NULL_TREE);
16580 tree v16qi_ftype_pcchar
16581 = build_function_type_list (V16QI_type_node, pcchar_type_node, NULL_TREE);
16582 tree void_ftype_pchar_v16qi
16583 = build_function_type_list (void_type_node,
16584 pchar_type_node, V16QI_type_node, NULL_TREE);
16585
16586 tree v2di_ftype_v2di_unsigned_unsigned
16587 = build_function_type_list (V2DI_type_node, V2DI_type_node,
16588 unsigned_type_node, unsigned_type_node,
16589 NULL_TREE);
16590 tree v2di_ftype_v2di_v2di_unsigned_unsigned
16591 = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node,
16592 unsigned_type_node, unsigned_type_node,
16593 NULL_TREE);
16594 tree v2di_ftype_v2di_v16qi
16595 = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
16596 NULL_TREE);
16597
16598 tree float80_type;
16599 tree float128_type;
16600 tree ftype;
16601
16602 /* The __float80 type. */
16603 if (TYPE_MODE (long_double_type_node) == XFmode)
16604 (*lang_hooks.types.register_builtin_type) (long_double_type_node,
16605 "__float80");
16606 else
16607 {
16608 /* The __float80 type. */
16609 float80_type = make_node (REAL_TYPE);
16610 TYPE_PRECISION (float80_type) = 80;
16611 layout_type (float80_type);
16612 (*lang_hooks.types.register_builtin_type) (float80_type, "__float80");
16613 }
16614
16615 if (TARGET_64BIT)
16616 {
16617 float128_type = make_node (REAL_TYPE);
16618 TYPE_PRECISION (float128_type) = 128;
16619 layout_type (float128_type);
16620 (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
16621 }
16622
16623 /* Add all builtins that are more or less simple operations on two
16624 operands. */
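/* For instance, CODE_FOR_addv2df3 has V2DFmode operands, so the switch
   below selects v2df_ftype_v2df_v2df and the entry is registered under
   its table name, __builtin_ia32_addpd.  */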
16625 for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
16626 {
16627 /* Use one of the operands; the target can have a different mode for
16628 mask-generating compares. */
16629 enum machine_mode mode;
16630 tree type;
16631
16632 if (d->name == 0)
16633 continue;
16634 mode = insn_data[d->icode].operand[1].mode;
16635
16636 switch (mode)
16637 {
16638 case V16QImode:
16639 type = v16qi_ftype_v16qi_v16qi;
16640 break;
16641 case V8HImode:
16642 type = v8hi_ftype_v8hi_v8hi;
16643 break;
16644 case V4SImode:
16645 type = v4si_ftype_v4si_v4si;
16646 break;
16647 case V2DImode:
16648 type = v2di_ftype_v2di_v2di;
16649 break;
16650 case V2DFmode:
16651 type = v2df_ftype_v2df_v2df;
16652 break;
16653 case V4SFmode:
16654 type = v4sf_ftype_v4sf_v4sf;
16655 break;
16656 case V8QImode:
16657 type = v8qi_ftype_v8qi_v8qi;
16658 break;
16659 case V4HImode:
16660 type = v4hi_ftype_v4hi_v4hi;
16661 break;
16662 case V2SImode:
16663 type = v2si_ftype_v2si_v2si;
16664 break;
16665 case DImode:
16666 type = di_ftype_di_di;
16667 break;
16668
16669 default:
16670 gcc_unreachable ();
16671 }
16672
16673 /* Override for comparisons. */
16674 if (d->icode == CODE_FOR_sse_maskcmpv4sf3
16675 || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3)
16676 type = v4si_ftype_v4sf_v4sf;
16677
16678 if (d->icode == CODE_FOR_sse2_maskcmpv2df3
16679 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
16680 type = v2di_ftype_v2df_v2df;
16681
16682 def_builtin (d->mask, d->name, type, d->code);
16683 }
16684
16685 /* Add all builtins that are more or less simple operations on 1 operand. */
16686 for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
16687 {
16688 enum machine_mode mode;
16689 tree type;
16690
16691 if (d->name == 0)
16692 continue;
16693 mode = insn_data[d->icode].operand[1].mode;
16694
16695 switch (mode)
16696 {
16697 case V16QImode:
16698 type = v16qi_ftype_v16qi;
16699 break;
16700 case V8HImode:
16701 type = v8hi_ftype_v8hi;
16702 break;
16703 case V4SImode:
16704 type = v4si_ftype_v4si;
16705 break;
16706 case V2DFmode:
16707 type = v2df_ftype_v2df;
16708 break;
16709 case V4SFmode:
16710 type = v4sf_ftype_v4sf;
16711 break;
16712 case V8QImode:
16713 type = v8qi_ftype_v8qi;
16714 break;
16715 case V4HImode:
16716 type = v4hi_ftype_v4hi;
16717 break;
16718 case V2SImode:
16719 type = v2si_ftype_v2si;
16720 break;
16721
16722 default:
16723 gcc_unreachable ();
16724 }
16725
16726 def_builtin (d->mask, d->name, type, d->code);
16727 }
16728
16729 /* Add the remaining MMX insns with somewhat more complicated types. */
16730 def_builtin (MASK_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
16731 def_builtin (MASK_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
16732 def_builtin (MASK_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
16733 def_builtin (MASK_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
16734
16735 def_builtin (MASK_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW);
16736 def_builtin (MASK_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD);
16737 def_builtin (MASK_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ);
16738
16739 def_builtin (MASK_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW);
16740 def_builtin (MASK_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD);
16741
16742 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
16743 def_builtin (MASK_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
16744
16745 /* comi/ucomi insns. */
16746 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
16747 if (d->mask == MASK_SSE2)
16748 def_builtin (d->mask, d->name, int_ftype_v2df_v2df, d->code);
16749 else
16750 def_builtin (d->mask, d->name, int_ftype_v4sf_v4sf, d->code);
16751
16752 def_builtin (MASK_MMX, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
16753 def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
16754 def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
16755
16756 def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);
16757 def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR);
16758 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS);
16759 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI);
16760 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS);
16761 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS);
16762 def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI);
16763 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64);
16764 def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
16765 def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
16766 def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64);
16767
16768 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ);
16769
16770 def_builtin (MASK_SSE, "__builtin_ia32_loadups", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADUPS);
16771 def_builtin (MASK_SSE, "__builtin_ia32_storeups", void_ftype_pfloat_v4sf, IX86_BUILTIN_STOREUPS);
16772
16773 def_builtin (MASK_SSE, "__builtin_ia32_loadhps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADHPS);
16774 def_builtin (MASK_SSE, "__builtin_ia32_loadlps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADLPS);
16775 def_builtin (MASK_SSE, "__builtin_ia32_storehps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STOREHPS);
16776 def_builtin (MASK_SSE, "__builtin_ia32_storelps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STORELPS);
16777
16778 def_builtin (MASK_SSE, "__builtin_ia32_movmskps", int_ftype_v4sf, IX86_BUILTIN_MOVMSKPS);
16779 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pmovmskb", int_ftype_v8qi, IX86_BUILTIN_PMOVMSKB);
16780 def_builtin (MASK_SSE, "__builtin_ia32_movntps", void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTPS);
16781 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_movntq", void_ftype_pdi_di, IX86_BUILTIN_MOVNTQ);
16782
16783 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE);
16784
16785 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
16786
16787 def_builtin (MASK_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
16788 def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
16789 def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
16790 def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
16791 def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
16792 def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
16793
16794 def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
16795
16796 /* Original 3DNow! */
16797 def_builtin (MASK_3DNOW, "__builtin_ia32_femms", void_ftype_void, IX86_BUILTIN_FEMMS);
16798 def_builtin (MASK_3DNOW, "__builtin_ia32_pavgusb", v8qi_ftype_v8qi_v8qi, IX86_BUILTIN_PAVGUSB);
16799 def_builtin (MASK_3DNOW, "__builtin_ia32_pf2id", v2si_ftype_v2sf, IX86_BUILTIN_PF2ID);
16800 def_builtin (MASK_3DNOW, "__builtin_ia32_pfacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFACC);
16801 def_builtin (MASK_3DNOW, "__builtin_ia32_pfadd", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFADD);
16802 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpeq", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPEQ);
16803 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpge", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGE);
16804 def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpgt", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGT);
16805 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmax", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMAX);
16806 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmin", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMIN);
16807 def_builtin (MASK_3DNOW, "__builtin_ia32_pfmul", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMUL);
16808 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcp", v2sf_ftype_v2sf, IX86_BUILTIN_PFRCP);
16809 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT1);
16810 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit2", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT2);
16811 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqrt", v2sf_ftype_v2sf, IX86_BUILTIN_PFRSQRT);
16812 def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRSQIT1);
16813 def_builtin (MASK_3DNOW, "__builtin_ia32_pfsub", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUB);
16814 def_builtin (MASK_3DNOW, "__builtin_ia32_pfsubr", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUBR);
16815 def_builtin (MASK_3DNOW, "__builtin_ia32_pi2fd", v2sf_ftype_v2si, IX86_BUILTIN_PI2FD);
16816 def_builtin (MASK_3DNOW, "__builtin_ia32_pmulhrw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PMULHRW);
16817
16818 /* 3DNow! extension as used in the Athlon CPU. */
16819 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pf2iw", v2si_ftype_v2sf, IX86_BUILTIN_PF2IW);
16820 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFNACC);
16821 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfpnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFPNACC);
16822 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pi2fw", v2sf_ftype_v2si, IX86_BUILTIN_PI2FW);
16823 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsf", v2sf_ftype_v2sf, IX86_BUILTIN_PSWAPDSF);
16824 def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsi", v2si_ftype_v2si, IX86_BUILTIN_PSWAPDSI);
16825
16826 /* SSE2 */
16827 def_builtin (MASK_SSE2, "__builtin_ia32_maskmovdqu", void_ftype_v16qi_v16qi_pchar, IX86_BUILTIN_MASKMOVDQU);
16828
16829 def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADUPD);
16830 def_builtin (MASK_SSE2, "__builtin_ia32_storeupd", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREUPD);
16831
16832 def_builtin (MASK_SSE2, "__builtin_ia32_loadhpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADHPD);
16833 def_builtin (MASK_SSE2, "__builtin_ia32_loadlpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADLPD);
16834
16835 def_builtin (MASK_SSE2, "__builtin_ia32_movmskpd", int_ftype_v2df, IX86_BUILTIN_MOVMSKPD);
16836 def_builtin (MASK_SSE2, "__builtin_ia32_pmovmskb128", int_ftype_v16qi, IX86_BUILTIN_PMOVMSKB128);
16837 def_builtin (MASK_SSE2, "__builtin_ia32_movnti", void_ftype_pint_int, IX86_BUILTIN_MOVNTI);
16838 def_builtin (MASK_SSE2, "__builtin_ia32_movntpd", void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTPD);
16839 def_builtin (MASK_SSE2, "__builtin_ia32_movntdq", void_ftype_pv2di_v2di, IX86_BUILTIN_MOVNTDQ);
16840
16841 def_builtin (MASK_SSE2, "__builtin_ia32_pshufd", v4si_ftype_v4si_int, IX86_BUILTIN_PSHUFD);
16842 def_builtin (MASK_SSE2, "__builtin_ia32_pshuflw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFLW);
16843 def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW);
16844 def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128);
16845
16846 def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD);
16847 def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD);
16848
16849 def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD);
16850
16851 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD);
16852 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS);
16853
16854 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ);
16855 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI);
16856 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS);
16857 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ);
16858 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI);
16859
16860 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD);
16861
16862 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI);
16863 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI);
16864 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64);
16865 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64);
16866
16867 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ);
16868 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD);
16869 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ);
16870
16871 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD);
16872 def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD);
16873 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS);
16874 def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD);
16875
16876 def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH);
16877 def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE);
16878 def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE);
16879
16880 def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
16881 def_builtin (MASK_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
16882
16883 def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
16884 def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128);
16885
16886 def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSLLW128);
16887 def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSLLD128);
16888 def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128);
16889
16890 def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRLW128);
16891 def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRLD128);
16892 def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128);
16893
16894 def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRAW128);
16895 def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRAD128);
16896
16897 def_builtin (MASK_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128);
16898 def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128);
16899 def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128);
16900 def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128);
16901
16902 def_builtin (MASK_SSE2, "__builtin_ia32_psrldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLDQI128);
16903 def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128);
16904 def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128);
16905 def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128);
16906
16907 def_builtin (MASK_SSE2, "__builtin_ia32_psrawi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRAWI128);
16908 def_builtin (MASK_SSE2, "__builtin_ia32_psradi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRADI128);
16909
16910 def_builtin (MASK_SSE2, "__builtin_ia32_pmaddwd128", v4si_ftype_v8hi_v8hi, IX86_BUILTIN_PMADDWD128);
16911
16912 /* Prescott New Instructions. */
16913 def_builtin (MASK_SSE3, "__builtin_ia32_monitor",
16914 void_ftype_pcvoid_unsigned_unsigned,
16915 IX86_BUILTIN_MONITOR);
16916 def_builtin (MASK_SSE3, "__builtin_ia32_mwait",
16917 void_ftype_unsigned_unsigned,
16918 IX86_BUILTIN_MWAIT);
16919 def_builtin (MASK_SSE3, "__builtin_ia32_movshdup",
16920 v4sf_ftype_v4sf,
16921 IX86_BUILTIN_MOVSHDUP);
16922 def_builtin (MASK_SSE3, "__builtin_ia32_movsldup",
16923 v4sf_ftype_v4sf,
16924 IX86_BUILTIN_MOVSLDUP);
16925 def_builtin (MASK_SSE3, "__builtin_ia32_lddqu",
16926 v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU);
16927
16928 /* SSSE3. */
16929 def_builtin (MASK_SSSE3, "__builtin_ia32_palignr128",
16930 v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PALIGNR128);
16931 def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
16932 IX86_BUILTIN_PALIGNR);
16933
16934 /* AMDFAM10 SSE4A new built-ins.  */
16935 def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
16936 void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
16937 def_builtin (MASK_SSE4A, "__builtin_ia32_movntss",
16938 void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS);
16939 def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi",
16940 v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI);
16941 def_builtin (MASK_SSE4A, "__builtin_ia32_extrq",
16942 v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ);
16943 def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi",
16944 v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
16945 def_builtin (MASK_SSE4A, "__builtin_ia32_insertq",
16946 v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
16947
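/* The vec_init, vec_ext and vec_set builtins below are mainly consumed by
   the intrinsics headers, which use them to build, extract from and insert
   into small vectors with properly typed arguments (for example,
   _mm_extract_pi16 in xmmintrin.h is implemented with
   __builtin_ia32_vec_ext_v4hi).  */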
16948 /* Access to the vec_init patterns. */
16949 ftype = build_function_type_list (V2SI_type_node, integer_type_node,
16950 integer_type_node, NULL_TREE);
16951 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v2si",
16952 ftype, IX86_BUILTIN_VEC_INIT_V2SI);
16953
16954 ftype = build_function_type_list (V4HI_type_node, short_integer_type_node,
16955 short_integer_type_node,
16956 short_integer_type_node,
16957 short_integer_type_node, NULL_TREE);
16958 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v4hi",
16959 ftype, IX86_BUILTIN_VEC_INIT_V4HI);
16960
16961 ftype = build_function_type_list (V8QI_type_node, char_type_node,
16962 char_type_node, char_type_node,
16963 char_type_node, char_type_node,
16964 char_type_node, char_type_node,
16965 char_type_node, NULL_TREE);
16966 def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v8qi",
16967 ftype, IX86_BUILTIN_VEC_INIT_V8QI);
16968
16969 /* Access to the vec_extract patterns. */
16970 ftype = build_function_type_list (double_type_node, V2DF_type_node,
16971 integer_type_node, NULL_TREE);
16972 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2df",
16973 ftype, IX86_BUILTIN_VEC_EXT_V2DF);
16974
16975 ftype = build_function_type_list (long_long_integer_type_node,
16976 V2DI_type_node, integer_type_node,
16977 NULL_TREE);
16978 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2di",
16979 ftype, IX86_BUILTIN_VEC_EXT_V2DI);
16980
16981 ftype = build_function_type_list (float_type_node, V4SF_type_node,
16982 integer_type_node, NULL_TREE);
16983 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4sf",
16984 ftype, IX86_BUILTIN_VEC_EXT_V4SF);
16985
16986 ftype = build_function_type_list (intSI_type_node, V4SI_type_node,
16987 integer_type_node, NULL_TREE);
16988 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4si",
16989 ftype, IX86_BUILTIN_VEC_EXT_V4SI);
16990
16991 ftype = build_function_type_list (intHI_type_node, V8HI_type_node,
16992 integer_type_node, NULL_TREE);
16993 def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v8hi",
16994 ftype, IX86_BUILTIN_VEC_EXT_V8HI);
16995
16996 ftype = build_function_type_list (intHI_type_node, V4HI_type_node,
16997 integer_type_node, NULL_TREE);
16998 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_ext_v4hi",
16999 ftype, IX86_BUILTIN_VEC_EXT_V4HI);
17000
17001 ftype = build_function_type_list (intSI_type_node, V2SI_type_node,
17002 integer_type_node, NULL_TREE);
17003 def_builtin (MASK_MMX, "__builtin_ia32_vec_ext_v2si",
17004 ftype, IX86_BUILTIN_VEC_EXT_V2SI);
17005
17006 /* Access to the vec_set patterns. */
17007 ftype = build_function_type_list (V8HI_type_node, V8HI_type_node,
17008 intHI_type_node,
17009 integer_type_node, NULL_TREE);
17010 def_builtin (MASK_SSE, "__builtin_ia32_vec_set_v8hi",
17011 ftype, IX86_BUILTIN_VEC_SET_V8HI);
17012
17013 ftype = build_function_type_list (V4HI_type_node, V4HI_type_node,
17014 intHI_type_node,
17015 integer_type_node, NULL_TREE);
17016 def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi",
17017 ftype, IX86_BUILTIN_VEC_SET_V4HI);
17018 }
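/* A builtin registered above can be called directly from C source compiled
   with the matching target flag.  A minimal user-level sketch (assuming
   -msse2; the vector typedef mirrors the one used by the intrinsics
   headers):

     typedef double __v2df __attribute__ ((__vector_size__ (16)));

     __v2df
     add2 (__v2df a, __v2df b)
     {
       return __builtin_ia32_addpd (a, b);
     }

   ix86_expand_builtin hands such a call to ix86_expand_binop_builtin
   below, using the icode recorded in bdesc_2arg (here CODE_FOR_addv2df3).  */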
17019
17020 /* Errors in the source file can cause expand_expr to return const0_rtx
17021 where we expect a vector. To avoid crashing, use one of the vector
17022 clear instructions. */
17023 static rtx
17024 safe_vector_operand (rtx x, enum machine_mode mode)
17025 {
17026 if (x == const0_rtx)
17027 x = CONST0_RTX (mode);
17028 return x;
17029 }
17030
17031 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
17032
17033 static rtx
17034 ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target)
17035 {
17036 rtx pat, xops[3];
17037 tree arg0 = TREE_VALUE (arglist);
17038 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17039 rtx op0 = expand_normal (arg0);
17040 rtx op1 = expand_normal (arg1);
17041 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17042 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17043 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
17044
17045 if (VECTOR_MODE_P (mode0))
17046 op0 = safe_vector_operand (op0, mode0);
17047 if (VECTOR_MODE_P (mode1))
17048 op1 = safe_vector_operand (op1, mode1);
17049
17050 if (optimize || !target
17051 || GET_MODE (target) != tmode
17052 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17053 target = gen_reg_rtx (tmode);
17054
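/* If the insn wants a TImode operand but the argument arrived as a plain
   SImode value (e.g. a shift count passed as an int), load it into a
   V4SImode register and use the TImode lowpart of that register.  */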
17055 if (GET_MODE (op1) == SImode && mode1 == TImode)
17056 {
17057 rtx x = gen_reg_rtx (V4SImode);
17058 emit_insn (gen_sse2_loadd (x, op1));
17059 op1 = gen_lowpart (TImode, x);
17060 }
17061
17062 /* The insn must want input operands in the same modes as the
17063 result. */
17064 gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
17065 && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
17066
17067 if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
17068 op0 = copy_to_mode_reg (mode0, op0);
17069 if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
17070 op1 = copy_to_mode_reg (mode1, op1);
17071
17072 /* ??? Using ix86_fixup_binary_operands is problematic when
17073 we've got mismatched modes. Fake it. */
17074
17075 xops[0] = target;
17076 xops[1] = op0;
17077 xops[2] = op1;
17078
17079 if (tmode == mode0 && tmode == mode1)
17080 {
17081 target = ix86_fixup_binary_operands (UNKNOWN, tmode, xops);
17082 op0 = xops[1];
17083 op1 = xops[2];
17084 }
17085 else if (optimize || !ix86_binary_operator_ok (UNKNOWN, tmode, xops))
17086 {
17087 op0 = force_reg (mode0, op0);
17088 op1 = force_reg (mode1, op1);
17089 target = gen_reg_rtx (tmode);
17090 }
17091
17092 pat = GEN_FCN (icode) (target, op0, op1);
17093 if (! pat)
17094 return 0;
17095 emit_insn (pat);
17096 return target;
17097 }
17098
17099 /* Subroutine of ix86_expand_builtin to take care of stores. */
17100
17101 static rtx
17102 ix86_expand_store_builtin (enum insn_code icode, tree arglist)
17103 {
17104 rtx pat;
17105 tree arg0 = TREE_VALUE (arglist);
17106 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17107 rtx op0 = expand_normal (arg0);
17108 rtx op1 = expand_normal (arg1);
17109 enum machine_mode mode0 = insn_data[icode].operand[0].mode;
17110 enum machine_mode mode1 = insn_data[icode].operand[1].mode;
17111
17112 if (VECTOR_MODE_P (mode1))
17113 op1 = safe_vector_operand (op1, mode1);
17114
17115 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17116 op1 = copy_to_mode_reg (mode1, op1);
17117
17118 pat = GEN_FCN (icode) (op0, op1);
17119 if (pat)
17120 emit_insn (pat);
17121 return 0;
17122 }
17123
17124 /* Subroutine of ix86_expand_builtin to take care of unop insns. */
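/* If DO_LOAD is nonzero, the single argument is an address and the value
   is first loaded from memory; the load-style builtins are expanded this
   way.  */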
17125
17126 static rtx
17127 ix86_expand_unop_builtin (enum insn_code icode, tree arglist,
17128 rtx target, int do_load)
17129 {
17130 rtx pat;
17131 tree arg0 = TREE_VALUE (arglist);
17132 rtx op0 = expand_normal (arg0);
17133 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17134 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17135
17136 if (optimize || !target
17137 || GET_MODE (target) != tmode
17138 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17139 target = gen_reg_rtx (tmode);
17140 if (do_load)
17141 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17142 else
17143 {
17144 if (VECTOR_MODE_P (mode0))
17145 op0 = safe_vector_operand (op0, mode0);
17146
17147 if ((optimize && !register_operand (op0, mode0))
17148 || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17149 op0 = copy_to_mode_reg (mode0, op0);
17150 }
17151
17152 pat = GEN_FCN (icode) (target, op0);
17153 if (! pat)
17154 return 0;
17155 emit_insn (pat);
17156 return target;
17157 }
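
/* Illustrative sketch, not part of GCC itself: DO_LOAD distinguishes
   builtins whose single argument is a pointer from those that take a
   vector directly.  For example

       #include <xmmintrin.h>

       __m128 load4 (float const *p)
       {
         return _mm_loadu_ps (p);     // __builtin_ia32_loadups
       }

   goes through IX86_BUILTIN_LOADUPS below with do_load == 1, so op0 is
   wrapped in a MEM before CODE_FOR_sse_movups is emitted, whereas a
   plain unop such as IX86_BUILTIN_PF2ID passes do_load == 0.  */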
17158
17159 /* Subroutine of ix86_expand_builtin to take care of three special unop insns:
17160 sqrtss, rsqrtss, rcpss. */
17161
17162 static rtx
17163 ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target)
17164 {
17165 rtx pat;
17166 tree arg0 = TREE_VALUE (arglist);
17167 rtx op1, op0 = expand_normal (arg0);
17168 enum machine_mode tmode = insn_data[icode].operand[0].mode;
17169 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17170
17171 if (optimize || !target
17172 || GET_MODE (target) != tmode
17173 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17174 target = gen_reg_rtx (tmode);
17175
17176 if (VECTOR_MODE_P (mode0))
17177 op0 = safe_vector_operand (op0, mode0);
17178
17179 if ((optimize && !register_operand (op0, mode0))
17180 || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17181 op0 = copy_to_mode_reg (mode0, op0);
17182
17183 op1 = op0;
17184 if (! (*insn_data[icode].operand[2].predicate) (op1, mode0))
17185 op1 = copy_to_mode_reg (mode0, op1);
17186
17187 pat = GEN_FCN (icode) (target, op0, op1);
17188 if (! pat)
17189 return 0;
17190 emit_insn (pat);
17191 return target;
17192 }
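
/* Illustrative sketch, not part of GCC itself: the scalar insns handled
   here merge their result into the upper elements of the source, which
   is why OP0 is also passed as the second input.  For example

       #include <xmmintrin.h>

       __m128 root (__m128 a)
       {
         return _mm_sqrt_ss (a);      // __builtin_ia32_sqrtss
       }

   computes the square root of element 0 and copies elements 1..3 from A
   unchanged, matching the sse_vmsqrtv4sf2 pattern used below.  */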
17193
17194 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
17195
17196 static rtx
17197 ix86_expand_sse_compare (const struct builtin_description *d, tree arglist,
17198 rtx target)
17199 {
17200 rtx pat;
17201 tree arg0 = TREE_VALUE (arglist);
17202 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17203 rtx op0 = expand_normal (arg0);
17204 rtx op1 = expand_normal (arg1);
17205 rtx op2;
17206 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
17207 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
17208 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
17209 enum rtx_code comparison = d->comparison;
17210
17211 if (VECTOR_MODE_P (mode0))
17212 op0 = safe_vector_operand (op0, mode0);
17213 if (VECTOR_MODE_P (mode1))
17214 op1 = safe_vector_operand (op1, mode1);
17215
17216 /* Swap operands if we have a comparison that isn't available in
17217 hardware. */
17218 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17219 {
17220 rtx tmp = gen_reg_rtx (mode1);
17221 emit_move_insn (tmp, op1);
17222 op1 = op0;
17223 op0 = tmp;
17224 }
17225
17226 if (optimize || !target
17227 || GET_MODE (target) != tmode
17228 || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode))
17229 target = gen_reg_rtx (tmode);
17230
17231 if ((optimize && !register_operand (op0, mode0))
17232 || ! (*insn_data[d->icode].operand[1].predicate) (op0, mode0))
17233 op0 = copy_to_mode_reg (mode0, op0);
17234 if ((optimize && !register_operand (op1, mode1))
17235 || ! (*insn_data[d->icode].operand[2].predicate) (op1, mode1))
17236 op1 = copy_to_mode_reg (mode1, op1);
17237
17238 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17239 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
17240 if (! pat)
17241 return 0;
17242 emit_insn (pat);
17243 return target;
17244 }
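
/* Illustrative sketch, not part of GCC itself: cmpps/cmppd only encode
   EQ, LT, LE, UNORD and their negations, so the "greater" builtins are
   marked with BUILTIN_DESC_SWAP_OPERANDS in the builtin tables (an
   assumption about the table contents) and handled by the swap above.
   For example

       #include <xmmintrin.h>

       __m128 gt (__m128 a, __m128 b)
       {
         return _mm_cmpgt_ps (a, b);  // __builtin_ia32_cmpgtps
       }

   is emitted as a cmpltps with A and B exchanged.  */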
17245
17246 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
17247
17248 static rtx
17249 ix86_expand_sse_comi (const struct builtin_description *d, tree arglist,
17250 rtx target)
17251 {
17252 rtx pat;
17253 tree arg0 = TREE_VALUE (arglist);
17254 tree arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17255 rtx op0 = expand_normal (arg0);
17256 rtx op1 = expand_normal (arg1);
17257 rtx op2;
17258 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
17259 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
17260 enum rtx_code comparison = d->comparison;
17261
17262 if (VECTOR_MODE_P (mode0))
17263 op0 = safe_vector_operand (op0, mode0);
17264 if (VECTOR_MODE_P (mode1))
17265 op1 = safe_vector_operand (op1, mode1);
17266
17267 /* Swap operands if we have a comparison that isn't available in
17268 hardware. */
17269 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17270 {
17271 rtx tmp = op1;
17272 op1 = op0;
17273 op0 = tmp;
17274 }
17275
17276 target = gen_reg_rtx (SImode);
17277 emit_move_insn (target, const0_rtx);
17278 target = gen_rtx_SUBREG (QImode, target, 0);
17279
17280 if ((optimize && !register_operand (op0, mode0))
17281 || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
17282 op0 = copy_to_mode_reg (mode0, op0);
17283 if ((optimize && !register_operand (op1, mode1))
17284 || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
17285 op1 = copy_to_mode_reg (mode1, op1);
17286
17287 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17288 pat = GEN_FCN (d->icode) (op0, op1);
17289 if (! pat)
17290 return 0;
17291 emit_insn (pat);
17292 emit_insn (gen_rtx_SET (VOIDmode,
17293 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
17294 gen_rtx_fmt_ee (comparison, QImode,
17295 SET_DEST (pat),
17296 const0_rtx)));
17297
17298 return SUBREG_REG (target);
17299 }
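
/* Illustrative sketch, not part of GCC itself: the comi builtins return
   an int derived from the flags set by comiss/comisd.  For example

       #include <xmmintrin.h>

       int less (__m128 a, __m128 b)
       {
         return _mm_comilt_ss (a, b); // __builtin_ia32_comilt
       }

   expands above to a comiss followed by a conditional set into the
   QImode low part of the SImode result, which is then returned.  */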
17300
17301 /* Return the integer constant in ARG. Constrain it to be in the range
17302 of the subparts of VEC_TYPE; issue an error if not. */
17303
17304 static int
17305 get_element_number (tree vec_type, tree arg)
17306 {
17307 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
17308
17309 if (!host_integerp (arg, 1)
17310 || (elt = tree_low_cst (arg, 1), elt > max))
17311 {
17312 error ("selector must be an integer constant in the range 0..%wi", max);
17313 return 0;
17314 }
17315
17316 return elt;
17317 }
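
/* For instance, a call such as __builtin_ia32_vec_ext_v4sf (x, 7) has
   TYPE_VECTOR_SUBPARTS == 4, so MAX is 3 and the error above is issued;
   element 7 does not exist in a V4SF vector.  (Illustrative example
   only, not part of GCC itself.)  */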
17318
17319 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17320 ix86_expand_vector_init. We DO have language-level syntax for this, in
17321 the form of (type){ init-list }. Except that since we can't place emms
17322 instructions from inside the compiler, we can't allow the use of MMX
17323 registers unless the user explicitly asks for it. So we do *not* define
17324 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
17325 we have builtins invoked by mmintrin.h that give us license to emit
17326 these sorts of instructions. */
17327
17328 static rtx
17329 ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target)
17330 {
17331 enum machine_mode tmode = TYPE_MODE (type);
17332 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
17333 int i, n_elt = GET_MODE_NUNITS (tmode);
17334 rtvec v = rtvec_alloc (n_elt);
17335
17336 gcc_assert (VECTOR_MODE_P (tmode));
17337
17338 for (i = 0; i < n_elt; ++i, arglist = TREE_CHAIN (arglist))
17339 {
17340 rtx x = expand_normal (TREE_VALUE (arglist));
17341 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
17342 }
17343
17344 gcc_assert (arglist == NULL);
17345
17346 if (!target || !register_operand (target, tmode))
17347 target = gen_reg_rtx (tmode);
17348
17349 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
17350 return target;
17351 }
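
/* Illustrative sketch, not part of GCC itself: the MMX "set" intrinsics
   funnel through these builtins instead of the (type){...} syntax, e.g.

       #include <mmintrin.h>

       __m64 four_halves (short a, short b, short c, short d)
       {
         return _mm_set_pi16 (a, b, c, d);
         // roughly __builtin_ia32_vec_init_v4hi (d, c, b, a)
       }

   arrives here via IX86_BUILTIN_VEC_INIT_V4HI with one arglist entry
   per element (the exact argument order is as assumed from
   mmintrin.h).  */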
17352
17353 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17354 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
17355 had a language-level syntax for referencing vector elements. */
17356
17357 static rtx
17358 ix86_expand_vec_ext_builtin (tree arglist, rtx target)
17359 {
17360 enum machine_mode tmode, mode0;
17361 tree arg0, arg1;
17362 int elt;
17363 rtx op0;
17364
17365 arg0 = TREE_VALUE (arglist);
17366 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17367
17368 op0 = expand_normal (arg0);
17369 elt = get_element_number (TREE_TYPE (arg0), arg1);
17370
17371 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17372 mode0 = TYPE_MODE (TREE_TYPE (arg0));
17373 gcc_assert (VECTOR_MODE_P (mode0));
17374
17375 op0 = force_reg (mode0, op0);
17376
17377 if (optimize || !target || !register_operand (target, tmode))
17378 target = gen_reg_rtx (tmode);
17379
17380 ix86_expand_vector_extract (true, target, op0, elt);
17381
17382 return target;
17383 }
17384
17385 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
17386 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
17387 a language-level syntax for referencing vector elements. */
17388
17389 static rtx
17390 ix86_expand_vec_set_builtin (tree arglist)
17391 {
17392 enum machine_mode tmode, mode1;
17393 tree arg0, arg1, arg2;
17394 int elt;
17395 rtx op0, op1;
17396
17397 arg0 = TREE_VALUE (arglist);
17398 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17399 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17400
17401 tmode = TYPE_MODE (TREE_TYPE (arg0));
17402 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17403 gcc_assert (VECTOR_MODE_P (tmode));
17404
17405 op0 = expand_expr (arg0, NULL_RTX, tmode, 0);
17406 op1 = expand_expr (arg1, NULL_RTX, mode1, 0);
17407 elt = get_element_number (TREE_TYPE (arg0), arg2);
17408
17409 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
17410 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
17411
17412 op0 = force_reg (tmode, op0);
17413 op1 = force_reg (mode1, op1);
17414
17415 ix86_expand_vector_set (true, op0, op1, elt);
17416
17417 return op0;
17418 }
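
/* Illustrative sketch, not part of GCC itself:

       #include <xmmintrin.h>

       __m64 put (__m64 v, int d)
       {
         return _mm_insert_pi16 (v, d, 2);
         // roughly __builtin_ia32_vec_set_v4hi (v, d, 2)
       }

   reaches this routine via IX86_BUILTIN_VEC_SET_V4HI below; the last
   argument must be a constant in 0..3, otherwise get_element_number
   reports an error.  */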
17419
17420 /* Expand an expression EXP that calls a built-in function,
17421 with result going to TARGET if that's convenient
17422 (and in mode MODE if that's convenient).
17423 SUBTARGET may be used as the target for computing one of EXP's operands.
17424 IGNORE is nonzero if the value is to be ignored. */
17425
17426 static rtx
17427 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
17428 enum machine_mode mode ATTRIBUTE_UNUSED,
17429 int ignore ATTRIBUTE_UNUSED)
17430 {
17431 const struct builtin_description *d;
17432 size_t i;
17433 enum insn_code icode;
17434 tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0);
17435 tree arglist = TREE_OPERAND (exp, 1);
17436 tree arg0, arg1, arg2, arg3;
17437 rtx op0, op1, op2, op3, pat;
17438 enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4;
17439 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
17440
17441 switch (fcode)
17442 {
17443 case IX86_BUILTIN_EMMS:
17444 emit_insn (gen_mmx_emms ());
17445 return 0;
17446
17447 case IX86_BUILTIN_SFENCE:
17448 emit_insn (gen_sse_sfence ());
17449 return 0;
17450
17451 case IX86_BUILTIN_MASKMOVQ:
17452 case IX86_BUILTIN_MASKMOVDQU:
17453 icode = (fcode == IX86_BUILTIN_MASKMOVQ
17454 ? CODE_FOR_mmx_maskmovq
17455 : CODE_FOR_sse2_maskmovdqu);
17456 /* Note the arg order is different from the operand order. */
17457 arg1 = TREE_VALUE (arglist);
17458 arg2 = TREE_VALUE (TREE_CHAIN (arglist));
17459 arg0 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17460 op0 = expand_normal (arg0);
17461 op1 = expand_normal (arg1);
17462 op2 = expand_normal (arg2);
17463 mode0 = insn_data[icode].operand[0].mode;
17464 mode1 = insn_data[icode].operand[1].mode;
17465 mode2 = insn_data[icode].operand[2].mode;
17466
17467 op0 = force_reg (Pmode, op0);
17468 op0 = gen_rtx_MEM (mode1, op0);
17469
17470 if (! (*insn_data[icode].operand[0].predicate) (op0, mode0))
17471 op0 = copy_to_mode_reg (mode0, op0);
17472 if (! (*insn_data[icode].operand[1].predicate) (op1, mode1))
17473 op1 = copy_to_mode_reg (mode1, op1);
17474 if (! (*insn_data[icode].operand[2].predicate) (op2, mode2))
17475 op2 = copy_to_mode_reg (mode2, op2);
17476 pat = GEN_FCN (icode) (op0, op1, op2);
17477 if (! pat)
17478 return 0;
17479 emit_insn (pat);
17480 return 0;
17481
17482 case IX86_BUILTIN_SQRTSS:
17483 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, arglist, target);
17484 case IX86_BUILTIN_RSQRTSS:
17485 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, arglist, target);
17486 case IX86_BUILTIN_RCPSS:
17487 return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, arglist, target);
17488
17489 case IX86_BUILTIN_LOADUPS:
17490 return ix86_expand_unop_builtin (CODE_FOR_sse_movups, arglist, target, 1);
17491
17492 case IX86_BUILTIN_STOREUPS:
17493 return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist);
17494
17495 case IX86_BUILTIN_LOADHPS:
17496 case IX86_BUILTIN_LOADLPS:
17497 case IX86_BUILTIN_LOADHPD:
17498 case IX86_BUILTIN_LOADLPD:
17499 icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_loadhps
17500 : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps
17501 : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_loadhpd
17502 : CODE_FOR_sse2_loadlpd);
17503 arg0 = TREE_VALUE (arglist);
17504 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17505 op0 = expand_normal (arg0);
17506 op1 = expand_normal (arg1);
17507 tmode = insn_data[icode].operand[0].mode;
17508 mode0 = insn_data[icode].operand[1].mode;
17509 mode1 = insn_data[icode].operand[2].mode;
17510
17511 op0 = force_reg (mode0, op0);
17512 op1 = gen_rtx_MEM (mode1, copy_to_mode_reg (Pmode, op1));
17513 if (optimize || target == 0
17514 || GET_MODE (target) != tmode
17515 || !register_operand (target, tmode))
17516 target = gen_reg_rtx (tmode);
17517 pat = GEN_FCN (icode) (target, op0, op1);
17518 if (! pat)
17519 return 0;
17520 emit_insn (pat);
17521 return target;
17522
17523 case IX86_BUILTIN_STOREHPS:
17524 case IX86_BUILTIN_STORELPS:
17525 icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps
17526 : CODE_FOR_sse_storelps);
17527 arg0 = TREE_VALUE (arglist);
17528 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17529 op0 = expand_normal (arg0);
17530 op1 = expand_normal (arg1);
17531 mode0 = insn_data[icode].operand[0].mode;
17532 mode1 = insn_data[icode].operand[1].mode;
17533
17534 op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17535 op1 = force_reg (mode1, op1);
17536
17537 pat = GEN_FCN (icode) (op0, op1);
17538 if (! pat)
17539 return 0;
17540 emit_insn (pat);
17541 return const0_rtx;
17542
17543 case IX86_BUILTIN_MOVNTPS:
17544 return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, arglist);
17545 case IX86_BUILTIN_MOVNTQ:
17546 return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, arglist);
17547
17548 case IX86_BUILTIN_LDMXCSR:
17549 op0 = expand_normal (TREE_VALUE (arglist));
17550 target = assign_386_stack_local (SImode, SLOT_TEMP);
17551 emit_move_insn (target, op0);
17552 emit_insn (gen_sse_ldmxcsr (target));
17553 return 0;
17554
17555 case IX86_BUILTIN_STMXCSR:
17556 target = assign_386_stack_local (SImode, SLOT_TEMP);
17557 emit_insn (gen_sse_stmxcsr (target));
17558 return copy_to_mode_reg (SImode, target);
17559
17560 case IX86_BUILTIN_SHUFPS:
17561 case IX86_BUILTIN_SHUFPD:
17562 icode = (fcode == IX86_BUILTIN_SHUFPS
17563 ? CODE_FOR_sse_shufps
17564 : CODE_FOR_sse2_shufpd);
17565 arg0 = TREE_VALUE (arglist);
17566 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17567 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17568 op0 = expand_normal (arg0);
17569 op1 = expand_normal (arg1);
17570 op2 = expand_normal (arg2);
17571 tmode = insn_data[icode].operand[0].mode;
17572 mode0 = insn_data[icode].operand[1].mode;
17573 mode1 = insn_data[icode].operand[2].mode;
17574 mode2 = insn_data[icode].operand[3].mode;
17575
17576 if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17577 op0 = copy_to_mode_reg (mode0, op0);
17578 if ((optimize && !register_operand (op1, mode1))
17579 || !(*insn_data[icode].operand[2].predicate) (op1, mode1))
17580 op1 = copy_to_mode_reg (mode1, op1);
17581 if (! (*insn_data[icode].operand[3].predicate) (op2, mode2))
17582 {
17583 /* @@@ better error message */
17584 error ("mask must be an immediate");
17585 return gen_reg_rtx (tmode);
17586 }
17587 if (optimize || target == 0
17588 || GET_MODE (target) != tmode
17589 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17590 target = gen_reg_rtx (tmode);
17591 pat = GEN_FCN (icode) (target, op0, op1, op2);
17592 if (! pat)
17593 return 0;
17594 emit_insn (pat);
17595 return target;
17596
17597 case IX86_BUILTIN_PSHUFW:
17598 case IX86_BUILTIN_PSHUFD:
17599 case IX86_BUILTIN_PSHUFHW:
17600 case IX86_BUILTIN_PSHUFLW:
17601 icode = ( fcode == IX86_BUILTIN_PSHUFHW ? CODE_FOR_sse2_pshufhw
17602 : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw
17603 : fcode == IX86_BUILTIN_PSHUFD ? CODE_FOR_sse2_pshufd
17604 : CODE_FOR_mmx_pshufw);
17605 arg0 = TREE_VALUE (arglist);
17606 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17607 op0 = expand_normal (arg0);
17608 op1 = expand_normal (arg1);
17609 tmode = insn_data[icode].operand[0].mode;
17610 mode1 = insn_data[icode].operand[1].mode;
17611 mode2 = insn_data[icode].operand[2].mode;
17612
17613 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17614 op0 = copy_to_mode_reg (mode1, op0);
17615 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17616 {
17617 /* @@@ better error message */
17618 error ("mask must be an immediate");
17619 return const0_rtx;
17620 }
17621 if (target == 0
17622 || GET_MODE (target) != tmode
17623 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17624 target = gen_reg_rtx (tmode);
17625 pat = GEN_FCN (icode) (target, op0, op1);
17626 if (! pat)
17627 return 0;
17628 emit_insn (pat);
17629 return target;
17630
17631 case IX86_BUILTIN_PSLLDQI128:
17632 case IX86_BUILTIN_PSRLDQI128:
17633 icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3
17634 : CODE_FOR_sse2_lshrti3);
17635 arg0 = TREE_VALUE (arglist);
17636 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17637 op0 = expand_normal (arg0);
17638 op1 = expand_normal (arg1);
17639 tmode = insn_data[icode].operand[0].mode;
17640 mode1 = insn_data[icode].operand[1].mode;
17641 mode2 = insn_data[icode].operand[2].mode;
17642
17643 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17644 {
17645 op0 = copy_to_reg (op0);
17646 op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
17647 }
17648 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17649 {
17650 error ("shift must be an immediate");
17651 return const0_rtx;
17652 }
17653 target = gen_reg_rtx (V2DImode);
17654 pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), op0, op1);
17655 if (! pat)
17656 return 0;
17657 emit_insn (pat);
17658 return target;
17659
17660 case IX86_BUILTIN_FEMMS:
17661 emit_insn (gen_mmx_femms ());
17662 return NULL_RTX;
17663
17664 case IX86_BUILTIN_PAVGUSB:
17665 return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, arglist, target);
17666
17667 case IX86_BUILTIN_PF2ID:
17668 return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, arglist, target, 0);
17669
17670 case IX86_BUILTIN_PFACC:
17671 return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, arglist, target);
17672
17673 case IX86_BUILTIN_PFADD:
17674 return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, arglist, target);
17675
17676 case IX86_BUILTIN_PFCMPEQ:
17677 return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, arglist, target);
17678
17679 case IX86_BUILTIN_PFCMPGE:
17680 return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, arglist, target);
17681
17682 case IX86_BUILTIN_PFCMPGT:
17683 return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, arglist, target);
17684
17685 case IX86_BUILTIN_PFMAX:
17686 return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, arglist, target);
17687
17688 case IX86_BUILTIN_PFMIN:
17689 return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, arglist, target);
17690
17691 case IX86_BUILTIN_PFMUL:
17692 return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, arglist, target);
17693
17694 case IX86_BUILTIN_PFRCP:
17695 return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, arglist, target, 0);
17696
17697 case IX86_BUILTIN_PFRCPIT1:
17698 return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, arglist, target);
17699
17700 case IX86_BUILTIN_PFRCPIT2:
17701 return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, arglist, target);
17702
17703 case IX86_BUILTIN_PFRSQIT1:
17704 return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, arglist, target);
17705
17706 case IX86_BUILTIN_PFRSQRT:
17707 return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, arglist, target, 0);
17708
17709 case IX86_BUILTIN_PFSUB:
17710 return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, arglist, target);
17711
17712 case IX86_BUILTIN_PFSUBR:
17713 return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, arglist, target);
17714
17715 case IX86_BUILTIN_PI2FD:
17716 return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, arglist, target, 0);
17717
17718 case IX86_BUILTIN_PMULHRW:
17719 return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, arglist, target);
17720
17721 case IX86_BUILTIN_PF2IW:
17722 return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, arglist, target, 0);
17723
17724 case IX86_BUILTIN_PFNACC:
17725 return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, arglist, target);
17726
17727 case IX86_BUILTIN_PFPNACC:
17728 return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, arglist, target);
17729
17730 case IX86_BUILTIN_PI2FW:
17731 return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, arglist, target, 0);
17732
17733 case IX86_BUILTIN_PSWAPDSI:
17734 return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, arglist, target, 0);
17735
17736 case IX86_BUILTIN_PSWAPDSF:
17737 return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, arglist, target, 0);
17738
17739 case IX86_BUILTIN_SQRTSD:
17740 return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, arglist, target);
17741 case IX86_BUILTIN_LOADUPD:
17742 return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, arglist, target, 1);
17743 case IX86_BUILTIN_STOREUPD:
17744 return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, arglist);
17745
17746 case IX86_BUILTIN_MFENCE:
17747 emit_insn (gen_sse2_mfence ());
17748 return 0;
17749 case IX86_BUILTIN_LFENCE:
17750 emit_insn (gen_sse2_lfence ());
17751 return 0;
17752
17753 case IX86_BUILTIN_CLFLUSH:
17754 arg0 = TREE_VALUE (arglist);
17755 op0 = expand_normal (arg0);
17756 icode = CODE_FOR_sse2_clflush;
17757 if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
17758 op0 = copy_to_mode_reg (Pmode, op0);
17759
17760 emit_insn (gen_sse2_clflush (op0));
17761 return 0;
17762
17763 case IX86_BUILTIN_MOVNTPD:
17764 return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist);
17765 case IX86_BUILTIN_MOVNTDQ:
17766 return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist);
17767 case IX86_BUILTIN_MOVNTI:
17768 return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist);
17769
17770 case IX86_BUILTIN_LOADDQU:
17771 return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1);
17772 case IX86_BUILTIN_STOREDQU:
17773 return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist);
17774
17775 case IX86_BUILTIN_MONITOR:
17776 arg0 = TREE_VALUE (arglist);
17777 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17778 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17779 op0 = expand_normal (arg0);
17780 op1 = expand_normal (arg1);
17781 op2 = expand_normal (arg2);
17782 if (!REG_P (op0))
17783 op0 = copy_to_mode_reg (Pmode, op0);
17784 if (!REG_P (op1))
17785 op1 = copy_to_mode_reg (SImode, op1);
17786 if (!REG_P (op2))
17787 op2 = copy_to_mode_reg (SImode, op2);
17788 if (!TARGET_64BIT)
17789 emit_insn (gen_sse3_monitor (op0, op1, op2));
17790 else
17791 emit_insn (gen_sse3_monitor64 (op0, op1, op2));
17792 return 0;
17793
17794 case IX86_BUILTIN_MWAIT:
17795 arg0 = TREE_VALUE (arglist);
17796 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17797 op0 = expand_normal (arg0);
17798 op1 = expand_normal (arg1);
17799 if (!REG_P (op0))
17800 op0 = copy_to_mode_reg (SImode, op0);
17801 if (!REG_P (op1))
17802 op1 = copy_to_mode_reg (SImode, op1);
17803 emit_insn (gen_sse3_mwait (op0, op1));
17804 return 0;
17805
17806 case IX86_BUILTIN_LDDQU:
17807 return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, arglist,
17808 target, 1);
17809
17810 case IX86_BUILTIN_PALIGNR:
17811 case IX86_BUILTIN_PALIGNR128:
17812 if (fcode == IX86_BUILTIN_PALIGNR)
17813 {
17814 icode = CODE_FOR_ssse3_palignrdi;
17815 mode = DImode;
17816 }
17817 else
17818 {
17819 icode = CODE_FOR_ssse3_palignrti;
17820 mode = V2DImode;
17821 }
17822 arg0 = TREE_VALUE (arglist);
17823 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17824 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17825 op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
17826 op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
17827 op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0);
17828 tmode = insn_data[icode].operand[0].mode;
17829 mode1 = insn_data[icode].operand[1].mode;
17830 mode2 = insn_data[icode].operand[2].mode;
17831 mode3 = insn_data[icode].operand[3].mode;
17832
17833 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17834 {
17835 op0 = copy_to_reg (op0);
17836 op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
17837 }
17838 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17839 {
17840 op1 = copy_to_reg (op1);
17841 op1 = simplify_gen_subreg (mode2, op1, GET_MODE (op1), 0);
17842 }
17843 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17844 {
17845 error ("shift must be an immediate");
17846 return const0_rtx;
17847 }
17848 target = gen_reg_rtx (mode);
17849 pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, mode, 0),
17850 op0, op1, op2);
17851 if (! pat)
17852 return 0;
17853 emit_insn (pat);
17854 return target;
17855
17856 case IX86_BUILTIN_MOVNTSD:
17857 return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, arglist);
17858
17859 case IX86_BUILTIN_MOVNTSS:
17860 return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, arglist);
17861
17862 case IX86_BUILTIN_INSERTQ:
17863 case IX86_BUILTIN_EXTRQ:
17864 icode = (fcode == IX86_BUILTIN_EXTRQ
17865 ? CODE_FOR_sse4a_extrq
17866 : CODE_FOR_sse4a_insertq);
17867 arg0 = TREE_VALUE (arglist);
17868 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17869 op0 = expand_normal (arg0);
17870 op1 = expand_normal (arg1);
17871 tmode = insn_data[icode].operand[0].mode;
17872 mode1 = insn_data[icode].operand[1].mode;
17873 mode2 = insn_data[icode].operand[2].mode;
17874 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17875 op0 = copy_to_mode_reg (mode1, op0);
17876 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17877 op1 = copy_to_mode_reg (mode2, op1);
17878 if (optimize || target == 0
17879 || GET_MODE (target) != tmode
17880 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17881 target = gen_reg_rtx (tmode);
17882 pat = GEN_FCN (icode) (target, op0, op1);
17883 if (! pat)
17884 return NULL_RTX;
17885 emit_insn (pat);
17886 return target;
17887
17888 case IX86_BUILTIN_EXTRQI:
17889 icode = CODE_FOR_sse4a_extrqi;
17890 arg0 = TREE_VALUE (arglist);
17891 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17892 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17893 op0 = expand_normal (arg0);
17894 op1 = expand_normal (arg1);
17895 op2 = expand_normal (arg2);
17896 tmode = insn_data[icode].operand[0].mode;
17897 mode1 = insn_data[icode].operand[1].mode;
17898 mode2 = insn_data[icode].operand[2].mode;
17899 mode3 = insn_data[icode].operand[3].mode;
17900 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17901 op0 = copy_to_mode_reg (mode1, op0);
17902 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17903 {
17904 error ("index mask must be an immediate");
17905 return gen_reg_rtx (tmode);
17906 }
17907 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17908 {
17909 error ("length mask must be an immediate");
17910 return gen_reg_rtx (tmode);
17911 }
17912 if (optimize || target == 0
17913 || GET_MODE (target) != tmode
17914 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17915 target = gen_reg_rtx (tmode);
17916 pat = GEN_FCN (icode) (target, op0, op1, op2);
17917 if (! pat)
17918 return NULL_RTX;
17919 emit_insn (pat);
17920 return target;
17921
17922 case IX86_BUILTIN_INSERTQI:
17923 icode = CODE_FOR_sse4a_insertqi;
17924 arg0 = TREE_VALUE (arglist);
17925 arg1 = TREE_VALUE (TREE_CHAIN (arglist));
17926 arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
17927 arg3 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
17928 op0 = expand_normal (arg0);
17929 op1 = expand_normal (arg1);
17930 op2 = expand_normal (arg2);
17931 op3 = expand_normal (arg3);
17932 tmode = insn_data[icode].operand[0].mode;
17933 mode1 = insn_data[icode].operand[1].mode;
17934 mode2 = insn_data[icode].operand[2].mode;
17935 mode3 = insn_data[icode].operand[3].mode;
17936 mode4 = insn_data[icode].operand[4].mode;
17937
17938 if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
17939 op0 = copy_to_mode_reg (mode1, op0);
17940
17941 if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
17942 op1 = copy_to_mode_reg (mode2, op1);
17943
17944 if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
17945 {
17946 error ("index mask must be an immediate");
17947 return gen_reg_rtx (tmode);
17948 }
17949 if (! (*insn_data[icode].operand[4].predicate) (op3, mode4))
17950 {
17951 error ("length mask must be an immediate");
17952 return gen_reg_rtx (tmode);
17953 }
17954 if (optimize || target == 0
17955 || GET_MODE (target) != tmode
17956 || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17957 target = gen_reg_rtx (tmode);
17958 pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
17959 if (! pat)
17960 return NULL_RTX;
17961 emit_insn (pat);
17962 return target;
17963
17964 case IX86_BUILTIN_VEC_INIT_V2SI:
17965 case IX86_BUILTIN_VEC_INIT_V4HI:
17966 case IX86_BUILTIN_VEC_INIT_V8QI:
17967 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), arglist, target);
17968
17969 case IX86_BUILTIN_VEC_EXT_V2DF:
17970 case IX86_BUILTIN_VEC_EXT_V2DI:
17971 case IX86_BUILTIN_VEC_EXT_V4SF:
17972 case IX86_BUILTIN_VEC_EXT_V4SI:
17973 case IX86_BUILTIN_VEC_EXT_V8HI:
17974 case IX86_BUILTIN_VEC_EXT_V2SI:
17975 case IX86_BUILTIN_VEC_EXT_V4HI:
17976 return ix86_expand_vec_ext_builtin (arglist, target);
17977
17978 case IX86_BUILTIN_VEC_SET_V8HI:
17979 case IX86_BUILTIN_VEC_SET_V4HI:
17980 return ix86_expand_vec_set_builtin (arglist);
17981
17982 default:
17983 break;
17984 }
17985
17986 for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
17987 if (d->code == fcode)
17988 {
17989 /* Compares are treated specially. */
17990 if (d->icode == CODE_FOR_sse_maskcmpv4sf3
17991 || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3
17992 || d->icode == CODE_FOR_sse2_maskcmpv2df3
17993 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
17994 return ix86_expand_sse_compare (d, arglist, target);
17995
17996 return ix86_expand_binop_builtin (d->icode, arglist, target);
17997 }
17998
17999 for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
18000 if (d->code == fcode)
18001 return ix86_expand_unop_builtin (d->icode, arglist, target, 0);
18002
18003 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
18004 if (d->code == fcode)
18005 return ix86_expand_sse_comi (d, arglist, target);
18006
18007 gcc_unreachable ();
18008 }
18009
18010 /* Returns a function decl for a vectorized version of the builtin function
18011 with builtin function code FN, result vector type TYPE_OUT and argument
18012 vector type TYPE_IN, or NULL_TREE if it is not available. */
18013
18014 static tree
18015 ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out,
18016 tree type_in)
18017 {
18018 enum machine_mode in_mode, out_mode;
18019 int in_n, out_n;
18020
18021 if (TREE_CODE (type_out) != VECTOR_TYPE
18022 || TREE_CODE (type_in) != VECTOR_TYPE)
18023 return NULL_TREE;
18024
18025 out_mode = TYPE_MODE (TREE_TYPE (type_out));
18026 out_n = TYPE_VECTOR_SUBPARTS (type_out);
18027 in_mode = TYPE_MODE (TREE_TYPE (type_in));
18028 in_n = TYPE_VECTOR_SUBPARTS (type_in);
18029
18030 switch (fn)
18031 {
18032 case BUILT_IN_SQRT:
18033 if (out_mode == DFmode && out_n == 2
18034 && in_mode == DFmode && in_n == 2)
18035 return ix86_builtins[IX86_BUILTIN_SQRTPD];
18036 return NULL_TREE;
18037
18038 case BUILT_IN_SQRTF:
18039 if (out_mode == SFmode && out_n == 4
18040 && in_mode == SFmode && in_n == 4)
18041 return ix86_builtins[IX86_BUILTIN_SQRTPS];
18042 return NULL_TREE;
18043
18044 case BUILT_IN_LRINTF:
18045 if (out_mode == SImode && out_n == 4
18046 && in_mode == SFmode && in_n == 4)
18047 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
18048 return NULL_TREE;
18049
18050 default:
18051 ;
18052 }
18053
18054 return NULL_TREE;
18055 }
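
/* Illustrative sketch, not part of GCC itself: the vectorizer queries
   this hook when it wants to vectorize a call, e.g.

       #include <math.h>

       void roots (double *a, const double *b, int n)
       {
         int i;
         for (i = 0; i < n; i++)
           a[i] = sqrt (b[i]);
       }

   compiled with -O2 -ftree-vectorize -msse2 -ffast-math (the exact flag
   set needed is an assumption) asks for BUILT_IN_SQRT with V2DF in and
   out, and the IX86_BUILTIN_SQRTPD decl returned above lets the loop
   body become a sqrtpd.  */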
18056
18057 /* Store OPERAND to the memory after reload is completed. This means
18058 that we can't easily use assign_stack_local. */
18059 rtx
18060 ix86_force_to_memory (enum machine_mode mode, rtx operand)
18061 {
18062 rtx result;
18063
18064 gcc_assert (reload_completed);
18065 if (TARGET_RED_ZONE)
18066 {
18067 result = gen_rtx_MEM (mode,
18068 gen_rtx_PLUS (Pmode,
18069 stack_pointer_rtx,
18070 GEN_INT (-RED_ZONE_SIZE)));
18071 emit_move_insn (result, operand);
18072 }
18073 else if (!TARGET_RED_ZONE && TARGET_64BIT)
18074 {
18075 switch (mode)
18076 {
18077 case HImode:
18078 case SImode:
18079 operand = gen_lowpart (DImode, operand);
18080 /* FALLTHRU */
18081 case DImode:
18082 emit_insn (
18083 gen_rtx_SET (VOIDmode,
18084 gen_rtx_MEM (DImode,
18085 gen_rtx_PRE_DEC (DImode,
18086 stack_pointer_rtx)),
18087 operand));
18088 break;
18089 default:
18090 gcc_unreachable ();
18091 }
18092 result = gen_rtx_MEM (mode, stack_pointer_rtx);
18093 }
18094 else
18095 {
18096 switch (mode)
18097 {
18098 case DImode:
18099 {
18100 rtx operands[2];
18101 split_di (&operand, 1, operands, operands + 1);
18102 emit_insn (
18103 gen_rtx_SET (VOIDmode,
18104 gen_rtx_MEM (SImode,
18105 gen_rtx_PRE_DEC (Pmode,
18106 stack_pointer_rtx)),
18107 operands[1]));
18108 emit_insn (
18109 gen_rtx_SET (VOIDmode,
18110 gen_rtx_MEM (SImode,
18111 gen_rtx_PRE_DEC (Pmode,
18112 stack_pointer_rtx)),
18113 operands[0]));
18114 }
18115 break;
18116 case HImode:
18117 /* Store HImodes as SImodes. */
18118 operand = gen_lowpart (SImode, operand);
18119 /* FALLTHRU */
18120 case SImode:
18121 emit_insn (
18122 gen_rtx_SET (VOIDmode,
18123 gen_rtx_MEM (GET_MODE (operand),
18124 gen_rtx_PRE_DEC (SImode,
18125 stack_pointer_rtx)),
18126 operand));
18127 break;
18128 default:
18129 gcc_unreachable ();
18130 }
18131 result = gen_rtx_MEM (mode, stack_pointer_rtx);
18132 }
18133 return result;
18134 }
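
/* Illustrative sketch, not part of GCC itself, of the three shapes
   built above for a DImode OPERAND:

     64-bit, red zone:     store to (mem (plus sp -128)); sp unchanged
     64-bit, no red zone:  push the 8-byte value; result is (mem sp)
     32-bit:               push high word, then low word; result is (mem sp)

   When no red zone was used, the caller is expected to release the
   space again with ix86_free_from_memory below.  */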
18135
18136 /* Free operand from the memory. */
18137 void
18138 ix86_free_from_memory (enum machine_mode mode)
18139 {
18140 if (!TARGET_RED_ZONE)
18141 {
18142 int size;
18143
18144 if (mode == DImode || TARGET_64BIT)
18145 size = 8;
18146 else
18147 size = 4;
18148 /* Use LEA to deallocate stack space. In peephole2 it will be converted
18149 to a pop or add instruction if registers are available. */
18150 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
18151 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
18152 GEN_INT (size))));
18153 }
18154 }
18155
18156 /* Put float CONST_DOUBLE in the constant pool instead of fp regs.
18157 QImode must go into class Q_REGS.
18158 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
18159 movdf to do mem-to-mem moves through integer regs. */
18160 enum reg_class
18161 ix86_preferred_reload_class (rtx x, enum reg_class class)
18162 {
18163 enum machine_mode mode = GET_MODE (x);
18164
18165 /* We're only allowed to return a subclass of CLASS. Many of the
18166 following checks fail for NO_REGS, so eliminate that early. */
18167 if (class == NO_REGS)
18168 return NO_REGS;
18169
18170 /* All classes can load zeros. */
18171 if (x == CONST0_RTX (mode))
18172 return class;
18173
18174 /* Force constants into memory if we are loading a (nonzero) constant into
18175 an MMX or SSE register. This is because there are no MMX/SSE instructions
18176 to load from a constant. */
18177 if (CONSTANT_P (x)
18178 && (MAYBE_MMX_CLASS_P (class) || MAYBE_SSE_CLASS_P (class)))
18179 return NO_REGS;
18180
18181 /* Prefer SSE regs only, if we can use them for math. */
18182 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
18183 return SSE_CLASS_P (class) ? class : NO_REGS;
18184
18185 /* Floating-point constants need more complex checks. */
18186 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
18187 {
18188 /* General regs can load everything. */
18189 if (reg_class_subset_p (class, GENERAL_REGS))
18190 return class;
18191
18192 /* Floats can load 0 and 1 plus some others. Note that we eliminated
18193 zero above. We only want to wind up preferring 80387 registers if
18194 we plan on doing computation with them. */
18195 if (TARGET_80387
18196 && standard_80387_constant_p (x))
18197 {
18198 /* Limit class to non-sse. */
18199 if (class == FLOAT_SSE_REGS)
18200 return FLOAT_REGS;
18201 if (class == FP_TOP_SSE_REGS)
18202 return FP_TOP_REG;
18203 if (class == FP_SECOND_SSE_REGS)
18204 return FP_SECOND_REG;
18205 if (class == FLOAT_INT_REGS || class == FLOAT_REGS)
18206 return class;
18207 }
18208
18209 return NO_REGS;
18210 }
18211
18212 /* Generally when we see PLUS here, it's the function invariant
18213 (plus soft-fp const_int), which can only be computed into general
18214 regs. */
18215 if (GET_CODE (x) == PLUS)
18216 return reg_class_subset_p (class, GENERAL_REGS) ? class : NO_REGS;
18217
18218 /* QImode constants are easy to load, but non-constant QImode data
18219 must go into Q_REGS. */
18220 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
18221 {
18222 if (reg_class_subset_p (class, Q_REGS))
18223 return class;
18224 if (reg_class_subset_p (Q_REGS, class))
18225 return Q_REGS;
18226 return NO_REGS;
18227 }
18228
18229 return class;
18230 }
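
/* Illustrative sketch, not part of GCC itself: for a reload of

       (set (reg:SF xmm0) (const_double:SF 1.25))

   with SSE math enabled, CLASS is an SSE class and X is a nonzero
   constant, so NO_REGS is returned above; the constant is then forced
   into the constant pool and loaded with a movss from memory
   (illustrative RTL only).  */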
18231
18232 /* Discourage putting floating-point values in SSE registers unless
18233 SSE math is being used, and likewise for the 387 registers. */
18234 enum reg_class
18235 ix86_preferred_output_reload_class (rtx x, enum reg_class class)
18236 {
18237 enum machine_mode mode = GET_MODE (x);
18238
18239 /* Restrict the output reload class to the register bank that we are doing
18240 math on. If we would like not to return a subset of CLASS, reject this
18241 alternative: if reload cannot do this, it will still use its choice. */
18242 mode = GET_MODE (x);
18243 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
18244 return MAYBE_SSE_CLASS_P (class) ? SSE_REGS : NO_REGS;
18245
18246 if (TARGET_80387 && SCALAR_FLOAT_MODE_P (mode))
18247 {
18248 if (class == FP_TOP_SSE_REGS)
18249 return FP_TOP_REG;
18250 else if (class == FP_SECOND_SSE_REGS)
18251 return FP_SECOND_REG;
18252 else
18253 return FLOAT_CLASS_P (class) ? class : NO_REGS;
18254 }
18255
18256 return class;
18257 }
18258
18259 /* If we are copying between general and FP registers, we need a memory
18260 location. The same is true for SSE and MMX registers.
18261
18262 The macro can't work reliably when one of the CLASSES is a class containing
18263 registers from multiple units (SSE, MMX, integer). We avoid this by never
18264 combining those units in single alternative in the machine description.
18265 Ensure that this constraint holds to avoid unexpected surprises.
18266
18267 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
18268 enforce these sanity checks. */
18269
18270 int
18271 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
18272 enum machine_mode mode, int strict)
18273 {
18274 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
18275 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
18276 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
18277 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
18278 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
18279 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
18280 {
18281 gcc_assert (!strict);
18282 return true;
18283 }
18284
18285 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
18286 return true;
18287
18288 /* ??? This is a lie. We do have moves between mmx/general, and for
18289 mmx/sse2. But by saying we need secondary memory we discourage the
18290 register allocator from using the mmx registers unless needed. */
18291 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
18292 return true;
18293
18294 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
18295 {
18296 /* SSE1 doesn't have any direct moves from other classes. */
18297 if (!TARGET_SSE2)
18298 return true;
18299
18300 /* If the target says that inter-unit moves are more expensive
18301 than moving through memory, then don't generate them. */
18302 if (!TARGET_INTER_UNIT_MOVES)
18303 return true;
18304
18305 /* Between SSE and general, we have moves no larger than word size. */
18306 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
18307 return true;
18308 }
18309
18310 return false;
18311 }
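
/* Illustrative sketch, not part of GCC itself: when only SSE1 is
   enabled, an SImode copy between an SSE register and a general
   register (class1 = SSE_REGS, class2 = GENERAL_REGS) returns true
   above, so reload moves the value through a stack slot instead of
   attempting a direct register-to-register move.  */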
18312
18313 /* Return true if the registers in CLASS cannot represent the change from
18314 modes FROM to TO. */
18315
18316 bool
18317 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
18318 enum reg_class class)
18319 {
18320 if (from == to)
18321 return false;
18322
18323 /* x87 registers can't do subreg at all, as all values are reformatted
18324 to extended precision. */
18325 if (MAYBE_FLOAT_CLASS_P (class))
18326 return true;
18327
18328 if (MAYBE_SSE_CLASS_P (class) || MAYBE_MMX_CLASS_P (class))
18329 {
18330 /* Vector registers do not support QI or HImode loads. If we don't
18331 disallow a change to these modes, reload will assume it's ok to
18332 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
18333 the vec_dupv4hi pattern. */
18334 if (GET_MODE_SIZE (from) < 4)
18335 return true;
18336
18337 /* Vector registers do not support subreg with nonzero offsets, which
18338 are otherwise valid for integer registers. Since we can't see
18339 whether we have a nonzero offset from here, prohibit all
18340 nonparadoxical subregs changing size. */
18341 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
18342 return true;
18343 }
18344
18345 return false;
18346 }
18347
18348 /* Return the cost of moving data from a register in class CLASS1 to
18349 one in class CLASS2.
18350
18351 It is not required that the cost always equal 2 when FROM is the same as TO;
18352 on some machines it is expensive to move between registers if they are not
18353 general registers. */
18354
18355 int
18356 ix86_register_move_cost (enum machine_mode mode, enum reg_class class1,
18357 enum reg_class class2)
18358 {
18359 /* In case we require secondary memory, compute cost of the store followed
18360 by load. In order to avoid bad register allocation choices, we need
18361 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
18362
18363 if (ix86_secondary_memory_needed (class1, class2, mode, 0))
18364 {
18365 int cost = 1;
18366
18367 cost += MAX (MEMORY_MOVE_COST (mode, class1, 0),
18368 MEMORY_MOVE_COST (mode, class1, 1));
18369 cost += MAX (MEMORY_MOVE_COST (mode, class2, 0),
18370 MEMORY_MOVE_COST (mode, class2, 1));
18371
18372 /* In case of copying from a general purpose register we may emit multiple
18373 stores followed by a single load, causing a memory size mismatch stall.
18374 Count this as an arbitrarily high cost of 20. */
18375 if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
18376 cost += 20;
18377
18378 /* In the case of FP/MMX moves, the registers actually overlap, and we
18379 have to switch modes in order to treat them differently. */
18380 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
18381 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
18382 cost += 20;
18383
18384 return cost;
18385 }
18386
18387 /* Moves between SSE/MMX and integer unit are expensive. */
18388 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
18389 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
18390 return ix86_cost->mmxsse_to_integer;
18391 if (MAYBE_FLOAT_CLASS_P (class1))
18392 return ix86_cost->fp_move;
18393 if (MAYBE_SSE_CLASS_P (class1))
18394 return ix86_cost->sse_move;
18395 if (MAYBE_MMX_CLASS_P (class1))
18396 return ix86_cost->mmx_move;
18397 return 2;
18398 }
18399
18400 /* Return 1 if hard register REGNO can hold a value of machine-mode MODE. */
18401
18402 bool
18403 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
18404 {
18405 /* Flags and only flags can only hold CCmode values. */
18406 if (CC_REGNO_P (regno))
18407 return GET_MODE_CLASS (mode) == MODE_CC;
18408 if (GET_MODE_CLASS (mode) == MODE_CC
18409 || GET_MODE_CLASS (mode) == MODE_RANDOM
18410 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
18411 return 0;
18412 if (FP_REGNO_P (regno))
18413 return VALID_FP_MODE_P (mode);
18414 if (SSE_REGNO_P (regno))
18415 {
18416 /* We implement the move patterns for all vector modes into and
18417 out of SSE registers, even when no operation instructions
18418 are available. */
18419 return (VALID_SSE_REG_MODE (mode)
18420 || VALID_SSE2_REG_MODE (mode)
18421 || VALID_MMX_REG_MODE (mode)
18422 || VALID_MMX_REG_MODE_3DNOW (mode));
18423 }
18424 if (MMX_REGNO_P (regno))
18425 {
18426 /* We implement the move patterns for 3DNOW modes even in MMX mode,
18427 so if the register is available at all, then we can move data of
18428 the given mode into or out of it. */
18429 return (VALID_MMX_REG_MODE (mode)
18430 || VALID_MMX_REG_MODE_3DNOW (mode));
18431 }
18432
18433 if (mode == QImode)
18434 {
18435 /* Take care with QImode values - they can be in non-QI regs,
18436 but then they do cause partial register stalls. */
18437 if (regno < 4 || TARGET_64BIT)
18438 return 1;
18439 if (!TARGET_PARTIAL_REG_STALL)
18440 return 1;
18441 return reload_in_progress || reload_completed;
18442 }
18443 /* We handle both integer and floats in the general purpose registers. */
18444 else if (VALID_INT_MODE_P (mode))
18445 return 1;
18446 else if (VALID_FP_MODE_P (mode))
18447 return 1;
18448 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
18449 on to use that value in smaller contexts, this can easily force a
18450 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
18451 supporting DImode, allow it. */
18452 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
18453 return 1;
18454
18455 return 0;
18456 }
18457
18458 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
18459 tieable integer mode. */
18460
18461 static bool
18462 ix86_tieable_integer_mode_p (enum machine_mode mode)
18463 {
18464 switch (mode)
18465 {
18466 case HImode:
18467 case SImode:
18468 return true;
18469
18470 case QImode:
18471 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
18472
18473 case DImode:
18474 return TARGET_64BIT;
18475
18476 default:
18477 return false;
18478 }
18479 }
18480
18481 /* Return true if MODE1 is accessible in a register that can hold MODE2
18482 without copying. That is, all register classes that can hold MODE2
18483 can also hold MODE1. */
18484
18485 bool
18486 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
18487 {
18488 if (mode1 == mode2)
18489 return true;
18490
18491 if (ix86_tieable_integer_mode_p (mode1)
18492 && ix86_tieable_integer_mode_p (mode2))
18493 return true;
18494
18495 /* MODE2 being XFmode implies fp stack or general regs, which means we
18496 can tie any smaller floating point modes to it. Note that we do not
18497 tie this with TFmode. */
18498 if (mode2 == XFmode)
18499 return mode1 == SFmode || mode1 == DFmode;
18500
18501 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
18502 that we can tie it with SFmode. */
18503 if (mode2 == DFmode)
18504 return mode1 == SFmode;
18505
18506 /* If MODE2 is only appropriate for an SSE register, then tie with
18507 any other mode acceptable to SSE registers. */
18508 if (GET_MODE_SIZE (mode2) >= 8
18509 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
18510 return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1);
18511
18512 /* If MODE2 is appropriate for an MMX (or SSE) register, then tie
18513 with any other mode acceptable to MMX registers. */
18514 if (GET_MODE_SIZE (mode2) == 8
18515 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
18516 return ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1);
18517
18518 return false;
18519 }
18520
18521 /* Return the cost of moving data of mode M between a
18522 register and memory. A value of 2 is the default; this cost is
18523 relative to those in `REGISTER_MOVE_COST'.
18524
18525 If moving between registers and memory is more expensive than
18526 between two registers, you should define this macro to express the
18527 relative cost.
18528
18529 Also model the increased cost of moving QImode registers in
18530 non-Q_REGS classes.
18531 */
18532 int
18533 ix86_memory_move_cost (enum machine_mode mode, enum reg_class class, int in)
18534 {
18535 if (FLOAT_CLASS_P (class))
18536 {
18537 int index;
18538 switch (mode)
18539 {
18540 case SFmode:
18541 index = 0;
18542 break;
18543 case DFmode:
18544 index = 1;
18545 break;
18546 case XFmode:
18547 index = 2;
18548 break;
18549 default:
18550 return 100;
18551 }
18552 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
18553 }
18554 if (SSE_CLASS_P (class))
18555 {
18556 int index;
18557 switch (GET_MODE_SIZE (mode))
18558 {
18559 case 4:
18560 index = 0;
18561 break;
18562 case 8:
18563 index = 1;
18564 break;
18565 case 16:
18566 index = 2;
18567 break;
18568 default:
18569 return 100;
18570 }
18571 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
18572 }
18573 if (MMX_CLASS_P (class))
18574 {
18575 int index;
18576 switch (GET_MODE_SIZE (mode))
18577 {
18578 case 4:
18579 index = 0;
18580 break;
18581 case 8:
18582 index = 1;
18583 break;
18584 default:
18585 return 100;
18586 }
18587 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
18588 }
18589 switch (GET_MODE_SIZE (mode))
18590 {
18591 case 1:
18592 if (in)
18593 return (Q_CLASS_P (class) ? ix86_cost->int_load[0]
18594 : ix86_cost->movzbl_load);
18595 else
18596 return (Q_CLASS_P (class) ? ix86_cost->int_store[0]
18597 : ix86_cost->int_store[0] + 4);
18598 break;
18599 case 2:
18600 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
18601 default:
18602 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
18603 if (mode == TFmode)
18604 mode = XFmode;
18605 return ((in ? ix86_cost->int_load[2] : ix86_cost->int_store[2])
18606 * (((int) GET_MODE_SIZE (mode)
18607 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
18608 }
18609 }
18610
18611 /* Compute a (partial) cost for rtx X. Return true if the complete
18612 cost has been computed, and false if subexpressions should be
18613 scanned. In either case, *TOTAL contains the cost result. */
18614
18615 static bool
18616 ix86_rtx_costs (rtx x, int code, int outer_code, int *total)
18617 {
18618 enum machine_mode mode = GET_MODE (x);
18619
18620 switch (code)
18621 {
18622 case CONST_INT:
18623 case CONST:
18624 case LABEL_REF:
18625 case SYMBOL_REF:
18626 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
18627 *total = 3;
18628 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
18629 *total = 2;
18630 else if (flag_pic && SYMBOLIC_CONST (x)
18631 && (!TARGET_64BIT
18632 || (GET_CODE (x) != LABEL_REF
18633 && (GET_CODE (x) != SYMBOL_REF
18634 || !SYMBOL_REF_LOCAL_P (x)))))
18635 *total = 1;
18636 else
18637 *total = 0;
18638 return true;
18639
18640 case CONST_DOUBLE:
18641 if (mode == VOIDmode)
18642 *total = 0;
18643 else
18644 switch (standard_80387_constant_p (x))
18645 {
18646 case 1: /* 0.0 */
18647 *total = 1;
18648 break;
18649 default: /* Other constants */
18650 *total = 2;
18651 break;
18652 case 0:
18653 case -1:
18654 /* Start with (MEM (SYMBOL_REF)), since that's where
18655 it'll probably end up. Add a penalty for size. */
18656 *total = (COSTS_N_INSNS (1)
18657 + (flag_pic != 0 && !TARGET_64BIT)
18658 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
18659 break;
18660 }
18661 return true;
18662
18663 case ZERO_EXTEND:
18664 /* The zero extension is often completely free on x86_64, so make
18665 it as cheap as possible. */
18666 if (TARGET_64BIT && mode == DImode
18667 && GET_MODE (XEXP (x, 0)) == SImode)
18668 *total = 1;
18669 else if (TARGET_ZERO_EXTEND_WITH_AND)
18670 *total = ix86_cost->add;
18671 else
18672 *total = ix86_cost->movzx;
18673 return false;
18674
18675 case SIGN_EXTEND:
18676 *total = ix86_cost->movsx;
18677 return false;
18678
18679 case ASHIFT:
18680 if (CONST_INT_P (XEXP (x, 1))
18681 && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
18682 {
18683 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
18684 if (value == 1)
18685 {
18686 *total = ix86_cost->add;
18687 return false;
18688 }
18689 if ((value == 2 || value == 3)
18690 && ix86_cost->lea <= ix86_cost->shift_const)
18691 {
18692 *total = ix86_cost->lea;
18693 return false;
18694 }
18695 }
18696 /* FALLTHRU */
18697
18698 case ROTATE:
18699 case ASHIFTRT:
18700 case LSHIFTRT:
18701 case ROTATERT:
18702 if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
18703 {
18704 if (CONST_INT_P (XEXP (x, 1)))
18705 {
18706 if (INTVAL (XEXP (x, 1)) > 32)
18707 *total = ix86_cost->shift_const + COSTS_N_INSNS (2);
18708 else
18709 *total = ix86_cost->shift_const * 2;
18710 }
18711 else
18712 {
18713 if (GET_CODE (XEXP (x, 1)) == AND)
18714 *total = ix86_cost->shift_var * 2;
18715 else
18716 *total = ix86_cost->shift_var * 6 + COSTS_N_INSNS (2);
18717 }
18718 }
18719 else
18720 {
18721 if (CONST_INT_P (XEXP (x, 1)))
18722 *total = ix86_cost->shift_const;
18723 else
18724 *total = ix86_cost->shift_var;
18725 }
18726 return false;
18727
18728 case MULT:
18729 if (FLOAT_MODE_P (mode))
18730 {
18731 *total = ix86_cost->fmul;
18732 return false;
18733 }
18734 else
18735 {
18736 rtx op0 = XEXP (x, 0);
18737 rtx op1 = XEXP (x, 1);
18738 int nbits;
18739 if (CONST_INT_P (XEXP (x, 1)))
18740 {
18741 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
18742 for (nbits = 0; value != 0; value &= value - 1)
18743 nbits++;
18744 }
18745 else
18746 /* This is arbitrary. */
18747 nbits = 7;
18748
18749 /* Compute costs correctly for widening multiplication. */
18750 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
18751 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
18752 == GET_MODE_SIZE (mode))
18753 {
18754 int is_mulwiden = 0;
18755 enum machine_mode inner_mode = GET_MODE (op0);
18756
18757 if (GET_CODE (op0) == GET_CODE (op1))
18758 is_mulwiden = 1, op1 = XEXP (op1, 0);
18759 else if (CONST_INT_P (op1))
18760 {
18761 if (GET_CODE (op0) == SIGN_EXTEND)
18762 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
18763 == INTVAL (op1);
18764 else
18765 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
18766 }
18767
18768 if (is_mulwiden)
18769 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
18770 }
18771
18772 *total = (ix86_cost->mult_init[MODE_INDEX (mode)]
18773 + nbits * ix86_cost->mult_bit
18774 + rtx_cost (op0, outer_code) + rtx_cost (op1, outer_code));
18775
18776 return true;
18777 }
18778
18779 case DIV:
18780 case UDIV:
18781 case MOD:
18782 case UMOD:
18783 if (FLOAT_MODE_P (mode))
18784 *total = ix86_cost->fdiv;
18785 else
18786 *total = ix86_cost->divide[MODE_INDEX (mode)];
18787 return false;
18788
18789 case PLUS:
18790 if (FLOAT_MODE_P (mode))
18791 *total = ix86_cost->fadd;
18792 else if (GET_MODE_CLASS (mode) == MODE_INT
18793 && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
18794 {
18795 if (GET_CODE (XEXP (x, 0)) == PLUS
18796 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
18797 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
18798 && CONSTANT_P (XEXP (x, 1)))
18799 {
18800 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
18801 if (val == 2 || val == 4 || val == 8)
18802 {
18803 *total = ix86_cost->lea;
18804 *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
18805 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
18806 outer_code);
18807 *total += rtx_cost (XEXP (x, 1), outer_code);
18808 return true;
18809 }
18810 }
18811 else if (GET_CODE (XEXP (x, 0)) == MULT
18812 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
18813 {
18814 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
18815 if (val == 2 || val == 4 || val == 8)
18816 {
18817 *total = ix86_cost->lea;
18818 *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
18819 *total += rtx_cost (XEXP (x, 1), outer_code);
18820 return true;
18821 }
18822 }
18823 else if (GET_CODE (XEXP (x, 0)) == PLUS)
18824 {
18825 *total = ix86_cost->lea;
18826 *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
18827 *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
18828 *total += rtx_cost (XEXP (x, 1), outer_code);
18829 return true;
18830 }
18831 }
18832 /* FALLTHRU */
18833
18834 case MINUS:
18835 if (FLOAT_MODE_P (mode))
18836 {
18837 *total = ix86_cost->fadd;
18838 return false;
18839 }
18840 /* FALLTHRU */
18841
18842 case AND:
18843 case IOR:
18844 case XOR:
18845 if (!TARGET_64BIT && mode == DImode)
18846 {
18847 *total = (ix86_cost->add * 2
18848 + (rtx_cost (XEXP (x, 0), outer_code)
18849 << (GET_MODE (XEXP (x, 0)) != DImode))
18850 + (rtx_cost (XEXP (x, 1), outer_code)
18851 << (GET_MODE (XEXP (x, 1)) != DImode)));
18852 return true;
18853 }
18854 /* FALLTHRU */
18855
18856 case NEG:
18857 if (FLOAT_MODE_P (mode))
18858 {
18859 *total = ix86_cost->fchs;
18860 return false;
18861 }
18862 /* FALLTHRU */
18863
18864 case NOT:
18865 if (!TARGET_64BIT && mode == DImode)
18866 *total = ix86_cost->add * 2;
18867 else
18868 *total = ix86_cost->add;
18869 return false;
18870
18871 case COMPARE:
18872 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
18873 && XEXP (XEXP (x, 0), 1) == const1_rtx
18874 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
18875 && XEXP (x, 1) == const0_rtx)
18876 {
18877 /* This kind of construct is implemented using test[bwl].
18878 Treat it as if we had an AND. */
18879 *total = (ix86_cost->add
18880 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code)
18881 + rtx_cost (const1_rtx, outer_code));
18882 return true;
18883 }
18884 return false;
18885
18886 case FLOAT_EXTEND:
18887 if (!TARGET_SSE_MATH
18888 || mode == XFmode
18889 || (mode == DFmode && !TARGET_SSE2))
18890 *total = 0;
18891 return false;
18892
18893 case ABS:
18894 if (FLOAT_MODE_P (mode))
18895 *total = ix86_cost->fabs;
18896 return false;
18897
18898 case SQRT:
18899 if (FLOAT_MODE_P (mode))
18900 *total = ix86_cost->fsqrt;
18901 return false;
18902
18903 case UNSPEC:
18904 if (XINT (x, 1) == UNSPEC_TP)
18905 *total = 0;
18906 return false;
18907
18908 default:
18909 return false;
18910 }
18911 }
18912
18913 #if TARGET_MACHO
18914
18915 static int current_machopic_label_num;
18916
18917 /* Given a symbol name and its associated stub, write out the
18918 definition of the stub. */
18919
18920 void
18921 machopic_output_stub (FILE *file, const char *symb, const char *stub)
18922 {
18923 unsigned int length;
18924 char *binder_name, *symbol_name, lazy_ptr_name[32];
18925 int label = ++current_machopic_label_num;
18926
18927 /* For 64-bit we shouldn't get here. */
18928 gcc_assert (!TARGET_64BIT);
18929
18930 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
18931 symb = (*targetm.strip_name_encoding) (symb);
18932
18933 length = strlen (stub);
18934 binder_name = alloca (length + 32);
18935 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
18936
18937 length = strlen (symb);
18938 symbol_name = alloca (length + 32);
18939 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
18940
18941 sprintf (lazy_ptr_name, "L%d$lz", label);
18942
18943 if (MACHOPIC_PURE)
18944 switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
18945 else
18946 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
18947
18948 fprintf (file, "%s:\n", stub);
18949 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
18950
18951 if (MACHOPIC_PURE)
18952 {
18953 fprintf (file, "\tcall\tLPC$%d\nLPC$%d:\tpopl\t%%eax\n", label, label);
18954 fprintf (file, "\tmovl\t%s-LPC$%d(%%eax),%%edx\n", lazy_ptr_name, label);
18955 fprintf (file, "\tjmp\t*%%edx\n");
18956 }
18957 else
18958 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
18959
18960 fprintf (file, "%s:\n", binder_name);
18961
18962 if (MACHOPIC_PURE)
18963 {
18964 fprintf (file, "\tlea\t%s-LPC$%d(%%eax),%%eax\n", lazy_ptr_name, label);
18965 fprintf (file, "\tpushl\t%%eax\n");
18966 }
18967 else
18968 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
18969
18970 fprintf (file, "\tjmp\tdyld_stub_binding_helper\n");
18971
18972 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
18973 fprintf (file, "%s:\n", lazy_ptr_name);
18974 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
18975 fprintf (file, "\t.long %s\n", binder_name);
18976 }
18977
18978 void
18979 darwin_x86_file_end (void)
18980 {
18981 darwin_file_end ();
18982 ix86_file_end ();
18983 }
18984 #endif /* TARGET_MACHO */
18985
18986 /* Order the registers for register allocator. */
18987
18988 void
18989 x86_order_regs_for_local_alloc (void)
18990 {
18991 int pos = 0;
18992 int i;
18993
18994 /* First allocate the local general purpose registers. */
18995 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
18996 if (GENERAL_REGNO_P (i) && call_used_regs[i])
18997 reg_alloc_order [pos++] = i;
18998
18999 /* Global general purpose registers. */
19000 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
19001 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
19002 reg_alloc_order [pos++] = i;
19003
19004 /* x87 registers come first in case we are doing FP math
19005 using them. */
19006 if (!TARGET_SSE_MATH)
19007 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19008 reg_alloc_order [pos++] = i;
19009
19010 /* SSE registers. */
19011 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19012 reg_alloc_order [pos++] = i;
19013 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19014 reg_alloc_order [pos++] = i;
19015
19016 /* x87 registers. */
19017 if (TARGET_SSE_MATH)
19018 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19019 reg_alloc_order [pos++] = i;
19020
19021 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
19022 reg_alloc_order [pos++] = i;
19023
19024 /* Initialize the rest of array as we do not allocate some registers
19025 at all. */
19026 while (pos < FIRST_PSEUDO_REGISTER)
19027 reg_alloc_order [pos++] = 0;
19028 }
19029
19030 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
19031 struct attribute_spec.handler. */
19032 static tree
19033 ix86_handle_struct_attribute (tree *node, tree name,
19034 tree args ATTRIBUTE_UNUSED,
19035 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
19036 {
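/* Find the type the attribute is to be applied to: the declared type
   of a TYPE_DECL, or the node itself when the attribute is on a type.  */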
19037 tree *type = NULL;
19038 if (DECL_P (*node))
19039 {
19040 if (TREE_CODE (*node) == TYPE_DECL)
19041 type = &TREE_TYPE (*node);
19042 }
19043 else
19044 type = node;
19045
19046 if (!(type && (TREE_CODE (*type) == RECORD_TYPE
19047 || TREE_CODE (*type) == UNION_TYPE)))
19048 {
19049 warning (OPT_Wattributes, "%qs attribute ignored",
19050 IDENTIFIER_POINTER (name));
19051 *no_add_attrs = true;
19052 }
19053
19054 else if ((is_attribute_p ("ms_struct", name)
19055 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
19056 || ((is_attribute_p ("gcc_struct", name)
19057 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
19058 {
19059 warning (OPT_Wattributes, "%qs incompatible attribute ignored",
19060 IDENTIFIER_POINTER (name));
19061 *no_add_attrs = true;
19062 }
19063
19064 return NULL_TREE;
19065 }
19066
19067 static bool
19068 ix86_ms_bitfield_layout_p (tree record_type)
19069 {
19070 return (TARGET_MS_BITFIELD_LAYOUT &&
19071 !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
19072 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type));
19073 }
19074
19075 /* Returns an expression indicating where the this parameter is
19076 located on entry to the FUNCTION. */
19077
19078 static rtx
19079 x86_this_parameter (tree function)
19080 {
19081 tree type = TREE_TYPE (function);
19082
19083 if (TARGET_64BIT)
19084 {
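/* When the return value is passed in memory, the hidden pointer to the
   return slot occupies the first integer argument register, so `this'
   moves to the second one.  */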
19085 int n = aggregate_value_p (TREE_TYPE (type), type) != 0;
19086 return gen_rtx_REG (DImode, x86_64_int_parameter_registers[n]);
19087 }
19088
19089 if (ix86_function_regparm (type, function) > 0)
19090 {
19091 tree parm;
19092
19093 parm = TYPE_ARG_TYPES (type);
19094 /* Figure out whether or not the function has a variable number of
19095 arguments. */
19096 for (; parm; parm = TREE_CHAIN (parm))
19097 if (TREE_VALUE (parm) == void_type_node)
19098 break;
19099 /* If not, the this parameter is in the first argument. */
19100 if (parm)
19101 {
19102 int regno = 0;
19103 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
19104 regno = 2;
19105 return gen_rtx_REG (SImode, regno);
19106 }
19107 }
19108
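/* Otherwise `this' is passed on the stack: skip the return address, plus
   one more word when the return value is passed in memory (its address
   is the implicit first argument).  */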
19109 if (aggregate_value_p (TREE_TYPE (type), type))
19110 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 8));
19111 else
19112 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 4));
19113 }
19114
19115 /* Determine whether x86_output_mi_thunk can succeed. */
19116
19117 static bool
19118 x86_can_output_mi_thunk (tree thunk ATTRIBUTE_UNUSED,
19119 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
19120 HOST_WIDE_INT vcall_offset, tree function)
19121 {
19122 /* 64-bit can handle anything. */
19123 if (TARGET_64BIT)
19124 return true;
19125
19126 /* For 32-bit, everything's fine if we have one free register. */
19127 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
19128 return true;
19129
19130 /* Need a free register for vcall_offset. */
19131 if (vcall_offset)
19132 return false;
19133
19134 /* Need a free register for GOT references. */
19135 if (flag_pic && !(*targetm.binds_local_p) (function))
19136 return false;
19137
19138 /* Otherwise ok. */
19139 return true;
19140 }
19141
19142 /* Output the assembler code for a thunk function. THUNK_DECL is the
19143 declaration for the thunk function itself, FUNCTION is the decl for
19144 the target function. DELTA is an immediate constant offset to be
19145 added to THIS. If VCALL_OFFSET is nonzero, the word at
19146 *(*this + vcall_offset) should be added to THIS. */
19147
19148 static void
19149 x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
19150 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
19151 HOST_WIDE_INT vcall_offset, tree function)
19152 {
19153 rtx xops[3];
19154 rtx this = x86_this_parameter (function);
19155 rtx this_reg, tmp;
19156
19157 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
19158 pull it in now and let DELTA benefit. */
19159 if (REG_P (this))
19160 this_reg = this;
19161 else if (vcall_offset)
19162 {
19163 /* Put the this parameter into %eax. */
19164 xops[0] = this;
19165 xops[1] = this_reg = gen_rtx_REG (Pmode, 0);
19166 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19167 }
19168 else
19169 this_reg = NULL_RTX;
19170
19171 /* Adjust the this parameter by a fixed constant. */
19172 if (delta)
19173 {
19174 xops[0] = GEN_INT (delta);
19175 xops[1] = this_reg ? this_reg : this;
19176 if (TARGET_64BIT)
19177 {
19178 if (!x86_64_general_operand (xops[0], DImode))
19179 {
19180 tmp = gen_rtx_REG (DImode, R10_REG);
19181 xops[1] = tmp;
19182 output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops);
19183 xops[0] = tmp;
19184 xops[1] = this;
19185 }
19186 output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19187 }
19188 else
19189 output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19190 }
19191
19192 /* Adjust the this parameter by a value stored in the vtable. */
19193 if (vcall_offset)
19194 {
19195 if (TARGET_64BIT)
19196 tmp = gen_rtx_REG (DImode, R10_REG);
19197 else
19198 {
19199 int tmp_regno = 2 /* ECX */;
19200 if (lookup_attribute ("fastcall",
19201 TYPE_ATTRIBUTES (TREE_TYPE (function))))
19202 tmp_regno = 0 /* EAX */;
19203 tmp = gen_rtx_REG (SImode, tmp_regno);
19204 }
19205
19206 xops[0] = gen_rtx_MEM (Pmode, this_reg);
19207 xops[1] = tmp;
19208 if (TARGET_64BIT)
19209 output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19210 else
19211 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19212
19213 /* Adjust the this parameter. */
19214 xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset));
19215 if (TARGET_64BIT && !memory_operand (xops[0], Pmode))
19216 {
19217 rtx tmp2 = gen_rtx_REG (DImode, R11_REG);
19218 xops[0] = GEN_INT (vcall_offset);
19219 xops[1] = tmp2;
19220 output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19221 xops[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, tmp, tmp2));
19222 }
19223 xops[1] = this_reg;
19224 if (TARGET_64BIT)
19225 output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19226 else
19227 output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19228 }
19229
19230 /* If necessary, drop THIS back to its stack slot. */
19231 if (this_reg && this_reg != this)
19232 {
19233 xops[0] = this_reg;
19234 xops[1] = this;
19235 output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19236 }
19237
19238 xops[0] = XEXP (DECL_RTL (function), 0);
19239 if (TARGET_64BIT)
19240 {
19241 if (!flag_pic || (*targetm.binds_local_p) (function))
19242 output_asm_insn ("jmp\t%P0", xops);
19243 else
19244 {
19245 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL);
19246 tmp = gen_rtx_CONST (Pmode, tmp);
19247 tmp = gen_rtx_MEM (QImode, tmp);
19248 xops[0] = tmp;
19249 output_asm_insn ("jmp\t%A0", xops);
19250 }
19251 }
19252 else
19253 {
19254 if (!flag_pic || (*targetm.binds_local_p) (function))
19255 output_asm_insn ("jmp\t%P0", xops);
19256 else
19257 #if TARGET_MACHO
19258 if (TARGET_MACHO)
19259 {
19260 rtx sym_ref = XEXP (DECL_RTL (function), 0);
19261 tmp = (gen_rtx_SYMBOL_REF
19262 (Pmode,
19263 machopic_indirection_name (sym_ref, /*stub_p=*/true)));
19264 tmp = gen_rtx_MEM (QImode, tmp);
19265 xops[0] = tmp;
19266 output_asm_insn ("jmp\t%0", xops);
19267 }
19268 else
19269 #endif /* TARGET_MACHO */
19270 {
19271 tmp = gen_rtx_REG (SImode, 2 /* ECX */);
19272 output_set_got (tmp, NULL_RTX);
19273
19274 xops[1] = tmp;
19275 output_asm_insn ("mov{l}\t{%0@GOT(%1), %1|%1, %0@GOT[%1]}", xops);
19276 output_asm_insn ("jmp\t{*}%1", xops);
19277 }
19278 }
19279 }
19280
19281 static void
19282 x86_file_start (void)
19283 {
19284 default_file_start ();
19285 #if TARGET_MACHO
19286 darwin_file_start ();
19287 #endif
19288 if (X86_FILE_START_VERSION_DIRECTIVE)
19289 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
19290 if (X86_FILE_START_FLTUSED)
19291 fputs ("\t.global\t__fltused\n", asm_out_file);
19292 if (ix86_asm_dialect == ASM_INTEL)
19293 fputs ("\t.intel_syntax\n", asm_out_file);
19294 }
19295
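/* Compute the alignment, in bits, to give FIELD, whose natural alignment
   is COMPUTED.  Outside 64-bit mode and without -malign-double, cap
   double, long long and similar fields at 32 bits, matching the
   traditional ia32 ABI.  */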
19296 int
19297 x86_field_alignment (tree field, int computed)
19298 {
19299 enum machine_mode mode;
19300 tree type = TREE_TYPE (field);
19301
19302 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
19303 return computed;
19304 mode = TYPE_MODE (TREE_CODE (type) == ARRAY_TYPE
19305 ? get_inner_array_type (type) : type);
19306 if (mode == DFmode || mode == DCmode
19307 || GET_MODE_CLASS (mode) == MODE_INT
19308 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
19309 return MIN (32, computed);
19310 return computed;
19311 }
19312
19313 /* Output assembler code to FILE to increment profiler label # LABELNO
19314 for profiling a function entry. */
19315 void
19316 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
19317 {
19318 if (TARGET_64BIT)
19319 if (flag_pic)
19320 {
19321 #ifndef NO_PROFILE_COUNTERS
19322 fprintf (file, "\tleaq\t%sP%d@(%%rip),%%r11\n", LPREFIX, labelno);
19323 #endif
19324 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", MCOUNT_NAME);
19325 }
19326 else
19327 {
19328 #ifndef NO_PROFILE_COUNTERS
19329 fprintf (file, "\tmovq\t$%sP%d,%%r11\n", LPREFIX, labelno);
19330 #endif
19331 fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
19332 }
19333 else if (flag_pic)
19334 {
19335 #ifndef NO_PROFILE_COUNTERS
19336 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%%s\n",
19337 LPREFIX, labelno, PROFILE_COUNT_REGISTER);
19338 #endif
19339 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", MCOUNT_NAME);
19340 }
19341 else
19342 {
19343 #ifndef NO_PROFILE_COUNTERS
19344 fprintf (file, "\tmovl\t$%sP%d,%%%s\n", LPREFIX, labelno,
19345 PROFILE_COUNT_REGISTER);
19346 #endif
19347 fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
19348 }
19349 }
19350
19351 /* We don't have exact information about the insn sizes, but we may assume
19352 quite safely that we are informed about all 1 byte insns and memory
19353 address sizes. This is enough to eliminate unnecessary padding in
19354 99% of cases. */
19355
19356 static int
19357 min_insn_size (rtx insn)
19358 {
19359 int l = 0;
19360
19361 if (!INSN_P (insn) || !active_insn_p (insn))
19362 return 0;
19363
19364 /* Discard alignments we've emitted ourselves, as well as jump table instructions. */
19365 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19366 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
19367 return 0;
19368 if (JUMP_P (insn)
19369 && (GET_CODE (PATTERN (insn)) == ADDR_VEC
19370 || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC))
19371 return 0;
19372
19373 /* Important case - calls are always 5 bytes.
19374 It is common to have many calls in a row. */
19375 if (CALL_P (insn)
19376 && symbolic_reference_mentioned_p (PATTERN (insn))
19377 && !SIBLING_CALL_P (insn))
19378 return 5;
19379 if (get_attr_length (insn) <= 1)
19380 return 1;
19381
19382 /* For normal instructions we may rely on the sizes of addresses
19383 and the presence of symbol to require 4 bytes of encoding.
19384 This is not the case for jumps where references are PC relative. */
19385 if (!JUMP_P (insn))
19386 {
19387 l = get_attr_length_address (insn);
19388 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
19389 l = 4;
19390 }
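/* One opcode byte on top of the address bytes counted above; anything
   without an address estimate (jumps in particular) is assumed to take
   two bytes.  */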
19391 if (l)
19392 return 1+l;
19393 else
19394 return 2;
19395 }
19396
19397 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
19398 window. */
19399
19400 static void
19401 ix86_avoid_jump_misspredicts (void)
19402 {
19403 rtx insn, start = get_insns ();
19404 int nbytes = 0, njumps = 0;
19405 int isjump = 0;
19406
19407 /* Look for all minimal intervals of instructions containing 4 jumps.
19408 The intervals are bounded by START and INSN. NBYTES is the total
19409 size of instructions in the interval including INSN and not including
19410 START. When the NBYTES is smaller than 16 bytes, it is possible
19411 that the end of START and INSN ends up in the same 16byte page.
19412
19413 The smallest offset in the page INSN can start is the case where START
19414 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
19415 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
19416 */
19417 for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
19418 {
19419
19420 nbytes += min_insn_size (insn);
19421 if (dump_file)
19422 fprintf(dump_file, "Insn %i estimated to %i bytes\n",
19423 INSN_UID (insn), min_insn_size (insn));
19424 if ((JUMP_P (insn)
19425 && GET_CODE (PATTERN (insn)) != ADDR_VEC
19426 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
19427 || CALL_P (insn))
19428 njumps++;
19429 else
19430 continue;
19431
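/* More than three jumps seen: drop instructions from the front of the
   interval until at most three remain, remembering whether the last one
   dropped was itself a jump or call.  */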
19432 while (njumps > 3)
19433 {
19434 start = NEXT_INSN (start);
19435 if ((JUMP_P (start)
19436 && GET_CODE (PATTERN (start)) != ADDR_VEC
19437 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
19438 || CALL_P (start))
19439 njumps--, isjump = 1;
19440 else
19441 isjump = 0;
19442 nbytes -= min_insn_size (start);
19443 }
19444 gcc_assert (njumps >= 0);
19445 if (dump_file)
19446 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
19447 INSN_UID (start), INSN_UID (insn), nbytes);
19448
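/* Together with the jump just dropped from the front, INSN would be the
   fourth jump in one 16 byte window; pad before INSN so that it starts
   in the next window.  */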
19449 if (njumps == 3 && isjump && nbytes < 16)
19450 {
19451 int padsize = 15 - nbytes + min_insn_size (insn);
19452
19453 if (dump_file)
19454 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
19455 INSN_UID (insn), padsize);
19456 emit_insn_before (gen_align (GEN_INT (padsize)), insn);
19457 }
19458 }
19459 }
19460
19461 /* AMD Athlon works faster
19462 when RET is not destination of conditional jump or directly preceded
19463 by other jump instruction. We avoid the penalty by inserting NOP just
19464 before the RET instructions in such cases. */
19465 static void
19466 ix86_pad_returns (void)
19467 {
19468 edge e;
19469 edge_iterator ei;
19470
19471 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
19472 {
19473 basic_block bb = e->src;
19474 rtx ret = BB_END (bb);
19475 rtx prev;
19476 bool replace = false;
19477
19478 if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN
19479 || !maybe_hot_bb_p (bb))
19480 continue;
19481 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
19482 if (active_insn_p (prev) || LABEL_P (prev))
19483 break;
19484 if (prev && LABEL_P (prev))
19485 {
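/* The return is reached through a label: replace it if any executed
   predecessor jumps to that label instead of falling through.  */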
19486 edge e;
19487 edge_iterator ei;
19488
19489 FOR_EACH_EDGE (e, ei, bb->preds)
19490 if (EDGE_FREQUENCY (e) && e->src->index >= 0
19491 && !(e->flags & EDGE_FALLTHRU))
19492 replace = true;
19493 }
19494 if (!replace)
19495 {
19496 prev = prev_active_insn (ret);
19497 if (prev
19498 && ((JUMP_P (prev) && any_condjump_p (prev))
19499 || CALL_P (prev)))
19500 replace = true;
19501 /* Empty functions get branch mispredict even when the jump destination
19502 is not visible to us. */
19503 if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
19504 replace = true;
19505 }
19506 if (replace)
19507 {
19508 emit_insn_before (gen_return_internal_long (), ret);
19509 delete_insn (ret);
19510 }
19511 }
19512 }
19513
19514 /* Implement machine specific optimizations. We implement padding of returns
19515 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
19516 static void
19517 ix86_reorg (void)
19518 {
19519 if (TARGET_PAD_RETURNS && optimize && !optimize_size)
19520 ix86_pad_returns ();
19521 if (TARGET_FOUR_JUMP_LIMIT && optimize && !optimize_size)
19522 ix86_avoid_jump_misspredicts ();
19523 }
19524
19525 /* Return nonzero when QImode register that must be represented via REX prefix
19526 is used. */
19527 bool
19528 x86_extended_QIreg_mentioned_p (rtx insn)
19529 {
19530 int i;
19531 extract_insn_cached (insn);
19532 for (i = 0; i < recog_data.n_operands; i++)
19533 if (REG_P (recog_data.operand[i])
19534 && REGNO (recog_data.operand[i]) >= 4)
19535 return true;
19536 return false;
19537 }
19538
19539 /* Return nonzero when P points to register encoded via REX prefix.
19540 Called via for_each_rtx. */
19541 static int
19542 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
19543 {
19544 unsigned int regno;
19545 if (!REG_P (*p))
19546 return 0;
19547 regno = REGNO (*p);
19548 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
19549 }
19550
19551 /* Return true when INSN mentions register that must be encoded using REX
19552 prefix. */
19553 bool
19554 x86_extended_reg_mentioned_p (rtx insn)
19555 {
19556 return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL);
19557 }
19558
19559 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
19560 optabs would emit if we didn't have TFmode patterns. */
19561
19562 void
19563 x86_emit_floatuns (rtx operands[2])
19564 {
19565 rtx neglab, donelab, i0, i1, f0, in, out;
19566 enum machine_mode mode, inmode;
19567
19568 inmode = GET_MODE (operands[1]);
19569 gcc_assert (inmode == SImode || inmode == DImode);
19570
19571 out = operands[0];
19572 in = force_reg (inmode, operands[1]);
19573 mode = GET_MODE (out);
19574 neglab = gen_label_rtx ();
19575 donelab = gen_label_rtx ();
19576 i1 = gen_reg_rtx (Pmode);
19577 f0 = gen_reg_rtx (mode);
19578
19579 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, Pmode, 0, neglab);
19580
19581 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_FLOAT (mode, in)));
19582 emit_jump_insn (gen_jump (donelab));
19583 emit_barrier ();
19584
19585 emit_label (neglab);
19586
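/* The input has its top bit set: shift it right one bit, OR the lost low
   bit back in so the final rounding stays correct, convert, and double
   the result.  */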
19587 i0 = expand_simple_binop (Pmode, LSHIFTRT, in, const1_rtx, NULL, 1, OPTAB_DIRECT);
19588 i1 = expand_simple_binop (Pmode, AND, in, const1_rtx, NULL, 1, OPTAB_DIRECT);
19589 i0 = expand_simple_binop (Pmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
19590 expand_float (f0, i0, 0);
19591 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
19592
19593 emit_label (donelab);
19594 }
19595 \f
19596 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19597 with all elements equal to VAR. Return true if successful. */
19598
19599 static bool
19600 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
19601 rtx target, rtx val)
19602 {
19603 enum machine_mode smode, wsmode, wvmode;
19604 rtx x;
19605
19606 switch (mode)
19607 {
19608 case V2SImode:
19609 case V2SFmode:
19610 if (!mmx_ok)
19611 return false;
19612 /* FALLTHRU */
19613
19614 case V2DFmode:
19615 case V2DImode:
19616 case V4SFmode:
19617 case V4SImode:
19618 val = force_reg (GET_MODE_INNER (mode), val);
19619 x = gen_rtx_VEC_DUPLICATE (mode, val);
19620 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19621 return true;
19622
19623 case V4HImode:
19624 if (!mmx_ok)
19625 return false;
19626 if (TARGET_SSE || TARGET_3DNOW_A)
19627 {
19628 val = gen_lowpart (SImode, val);
19629 x = gen_rtx_TRUNCATE (HImode, val);
19630 x = gen_rtx_VEC_DUPLICATE (mode, x);
19631 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19632 return true;
19633 }
19634 else
19635 {
19636 smode = HImode;
19637 wsmode = SImode;
19638 wvmode = V2SImode;
19639 goto widen;
19640 }
19641
19642 case V8QImode:
19643 if (!mmx_ok)
19644 return false;
19645 smode = QImode;
19646 wsmode = HImode;
19647 wvmode = V4HImode;
19648 goto widen;
19649 case V8HImode:
19650 if (TARGET_SSE2)
19651 {
19652 rtx tmp1, tmp2;
19653 /* Extend HImode to SImode using a paradoxical SUBREG. */
19654 tmp1 = gen_reg_rtx (SImode);
19655 emit_move_insn (tmp1, gen_lowpart (SImode, val));
19656 /* Insert the SImode value as low element of V4SImode vector. */
19657 tmp2 = gen_reg_rtx (V4SImode);
19658 tmp1 = gen_rtx_VEC_MERGE (V4SImode,
19659 gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
19660 CONST0_RTX (V4SImode),
19661 const1_rtx);
19662 emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
19663 /* Cast the V4SImode vector back to a V8HImode vector. */
19664 tmp1 = gen_reg_rtx (V8HImode);
19665 emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
19666 /* Duplicate the low short through the whole low SImode word. */
19667 emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
19668 /* Cast the V8HImode vector back to a V4SImode vector. */
19669 tmp2 = gen_reg_rtx (V4SImode);
19670 emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
19671 /* Replicate the low element of the V4SImode vector. */
19672 emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
19673 /* Cast the V4SImode vector back to V8HImode, and store in target. */
19674 emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
19675 return true;
19676 }
19677 smode = HImode;
19678 wsmode = SImode;
19679 wvmode = V4SImode;
19680 goto widen;
19681 case V16QImode:
19682 if (TARGET_SSE2)
19683 {
19684 rtx tmp1, tmp2;
19685 /* Extend QImode to SImode using a paradoxical SUBREG. */
19686 tmp1 = gen_reg_rtx (SImode);
19687 emit_move_insn (tmp1, gen_lowpart (SImode, val));
19688 /* Insert the SImode value as low element of V4SImode vector. */
19689 tmp2 = gen_reg_rtx (V4SImode);
19690 tmp1 = gen_rtx_VEC_MERGE (V4SImode,
19691 gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
19692 CONST0_RTX (V4SImode),
19693 const1_rtx);
19694 emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
19695 /* Cast the V4SImode vector back to a V16QImode vector. */
19696 tmp1 = gen_reg_rtx (V16QImode);
19697 emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
19698 /* Duplicate the low byte through the whole low SImode word. */
19699 emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
19700 emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
19701 /* Cast the V16QImode vector back to a V4SImode vector. */
19702 tmp2 = gen_reg_rtx (V4SImode);
19703 emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
19704 /* Replicate the low element of the V4SImode vector. */
19705 emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
19706 /* Cast the V4SImode vector back to V16QImode, and store in target. */
19707 emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
19708 return true;
19709 }
19710 smode = QImode;
19711 wsmode = HImode;
19712 wvmode = V8HImode;
19713 goto widen;
19714 widen:
19715 /* Replicate the value once into the next wider mode and recurse. */
19716 val = convert_modes (wsmode, smode, val, true);
19717 x = expand_simple_binop (wsmode, ASHIFT, val,
19718 GEN_INT (GET_MODE_BITSIZE (smode)),
19719 NULL_RTX, 1, OPTAB_LIB_WIDEN);
19720 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
19721
19722 x = gen_reg_rtx (wvmode);
19723 if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
19724 gcc_unreachable ();
19725 emit_move_insn (target, gen_lowpart (mode, x));
19726 return true;
19727
19728 default:
19729 return false;
19730 }
19731 }
19732
19733 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19734 whose ONE_VAR element is VAR, and other elements are zero. Return true
19735 if successful. */
19736
19737 static bool
19738 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
19739 rtx target, rtx var, int one_var)
19740 {
19741 enum machine_mode vsimode;
19742 rtx new_target;
19743 rtx x, tmp;
19744
19745 switch (mode)
19746 {
19747 case V2SFmode:
19748 case V2SImode:
19749 if (!mmx_ok)
19750 return false;
19751 /* FALLTHRU */
19752
19753 case V2DFmode:
19754 case V2DImode:
19755 if (one_var != 0)
19756 return false;
19757 var = force_reg (GET_MODE_INNER (mode), var);
19758 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
19759 emit_insn (gen_rtx_SET (VOIDmode, target, x));
19760 return true;
19761
19762 case V4SFmode:
19763 case V4SImode:
19764 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
19765 new_target = gen_reg_rtx (mode);
19766 else
19767 new_target = target;
19768 var = force_reg (GET_MODE_INNER (mode), var);
19769 x = gen_rtx_VEC_DUPLICATE (mode, var);
19770 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
19771 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
19772 if (one_var != 0)
19773 {
19774 /* We need to shuffle the value to the correct position, so
19775 create a new pseudo to store the intermediate result. */
19776
19777 /* With SSE2, we can use the integer shuffle insns. */
19778 if (mode != V4SFmode && TARGET_SSE2)
19779 {
19780 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
19781 GEN_INT (1),
19782 GEN_INT (one_var == 1 ? 0 : 1),
19783 GEN_INT (one_var == 2 ? 0 : 1),
19784 GEN_INT (one_var == 3 ? 0 : 1)));
19785 if (target != new_target)
19786 emit_move_insn (target, new_target);
19787 return true;
19788 }
19789
19790 /* Otherwise convert the intermediate result to V4SFmode and
19791 use the SSE1 shuffle instructions. */
19792 if (mode != V4SFmode)
19793 {
19794 tmp = gen_reg_rtx (V4SFmode);
19795 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
19796 }
19797 else
19798 tmp = new_target;
19799
19800 emit_insn (gen_sse_shufps_1 (tmp, tmp, tmp,
19801 GEN_INT (1),
19802 GEN_INT (one_var == 1 ? 0 : 1),
19803 GEN_INT (one_var == 2 ? 0+4 : 1+4),
19804 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
19805
19806 if (mode != V4SFmode)
19807 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
19808 else if (tmp != target)
19809 emit_move_insn (target, tmp);
19810 }
19811 else if (target != new_target)
19812 emit_move_insn (target, new_target);
19813 return true;
19814
19815 case V8HImode:
19816 case V16QImode:
19817 vsimode = V4SImode;
19818 goto widen;
19819 case V4HImode:
19820 case V8QImode:
19821 if (!mmx_ok)
19822 return false;
19823 vsimode = V2SImode;
19824 goto widen;
19825 widen:
19826 if (one_var != 0)
19827 return false;
19828
19829 /* Zero extend the variable element to SImode and recurse. */
19830 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
19831
19832 x = gen_reg_rtx (vsimode);
19833 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
19834 var, one_var))
19835 gcc_unreachable ();
19836
19837 emit_move_insn (target, gen_lowpart (mode, x));
19838 return true;
19839
19840 default:
19841 return false;
19842 }
19843 }
19844
19845 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
19846 consisting of the values in VALS. It is known that all elements
19847 except ONE_VAR are constants. Return true if successful. */
19848
19849 static bool
19850 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
19851 rtx target, rtx vals, int one_var)
19852 {
19853 rtx var = XVECEXP (vals, 0, one_var);
19854 enum machine_mode wmode;
19855 rtx const_vec, x;
19856
19857 const_vec = copy_rtx (vals);
19858 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
19859 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
19860
19861 switch (mode)
19862 {
19863 case V2DFmode:
19864 case V2DImode:
19865 case V2SFmode:
19866 case V2SImode:
19867 /* For the two element vectors, it's just as easy to use
19868 the general case. */
19869 return false;
19870
19871 case V4SFmode:
19872 case V4SImode:
19873 case V8HImode:
19874 case V4HImode:
19875 break;
19876
19877 case V16QImode:
19878 wmode = V8HImode;
19879 goto widen;
19880 case V8QImode:
19881 wmode = V4HImode;
19882 goto widen;
19883 widen:
19884 /* There's no way to set one QImode entry easily. Combine
19885 the variable value with its adjacent constant value, and
19886 promote to an HImode set. */
19887 x = XVECEXP (vals, 0, one_var ^ 1);
19888 if (one_var & 1)
19889 {
19890 var = convert_modes (HImode, QImode, var, true);
19891 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
19892 NULL_RTX, 1, OPTAB_LIB_WIDEN);
19893 x = GEN_INT (INTVAL (x) & 0xff);
19894 }
19895 else
19896 {
19897 var = convert_modes (HImode, QImode, var, true);
19898 x = gen_int_mode (INTVAL (x) << 8, HImode);
19899 }
19900 if (x != const0_rtx)
19901 var = expand_simple_binop (HImode, IOR, var, x, var,
19902 1, OPTAB_LIB_WIDEN);
19903
19904 x = gen_reg_rtx (wmode);
19905 emit_move_insn (x, gen_lowpart (wmode, const_vec));
19906 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
19907
19908 emit_move_insn (target, gen_lowpart (mode, x));
19909 return true;
19910
19911 default:
19912 return false;
19913 }
19914
19915 emit_move_insn (target, const_vec);
19916 ix86_expand_vector_set (mmx_ok, target, var, one_var);
19917 return true;
19918 }
19919
19920 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
19921 all values variable, and none identical. */
19922
19923 static void
19924 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
19925 rtx target, rtx vals)
19926 {
19927 enum machine_mode half_mode = GET_MODE_INNER (mode);
19928 rtx op0 = NULL, op1 = NULL;
19929 bool use_vec_concat = false;
19930
19931 switch (mode)
19932 {
19933 case V2SFmode:
19934 case V2SImode:
19935 if (!mmx_ok && !TARGET_SSE)
19936 break;
19937 /* FALLTHRU */
19938
19939 case V2DFmode:
19940 case V2DImode:
19941 /* For the two element vectors, we always implement VEC_CONCAT. */
19942 op0 = XVECEXP (vals, 0, 0);
19943 op1 = XVECEXP (vals, 0, 1);
19944 use_vec_concat = true;
19945 break;
19946
19947 case V4SFmode:
19948 half_mode = V2SFmode;
19949 goto half;
19950 case V4SImode:
19951 half_mode = V2SImode;
19952 goto half;
19953 half:
19954 {
19955 rtvec v;
19956
19957 /* For V4SF and V4SI, we implement a concat of two V2 vectors.
19958 Recurse to load the two halves. */
19959
19960 op0 = gen_reg_rtx (half_mode);
19961 v = gen_rtvec (2, XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1));
19962 ix86_expand_vector_init (false, op0, gen_rtx_PARALLEL (half_mode, v));
19963
19964 op1 = gen_reg_rtx (half_mode);
19965 v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3));
19966 ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v));
19967
19968 use_vec_concat = true;
19969 }
19970 break;
19971
19972 case V8HImode:
19973 case V16QImode:
19974 case V4HImode:
19975 case V8QImode:
19976 break;
19977
19978 default:
19979 gcc_unreachable ();
19980 }
19981
19982 if (use_vec_concat)
19983 {
19984 if (!register_operand (op0, half_mode))
19985 op0 = force_reg (half_mode, op0);
19986 if (!register_operand (op1, half_mode))
19987 op1 = force_reg (half_mode, op1);
19988
19989 emit_insn (gen_rtx_SET (VOIDmode, target,
19990 gen_rtx_VEC_CONCAT (mode, op0, op1)));
19991 }
19992 else
19993 {
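/* No direct VEC_CONCAT is available: assemble each word of the vector in
   an integer register, shifting successive elements into place, then
   build the vector from those words.  */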
19994 int i, j, n_elts, n_words, n_elt_per_word;
19995 enum machine_mode inner_mode;
19996 rtx words[4], shift;
19997
19998 inner_mode = GET_MODE_INNER (mode);
19999 n_elts = GET_MODE_NUNITS (mode);
20000 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
20001 n_elt_per_word = n_elts / n_words;
20002 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
20003
20004 for (i = 0; i < n_words; ++i)
20005 {
20006 rtx word = NULL_RTX;
20007
20008 for (j = 0; j < n_elt_per_word; ++j)
20009 {
20010 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
20011 elt = convert_modes (word_mode, inner_mode, elt, true);
20012
20013 if (j == 0)
20014 word = elt;
20015 else
20016 {
20017 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
20018 word, 1, OPTAB_LIB_WIDEN);
20019 word = expand_simple_binop (word_mode, IOR, word, elt,
20020 word, 1, OPTAB_LIB_WIDEN);
20021 }
20022 }
20023
20024 words[i] = word;
20025 }
20026
20027 if (n_words == 1)
20028 emit_move_insn (target, gen_lowpart (mode, words[0]));
20029 else if (n_words == 2)
20030 {
20031 rtx tmp = gen_reg_rtx (mode);
20032 emit_insn (gen_rtx_CLOBBER (VOIDmode, tmp));
20033 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
20034 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
20035 emit_move_insn (target, tmp);
20036 }
20037 else if (n_words == 4)
20038 {
20039 rtx tmp = gen_reg_rtx (V4SImode);
20040 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
20041 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
20042 emit_move_insn (target, gen_lowpart (mode, tmp));
20043 }
20044 else
20045 gcc_unreachable ();
20046 }
20047 }
20048
20049 /* Initialize vector TARGET via VALS. Suppress the use of MMX
20050 instructions unless MMX_OK is true. */
20051
20052 void
20053 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
20054 {
20055 enum machine_mode mode = GET_MODE (target);
20056 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20057 int n_elts = GET_MODE_NUNITS (mode);
20058 int n_var = 0, one_var = -1;
20059 bool all_same = true, all_const_zero = true;
20060 int i;
20061 rtx x;
20062
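/* Examine the elements: count the variable ones (remembering the last
   such index), and note whether all elements are identical or all
   constant zero.  */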
20063 for (i = 0; i < n_elts; ++i)
20064 {
20065 x = XVECEXP (vals, 0, i);
20066 if (!CONSTANT_P (x))
20067 n_var++, one_var = i;
20068 else if (x != CONST0_RTX (inner_mode))
20069 all_const_zero = false;
20070 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
20071 all_same = false;
20072 }
20073
20074 /* Constants are best loaded from the constant pool. */
20075 if (n_var == 0)
20076 {
20077 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
20078 return;
20079 }
20080
20081 /* If all values are identical, broadcast the value. */
20082 if (all_same
20083 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
20084 XVECEXP (vals, 0, 0)))
20085 return;
20086
20087 /* Values where only one field is non-constant are best loaded from
20088 the pool and overwritten via move later. */
20089 if (n_var == 1)
20090 {
20091 if (all_const_zero
20092 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
20093 XVECEXP (vals, 0, one_var),
20094 one_var))
20095 return;
20096
20097 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
20098 return;
20099 }
20100
20101 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
20102 }
20103
20104 void
20105 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
20106 {
20107 enum machine_mode mode = GET_MODE (target);
20108 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20109 bool use_vec_merge = false;
20110 rtx tmp;
20111
20112 switch (mode)
20113 {
20114 case V2SFmode:
20115 case V2SImode:
20116 if (mmx_ok)
20117 {
20118 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
20119 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
20120 if (elt == 0)
20121 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
20122 else
20123 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
20124 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20125 return;
20126 }
20127 break;
20128
20129 case V2DFmode:
20130 case V2DImode:
20131 {
20132 rtx op0, op1;
20133
20134 /* For the two element vectors, we implement a VEC_CONCAT with
20135 the extraction of the other element. */
20136
20137 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
20138 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
20139
20140 if (elt == 0)
20141 op0 = val, op1 = tmp;
20142 else
20143 op0 = tmp, op1 = val;
20144
20145 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
20146 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20147 }
20148 return;
20149
20150 case V4SFmode:
20151 switch (elt)
20152 {
20153 case 0:
20154 use_vec_merge = true;
20155 break;
20156
20157 case 1:
20158 /* tmp = target = A B C D */
20159 tmp = copy_to_reg (target);
20160 /* target = A A B B */
20161 emit_insn (gen_sse_unpcklps (target, target, target));
20162 /* target = X A B B */
20163 ix86_expand_vector_set (false, target, val, 0);
20164 /* target = A X C D */
20165 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20166 GEN_INT (1), GEN_INT (0),
20167 GEN_INT (2+4), GEN_INT (3+4)));
20168 return;
20169
20170 case 2:
20171 /* tmp = target = A B C D */
20172 tmp = copy_to_reg (target);
20173 /* tmp = X B C D */
20174 ix86_expand_vector_set (false, tmp, val, 0);
20175 /* target = A B X D */
20176 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20177 GEN_INT (0), GEN_INT (1),
20178 GEN_INT (0+4), GEN_INT (3+4)));
20179 return;
20180
20181 case 3:
20182 /* tmp = target = A B C D */
20183 tmp = copy_to_reg (target);
20184 /* tmp = X B C D */
20185 ix86_expand_vector_set (false, tmp, val, 0);
20186 /* target = A B C X */
20187 emit_insn (gen_sse_shufps_1 (target, target, tmp,
20188 GEN_INT (0), GEN_INT (1),
20189 GEN_INT (2+4), GEN_INT (0+4)));
20190 return;
20191
20192 default:
20193 gcc_unreachable ();
20194 }
20195 break;
20196
20197 case V4SImode:
20198 /* Element 0 handled by vec_merge below. */
20199 if (elt == 0)
20200 {
20201 use_vec_merge = true;
20202 break;
20203 }
20204
20205 if (TARGET_SSE2)
20206 {
20207 /* With SSE2, use integer shuffles to swap element 0 and ELT,
20208 store into element 0, then shuffle them back. */
20209
20210 rtx order[4];
20211
20212 order[0] = GEN_INT (elt);
20213 order[1] = const1_rtx;
20214 order[2] = const2_rtx;
20215 order[3] = GEN_INT (3);
20216 order[elt] = const0_rtx;
20217
20218 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20219 order[1], order[2], order[3]));
20220
20221 ix86_expand_vector_set (false, target, val, 0);
20222
20223 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20224 order[1], order[2], order[3]));
20225 }
20226 else
20227 {
20228 /* For SSE1, we have to reuse the V4SF code. */
20229 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
20230 gen_lowpart (SFmode, val), elt);
20231 }
20232 return;
20233
20234 case V8HImode:
20235 use_vec_merge = TARGET_SSE2;
20236 break;
20237 case V4HImode:
20238 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
20239 break;
20240
20241 case V16QImode:
20242 case V8QImode:
20243 default:
20244 break;
20245 }
20246
20247 if (use_vec_merge)
20248 {
20249 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
20250 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
20251 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20252 }
20253 else
20254 {
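/* No single-instruction insertion is available: spill the vector to a
   stack temporary, overwrite the selected element in memory, and reload
   the whole vector.  */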
20255 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
20256
20257 emit_move_insn (mem, target);
20258
20259 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
20260 emit_move_insn (tmp, val);
20261
20262 emit_move_insn (target, mem);
20263 }
20264 }
20265
20266 void
20267 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
20268 {
20269 enum machine_mode mode = GET_MODE (vec);
20270 enum machine_mode inner_mode = GET_MODE_INNER (mode);
20271 bool use_vec_extr = false;
20272 rtx tmp;
20273
20274 switch (mode)
20275 {
20276 case V2SImode:
20277 case V2SFmode:
20278 if (!mmx_ok)
20279 break;
20280 /* FALLTHRU */
20281
20282 case V2DFmode:
20283 case V2DImode:
20284 use_vec_extr = true;
20285 break;
20286
20287 case V4SFmode:
20288 switch (elt)
20289 {
20290 case 0:
20291 tmp = vec;
20292 break;
20293
20294 case 1:
20295 case 3:
20296 tmp = gen_reg_rtx (mode);
20297 emit_insn (gen_sse_shufps_1 (tmp, vec, vec,
20298 GEN_INT (elt), GEN_INT (elt),
20299 GEN_INT (elt+4), GEN_INT (elt+4)));
20300 break;
20301
20302 case 2:
20303 tmp = gen_reg_rtx (mode);
20304 emit_insn (gen_sse_unpckhps (tmp, vec, vec));
20305 break;
20306
20307 default:
20308 gcc_unreachable ();
20309 }
20310 vec = tmp;
20311 use_vec_extr = true;
20312 elt = 0;
20313 break;
20314
20315 case V4SImode:
20316 if (TARGET_SSE2)
20317 {
20318 switch (elt)
20319 {
20320 case 0:
20321 tmp = vec;
20322 break;
20323
20324 case 1:
20325 case 3:
20326 tmp = gen_reg_rtx (mode);
20327 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
20328 GEN_INT (elt), GEN_INT (elt),
20329 GEN_INT (elt), GEN_INT (elt)));
20330 break;
20331
20332 case 2:
20333 tmp = gen_reg_rtx (mode);
20334 emit_insn (gen_sse2_punpckhdq (tmp, vec, vec));
20335 break;
20336
20337 default:
20338 gcc_unreachable ();
20339 }
20340 vec = tmp;
20341 use_vec_extr = true;
20342 elt = 0;
20343 }
20344 else
20345 {
20346 /* For SSE1, we have to reuse the V4SF code. */
20347 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
20348 gen_lowpart (V4SFmode, vec), elt);
20349 return;
20350 }
20351 break;
20352
20353 case V8HImode:
20354 use_vec_extr = TARGET_SSE2;
20355 break;
20356 case V4HImode:
20357 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
20358 break;
20359
20360 case V16QImode:
20361 case V8QImode:
20362 /* ??? Could extract the appropriate HImode element and shift. */
20363 default:
20364 break;
20365 }
20366
20367 if (use_vec_extr)
20368 {
20369 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
20370 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
20371
20372 /* Let the rtl optimizers know about the zero extension performed. */
20373 if (inner_mode == HImode)
20374 {
20375 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
20376 target = gen_lowpart (SImode, target);
20377 }
20378
20379 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20380 }
20381 else
20382 {
20383 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
20384
20385 emit_move_insn (mem, vec);
20386
20387 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
20388 emit_move_insn (target, tmp);
20389 }
20390 }
20391
20392 /* Expand a vector reduction on V4SFmode for SSE1. FN is the binary
20393 pattern to reduce; DEST is the destination; IN is the input vector. */
20394
20395 void
20396 ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
20397 {
20398 rtx tmp1, tmp2, tmp3;
20399
20400 tmp1 = gen_reg_rtx (V4SFmode);
20401 tmp2 = gen_reg_rtx (V4SFmode);
20402 tmp3 = gen_reg_rtx (V4SFmode);
20403
20404 emit_insn (gen_sse_movhlps (tmp1, in, in));
20405 emit_insn (fn (tmp2, tmp1, in));
20406
20407 emit_insn (gen_sse_shufps_1 (tmp3, tmp2, tmp2,
20408 GEN_INT (1), GEN_INT (1),
20409 GEN_INT (1+4), GEN_INT (1+4)));
20410 emit_insn (fn (dest, tmp2, tmp3));
20411 }
20412 \f
20413 /* Target hook for scalar_mode_supported_p. */
20414 static bool
20415 ix86_scalar_mode_supported_p (enum machine_mode mode)
20416 {
20417 if (DECIMAL_FLOAT_MODE_P (mode))
20418 return true;
20419 else
20420 return default_scalar_mode_supported_p (mode);
20421 }
20422
20423 /* Implements target hook vector_mode_supported_p. */
20424 static bool
20425 ix86_vector_mode_supported_p (enum machine_mode mode)
20426 {
20427 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
20428 return true;
20429 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
20430 return true;
20431 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
20432 return true;
20433 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
20434 return true;
20435 return false;
20436 }
20437
20438 /* Worker function for TARGET_MD_ASM_CLOBBERS.
20439
20440 We do this in the new i386 backend to maintain source compatibility
20441 with the old cc0-based compiler. */
20442
20443 static tree
20444 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
20445 tree inputs ATTRIBUTE_UNUSED,
20446 tree clobbers)
20447 {
20448 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
20449 clobbers);
20450 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
20451 clobbers);
20452 return clobbers;
20453 }
20454
20455 /* Return true if this goes in large data/bss. */
20456
20457 static bool
20458 ix86_in_large_data_p (tree exp)
20459 {
20460 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
20461 return false;
20462
20463 /* Functions are never large data. */
20464 if (TREE_CODE (exp) == FUNCTION_DECL)
20465 return false;
20466
20467 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
20468 {
20469 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
20470 if (strcmp (section, ".ldata") == 0
20471 || strcmp (section, ".lbss") == 0)
20472 return true;
20473 return false;
20474 }
20475 else
20476 {
20477 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
20478
20479 /* If this is an incomplete type with size 0, then we can't put it
20480 in data because it might be too big when completed. */
20481 if (!size || size > ix86_section_threshold)
20482 return true;
20483 }
20484
20485 return false;
20486 }
20487 static void
20488 ix86_encode_section_info (tree decl, rtx rtl, int first)
20489 {
20490 default_encode_section_info (decl, rtl, first);
20491
20492 if (TREE_CODE (decl) == VAR_DECL
20493 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
20494 && ix86_in_large_data_p (decl))
20495 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
20496 }
20497
20498 /* Worker function for REVERSE_CONDITION. */
20499
20500 enum rtx_code
20501 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
20502 {
20503 return (mode != CCFPmode && mode != CCFPUmode
20504 ? reverse_condition (code)
20505 : reverse_condition_maybe_unordered (code));
20506 }
20507
20508 /* Output code to perform an x87 FP register move, from OPERANDS[1]
20509 to OPERANDS[0]. */
20510
20511 const char *
20512 output_387_reg_move (rtx insn, rtx *operands)
20513 {
20514 if (REG_P (operands[1])
20515 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
20516 {
20517 if (REGNO (operands[0]) == FIRST_STACK_REG)
20518 return output_387_ffreep (operands, 0);
20519 return "fstp\t%y0";
20520 }
20521 if (STACK_TOP_P (operands[0]))
20522 return "fld%z1\t%y1";
20523 return "fst\t%y0";
20524 }
20525
20526 /* Output code to perform a conditional jump to LABEL, if C2 flag in
20527 FP status register is set. */
20528
20529 void
20530 ix86_emit_fp_unordered_jump (rtx label)
20531 {
20532 rtx reg = gen_reg_rtx (HImode);
20533 rtx temp;
20534
20535 emit_insn (gen_x86_fnstsw_1 (reg));
20536
20537 if (TARGET_USE_SAHF)
20538 {
20539 emit_insn (gen_x86_sahf_1 (reg));
20540
20541 temp = gen_rtx_REG (CCmode, FLAGS_REG);
20542 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
20543 }
20544 else
20545 {
20546 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
20547
20548 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
20549 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
20550 }
20551
20552 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
20553 gen_rtx_LABEL_REF (VOIDmode, label),
20554 pc_rtx);
20555 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
20556 emit_jump_insn (temp);
20557 }
20558
20559 /* Output code to perform a log1p XFmode calculation. */
20560
20561 void ix86_emit_i387_log1p (rtx op0, rtx op1)
20562 {
20563 rtx label1 = gen_label_rtx ();
20564 rtx label2 = gen_label_rtx ();
20565
20566 rtx tmp = gen_reg_rtx (XFmode);
20567 rtx tmp2 = gen_reg_rtx (XFmode);
20568
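/* fyl2xp1 is only specified for arguments smaller in magnitude than
   1 - sqrt(2)/2 (about 0.2929); for larger values compute log2 (op1 + 1)
   with fyl2x instead.  Both paths scale by the fldln2 constant to turn
   the base-2 logarithm into a natural one.  */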
20569 emit_insn (gen_absxf2 (tmp, op1));
20570 emit_insn (gen_cmpxf (tmp,
20571 CONST_DOUBLE_FROM_REAL_VALUE (
20572 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
20573 XFmode)));
20574 emit_jump_insn (gen_bge (label1));
20575
20576 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
20577 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
20578 emit_jump (label2);
20579
20580 emit_label (label1);
20581 emit_move_insn (tmp, CONST1_RTX (XFmode));
20582 emit_insn (gen_addxf3 (tmp, op1, tmp));
20583 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
20584 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
20585
20586 emit_label (label2);
20587 }
20588
20589 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
20590
20591 static void
20592 i386_solaris_elf_named_section (const char *name, unsigned int flags,
20593 tree decl)
20594 {
20595 /* With Binutils 2.15, the "@unwind" marker must be specified on
20596 every occurrence of the ".eh_frame" section, not just the first
20597 one. */
20598 if (TARGET_64BIT
20599 && strcmp (name, ".eh_frame") == 0)
20600 {
20601 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
20602 flags & SECTION_WRITE ? "aw" : "a");
20603 return;
20604 }
20605 default_elf_asm_named_section (name, flags, decl);
20606 }
20607
20608 /* Return the mangling of TYPE if it is an extended fundamental type. */
20609
20610 static const char *
20611 ix86_mangle_fundamental_type (tree type)
20612 {
20613 switch (TYPE_MODE (type))
20614 {
20615 case TFmode:
20616 /* __float128 is "g". */
20617 return "g";
20618 case XFmode:
20619 /* "long double" or __float80 is "e". */
20620 return "e";
20621 default:
20622 return NULL;
20623 }
20624 }
20625
20626 /* For 32-bit code we can save PIC register setup by using
20627 __stack_chk_fail_local hidden function instead of calling
20628 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
20629 register, so it is better to call __stack_chk_fail directly. */
20630
20631 static tree
20632 ix86_stack_protect_fail (void)
20633 {
20634 return TARGET_64BIT
20635 ? default_external_stack_protect_fail ()
20636 : default_hidden_stack_protect_fail ();
20637 }
20638
20639 /* Select a format to encode pointers in exception handling data. CODE
20640 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
20641 true if the symbol may be affected by dynamic relocations.
20642
20643 ??? All x86 object file formats are capable of representing this.
20644 After all, the relocation needed is the same as for the call insn.
20645 Whether or not a particular assembler allows us to enter such, I
20646 guess we'll have to see. */
20647 int
20648 asm_preferred_eh_data_format (int code, int global)
20649 {
20650 if (flag_pic)
20651 {
20652 int type = DW_EH_PE_sdata8;
20653 if (!TARGET_64BIT
20654 || ix86_cmodel == CM_SMALL_PIC
20655 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
20656 type = DW_EH_PE_sdata4;
20657 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
20658 }
20659 if (ix86_cmodel == CM_SMALL
20660 || (ix86_cmodel == CM_MEDIUM && code))
20661 return DW_EH_PE_udata4;
20662 return DW_EH_PE_absptr;
20663 }
20664 \f
20665 /* Expand copysign from SIGN to the positive value ABS_VALUE
20666 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
20667 the sign-bit. */
20668 static void
20669 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
20670 {
20671 enum machine_mode mode = GET_MODE (sign);
20672 rtx sgn = gen_reg_rtx (mode);
20673 if (mask == NULL_RTX)
20674 {
20675 mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
20676 if (!VECTOR_MODE_P (mode))
20677 {
20678 /* We need to generate a scalar mode mask in this case. */
20679 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20680 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20681 mask = gen_reg_rtx (mode);
20682 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
20683 }
20684 }
20685 else
20686 mask = gen_rtx_NOT (mode, mask);
20687 emit_insn (gen_rtx_SET (VOIDmode, sgn,
20688 gen_rtx_AND (mode, mask, sign)));
20689 emit_insn (gen_rtx_SET (VOIDmode, result,
20690 gen_rtx_IOR (mode, abs_value, sgn)));
20691 }
20692
20693 /* Expand fabs (OP0) and return a new rtx that holds the result. The
20694 mask for masking out the sign-bit is stored in *SMASK, if that is
20695 non-null. */
20696 static rtx
20697 ix86_expand_sse_fabs (rtx op0, rtx *smask)
20698 {
20699 enum machine_mode mode = GET_MODE (op0);
20700 rtx xa, mask;
20701
20702 xa = gen_reg_rtx (mode);
20703 mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
20704 if (!VECTOR_MODE_P (mode))
20705 {
20706 /* We need to generate a scalar mode mask in this case. */
20707 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20708 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20709 mask = gen_reg_rtx (mode);
20710 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
20711 }
20712 emit_insn (gen_rtx_SET (VOIDmode, xa,
20713 gen_rtx_AND (mode, op0, mask)));
20714
20715 if (smask)
20716 *smask = mask;
20717
20718 return xa;
20719 }
20720
20721 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
20722 swapping the operands if SWAP_OPERANDS is true. The expanded
20723 code is a forward jump to a newly created label in case the
20724 comparison is true. The generated label rtx is returned. */
20725 static rtx
20726 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
20727 bool swap_operands)
20728 {
20729 rtx label, tmp;
20730
20731 if (swap_operands)
20732 {
20733 tmp = op0;
20734 op0 = op1;
20735 op1 = tmp;
20736 }
20737
20738 label = gen_label_rtx ();
20739 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
20740 emit_insn (gen_rtx_SET (VOIDmode, tmp,
20741 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
20742 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
20743 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
20744 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
20745 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
20746 JUMP_LABEL (tmp) = label;
20747
20748 return label;
20749 }
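
/* Callers typically pass an UN* comparison code so that NaNs take the
   branch as well.  For instance the rounding expanders below test
   !isless (xa, TWO52) by emitting

     ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

   i.e. they branch when TWO52 <= xa or when the operands are
   unordered, which is exactly the negation of isless (xa, TWO52).  */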
20750
20751 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
20752 using comparison code CODE. Operands are swapped for the comparison if
20753 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
20754 static rtx
20755 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
20756 bool swap_operands)
20757 {
20758 enum machine_mode mode = GET_MODE (op0);
20759 rtx mask = gen_reg_rtx (mode);
20760
20761 if (swap_operands)
20762 {
20763 rtx tmp = op0;
20764 op0 = op1;
20765 op1 = tmp;
20766 }
20767
20768 if (mode == DFmode)
20769 emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1,
20770 gen_rtx_fmt_ee (code, mode, op0, op1)));
20771 else
20772 emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1,
20773 gen_rtx_fmt_ee (code, mode, op0, op1)));
20774
20775 return mask;
20776 }
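
/* The maskcmp patterns used here (cmpsd/cmpss) do not set flags; they
   produce an all-ones or all-zeros bit pattern in the destination,
   roughly

     mask = (op0 CODE op1) ? ~0 : 0     viewed as the raw bits of MODE

   which is why callers can AND the mask with a constant such as 1.0 to
   obtain either 1.0 or +0.0 as a compensation term.  */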
20777
20778 /* Generate and return a rtx of mode MODE for 2**n where n is the number
20779 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
20780 static rtx
20781 ix86_gen_TWO52 (enum machine_mode mode)
20782 {
20783 REAL_VALUE_TYPE TWO52r;
20784 rtx TWO52;
20785
20786 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
20787 TWO52 = const_double_from_real_value (TWO52r, mode);
20788 TWO52 = force_reg (mode, TWO52);
20789
20790 return TWO52;
20791 }
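
/* 2**52 (resp. 2**23 for SFmode) is the magnitude at which the
   mantissa can no longer represent any fraction bits, so in the
   default round-to-nearest mode adding and subtracting it rounds a
   smaller value to an integer; e.g. for DFmode

     0.75 + 2**52 == 2**52 + 1.0          (rounded to the nearest double)
     (0.75 + 2**52) - 2**52 == 1.0 == rint (0.75)

   Values of magnitude 2**52 or more are already integral.  */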
20792
20793 /* Expand SSE sequence for computing lround from OP1 storing
20794 into OP0. */
20795 void
20796 ix86_expand_lround (rtx op0, rtx op1)
20797 {
20798 /* C code for the stuff we're doing below:
20799 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
20800 return (long)tmp;
20801 */
20802 enum machine_mode mode = GET_MODE (op1);
20803 const struct real_format *fmt;
20804 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
20805 rtx adj;
20806
20807 /* load nextafter (0.5, 0.0) */
20808 fmt = REAL_MODE_FORMAT (mode);
20809 real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
20810 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
20811
20812 /* adj = copysign (nextafter (0.5, 0.0), op1) */
20813 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
20814 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
20815
20816 /* adj = op1 + adj */
20817 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
20818
20819 /* op0 = (imode)adj */
20820 expand_fix (op0, adj, 0);
20821 }
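
/* Using nextafter (0.5, 0.0) instead of an exact 0.5 matters for
   inputs just below one half: for the largest double smaller than 0.5,
   adding 0.5 rounds up to 1.0 in the addition itself and the
   truncation would return 1, whereas adding the slightly smaller
   constant keeps the sum below 1.0 so lround correctly returns 0.  */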
20822
20823 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
20824 storing into OPERAND0. */
20825 void
20826 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
20827 {
20828 /* C code for the stuff we're doing below (for do_floor):
20829 xi = (long)op1;
20830 xi -= (double)xi > op1 ? 1 : 0;
20831 return xi;
20832 */
20833 enum machine_mode fmode = GET_MODE (op1);
20834 enum machine_mode imode = GET_MODE (op0);
20835 rtx ireg, freg, label, tmp;
20836
20837 /* reg = (long)op1 */
20838 ireg = gen_reg_rtx (imode);
20839 expand_fix (ireg, op1, 0);
20840
20841 /* freg = (double)reg */
20842 freg = gen_reg_rtx (fmode);
20843 expand_float (freg, ireg, 0);
20844
20845 /* ireg = (freg > op1) ? ireg - 1 : ireg */
20846 label = ix86_expand_sse_compare_and_jump (UNLE,
20847 freg, op1, !do_floor);
20848 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
20849 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
20850 emit_move_insn (ireg, tmp);
20851
20852 emit_label (label);
20853 LABEL_NUSES (label) = 1;
20854
20855 emit_move_insn (op0, ireg);
20856 }
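
/* For do_floor == false the same sequence implements lceil, in C terms:

     xi = (long) op1;
     xi += (double) xi < op1 ? 1 : 0;
     return xi;  */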
20857
20858 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
20859 result in OPERAND0. */
20860 void
20861 ix86_expand_rint (rtx operand0, rtx operand1)
20862 {
20863 /* C code for the stuff we're doing below:
20864 xa = fabs (operand1);
20865 if (!isless (xa, 2**52))
20866 return operand1;
20867 xa = xa + 2**52 - 2**52;
20868 return copysign (xa, operand1);
20869 */
20870 enum machine_mode mode = GET_MODE (operand0);
20871 rtx res, xa, label, TWO52, mask;
20872
20873 res = gen_reg_rtx (mode);
20874 emit_move_insn (res, operand1);
20875
20876 /* xa = abs (operand1) */
20877 xa = ix86_expand_sse_fabs (res, &mask);
20878
20879 /* if (!isless (xa, TWO52)) goto label; */
20880 TWO52 = ix86_gen_TWO52 (mode);
20881 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20882
20883 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20884 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20885
20886 ix86_sse_copysign_to_positive (res, xa, res, mask);
20887
20888 emit_label (label);
20889 LABEL_NUSES (label) = 1;
20890
20891 emit_move_insn (operand0, res);
20892 }
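
/* The early exit when !isless (xa, TWO52) is essential: once
   |operand1| reaches 2**52 the value is already integral and must be
   returned unchanged, and NaNs compare unordered, so they take the
   same early exit and are passed through untouched.  */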
20893
20894 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20895 into OPERAND0 without relying on the 64-bit-only cvttsd2siq truncation. */
20896 void
20897 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
20898 {
20899 /* C code for the stuff we expand below.
20900 double xa = fabs (x), x2;
20901 if (!isless (xa, TWO52))
20902 return x;
20903 xa = xa + TWO52 - TWO52;
20904 x2 = copysign (xa, x);
20905 Compensate. Floor:
20906 if (x2 > x)
20907 x2 -= 1;
20908 Compensate. Ceil:
20909 if (x2 < x)
20910 x2 -= -1;
20911 return x2;
20912 */
20913 enum machine_mode mode = GET_MODE (operand0);
20914 rtx xa, TWO52, tmp, label, one, res, mask;
20915
20916 TWO52 = ix86_gen_TWO52 (mode);
20917
20918 /* Temporary for holding the result, initialized to the input
20919 operand to ease control flow. */
20920 res = gen_reg_rtx (mode);
20921 emit_move_insn (res, operand1);
20922
20923 /* xa = abs (operand1) */
20924 xa = ix86_expand_sse_fabs (res, &mask);
20925
20926 /* if (!isless (xa, TWO52)) goto label; */
20927 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20928
20929 /* xa = xa + TWO52 - TWO52; */
20930 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20931 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20932
20933 /* xa = copysign (xa, operand1) */
20934 ix86_sse_copysign_to_positive (xa, xa, res, mask);
20935
20936 /* generate 1.0 or -1.0 */
20937 one = force_reg (mode,
20938 const_double_from_real_value (do_floor
20939 ? dconst1 : dconstm1, mode));
20940
20941 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
20942 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
20943 emit_insn (gen_rtx_SET (VOIDmode, tmp,
20944 gen_rtx_AND (mode, one, tmp)));
20945 /* We always need to subtract here to preserve signed zero. */
20946 tmp = expand_simple_binop (mode, MINUS,
20947 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20948 emit_move_insn (res, tmp);
20949
20950 emit_label (label);
20951 LABEL_NUSES (label) = 1;
20952
20953 emit_move_insn (operand0, res);
20954 }
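
/* A short worked example of the compensation above, using the names
   from the C model:

     floor, x = 0.7:  x2 = 0.7 + 2**52 - 2**52 = 1.0   (round to nearest)
                      1.0 > 0.7, so x2 -= 1.0          -> 0.0 == floor (0.7)
     ceil,  x = 1.2:  x2 = 1.0;  1.0 < 1.2, so x2 -= -1.0 -> 2.0 == ceil (1.2)  */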
20955
20956 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20957 into OPERAND0. */
20958 void
20959 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
20960 {
20961 /* C code for the stuff we expand below.
20962 double xa = fabs (x), x2;
20963 if (!isless (xa, TWO52))
20964 return x;
20965 x2 = (double)(long)x;
20966 Compensate. Floor:
20967 if (x2 > x)
20968 x2 -= 1;
20969 Compensate. Ceil:
20970 if (x2 < x)
20971 x2 += 1;
20972 if (HONOR_SIGNED_ZEROS (mode))
20973 return copysign (x2, x);
20974 return x2;
20975 */
20976 enum machine_mode mode = GET_MODE (operand0);
20977 rtx xa, xi, TWO52, tmp, label, one, res, mask;
20978
20979 TWO52 = ix86_gen_TWO52 (mode);
20980
20981 /* Temporary for holding the result, initialized to the input
20982 operand to ease control flow. */
20983 res = gen_reg_rtx (mode);
20984 emit_move_insn (res, operand1);
20985
20986 /* xa = abs (operand1) */
20987 xa = ix86_expand_sse_fabs (res, &mask);
20988
20989 /* if (!isless (xa, TWO52)) goto label; */
20990 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20991
20992 /* xa = (double)(long)x */
20993 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
20994 expand_fix (xi, res, 0);
20995 expand_float (xa, xi, 0);
20996
20997 /* generate 1.0 */
20998 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
20999
21000 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
21001 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
21002 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21003 gen_rtx_AND (mode, one, tmp)));
21004 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
21005 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21006 emit_move_insn (res, tmp);
21007
21008 if (HONOR_SIGNED_ZEROS (mode))
21009 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21010
21011 emit_label (label);
21012 LABEL_NUSES (label) = 1;
21013
21014 emit_move_insn (operand0, res);
21015 }
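
/* The final copysign is what makes the signed-zero case come out
   right: for floor (-0.0) the convert-and-back path yields +0.0, and
   only the copysign against operand1 restores -0.0.  Ordinary negative
   inputs are handled by the compensation itself, e.g. floor (-0.3):
   (double)(long)-0.3 == 0.0, 0.0 > -0.3, so the result becomes -1.0.  */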
21016
21017 /* Expand SSE sequence for computing round from OPERAND1 storing
21018 into OPERAND0. Sequence that works without relying on DImode truncation
21019 via cvttsd2siq that is only available on 64bit targets. */
21020 void
21021 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
21022 {
21023 /* C code for the stuff we expand below.
21024 double xa = fabs (x), xa2, dxa, x2;
21025 if (!isless (xa, TWO52))
21026 return x;
21027 Using the absolute value and copying back sign makes
21028 -0.0 -> -0.0 correct.
21029 xa2 = xa + TWO52 - TWO52;
21030 Compensate.
21031 dxa = xa2 - xa;
21032 if (dxa <= -0.5)
21033 xa2 += 1;
21034 else if (dxa > 0.5)
21035 xa2 -= 1;
21036 x2 = copysign (xa2, x);
21037 return x2;
21038 */
21039 enum machine_mode mode = GET_MODE (operand0);
21040 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
21041
21042 TWO52 = ix86_gen_TWO52 (mode);
21043
21044 /* Temporary for holding the result, initialized to the input
21045 operand to ease control flow. */
21046 res = gen_reg_rtx (mode);
21047 emit_move_insn (res, operand1);
21048
21049 /* xa = abs (operand1) */
21050 xa = ix86_expand_sse_fabs (res, &mask);
21051
21052 /* if (!isless (xa, TWO52)) goto label; */
21053 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21054
21055 /* xa2 = xa + TWO52 - TWO52; */
21056 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21057 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
21058
21059 /* dxa = xa2 - xa; */
21060 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
21061
21062 /* generate 0.5, 1.0 and -0.5 */
21063 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
21064 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
21065 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
21066 0, OPTAB_DIRECT);
21067
21068 /* Compensate. */
21070 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
21071 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
21072 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21073 gen_rtx_AND (mode, one, tmp)));
21074 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21075 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
21076 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
21077 emit_insn (gen_rtx_SET (VOIDmode, tmp,
21078 gen_rtx_AND (mode, one, tmp)));
21079 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21080
21081 /* res = copysign (xa2, operand1) */
21082 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
21083
21084 emit_label (label);
21085 LABEL_NUSES (label) = 1;
21086
21087 emit_move_insn (operand0, res);
21088 }
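
/* Worked example of the dxa compensation above; without it,
   round-to-nearest-even would send halfway cases the wrong way:

     x = 2.5:  xa2 = 2.5 + 2**52 - 2**52 = 2.0    (ties to even)
               dxa = 2.0 - 2.5 = -0.5 <= -0.5, so xa2 += 1.0 -> 3.0
               copysign (3.0, 2.5) == 3.0 == round (2.5)  */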
21089
21090 /* Expand SSE sequence for computing trunc from OPERAND1 storing
21091 into OPERAND0. */
21092 void
21093 ix86_expand_trunc (rtx operand0, rtx operand1)
21094 {
21095 /* C code for SSE variant we expand below.
21096 double xa = fabs (x), x2;
21097 if (!isless (xa, TWO52))
21098 return x;
21099 x2 = (double)(long)x;
21100 if (HONOR_SIGNED_ZEROS (mode))
21101 return copysign (x2, x);
21102 return x2;
21103 */
21104 enum machine_mode mode = GET_MODE (operand0);
21105 rtx xa, xi, TWO52, label, res, mask;
21106
21107 TWO52 = ix86_gen_TWO52 (mode);
21108
21109 /* Temporary for holding the result, initialized to the input
21110 operand to ease control flow. */
21111 res = gen_reg_rtx (mode);
21112 emit_move_insn (res, operand1);
21113
21114 /* xa = abs (operand1) */
21115 xa = ix86_expand_sse_fabs (res, &mask);
21116
21117 /* if (!isless (xa, TWO52)) goto label; */
21118 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21119
21120 /* x = (double)(long)x */
21121 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21122 expand_fix (xi, res, 0);
21123 expand_float (res, xi, 0);
21124
21125 if (HONOR_SIGNED_ZEROS (mode))
21126 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21127
21128 emit_label (label);
21129 LABEL_NUSES (label) = 1;
21130
21131 emit_move_insn (operand0, res);
21132 }
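
/* No compensation step is needed here: expand_fix uses the truncating
   cvttsd2si/cvttss2si conversions, which already round toward zero, so
   converting to an integer and back gives trunc () directly for every
   value small enough to reach this point.  */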
21133
21134 /* Expand SSE sequence for computing trunc from OPERAND1 storing
21135 into OPERAND0 without relying on the 64-bit-only cvttsd2siq truncation. */
21136 void
21137 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
21138 {
21139 enum machine_mode mode = GET_MODE (operand0);
21140 rtx xa, mask, TWO52, label, one, res, smask, tmp;
21141
21142 /* C code for SSE variant we expand below.
21143 double xa = fabs (x), xa2, x2;
21144 if (!isless (xa, TWO52))
21145 return x;
21146 xa2 = xa + TWO52 - TWO52;
21147 Compensate:
21148 if (xa2 > xa)
21149 xa2 -= 1.0;
21150 x2 = copysign (xa2, x);
21151 return x2;
21152 */
21153
21154 TWO52 = ix86_gen_TWO52 (mode);
21155
21156 /* Temporary for holding the result, initialized to the input
21157 operand to ease control flow. */
21158 res = gen_reg_rtx (mode);
21159 emit_move_insn (res, operand1);
21160
21161 /* xa = abs (operand1) */
21162 xa = ix86_expand_sse_fabs (res, &smask);
21163
21164 /* if (!isless (xa, TWO52)) goto label; */
21165 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21166
21167 /* res = xa + TWO52 - TWO52; */
21168 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21169 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
21170 emit_move_insn (res, tmp);
21171
21172 /* generate 1.0 */
21173 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21174
21175 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
21176 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
21177 emit_insn (gen_rtx_SET (VOIDmode, mask,
21178 gen_rtx_AND (mode, mask, one)));
21179 tmp = expand_simple_binop (mode, MINUS,
21180 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
21181 emit_move_insn (res, tmp);
21182
21183 /* res = copysign (res, operand1) */
21184 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
21185
21186 emit_label (label);
21187 LABEL_NUSES (label) = 1;
21188
21189 emit_move_insn (operand0, res);
21190 }
21191
21192 /* Expand SSE sequence for computing round from OPERAND1 storing
21193 into OPERAND0. */
21194 void
21195 ix86_expand_round (rtx operand0, rtx operand1)
21196 {
21197 /* C code for the stuff we're doing below:
21198 double xa = fabs (x);
21199 if (!isless (xa, TWO52))
21200 return x;
21201 xa = (double)(long)(xa + nextafter (0.5, 0.0));
21202 return copysign (xa, x);
21203 */
21204 enum machine_mode mode = GET_MODE (operand0);
21205 rtx res, TWO52, xa, label, xi, half, mask;
21206 const struct real_format *fmt;
21207 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21208
21209 /* Temporary for holding the result, initialized to the input
21210 operand to ease control flow. */
21211 res = gen_reg_rtx (mode);
21212 emit_move_insn (res, operand1);
21213
21214 TWO52 = ix86_gen_TWO52 (mode);
21215 xa = ix86_expand_sse_fabs (res, &mask);
21216 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21217
21218 /* load nextafter (0.5, 0.0) */
21219 fmt = REAL_MODE_FORMAT (mode);
21220 real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
21221 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
21222
21223 /* xa = xa + nextafter (0.5, 0.0) */
21224 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
21225 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
21226
21227 /* xa = (double)(int64_t)xa */
21228 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21229 expand_fix (xi, xa, 0);
21230 expand_float (xa, xi, 0);
21231
21232 /* res = copysign (xa, operand1) */
21233 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
21234
21235 emit_label (label);
21236 LABEL_NUSES (label) = 1;
21237
21238 emit_move_insn (operand0, res);
21239 }
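
/* Worked example of the sequence above:

     x = 2.5:   xa = 2.5;  2.5 + nextafter (0.5, 0.0) rounds to 3.0
                (long) 3.0 == 3, (double) 3 == 3.0
                copysign (3.0, 2.5) == 3.0 == round (2.5)

   For x = -2.5 the same path runs on xa = 2.5 and the final copysign
   produces -3.0.  */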
21240
21241 #include "gt-i386.h"