1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
4 Free Software Foundation, Inc.
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
12
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "tm.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "tm_p.h"
29 #include "regs.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
33 #include "output.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
36 #include "flags.h"
37 #include "except.h"
38 #include "function.h"
39 #include "recog.h"
40 #include "expr.h"
41 #include "optabs.h"
42 #include "diagnostic-core.h"
43 #include "toplev.h"
44 #include "basic-block.h"
45 #include "ggc.h"
46 #include "target.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
50 #include "reload.h"
51 #include "cgraph.h"
52 #include "gimple.h"
53 #include "dwarf2.h"
54 #include "df.h"
55 #include "tm-constrs.h"
56 #include "params.h"
57 #include "cselib.h"
58 #include "debug.h"
59 #include "sched-int.h"
60 #include "sbitmap.h"
61 #include "fibheap.h"
62 #include "opts.h"
63 #include "diagnostic.h"
64 #include "dumpfile.h"
65 #include "tree-pass.h"
66 #include "tree-flow.h"
67
68 static rtx legitimize_dllimport_symbol (rtx, bool);
69
70 #ifndef CHECK_STACK_LIMIT
71 #define CHECK_STACK_LIMIT (-1)
72 #endif
73
74 /* Return index of given mode in mult and division cost tables. */
75 #define MODE_INDEX(mode) \
76 ((mode) == QImode ? 0 \
77 : (mode) == HImode ? 1 \
78 : (mode) == SImode ? 2 \
79 : (mode) == DImode ? 3 \
80 : 4)
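/* So, for example, MODE_INDEX (SImode) evaluates to 2 and MODE_INDEX (TImode)
   to 4, i.e. the "SI" and the trailing "other" slots of the per-mode
   multiply and divide cost arrays in the processor_costs tables below.  */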
81
82 /* Processor costs (relative to an add) */
83 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
84 #define COSTS_N_BYTES(N) ((N) * 2)
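/* Under that assumption COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1): a
   two-byte instruction (the assumed size of an addition) is costed like one
   average insn, which keeps byte-based and insn-based costs on a comparable
   scale when tuning for size.  */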
85
86 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
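/* A note on reading the stringop descriptors used below (see struct
   stringop_algs in i386.h): the first field names the algorithm used when
   the block size is unknown at compile time, followed by {max_size, algorithm}
   pairs tried in order, with -1 standing for "all remaining sizes".
   DUMMY_STRINGOP_ALGS therefore means "always fall back to a library call";
   it appears to serve purely as a placeholder for table slots a given tuning
   never exercises.  */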
87
88 const
89 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
90 COSTS_N_BYTES (2), /* cost of an add instruction */
91 COSTS_N_BYTES (3), /* cost of a lea instruction */
92 COSTS_N_BYTES (2), /* variable shift costs */
93 COSTS_N_BYTES (3), /* constant shift costs */
94 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
95 COSTS_N_BYTES (3), /* HI */
96 COSTS_N_BYTES (3), /* SI */
97 COSTS_N_BYTES (3), /* DI */
98 COSTS_N_BYTES (5)}, /* other */
99 0, /* cost of multiply per each bit set */
100 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
101 COSTS_N_BYTES (3), /* HI */
102 COSTS_N_BYTES (3), /* SI */
103 COSTS_N_BYTES (3), /* DI */
104 COSTS_N_BYTES (5)}, /* other */
105 COSTS_N_BYTES (3), /* cost of movsx */
106 COSTS_N_BYTES (3), /* cost of movzx */
107 0, /* "large" insn */
108 2, /* MOVE_RATIO */
109 2, /* cost for loading QImode using movzbl */
110 {2, 2, 2}, /* cost of loading integer registers
111 in QImode, HImode and SImode.
112 Relative to reg-reg move (2). */
113 {2, 2, 2}, /* cost of storing integer registers */
114 2, /* cost of reg,reg fld/fst */
115 {2, 2, 2}, /* cost of loading fp registers
116 in SFmode, DFmode and XFmode */
117 {2, 2, 2}, /* cost of storing fp registers
118 in SFmode, DFmode and XFmode */
119 3, /* cost of moving MMX register */
120 {3, 3}, /* cost of loading MMX registers
121 in SImode and DImode */
122 {3, 3}, /* cost of storing MMX registers
123 in SImode and DImode */
124 3, /* cost of moving SSE register */
125 {3, 3, 3}, /* cost of loading SSE registers
126 in SImode, DImode and TImode */
127 {3, 3, 3}, /* cost of storing SSE registers
128 in SImode, DImode and TImode */
129 3, /* MMX or SSE register to integer */
130 0, /* size of l1 cache */
131 0, /* size of l2 cache */
132 0, /* size of prefetch block */
133 0, /* number of parallel prefetches */
134 2, /* Branch cost */
135 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
136 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
137 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
138 COSTS_N_BYTES (2), /* cost of FABS instruction. */
139 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
140 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
143 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
145 1, /* scalar_stmt_cost. */
146 1, /* scalar load_cost. */
147 1, /* scalar_store_cost. */
148 1, /* vec_stmt_cost. */
149 1, /* vec_to_scalar_cost. */
150 1, /* scalar_to_vec_cost. */
151 1, /* vec_align_load_cost. */
152 1, /* vec_unalign_load_cost. */
153 1, /* vec_store_cost. */
154 1, /* cond_taken_branch_cost. */
155 1, /* cond_not_taken_branch_cost. */
156 };
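/* The remaining cost tables fill in the same fields in the same order, once
   per supported CPU; unlike ix86_size_cost they are expressed with
   COSTS_N_INSNS, i.e. in (approximate) time rather than code size.  */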
157
158 /* Processor costs (relative to an add) */
159 static const
160 struct processor_costs i386_cost = { /* 386 specific costs */
161 COSTS_N_INSNS (1), /* cost of an add instruction */
162 COSTS_N_INSNS (1), /* cost of a lea instruction */
163 COSTS_N_INSNS (3), /* variable shift costs */
164 COSTS_N_INSNS (2), /* constant shift costs */
165 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
166 COSTS_N_INSNS (6), /* HI */
167 COSTS_N_INSNS (6), /* SI */
168 COSTS_N_INSNS (6), /* DI */
169 COSTS_N_INSNS (6)}, /* other */
170 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
171 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
172 COSTS_N_INSNS (23), /* HI */
173 COSTS_N_INSNS (23), /* SI */
174 COSTS_N_INSNS (23), /* DI */
175 COSTS_N_INSNS (23)}, /* other */
176 COSTS_N_INSNS (3), /* cost of movsx */
177 COSTS_N_INSNS (2), /* cost of movzx */
178 15, /* "large" insn */
179 3, /* MOVE_RATIO */
180 4, /* cost for loading QImode using movzbl */
181 {2, 4, 2}, /* cost of loading integer registers
182 in QImode, HImode and SImode.
183 Relative to reg-reg move (2). */
184 {2, 4, 2}, /* cost of storing integer registers */
185 2, /* cost of reg,reg fld/fst */
186 {8, 8, 8}, /* cost of loading fp registers
187 in SFmode, DFmode and XFmode */
188 {8, 8, 8}, /* cost of storing fp registers
189 in SFmode, DFmode and XFmode */
190 2, /* cost of moving MMX register */
191 {4, 8}, /* cost of loading MMX registers
192 in SImode and DImode */
193 {4, 8}, /* cost of storing MMX registers
194 in SImode and DImode */
195 2, /* cost of moving SSE register */
196 {4, 8, 16}, /* cost of loading SSE registers
197 in SImode, DImode and TImode */
198 {4, 8, 16}, /* cost of storing SSE registers
199 in SImode, DImode and TImode */
200 3, /* MMX or SSE register to integer */
201 0, /* size of l1 cache */
202 0, /* size of l2 cache */
203 0, /* size of prefetch block */
204 0, /* number of parallel prefetches */
205 1, /* Branch cost */
206 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
207 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
208 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
209 COSTS_N_INSNS (22), /* cost of FABS instruction. */
210 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
211 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
213 DUMMY_STRINGOP_ALGS},
214 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
215 DUMMY_STRINGOP_ALGS},
216 1, /* scalar_stmt_cost. */
217 1, /* scalar load_cost. */
218 1, /* scalar_store_cost. */
219 1, /* vec_stmt_cost. */
220 1, /* vec_to_scalar_cost. */
221 1, /* scalar_to_vec_cost. */
222 1, /* vec_align_load_cost. */
223 2, /* vec_unalign_load_cost. */
224 1, /* vec_store_cost. */
225 3, /* cond_taken_branch_cost. */
226 1, /* cond_not_taken_branch_cost. */
227 };
228
229 static const
230 struct processor_costs i486_cost = { /* 486 specific costs */
231 COSTS_N_INSNS (1), /* cost of an add instruction */
232 COSTS_N_INSNS (1), /* cost of a lea instruction */
233 COSTS_N_INSNS (3), /* variable shift costs */
234 COSTS_N_INSNS (2), /* constant shift costs */
235 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
236 COSTS_N_INSNS (12), /* HI */
237 COSTS_N_INSNS (12), /* SI */
238 COSTS_N_INSNS (12), /* DI */
239 COSTS_N_INSNS (12)}, /* other */
240 1, /* cost of multiply per each bit set */
241 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
242 COSTS_N_INSNS (40), /* HI */
243 COSTS_N_INSNS (40), /* SI */
244 COSTS_N_INSNS (40), /* DI */
245 COSTS_N_INSNS (40)}, /* other */
246 COSTS_N_INSNS (3), /* cost of movsx */
247 COSTS_N_INSNS (2), /* cost of movzx */
248 15, /* "large" insn */
249 3, /* MOVE_RATIO */
250 4, /* cost for loading QImode using movzbl */
251 {2, 4, 2}, /* cost of loading integer registers
252 in QImode, HImode and SImode.
253 Relative to reg-reg move (2). */
254 {2, 4, 2}, /* cost of storing integer registers */
255 2, /* cost of reg,reg fld/fst */
256 {8, 8, 8}, /* cost of loading fp registers
257 in SFmode, DFmode and XFmode */
258 {8, 8, 8}, /* cost of storing fp registers
259 in SFmode, DFmode and XFmode */
260 2, /* cost of moving MMX register */
261 {4, 8}, /* cost of loading MMX registers
262 in SImode and DImode */
263 {4, 8}, /* cost of storing MMX registers
264 in SImode and DImode */
265 2, /* cost of moving SSE register */
266 {4, 8, 16}, /* cost of loading SSE registers
267 in SImode, DImode and TImode */
268 {4, 8, 16}, /* cost of storing SSE registers
269 in SImode, DImode and TImode */
270 3, /* MMX or SSE register to integer */
271 4, /* size of l1 cache. 486 has 8kB cache
272 shared for code and data, so 4kB is
273 not really precise. */
274 4, /* size of l2 cache */
275 0, /* size of prefetch block */
276 0, /* number of parallel prefetches */
277 1, /* Branch cost */
278 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
279 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
280 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
281 COSTS_N_INSNS (3), /* cost of FABS instruction. */
282 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
283 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
285 DUMMY_STRINGOP_ALGS},
286 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
287 DUMMY_STRINGOP_ALGS},
288 1, /* scalar_stmt_cost. */
289 1, /* scalar load_cost. */
290 1, /* scalar_store_cost. */
291 1, /* vec_stmt_cost. */
292 1, /* vec_to_scalar_cost. */
293 1, /* scalar_to_vec_cost. */
294 1, /* vec_align_load_cost. */
295 2, /* vec_unalign_load_cost. */
296 1, /* vec_store_cost. */
297 3, /* cond_taken_branch_cost. */
298 1, /* cond_not_taken_branch_cost. */
299 };
300
301 static const
302 struct processor_costs pentium_cost = {
303 COSTS_N_INSNS (1), /* cost of an add instruction */
304 COSTS_N_INSNS (1), /* cost of a lea instruction */
305 COSTS_N_INSNS (4), /* variable shift costs */
306 COSTS_N_INSNS (1), /* constant shift costs */
307 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
308 COSTS_N_INSNS (11), /* HI */
309 COSTS_N_INSNS (11), /* SI */
310 COSTS_N_INSNS (11), /* DI */
311 COSTS_N_INSNS (11)}, /* other */
312 0, /* cost of multiply per each bit set */
313 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
314 COSTS_N_INSNS (25), /* HI */
315 COSTS_N_INSNS (25), /* SI */
316 COSTS_N_INSNS (25), /* DI */
317 COSTS_N_INSNS (25)}, /* other */
318 COSTS_N_INSNS (3), /* cost of movsx */
319 COSTS_N_INSNS (2), /* cost of movzx */
320 8, /* "large" insn */
321 6, /* MOVE_RATIO */
322 6, /* cost for loading QImode using movzbl */
323 {2, 4, 2}, /* cost of loading integer registers
324 in QImode, HImode and SImode.
325 Relative to reg-reg move (2). */
326 {2, 4, 2}, /* cost of storing integer registers */
327 2, /* cost of reg,reg fld/fst */
328 {2, 2, 6}, /* cost of loading fp registers
329 in SFmode, DFmode and XFmode */
330 {4, 4, 6}, /* cost of storing fp registers
331 in SFmode, DFmode and XFmode */
332 8, /* cost of moving MMX register */
333 {8, 8}, /* cost of loading MMX registers
334 in SImode and DImode */
335 {8, 8}, /* cost of storing MMX registers
336 in SImode and DImode */
337 2, /* cost of moving SSE register */
338 {4, 8, 16}, /* cost of loading SSE registers
339 in SImode, DImode and TImode */
340 {4, 8, 16}, /* cost of storing SSE registers
341 in SImode, DImode and TImode */
342 3, /* MMX or SSE register to integer */
343 8, /* size of l1 cache. */
344 8, /* size of l2 cache */
345 0, /* size of prefetch block */
346 0, /* number of parallel prefetches */
347 2, /* Branch cost */
348 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
349 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
350 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
351 COSTS_N_INSNS (1), /* cost of FABS instruction. */
352 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
353 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
354 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
355 DUMMY_STRINGOP_ALGS},
356 {{libcall, {{-1, rep_prefix_4_byte}}},
357 DUMMY_STRINGOP_ALGS},
358 1, /* scalar_stmt_cost. */
359 1, /* scalar load_cost. */
360 1, /* scalar_store_cost. */
361 1, /* vec_stmt_cost. */
362 1, /* vec_to_scalar_cost. */
363 1, /* scalar_to_vec_cost. */
364 1, /* vec_align_load_cost. */
365 2, /* vec_unalign_load_cost. */
366 1, /* vec_store_cost. */
367 3, /* cond_taken_branch_cost. */
368 1, /* cond_not_taken_branch_cost. */
369 };
370
371 static const
372 struct processor_costs pentiumpro_cost = {
373 COSTS_N_INSNS (1), /* cost of an add instruction */
374 COSTS_N_INSNS (1), /* cost of a lea instruction */
375 COSTS_N_INSNS (1), /* variable shift costs */
376 COSTS_N_INSNS (1), /* constant shift costs */
377 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
378 COSTS_N_INSNS (4), /* HI */
379 COSTS_N_INSNS (4), /* SI */
380 COSTS_N_INSNS (4), /* DI */
381 COSTS_N_INSNS (4)}, /* other */
382 0, /* cost of multiply per each bit set */
383 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
384 COSTS_N_INSNS (17), /* HI */
385 COSTS_N_INSNS (17), /* SI */
386 COSTS_N_INSNS (17), /* DI */
387 COSTS_N_INSNS (17)}, /* other */
388 COSTS_N_INSNS (1), /* cost of movsx */
389 COSTS_N_INSNS (1), /* cost of movzx */
390 8, /* "large" insn */
391 6, /* MOVE_RATIO */
392 2, /* cost for loading QImode using movzbl */
393 {4, 4, 4}, /* cost of loading integer registers
394 in QImode, HImode and SImode.
395 Relative to reg-reg move (2). */
396 {2, 2, 2}, /* cost of storing integer registers */
397 2, /* cost of reg,reg fld/fst */
398 {2, 2, 6}, /* cost of loading fp registers
399 in SFmode, DFmode and XFmode */
400 {4, 4, 6}, /* cost of storing fp registers
401 in SFmode, DFmode and XFmode */
402 2, /* cost of moving MMX register */
403 {2, 2}, /* cost of loading MMX registers
404 in SImode and DImode */
405 {2, 2}, /* cost of storing MMX registers
406 in SImode and DImode */
407 2, /* cost of moving SSE register */
408 {2, 2, 8}, /* cost of loading SSE registers
409 in SImode, DImode and TImode */
410 {2, 2, 8}, /* cost of storing SSE registers
411 in SImode, DImode and TImode */
412 3, /* MMX or SSE register to integer */
413 8, /* size of l1 cache. */
414 256, /* size of l2 cache */
415 32, /* size of prefetch block */
416 6, /* number of parallel prefetches */
417 2, /* Branch cost */
418 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
419 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
420 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
421 COSTS_N_INSNS (2), /* cost of FABS instruction. */
422 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
423 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
424 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
425 (we ensure the alignment). For small blocks an inline loop is still a
426 noticeable win; for bigger blocks either rep movsl or rep movsb is the
427 way to go. Rep movsb apparently has a more expensive startup time in the
428 CPU, but after 4K the difference is down in the noise. */
429 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
430 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
431 DUMMY_STRINGOP_ALGS},
432 {{rep_prefix_4_byte, {{1024, unrolled_loop},
433 {8192, rep_prefix_4_byte}, {-1, libcall}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
447
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
479
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
518
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
591
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon handles the REP prefix better (relative to loops)
646 than K8 does. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
664
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set the number of simultaneous prefetches
712 to a large constant to reflect this (it is probably not a good idea to leave
713 the number of prefetches unlimited, as their execution also takes some
714 time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat inline code considerably. */
726 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
727 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
728 {{libcall, {{8, loop}, {24, unrolled_loop},
729 {2048, rep_prefix_4_byte}, {-1, libcall}}},
730 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
731 4, /* scalar_stmt_cost. */
732 2, /* scalar load_cost. */
733 2, /* scalar_store_cost. */
734 5, /* vec_stmt_cost. */
735 0, /* vec_to_scalar_cost. */
736 2, /* scalar_to_vec_cost. */
737 2, /* vec_align_load_cost. */
738 3, /* vec_unalign_load_cost. */
739 3, /* vec_store_cost. */
740 3, /* cond_taken_branch_cost. */
741 2, /* cond_not_taken_branch_cost. */
742 };
743
744 struct processor_costs amdfam10_cost = {
745 COSTS_N_INSNS (1), /* cost of an add instruction */
746 COSTS_N_INSNS (2), /* cost of a lea instruction */
747 COSTS_N_INSNS (1), /* variable shift costs */
748 COSTS_N_INSNS (1), /* constant shift costs */
749 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
750 COSTS_N_INSNS (4), /* HI */
751 COSTS_N_INSNS (3), /* SI */
752 COSTS_N_INSNS (4), /* DI */
753 COSTS_N_INSNS (5)}, /* other */
754 0, /* cost of multiply per each bit set */
755 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
756 COSTS_N_INSNS (35), /* HI */
757 COSTS_N_INSNS (51), /* SI */
758 COSTS_N_INSNS (83), /* DI */
759 COSTS_N_INSNS (83)}, /* other */
760 COSTS_N_INSNS (1), /* cost of movsx */
761 COSTS_N_INSNS (1), /* cost of movzx */
762 8, /* "large" insn */
763 9, /* MOVE_RATIO */
764 4, /* cost for loading QImode using movzbl */
765 {3, 4, 3}, /* cost of loading integer registers
766 in QImode, HImode and SImode.
767 Relative to reg-reg move (2). */
768 {3, 4, 3}, /* cost of storing integer registers */
769 4, /* cost of reg,reg fld/fst */
770 {4, 4, 12}, /* cost of loading fp registers
771 in SFmode, DFmode and XFmode */
772 {6, 6, 8}, /* cost of storing fp registers
773 in SFmode, DFmode and XFmode */
774 2, /* cost of moving MMX register */
775 {3, 3}, /* cost of loading MMX registers
776 in SImode and DImode */
777 {4, 4}, /* cost of storing MMX registers
778 in SImode and DImode */
779 2, /* cost of moving SSE register */
780 {4, 4, 3}, /* cost of loading SSE registers
781 in SImode, DImode and TImode */
782 {4, 4, 5}, /* cost of storing SSE registers
783 in SImode, DImode and TImode */
784 3, /* MMX or SSE register to integer */
785 /* On K8:
786 MOVD reg64, xmmreg Double FSTORE 4
787 MOVD reg32, xmmreg Double FSTORE 4
788 On AMDFAM10:
789 MOVD reg64, xmmreg Double FADD 3
790 1/1 1/1
791 MOVD reg32, xmmreg Double FADD 3
792 1/1 1/1 */
793 64, /* size of l1 cache. */
794 512, /* size of l2 cache. */
795 64, /* size of prefetch block */
796 /* New AMD processors never drop prefetches; if they cannot be performed
797 immediately, they are queued. We set the number of simultaneous prefetches
798 to a large constant to reflect this (it is probably not a good idea to leave
799 the number of prefetches unlimited, as their execution also takes some
800 time). */
801 100, /* number of parallel prefetches */
802 2, /* Branch cost */
803 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
804 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
805 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
806 COSTS_N_INSNS (2), /* cost of FABS instruction. */
807 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
808 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
809
810 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
811 very small blocks it is better to use a loop. For large blocks, a libcall can
812 do nontemporal accesses and beat inline code considerably. */
813 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
814 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
815 {{libcall, {{8, loop}, {24, unrolled_loop},
816 {2048, rep_prefix_4_byte}, {-1, libcall}}},
817 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
818 4, /* scalar_stmt_cost. */
819 2, /* scalar load_cost. */
820 2, /* scalar_store_cost. */
821 6, /* vec_stmt_cost. */
822 0, /* vec_to_scalar_cost. */
823 2, /* scalar_to_vec_cost. */
824 2, /* vec_align_load_cost. */
825 2, /* vec_unalign_load_cost. */
826 2, /* vec_store_cost. */
827 2, /* cond_taken_branch_cost. */
828 1, /* cond_not_taken_branch_cost. */
829 };
830
831 struct processor_costs bdver1_cost = {
832 COSTS_N_INSNS (1), /* cost of an add instruction */
833 COSTS_N_INSNS (1), /* cost of a lea instruction */
834 COSTS_N_INSNS (1), /* variable shift costs */
835 COSTS_N_INSNS (1), /* constant shift costs */
836 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
837 COSTS_N_INSNS (4), /* HI */
838 COSTS_N_INSNS (4), /* SI */
839 COSTS_N_INSNS (6), /* DI */
840 COSTS_N_INSNS (6)}, /* other */
841 0, /* cost of multiply per each bit set */
842 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
843 COSTS_N_INSNS (35), /* HI */
844 COSTS_N_INSNS (51), /* SI */
845 COSTS_N_INSNS (83), /* DI */
846 COSTS_N_INSNS (83)}, /* other */
847 COSTS_N_INSNS (1), /* cost of movsx */
848 COSTS_N_INSNS (1), /* cost of movzx */
849 8, /* "large" insn */
850 9, /* MOVE_RATIO */
851 4, /* cost for loading QImode using movzbl */
852 {5, 5, 4}, /* cost of loading integer registers
853 in QImode, HImode and SImode.
854 Relative to reg-reg move (2). */
855 {4, 4, 4}, /* cost of storing integer registers */
856 2, /* cost of reg,reg fld/fst */
857 {5, 5, 12}, /* cost of loading fp registers
858 in SFmode, DFmode and XFmode */
859 {4, 4, 8}, /* cost of storing fp registers
860 in SFmode, DFmode and XFmode */
861 2, /* cost of moving MMX register */
862 {4, 4}, /* cost of loading MMX registers
863 in SImode and DImode */
864 {4, 4}, /* cost of storing MMX registers
865 in SImode and DImode */
866 2, /* cost of moving SSE register */
867 {4, 4, 4}, /* cost of loading SSE registers
868 in SImode, DImode and TImode */
869 {4, 4, 4}, /* cost of storing SSE registers
870 in SImode, DImode and TImode */
871 2, /* MMX or SSE register to integer */
872 /* On K8:
873 MOVD reg64, xmmreg Double FSTORE 4
874 MOVD reg32, xmmreg Double FSTORE 4
875 On AMDFAM10:
876 MOVD reg64, xmmreg Double FADD 3
877 1/1 1/1
878 MOVD reg32, xmmreg Double FADD 3
879 1/1 1/1 */
880 16, /* size of l1 cache. */
881 2048, /* size of l2 cache. */
882 64, /* size of prefetch block */
883 /* New AMD processors never drop prefetches; if they cannot be performed
884 immediately, they are queued. We set the number of simultaneous prefetches
885 to a large constant to reflect this (it is probably not a good idea to leave
886 the number of prefetches unlimited, as their execution also takes some
887 time). */
888 100, /* number of parallel prefetches */
889 2, /* Branch cost */
890 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
891 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
892 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
893 COSTS_N_INSNS (2), /* cost of FABS instruction. */
894 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
895 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
896
897 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
898 very small blocks it is better to use a loop. For large blocks, a libcall
899 can do nontemporal accesses and beat inline code considerably. */
900 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
901 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
902 {{libcall, {{8, loop}, {24, unrolled_loop},
903 {2048, rep_prefix_4_byte}, {-1, libcall}}},
904 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
905 6, /* scalar_stmt_cost. */
906 4, /* scalar load_cost. */
907 4, /* scalar_store_cost. */
908 6, /* vec_stmt_cost. */
909 0, /* vec_to_scalar_cost. */
910 2, /* scalar_to_vec_cost. */
911 4, /* vec_align_load_cost. */
912 4, /* vec_unalign_load_cost. */
913 4, /* vec_store_cost. */
914 2, /* cond_taken_branch_cost. */
915 1, /* cond_not_taken_branch_cost. */
916 };
917
918 struct processor_costs bdver2_cost = {
919 COSTS_N_INSNS (1), /* cost of an add instruction */
920 COSTS_N_INSNS (1), /* cost of a lea instruction */
921 COSTS_N_INSNS (1), /* variable shift costs */
922 COSTS_N_INSNS (1), /* constant shift costs */
923 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
924 COSTS_N_INSNS (4), /* HI */
925 COSTS_N_INSNS (4), /* SI */
926 COSTS_N_INSNS (6), /* DI */
927 COSTS_N_INSNS (6)}, /* other */
928 0, /* cost of multiply per each bit set */
929 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
930 COSTS_N_INSNS (35), /* HI */
931 COSTS_N_INSNS (51), /* SI */
932 COSTS_N_INSNS (83), /* DI */
933 COSTS_N_INSNS (83)}, /* other */
934 COSTS_N_INSNS (1), /* cost of movsx */
935 COSTS_N_INSNS (1), /* cost of movzx */
936 8, /* "large" insn */
937 9, /* MOVE_RATIO */
938 4, /* cost for loading QImode using movzbl */
939 {5, 5, 4}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {4, 4, 4}, /* cost of storing integer registers */
943 2, /* cost of reg,reg fld/fst */
944 {5, 5, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {4, 4, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {4, 4}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, /* cost of moving SSE register */
954 {4, 4, 4}, /* cost of loading SSE registers
955 in SImode, DImode and TImode */
956 {4, 4, 4}, /* cost of storing SSE registers
957 in SImode, DImode and TImode */
958 2, /* MMX or SSE register to integer */
959 /* On K8:
960 MOVD reg64, xmmreg Double FSTORE 4
961 MOVD reg32, xmmreg Double FSTORE 4
962 On AMDFAM10:
963 MOVD reg64, xmmreg Double FADD 3
964 1/1 1/1
965 MOVD reg32, xmmreg Double FADD 3
966 1/1 1/1 */
967 16, /* size of l1 cache. */
968 2048, /* size of l2 cache. */
969 64, /* size of prefetch block */
970 /* New AMD processors never drop prefetches; if they cannot be performed
971 immediately, they are queued. We set the number of simultaneous prefetches
972 to a large constant to reflect this (it is probably not a good idea to leave
973 the number of prefetches unlimited, as their execution also takes some
974 time). */
975 100, /* number of parallel prefetches */
976 2, /* Branch cost */
977 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
978 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
979 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
980 COSTS_N_INSNS (2), /* cost of FABS instruction. */
981 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
982 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
983
984 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
985 very small blocks it is better to use a loop. For large blocks, a libcall
986 can do nontemporal accesses and beat inline code considerably. */
987 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
988 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
989 {{libcall, {{8, loop}, {24, unrolled_loop},
990 {2048, rep_prefix_4_byte}, {-1, libcall}}},
991 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
992 6, /* scalar_stmt_cost. */
993 4, /* scalar load_cost. */
994 4, /* scalar_store_cost. */
995 6, /* vec_stmt_cost. */
996 0, /* vec_to_scalar_cost. */
997 2, /* scalar_to_vec_cost. */
998 4, /* vec_align_load_cost. */
999 4, /* vec_unalign_load_cost. */
1000 4, /* vec_store_cost. */
1001 2, /* cond_taken_branch_cost. */
1002 1, /* cond_not_taken_branch_cost. */
1003 };
1004
1005 struct processor_costs bdver3_cost = {
1006 COSTS_N_INSNS (1), /* cost of an add instruction */
1007 COSTS_N_INSNS (1), /* cost of a lea instruction */
1008 COSTS_N_INSNS (1), /* variable shift costs */
1009 COSTS_N_INSNS (1), /* constant shift costs */
1010 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1011 COSTS_N_INSNS (4), /* HI */
1012 COSTS_N_INSNS (4), /* SI */
1013 COSTS_N_INSNS (6), /* DI */
1014 COSTS_N_INSNS (6)}, /* other */
1015 0, /* cost of multiply per each bit set */
1016 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1017 COSTS_N_INSNS (35), /* HI */
1018 COSTS_N_INSNS (51), /* SI */
1019 COSTS_N_INSNS (83), /* DI */
1020 COSTS_N_INSNS (83)}, /* other */
1021 COSTS_N_INSNS (1), /* cost of movsx */
1022 COSTS_N_INSNS (1), /* cost of movzx */
1023 8, /* "large" insn */
1024 9, /* MOVE_RATIO */
1025 4, /* cost for loading QImode using movzbl */
1026 {5, 5, 4}, /* cost of loading integer registers
1027 in QImode, HImode and SImode.
1028 Relative to reg-reg move (2). */
1029 {4, 4, 4}, /* cost of storing integer registers */
1030 2, /* cost of reg,reg fld/fst */
1031 {5, 5, 12}, /* cost of loading fp registers
1032 in SFmode, DFmode and XFmode */
1033 {4, 4, 8}, /* cost of storing fp registers
1034 in SFmode, DFmode and XFmode */
1035 2, /* cost of moving MMX register */
1036 {4, 4}, /* cost of loading MMX registers
1037 in SImode and DImode */
1038 {4, 4}, /* cost of storing MMX registers
1039 in SImode and DImode */
1040 2, /* cost of moving SSE register */
1041 {4, 4, 4}, /* cost of loading SSE registers
1042 in SImode, DImode and TImode */
1043 {4, 4, 4}, /* cost of storing SSE registers
1044 in SImode, DImode and TImode */
1045 2, /* MMX or SSE register to integer */
1046 16, /* size of l1 cache. */
1047 2048, /* size of l2 cache. */
1048 64, /* size of prefetch block */
1049 /* New AMD processors never drop prefetches; if they cannot be performed
1050 immediately, they are queued. We set the number of simultaneous prefetches
1051 to a large constant to reflect this (it is probably not a good idea to leave
1052 the number of prefetches unlimited, as their execution also takes some
1053 time). */
1054 100, /* number of parallel prefetches */
1055 2, /* Branch cost */
1056 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1057 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1058 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1059 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1060 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1061 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1062
1063 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1064 very small blocks it is better to use a loop. For large blocks, a libcall
1065 can do nontemporal accesses and beat inline code considerably. */
1066 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1067 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1068 {{libcall, {{8, loop}, {24, unrolled_loop},
1069 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1070 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1071 6, /* scalar_stmt_cost. */
1072 4, /* scalar load_cost. */
1073 4, /* scalar_store_cost. */
1074 6, /* vec_stmt_cost. */
1075 0, /* vec_to_scalar_cost. */
1076 2, /* scalar_to_vec_cost. */
1077 4, /* vec_align_load_cost. */
1078 4, /* vec_unalign_load_cost. */
1079 4, /* vec_store_cost. */
1080 2, /* cond_taken_branch_cost. */
1081 1, /* cond_not_taken_branch_cost. */
1082 };
1083
1084 struct processor_costs btver1_cost = {
1085 COSTS_N_INSNS (1), /* cost of an add instruction */
1086 COSTS_N_INSNS (2), /* cost of a lea instruction */
1087 COSTS_N_INSNS (1), /* variable shift costs */
1088 COSTS_N_INSNS (1), /* constant shift costs */
1089 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1090 COSTS_N_INSNS (4), /* HI */
1091 COSTS_N_INSNS (3), /* SI */
1092 COSTS_N_INSNS (4), /* DI */
1093 COSTS_N_INSNS (5)}, /* other */
1094 0, /* cost of multiply per each bit set */
1095 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1096 COSTS_N_INSNS (35), /* HI */
1097 COSTS_N_INSNS (51), /* SI */
1098 COSTS_N_INSNS (83), /* DI */
1099 COSTS_N_INSNS (83)}, /* other */
1100 COSTS_N_INSNS (1), /* cost of movsx */
1101 COSTS_N_INSNS (1), /* cost of movzx */
1102 8, /* "large" insn */
1103 9, /* MOVE_RATIO */
1104 4, /* cost for loading QImode using movzbl */
1105 {3, 4, 3}, /* cost of loading integer registers
1106 in QImode, HImode and SImode.
1107 Relative to reg-reg move (2). */
1108 {3, 4, 3}, /* cost of storing integer registers */
1109 4, /* cost of reg,reg fld/fst */
1110 {4, 4, 12}, /* cost of loading fp registers
1111 in SFmode, DFmode and XFmode */
1112 {6, 6, 8}, /* cost of storing fp registers
1113 in SFmode, DFmode and XFmode */
1114 2, /* cost of moving MMX register */
1115 {3, 3}, /* cost of loading MMX registers
1116 in SImode and DImode */
1117 {4, 4}, /* cost of storing MMX registers
1118 in SImode and DImode */
1119 2, /* cost of moving SSE register */
1120 {4, 4, 3}, /* cost of loading SSE registers
1121 in SImode, DImode and TImode */
1122 {4, 4, 5}, /* cost of storing SSE registers
1123 in SImode, DImode and TImode */
1124 3, /* MMX or SSE register to integer */
1125 /* On K8:
1126 MOVD reg64, xmmreg Double FSTORE 4
1127 MOVD reg32, xmmreg Double FSTORE 4
1128 On AMDFAM10:
1129 MOVD reg64, xmmreg Double FADD 3
1130 1/1 1/1
1131 MOVD reg32, xmmreg Double FADD 3
1132 1/1 1/1 */
1133 32, /* size of l1 cache. */
1134 512, /* size of l2 cache. */
1135 64, /* size of prefetch block */
1136 100, /* number of parallel prefetches */
1137 2, /* Branch cost */
1138 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1139 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1140 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1141 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1142 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1143 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1144
1145 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1146 very small blocks it is better to use a loop. For large blocks, a libcall can
1147 do nontemporal accesses and beat inline code considerably. */
1148 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1149 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1150 {{libcall, {{8, loop}, {24, unrolled_loop},
1151 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1152 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1153 4, /* scalar_stmt_cost. */
1154 2, /* scalar load_cost. */
1155 2, /* scalar_store_cost. */
1156 6, /* vec_stmt_cost. */
1157 0, /* vec_to_scalar_cost. */
1158 2, /* scalar_to_vec_cost. */
1159 2, /* vec_align_load_cost. */
1160 2, /* vec_unalign_load_cost. */
1161 2, /* vec_store_cost. */
1162 2, /* cond_taken_branch_cost. */
1163 1, /* cond_not_taken_branch_cost. */
1164 };
1165
1166 struct processor_costs btver2_cost = {
1167 COSTS_N_INSNS (1), /* cost of an add instruction */
1168 COSTS_N_INSNS (2), /* cost of a lea instruction */
1169 COSTS_N_INSNS (1), /* variable shift costs */
1170 COSTS_N_INSNS (1), /* constant shift costs */
1171 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1172 COSTS_N_INSNS (4), /* HI */
1173 COSTS_N_INSNS (3), /* SI */
1174 COSTS_N_INSNS (4), /* DI */
1175 COSTS_N_INSNS (5)}, /* other */
1176 0, /* cost of multiply per each bit set */
1177 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1178 COSTS_N_INSNS (35), /* HI */
1179 COSTS_N_INSNS (51), /* SI */
1180 COSTS_N_INSNS (83), /* DI */
1181 COSTS_N_INSNS (83)}, /* other */
1182 COSTS_N_INSNS (1), /* cost of movsx */
1183 COSTS_N_INSNS (1), /* cost of movzx */
1184 8, /* "large" insn */
1185 9, /* MOVE_RATIO */
1186 4, /* cost for loading QImode using movzbl */
1187 {3, 4, 3}, /* cost of loading integer registers
1188 in QImode, HImode and SImode.
1189 Relative to reg-reg move (2). */
1190 {3, 4, 3}, /* cost of storing integer registers */
1191 4, /* cost of reg,reg fld/fst */
1192 {4, 4, 12}, /* cost of loading fp registers
1193 in SFmode, DFmode and XFmode */
1194 {6, 6, 8}, /* cost of storing fp registers
1195 in SFmode, DFmode and XFmode */
1196 2, /* cost of moving MMX register */
1197 {3, 3}, /* cost of loading MMX registers
1198 in SImode and DImode */
1199 {4, 4}, /* cost of storing MMX registers
1200 in SImode and DImode */
1201 2, /* cost of moving SSE register */
1202 {4, 4, 3}, /* cost of loading SSE registers
1203 in SImode, DImode and TImode */
1204 {4, 4, 5}, /* cost of storing SSE registers
1205 in SImode, DImode and TImode */
1206 3, /* MMX or SSE register to integer */
1207 /* On K8:
1208 MOVD reg64, xmmreg Double FSTORE 4
1209 MOVD reg32, xmmreg Double FSTORE 4
1210 On AMDFAM10:
1211 MOVD reg64, xmmreg Double FADD 3
1212 1/1 1/1
1213 MOVD reg32, xmmreg Double FADD 3
1214 1/1 1/1 */
1215 32, /* size of l1 cache. */
1216 2048, /* size of l2 cache. */
1217 64, /* size of prefetch block */
1218 100, /* number of parallel prefetches */
1219 2, /* Branch cost */
1220 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1221 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1222 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1223 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1224 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1225 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1226
1227 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1228 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1229 {{libcall, {{8, loop}, {24, unrolled_loop},
1230 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1231 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1232 4, /* scalar_stmt_cost. */
1233 2, /* scalar load_cost. */
1234 2, /* scalar_store_cost. */
1235 6, /* vec_stmt_cost. */
1236 0, /* vec_to_scalar_cost. */
1237 2, /* scalar_to_vec_cost. */
1238 2, /* vec_align_load_cost. */
1239 2, /* vec_unalign_load_cost. */
1240 2, /* vec_store_cost. */
1241 2, /* cond_taken_branch_cost. */
1242 1, /* cond_not_taken_branch_cost. */
1243 };
1244
1245 static const
1246 struct processor_costs pentium4_cost = {
1247 COSTS_N_INSNS (1), /* cost of an add instruction */
1248 COSTS_N_INSNS (3), /* cost of a lea instruction */
1249 COSTS_N_INSNS (4), /* variable shift costs */
1250 COSTS_N_INSNS (4), /* constant shift costs */
1251 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1252 COSTS_N_INSNS (15), /* HI */
1253 COSTS_N_INSNS (15), /* SI */
1254 COSTS_N_INSNS (15), /* DI */
1255 COSTS_N_INSNS (15)}, /* other */
1256 0, /* cost of multiply per each bit set */
1257 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1258 COSTS_N_INSNS (56), /* HI */
1259 COSTS_N_INSNS (56), /* SI */
1260 COSTS_N_INSNS (56), /* DI */
1261 COSTS_N_INSNS (56)}, /* other */
1262 COSTS_N_INSNS (1), /* cost of movsx */
1263 COSTS_N_INSNS (1), /* cost of movzx */
1264 16, /* "large" insn */
1265 6, /* MOVE_RATIO */
1266 2, /* cost for loading QImode using movzbl */
1267 {4, 5, 4}, /* cost of loading integer registers
1268 in QImode, HImode and SImode.
1269 Relative to reg-reg move (2). */
1270 {2, 3, 2}, /* cost of storing integer registers */
1271 2, /* cost of reg,reg fld/fst */
1272 {2, 2, 6}, /* cost of loading fp registers
1273 in SFmode, DFmode and XFmode */
1274 {4, 4, 6}, /* cost of storing fp registers
1275 in SFmode, DFmode and XFmode */
1276 2, /* cost of moving MMX register */
1277 {2, 2}, /* cost of loading MMX registers
1278 in SImode and DImode */
1279 {2, 2}, /* cost of storing MMX registers
1280 in SImode and DImode */
1281 12, /* cost of moving SSE register */
1282 {12, 12, 12}, /* cost of loading SSE registers
1283 in SImode, DImode and TImode */
1284 {2, 2, 8}, /* cost of storing SSE registers
1285 in SImode, DImode and TImode */
1286 10, /* MMX or SSE register to integer */
1287 8, /* size of l1 cache. */
1288 256, /* size of l2 cache. */
1289 64, /* size of prefetch block */
1290 6, /* number of parallel prefetches */
1291 2, /* Branch cost */
1292 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1293 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1294 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1295 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1296 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1297 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1298 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1299 DUMMY_STRINGOP_ALGS},
1300 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1301 {-1, libcall}}},
1302 DUMMY_STRINGOP_ALGS},
1303 1, /* scalar_stmt_cost. */
1304 1, /* scalar load_cost. */
1305 1, /* scalar_store_cost. */
1306 1, /* vec_stmt_cost. */
1307 1, /* vec_to_scalar_cost. */
1308 1, /* scalar_to_vec_cost. */
1309 1, /* vec_align_load_cost. */
1310 2, /* vec_unalign_load_cost. */
1311 1, /* vec_store_cost. */
1312 3, /* cond_taken_branch_cost. */
1313 1, /* cond_not_taken_branch_cost. */
1314 };
1315
1316 static const
1317 struct processor_costs nocona_cost = {
1318 COSTS_N_INSNS (1), /* cost of an add instruction */
1319 COSTS_N_INSNS (1), /* cost of a lea instruction */
1320 COSTS_N_INSNS (1), /* variable shift costs */
1321 COSTS_N_INSNS (1), /* constant shift costs */
1322 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1323 COSTS_N_INSNS (10), /* HI */
1324 COSTS_N_INSNS (10), /* SI */
1325 COSTS_N_INSNS (10), /* DI */
1326 COSTS_N_INSNS (10)}, /* other */
1327 0, /* cost of multiply per each bit set */
1328 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1329 COSTS_N_INSNS (66), /* HI */
1330 COSTS_N_INSNS (66), /* SI */
1331 COSTS_N_INSNS (66), /* DI */
1332 COSTS_N_INSNS (66)}, /* other */
1333 COSTS_N_INSNS (1), /* cost of movsx */
1334 COSTS_N_INSNS (1), /* cost of movzx */
1335 16, /* "large" insn */
1336 17, /* MOVE_RATIO */
1337 4, /* cost for loading QImode using movzbl */
1338 {4, 4, 4}, /* cost of loading integer registers
1339 in QImode, HImode and SImode.
1340 Relative to reg-reg move (2). */
1341 {4, 4, 4}, /* cost of storing integer registers */
1342 3, /* cost of reg,reg fld/fst */
1343 {12, 12, 12}, /* cost of loading fp registers
1344 in SFmode, DFmode and XFmode */
1345 {4, 4, 4}, /* cost of storing fp registers
1346 in SFmode, DFmode and XFmode */
1347 6, /* cost of moving MMX register */
1348 {12, 12}, /* cost of loading MMX registers
1349 in SImode and DImode */
1350 {12, 12}, /* cost of storing MMX registers
1351 in SImode and DImode */
1352 6, /* cost of moving SSE register */
1353 {12, 12, 12}, /* cost of loading SSE registers
1354 in SImode, DImode and TImode */
1355 {12, 12, 12}, /* cost of storing SSE registers
1356 in SImode, DImode and TImode */
1357 8, /* MMX or SSE register to integer */
1358 8, /* size of l1 cache. */
1359 1024, /* size of l2 cache. */
1360 128, /* size of prefetch block */
1361 8, /* number of parallel prefetches */
1362 1, /* Branch cost */
1363 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1364 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1365 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1366 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1367 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1368 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1369 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1370 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1371 {100000, unrolled_loop}, {-1, libcall}}}},
1372 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1373 {-1, libcall}}},
1374 {libcall, {{24, loop}, {64, unrolled_loop},
1375 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1376 1, /* scalar_stmt_cost. */
1377 1, /* scalar load_cost. */
1378 1, /* scalar_store_cost. */
1379 1, /* vec_stmt_cost. */
1380 1, /* vec_to_scalar_cost. */
1381 1, /* scalar_to_vec_cost. */
1382 1, /* vec_align_load_cost. */
1383 2, /* vec_unalign_load_cost. */
1384 1, /* vec_store_cost. */
1385 3, /* cond_taken_branch_cost. */
1386 1, /* cond_not_taken_branch_cost. */
1387 };
1388
1389 static const
1390 struct processor_costs atom_cost = {
1391 COSTS_N_INSNS (1), /* cost of an add instruction */
1392 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1393 COSTS_N_INSNS (1), /* variable shift costs */
1394 COSTS_N_INSNS (1), /* constant shift costs */
1395 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1396 COSTS_N_INSNS (4), /* HI */
1397 COSTS_N_INSNS (3), /* SI */
1398 COSTS_N_INSNS (4), /* DI */
1399 COSTS_N_INSNS (2)}, /* other */
1400 0, /* cost of multiply per each bit set */
1401 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1402 COSTS_N_INSNS (26), /* HI */
1403 COSTS_N_INSNS (42), /* SI */
1404 COSTS_N_INSNS (74), /* DI */
1405 COSTS_N_INSNS (74)}, /* other */
1406 COSTS_N_INSNS (1), /* cost of movsx */
1407 COSTS_N_INSNS (1), /* cost of movzx */
1408 8, /* "large" insn */
1409 17, /* MOVE_RATIO */
1410 4, /* cost for loading QImode using movzbl */
1411 {4, 4, 4}, /* cost of loading integer registers
1412 in QImode, HImode and SImode.
1413 Relative to reg-reg move (2). */
1414 {4, 4, 4}, /* cost of storing integer registers */
1415 4, /* cost of reg,reg fld/fst */
1416 {12, 12, 12}, /* cost of loading fp registers
1417 in SFmode, DFmode and XFmode */
1418 {6, 6, 8}, /* cost of storing fp registers
1419 in SFmode, DFmode and XFmode */
1420 2, /* cost of moving MMX register */
1421 {8, 8}, /* cost of loading MMX registers
1422 in SImode and DImode */
1423 {8, 8}, /* cost of storing MMX registers
1424 in SImode and DImode */
1425 2, /* cost of moving SSE register */
1426 {8, 8, 8}, /* cost of loading SSE registers
1427 in SImode, DImode and TImode */
1428 {8, 8, 8}, /* cost of storing SSE registers
1429 in SImode, DImode and TImode */
1430 5, /* MMX or SSE register to integer */
1431 32, /* size of l1 cache. */
1432 256, /* size of l2 cache. */
1433 64, /* size of prefetch block */
1434 6, /* number of parallel prefetches */
1435 3, /* Branch cost */
1436 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1437 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1438 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1439 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1440 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1441 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1442 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1443 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1444 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1445 {{libcall, {{8, loop}, {15, unrolled_loop},
1446 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1447 {libcall, {{24, loop}, {32, unrolled_loop},
1448 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1449 1, /* scalar_stmt_cost. */
1450 1, /* scalar load_cost. */
1451 1, /* scalar_store_cost. */
1452 1, /* vec_stmt_cost. */
1453 1, /* vec_to_scalar_cost. */
1454 1, /* scalar_to_vec_cost. */
1455 1, /* vec_align_load_cost. */
1456 2, /* vec_unalign_load_cost. */
1457 1, /* vec_store_cost. */
1458 3, /* cond_taken_branch_cost. */
1459 1, /* cond_not_taken_branch_cost. */
1460 };
1461
1462 /* Generic64 should produce code tuned for Nocona and K8. */
1463 static const
1464 struct processor_costs generic64_cost = {
1465 COSTS_N_INSNS (1), /* cost of an add instruction */
1466 /* On all chips taken into consideration, lea is 2 cycles or more.  With
1467 this cost, however, our current implementation of synth_mult uses
1468 unnecessary temporary registers, causing regressions on several
1469 SPECfp benchmarks. */
1470 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1471 COSTS_N_INSNS (1), /* variable shift costs */
1472 COSTS_N_INSNS (1), /* constant shift costs */
1473 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1474 COSTS_N_INSNS (4), /* HI */
1475 COSTS_N_INSNS (3), /* SI */
1476 COSTS_N_INSNS (4), /* DI */
1477 COSTS_N_INSNS (2)}, /* other */
1478 0, /* cost of multiply per each bit set */
1479 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1480 COSTS_N_INSNS (26), /* HI */
1481 COSTS_N_INSNS (42), /* SI */
1482 COSTS_N_INSNS (74), /* DI */
1483 COSTS_N_INSNS (74)}, /* other */
1484 COSTS_N_INSNS (1), /* cost of movsx */
1485 COSTS_N_INSNS (1), /* cost of movzx */
1486 8, /* "large" insn */
1487 17, /* MOVE_RATIO */
1488 4, /* cost for loading QImode using movzbl */
1489 {4, 4, 4}, /* cost of loading integer registers
1490 in QImode, HImode and SImode.
1491 Relative to reg-reg move (2). */
1492 {4, 4, 4}, /* cost of storing integer registers */
1493 4, /* cost of reg,reg fld/fst */
1494 {12, 12, 12}, /* cost of loading fp registers
1495 in SFmode, DFmode and XFmode */
1496 {6, 6, 8}, /* cost of storing fp registers
1497 in SFmode, DFmode and XFmode */
1498 2, /* cost of moving MMX register */
1499 {8, 8}, /* cost of loading MMX registers
1500 in SImode and DImode */
1501 {8, 8}, /* cost of storing MMX registers
1502 in SImode and DImode */
1503 2, /* cost of moving SSE register */
1504 {8, 8, 8}, /* cost of loading SSE registers
1505 in SImode, DImode and TImode */
1506 {8, 8, 8}, /* cost of storing SSE registers
1507 in SImode, DImode and TImode */
1508 5, /* MMX or SSE register to integer */
1509 32, /* size of l1 cache. */
1510 512, /* size of l2 cache. */
1511 64, /* size of prefetch block */
1512 6, /* number of parallel prefetches */
1513 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1514 value is increased to the perhaps more appropriate value of 5. */
1515 3, /* Branch cost */
1516 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1517 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1518 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1519 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1520 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1521 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1522 {DUMMY_STRINGOP_ALGS,
1523 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1524 {DUMMY_STRINGOP_ALGS,
1525 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1526 1, /* scalar_stmt_cost. */
1527 1, /* scalar load_cost. */
1528 1, /* scalar_store_cost. */
1529 1, /* vec_stmt_cost. */
1530 1, /* vec_to_scalar_cost. */
1531 1, /* scalar_to_vec_cost. */
1532 1, /* vec_align_load_cost. */
1533 2, /* vec_unalign_load_cost. */
1534 1, /* vec_store_cost. */
1535 3, /* cond_taken_branch_cost. */
1536 1, /* cond_not_taken_branch_cost. */
1537 };
1538
1539 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1540 Athlon and K8. */
1541 static const
1542 struct processor_costs generic32_cost = {
1543 COSTS_N_INSNS (1), /* cost of an add instruction */
1544 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1545 COSTS_N_INSNS (1), /* variable shift costs */
1546 COSTS_N_INSNS (1), /* constant shift costs */
1547 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1548 COSTS_N_INSNS (4), /* HI */
1549 COSTS_N_INSNS (3), /* SI */
1550 COSTS_N_INSNS (4), /* DI */
1551 COSTS_N_INSNS (2)}, /* other */
1552 0, /* cost of multiply per each bit set */
1553 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1554 COSTS_N_INSNS (26), /* HI */
1555 COSTS_N_INSNS (42), /* SI */
1556 COSTS_N_INSNS (74), /* DI */
1557 COSTS_N_INSNS (74)}, /* other */
1558 COSTS_N_INSNS (1), /* cost of movsx */
1559 COSTS_N_INSNS (1), /* cost of movzx */
1560 8, /* "large" insn */
1561 17, /* MOVE_RATIO */
1562 4, /* cost for loading QImode using movzbl */
1563 {4, 4, 4}, /* cost of loading integer registers
1564 in QImode, HImode and SImode.
1565 Relative to reg-reg move (2). */
1566 {4, 4, 4}, /* cost of storing integer registers */
1567 4, /* cost of reg,reg fld/fst */
1568 {12, 12, 12}, /* cost of loading fp registers
1569 in SFmode, DFmode and XFmode */
1570 {6, 6, 8}, /* cost of storing fp registers
1571 in SFmode, DFmode and XFmode */
1572 2, /* cost of moving MMX register */
1573 {8, 8}, /* cost of loading MMX registers
1574 in SImode and DImode */
1575 {8, 8}, /* cost of storing MMX registers
1576 in SImode and DImode */
1577 2, /* cost of moving SSE register */
1578 {8, 8, 8}, /* cost of loading SSE registers
1579 in SImode, DImode and TImode */
1580 {8, 8, 8}, /* cost of storing SSE registers
1581 in SImode, DImode and TImode */
1582 5, /* MMX or SSE register to integer */
1583 32, /* size of l1 cache. */
1584 256, /* size of l2 cache. */
1585 64, /* size of prefetch block */
1586 6, /* number of parallel prefetches */
1587 3, /* Branch cost */
1588 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1589 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1590 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1591 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1592 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1593 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1594 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1595 DUMMY_STRINGOP_ALGS},
1596 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1597 DUMMY_STRINGOP_ALGS},
1598 1, /* scalar_stmt_cost. */
1599 1, /* scalar load_cost. */
1600 1, /* scalar_store_cost. */
1601 1, /* vec_stmt_cost. */
1602 1, /* vec_to_scalar_cost. */
1603 1, /* scalar_to_vec_cost. */
1604 1, /* vec_align_load_cost. */
1605 2, /* vec_unalign_load_cost. */
1606 1, /* vec_store_cost. */
1607 3, /* cond_taken_branch_cost. */
1608 1, /* cond_not_taken_branch_cost. */
1609 };
1610
1611 /* Set by -mtune. */
1612 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1613
1614 /* Set by -mtune or -Os. */
1615 const struct processor_costs *ix86_cost = &pentium_cost;
1616
1617 /* Processor feature/optimization bitmasks. */
1618 #define m_386 (1<<PROCESSOR_I386)
1619 #define m_486 (1<<PROCESSOR_I486)
1620 #define m_PENT (1<<PROCESSOR_PENTIUM)
1621 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1622 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1623 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1624 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1625 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1626 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1627 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1628 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1629 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1630 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1631 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1632 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1633 #define m_ATOM (1<<PROCESSOR_ATOM)
1634
1635 #define m_GEODE (1<<PROCESSOR_GEODE)
1636 #define m_K6 (1<<PROCESSOR_K6)
1637 #define m_K6_GEODE (m_K6 | m_GEODE)
1638 #define m_K8 (1<<PROCESSOR_K8)
1639 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1640 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1641 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1642 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1643 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1644 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1645 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1646 #define m_BTVER (m_BTVER1 | m_BTVER2)
1647 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1648 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1649 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1650
1651 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1652 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1653
1654 /* Generic instruction choice should be a common subset of the supported CPUs
1655 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1656 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1657
1658 /* Feature tests against the various tunings. */
1659 unsigned char ix86_tune_features[X86_TUNE_LAST];
1660
1661 /* Feature tests against the various tunings used to create ix86_tune_features
1662 based on the processor mask. */
1663 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1664 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1665 negatively, so enabling it for Generic64 seems like a good code-size
1666 tradeoff.  We can't enable it for 32bit generic because it does not
1667 work well with PPro based chips. */
1668 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1669
1670 /* X86_TUNE_PUSH_MEMORY */
1671 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1672
1673 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1674 m_486 | m_PENT,
1675
1676 /* X86_TUNE_UNROLL_STRLEN */
1677 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1678
1679 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in the P4 based
1680 on simulation results.  But after the P4 was released, no performance
1681 benefit was observed from branch hints, and they also increase code size.
1682 As a result, icc never generates branch hints. */
1683 0,
1684
1685 /* X86_TUNE_DOUBLE_WITH_ADD */
1686 ~m_386,
1687
1688 /* X86_TUNE_USE_SAHF */
1689 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1690
1691 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1692 partial dependencies. */
1693 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1694
1695 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1696 register stalls on the Generic32 compilation setting as well.  However,
1697 in the current implementation the partial register stalls are not eliminated
1698 very well - they can be introduced via subregs synthesized by combine
1699 and can happen in caller/callee saving sequences.  Because this option
1700 pays back little on PPro based chips and conflicts with the partial reg
1701 dependencies used by Athlon/P4 based chips, it is better to leave it off
1702 for generic32 for now. */
1703 m_PPRO,
1704
1705 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1706 m_CORE2I7 | m_GENERIC,
1707
1708 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1709 on 16-bit immediate moves into memory on Core2 and Corei7. */
1710 m_CORE2I7 | m_GENERIC,
1711
1712 /* X86_TUNE_USE_HIMODE_FIOP */
1713 m_386 | m_486 | m_K6_GEODE,
1714
1715 /* X86_TUNE_USE_SIMODE_FIOP */
1716 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1717
1718 /* X86_TUNE_USE_MOV0 */
1719 m_K6,
1720
1721 /* X86_TUNE_USE_CLTD */
1722 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1723
1724 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1725 m_PENT4,
1726
1727 /* X86_TUNE_SPLIT_LONG_MOVES */
1728 m_PPRO,
1729
1730 /* X86_TUNE_READ_MODIFY_WRITE */
1731 ~m_PENT,
1732
1733 /* X86_TUNE_READ_MODIFY */
1734 ~(m_PENT | m_PPRO),
1735
1736 /* X86_TUNE_PROMOTE_QIMODE */
1737 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1738
1739 /* X86_TUNE_FAST_PREFIX */
1740 ~(m_386 | m_486 | m_PENT),
1741
1742 /* X86_TUNE_SINGLE_STRINGOP */
1743 m_386 | m_P4_NOCONA,
1744
1745 /* X86_TUNE_QIMODE_MATH */
1746 ~0,
1747
1748 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1749 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1750 might be considered for Generic32 if our scheme for avoiding partial
1751 stalls was more effective. */
1752 ~m_PPRO,
1753
1754 /* X86_TUNE_PROMOTE_QI_REGS */
1755 0,
1756
1757 /* X86_TUNE_PROMOTE_HI_REGS */
1758 m_PPRO,
1759
1760 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1761 over esp addition. */
1762 m_386 | m_486 | m_PENT | m_PPRO,
1763
1764 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1765 over esp addition. */
1766 m_PENT,
1767
1768 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1769 over esp subtraction. */
1770 m_386 | m_486 | m_PENT | m_K6_GEODE,
1771
1772 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1773 over esp subtraction. */
1774 m_PENT | m_K6_GEODE,
1775
1776 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1777 for DFmode copies */
1778 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1779
1780 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1781 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1782
1783 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1784 conflict here between PPro/Pentium4 based chips that treat 128bit
1785 SSE registers as single units versus K8 based chips that divide SSE
1786 registers into two 64bit halves.  This knob promotes all store destinations
1787 to 128bit to allow register renaming on 128bit SSE units, but usually
1788 results in one extra micro-op on 64bit SSE units.  Experimental results
1789 show that disabling this option on P4 brings over a 20% SPECfp regression,
1790 while enabling it on K8 brings roughly a 2.4% regression that can be partly
1791 masked by careful scheduling of moves. */
1792 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1793
1794 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1795 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER,
1796
1797 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1798 m_COREI7 | m_BDVER,
1799
1800 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1801 m_BDVER,
1802
1803 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1804 are resolved on SSE register parts instead of whole registers, so we may
1805 maintain just the lower part of scalar values in the proper format, leaving
1806 the upper part undefined. */
1807 m_ATHLON_K8,
1808
1809 /* X86_TUNE_SSE_TYPELESS_STORES */
1810 m_AMD_MULTIPLE,
1811
1812 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1813 m_PPRO | m_P4_NOCONA,
1814
1815 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1816 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1817
1818 /* X86_TUNE_PROLOGUE_USING_MOVE */
1819 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
1820
1821 /* X86_TUNE_EPILOGUE_USING_MOVE */
1822 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
1823
1824 /* X86_TUNE_SHIFT1 */
1825 ~m_486,
1826
1827 /* X86_TUNE_USE_FFREEP */
1828 m_AMD_MULTIPLE,
1829
1830 /* X86_TUNE_INTER_UNIT_MOVES */
1831 ~(m_AMD_MULTIPLE | m_GENERIC),
1832
1833 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1834 ~(m_AMDFAM10 | m_BDVER),
1835
1836 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1837 than 4 branch instructions in the 16 byte window. */
1838 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1839
1840 /* X86_TUNE_SCHEDULE */
1841 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1842
1843 /* X86_TUNE_USE_BT */
1844 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1845
1846 /* X86_TUNE_USE_INCDEC */
1847 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
1848
1849 /* X86_TUNE_PAD_RETURNS */
1850 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
1851
1852 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short function. */
1853 m_ATOM,
1854
1855 /* X86_TUNE_EXT_80387_CONSTANTS */
1856 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1857
1858 /* X86_TUNE_AVOID_VECTOR_DECODE */
1859 m_CORE2I7_64 | m_K8 | m_GENERIC64,
1860
1861 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
1862 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1863 ~(m_386 | m_486),
1864
1865 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1866 vector path on AMD machines. */
1867 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1868
1869 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1870 machines. */
1871 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1872
1873 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1874 than a MOV. */
1875 m_PENT,
1876
1877 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1878 but one byte longer. */
1879 m_PENT,
1880
1881 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
1882 operand that cannot be represented using a modRM byte.  The XOR
1883 replacement is long decoded, so this split helps here as well. */
1884 m_K6,
1885
1886 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1887 from FP to FP. */
1888 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
1889
1890 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1891 from integer to FP. */
1892 m_AMDFAM10,
1893
1894 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1895 with a subsequent conditional jump instruction into a single
1896 compare-and-branch uop. */
1897 m_BDVER,
1898
1899 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
1900 will impact LEA instruction selection. */
1901 m_ATOM,
1902
1903 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
1904 instructions. */
1905 ~m_ATOM,
1906
1907 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
1908 at -O3.  For the moment, the prefetching seems badly tuned for Intel
1909 chips. */
1910 m_K6_GEODE | m_AMD_MULTIPLE,
1911
1912 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
1913 the auto-vectorizer. */
1914 m_BDVER,
1915
1916 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
1917 during reassociation of integer computation. */
1918 m_ATOM,
1919
1920 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
1921 during reassociation of fp computation. */
1922 m_ATOM,
1923
1924 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
1925 regs instead of memory. */
1926 m_COREI7 | m_CORE2I7
1927 };
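/* An illustrative sketch (not code from this spot in the file): the table
   above is reduced to the boolean ix86_tune_features[] for the CPU selected
   by -mtune, roughly as

     ix86_tune_mask = 1u << ix86_tune;
     for (i = 0; i < X86_TUNE_LAST; i++)
       ix86_tune_features[i]
	 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);

   so each m_* mask simply records which processors want a given feature.  */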
1928
1929 /* Feature tests against the various architecture variations. */
1930 unsigned char ix86_arch_features[X86_ARCH_LAST];
1931
1932 /* Feature tests against the various architecture variations, used to create
1933 ix86_arch_features based on the processor mask. */
1934 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1935 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1936 ~(m_386 | m_486 | m_PENT | m_K6),
1937
1938 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1939 ~m_386,
1940
1941 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1942 ~(m_386 | m_486),
1943
1944 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1945 ~m_386,
1946
1947 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1948 ~m_386,
1949 };
1950
1951 static const unsigned int x86_accumulate_outgoing_args
1952 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
1953
1954 static const unsigned int x86_arch_always_fancy_math_387
1955 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
1956
1957 static const unsigned int x86_avx256_split_unaligned_load
1958 = m_COREI7 | m_GENERIC;
1959
1960 static const unsigned int x86_avx256_split_unaligned_store
1961 = m_COREI7 | m_BDVER | m_GENERIC;
1962
1963 /* In case the average insn count for a single function invocation is
1964 lower than this constant, emit fast (but longer) prologue and
1965 epilogue code. */
1966 #define FAST_PROLOGUE_INSN_COUNT 20
1967
1968 /* Names for 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
1969 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1970 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1971 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1972
1973 /* Array of the smallest class containing reg number REGNO, indexed by
1974 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1975
1976 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1977 {
1978 /* ax, dx, cx, bx */
1979 AREG, DREG, CREG, BREG,
1980 /* si, di, bp, sp */
1981 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1982 /* FP registers */
1983 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1984 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1985 /* arg pointer */
1986 NON_Q_REGS,
1987 /* flags, fpsr, fpcr, frame */
1988 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1989 /* SSE registers */
1990 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1991 SSE_REGS, SSE_REGS,
1992 /* MMX registers */
1993 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1994 MMX_REGS, MMX_REGS,
1995 /* REX registers */
1996 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1997 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1998 /* SSE REX registers */
1999 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2000 SSE_REGS, SSE_REGS,
2001 };
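/* For illustration: REGNO_REG_CLASS in i386.h is expected to index this
   table directly, roughly

     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])

   so, e.g., REGNO_REG_CLASS (AX_REG) yields AREG.  */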
2002
2003 /* The "default" register map used in 32bit mode. */
2004
2005 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2006 {
2007 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2008 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2009 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2010 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2011 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2012 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2013 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2014 };
2015
2016 /* The "default" register map used in 64bit mode. */
2017
2018 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2019 {
2020 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2021 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2022 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2023 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2024 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2025 8,9,10,11,12,13,14,15, /* extended integer registers */
2026 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2027 };
2028
2029 /* Define the register numbers to be used in Dwarf debugging information.
2030 The SVR4 reference port C compiler uses the following register numbers
2031 in its Dwarf output code:
2032 0 for %eax (gcc regno = 0)
2033 1 for %ecx (gcc regno = 2)
2034 2 for %edx (gcc regno = 1)
2035 3 for %ebx (gcc regno = 3)
2036 4 for %esp (gcc regno = 7)
2037 5 for %ebp (gcc regno = 6)
2038 6 for %esi (gcc regno = 4)
2039 7 for %edi (gcc regno = 5)
2040 The following three DWARF register numbers are never generated by
2041 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2042 believes these numbers have these meanings.
2043 8 for %eip (no gcc equivalent)
2044 9 for %eflags (gcc regno = 17)
2045 10 for %trapno (no gcc equivalent)
2046 It is not at all clear how we should number the FP stack registers
2047 for the x86 architecture. If the version of SDB on x86/svr4 were
2048 a bit less brain dead with respect to floating-point then we would
2049 have a precedent to follow with respect to DWARF register numbers
2050 for x86 FP registers, but the SDB on x86/svr4 is so completely
2051 broken with respect to FP registers that it is hardly worth thinking
2052 of it as something to strive for compatibility with.
2053 The version of x86/svr4 SDB I have at the moment does (partially)
2054 seem to believe that DWARF register number 11 is associated with
2055 the x86 register %st(0), but that's about all. Higher DWARF
2056 register numbers don't seem to be associated with anything in
2057 particular, and even for DWARF regno 11, SDB only seems to under-
2058 stand that it should say that a variable lives in %st(0) (when
2059 asked via an `=' command) if we said it was in DWARF regno 11,
2060 but SDB still prints garbage when asked for the value of the
2061 variable in question (via a `/' command).
2062 (Also note that the labels SDB prints for various FP stack regs
2063 when doing an `x' command are all wrong.)
2064 Note that these problems generally don't affect the native SVR4
2065 C compiler because it doesn't allow the use of -O with -g and
2066 because when it is *not* optimizing, it allocates a memory
2067 location for each floating-point variable, and the memory
2068 location is what gets described in the DWARF AT_location
2069 attribute for the variable in question.
2070 Regardless of the severe mental illness of the x86/svr4 SDB, we
2071 do something sensible here and we use the following DWARF
2072 register numbers. Note that these are all stack-top-relative
2073 numbers.
2074 11 for %st(0) (gcc regno = 8)
2075 12 for %st(1) (gcc regno = 9)
2076 13 for %st(2) (gcc regno = 10)
2077 14 for %st(3) (gcc regno = 11)
2078 15 for %st(4) (gcc regno = 12)
2079 16 for %st(5) (gcc regno = 13)
2080 17 for %st(6) (gcc regno = 14)
2081 18 for %st(7) (gcc regno = 15)
2082 */
2083 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2084 {
2085 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2086 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2087 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2088 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2089 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2090 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2091 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2092 };
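/* A sketch of how these maps are consumed (the exact macro lives in the
   target headers): debug output picks the map matching the word size,
   roughly

     #define DBX_REGISTER_NUMBER(N) \
       (TARGET_64BIT ? dbx64_register_map[(N)] : dbx_register_map[(N)])

   while SVR4-style configurations are expected to use svr4_dbx_register_map
   instead of the default 32-bit map.  */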
2093
2094 /* Define parameter passing and return registers. */
2095
2096 static int const x86_64_int_parameter_registers[6] =
2097 {
2098 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2099 };
2100
2101 static int const x86_64_ms_abi_int_parameter_registers[4] =
2102 {
2103 CX_REG, DX_REG, R8_REG, R9_REG
2104 };
2105
2106 static int const x86_64_int_return_registers[4] =
2107 {
2108 AX_REG, DX_REG, DI_REG, SI_REG
2109 };
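/* Example (SysV x86-64 ABI): for "long f (long a, long b)" the arguments
   arrive in %rdi and %rsi (the first two entries of
   x86_64_int_parameter_registers) and the result is returned in %rax;
   under the MS ABI the same call uses %rcx and %rdx instead.  */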
2110
2111 /* Define the structure for the machine field in struct function. */
2112
2113 struct GTY(()) stack_local_entry {
2114 unsigned short mode;
2115 unsigned short n;
2116 rtx rtl;
2117 struct stack_local_entry *next;
2118 };
2119
2120 /* Structure describing stack frame layout.
2121 Stack grows downward:
2122
2123 [arguments]
2124 <- ARG_POINTER
2125 saved pc
2126
2127 saved static chain if ix86_static_chain_on_stack
2128
2129 saved frame pointer if frame_pointer_needed
2130 <- HARD_FRAME_POINTER
2131 [saved regs]
2132 <- regs_save_offset
2133 [padding0]
2134
2135 [saved SSE regs]
2136 <- sse_regs_save_offset
2137 [padding1] |
2138 | <- FRAME_POINTER
2139 [va_arg registers] |
2140 |
2141 [frame] |
2142 |
2143 [padding2] | = to_allocate
2144 <- STACK_POINTER
2145 */
2146 struct ix86_frame
2147 {
2148 int nsseregs;
2149 int nregs;
2150 int va_arg_size;
2151 int red_zone_size;
2152 int outgoing_arguments_size;
2153
2154 /* The offsets relative to ARG_POINTER. */
2155 HOST_WIDE_INT frame_pointer_offset;
2156 HOST_WIDE_INT hard_frame_pointer_offset;
2157 HOST_WIDE_INT stack_pointer_offset;
2158 HOST_WIDE_INT hfp_save_offset;
2159 HOST_WIDE_INT reg_save_offset;
2160 HOST_WIDE_INT sse_reg_save_offset;
2161
2162 /* When save_regs_using_mov is set, emit prologue using
2163 move instead of push instructions. */
2164 bool save_regs_using_mov;
2165 };
2166
2167 /* Which cpu are we scheduling for. */
2168 enum attr_cpu ix86_schedule;
2169
2170 /* Which cpu are we optimizing for. */
2171 enum processor_type ix86_tune;
2172
2173 /* Which instruction set architecture to use. */
2174 enum processor_type ix86_arch;
2175
2176 /* True if processor has SSE prefetch instruction. */
2177 unsigned char x86_prefetch_sse;
2178
2179 /* -mstackrealign option */
2180 static const char ix86_force_align_arg_pointer_string[]
2181 = "force_align_arg_pointer";
2182
2183 static rtx (*ix86_gen_leave) (void);
2184 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2185 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2186 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2187 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2188 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2189 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2190 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2191 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2192 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2193 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2194 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2195
2196 /* Preferred alignment for stack boundary in bits. */
2197 unsigned int ix86_preferred_stack_boundary;
2198
2199 /* Alignment for incoming stack boundary in bits specified at
2200 command line. */
2201 static unsigned int ix86_user_incoming_stack_boundary;
2202
2203 /* Default alignment for incoming stack boundary in bits. */
2204 static unsigned int ix86_default_incoming_stack_boundary;
2205
2206 /* Alignment for incoming stack boundary in bits. */
2207 unsigned int ix86_incoming_stack_boundary;
2208
2209 /* Calling abi specific va_list type nodes. */
2210 static GTY(()) tree sysv_va_list_type_node;
2211 static GTY(()) tree ms_va_list_type_node;
2212
2213 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2214 char internal_label_prefix[16];
2215 int internal_label_prefix_len;
2216
2217 /* Fence to use after loop using movnt. */
2218 tree x86_mfence;
2219
2220 /* Register class used for passing a given 64bit part of the argument.
2221 These represent the classes documented by the psABI, with the exception
2222 of the SSESF and SSEDF classes, which are basically the SSE class, except
2223 that gcc will use SF/DFmode moves instead of DImode to avoid reformatting penalties.
2224 
2225 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2226 whenever possible (the upper half does contain padding). */
2227 enum x86_64_reg_class
2228 {
2229 X86_64_NO_CLASS,
2230 X86_64_INTEGER_CLASS,
2231 X86_64_INTEGERSI_CLASS,
2232 X86_64_SSE_CLASS,
2233 X86_64_SSESF_CLASS,
2234 X86_64_SSEDF_CLASS,
2235 X86_64_SSEUP_CLASS,
2236 X86_64_X87_CLASS,
2237 X86_64_X87UP_CLASS,
2238 X86_64_COMPLEX_X87_CLASS,
2239 X86_64_MEMORY_CLASS
2240 };
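/* Worked example (roughly following the psABI rules referred to above):
   struct { double d; long l; } occupies two eightbytes; the first is
   classified X86_64_SSEDF_CLASS and the second X86_64_INTEGER_CLASS, so
   the aggregate is passed in one SSE register and one integer register.  */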
2241
2242 #define MAX_CLASSES 4
2243
2244 /* Table of constants used by fldpi, fldln2, etc.... */
2245 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2246 static bool ext_80387_constants_init = 0;
2247
2248 \f
2249 static struct machine_function * ix86_init_machine_status (void);
2250 static rtx ix86_function_value (const_tree, const_tree, bool);
2251 static bool ix86_function_value_regno_p (const unsigned int);
2252 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2253 const_tree);
2254 static rtx ix86_static_chain (const_tree, bool);
2255 static int ix86_function_regparm (const_tree, const_tree);
2256 static void ix86_compute_frame_layout (struct ix86_frame *);
2257 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2258 rtx, rtx, int);
2259 static void ix86_add_new_builtins (HOST_WIDE_INT);
2260 static tree ix86_canonical_va_list_type (tree);
2261 static void predict_jump (int);
2262 static unsigned int split_stack_prologue_scratch_regno (void);
2263 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2264
2265 enum ix86_function_specific_strings
2266 {
2267 IX86_FUNCTION_SPECIFIC_ARCH,
2268 IX86_FUNCTION_SPECIFIC_TUNE,
2269 IX86_FUNCTION_SPECIFIC_MAX
2270 };
2271
2272 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2273 const char *, enum fpmath_unit, bool);
2274 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2275 static void ix86_function_specific_save (struct cl_target_option *);
2276 static void ix86_function_specific_restore (struct cl_target_option *);
2277 static void ix86_function_specific_print (FILE *, int,
2278 struct cl_target_option *);
2279 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2280 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2281 struct gcc_options *);
2282 static bool ix86_can_inline_p (tree, tree);
2283 static void ix86_set_current_function (tree);
2284 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2285
2286 static enum calling_abi ix86_function_abi (const_tree);
2287
2288 \f
2289 #ifndef SUBTARGET32_DEFAULT_CPU
2290 #define SUBTARGET32_DEFAULT_CPU "i386"
2291 #endif
2292
2293 /* The svr4 ABI for the i386 says that records and unions are returned
2294 in memory. */
2295 #ifndef DEFAULT_PCC_STRUCT_RETURN
2296 #define DEFAULT_PCC_STRUCT_RETURN 1
2297 #endif
2298
2299 /* Whether -mtune= or -march= were specified */
2300 static int ix86_tune_defaulted;
2301 static int ix86_arch_specified;
2302
2303 /* Vectorization library interface and handlers. */
2304 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2305
2306 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2307 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2308
2309 /* Processor target table, indexed by processor number */
2310 struct ptt
2311 {
2312 const struct processor_costs *cost; /* Processor costs */
2313 const int align_loop; /* Default alignments. */
2314 const int align_loop_max_skip;
2315 const int align_jump;
2316 const int align_jump_max_skip;
2317 const int align_func;
2318 };
2319
2320 static const struct ptt processor_target_table[PROCESSOR_max] =
2321 {
2322 {&i386_cost, 4, 3, 4, 3, 4},
2323 {&i486_cost, 16, 15, 16, 15, 16},
2324 {&pentium_cost, 16, 7, 16, 7, 16},
2325 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2326 {&geode_cost, 0, 0, 0, 0, 0},
2327 {&k6_cost, 32, 7, 32, 7, 32},
2328 {&athlon_cost, 16, 7, 16, 7, 16},
2329 {&pentium4_cost, 0, 0, 0, 0, 0},
2330 {&k8_cost, 16, 7, 16, 7, 16},
2331 {&nocona_cost, 0, 0, 0, 0, 0},
2332 /* Core 2 32-bit. */
2333 {&generic32_cost, 16, 10, 16, 10, 16},
2334 /* Core 2 64-bit. */
2335 {&generic64_cost, 16, 10, 16, 10, 16},
2336 /* Core i7 32-bit. */
2337 {&generic32_cost, 16, 10, 16, 10, 16},
2338 /* Core i7 64-bit. */
2339 {&generic64_cost, 16, 10, 16, 10, 16},
2340 {&generic32_cost, 16, 7, 16, 7, 16},
2341 {&generic64_cost, 16, 10, 16, 10, 16},
2342 {&amdfam10_cost, 32, 24, 32, 7, 32},
2343 {&bdver1_cost, 32, 24, 32, 7, 32},
2344 {&bdver2_cost, 32, 24, 32, 7, 32},
2345 {&bdver3_cost, 32, 24, 32, 7, 32},
2346 {&btver1_cost, 32, 24, 32, 7, 32},
2347 {&btver2_cost, 32, 24, 32, 7, 32},
2348 {&atom_cost, 16, 15, 16, 7, 16}
2349 };
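/* Reading the table above: the fields follow struct ptt, so the k6 entry
   {&k6_cost, 32, 7, 32, 7, 32} selects the K6 cost table, aligns loops and
   jump targets to 32 bytes when at most 7 bytes of padding are needed, and
   aligns functions to 32 bytes.  */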
2350
2351 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2352 {
2353 "generic",
2354 "i386",
2355 "i486",
2356 "pentium",
2357 "pentium-mmx",
2358 "pentiumpro",
2359 "pentium2",
2360 "pentium3",
2361 "pentium4",
2362 "pentium-m",
2363 "prescott",
2364 "nocona",
2365 "core2",
2366 "corei7",
2367 "atom",
2368 "geode",
2369 "k6",
2370 "k6-2",
2371 "k6-3",
2372 "athlon",
2373 "athlon-4",
2374 "k8",
2375 "amdfam10",
2376 "bdver1",
2377 "bdver2",
2378 "bdver3",
2379 "btver1",
2380 "btver2"
2381 };
2382 \f
2383 static bool
2384 gate_insert_vzeroupper (void)
2385 {
2386 return TARGET_VZEROUPPER;
2387 }
2388
2389 static unsigned int
2390 rest_of_handle_insert_vzeroupper (void)
2391 {
2392 int i;
2393
2394 /* vzeroupper instructions are inserted immediately after reload to
2395 account for possible spills from 256bit registers.  The pass
2396 reuses the mode switching infrastructure by re-running the mode
2397 insertion pass, so disable entities that have already been processed. */
2398 for (i = 0; i < MAX_386_ENTITIES; i++)
2399 ix86_optimize_mode_switching[i] = 0;
2400
2401 ix86_optimize_mode_switching[AVX_U128] = 1;
2402
2403 /* Call optimize_mode_switching. */
2404 pass_mode_switching.pass.execute ();
2405 return 0;
2406 }
2407
2408 struct rtl_opt_pass pass_insert_vzeroupper =
2409 {
2410 {
2411 RTL_PASS,
2412 "vzeroupper", /* name */
2413 OPTGROUP_NONE, /* optinfo_flags */
2414 gate_insert_vzeroupper, /* gate */
2415 rest_of_handle_insert_vzeroupper, /* execute */
2416 NULL, /* sub */
2417 NULL, /* next */
2418 0, /* static_pass_number */
2419 TV_NONE, /* tv_id */
2420 0, /* properties_required */
2421 0, /* properties_provided */
2422 0, /* properties_destroyed */
2423 0, /* todo_flags_start */
2424 TODO_df_finish | TODO_verify_rtl_sharing |
2425 0, /* todo_flags_finish */
2426 }
2427 };
2428
2429 /* Return true if a red-zone is in use. */
2430
2431 static inline bool
2432 ix86_using_red_zone (void)
2433 {
2434 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2435 }
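/* (Background: the red zone is the 128-byte area below the stack pointer
   that the SysV x86-64 ABI guarantees will not be clobbered by signal or
   interrupt handlers, so leaf code can use it without adjusting %rsp; the
   MS ABI provides no such area, hence the check above.)  */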
2436 \f
2437 /* Return a string that documents the current -m options. The caller is
2438 responsible for freeing the string. */
2439
2440 static char *
2441 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2442 const char *tune, enum fpmath_unit fpmath,
2443 bool add_nl_p)
2444 {
2445 struct ix86_target_opts
2446 {
2447 const char *option; /* option string */
2448 HOST_WIDE_INT mask; /* isa mask options */
2449 };
2450
2451 /* This table is ordered so that options like -msse4.2 that imply
2452 preceding options are matched first. */
2453 static struct ix86_target_opts isa_opts[] =
2454 {
2455 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2456 { "-mfma", OPTION_MASK_ISA_FMA },
2457 { "-mxop", OPTION_MASK_ISA_XOP },
2458 { "-mlwp", OPTION_MASK_ISA_LWP },
2459 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2460 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2461 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2462 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2463 { "-msse3", OPTION_MASK_ISA_SSE3 },
2464 { "-msse2", OPTION_MASK_ISA_SSE2 },
2465 { "-msse", OPTION_MASK_ISA_SSE },
2466 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2467 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2468 { "-mmmx", OPTION_MASK_ISA_MMX },
2469 { "-mabm", OPTION_MASK_ISA_ABM },
2470 { "-mbmi", OPTION_MASK_ISA_BMI },
2471 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2472 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2473 { "-mhle", OPTION_MASK_ISA_HLE },
2474 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2475 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2476 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2477 { "-madx", OPTION_MASK_ISA_ADX },
2478 { "-mtbm", OPTION_MASK_ISA_TBM },
2479 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2480 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2481 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2482 { "-maes", OPTION_MASK_ISA_AES },
2483 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2484 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2485 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2486 { "-mf16c", OPTION_MASK_ISA_F16C },
2487 { "-mrtm", OPTION_MASK_ISA_RTM },
2488 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2489 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2490 };
2491
2492 /* Flag options. */
2493 static struct ix86_target_opts flag_opts[] =
2494 {
2495 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2496 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2497 { "-m80387", MASK_80387 },
2498 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2499 { "-malign-double", MASK_ALIGN_DOUBLE },
2500 { "-mcld", MASK_CLD },
2501 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2502 { "-mieee-fp", MASK_IEEE_FP },
2503 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2504 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2505 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2506 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2507 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2508 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2509 { "-mno-red-zone", MASK_NO_RED_ZONE },
2510 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2511 { "-mrecip", MASK_RECIP },
2512 { "-mrtd", MASK_RTD },
2513 { "-msseregparm", MASK_SSEREGPARM },
2514 { "-mstack-arg-probe", MASK_STACK_PROBE },
2515 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2516 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2517 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2518 { "-mvzeroupper", MASK_VZEROUPPER },
2519 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2520 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2521 { "-mprefer-avx128", MASK_PREFER_AVX128},
2522 };
2523
2524 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2525
2526 char isa_other[40];
2527 char target_other[40];
2528 unsigned num = 0;
2529 unsigned i, j;
2530 char *ret;
2531 char *ptr;
2532 size_t len;
2533 size_t line_len;
2534 size_t sep_len;
2535 const char *abi;
2536
2537 memset (opts, '\0', sizeof (opts));
2538
2539 /* Add -march= option. */
2540 if (arch)
2541 {
2542 opts[num][0] = "-march=";
2543 opts[num++][1] = arch;
2544 }
2545
2546 /* Add -mtune= option. */
2547 if (tune)
2548 {
2549 opts[num][0] = "-mtune=";
2550 opts[num++][1] = tune;
2551 }
2552
2553 /* Add -m32/-m64/-mx32. */
2554 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2555 {
2556 if ((isa & OPTION_MASK_ABI_64) != 0)
2557 abi = "-m64";
2558 else
2559 abi = "-mx32";
2560 isa &= ~ (OPTION_MASK_ISA_64BIT
2561 | OPTION_MASK_ABI_64
2562 | OPTION_MASK_ABI_X32);
2563 }
2564 else
2565 abi = "-m32";
2566 opts[num++][0] = abi;
2567
2568 /* Pick out the options in isa options. */
2569 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2570 {
2571 if ((isa & isa_opts[i].mask) != 0)
2572 {
2573 opts[num++][0] = isa_opts[i].option;
2574 isa &= ~ isa_opts[i].mask;
2575 }
2576 }
2577
2578 if (isa && add_nl_p)
2579 {
2580 opts[num++][0] = isa_other;
2581 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2582 isa);
2583 }
2584
2585 /* Add flag options. */
2586 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2587 {
2588 if ((flags & flag_opts[i].mask) != 0)
2589 {
2590 opts[num++][0] = flag_opts[i].option;
2591 flags &= ~ flag_opts[i].mask;
2592 }
2593 }
2594
2595 if (flags && add_nl_p)
2596 {
2597 opts[num++][0] = target_other;
2598 sprintf (target_other, "(other flags: %#x)", flags);
2599 }
2600
2601 /* Add -fpmath= option. */
2602 if (fpmath)
2603 {
2604 opts[num][0] = "-mfpmath=";
2605 switch ((int) fpmath)
2606 {
2607 case FPMATH_387:
2608 opts[num++][1] = "387";
2609 break;
2610
2611 case FPMATH_SSE:
2612 opts[num++][1] = "sse";
2613 break;
2614
2615 case FPMATH_387 | FPMATH_SSE:
2616 opts[num++][1] = "sse+387";
2617 break;
2618
2619 default:
2620 gcc_unreachable ();
2621 }
2622 }
2623
2624 /* Any options? */
2625 if (num == 0)
2626 return NULL;
2627
2628 gcc_assert (num < ARRAY_SIZE (opts));
2629
2630 /* Size the string. */
2631 len = 0;
2632 sep_len = (add_nl_p) ? 3 : 1;
2633 for (i = 0; i < num; i++)
2634 {
2635 len += sep_len;
2636 for (j = 0; j < 2; j++)
2637 if (opts[i][j])
2638 len += strlen (opts[i][j]);
2639 }
2640
2641 /* Build the string. */
2642 ret = ptr = (char *) xmalloc (len);
2643 line_len = 0;
2644
2645 for (i = 0; i < num; i++)
2646 {
2647 size_t len2[2];
2648
2649 for (j = 0; j < 2; j++)
2650 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2651
2652 if (i != 0)
2653 {
2654 *ptr++ = ' ';
2655 line_len++;
2656
2657 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2658 {
2659 *ptr++ = '\\';
2660 *ptr++ = '\n';
2661 line_len = 0;
2662 }
2663 }
2664
2665 for (j = 0; j < 2; j++)
2666 if (opts[i][j])
2667 {
2668 memcpy (ptr, opts[i][j], len2[j]);
2669 ptr += len2[j];
2670 line_len += len2[j];
2671 }
2672 }
2673
2674 *ptr = '\0';
2675 gcc_assert (ret + len >= ptr);
2676
2677 return ret;
2678 }
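/* For example (a sketch; the exact output depends on the active ISA and
   flag bits), a call such as

     ix86_target_string (ix86_isa_flags, target_flags, "corei7", "corei7",
			 FPMATH_SSE, true);

   might return a string along the lines of
   "-march=corei7 -mtune=corei7 -m64 -msse4.2 ... -mfpmath=sse".  */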
2679
2680 /* Return true if profiling code should be emitted before the
2681 prologue, and false otherwise.  This is the case when -mfentry is
2682 in effect (the "hot-patch" style of profiling entry). */
2683 static bool
2684 ix86_profile_before_prologue (void)
2685 {
2686 return flag_fentry != 0;
2687 }
2688
2689 /* Function that is callable from the debugger to print the current
2690 options. */
2691 void
2692 ix86_debug_options (void)
2693 {
2694 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2695 ix86_arch_string, ix86_tune_string,
2696 ix86_fpmath, true);
2697
2698 if (opts)
2699 {
2700 fprintf (stderr, "%s\n\n", opts);
2701 free (opts);
2702 }
2703 else
2704 fputs ("<no options>\n\n", stderr);
2705
2706 return;
2707 }
2708 \f
2709 /* Override various settings based on options. If MAIN_ARGS_P, the
2710 options are from the command line, otherwise they are from
2711 attributes. */
2712
2713 static void
2714 ix86_option_override_internal (bool main_args_p)
2715 {
2716 int i;
2717 unsigned int ix86_arch_mask, ix86_tune_mask;
2718 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2719 const char *prefix;
2720 const char *suffix;
2721 const char *sw;
2722
2723 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2724 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2725 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2726 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2727 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2728 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2729 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2730 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2731 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2732 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2733 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2734 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2735 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2736 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2737 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2738 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2739 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2740 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2741 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2742 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2743 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2744 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2745 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2746 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2747 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2748 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2749 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2750 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2751 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2752 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2753 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2754 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2755 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2756 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2757 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2758 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2759 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2760 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2761 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2762 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2763
2764 /* If this reaches 64, we need to widen the flags field of struct pta below. */
2765
2766 static struct pta
2767 {
2768 const char *const name; /* processor name or nickname. */
2769 const enum processor_type processor;
2770 const enum attr_cpu schedule;
2771 const unsigned HOST_WIDE_INT flags;
2772 }
2773 const processor_alias_table[] =
2774 {
2775 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2776 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2777 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2778 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2779 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2780 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2781 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2782 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2783 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2784 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2785 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2786 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2787 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2788 PTA_MMX | PTA_SSE | PTA_FXSR},
2789 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2790 PTA_MMX | PTA_SSE | PTA_FXSR},
2791 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2792 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2793 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2794 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2795 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2796 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2797 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2798 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2799 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2800 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2801 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2802 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2803 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2804 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2805 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2806 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2807 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_FXSR},
2808 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2809 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2810 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2811 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2812 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2813 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2814 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2815 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2816 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2817 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2818 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2819 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2820 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2821 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2822 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2823 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
2824 | PTA_XSAVEOPT},
2825 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2826 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2827 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2828 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2829 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2830 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2831 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2832 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2833 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2834 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2835 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2836 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2837 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2838 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2839 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2840 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2841 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2842 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2843 {"x86-64", PROCESSOR_K8, CPU_K8,
2844 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2845 {"k8", PROCESSOR_K8, CPU_K8,
2846 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2847 | PTA_SSE2 | PTA_NO_SAHF},
2848 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2849 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2850 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2851 {"opteron", PROCESSOR_K8, CPU_K8,
2852 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2853 | PTA_SSE2 | PTA_NO_SAHF},
2854 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2855 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2856 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2857 {"athlon64", PROCESSOR_K8, CPU_K8,
2858 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2859 | PTA_SSE2 | PTA_NO_SAHF},
2860 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2861 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2862 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2863 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2864 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2865 | PTA_SSE2 | PTA_NO_SAHF},
2866 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2867 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2868 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2869 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2870 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2871 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
2872 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2873 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2874 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2875 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2876 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2877 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2878 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2879 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2880 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2881 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2882 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2883 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2884 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2885 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2886 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2887 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2888 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2889 | PTA_XSAVEOPT},
2890 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2891 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2892 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
2893 | PTA_FXSR | PTA_XSAVE},
2894 {"btver2", PROCESSOR_BTVER2, CPU_GENERIC64,
2895 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2896 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
2897 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2898 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2899 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2900
2901 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2902 PTA_HLE /* flags are only used for -march switch. */ },
2903 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2904 PTA_64BIT
2905 | PTA_HLE /* flags are only used for -march switch. */ },
2906 };
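/* An illustrative sketch of how the PTA_* flags above are consumed later
   in this function: each flag carried by the selected -march entry is
   turned into the corresponding ISA option unless the user already set it
   explicitly, e.g.

     if (processor_alias_table[i].flags & PTA_SSE2
	 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
       ix86_isa_flags |= OPTION_MASK_ISA_SSE2;

   repeated for each ISA bit.  */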
2907
2908 /* -mrecip options. */
2909 static struct
2910 {
2911 const char *string; /* option name */
2912 unsigned int mask; /* mask bits to set */
2913 }
2914 const recip_options[] =
2915 {
2916 { "all", RECIP_MASK_ALL },
2917 { "none", RECIP_MASK_NONE },
2918 { "div", RECIP_MASK_DIV },
2919 { "sqrt", RECIP_MASK_SQRT },
2920 { "vec-div", RECIP_MASK_VEC_DIV },
2921 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
2922 };
2923
2924 int const pta_size = ARRAY_SIZE (processor_alias_table);
2925
2926 /* Set up prefix/suffix so the error messages refer to either the command
2927 line argument, or the attribute(target). */
2928 if (main_args_p)
2929 {
2930 prefix = "-m";
2931 suffix = "";
2932 sw = "switch";
2933 }
2934 else
2935 {
2936 prefix = "option(\"";
2937 suffix = "\")";
2938 sw = "attribute";
2939 }
2940
2941 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
2942 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
2943 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
2944 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
2945 #ifdef TARGET_BI_ARCH
2946 else
2947 {
2948 #if TARGET_BI_ARCH == 1
2949 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
2950 is on and OPTION_MASK_ABI_X32 is off. We turn off
2951 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
2952 -mx32. */
2953 if (TARGET_X32)
2954 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
2955 #else
2956 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
2957 on and OPTION_MASK_ABI_64 is off. We turn off
2958 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
2959 -m64. */
2960 if (TARGET_LP64)
2961 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
2962 #endif
2963 }
2964 #endif
2965
2966 if (TARGET_X32)
2967 {
2968 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
2969 OPTION_MASK_ABI_64 for TARGET_X32. */
2970 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
2971 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
2972 }
2973 else if (TARGET_LP64)
2974 {
2975 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
2976 OPTION_MASK_ABI_X32 for TARGET_LP64. */
2977 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
2978 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
2979 }
2980
2981 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2982 SUBTARGET_OVERRIDE_OPTIONS;
2983 #endif
2984
2985 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2986 SUBSUBTARGET_OVERRIDE_OPTIONS;
2987 #endif
2988
2989 /* -fPIC is the default for x86_64. */
2990 if (TARGET_MACHO && TARGET_64BIT)
2991 flag_pic = 2;
2992
2993 /* Need to check -mtune=generic first. */
2994 if (ix86_tune_string)
2995 {
2996 if (!strcmp (ix86_tune_string, "generic")
2997 || !strcmp (ix86_tune_string, "i686")
2998 	     /* As special support for cross compilers we read -mtune=native
2999 		as -mtune=generic.  With native compilers we won't see
3000 		-mtune=native, as it will have been rewritten by the driver. */
3001 || !strcmp (ix86_tune_string, "native"))
3002 {
3003 if (TARGET_64BIT)
3004 ix86_tune_string = "generic64";
3005 else
3006 ix86_tune_string = "generic32";
3007 }
3008 /* If this call is for setting the option attribute, allow the
3009 generic32/generic64 that was previously set. */
3010 else if (!main_args_p
3011 && (!strcmp (ix86_tune_string, "generic32")
3012 || !strcmp (ix86_tune_string, "generic64")))
3013 ;
3014 else if (!strncmp (ix86_tune_string, "generic", 7))
3015 error ("bad value (%s) for %stune=%s %s",
3016 ix86_tune_string, prefix, suffix, sw);
3017 else if (!strcmp (ix86_tune_string, "x86-64"))
3018 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3019 "%stune=k8%s or %stune=generic%s instead as appropriate",
3020 prefix, suffix, prefix, suffix, prefix, suffix);
3021 }
3022 else
3023 {
3024 if (ix86_arch_string)
3025 ix86_tune_string = ix86_arch_string;
3026 if (!ix86_tune_string)
3027 {
3028 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3029 ix86_tune_defaulted = 1;
3030 }
3031
3032 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3033 need to use a sensible tune option. */
3034 if (!strcmp (ix86_tune_string, "generic")
3035 || !strcmp (ix86_tune_string, "x86-64")
3036 || !strcmp (ix86_tune_string, "i686"))
3037 {
3038 if (TARGET_64BIT)
3039 ix86_tune_string = "generic64";
3040 else
3041 ix86_tune_string = "generic32";
3042 }
3043 }
3044
3045 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3046 {
3047 /* rep; movq isn't available in 32-bit code. */
3048 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3049 ix86_stringop_alg = no_stringop;
3050 }
3051
3052 if (!ix86_arch_string)
3053 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3054 else
3055 ix86_arch_specified = 1;
3056
3057 if (global_options_set.x_ix86_pmode)
3058 {
3059 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3060 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3061 error ("address mode %qs not supported in the %s bit mode",
3062 TARGET_64BIT ? "short" : "long",
3063 TARGET_64BIT ? "64" : "32");
3064 }
3065 else
3066 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3067
3068 if (!global_options_set.x_ix86_abi)
3069 ix86_abi = DEFAULT_ABI;
3070
3071 if (global_options_set.x_ix86_cmodel)
3072 {
3073 switch (ix86_cmodel)
3074 {
3075 case CM_SMALL:
3076 case CM_SMALL_PIC:
3077 if (flag_pic)
3078 ix86_cmodel = CM_SMALL_PIC;
3079 if (!TARGET_64BIT)
3080 error ("code model %qs not supported in the %s bit mode",
3081 "small", "32");
3082 break;
3083
3084 case CM_MEDIUM:
3085 case CM_MEDIUM_PIC:
3086 if (flag_pic)
3087 ix86_cmodel = CM_MEDIUM_PIC;
3088 if (!TARGET_64BIT)
3089 error ("code model %qs not supported in the %s bit mode",
3090 "medium", "32");
3091 else if (TARGET_X32)
3092 error ("code model %qs not supported in x32 mode",
3093 "medium");
3094 break;
3095
3096 case CM_LARGE:
3097 case CM_LARGE_PIC:
3098 if (flag_pic)
3099 ix86_cmodel = CM_LARGE_PIC;
3100 if (!TARGET_64BIT)
3101 error ("code model %qs not supported in the %s bit mode",
3102 "large", "32");
3103 else if (TARGET_X32)
3104 error ("code model %qs not supported in x32 mode",
3105 "large");
3106 break;
3107
3108 case CM_32:
3109 if (flag_pic)
3110 error ("code model %s does not support PIC mode", "32");
3111 if (TARGET_64BIT)
3112 error ("code model %qs not supported in the %s bit mode",
3113 "32", "64");
3114 break;
3115
3116 case CM_KERNEL:
3117 if (flag_pic)
3118 {
3119 error ("code model %s does not support PIC mode", "kernel");
3120 ix86_cmodel = CM_32;
3121 }
3122 if (!TARGET_64BIT)
3123 error ("code model %qs not supported in the %s bit mode",
3124 "kernel", "32");
3125 break;
3126
3127 default:
3128 gcc_unreachable ();
3129 }
3130 }
3131 else
3132 {
3133 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3134 use of rip-relative addressing. This eliminates fixups that
3135 would otherwise be needed if this object is to be placed in a
3136 DLL, and is essentially just as efficient as direct addressing. */
3137 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3138 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3139 else if (TARGET_64BIT)
3140 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3141 else
3142 ix86_cmodel = CM_32;
3143 }
3144 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3145 {
3146 error ("-masm=intel not supported in this configuration");
3147 ix86_asm_dialect = ASM_ATT;
3148 }
3149 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3150 sorry ("%i-bit mode not compiled in",
3151 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3152
3153 for (i = 0; i < pta_size; i++)
3154 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3155 {
3156 ix86_schedule = processor_alias_table[i].schedule;
3157 ix86_arch = processor_alias_table[i].processor;
3158 /* Default cpu tuning to the architecture. */
3159 ix86_tune = ix86_arch;
3160
3161 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3162 error ("CPU you selected does not support x86-64 "
3163 "instruction set");
3164
3165 if (processor_alias_table[i].flags & PTA_MMX
3166 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3167 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3168 if (processor_alias_table[i].flags & PTA_3DNOW
3169 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3170 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3171 if (processor_alias_table[i].flags & PTA_3DNOW_A
3172 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3173 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3174 if (processor_alias_table[i].flags & PTA_SSE
3175 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3176 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3177 if (processor_alias_table[i].flags & PTA_SSE2
3178 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3179 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3180 if (processor_alias_table[i].flags & PTA_SSE3
3181 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3182 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3183 if (processor_alias_table[i].flags & PTA_SSSE3
3184 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3185 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3186 if (processor_alias_table[i].flags & PTA_SSE4_1
3187 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3188 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3189 if (processor_alias_table[i].flags & PTA_SSE4_2
3190 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3191 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3192 if (processor_alias_table[i].flags & PTA_AVX
3193 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3194 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3195 if (processor_alias_table[i].flags & PTA_AVX2
3196 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3197 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3198 if (processor_alias_table[i].flags & PTA_FMA
3199 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3200 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3201 if (processor_alias_table[i].flags & PTA_SSE4A
3202 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3203 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3204 if (processor_alias_table[i].flags & PTA_FMA4
3205 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3206 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3207 if (processor_alias_table[i].flags & PTA_XOP
3208 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3209 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3210 if (processor_alias_table[i].flags & PTA_LWP
3211 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3212 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3213 if (processor_alias_table[i].flags & PTA_ABM
3214 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3215 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3216 if (processor_alias_table[i].flags & PTA_BMI
3217 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3218 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3219 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3220 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3221 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3222 if (processor_alias_table[i].flags & PTA_TBM
3223 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3224 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3225 if (processor_alias_table[i].flags & PTA_BMI2
3226 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3227 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3228 if (processor_alias_table[i].flags & PTA_CX16
3229 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3230 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3231 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3232 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3233 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3234 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3235 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3236 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3237 if (processor_alias_table[i].flags & PTA_MOVBE
3238 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3239 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3240 if (processor_alias_table[i].flags & PTA_AES
3241 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3242 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3243 if (processor_alias_table[i].flags & PTA_PCLMUL
3244 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3245 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3246 if (processor_alias_table[i].flags & PTA_FSGSBASE
3247 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3248 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3249 if (processor_alias_table[i].flags & PTA_RDRND
3250 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3251 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3252 if (processor_alias_table[i].flags & PTA_F16C
3253 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3254 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3255 if (processor_alias_table[i].flags & PTA_RTM
3256 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3257 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3258 if (processor_alias_table[i].flags & PTA_HLE
3259 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3260 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3261 if (processor_alias_table[i].flags & PTA_PRFCHW
3262 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3263 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3264 if (processor_alias_table[i].flags & PTA_RDSEED
3265 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3266 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3267 if (processor_alias_table[i].flags & PTA_ADX
3268 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3269 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3270 if (processor_alias_table[i].flags & PTA_FXSR
3271 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3272 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3273 if (processor_alias_table[i].flags & PTA_XSAVE
3274 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3275 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3276 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3277 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3278 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3279 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3280 x86_prefetch_sse = true;
3281
3282 break;
3283 }
3284
3285 if (!strcmp (ix86_arch_string, "generic"))
3286 error ("generic CPU can be used only for %stune=%s %s",
3287 prefix, suffix, sw);
3288 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3289 error ("bad value (%s) for %sarch=%s %s",
3290 ix86_arch_string, prefix, suffix, sw);
3291
3292 ix86_arch_mask = 1u << ix86_arch;
3293 for (i = 0; i < X86_ARCH_LAST; ++i)
3294 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3295
3296 for (i = 0; i < pta_size; i++)
3297 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3298 {
3299 ix86_schedule = processor_alias_table[i].schedule;
3300 ix86_tune = processor_alias_table[i].processor;
3301 if (TARGET_64BIT)
3302 {
3303 if (!(processor_alias_table[i].flags & PTA_64BIT))
3304 {
3305 if (ix86_tune_defaulted)
3306 {
3307 ix86_tune_string = "x86-64";
3308 for (i = 0; i < pta_size; i++)
3309 if (! strcmp (ix86_tune_string,
3310 processor_alias_table[i].name))
3311 break;
3312 ix86_schedule = processor_alias_table[i].schedule;
3313 ix86_tune = processor_alias_table[i].processor;
3314 }
3315 else
3316 error ("CPU you selected does not support x86-64 "
3317 "instruction set");
3318 }
3319 }
3320 else
3321 {
3322 /* Adjust tuning when compiling for 32-bit ABI. */
3323 switch (ix86_tune)
3324 {
3325 case PROCESSOR_GENERIC64:
3326 ix86_tune = PROCESSOR_GENERIC32;
3327 ix86_schedule = CPU_PENTIUMPRO;
3328 break;
3329
3330 case PROCESSOR_CORE2_64:
3331 ix86_tune = PROCESSOR_CORE2_32;
3332 break;
3333
3334 case PROCESSOR_COREI7_64:
3335 ix86_tune = PROCESSOR_COREI7_32;
3336 break;
3337
3338 default:
3339 break;
3340 }
3341 }
3342 /* Intel CPUs have always interpreted SSE prefetch instructions as
3343 NOPs; so, we can enable SSE prefetch instructions even when
3344 -mtune (rather than -march) points us to a processor that has them.
3345 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3346 higher processors. */
3347 if (TARGET_CMOV
3348 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3349 x86_prefetch_sse = true;
3350 break;
3351 }
3352
3353 if (ix86_tune_specified && i == pta_size)
3354 error ("bad value (%s) for %stune=%s %s",
3355 ix86_tune_string, prefix, suffix, sw);
3356
3357 ix86_tune_mask = 1u << ix86_tune;
3358 for (i = 0; i < X86_TUNE_LAST; ++i)
3359 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
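  /* A sketch of the lookup above: each entry of initial_ix86_tune_features[]
     is a bitmask over processors, so with, say, ix86_tune == PROCESSOR_K8 a
     feature is enabled iff the (1u << PROCESSOR_K8) bit is set in that entry.
     (PROCESSOR_K8 is only an illustrative choice.)  */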
3360
3361 #ifndef USE_IX86_FRAME_POINTER
3362 #define USE_IX86_FRAME_POINTER 0
3363 #endif
3364
3365 #ifndef USE_X86_64_FRAME_POINTER
3366 #define USE_X86_64_FRAME_POINTER 0
3367 #endif
3368
3369 /* Set the default values for switches whose default depends on TARGET_64BIT
3370 in case they weren't overwritten by command line options. */
3371 if (TARGET_64BIT)
3372 {
3373 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3374 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3375 if (flag_asynchronous_unwind_tables == 2)
3376 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3377 if (flag_pcc_struct_return == 2)
3378 flag_pcc_struct_return = 0;
3379 }
3380 else
3381 {
3382 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3383 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3384 if (flag_asynchronous_unwind_tables == 2)
3385 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3386 if (flag_pcc_struct_return == 2)
3387 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3388 }
3389
3390 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3391 if (optimize_size)
3392 ix86_cost = &ix86_size_cost;
3393 else
3394 ix86_cost = ix86_tune_cost;
3395
3396 /* Arrange to set up i386_stack_locals for all functions. */
3397 init_machine_status = ix86_init_machine_status;
3398
3399 /* Validate -mregparm= value. */
3400 if (global_options_set.x_ix86_regparm)
3401 {
3402 if (TARGET_64BIT)
3403 warning (0, "-mregparm is ignored in 64-bit mode");
3404 if (ix86_regparm > REGPARM_MAX)
3405 {
3406 error ("-mregparm=%d is not between 0 and %d",
3407 ix86_regparm, REGPARM_MAX);
3408 ix86_regparm = 0;
3409 }
3410 }
3411 if (TARGET_64BIT)
3412 ix86_regparm = REGPARM_MAX;
3413
3414 /* Default align_* from the processor table. */
3415 if (align_loops == 0)
3416 {
3417 align_loops = processor_target_table[ix86_tune].align_loop;
3418 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3419 }
3420 if (align_jumps == 0)
3421 {
3422 align_jumps = processor_target_table[ix86_tune].align_jump;
3423 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3424 }
3425 if (align_functions == 0)
3426 {
3427 align_functions = processor_target_table[ix86_tune].align_func;
3428 }
3429
3430 /* Provide default for -mbranch-cost= value. */
3431 if (!global_options_set.x_ix86_branch_cost)
3432 ix86_branch_cost = ix86_cost->branch_cost;
3433
3434 if (TARGET_64BIT)
3435 {
3436 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3437
3438 /* Enable by default the SSE and MMX builtins. Do allow the user to
3439 explicitly disable any of these. In particular, disabling SSE and
3440 MMX for kernel code is extremely useful. */
3441 if (!ix86_arch_specified)
3442 ix86_isa_flags
3443 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3444 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3445
3446 if (TARGET_RTD)
3447 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3448 }
3449 else
3450 {
3451 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3452
3453 if (!ix86_arch_specified)
3454 ix86_isa_flags
3455 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3456
3457 	  /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3458 	     when the programmer takes care to keep the stack from being destroyed. */
3459 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3460 target_flags |= MASK_NO_RED_ZONE;
3461 }
3462
3463 /* Keep nonleaf frame pointers. */
3464 if (flag_omit_frame_pointer)
3465 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3466 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3467 flag_omit_frame_pointer = 1;
3468
3469 /* If we're doing fast math, we don't care about comparison order
3470 wrt NaNs. This lets us use a shorter comparison sequence. */
3471 if (flag_finite_math_only)
3472 target_flags &= ~MASK_IEEE_FP;
3473
3474 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3475 since the insns won't need emulation. */
3476 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3477 target_flags &= ~MASK_NO_FANCY_MATH_387;
3478
3479 /* Likewise, if the target doesn't have a 387, or we've specified
3480 software floating point, don't use 387 inline intrinsics. */
3481 if (!TARGET_80387)
3482 target_flags |= MASK_NO_FANCY_MATH_387;
3483
3484 /* Turn on MMX builtins for -msse. */
3485 if (TARGET_SSE)
3486 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3487
3488 /* Enable SSE prefetch. */
3489 if (TARGET_SSE || TARGET_PRFCHW)
3490 x86_prefetch_sse = true;
3491
3492 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
3493 if (TARGET_SSE4_2 || TARGET_ABM)
3494 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3495
3496 /* Turn on lzcnt instruction for -mabm. */
3497 if (TARGET_ABM)
3498 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3499
3500 /* Validate -mpreferred-stack-boundary= value or default it to
3501 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3502 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3503 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3504 {
3505 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3506 int max = (TARGET_SEH ? 4 : 12);
3507
3508 if (ix86_preferred_stack_boundary_arg < min
3509 || ix86_preferred_stack_boundary_arg > max)
3510 {
3511 if (min == max)
3512 error ("-mpreferred-stack-boundary is not supported "
3513 "for this target");
3514 else
3515 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3516 ix86_preferred_stack_boundary_arg, min, max);
3517 }
3518 else
3519 ix86_preferred_stack_boundary
3520 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3521 }
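  /* Worked example (assuming BITS_PER_UNIT == 8, as on this port):
     -mpreferred-stack-boundary=4 gives (1 << 4) * 8 = 128 bits,
     i.e. a 16-byte preferred stack alignment.  */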
3522
3523 /* Set the default value for -mstackrealign. */
3524 if (ix86_force_align_arg_pointer == -1)
3525 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3526
3527 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3528
3529 /* Validate -mincoming-stack-boundary= value or default it to
3530 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3531 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3532 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3533 {
3534 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3535 || ix86_incoming_stack_boundary_arg > 12)
3536 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3537 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3538 else
3539 {
3540 ix86_user_incoming_stack_boundary
3541 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3542 ix86_incoming_stack_boundary
3543 = ix86_user_incoming_stack_boundary;
3544 }
3545 }
3546
3547 /* Accept -msseregparm only if at least SSE support is enabled. */
3548 if (TARGET_SSEREGPARM
3549 && ! TARGET_SSE)
3550 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3551
3552 if (global_options_set.x_ix86_fpmath)
3553 {
3554 if (ix86_fpmath & FPMATH_SSE)
3555 {
3556 if (!TARGET_SSE)
3557 {
3558 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3559 ix86_fpmath = FPMATH_387;
3560 }
3561 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3562 {
3563 warning (0, "387 instruction set disabled, using SSE arithmetics");
3564 ix86_fpmath = FPMATH_SSE;
3565 }
3566 }
3567 }
3568 else
3569 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3570
3571 /* If the i387 is disabled, then do not return values in it. */
3572 if (!TARGET_80387)
3573 target_flags &= ~MASK_FLOAT_RETURNS;
3574
3575 	  /* Use an external vectorized library for vectorizing intrinsics. */
3576 if (global_options_set.x_ix86_veclibabi_type)
3577 switch (ix86_veclibabi_type)
3578 {
3579 case ix86_veclibabi_type_svml:
3580 ix86_veclib_handler = ix86_veclibabi_svml;
3581 break;
3582
3583 case ix86_veclibabi_type_acml:
3584 ix86_veclib_handler = ix86_veclibabi_acml;
3585 break;
3586
3587 default:
3588 gcc_unreachable ();
3589 }
3590
3591 if ((!USE_IX86_FRAME_POINTER
3592 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3593 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3594 && !optimize_size)
3595 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3596
3597 /* ??? Unwind info is not correct around the CFG unless either a frame
3598 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3599 unwind info generation to be aware of the CFG and propagating states
3600 around edges. */
3601 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3602 || flag_exceptions || flag_non_call_exceptions)
3603 && flag_omit_frame_pointer
3604 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3605 {
3606 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3607 warning (0, "unwind tables currently require either a frame pointer "
3608 "or %saccumulate-outgoing-args%s for correctness",
3609 prefix, suffix);
3610 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3611 }
3612
3613 /* If stack probes are required, the space used for large function
3614 arguments on the stack must also be probed, so enable
3615 -maccumulate-outgoing-args so this happens in the prologue. */
3616 if (TARGET_STACK_PROBE
3617 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3618 {
3619 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3620 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3621 "for correctness", prefix, suffix);
3622 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3623 }
3624
3625 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3626 {
3627 char *p;
3628 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3629 p = strchr (internal_label_prefix, 'X');
3630 internal_label_prefix_len = p - internal_label_prefix;
3631 *p = '\0';
3632 }
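  /* For instance, on a typical ELF target ASM_GENERATE_INTERNAL_LABEL yields
     something like "*.LX0"; the code above then stores "*.L" and sets
     internal_label_prefix_len to 3.  (The exact string is target-specific.)  */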
3633
3634 	  /* When the scheduling description is not available, disable the scheduler
3635 	     pass so it won't slow down compilation and make x87 code slower. */
3636 if (!TARGET_SCHEDULE)
3637 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3638
3639 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3640 ix86_tune_cost->simultaneous_prefetches,
3641 global_options.x_param_values,
3642 global_options_set.x_param_values);
3643 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3644 ix86_tune_cost->prefetch_block,
3645 global_options.x_param_values,
3646 global_options_set.x_param_values);
3647 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3648 ix86_tune_cost->l1_cache_size,
3649 global_options.x_param_values,
3650 global_options_set.x_param_values);
3651 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3652 ix86_tune_cost->l2_cache_size,
3653 global_options.x_param_values,
3654 global_options_set.x_param_values);
3655
3656 	  /* Enable software prefetching at -O3 for CPUs where it is helpful. */
3657 if (flag_prefetch_loop_arrays < 0
3658 && HAVE_prefetch
3659 && (optimize >= 3 || flag_profile_use)
3660 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3661 flag_prefetch_loop_arrays = 1;
3662
3663 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3664 can be optimized to ap = __builtin_next_arg (0). */
3665 if (!TARGET_64BIT && !flag_split_stack)
3666 targetm.expand_builtin_va_start = NULL;
3667
3668 if (TARGET_64BIT)
3669 {
3670 ix86_gen_leave = gen_leave_rex64;
3671 if (Pmode == DImode)
3672 {
3673 ix86_gen_monitor = gen_sse3_monitor64_di;
3674 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3675 ix86_gen_tls_local_dynamic_base_64
3676 = gen_tls_local_dynamic_base_64_di;
3677 }
3678 else
3679 {
3680 ix86_gen_monitor = gen_sse3_monitor64_si;
3681 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3682 ix86_gen_tls_local_dynamic_base_64
3683 = gen_tls_local_dynamic_base_64_si;
3684 }
3685 }
3686 else
3687 {
3688 ix86_gen_leave = gen_leave;
3689 ix86_gen_monitor = gen_sse3_monitor;
3690 }
3691
3692 if (Pmode == DImode)
3693 {
3694 ix86_gen_add3 = gen_adddi3;
3695 ix86_gen_sub3 = gen_subdi3;
3696 ix86_gen_sub3_carry = gen_subdi3_carry;
3697 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3698 ix86_gen_andsp = gen_anddi3;
3699 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3700 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3701 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3702 }
3703 else
3704 {
3705 ix86_gen_add3 = gen_addsi3;
3706 ix86_gen_sub3 = gen_subsi3;
3707 ix86_gen_sub3_carry = gen_subsi3_carry;
3708 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3709 ix86_gen_andsp = gen_andsi3;
3710 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3711 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3712 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3713 }
3714
3715 #ifdef USE_IX86_CLD
3716 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3717 if (!TARGET_64BIT)
3718 target_flags |= MASK_CLD & ~target_flags_explicit;
3719 #endif
3720
3721 if (!TARGET_64BIT && flag_pic)
3722 {
3723 if (flag_fentry > 0)
3724 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3725 "with -fpic");
3726 flag_fentry = 0;
3727 }
3728 else if (TARGET_SEH)
3729 {
3730 if (flag_fentry == 0)
3731 sorry ("-mno-fentry isn%'t compatible with SEH");
3732 flag_fentry = 1;
3733 }
3734 else if (flag_fentry < 0)
3735 {
3736 #if defined(PROFILE_BEFORE_PROLOGUE)
3737 flag_fentry = 1;
3738 #else
3739 flag_fentry = 0;
3740 #endif
3741 }
3742
3743 if (TARGET_AVX)
3744 {
3745 	      /* When not optimizing for size, enable the vzeroupper optimization
3746 		 for TARGET_AVX with -fexpensive-optimizations and split 32-byte
3747 		 AVX unaligned loads/stores. */
3748 if (!optimize_size)
3749 {
3750 if (flag_expensive_optimizations
3751 && !(target_flags_explicit & MASK_VZEROUPPER))
3752 target_flags |= MASK_VZEROUPPER;
3753 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3754 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3755 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3756 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3757 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3758 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3759 /* Enable 128-bit AVX instruction generation
3760 for the auto-vectorizer. */
3761 if (TARGET_AVX128_OPTIMAL
3762 && !(target_flags_explicit & MASK_PREFER_AVX128))
3763 target_flags |= MASK_PREFER_AVX128;
3764 }
3765 }
3766 else
3767 {
3768 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3769 target_flags &= ~MASK_VZEROUPPER;
3770 }
3771
3772 if (ix86_recip_name)
3773 {
3774 char *p = ASTRDUP (ix86_recip_name);
3775 char *q;
3776 unsigned int mask, i;
3777 bool invert;
3778
3779 while ((q = strtok (p, ",")) != NULL)
3780 {
3781 p = NULL;
3782 if (*q == '!')
3783 {
3784 invert = true;
3785 q++;
3786 }
3787 else
3788 invert = false;
3789
3790 if (!strcmp (q, "default"))
3791 mask = RECIP_MASK_ALL;
3792 else
3793 {
3794 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3795 if (!strcmp (q, recip_options[i].string))
3796 {
3797 mask = recip_options[i].mask;
3798 break;
3799 }
3800
3801 if (i == ARRAY_SIZE (recip_options))
3802 {
3803 error ("unknown option for -mrecip=%s", q);
3804 invert = false;
3805 mask = RECIP_MASK_NONE;
3806 }
3807 }
3808
3809 recip_mask_explicit |= mask;
3810 if (invert)
3811 recip_mask &= ~mask;
3812 else
3813 recip_mask |= mask;
3814 }
3815 }
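  /* Example of the parsing above: -mrecip=vec-div,!sqrt sets
     RECIP_MASK_VEC_DIV in recip_mask and clears RECIP_MASK_SQRT, while
     recording both bits in recip_mask_explicit.  */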
3816
3817 if (TARGET_RECIP)
3818 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3819 else if (target_flags_explicit & MASK_RECIP)
3820 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3821
3822 /* Default long double to 64-bit for Bionic. */
3823 if (TARGET_HAS_BIONIC
3824 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3825 target_flags |= MASK_LONG_DOUBLE_64;
3826
3827 /* Save the initial options in case the user does function specific
3828 options. */
3829 if (main_args_p)
3830 target_option_default_node = target_option_current_node
3831 = build_target_option_node ();
3832 }
3833
3834 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3835
3836 static void
3837 ix86_option_override (void)
3838 {
3839 static struct register_pass_info insert_vzeroupper_info
3840 = { &pass_insert_vzeroupper.pass, "reload",
3841 1, PASS_POS_INSERT_AFTER
3842 };
3843
3844 ix86_option_override_internal (true);
3845
3846
3847 	  /* This needs to be done at startup.  It's convenient to do it here. */
3848 register_pass (&insert_vzeroupper_info);
3849 }
3850
3851 /* Update register usage after having seen the compiler flags. */
3852
3853 static void
3854 ix86_conditional_register_usage (void)
3855 {
3856 int i, c_mask;
3857 unsigned int j;
3858
3859 /* The PIC register, if it exists, is fixed. */
3860 j = PIC_OFFSET_TABLE_REGNUM;
3861 if (j != INVALID_REGNUM)
3862 fixed_regs[j] = call_used_regs[j] = 1;
3863
3864 /* For 32-bit targets, squash the REX registers. */
3865 if (! TARGET_64BIT)
3866 {
3867 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3868 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3869 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3870 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3871 }
3872
3873 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3874 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3875 : TARGET_64BIT ? (1 << 2)
3876 : (1 << 1));
3877
3878 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3879
3880 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3881 {
3882 /* Set/reset conditionally defined registers from
3883 CALL_USED_REGISTERS initializer. */
3884 if (call_used_regs[i] > 1)
3885 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3886
3887 	      /* Compute the CLOBBERED_REGS register set as the call-used
3888 		 registers from the GENERAL_REGS register set. */
3889 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3890 && call_used_regs[i])
3891 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3892 }
3893
3894 /* If MMX is disabled, squash the registers. */
3895 if (! TARGET_MMX)
3896 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3897 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3898 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3899
3900 /* If SSE is disabled, squash the registers. */
3901 if (! TARGET_SSE)
3902 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3903 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3904 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3905
3906 /* If the FPU is disabled, squash the registers. */
3907 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3908 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3909 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3910 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3911 }
3912
3913 \f
3914 /* Save the current options */
3915
3916 static void
3917 ix86_function_specific_save (struct cl_target_option *ptr)
3918 {
3919 ptr->arch = ix86_arch;
3920 ptr->schedule = ix86_schedule;
3921 ptr->tune = ix86_tune;
3922 ptr->branch_cost = ix86_branch_cost;
3923 ptr->tune_defaulted = ix86_tune_defaulted;
3924 ptr->arch_specified = ix86_arch_specified;
3925 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
3926 ptr->ix86_target_flags_explicit = target_flags_explicit;
3927 ptr->x_recip_mask_explicit = recip_mask_explicit;
3928
3929 /* The fields are char but the variables are not; make sure the
3930 values fit in the fields. */
3931 gcc_assert (ptr->arch == ix86_arch);
3932 gcc_assert (ptr->schedule == ix86_schedule);
3933 gcc_assert (ptr->tune == ix86_tune);
3934 gcc_assert (ptr->branch_cost == ix86_branch_cost);
3935 }
3936
3937 /* Restore the current options */
3938
3939 static void
3940 ix86_function_specific_restore (struct cl_target_option *ptr)
3941 {
3942 enum processor_type old_tune = ix86_tune;
3943 enum processor_type old_arch = ix86_arch;
3944 unsigned int ix86_arch_mask, ix86_tune_mask;
3945 int i;
3946
3947 ix86_arch = (enum processor_type) ptr->arch;
3948 ix86_schedule = (enum attr_cpu) ptr->schedule;
3949 ix86_tune = (enum processor_type) ptr->tune;
3950 ix86_branch_cost = ptr->branch_cost;
3951 ix86_tune_defaulted = ptr->tune_defaulted;
3952 ix86_arch_specified = ptr->arch_specified;
3953 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
3954 target_flags_explicit = ptr->ix86_target_flags_explicit;
3955 recip_mask_explicit = ptr->x_recip_mask_explicit;
3956
3957 /* Recreate the arch feature tests if the arch changed */
3958 if (old_arch != ix86_arch)
3959 {
3960 ix86_arch_mask = 1u << ix86_arch;
3961 for (i = 0; i < X86_ARCH_LAST; ++i)
3962 ix86_arch_features[i]
3963 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3964 }
3965
3966 /* Recreate the tune optimization tests */
3967 if (old_tune != ix86_tune)
3968 {
3969 ix86_tune_mask = 1u << ix86_tune;
3970 for (i = 0; i < X86_TUNE_LAST; ++i)
3971 ix86_tune_features[i]
3972 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3973 }
3974 }
3975
3976 /* Print the current options */
3977
3978 static void
3979 ix86_function_specific_print (FILE *file, int indent,
3980 struct cl_target_option *ptr)
3981 {
3982 char *target_string
3983 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
3984 NULL, NULL, ptr->x_ix86_fpmath, false);
3985
3986 fprintf (file, "%*sarch = %d (%s)\n",
3987 indent, "",
3988 ptr->arch,
3989 ((ptr->arch < TARGET_CPU_DEFAULT_max)
3990 ? cpu_names[ptr->arch]
3991 : "<unknown>"));
3992
3993 fprintf (file, "%*stune = %d (%s)\n",
3994 indent, "",
3995 ptr->tune,
3996 ((ptr->tune < TARGET_CPU_DEFAULT_max)
3997 ? cpu_names[ptr->tune]
3998 : "<unknown>"));
3999
4000 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4001
4002 if (target_string)
4003 {
4004 fprintf (file, "%*s%s\n", indent, "", target_string);
4005 free (target_string);
4006 }
4007 }
4008
4009 \f
4010 	/* Inner function to process the attribute((target(...))); it takes an
4011 	   argument and sets the current options from that argument.  If we have
4012 	   a list, recursively go over the list. */
4013
4014 static bool
4015 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4016 struct gcc_options *enum_opts_set)
4017 {
4018 char *next_optstr;
4019 bool ret = true;
4020
4021 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4022 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4023 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4024 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4025 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4026
4027 enum ix86_opt_type
4028 {
4029 ix86_opt_unknown,
4030 ix86_opt_yes,
4031 ix86_opt_no,
4032 ix86_opt_str,
4033 ix86_opt_enum,
4034 ix86_opt_isa
4035 };
4036
4037 static const struct
4038 {
4039 const char *string;
4040 size_t len;
4041 enum ix86_opt_type type;
4042 int opt;
4043 int mask;
4044 } attrs[] = {
4045 /* isa options */
4046 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4047 IX86_ATTR_ISA ("abm", OPT_mabm),
4048 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4049 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4050 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4051 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4052 IX86_ATTR_ISA ("aes", OPT_maes),
4053 IX86_ATTR_ISA ("avx", OPT_mavx),
4054 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4055 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4056 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4057 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4058 IX86_ATTR_ISA ("sse", OPT_msse),
4059 IX86_ATTR_ISA ("sse2", OPT_msse2),
4060 IX86_ATTR_ISA ("sse3", OPT_msse3),
4061 IX86_ATTR_ISA ("sse4", OPT_msse4),
4062 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4063 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4064 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4065 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4066 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4067 IX86_ATTR_ISA ("fma", OPT_mfma),
4068 IX86_ATTR_ISA ("xop", OPT_mxop),
4069 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4070 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4071 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4072 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4073 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4074 IX86_ATTR_ISA ("hle", OPT_mhle),
4075 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4076 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4077 IX86_ATTR_ISA ("adx", OPT_madx),
4078 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4079 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4080 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4081
4082 /* enum options */
4083 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4084
4085 /* string options */
4086 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4087 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4088
4089 /* flag options */
4090 IX86_ATTR_YES ("cld",
4091 OPT_mcld,
4092 MASK_CLD),
4093
4094 IX86_ATTR_NO ("fancy-math-387",
4095 OPT_mfancy_math_387,
4096 MASK_NO_FANCY_MATH_387),
4097
4098 IX86_ATTR_YES ("ieee-fp",
4099 OPT_mieee_fp,
4100 MASK_IEEE_FP),
4101
4102 IX86_ATTR_YES ("inline-all-stringops",
4103 OPT_minline_all_stringops,
4104 MASK_INLINE_ALL_STRINGOPS),
4105
4106 IX86_ATTR_YES ("inline-stringops-dynamically",
4107 OPT_minline_stringops_dynamically,
4108 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4109
4110 IX86_ATTR_NO ("align-stringops",
4111 OPT_mno_align_stringops,
4112 MASK_NO_ALIGN_STRINGOPS),
4113
4114 IX86_ATTR_YES ("recip",
4115 OPT_mrecip,
4116 MASK_RECIP),
4117
4118 };
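  /* As an illustration of how the table is used below:
     attribute((target("no-sse3,arch=core2"))) is split on the comma;
     "no-sse3" matches the ix86_opt_isa entry for "sse3" with the "no-"
     prefix clearing the ISA, while "arch=core2" matches the ix86_opt_str
     entry for "arch=".  ("core2" is only an example value.)  */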
4119
4120 /* If this is a list, recurse to get the options. */
4121 if (TREE_CODE (args) == TREE_LIST)
4122 {
4123 bool ret = true;
4124
4125 for (; args; args = TREE_CHAIN (args))
4126 if (TREE_VALUE (args)
4127 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4128 p_strings, enum_opts_set))
4129 ret = false;
4130
4131 return ret;
4132 }
4133
4134 else if (TREE_CODE (args) != STRING_CST)
4135 gcc_unreachable ();
4136
4137 /* Handle multiple arguments separated by commas. */
4138 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4139
4140 while (next_optstr && *next_optstr != '\0')
4141 {
4142 char *p = next_optstr;
4143 char *orig_p = p;
4144 char *comma = strchr (next_optstr, ',');
4145 const char *opt_string;
4146 size_t len, opt_len;
4147 int opt;
4148 bool opt_set_p;
4149 char ch;
4150 unsigned i;
4151 enum ix86_opt_type type = ix86_opt_unknown;
4152 int mask = 0;
4153
4154 if (comma)
4155 {
4156 *comma = '\0';
4157 len = comma - next_optstr;
4158 next_optstr = comma + 1;
4159 }
4160 else
4161 {
4162 len = strlen (p);
4163 next_optstr = NULL;
4164 }
4165
4166 /* Recognize no-xxx. */
4167 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4168 {
4169 opt_set_p = false;
4170 p += 3;
4171 len -= 3;
4172 }
4173 else
4174 opt_set_p = true;
4175
4176 /* Find the option. */
4177 ch = *p;
4178 opt = N_OPTS;
4179 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4180 {
4181 type = attrs[i].type;
4182 opt_len = attrs[i].len;
4183 if (ch == attrs[i].string[0]
4184 && ((type != ix86_opt_str && type != ix86_opt_enum)
4185 ? len == opt_len
4186 : len > opt_len)
4187 && memcmp (p, attrs[i].string, opt_len) == 0)
4188 {
4189 opt = attrs[i].opt;
4190 mask = attrs[i].mask;
4191 opt_string = attrs[i].string;
4192 break;
4193 }
4194 }
4195
4196 /* Process the option. */
4197 if (opt == N_OPTS)
4198 {
4199 error ("attribute(target(\"%s\")) is unknown", orig_p);
4200 ret = false;
4201 }
4202
4203 else if (type == ix86_opt_isa)
4204 {
4205 struct cl_decoded_option decoded;
4206
4207 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4208 ix86_handle_option (&global_options, &global_options_set,
4209 &decoded, input_location);
4210 }
4211
4212 else if (type == ix86_opt_yes || type == ix86_opt_no)
4213 {
4214 if (type == ix86_opt_no)
4215 opt_set_p = !opt_set_p;
4216
4217 if (opt_set_p)
4218 target_flags |= mask;
4219 else
4220 target_flags &= ~mask;
4221 }
4222
4223 else if (type == ix86_opt_str)
4224 {
4225 if (p_strings[opt])
4226 {
4227 error ("option(\"%s\") was already specified", opt_string);
4228 ret = false;
4229 }
4230 else
4231 p_strings[opt] = xstrdup (p + opt_len);
4232 }
4233
4234 else if (type == ix86_opt_enum)
4235 {
4236 bool arg_ok;
4237 int value;
4238
4239 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4240 if (arg_ok)
4241 set_option (&global_options, enum_opts_set, opt, value,
4242 p + opt_len, DK_UNSPECIFIED, input_location,
4243 global_dc);
4244 else
4245 {
4246 error ("attribute(target(\"%s\")) is unknown", orig_p);
4247 ret = false;
4248 }
4249 }
4250
4251 else
4252 gcc_unreachable ();
4253 }
4254
4255 return ret;
4256 }
4257
4258 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4259
4260 tree
4261 ix86_valid_target_attribute_tree (tree args)
4262 {
4263 const char *orig_arch_string = ix86_arch_string;
4264 const char *orig_tune_string = ix86_tune_string;
4265 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4266 int orig_tune_defaulted = ix86_tune_defaulted;
4267 int orig_arch_specified = ix86_arch_specified;
4268 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4269 tree t = NULL_TREE;
4270 int i;
4271 struct cl_target_option *def
4272 = TREE_TARGET_OPTION (target_option_default_node);
4273 struct gcc_options enum_opts_set;
4274
4275 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4276
4277 /* Process each of the options on the chain. */
4278 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4279 &enum_opts_set))
4280 return NULL_TREE;
4281
4282 /* If the changed options are different from the default, rerun
4283 ix86_option_override_internal, and then save the options away.
4284 	     The string options are attribute options, and will be undone
4285 when we copy the save structure. */
4286 if (ix86_isa_flags != def->x_ix86_isa_flags
4287 || target_flags != def->x_target_flags
4288 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4289 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4290 || enum_opts_set.x_ix86_fpmath)
4291 {
4292 /* If we are using the default tune= or arch=, undo the string assigned,
4293 and use the default. */
4294 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4295 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4296 else if (!orig_arch_specified)
4297 ix86_arch_string = NULL;
4298
4299 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4300 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4301 else if (orig_tune_defaulted)
4302 ix86_tune_string = NULL;
4303
4304 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4305 if (enum_opts_set.x_ix86_fpmath)
4306 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4307 else if (!TARGET_64BIT && TARGET_SSE)
4308 {
4309 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4310 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4311 }
4312
4313 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4314 ix86_option_override_internal (false);
4315
4316 /* Add any builtin functions with the new isa if any. */
4317 ix86_add_new_builtins (ix86_isa_flags);
4318
4319 /* Save the current options unless we are validating options for
4320 #pragma. */
4321 t = build_target_option_node ();
4322
4323 ix86_arch_string = orig_arch_string;
4324 ix86_tune_string = orig_tune_string;
4325 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4326
4327 /* Free up memory allocated to hold the strings */
4328 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4329 free (option_strings[i]);
4330 }
4331
4332 return t;
4333 }
4334
4335 /* Hook to validate attribute((target("string"))). */
4336
4337 static bool
4338 ix86_valid_target_attribute_p (tree fndecl,
4339 tree ARG_UNUSED (name),
4340 tree args,
4341 int ARG_UNUSED (flags))
4342 {
4343 struct cl_target_option cur_target;
4344 bool ret = true;
4345 tree old_optimize = build_optimization_node ();
4346 tree new_target, new_optimize;
4347 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4348
4349 /* If the function changed the optimization levels as well as setting target
4350 options, start with the optimizations specified. */
4351 if (func_optimize && func_optimize != old_optimize)
4352 cl_optimization_restore (&global_options,
4353 TREE_OPTIMIZATION (func_optimize));
4354
4355 /* The target attributes may also change some optimization flags, so update
4356 the optimization options if necessary. */
4357 cl_target_option_save (&cur_target, &global_options);
4358 new_target = ix86_valid_target_attribute_tree (args);
4359 new_optimize = build_optimization_node ();
4360
4361 if (!new_target)
4362 ret = false;
4363
4364 else if (fndecl)
4365 {
4366 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4367
4368 if (old_optimize != new_optimize)
4369 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4370 }
4371
4372 cl_target_option_restore (&global_options, &cur_target);
4373
4374 if (old_optimize != new_optimize)
4375 cl_optimization_restore (&global_options,
4376 TREE_OPTIMIZATION (old_optimize));
4377
4378 return ret;
4379 }
4380
4381 \f
4382 /* Hook to determine if one function can safely inline another. */
4383
4384 static bool
4385 ix86_can_inline_p (tree caller, tree callee)
4386 {
4387 bool ret = false;
4388 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4389 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4390
4391 /* If callee has no option attributes, then it is ok to inline. */
4392 if (!callee_tree)
4393 ret = true;
4394
4395 /* If caller has no option attributes, but callee does then it is not ok to
4396 inline. */
4397 else if (!caller_tree)
4398 ret = false;
4399
4400 else
4401 {
4402 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4403 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4404
4405 	      /* Callee's ISA options should be a subset of the caller's, i.e. an
4406 		 SSE4 function can inline an SSE2 function but an SSE2 function
4407 		 can't inline an SSE4 function. */
4408 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4409 != callee_opts->x_ix86_isa_flags)
4410 ret = false;
4411
4412 /* See if we have the same non-isa options. */
4413 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4414 ret = false;
4415
4416 /* See if arch, tune, etc. are the same. */
4417 else if (caller_opts->arch != callee_opts->arch)
4418 ret = false;
4419
4420 else if (caller_opts->tune != callee_opts->tune)
4421 ret = false;
4422
4423 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4424 ret = false;
4425
4426 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4427 ret = false;
4428
4429 else
4430 ret = true;
4431 }
4432
4433 return ret;
4434 }
4435
4436 \f
4437 /* Remember the last target of ix86_set_current_function. */
4438 static GTY(()) tree ix86_previous_fndecl;
4439
4440 /* Establish appropriate back-end context for processing the function
4441 FNDECL. The argument might be NULL to indicate processing at top
4442 level, outside of any function scope. */
4443 static void
4444 ix86_set_current_function (tree fndecl)
4445 {
4446 /* Only change the context if the function changes. This hook is called
4447 several times in the course of compiling a function, and we don't want to
4448 slow things down too much or call target_reinit when it isn't safe. */
4449 if (fndecl && fndecl != ix86_previous_fndecl)
4450 {
4451 tree old_tree = (ix86_previous_fndecl
4452 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4453 : NULL_TREE);
4454
4455 tree new_tree = (fndecl
4456 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4457 : NULL_TREE);
4458
4459 ix86_previous_fndecl = fndecl;
4460 if (old_tree == new_tree)
4461 ;
4462
4463 else if (new_tree)
4464 {
4465 cl_target_option_restore (&global_options,
4466 TREE_TARGET_OPTION (new_tree));
4467 target_reinit ();
4468 }
4469
4470 else if (old_tree)
4471 {
4472 struct cl_target_option *def
4473 = TREE_TARGET_OPTION (target_option_current_node);
4474
4475 cl_target_option_restore (&global_options, def);
4476 target_reinit ();
4477 }
4478 }
4479 }
4480
4481 \f
4482 /* Return true if this goes in large data/bss. */
4483
4484 static bool
4485 ix86_in_large_data_p (tree exp)
4486 {
4487 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4488 return false;
4489
4490 /* Functions are never large data. */
4491 if (TREE_CODE (exp) == FUNCTION_DECL)
4492 return false;
4493
4494 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4495 {
4496 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4497 if (strcmp (section, ".ldata") == 0
4498 || strcmp (section, ".lbss") == 0)
4499 return true;
4500 return false;
4501 }
4502 else
4503 {
4504 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4505
4506 /* If this is an incomplete type with size 0, then we can't put it
4507 in data because it might be too big when completed. */
4508 if (!size || size > ix86_section_threshold)
4509 return true;
4510 }
4511
4512 return false;
4513 }
4514
4515 /* Switch to the appropriate section for output of DECL.
4516 DECL is either a `VAR_DECL' node or a constant of some sort.
4517 RELOC indicates whether forming the initial value of DECL requires
4518 link-time relocations. */
4519
4520 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4521 ATTRIBUTE_UNUSED;
4522
4523 static section *
4524 x86_64_elf_select_section (tree decl, int reloc,
4525 unsigned HOST_WIDE_INT align)
4526 {
4527 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4528 && ix86_in_large_data_p (decl))
4529 {
4530 const char *sname = NULL;
4531 unsigned int flags = SECTION_WRITE;
4532 switch (categorize_decl_for_section (decl, reloc))
4533 {
4534 case SECCAT_DATA:
4535 sname = ".ldata";
4536 break;
4537 case SECCAT_DATA_REL:
4538 sname = ".ldata.rel";
4539 break;
4540 case SECCAT_DATA_REL_LOCAL:
4541 sname = ".ldata.rel.local";
4542 break;
4543 case SECCAT_DATA_REL_RO:
4544 sname = ".ldata.rel.ro";
4545 break;
4546 case SECCAT_DATA_REL_RO_LOCAL:
4547 sname = ".ldata.rel.ro.local";
4548 break;
4549 case SECCAT_BSS:
4550 sname = ".lbss";
4551 flags |= SECTION_BSS;
4552 break;
4553 case SECCAT_RODATA:
4554 case SECCAT_RODATA_MERGE_STR:
4555 case SECCAT_RODATA_MERGE_STR_INIT:
4556 case SECCAT_RODATA_MERGE_CONST:
4557 sname = ".lrodata";
4558 flags = 0;
4559 break;
4560 case SECCAT_SRODATA:
4561 case SECCAT_SDATA:
4562 case SECCAT_SBSS:
4563 gcc_unreachable ();
4564 case SECCAT_TEXT:
4565 case SECCAT_TDATA:
4566 case SECCAT_TBSS:
4567 	    /* We don't split these for the medium model.  Place them into
4568 	       the default sections and hope for the best. */
4569 break;
4570 }
4571 if (sname)
4572 {
4573 /* We might get called with string constants, but get_named_section
4574 doesn't like them as they are not DECLs. Also, we need to set
4575 flags in that case. */
4576 if (!DECL_P (decl))
4577 return get_section (sname, flags, NULL);
4578 return get_named_section (decl, sname, reloc);
4579 }
4580 }
4581 return default_elf_select_section (decl, reloc, align);
4582 }
4583
4584 /* Build up a unique section name, expressed as a
4585 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4586 RELOC indicates whether the initial value of EXP requires
4587 link-time relocations. */
4588
4589 static void ATTRIBUTE_UNUSED
4590 x86_64_elf_unique_section (tree decl, int reloc)
4591 {
4592 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4593 && ix86_in_large_data_p (decl))
4594 {
4595 const char *prefix = NULL;
4596 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4597 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4598
4599 switch (categorize_decl_for_section (decl, reloc))
4600 {
4601 case SECCAT_DATA:
4602 case SECCAT_DATA_REL:
4603 case SECCAT_DATA_REL_LOCAL:
4604 case SECCAT_DATA_REL_RO:
4605 case SECCAT_DATA_REL_RO_LOCAL:
4606 prefix = one_only ? ".ld" : ".ldata";
4607 break;
4608 case SECCAT_BSS:
4609 prefix = one_only ? ".lb" : ".lbss";
4610 break;
4611 case SECCAT_RODATA:
4612 case SECCAT_RODATA_MERGE_STR:
4613 case SECCAT_RODATA_MERGE_STR_INIT:
4614 case SECCAT_RODATA_MERGE_CONST:
4615 prefix = one_only ? ".lr" : ".lrodata";
4616 break;
4617 case SECCAT_SRODATA:
4618 case SECCAT_SDATA:
4619 case SECCAT_SBSS:
4620 gcc_unreachable ();
4621 case SECCAT_TEXT:
4622 case SECCAT_TDATA:
4623 case SECCAT_TBSS:
4624 	  /* We don't split these for the medium model.  Place them into
4625 	     the default sections and hope for the best. */
4626 break;
4627 }
4628 if (prefix)
4629 {
4630 const char *name, *linkonce;
4631 char *string;
4632
4633 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4634 name = targetm.strip_name_encoding (name);
4635
4636 /* If we're using one_only, then there needs to be a .gnu.linkonce
4637 prefix to the section name. */
4638 linkonce = one_only ? ".gnu.linkonce" : "";
4639
4640 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4641
4642 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4643 return;
4644 }
4645 }
4646 default_unique_section (decl, reloc);
4647 }
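/* For example, under -mcmodel=medium with -fdata-sections a large initialized
   variable "foo" would get DECL_SECTION_NAME ".ldata.foo", or
   ".gnu.linkonce.ld.foo" for a one-only decl on targets without COMDAT
   groups.  ("foo" is just a placeholder name.)  */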
4648
4649 #ifdef COMMON_ASM_OP
4650 /* This says how to output assembler code to declare an
4651 uninitialized external linkage data object.
4652
4653 	   For the medium model on x86-64 we need to use the .largecomm directive
4654 	   for large objects. */
4655 void
4656 x86_elf_aligned_common (FILE *file,
4657 const char *name, unsigned HOST_WIDE_INT size,
4658 int align)
4659 {
4660 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4661 && size > (unsigned int)ix86_section_threshold)
4662 fputs (".largecomm\t", file);
4663 else
4664 fputs (COMMON_ASM_OP, file);
4665 assemble_name (file, name);
4666 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4667 size, align / BITS_PER_UNIT);
4668 }
4669 #endif
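/* A sketch of the output above: assuming the default ix86_section_threshold,
   a medium-model common object larger than that threshold would be emitted as
	.largecomm	big_buf,131072,32
   whereas smaller objects fall back to the usual COMMON_ASM_OP form.
   ("big_buf" and the numbers are illustrative only.)  */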
4670
4671 /* Utility function for targets to use in implementing
4672 ASM_OUTPUT_ALIGNED_BSS. */
4673
4674 void
4675 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4676 const char *name, unsigned HOST_WIDE_INT size,
4677 int align)
4678 {
4679 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4680 && size > (unsigned int)ix86_section_threshold)
4681 switch_to_section (get_named_section (decl, ".lbss", 0));
4682 else
4683 switch_to_section (bss_section);
4684 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4685 #ifdef ASM_DECLARE_OBJECT_NAME
4686 last_assemble_variable_decl = decl;
4687 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4688 #else
4689 /* The standard thing is just to output a label for the object. */
4690 ASM_OUTPUT_LABEL (file, name);
4691 #endif /* ASM_DECLARE_OBJECT_NAME */
4692 ASM_OUTPUT_SKIP (file, size ? size : 1);
4693 }
4694 \f
4695 /* Decide whether we must probe the stack before any space allocation
4696 on this target. It's essentially TARGET_STACK_PROBE except when
4697 -fstack-check causes the stack to be already probed differently. */
4698
4699 bool
4700 ix86_target_stack_probe (void)
4701 {
4702 /* Do not probe the stack twice if static stack checking is enabled. */
4703 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4704 return false;
4705
4706 return TARGET_STACK_PROBE;
4707 }
4708 \f
4709 /* Decide whether we can make a sibling call to a function. DECL is the
4710 declaration of the function being targeted by the call and EXP is the
4711 CALL_EXPR representing the call. */
4712
4713 static bool
4714 ix86_function_ok_for_sibcall (tree decl, tree exp)
4715 {
4716 tree type, decl_or_type;
4717 rtx a, b;
4718
4719 /* If we are generating position-independent code, we cannot sibcall
4720 optimize any indirect call, or a direct call to a global function,
4721 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4722 if (!TARGET_MACHO
4723 && !TARGET_64BIT
4724 && flag_pic
4725 && (!decl || !targetm.binds_local_p (decl)))
4726 return false;
4727
4728 /* If we need to align the outgoing stack, then sibcalling would
4729 unalign the stack, which may break the called function. */
4730 if (ix86_minimum_incoming_stack_boundary (true)
4731 < PREFERRED_STACK_BOUNDARY)
4732 return false;
4733
4734 if (decl)
4735 {
4736 decl_or_type = decl;
4737 type = TREE_TYPE (decl);
4738 }
4739 else
4740 {
4741 /* We're looking at the CALL_EXPR, we need the type of the function. */
4742 type = CALL_EXPR_FN (exp); /* pointer expression */
4743 type = TREE_TYPE (type); /* pointer type */
4744 type = TREE_TYPE (type); /* function type */
4745 decl_or_type = type;
4746 }
4747
4748 /* Check that the return value locations are the same. For example,
4749 if we are returning floats on the 80387 register stack, we cannot
4750 make a sibcall from a function that doesn't return a float to a
4751 function that does or, conversely, from a function that does return
4752 a float to a function that doesn't; the necessary stack adjustment
4753 would not be executed. This is also the place we notice
4754 differences in the return value ABI. Note that it is ok for one
4755 of the functions to have void return type as long as the return
4756 value of the other is passed in a register. */
4757 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4758 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4759 cfun->decl, false);
4760 if (STACK_REG_P (a) || STACK_REG_P (b))
4761 {
4762 if (!rtx_equal_p (a, b))
4763 return false;
4764 }
4765 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4766 ;
4767 else if (!rtx_equal_p (a, b))
4768 return false;
4769
4770 if (TARGET_64BIT)
4771 {
4772 /* The SYSV ABI has more call-clobbered registers;
4773 disallow sibcalls from MS to SYSV. */
4774 if (cfun->machine->call_abi == MS_ABI
4775 && ix86_function_type_abi (type) == SYSV_ABI)
4776 return false;
4777 }
4778 else
4779 {
4780 /* If this call is indirect, we'll need to be able to use a
4781 call-clobbered register for the address of the target function.
4782 Make sure that all such registers are not used for passing
4783 parameters. Note that DLLIMPORT functions are indirect. */
4784 if (!decl
4785 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4786 {
4787 if (ix86_function_regparm (type, NULL) >= 3)
4788 {
4789 /* ??? Need to count the actual number of registers to be used,
4790 not the possible number of registers. Fix later. */
4791 return false;
4792 }
4793 }
4794 }
4795
4796 /* Otherwise okay. That also includes certain types of indirect calls. */
4797 return true;
4798 }
4799
4800 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4801 and "sseregparm" calling convention attributes;
4802 arguments as in struct attribute_spec.handler. */
4803
4804 static tree
4805 ix86_handle_cconv_attribute (tree *node, tree name,
4806 tree args,
4807 int flags ATTRIBUTE_UNUSED,
4808 bool *no_add_attrs)
4809 {
4810 if (TREE_CODE (*node) != FUNCTION_TYPE
4811 && TREE_CODE (*node) != METHOD_TYPE
4812 && TREE_CODE (*node) != FIELD_DECL
4813 && TREE_CODE (*node) != TYPE_DECL)
4814 {
4815 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4816 name);
4817 *no_add_attrs = true;
4818 return NULL_TREE;
4819 }
4820
4821 /* Can combine regparm with all attributes but fastcall and thiscall. */
4822 if (is_attribute_p ("regparm", name))
4823 {
4824 tree cst;
4825
4826 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4827 {
4828 error ("fastcall and regparm attributes are not compatible");
4829 }
4830
4831 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4832 {
4833 error ("regparm and thiscall attributes are not compatible");
4834 }
4835
4836 cst = TREE_VALUE (args);
4837 if (TREE_CODE (cst) != INTEGER_CST)
4838 {
4839 warning (OPT_Wattributes,
4840 "%qE attribute requires an integer constant argument",
4841 name);
4842 *no_add_attrs = true;
4843 }
4844 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4845 {
4846 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4847 name, REGPARM_MAX);
4848 *no_add_attrs = true;
4849 }
4850
4851 return NULL_TREE;
4852 }
4853
4854 if (TARGET_64BIT)
4855 {
4856 /* Do not warn when emulating the MS ABI. */
4857 if ((TREE_CODE (*node) != FUNCTION_TYPE
4858 && TREE_CODE (*node) != METHOD_TYPE)
4859 || ix86_function_type_abi (*node) != MS_ABI)
4860 warning (OPT_Wattributes, "%qE attribute ignored",
4861 name);
4862 *no_add_attrs = true;
4863 return NULL_TREE;
4864 }
4865
4866 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4867 if (is_attribute_p ("fastcall", name))
4868 {
4869 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4870 {
4871 error ("fastcall and cdecl attributes are not compatible");
4872 }
4873 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4874 {
4875 error ("fastcall and stdcall attributes are not compatible");
4876 }
4877 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4878 {
4879 error ("fastcall and regparm attributes are not compatible");
4880 }
4881 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4882 {
4883 error ("fastcall and thiscall attributes are not compatible");
4884 }
4885 }
4886
4887 /* Can combine stdcall with fastcall (redundant), regparm and
4888 sseregparm. */
4889 else if (is_attribute_p ("stdcall", name))
4890 {
4891 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4892 {
4893 error ("stdcall and cdecl attributes are not compatible");
4894 }
4895 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4896 {
4897 error ("stdcall and fastcall attributes are not compatible");
4898 }
4899 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4900 {
4901 error ("stdcall and thiscall attributes are not compatible");
4902 }
4903 }
4904
4905 /* Can combine cdecl with regparm and sseregparm. */
4906 else if (is_attribute_p ("cdecl", name))
4907 {
4908 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4909 {
4910 error ("stdcall and cdecl attributes are not compatible");
4911 }
4912 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4913 {
4914 error ("fastcall and cdecl attributes are not compatible");
4915 }
4916 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4917 {
4918 error ("cdecl and thiscall attributes are not compatible");
4919 }
4920 }
4921 else if (is_attribute_p ("thiscall", name))
4922 {
4923 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
4924 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
4925 name);
4926 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4927 {
4928 error ("stdcall and thiscall attributes are not compatible");
4929 }
4930 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4931 {
4932 error ("fastcall and thiscall attributes are not compatible");
4933 }
4934 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4935 {
4936 error ("cdecl and thiscall attributes are not compatible");
4937 }
4938 }
4939
4940 /* Can combine sseregparm with all attributes. */
4941
4942 return NULL_TREE;
4943 }
4944
4945 /* The transactional memory builtins are implicitly regparm or fastcall
4946 depending on the ABI. Override the generic do-nothing attribute that
4947 these builtins were declared with, and replace it with one of the two
4948 attributes that we expect elsewhere. */
4949
4950 static tree
4951 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
4952 tree args ATTRIBUTE_UNUSED,
4953 int flags ATTRIBUTE_UNUSED,
4954 bool *no_add_attrs)
4955 {
4956 tree alt;
4957
4958 /* In no case do we want to add the placeholder attribute. */
4959 *no_add_attrs = true;
4960
4961 /* The 64-bit ABI is unchanged for transactional memory. */
4962 if (TARGET_64BIT)
4963 return NULL_TREE;
4964
4965 /* ??? Is there a better way to validate 32-bit windows? We have
4966 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
4967 if (CHECK_STACK_LIMIT > 0)
4968 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
4969 else
4970 {
4971 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
4972 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
4973 }
4974 decl_attributes (node, alt, flags);
4975
4976 return NULL_TREE;
4977 }
4978
4979 /* This function determines from TYPE the calling-convention. */
4980
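/* The result is a bitmask of IX86_CALLCVT_* flags.  For instance (a rough
   sketch, not an exhaustive list), a 32-bit function type carrying
   __attribute__((stdcall, regparm(2))) yields
   IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM, while any 64-bit function
   type is simply IX86_CALLCVT_CDECL.  */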
4981 unsigned int
4982 ix86_get_callcvt (const_tree type)
4983 {
4984 unsigned int ret = 0;
4985 bool is_stdarg;
4986 tree attrs;
4987
4988 if (TARGET_64BIT)
4989 return IX86_CALLCVT_CDECL;
4990
4991 attrs = TYPE_ATTRIBUTES (type);
4992 if (attrs != NULL_TREE)
4993 {
4994 if (lookup_attribute ("cdecl", attrs))
4995 ret |= IX86_CALLCVT_CDECL;
4996 else if (lookup_attribute ("stdcall", attrs))
4997 ret |= IX86_CALLCVT_STDCALL;
4998 else if (lookup_attribute ("fastcall", attrs))
4999 ret |= IX86_CALLCVT_FASTCALL;
5000 else if (lookup_attribute ("thiscall", attrs))
5001 ret |= IX86_CALLCVT_THISCALL;
5002
5003 /* Regparm isn't allowed for thiscall and fastcall. */
5004 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5005 {
5006 if (lookup_attribute ("regparm", attrs))
5007 ret |= IX86_CALLCVT_REGPARM;
5008 if (lookup_attribute ("sseregparm", attrs))
5009 ret |= IX86_CALLCVT_SSEREGPARM;
5010 }
5011
5012 if (IX86_BASE_CALLCVT(ret) != 0)
5013 return ret;
5014 }
5015
5016 is_stdarg = stdarg_p (type);
5017 if (TARGET_RTD && !is_stdarg)
5018 return IX86_CALLCVT_STDCALL | ret;
5019
5020 if (ret != 0
5021 || is_stdarg
5022 || TREE_CODE (type) != METHOD_TYPE
5023 || ix86_function_type_abi (type) != MS_ABI)
5024 return IX86_CALLCVT_CDECL | ret;
5025
5026 return IX86_CALLCVT_THISCALL;
5027 }
5028
5029 /* Return 0 if the attributes for two types are incompatible, 1 if they
5030 are compatible, and 2 if they are nearly compatible (which causes a
5031 warning to be generated). */
5032
5033 static int
5034 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5035 {
5036 unsigned int ccvt1, ccvt2;
5037
5038 if (TREE_CODE (type1) != FUNCTION_TYPE
5039 && TREE_CODE (type1) != METHOD_TYPE)
5040 return 1;
5041
5042 ccvt1 = ix86_get_callcvt (type1);
5043 ccvt2 = ix86_get_callcvt (type2);
5044 if (ccvt1 != ccvt2)
5045 return 0;
5046 if (ix86_function_regparm (type1, NULL)
5047 != ix86_function_regparm (type2, NULL))
5048 return 0;
5049
5050 return 1;
5051 }
5052 \f
5053 /* Return the regparm value for a function with the indicated TYPE and DECL.
5054 DECL may be NULL when calling function indirectly
5055 or considering a libcall. */
5056
5057 static int
5058 ix86_function_regparm (const_tree type, const_tree decl)
5059 {
5060 tree attr;
5061 int regparm;
5062 unsigned int ccvt;
5063
5064 if (TARGET_64BIT)
5065 return (ix86_function_type_abi (type) == SYSV_ABI
5066 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5067 ccvt = ix86_get_callcvt (type);
5068 regparm = ix86_regparm;
5069
5070 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5071 {
5072 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5073 if (attr)
5074 {
5075 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5076 return regparm;
5077 }
5078 }
5079 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5080 return 2;
5081 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5082 return 1;
5083
5084 /* Use register calling convention for local functions when possible. */
5085 if (decl
5086 && TREE_CODE (decl) == FUNCTION_DECL
5087 && optimize
5088 && !(profile_flag && !flag_fentry))
5089 {
5090 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5091 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5092 if (i && i->local && i->can_change_signature)
5093 {
5094 int local_regparm, globals = 0, regno;
5095
5096 /* Make sure no regparm register is taken by a
5097 fixed register variable. */
5098 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5099 if (fixed_regs[local_regparm])
5100 break;
5101
5102 /* We don't want to use regparm(3) for nested functions as
5103 these use a static chain pointer in the third argument. */
5104 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5105 local_regparm = 2;
5106
5107 /* In 32-bit mode save a register for the split stack. */
5108 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5109 local_regparm = 2;
5110
5111 /* Each fixed register usage increases register pressure,
5112 so fewer registers should be used for argument passing.
5113 This functionality can be overridden by an explicit
5114 regparm value. */
5115 for (regno = AX_REG; regno <= DI_REG; regno++)
5116 if (fixed_regs[regno])
5117 globals++;
5118
5119 local_regparm
5120 = globals < local_regparm ? local_regparm - globals : 0;
5121
5122 if (local_regparm > regparm)
5123 regparm = local_regparm;
5124 }
5125 }
5126
5127 return regparm;
5128 }
5129
5130 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5131 DFmode (2) arguments in SSE registers for a function with the
5132 indicated TYPE and DECL. DECL may be NULL when calling function
5133 indirectly or considering a libcall. Otherwise return 0. */
5134
5135 static int
5136 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5137 {
5138 gcc_assert (!TARGET_64BIT);
5139
5140 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5141 by the sseregparm attribute. */
5142 if (TARGET_SSEREGPARM
5143 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5144 {
5145 if (!TARGET_SSE)
5146 {
5147 if (warn)
5148 {
5149 if (decl)
5150 error ("calling %qD with attribute sseregparm without "
5151 "SSE/SSE2 enabled", decl);
5152 else
5153 error ("calling %qT with attribute sseregparm without "
5154 "SSE/SSE2 enabled", type);
5155 }
5156 return 0;
5157 }
5158
5159 return 2;
5160 }
5161
5162 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5163 (and DFmode for SSE2) arguments in SSE registers. */
5164 if (decl && TARGET_SSE_MATH && optimize
5165 && !(profile_flag && !flag_fentry))
5166 {
5167 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5168 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5169 if (i && i->local && i->can_change_signature)
5170 return TARGET_SSE2 ? 2 : 1;
5171 }
5172
5173 return 0;
5174 }
5175
5176 /* Return true if EAX is live at the start of the function. Used by
5177 ix86_expand_prologue to determine if we need special help before
5178 calling allocate_stack_worker. */
5179
5180 static bool
5181 ix86_eax_live_at_start_p (void)
5182 {
5183 /* Cheat. Don't bother working forward from ix86_function_regparm
5184 to the function type to whether an actual argument is located in
5185 eax. Instead just look at cfg info, which is still close enough
5186 to correct at this point. This gives false positives for broken
5187 functions that might use uninitialized data that happens to be
5188 allocated in eax, but who cares? */
5189 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5190 }
5191
5192 static bool
5193 ix86_keep_aggregate_return_pointer (tree fntype)
5194 {
5195 tree attr;
5196
5197 if (!TARGET_64BIT)
5198 {
5199 attr = lookup_attribute ("callee_pop_aggregate_return",
5200 TYPE_ATTRIBUTES (fntype));
5201 if (attr)
5202 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5203
5204 /* For 32-bit MS-ABI the default is to keep aggregate
5205 return pointer. */
5206 if (ix86_function_type_abi (fntype) == MS_ABI)
5207 return true;
5208 }
5209 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5210 }
5211
5212 /* Value is the number of bytes of arguments automatically
5213 popped when returning from a subroutine call.
5214 FUNDECL is the declaration node of the function (as a tree),
5215 FUNTYPE is the data type of the function (as a tree),
5216 or for a library call it is an identifier node for the subroutine name.
5217 SIZE is the number of bytes of arguments passed on the stack.
5218
5219 On the 80386, the RTD insn may be used to pop them if the number
5220 of args is fixed, but if the number is variable then the caller
5221 must pop them all. RTD can't be used for library calls now
5222 because the library is compiled with the Unix compiler.
5223 Use of RTD is a selectable option, since it is incompatible with
5224 standard Unix calling sequences. If the option is not selected,
5225 the caller must always pop the args.
5226
5227 The attribute stdcall is equivalent to RTD on a per module basis. */
5228
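/* For example (an illustrative sketch): a 32-bit stdcall function taking
   two int arguments pops 8 bytes on return, whereas a cdecl function, or
   any 64-bit function, pops nothing.  */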
5229 static int
5230 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5231 {
5232 unsigned int ccvt;
5233
5234 /* None of the 64-bit ABIs pop arguments. */
5235 if (TARGET_64BIT)
5236 return 0;
5237
5238 ccvt = ix86_get_callcvt (funtype);
5239
5240 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5241 | IX86_CALLCVT_THISCALL)) != 0
5242 && ! stdarg_p (funtype))
5243 return size;
5244
5245 /* Lose any fake structure return argument if it is passed on the stack. */
5246 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5247 && !ix86_keep_aggregate_return_pointer (funtype))
5248 {
5249 int nregs = ix86_function_regparm (funtype, fundecl);
5250 if (nregs == 0)
5251 return GET_MODE_SIZE (Pmode);
5252 }
5253
5254 return 0;
5255 }
5256
5257 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5258
5259 static bool
5260 ix86_legitimate_combined_insn (rtx insn)
5261 {
5262 /* Check operand constraints in case hard registers were propagated
5263 into insn pattern. This check prevents combine pass from
5264 generating insn patterns with invalid hard register operands.
5265 These invalid insns can eventually confuse reload to error out
5266 with a spill failure. See also PRs 46829 and 46843. */
5267 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5268 {
5269 int i;
5270
5271 extract_insn (insn);
5272 preprocess_constraints ();
5273
5274 for (i = 0; i < recog_data.n_operands; i++)
5275 {
5276 rtx op = recog_data.operand[i];
5277 enum machine_mode mode = GET_MODE (op);
5278 struct operand_alternative *op_alt;
5279 int offset = 0;
5280 bool win;
5281 int j;
5282
5283 /* A unary operator may be accepted by the predicate, but it
5284 is irrelevant for matching constraints. */
5285 if (UNARY_P (op))
5286 op = XEXP (op, 0);
5287
5288 if (GET_CODE (op) == SUBREG)
5289 {
5290 if (REG_P (SUBREG_REG (op))
5291 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5292 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5293 GET_MODE (SUBREG_REG (op)),
5294 SUBREG_BYTE (op),
5295 GET_MODE (op));
5296 op = SUBREG_REG (op);
5297 }
5298
5299 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5300 continue;
5301
5302 op_alt = recog_op_alt[i];
5303
5304 /* Operand has no constraints, anything is OK. */
5305 win = !recog_data.n_alternatives;
5306
5307 for (j = 0; j < recog_data.n_alternatives; j++)
5308 {
5309 if (op_alt[j].anything_ok
5310 || (op_alt[j].matches != -1
5311 && operands_match_p
5312 (recog_data.operand[i],
5313 recog_data.operand[op_alt[j].matches]))
5314 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5315 {
5316 win = true;
5317 break;
5318 }
5319 }
5320
5321 if (!win)
5322 return false;
5323 }
5324 }
5325
5326 return true;
5327 }
5328 \f
5329 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5330
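/* With the usual ASan mapping Shadow = (Addr >> 3) + offset, the offset
   returned here is 1 << 44 under LP64 and 1 << 29 otherwise.  */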
5331 static unsigned HOST_WIDE_INT
5332 ix86_asan_shadow_offset (void)
5333 {
5334 return (unsigned HOST_WIDE_INT) 1 << (TARGET_LP64 ? 44 : 29);
5335 }
5336 \f
5337 /* Argument support functions. */
5338
5339 /* Return true when register may be used to pass function parameters. */
5340 bool
5341 ix86_function_arg_regno_p (int regno)
5342 {
5343 int i;
5344 const int *parm_regs;
5345
5346 if (!TARGET_64BIT)
5347 {
5348 if (TARGET_MACHO)
5349 return (regno < REGPARM_MAX
5350 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5351 else
5352 return (regno < REGPARM_MAX
5353 || (TARGET_MMX && MMX_REGNO_P (regno)
5354 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5355 || (TARGET_SSE && SSE_REGNO_P (regno)
5356 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5357 }
5358
5359 if (TARGET_MACHO)
5360 {
5361 if (SSE_REGNO_P (regno) && TARGET_SSE)
5362 return true;
5363 }
5364 else
5365 {
5366 if (TARGET_SSE && SSE_REGNO_P (regno)
5367 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5368 return true;
5369 }
5370
5371 /* TODO: The function should depend on current function ABI but
5372 builtins.c would need updating then. Therefore we use the
5373 default ABI. */
5374
5375 /* RAX is used as hidden argument to va_arg functions. */
5376 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5377 return true;
5378
5379 if (ix86_abi == MS_ABI)
5380 parm_regs = x86_64_ms_abi_int_parameter_registers;
5381 else
5382 parm_regs = x86_64_int_parameter_registers;
5383 for (i = 0; i < (ix86_abi == MS_ABI
5384 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5385 if (regno == parm_regs[i])
5386 return true;
5387 return false;
5388 }
5389
5390 /* Return true if we do not know how to pass TYPE solely in registers. */
5391
5392 static bool
5393 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5394 {
5395 if (must_pass_in_stack_var_size_or_pad (mode, type))
5396 return true;
5397
5398 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5399 The layout_type routine is crafty and tries to trick us into passing
5400 currently unsupported vector types on the stack by using TImode. */
5401 return (!TARGET_64BIT && mode == TImode
5402 && type && TREE_CODE (type) != VECTOR_TYPE);
5403 }
5404
5405 /* Return the size, in bytes, of the area reserved for arguments passed
5406 in registers for the function represented by FNDECL, depending on the
5407 ABI used. */
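/* For the 64-bit MS ABI this is the 32-byte register parameter ("shadow")
   area that the caller reserves on the stack; for the SysV ABI it is 0.  */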
5408 int
5409 ix86_reg_parm_stack_space (const_tree fndecl)
5410 {
5411 enum calling_abi call_abi = SYSV_ABI;
5412 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5413 call_abi = ix86_function_abi (fndecl);
5414 else
5415 call_abi = ix86_function_type_abi (fndecl);
5416 if (TARGET_64BIT && call_abi == MS_ABI)
5417 return 32;
5418 return 0;
5419 }
5420
5421 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5422 call abi used. */
5423 enum calling_abi
5424 ix86_function_type_abi (const_tree fntype)
5425 {
5426 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5427 {
5428 enum calling_abi abi = ix86_abi;
5429 if (abi == SYSV_ABI)
5430 {
5431 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5432 abi = MS_ABI;
5433 }
5434 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5435 abi = SYSV_ABI;
5436 return abi;
5437 }
5438 return ix86_abi;
5439 }
5440
5441 static bool
5442 ix86_function_ms_hook_prologue (const_tree fn)
5443 {
5444 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5445 {
5446 if (decl_function_context (fn) != NULL_TREE)
5447 error_at (DECL_SOURCE_LOCATION (fn),
5448 "ms_hook_prologue is not compatible with nested function");
5449 else
5450 return true;
5451 }
5452 return false;
5453 }
5454
5455 static enum calling_abi
5456 ix86_function_abi (const_tree fndecl)
5457 {
5458 if (! fndecl)
5459 return ix86_abi;
5460 return ix86_function_type_abi (TREE_TYPE (fndecl));
5461 }
5462
5463 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5464 call abi used. */
5465 enum calling_abi
5466 ix86_cfun_abi (void)
5467 {
5468 if (! cfun)
5469 return ix86_abi;
5470 return cfun->machine->call_abi;
5471 }
5472
5473 /* Write the extra assembler code needed to declare a function properly. */
5474
5475 void
5476 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5477 tree decl)
5478 {
5479 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5480
5481 if (is_ms_hook)
5482 {
5483 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5484 unsigned int filler_cc = 0xcccccccc;
5485
5486 for (i = 0; i < filler_count; i += 4)
5487 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5488 }
5489
5490 #ifdef SUBTARGET_ASM_UNWIND_INIT
5491 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5492 #endif
5493
5494 ASM_OUTPUT_LABEL (asm_out_file, fname);
5495
5496 /* Output magic byte marker, if hot-patch attribute is set. */
5497 if (is_ms_hook)
5498 {
5499 if (TARGET_64BIT)
5500 {
5501 /* leaq [%rsp + 0], %rsp */
5502 asm_fprintf (asm_out_file, ASM_BYTE
5503 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5504 }
5505 else
5506 {
5507 /* movl.s %edi, %edi
5508 push %ebp
5509 movl.s %esp, %ebp */
5510 asm_fprintf (asm_out_file, ASM_BYTE
5511 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5512 }
5513 }
5514 }
5515
5516 /* regclass.c */
5517 extern void init_regs (void);
5518
5519 /* Implementation of the call ABI switching target hook. Specific to FNDECL,
5520 the corresponding call register sets are selected. See also
5521 ix86_conditional_register_usage for more details. */
5522 void
5523 ix86_call_abi_override (const_tree fndecl)
5524 {
5525 if (fndecl == NULL_TREE)
5526 cfun->machine->call_abi = ix86_abi;
5527 else
5528 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5529 }
5530
5531 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5532 expensive re-initialization of init_regs each time we switch function context,
5533 since this is needed only during RTL expansion. */
5534 static void
5535 ix86_maybe_switch_abi (void)
5536 {
5537 if (TARGET_64BIT &&
5538 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5539 reinit_regs ();
5540 }
5541
5542 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5543 for a call to a function whose data type is FNTYPE.
5544 For a library call, FNTYPE is 0. */
5545
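/* For instance, for a 64-bit SysV function this leaves cum->nregs = 6
   (rdi, rsi, rdx, rcx, r8, r9) and, with SSE enabled, cum->sse_nregs = 8
   (xmm0-xmm7), while a 32-bit fastcall function gets cum->nregs = 2 and
   cum->fastcall = 1.  */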
5546 void
5547 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5548 tree fntype, /* tree ptr for function decl */
5549 rtx libname, /* SYMBOL_REF of library name or 0 */
5550 tree fndecl,
5551 int caller)
5552 {
5553 struct cgraph_local_info *i;
5554
5555 memset (cum, 0, sizeof (*cum));
5556
5557 if (fndecl)
5558 {
5559 i = cgraph_local_info (fndecl);
5560 cum->call_abi = ix86_function_abi (fndecl);
5561 }
5562 else
5563 {
5564 i = NULL;
5565 cum->call_abi = ix86_function_type_abi (fntype);
5566 }
5567
5568 cum->caller = caller;
5569
5570 /* Set up the number of registers to use for passing arguments. */
5571
5572 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5573 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5574 "or subtarget optimization implying it");
5575 cum->nregs = ix86_regparm;
5576 if (TARGET_64BIT)
5577 {
5578 cum->nregs = (cum->call_abi == SYSV_ABI
5579 ? X86_64_REGPARM_MAX
5580 : X86_64_MS_REGPARM_MAX);
5581 }
5582 if (TARGET_SSE)
5583 {
5584 cum->sse_nregs = SSE_REGPARM_MAX;
5585 if (TARGET_64BIT)
5586 {
5587 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5588 ? X86_64_SSE_REGPARM_MAX
5589 : X86_64_MS_SSE_REGPARM_MAX);
5590 }
5591 }
5592 if (TARGET_MMX)
5593 cum->mmx_nregs = MMX_REGPARM_MAX;
5594 cum->warn_avx = true;
5595 cum->warn_sse = true;
5596 cum->warn_mmx = true;
5597
5598 /* Because the type might mismatch between caller and callee, we need to
5599 use the actual type of the function for local calls.
5600 FIXME: cgraph_analyze can be told to actually record if the function uses
5601 va_start, so for local functions maybe_vaarg can be made more aggressive,
5602 helping K&R code.
5603 FIXME: once the type system is fixed, we won't need this code anymore. */
5604 if (i && i->local && i->can_change_signature)
5605 fntype = TREE_TYPE (fndecl);
5606 cum->maybe_vaarg = (fntype
5607 ? (!prototype_p (fntype) || stdarg_p (fntype))
5608 : !libname);
5609
5610 if (!TARGET_64BIT)
5611 {
5612 /* If there are variable arguments, then we won't pass anything
5613 in registers in 32-bit mode. */
5614 if (stdarg_p (fntype))
5615 {
5616 cum->nregs = 0;
5617 cum->sse_nregs = 0;
5618 cum->mmx_nregs = 0;
5619 cum->warn_avx = 0;
5620 cum->warn_sse = 0;
5621 cum->warn_mmx = 0;
5622 return;
5623 }
5624
5625 /* Use ecx and edx registers if function has fastcall attribute,
5626 else look for regparm information. */
5627 if (fntype)
5628 {
5629 unsigned int ccvt = ix86_get_callcvt (fntype);
5630 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5631 {
5632 cum->nregs = 1;
5633 cum->fastcall = 1; /* Same first register as in fastcall. */
5634 }
5635 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5636 {
5637 cum->nregs = 2;
5638 cum->fastcall = 1;
5639 }
5640 else
5641 cum->nregs = ix86_function_regparm (fntype, fndecl);
5642 }
5643
5644 /* Set up the number of SSE registers used for passing SFmode
5645 and DFmode arguments. Warn for mismatching ABI. */
5646 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5647 }
5648 }
5649
5650 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5651 But in the case of vector types, it is some vector mode.
5652
5653 When we have only some of our vector isa extensions enabled, then there
5654 are some modes for which vector_mode_supported_p is false. For these
5655 modes, the generic vector support in gcc will choose some non-vector mode
5656 in order to implement the type. By computing the natural mode, we'll
5657 select the proper ABI location for the operand and not depend on whatever
5658 the middle-end decides to do with these vector types.
5659
5660 The middle-end can't deal with vector types larger than 16 bytes. In this
5661 case, we return the original mode and warn about the ABI change if CUM
5662 isn't NULL. */
5663
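/* As an illustrative example: a GNU vector type of four floats naturally
   maps to V4SFmode here even if the middle-end laid the type out in a
   non-vector (BLK) mode because SSE is disabled; in that case the caller
   is warned below that the ABI changes.  */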
5664 static enum machine_mode
5665 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5666 {
5667 enum machine_mode mode = TYPE_MODE (type);
5668
5669 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5670 {
5671 HOST_WIDE_INT size = int_size_in_bytes (type);
5672 if ((size == 8 || size == 16 || size == 32)
5673 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5674 && TYPE_VECTOR_SUBPARTS (type) > 1)
5675 {
5676 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5677
5678 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5679 mode = MIN_MODE_VECTOR_FLOAT;
5680 else
5681 mode = MIN_MODE_VECTOR_INT;
5682
5683 /* Get the mode which has this inner mode and number of units. */
5684 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5685 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5686 && GET_MODE_INNER (mode) == innermode)
5687 {
5688 if (size == 32 && !TARGET_AVX)
5689 {
5690 static bool warnedavx;
5691
5692 if (cum
5693 && !warnedavx
5694 && cum->warn_avx)
5695 {
5696 warnedavx = true;
5697 warning (0, "AVX vector argument without AVX "
5698 "enabled changes the ABI");
5699 }
5700 return TYPE_MODE (type);
5701 }
5702 else if ((size == 8 || size == 16) && !TARGET_SSE)
5703 {
5704 static bool warnedsse;
5705
5706 if (cum
5707 && !warnedsse
5708 && cum->warn_sse)
5709 {
5710 warnedsse = true;
5711 warning (0, "SSE vector argument without SSE "
5712 "enabled changes the ABI");
5713 }
5714 return mode;
5715 }
5716 else
5717 return mode;
5718 }
5719
5720 gcc_unreachable ();
5721 }
5722 }
5723
5724 return mode;
5725 }
5726
5727 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5728 this may not agree with the mode that the type system has chosen for the
5729 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5730 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5731
5732 static rtx
5733 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5734 unsigned int regno)
5735 {
5736 rtx tmp;
5737
5738 if (orig_mode != BLKmode)
5739 tmp = gen_rtx_REG (orig_mode, regno);
5740 else
5741 {
5742 tmp = gen_rtx_REG (mode, regno);
5743 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5744 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5745 }
5746
5747 return tmp;
5748 }
5749
5750 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5751 of this code is to classify each 8 bytes of the incoming argument by the register
5752 class and assign registers accordingly. */
5753
5754 /* Return the union class of CLASS1 and CLASS2.
5755 See the x86-64 PS ABI for details. */
5756
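/* For instance, merging X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS
   gives X86_64_INTEGERSI_CLASS (rule #4), and merging X86_64_X87_CLASS
   with X86_64_SSE_CLASS gives X86_64_MEMORY_CLASS (rule #5).  */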
5757 static enum x86_64_reg_class
5758 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5759 {
5760 /* Rule #1: If both classes are equal, this is the resulting class. */
5761 if (class1 == class2)
5762 return class1;
5763
5764 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5765 the other class. */
5766 if (class1 == X86_64_NO_CLASS)
5767 return class2;
5768 if (class2 == X86_64_NO_CLASS)
5769 return class1;
5770
5771 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5772 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5773 return X86_64_MEMORY_CLASS;
5774
5775 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5776 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5777 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5778 return X86_64_INTEGERSI_CLASS;
5779 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5780 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5781 return X86_64_INTEGER_CLASS;
5782
5783 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5784 MEMORY is used. */
5785 if (class1 == X86_64_X87_CLASS
5786 || class1 == X86_64_X87UP_CLASS
5787 || class1 == X86_64_COMPLEX_X87_CLASS
5788 || class2 == X86_64_X87_CLASS
5789 || class2 == X86_64_X87UP_CLASS
5790 || class2 == X86_64_COMPLEX_X87_CLASS)
5791 return X86_64_MEMORY_CLASS;
5792
5793 /* Rule #6: Otherwise class SSE is used. */
5794 return X86_64_SSE_CLASS;
5795 }
5796
5797 /* Classify the argument of type TYPE and mode MODE.
5798 CLASSES will be filled by the register class used to pass each word
5799 of the operand. The number of words is returned. In case the parameter
5800 should be passed in memory, 0 is returned. As a special case for zero
5801 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5802
5803 BIT_OFFSET is used internally for handling records and specifies the
5804 offset in bits modulo 256 to avoid overflow cases.
5805
5806 See the x86-64 PS ABI for details.
5807 */
5808
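/* As an illustrative example: struct { double d; long l; } classifies as
   { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS } and 2 is returned, while a
   40-byte structure is classified as memory and 0 is returned.  */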
5809 static int
5810 classify_argument (enum machine_mode mode, const_tree type,
5811 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5812 {
5813 HOST_WIDE_INT bytes =
5814 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5815 int words
5816 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5817
5818 /* Variable sized entities are always passed/returned in memory. */
5819 if (bytes < 0)
5820 return 0;
5821
5822 if (mode != VOIDmode
5823 && targetm.calls.must_pass_in_stack (mode, type))
5824 return 0;
5825
5826 if (type && AGGREGATE_TYPE_P (type))
5827 {
5828 int i;
5829 tree field;
5830 enum x86_64_reg_class subclasses[MAX_CLASSES];
5831
5832 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5833 if (bytes > 32)
5834 return 0;
5835
5836 for (i = 0; i < words; i++)
5837 classes[i] = X86_64_NO_CLASS;
5838
5839 /* Zero-sized arrays or structures are NO_CLASS. We return 0 to
5840 signal the memory class, so handle it as a special case. */
5841 if (!words)
5842 {
5843 classes[0] = X86_64_NO_CLASS;
5844 return 1;
5845 }
5846
5847 /* Classify each field of record and merge classes. */
5848 switch (TREE_CODE (type))
5849 {
5850 case RECORD_TYPE:
5851 /* And now merge the fields of structure. */
5852 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5853 {
5854 if (TREE_CODE (field) == FIELD_DECL)
5855 {
5856 int num;
5857
5858 if (TREE_TYPE (field) == error_mark_node)
5859 continue;
5860
5861 /* Bitfields are always classified as integer. Handle them
5862 early, since later code would consider them to be
5863 misaligned integers. */
5864 if (DECL_BIT_FIELD (field))
5865 {
5866 for (i = (int_bit_position (field)
5867 + (bit_offset % 64)) / 8 / 8;
5868 i < ((int_bit_position (field) + (bit_offset % 64))
5869 + tree_low_cst (DECL_SIZE (field), 0)
5870 + 63) / 8 / 8; i++)
5871 classes[i] =
5872 merge_classes (X86_64_INTEGER_CLASS,
5873 classes[i]);
5874 }
5875 else
5876 {
5877 int pos;
5878
5879 type = TREE_TYPE (field);
5880
5881 /* Flexible array member is ignored. */
5882 if (TYPE_MODE (type) == BLKmode
5883 && TREE_CODE (type) == ARRAY_TYPE
5884 && TYPE_SIZE (type) == NULL_TREE
5885 && TYPE_DOMAIN (type) != NULL_TREE
5886 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5887 == NULL_TREE))
5888 {
5889 static bool warned;
5890
5891 if (!warned && warn_psabi)
5892 {
5893 warned = true;
5894 inform (input_location,
5895 "the ABI of passing struct with"
5896 " a flexible array member has"
5897 " changed in GCC 4.4");
5898 }
5899 continue;
5900 }
5901 num = classify_argument (TYPE_MODE (type), type,
5902 subclasses,
5903 (int_bit_position (field)
5904 + bit_offset) % 256);
5905 if (!num)
5906 return 0;
5907 pos = (int_bit_position (field)
5908 + (bit_offset % 64)) / 8 / 8;
5909 for (i = 0; i < num && (i + pos) < words; i++)
5910 classes[i + pos] =
5911 merge_classes (subclasses[i], classes[i + pos]);
5912 }
5913 }
5914 }
5915 break;
5916
5917 case ARRAY_TYPE:
5918 /* Arrays are handled as small records. */
5919 {
5920 int num;
5921 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5922 TREE_TYPE (type), subclasses, bit_offset);
5923 if (!num)
5924 return 0;
5925
5926 /* The partial classes are now full classes. */
5927 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5928 subclasses[0] = X86_64_SSE_CLASS;
5929 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5930 && !((bit_offset % 64) == 0 && bytes == 4))
5931 subclasses[0] = X86_64_INTEGER_CLASS;
5932
5933 for (i = 0; i < words; i++)
5934 classes[i] = subclasses[i % num];
5935
5936 break;
5937 }
5938 case UNION_TYPE:
5939 case QUAL_UNION_TYPE:
5940 /* Unions are similar to RECORD_TYPE but the offset is
5941 always 0. */
5942 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5943 {
5944 if (TREE_CODE (field) == FIELD_DECL)
5945 {
5946 int num;
5947
5948 if (TREE_TYPE (field) == error_mark_node)
5949 continue;
5950
5951 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5952 TREE_TYPE (field), subclasses,
5953 bit_offset);
5954 if (!num)
5955 return 0;
5956 for (i = 0; i < num; i++)
5957 classes[i] = merge_classes (subclasses[i], classes[i]);
5958 }
5959 }
5960 break;
5961
5962 default:
5963 gcc_unreachable ();
5964 }
5965
5966 if (words > 2)
5967 {
5968 /* When the size is larger than 16 bytes, if the first class isn't
5969 X86_64_SSE_CLASS or any of the others isn't
5970 X86_64_SSEUP_CLASS, everything should be passed in
5971 memory. */
5972 if (classes[0] != X86_64_SSE_CLASS)
5973 return 0;
5974
5975 for (i = 1; i < words; i++)
5976 if (classes[i] != X86_64_SSEUP_CLASS)
5977 return 0;
5978 }
5979
5980 /* Final merger cleanup. */
5981 for (i = 0; i < words; i++)
5982 {
5983 /* If one class is MEMORY, everything should be passed in
5984 memory. */
5985 if (classes[i] == X86_64_MEMORY_CLASS)
5986 return 0;
5987
5988 /* The X86_64_SSEUP_CLASS should be always preceded by
5989 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
5990 if (classes[i] == X86_64_SSEUP_CLASS
5991 && classes[i - 1] != X86_64_SSE_CLASS
5992 && classes[i - 1] != X86_64_SSEUP_CLASS)
5993 {
5994 /* The first one should never be X86_64_SSEUP_CLASS. */
5995 gcc_assert (i != 0);
5996 classes[i] = X86_64_SSE_CLASS;
5997 }
5998
5999 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6000 everything should be passed in memory. */
6001 if (classes[i] == X86_64_X87UP_CLASS
6002 && (classes[i - 1] != X86_64_X87_CLASS))
6003 {
6004 static bool warned;
6005
6006 /* The first one should never be X86_64_X87UP_CLASS. */
6007 gcc_assert (i != 0);
6008 if (!warned && warn_psabi)
6009 {
6010 warned = true;
6011 inform (input_location,
6012 "the ABI of passing union with long double"
6013 " has changed in GCC 4.4");
6014 }
6015 return 0;
6016 }
6017 }
6018 return words;
6019 }
6020
6021 /* Compute the alignment needed. We align all types to natural boundaries,
6022 with the exception of XFmode, which is aligned to 64 bits. */
6023 if (mode != VOIDmode && mode != BLKmode)
6024 {
6025 int mode_alignment = GET_MODE_BITSIZE (mode);
6026
6027 if (mode == XFmode)
6028 mode_alignment = 128;
6029 else if (mode == XCmode)
6030 mode_alignment = 256;
6031 if (COMPLEX_MODE_P (mode))
6032 mode_alignment /= 2;
6033 /* Misaligned fields are always returned in memory. */
6034 if (bit_offset % mode_alignment)
6035 return 0;
6036 }
6037
6038 /* For V1xx modes, just use the base mode. */
6039 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6040 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6041 mode = GET_MODE_INNER (mode);
6042
6043 /* Classification of atomic types. */
6044 switch (mode)
6045 {
6046 case SDmode:
6047 case DDmode:
6048 classes[0] = X86_64_SSE_CLASS;
6049 return 1;
6050 case TDmode:
6051 classes[0] = X86_64_SSE_CLASS;
6052 classes[1] = X86_64_SSEUP_CLASS;
6053 return 2;
6054 case DImode:
6055 case SImode:
6056 case HImode:
6057 case QImode:
6058 case CSImode:
6059 case CHImode:
6060 case CQImode:
6061 {
6062 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
6063
6064 if (size <= 32)
6065 {
6066 classes[0] = X86_64_INTEGERSI_CLASS;
6067 return 1;
6068 }
6069 else if (size <= 64)
6070 {
6071 classes[0] = X86_64_INTEGER_CLASS;
6072 return 1;
6073 }
6074 else if (size <= 64+32)
6075 {
6076 classes[0] = X86_64_INTEGER_CLASS;
6077 classes[1] = X86_64_INTEGERSI_CLASS;
6078 return 2;
6079 }
6080 else if (size <= 64+64)
6081 {
6082 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6083 return 2;
6084 }
6085 else
6086 gcc_unreachable ();
6087 }
6088 case CDImode:
6089 case TImode:
6090 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6091 return 2;
6092 case COImode:
6093 case OImode:
6094 /* OImode shouldn't be used directly. */
6095 gcc_unreachable ();
6096 case CTImode:
6097 return 0;
6098 case SFmode:
6099 if (!(bit_offset % 64))
6100 classes[0] = X86_64_SSESF_CLASS;
6101 else
6102 classes[0] = X86_64_SSE_CLASS;
6103 return 1;
6104 case DFmode:
6105 classes[0] = X86_64_SSEDF_CLASS;
6106 return 1;
6107 case XFmode:
6108 classes[0] = X86_64_X87_CLASS;
6109 classes[1] = X86_64_X87UP_CLASS;
6110 return 2;
6111 case TFmode:
6112 classes[0] = X86_64_SSE_CLASS;
6113 classes[1] = X86_64_SSEUP_CLASS;
6114 return 2;
6115 case SCmode:
6116 classes[0] = X86_64_SSE_CLASS;
6117 if (!(bit_offset % 64))
6118 return 1;
6119 else
6120 {
6121 static bool warned;
6122
6123 if (!warned && warn_psabi)
6124 {
6125 warned = true;
6126 inform (input_location,
6127 "the ABI of passing structure with complex float"
6128 " member has changed in GCC 4.4");
6129 }
6130 classes[1] = X86_64_SSESF_CLASS;
6131 return 2;
6132 }
6133 case DCmode:
6134 classes[0] = X86_64_SSEDF_CLASS;
6135 classes[1] = X86_64_SSEDF_CLASS;
6136 return 2;
6137 case XCmode:
6138 classes[0] = X86_64_COMPLEX_X87_CLASS;
6139 return 1;
6140 case TCmode:
6141 /* This mode is larger than 16 bytes. */
6142 return 0;
6143 case V8SFmode:
6144 case V8SImode:
6145 case V32QImode:
6146 case V16HImode:
6147 case V4DFmode:
6148 case V4DImode:
6149 classes[0] = X86_64_SSE_CLASS;
6150 classes[1] = X86_64_SSEUP_CLASS;
6151 classes[2] = X86_64_SSEUP_CLASS;
6152 classes[3] = X86_64_SSEUP_CLASS;
6153 return 4;
6154 case V4SFmode:
6155 case V4SImode:
6156 case V16QImode:
6157 case V8HImode:
6158 case V2DFmode:
6159 case V2DImode:
6160 classes[0] = X86_64_SSE_CLASS;
6161 classes[1] = X86_64_SSEUP_CLASS;
6162 return 2;
6163 case V1TImode:
6164 case V1DImode:
6165 case V2SFmode:
6166 case V2SImode:
6167 case V4HImode:
6168 case V8QImode:
6169 classes[0] = X86_64_SSE_CLASS;
6170 return 1;
6171 case BLKmode:
6172 case VOIDmode:
6173 return 0;
6174 default:
6175 gcc_assert (VECTOR_MODE_P (mode));
6176
6177 if (bytes > 16)
6178 return 0;
6179
6180 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6181
6182 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6183 classes[0] = X86_64_INTEGERSI_CLASS;
6184 else
6185 classes[0] = X86_64_INTEGER_CLASS;
6186 classes[1] = X86_64_INTEGER_CLASS;
6187 return 1 + (bytes > 8);
6188 }
6189 }
6190
6191 /* Examine the argument and set the number of registers required in each
6192 class. Return 0 iff the parameter should be passed in memory. */
6193 static int
6194 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6195 int *int_nregs, int *sse_nregs)
6196 {
6197 enum x86_64_reg_class regclass[MAX_CLASSES];
6198 int n = classify_argument (mode, type, regclass, 0);
6199
6200 *int_nregs = 0;
6201 *sse_nregs = 0;
6202 if (!n)
6203 return 0;
6204 for (n--; n >= 0; n--)
6205 switch (regclass[n])
6206 {
6207 case X86_64_INTEGER_CLASS:
6208 case X86_64_INTEGERSI_CLASS:
6209 (*int_nregs)++;
6210 break;
6211 case X86_64_SSE_CLASS:
6212 case X86_64_SSESF_CLASS:
6213 case X86_64_SSEDF_CLASS:
6214 (*sse_nregs)++;
6215 break;
6216 case X86_64_NO_CLASS:
6217 case X86_64_SSEUP_CLASS:
6218 break;
6219 case X86_64_X87_CLASS:
6220 case X86_64_X87UP_CLASS:
6221 if (!in_return)
6222 return 0;
6223 break;
6224 case X86_64_COMPLEX_X87_CLASS:
6225 return in_return ? 2 : 0;
6226 case X86_64_MEMORY_CLASS:
6227 gcc_unreachable ();
6228 }
6229 return 1;
6230 }
6231
6232 /* Construct container for the argument used by GCC interface. See
6233 FUNCTION_ARG for the detailed description. */
6234
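/* For example, struct { double d; long l; } passed by value produces a
   PARALLEL with an SSE register (DFmode) at offset 0 and a general
   register (DImode) at offset 8; registers are taken from INTREG and
   SSE_REGNO in order.  */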
6235 static rtx
6236 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6237 const_tree type, int in_return, int nintregs, int nsseregs,
6238 const int *intreg, int sse_regno)
6239 {
6240 /* The following variables hold the static issued_error state. */
6241 static bool issued_sse_arg_error;
6242 static bool issued_sse_ret_error;
6243 static bool issued_x87_ret_error;
6244
6245 enum machine_mode tmpmode;
6246 int bytes =
6247 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6248 enum x86_64_reg_class regclass[MAX_CLASSES];
6249 int n;
6250 int i;
6251 int nexps = 0;
6252 int needed_sseregs, needed_intregs;
6253 rtx exp[MAX_CLASSES];
6254 rtx ret;
6255
6256 n = classify_argument (mode, type, regclass, 0);
6257 if (!n)
6258 return NULL;
6259 if (!examine_argument (mode, type, in_return, &needed_intregs,
6260 &needed_sseregs))
6261 return NULL;
6262 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6263 return NULL;
6264
6265 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6266 some less clueful developer tries to use floating-point anyway. */
6267 if (needed_sseregs && !TARGET_SSE)
6268 {
6269 if (in_return)
6270 {
6271 if (!issued_sse_ret_error)
6272 {
6273 error ("SSE register return with SSE disabled");
6274 issued_sse_ret_error = true;
6275 }
6276 }
6277 else if (!issued_sse_arg_error)
6278 {
6279 error ("SSE register argument with SSE disabled");
6280 issued_sse_arg_error = true;
6281 }
6282 return NULL;
6283 }
6284
6285 /* Likewise, error if the ABI requires us to return values in the
6286 x87 registers and the user specified -mno-80387. */
6287 if (!TARGET_80387 && in_return)
6288 for (i = 0; i < n; i++)
6289 if (regclass[i] == X86_64_X87_CLASS
6290 || regclass[i] == X86_64_X87UP_CLASS
6291 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6292 {
6293 if (!issued_x87_ret_error)
6294 {
6295 error ("x87 register return with x87 disabled");
6296 issued_x87_ret_error = true;
6297 }
6298 return NULL;
6299 }
6300
6301 /* First construct simple cases. Avoid SCmode, since we want to use
6302 a single register to pass this type. */
6303 if (n == 1 && mode != SCmode)
6304 switch (regclass[0])
6305 {
6306 case X86_64_INTEGER_CLASS:
6307 case X86_64_INTEGERSI_CLASS:
6308 return gen_rtx_REG (mode, intreg[0]);
6309 case X86_64_SSE_CLASS:
6310 case X86_64_SSESF_CLASS:
6311 case X86_64_SSEDF_CLASS:
6312 if (mode != BLKmode)
6313 return gen_reg_or_parallel (mode, orig_mode,
6314 SSE_REGNO (sse_regno));
6315 break;
6316 case X86_64_X87_CLASS:
6317 case X86_64_COMPLEX_X87_CLASS:
6318 return gen_rtx_REG (mode, FIRST_STACK_REG);
6319 case X86_64_NO_CLASS:
6320 /* Zero sized array, struct or class. */
6321 return NULL;
6322 default:
6323 gcc_unreachable ();
6324 }
6325 if (n == 2
6326 && regclass[0] == X86_64_SSE_CLASS
6327 && regclass[1] == X86_64_SSEUP_CLASS
6328 && mode != BLKmode)
6329 return gen_reg_or_parallel (mode, orig_mode,
6330 SSE_REGNO (sse_regno));
6331 if (n == 4
6332 && regclass[0] == X86_64_SSE_CLASS
6333 && regclass[1] == X86_64_SSEUP_CLASS
6334 && regclass[2] == X86_64_SSEUP_CLASS
6335 && regclass[3] == X86_64_SSEUP_CLASS
6336 && mode != BLKmode)
6337 return gen_reg_or_parallel (mode, orig_mode,
6338 SSE_REGNO (sse_regno));
6339 if (n == 2
6340 && regclass[0] == X86_64_X87_CLASS
6341 && regclass[1] == X86_64_X87UP_CLASS)
6342 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6343
6344 if (n == 2
6345 && regclass[0] == X86_64_INTEGER_CLASS
6346 && regclass[1] == X86_64_INTEGER_CLASS
6347 && (mode == CDImode || mode == TImode || mode == TFmode)
6348 && intreg[0] + 1 == intreg[1])
6349 return gen_rtx_REG (mode, intreg[0]);
6350
6351 /* Otherwise figure out the entries of the PARALLEL. */
6352 for (i = 0; i < n; i++)
6353 {
6354 int pos;
6355
6356 switch (regclass[i])
6357 {
6358 case X86_64_NO_CLASS:
6359 break;
6360 case X86_64_INTEGER_CLASS:
6361 case X86_64_INTEGERSI_CLASS:
6362 /* Merge TImodes on aligned occasions here too. */
6363 if (i * 8 + 8 > bytes)
6364 tmpmode
6365 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6366 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6367 tmpmode = SImode;
6368 else
6369 tmpmode = DImode;
6370 /* We've requested 24 bytes for which
6371 we don't have a mode. Use DImode. */
6372 if (tmpmode == BLKmode)
6373 tmpmode = DImode;
6374 exp [nexps++]
6375 = gen_rtx_EXPR_LIST (VOIDmode,
6376 gen_rtx_REG (tmpmode, *intreg),
6377 GEN_INT (i*8));
6378 intreg++;
6379 break;
6380 case X86_64_SSESF_CLASS:
6381 exp [nexps++]
6382 = gen_rtx_EXPR_LIST (VOIDmode,
6383 gen_rtx_REG (SFmode,
6384 SSE_REGNO (sse_regno)),
6385 GEN_INT (i*8));
6386 sse_regno++;
6387 break;
6388 case X86_64_SSEDF_CLASS:
6389 exp [nexps++]
6390 = gen_rtx_EXPR_LIST (VOIDmode,
6391 gen_rtx_REG (DFmode,
6392 SSE_REGNO (sse_regno)),
6393 GEN_INT (i*8));
6394 sse_regno++;
6395 break;
6396 case X86_64_SSE_CLASS:
6397 pos = i;
6398 switch (n)
6399 {
6400 case 1:
6401 tmpmode = DImode;
6402 break;
6403 case 2:
6404 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6405 {
6406 tmpmode = TImode;
6407 i++;
6408 }
6409 else
6410 tmpmode = DImode;
6411 break;
6412 case 4:
6413 gcc_assert (i == 0
6414 && regclass[1] == X86_64_SSEUP_CLASS
6415 && regclass[2] == X86_64_SSEUP_CLASS
6416 && regclass[3] == X86_64_SSEUP_CLASS);
6417 tmpmode = OImode;
6418 i += 3;
6419 break;
6420 default:
6421 gcc_unreachable ();
6422 }
6423 exp [nexps++]
6424 = gen_rtx_EXPR_LIST (VOIDmode,
6425 gen_rtx_REG (tmpmode,
6426 SSE_REGNO (sse_regno)),
6427 GEN_INT (pos*8));
6428 sse_regno++;
6429 break;
6430 default:
6431 gcc_unreachable ();
6432 }
6433 }
6434
6435 /* Empty aligned struct, union or class. */
6436 if (nexps == 0)
6437 return NULL;
6438
6439 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6440 for (i = 0; i < nexps; i++)
6441 XVECEXP (ret, 0, i) = exp [i];
6442 return ret;
6443 }
6444
6445 /* Update the data in CUM to advance over an argument of mode MODE
6446 and data type TYPE. (TYPE is null for libcalls where that information
6447 may not be available.) */
6448
6449 static void
6450 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6451 const_tree type, HOST_WIDE_INT bytes,
6452 HOST_WIDE_INT words)
6453 {
6454 switch (mode)
6455 {
6456 default:
6457 break;
6458
6459 case BLKmode:
6460 if (bytes < 0)
6461 break;
6462 /* FALLTHRU */
6463
6464 case DImode:
6465 case SImode:
6466 case HImode:
6467 case QImode:
6468 cum->words += words;
6469 cum->nregs -= words;
6470 cum->regno += words;
6471
6472 if (cum->nregs <= 0)
6473 {
6474 cum->nregs = 0;
6475 cum->regno = 0;
6476 }
6477 break;
6478
6479 case OImode:
6480 /* OImode shouldn't be used directly. */
6481 gcc_unreachable ();
6482
6483 case DFmode:
6484 if (cum->float_in_sse < 2)
6485 break;
6486 case SFmode:
6487 if (cum->float_in_sse < 1)
6488 break;
6489 /* FALLTHRU */
6490
6491 case V8SFmode:
6492 case V8SImode:
6493 case V32QImode:
6494 case V16HImode:
6495 case V4DFmode:
6496 case V4DImode:
6497 case TImode:
6498 case V16QImode:
6499 case V8HImode:
6500 case V4SImode:
6501 case V2DImode:
6502 case V4SFmode:
6503 case V2DFmode:
6504 if (!type || !AGGREGATE_TYPE_P (type))
6505 {
6506 cum->sse_words += words;
6507 cum->sse_nregs -= 1;
6508 cum->sse_regno += 1;
6509 if (cum->sse_nregs <= 0)
6510 {
6511 cum->sse_nregs = 0;
6512 cum->sse_regno = 0;
6513 }
6514 }
6515 break;
6516
6517 case V8QImode:
6518 case V4HImode:
6519 case V2SImode:
6520 case V2SFmode:
6521 case V1TImode:
6522 case V1DImode:
6523 if (!type || !AGGREGATE_TYPE_P (type))
6524 {
6525 cum->mmx_words += words;
6526 cum->mmx_nregs -= 1;
6527 cum->mmx_regno += 1;
6528 if (cum->mmx_nregs <= 0)
6529 {
6530 cum->mmx_nregs = 0;
6531 cum->mmx_regno = 0;
6532 }
6533 }
6534 break;
6535 }
6536 }
6537
6538 static void
6539 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6540 const_tree type, HOST_WIDE_INT words, bool named)
6541 {
6542 int int_nregs, sse_nregs;
6543
6544 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6545 if (!named && VALID_AVX256_REG_MODE (mode))
6546 return;
6547
6548 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6549 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6550 {
6551 cum->nregs -= int_nregs;
6552 cum->sse_nregs -= sse_nregs;
6553 cum->regno += int_nregs;
6554 cum->sse_regno += sse_nregs;
6555 }
6556 else
6557 {
6558 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6559 cum->words = (cum->words + align - 1) & ~(align - 1);
6560 cum->words += words;
6561 }
6562 }
6563
6564 static void
6565 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6566 HOST_WIDE_INT words)
6567 {
6568 /* Otherwise, this should be passed indirectly. */
6569 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6570
6571 cum->words += words;
6572 if (cum->nregs > 0)
6573 {
6574 cum->nregs -= 1;
6575 cum->regno += 1;
6576 }
6577 }
6578
6579 /* Update the data in CUM to advance over an argument of mode MODE and
6580 data type TYPE. (TYPE is null for libcalls where that information
6581 may not be available.) */
6582
6583 static void
6584 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6585 const_tree type, bool named)
6586 {
6587 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6588 HOST_WIDE_INT bytes, words;
6589
6590 if (mode == BLKmode)
6591 bytes = int_size_in_bytes (type);
6592 else
6593 bytes = GET_MODE_SIZE (mode);
6594 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6595
6596 if (type)
6597 mode = type_natural_mode (type, NULL);
6598
6599 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6600 function_arg_advance_ms_64 (cum, bytes, words);
6601 else if (TARGET_64BIT)
6602 function_arg_advance_64 (cum, mode, type, words, named);
6603 else
6604 function_arg_advance_32 (cum, mode, type, bytes, words);
6605 }
6606
6607 /* Define where to put the arguments to a function.
6608 Value is zero to push the argument on the stack,
6609 or a hard register in which to store the argument.
6610
6611 MODE is the argument's machine mode.
6612 TYPE is the data type of the argument (as a tree).
6613 This is null for libcalls where that information may
6614 not be available.
6615 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6616 the preceding args and about the function being called.
6617 NAMED is nonzero if this argument is a named parameter
6618 (otherwise it is an extra parameter matching an ellipsis). */
6619
6620 static rtx
6621 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6622 enum machine_mode orig_mode, const_tree type,
6623 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6624 {
6625 static bool warnedsse, warnedmmx;
6626
6627 /* Avoid the AL settings for the Unix64 ABI. */
6628 if (mode == VOIDmode)
6629 return constm1_rtx;
6630
6631 switch (mode)
6632 {
6633 default:
6634 break;
6635
6636 case BLKmode:
6637 if (bytes < 0)
6638 break;
6639 /* FALLTHRU */
6640 case DImode:
6641 case SImode:
6642 case HImode:
6643 case QImode:
6644 if (words <= cum->nregs)
6645 {
6646 int regno = cum->regno;
6647
6648 	  /* Fastcall allocates the first two DWORD (SImode) or
6649 	     smaller arguments to ECX and EDX, unless the argument
6650 	     is an aggregate type.  */
6651 if (cum->fastcall)
6652 {
6653 if (mode == BLKmode
6654 || mode == DImode
6655 || (type && AGGREGATE_TYPE_P (type)))
6656 break;
6657
6658 	      /* ECX, not EAX, is the first allocated register.  */
6659 if (regno == AX_REG)
6660 regno = CX_REG;
6661 }
6662 return gen_rtx_REG (mode, regno);
6663 }
6664 break;
6665
6666 case DFmode:
6667 if (cum->float_in_sse < 2)
6668 break;
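       /* FALLTHRU */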
6669 case SFmode:
6670 if (cum->float_in_sse < 1)
6671 break;
6672 /* FALLTHRU */
6674     case TImode:
6675       /* In 32-bit mode, we pass TImode in XMM registers.  */
6675 case V16QImode:
6676 case V8HImode:
6677 case V4SImode:
6678 case V2DImode:
6679 case V4SFmode:
6680 case V2DFmode:
6681 if (!type || !AGGREGATE_TYPE_P (type))
6682 {
6683 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6684 {
6685 warnedsse = true;
6686 warning (0, "SSE vector argument without SSE enabled "
6687 "changes the ABI");
6688 }
6689 if (cum->sse_nregs)
6690 return gen_reg_or_parallel (mode, orig_mode,
6691 cum->sse_regno + FIRST_SSE_REG);
6692 }
6693 break;
6694
6695 case OImode:
6696 /* OImode shouldn't be used directly. */
6697 gcc_unreachable ();
6698
6699 case V8SFmode:
6700 case V8SImode:
6701 case V32QImode:
6702 case V16HImode:
6703 case V4DFmode:
6704 case V4DImode:
6705 if (!type || !AGGREGATE_TYPE_P (type))
6706 {
6707 if (cum->sse_nregs)
6708 return gen_reg_or_parallel (mode, orig_mode,
6709 cum->sse_regno + FIRST_SSE_REG);
6710 }
6711 break;
6712
6713 case V8QImode:
6714 case V4HImode:
6715 case V2SImode:
6716 case V2SFmode:
6717 case V1TImode:
6718 case V1DImode:
6719 if (!type || !AGGREGATE_TYPE_P (type))
6720 {
6721 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6722 {
6723 warnedmmx = true;
6724 warning (0, "MMX vector argument without MMX enabled "
6725 "changes the ABI");
6726 }
6727 if (cum->mmx_nregs)
6728 return gen_reg_or_parallel (mode, orig_mode,
6729 cum->mmx_regno + FIRST_MMX_REG);
6730 }
6731 break;
6732 }
6733
6734 return NULL_RTX;
6735 }
6736
6737 static rtx
6738 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6739 enum machine_mode orig_mode, const_tree type, bool named)
6740 {
6741 /* Handle a hidden AL argument containing number of registers
6742 for varargs x86-64 functions. */
6743 if (mode == VOIDmode)
6744 return GEN_INT (cum->maybe_vaarg
6745 ? (cum->sse_nregs < 0
6746 ? X86_64_SSE_REGPARM_MAX
6747 : cum->sse_regno)
6748 : -1);
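  /* For example, a varargs call that passes one double in %xmm0 loads
     %al with 1; the callee's prologue only checks that value against
     zero to decide whether to spill the SSE argument registers.  */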
6749
6750 switch (mode)
6751 {
6752 default:
6753 break;
6754
6755 case V8SFmode:
6756 case V8SImode:
6757 case V32QImode:
6758 case V16HImode:
6759 case V4DFmode:
6760 case V4DImode:
6761 /* Unnamed 256bit vector mode parameters are passed on stack. */
6762 if (!named)
6763 return NULL;
6764 break;
6765 }
6766
6767 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6768 cum->sse_nregs,
6769 &x86_64_int_parameter_registers [cum->regno],
6770 cum->sse_regno);
6771 }
6772
6773 static rtx
6774 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6775 enum machine_mode orig_mode, bool named,
6776 HOST_WIDE_INT bytes)
6777 {
6778 unsigned int regno;
6779
6780   /* We need to add a clobber for MS_ABI -> SYSV ABI calls in expand_call.
6781      The value -2 specifies that the current function call uses the MS ABI.  */
6782 if (mode == VOIDmode)
6783 return GEN_INT (-2);
6784
6785 /* If we've run out of registers, it goes on the stack. */
6786 if (cum->nregs == 0)
6787 return NULL_RTX;
6788
6789 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6790
6791 /* Only floating point modes are passed in anything but integer regs. */
6792 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6793 {
6794 if (named)
6795 regno = cum->regno + FIRST_SSE_REG;
6796 else
6797 {
6798 rtx t1, t2;
6799
6800 /* Unnamed floating parameters are passed in both the
6801 SSE and integer registers. */
6802 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6803 t2 = gen_rtx_REG (mode, regno);
6804 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6805 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6806 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6807 }
6808 }
6809   /* Handle aggregate types passed in a register.  */
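  /* For example, an aggregate of 8 bytes that still has BLKmode is passed
     by value in one DImode register; sizes other than 1, 2, 4 or 8 bytes
     never reach this point, since ix86_pass_by_reference passes them by
     reference instead.  */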
6810 if (orig_mode == BLKmode)
6811 {
6812 if (bytes > 0 && bytes <= 8)
6813 mode = (bytes > 4 ? DImode : SImode);
6814 if (mode == BLKmode)
6815 mode = DImode;
6816 }
6817
6818 return gen_reg_or_parallel (mode, orig_mode, regno);
6819 }
6820
6821 /* Return where to put the arguments to a function.
6822 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6823
6824 MODE is the argument's machine mode. TYPE is the data type of the
6825 argument. It is null for libcalls where that information may not be
6826 available. CUM gives information about the preceding args and about
6827 the function being called. NAMED is nonzero if this argument is a
6828 named parameter (otherwise it is an extra parameter matching an
6829 ellipsis). */
6830
6831 static rtx
6832 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6833 const_tree type, bool named)
6834 {
6835 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6836 enum machine_mode mode = omode;
6837 HOST_WIDE_INT bytes, words;
6838 rtx arg;
6839
6840 if (mode == BLKmode)
6841 bytes = int_size_in_bytes (type);
6842 else
6843 bytes = GET_MODE_SIZE (mode);
6844 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6845
6846 /* To simplify the code below, represent vector types with a vector mode
6847 even if MMX/SSE are not active. */
6848 if (type && TREE_CODE (type) == VECTOR_TYPE)
6849 mode = type_natural_mode (type, cum);
6850
6851 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6852 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6853 else if (TARGET_64BIT)
6854 arg = function_arg_64 (cum, mode, omode, type, named);
6855 else
6856 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6857
6858 return arg;
6859 }
6860
6861 /* A C expression that indicates when an argument must be passed by
6862 reference. If nonzero for an argument, a copy of that argument is
6863 made in memory and a pointer to the argument is passed instead of
6864 the argument itself. The pointer is passed in whatever way is
6865 appropriate for passing a pointer to that type. */
6866
6867 static bool
6868 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6869 enum machine_mode mode ATTRIBUTE_UNUSED,
6870 const_tree type, bool named ATTRIBUTE_UNUSED)
6871 {
6872 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6873
6874 /* See Windows x64 Software Convention. */
6875 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6876 {
6877 int msize = (int) GET_MODE_SIZE (mode);
6878 if (type)
6879 {
6880 /* Arrays are passed by reference. */
6881 if (TREE_CODE (type) == ARRAY_TYPE)
6882 return true;
6883
6884 if (AGGREGATE_TYPE_P (type))
6885 {
6886 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6887 are passed by reference. */
6888 msize = int_size_in_bytes (type);
6889 }
6890 }
6891
6892       /* __m128, like any size other than 1, 2, 4 or 8 bytes, is passed
	      by reference.  */
6893 switch (msize) {
6894 case 1: case 2: case 4: case 8:
6895 break;
6896 default:
6897 return true;
6898 }
6899 }
6900 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
6901     return true;
6902
6903   return false;
6904 }
6905
6906 /* Return true when TYPE should be 128bit aligned for 32bit argument
6907 passing ABI. XXX: This function is obsolete and is only used for
6908 checking psABI compatibility with previous versions of GCC. */
6909
6910 static bool
6911 ix86_compat_aligned_value_p (const_tree type)
6912 {
6913 enum machine_mode mode = TYPE_MODE (type);
6914 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6915 || mode == TDmode
6916 || mode == TFmode
6917 || mode == TCmode)
6918 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6919 return true;
6920 if (TYPE_ALIGN (type) < 128)
6921 return false;
6922
6923 if (AGGREGATE_TYPE_P (type))
6924 {
6925 /* Walk the aggregates recursively. */
6926 switch (TREE_CODE (type))
6927 {
6928 case RECORD_TYPE:
6929 case UNION_TYPE:
6930 case QUAL_UNION_TYPE:
6931 {
6932 tree field;
6933
6934 /* Walk all the structure fields. */
6935 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6936 {
6937 if (TREE_CODE (field) == FIELD_DECL
6938 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6939 return true;
6940 }
6941 break;
6942 }
6943
6944 case ARRAY_TYPE:
6945 	/* Just in case some languages pass arrays by value.  */
6946 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6947 return true;
6948 break;
6949
6950 default:
6951 gcc_unreachable ();
6952 }
6953 }
6954 return false;
6955 }
6956
6957 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6958 XXX: This function is obsolete and is only used for checking psABI
6959 compatibility with previous versions of GCC. */
6960
6961 static unsigned int
6962 ix86_compat_function_arg_boundary (enum machine_mode mode,
6963 const_tree type, unsigned int align)
6964 {
6965 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6966 natural boundaries. */
6967 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6968 {
6969 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6970 make an exception for SSE modes since these require 128bit
6971 alignment.
6972
6973 The handling here differs from field_alignment. ICC aligns MMX
6974 arguments to 4 byte boundaries, while structure fields are aligned
6975 to 8 byte boundaries. */
6976 if (!type)
6977 {
6978 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6979 align = PARM_BOUNDARY;
6980 }
6981 else
6982 {
6983 if (!ix86_compat_aligned_value_p (type))
6984 align = PARM_BOUNDARY;
6985 }
6986 }
6987 if (align > BIGGEST_ALIGNMENT)
6988 align = BIGGEST_ALIGNMENT;
6989 return align;
6990 }
6991
6992 /* Return true when TYPE should be 128bit aligned for 32bit argument
6993 passing ABI. */
6994
6995 static bool
6996 ix86_contains_aligned_value_p (const_tree type)
6997 {
6998 enum machine_mode mode = TYPE_MODE (type);
6999
7000 if (mode == XFmode || mode == XCmode)
7001 return false;
7002
7003 if (TYPE_ALIGN (type) < 128)
7004 return false;
7005
7006 if (AGGREGATE_TYPE_P (type))
7007 {
7008 /* Walk the aggregates recursively. */
7009 switch (TREE_CODE (type))
7010 {
7011 case RECORD_TYPE:
7012 case UNION_TYPE:
7013 case QUAL_UNION_TYPE:
7014 {
7015 tree field;
7016
7017 /* Walk all the structure fields. */
7018 for (field = TYPE_FIELDS (type);
7019 field;
7020 field = DECL_CHAIN (field))
7021 {
7022 if (TREE_CODE (field) == FIELD_DECL
7023 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7024 return true;
7025 }
7026 break;
7027 }
7028
7029 case ARRAY_TYPE:
7030 	/* Just in case some languages pass arrays by value.  */
7031 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7032 return true;
7033 break;
7034
7035 default:
7036 gcc_unreachable ();
7037 }
7038 }
7039 else
7040 return TYPE_ALIGN (type) >= 128;
7041
7042 return false;
7043 }
7044
7045 /* Gives the alignment boundary, in bits, of an argument with the
7046 specified mode and type. */
7047
7048 static unsigned int
7049 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7050 {
7051 unsigned int align;
7052 if (type)
7053 {
7054       /* The main variant type is used for the call, so convert
7055 	 the type to its main variant.  */
7056 type = TYPE_MAIN_VARIANT (type);
7057 align = TYPE_ALIGN (type);
7058 }
7059 else
7060 align = GET_MODE_ALIGNMENT (mode);
7061 if (align < PARM_BOUNDARY)
7062 align = PARM_BOUNDARY;
7063 else
7064 {
7065 static bool warned;
7066 unsigned int saved_align = align;
7067
7068 if (!TARGET_64BIT)
7069 {
7070 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7071 if (!type)
7072 {
7073 if (mode == XFmode || mode == XCmode)
7074 align = PARM_BOUNDARY;
7075 }
7076 else if (!ix86_contains_aligned_value_p (type))
7077 align = PARM_BOUNDARY;
7078
7079 if (align < 128)
7080 align = PARM_BOUNDARY;
7081 }
7082
7083 if (warn_psabi
7084 && !warned
7085 && align != ix86_compat_function_arg_boundary (mode, type,
7086 saved_align))
7087 {
7088 warned = true;
7089 inform (input_location,
7090 "The ABI for passing parameters with %d-byte"
7091 " alignment has changed in GCC 4.6",
7092 align / BITS_PER_UNIT);
7093 }
7094 }
7095
7096 return align;
7097 }
7098
7099 /* Return true if N is a possible register number of function value. */
7100
7101 static bool
7102 ix86_function_value_regno_p (const unsigned int regno)
7103 {
7104 switch (regno)
7105 {
7106 case AX_REG:
7107 return true;
7108
7109 case FIRST_FLOAT_REG:
7110       /* TODO: The function should depend on the current function's ABI,
7111 	 but builtins.c would then need updating.  Therefore we use the
7112 	 default ABI.  */
7113 if (TARGET_64BIT && ix86_abi == MS_ABI)
7114 return false;
7115 return TARGET_FLOAT_RETURNS_IN_80387;
7116
7117 case FIRST_SSE_REG:
7118 return TARGET_SSE;
7119
7120 case FIRST_MMX_REG:
7121 if (TARGET_MACHO || TARGET_64BIT)
7122 return false;
7123 return TARGET_MMX;
7124 }
7125
7126 return false;
7127 }
7128
7129 /* Define how to find the value returned by a function.
7130 VALTYPE is the data type of the value (as a tree).
7131 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7132 otherwise, FUNC is 0. */
7133
7134 static rtx
7135 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7136 const_tree fntype, const_tree fn)
7137 {
7138 unsigned int regno;
7139
7140 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7141      we normally prevent this case when MMX is not available.  However,
7142      some ABIs may require the result to be returned like DImode.  */
7143 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7144 regno = FIRST_MMX_REG;
7145
7146 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7147      we prevent this case when SSE is not available.  However, some ABIs
7148      may require the result to be returned like integer TImode.  */
7149 else if (mode == TImode
7150 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7151 regno = FIRST_SSE_REG;
7152
7153 /* 32-byte vector modes in %ymm0. */
7154 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7155 regno = FIRST_SSE_REG;
7156
7157 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7158 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7159 regno = FIRST_FLOAT_REG;
7160 else
7161 /* Most things go in %eax. */
7162 regno = AX_REG;
7163
7164 /* Override FP return register with %xmm0 for local functions when
7165      SSE math is enabled or for functions with the sseregparm attribute.  */
7166 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7167 {
7168 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7169 if ((sse_level >= 1 && mode == SFmode)
7170 || (sse_level == 2 && mode == DFmode))
7171 regno = FIRST_SSE_REG;
7172 }
7173
7174 /* OImode shouldn't be used directly. */
7175 gcc_assert (mode != OImode);
7176
7177 return gen_rtx_REG (orig_mode, regno);
7178 }
7179
7180 static rtx
7181 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7182 const_tree valtype)
7183 {
7184 rtx ret;
7185
7186 /* Handle libcalls, which don't provide a type node. */
7187 if (valtype == NULL)
7188 {
7189 unsigned int regno;
7190
7191 switch (mode)
7192 {
7193 case SFmode:
7194 case SCmode:
7195 case DFmode:
7196 case DCmode:
7197 case TFmode:
7198 case SDmode:
7199 case DDmode:
7200 case TDmode:
7201 regno = FIRST_SSE_REG;
7202 break;
7203 case XFmode:
7204 case XCmode:
7205 regno = FIRST_FLOAT_REG;
7206 break;
7207 case TCmode:
7208 return NULL;
7209 default:
7210 regno = AX_REG;
7211 }
7212
7213 return gen_rtx_REG (mode, regno);
7214 }
7215 else if (POINTER_TYPE_P (valtype))
7216 {
7217 /* Pointers are always returned in word_mode. */
7218 mode = word_mode;
7219 }
7220
7221 ret = construct_container (mode, orig_mode, valtype, 1,
7222 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7223 x86_64_int_return_registers, 0);
7224
7225   /* For zero-sized structures, construct_container returns NULL, but we
7226      need to keep the rest of the compiler happy by returning a meaningful value.  */
7227 if (!ret)
7228 ret = gen_rtx_REG (orig_mode, AX_REG);
7229
7230 return ret;
7231 }
7232
7233 static rtx
7234 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7235 {
7236 unsigned int regno = AX_REG;
7237
7238 if (TARGET_SSE)
7239 {
7240 switch (GET_MODE_SIZE (mode))
7241 {
7242 case 16:
7243 if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7244 && !COMPLEX_MODE_P (mode))
7245 regno = FIRST_SSE_REG;
7246 break;
7247 case 8:
7248 case 4:
7249 if (mode == SFmode || mode == DFmode)
7250 regno = FIRST_SSE_REG;
7251 break;
7252 default:
7253 break;
7254 }
7255 }
7256 return gen_rtx_REG (orig_mode, regno);
7257 }
7258
7259 static rtx
7260 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7261 enum machine_mode orig_mode, enum machine_mode mode)
7262 {
7263 const_tree fn, fntype;
7264
7265 fn = NULL_TREE;
7266 if (fntype_or_decl && DECL_P (fntype_or_decl))
7267 fn = fntype_or_decl;
7268 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7269
7270 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7271 return function_value_ms_64 (orig_mode, mode);
7272 else if (TARGET_64BIT)
7273 return function_value_64 (orig_mode, mode, valtype);
7274 else
7275 return function_value_32 (orig_mode, mode, fntype, fn);
7276 }
7277
7278 static rtx
7279 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7280 bool outgoing ATTRIBUTE_UNUSED)
7281 {
7282 enum machine_mode mode, orig_mode;
7283
7284 orig_mode = TYPE_MODE (valtype);
7285 mode = type_natural_mode (valtype, NULL);
7286 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7287 }
7288
7289 /* Pointer function arguments and return values are promoted to
7290 word_mode. */
7291
7292 static enum machine_mode
7293 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7294 int *punsignedp, const_tree fntype,
7295 int for_return)
7296 {
7297 if (type != NULL_TREE && POINTER_TYPE_P (type))
7298 {
7299 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7300 return word_mode;
7301 }
7302 return default_promote_function_mode (type, mode, punsignedp, fntype,
7303 for_return);
7304 }
7305
7306 /* Return true if a structure, union or array with MODE containing FIELD
7307 should be accessed using BLKmode. */
7308
7309 static bool
7310 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7311 {
7312 /* Union with XFmode must be in BLKmode. */
7313 return (mode == XFmode
7314 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7315 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7316 }
7317
7318 rtx
7319 ix86_libcall_value (enum machine_mode mode)
7320 {
7321 return ix86_function_value_1 (NULL, NULL, mode, mode);
7322 }
7323
7324 /* Return true iff type is returned in memory. */
7325
7326 static bool ATTRIBUTE_UNUSED
7327 return_in_memory_32 (const_tree type, enum machine_mode mode)
7328 {
7329 HOST_WIDE_INT size;
7330
7331 if (mode == BLKmode)
7332 return true;
7333
7334 size = int_size_in_bytes (type);
7335
7336 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7337 return false;
7338
7339 if (VECTOR_MODE_P (mode) || mode == TImode)
7340 {
7341 /* User-created vectors small enough to fit in EAX. */
7342 if (size < 8)
7343 return false;
7344
7345 /* MMX/3dNow values are returned in MM0,
7346 	 except when MMX doesn't exist or the ABI prescribes otherwise.  */
7347 if (size == 8)
7348 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7349
7350 /* SSE values are returned in XMM0, except when it doesn't exist. */
7351 if (size == 16)
7352 return !TARGET_SSE;
7353
7354 /* AVX values are returned in YMM0, except when it doesn't exist. */
7355 if (size == 32)
7356 return !TARGET_AVX;
7357 }
7358
7359 if (mode == XFmode)
7360 return false;
7361
7362 if (size > 12)
7363 return true;
7364
7365 /* OImode shouldn't be used directly. */
7366 gcc_assert (mode != OImode);
7367
7368 return false;
7369 }
7370
7371 static bool ATTRIBUTE_UNUSED
7372 return_in_memory_64 (const_tree type, enum machine_mode mode)
7373 {
7374 int needed_intregs, needed_sseregs;
7375 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7376 }
7377
7378 static bool ATTRIBUTE_UNUSED
7379 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7380 {
7381 HOST_WIDE_INT size = int_size_in_bytes (type);
7382
7383 /* __m128 is returned in xmm0. */
7384 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7385 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7386 return false;
7387
7388   /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7389 return size != 1 && size != 2 && size != 4 && size != 8;
7390 }
7391
7392 static bool
7393 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7394 {
7395 #ifdef SUBTARGET_RETURN_IN_MEMORY
7396 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7397 #else
7398 const enum machine_mode mode = type_natural_mode (type, NULL);
7399
7400 if (TARGET_64BIT)
7401 {
7402 if (ix86_function_type_abi (fntype) == MS_ABI)
7403 return return_in_memory_ms_64 (type, mode);
7404 else
7405 return return_in_memory_64 (type, mode);
7406 }
7407 else
7408 return return_in_memory_32 (type, mode);
7409 #endif
7410 }
7411
7412 /* When returning SSE vector types, we have a choice of either
7413 (1) being abi incompatible with a -march switch, or
7414 (2) generating an error.
7415 Given no good solution, I think the safest thing is one warning.
7416 The user won't be able to use -Werror, but....
7417
7418 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7419 called in response to actually generating a caller or callee that
7420 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7421 via aggregate_value_p for general type probing from tree-ssa. */
7422
7423 static rtx
7424 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7425 {
7426 static bool warnedsse, warnedmmx;
7427
7428 if (!TARGET_64BIT && type)
7429 {
7430 /* Look at the return type of the function, not the function type. */
7431 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7432
7433 if (!TARGET_SSE && !warnedsse)
7434 {
7435 if (mode == TImode
7436 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7437 {
7438 warnedsse = true;
7439 warning (0, "SSE vector return without SSE enabled "
7440 "changes the ABI");
7441 }
7442 }
7443
7444 if (!TARGET_MMX && !warnedmmx)
7445 {
7446 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7447 {
7448 warnedmmx = true;
7449 warning (0, "MMX vector return without MMX enabled "
7450 "changes the ABI");
7451 }
7452 }
7453 }
7454
7455 return NULL;
7456 }
7457
7458 \f
7459 /* Create the va_list data type. */
7460
7461 /* Returns the calling-convention-specific va_list data type.
7462 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7463
7464 static tree
7465 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7466 {
7467 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7468
7469   /* For i386 we use a plain pointer to the argument area.  */
7470 if (!TARGET_64BIT || abi == MS_ABI)
7471 return build_pointer_type (char_type_node);
7472
7473 record = lang_hooks.types.make_type (RECORD_TYPE);
7474 type_decl = build_decl (BUILTINS_LOCATION,
7475 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7476
7477 f_gpr = build_decl (BUILTINS_LOCATION,
7478 FIELD_DECL, get_identifier ("gp_offset"),
7479 unsigned_type_node);
7480 f_fpr = build_decl (BUILTINS_LOCATION,
7481 FIELD_DECL, get_identifier ("fp_offset"),
7482 unsigned_type_node);
7483 f_ovf = build_decl (BUILTINS_LOCATION,
7484 FIELD_DECL, get_identifier ("overflow_arg_area"),
7485 ptr_type_node);
7486 f_sav = build_decl (BUILTINS_LOCATION,
7487 FIELD_DECL, get_identifier ("reg_save_area"),
7488 ptr_type_node);
7489
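  /* The record assembled below mirrors the va_list layout mandated by the
     x86-64 psABI:

	typedef struct __va_list_tag {
	  unsigned int gp_offset;
	  unsigned int fp_offset;
	  void *overflow_arg_area;
	  void *reg_save_area;
	} va_list[1];  */
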
7490 va_list_gpr_counter_field = f_gpr;
7491 va_list_fpr_counter_field = f_fpr;
7492
7493 DECL_FIELD_CONTEXT (f_gpr) = record;
7494 DECL_FIELD_CONTEXT (f_fpr) = record;
7495 DECL_FIELD_CONTEXT (f_ovf) = record;
7496 DECL_FIELD_CONTEXT (f_sav) = record;
7497
7498 TYPE_STUB_DECL (record) = type_decl;
7499 TYPE_NAME (record) = type_decl;
7500 TYPE_FIELDS (record) = f_gpr;
7501 DECL_CHAIN (f_gpr) = f_fpr;
7502 DECL_CHAIN (f_fpr) = f_ovf;
7503 DECL_CHAIN (f_ovf) = f_sav;
7504
7505 layout_type (record);
7506
7507 /* The correct type is an array type of one element. */
7508 return build_array_type (record, build_index_type (size_zero_node));
7509 }
7510
7511 /* Set up the builtin va_list data type, and for 64-bit the additional
7512    calling-convention-specific va_list data types.  */
7513
7514 static tree
7515 ix86_build_builtin_va_list (void)
7516 {
7517 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7518
7519 /* Initialize abi specific va_list builtin types. */
7520 if (TARGET_64BIT)
7521 {
7522 tree t;
7523 if (ix86_abi == MS_ABI)
7524 {
7525 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7526 if (TREE_CODE (t) != RECORD_TYPE)
7527 t = build_variant_type_copy (t);
7528 sysv_va_list_type_node = t;
7529 }
7530 else
7531 {
7532 t = ret;
7533 if (TREE_CODE (t) != RECORD_TYPE)
7534 t = build_variant_type_copy (t);
7535 sysv_va_list_type_node = t;
7536 }
7537 if (ix86_abi != MS_ABI)
7538 {
7539 t = ix86_build_builtin_va_list_abi (MS_ABI);
7540 if (TREE_CODE (t) != RECORD_TYPE)
7541 t = build_variant_type_copy (t);
7542 ms_va_list_type_node = t;
7543 }
7544 else
7545 {
7546 t = ret;
7547 if (TREE_CODE (t) != RECORD_TYPE)
7548 t = build_variant_type_copy (t);
7549 ms_va_list_type_node = t;
7550 }
7551 }
7552
7553 return ret;
7554 }
7555
7556 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7557
7558 static void
7559 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7560 {
7561 rtx save_area, mem;
7562 alias_set_type set;
7563 int i, max;
7564
7565 /* GPR size of varargs save area. */
7566 if (cfun->va_list_gpr_size)
7567 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7568 else
7569 ix86_varargs_gpr_size = 0;
7570
7571 /* FPR size of varargs save area. We don't need it if we don't pass
7572 anything in SSE registers. */
7573 if (TARGET_SSE && cfun->va_list_fpr_size)
7574 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7575 else
7576 ix86_varargs_fpr_size = 0;
7577
7578 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7579 return;
7580
7581 save_area = frame_pointer_rtx;
7582 set = get_varargs_alias_set ();
7583
7584 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7585 if (max > X86_64_REGPARM_MAX)
7586 max = X86_64_REGPARM_MAX;
7587
7588 for (i = cum->regno; i < max; i++)
7589 {
7590 mem = gen_rtx_MEM (word_mode,
7591 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7592 MEM_NOTRAP_P (mem) = 1;
7593 set_mem_alias_set (mem, set);
7594 emit_move_insn (mem,
7595 gen_rtx_REG (word_mode,
7596 x86_64_int_parameter_registers[i]));
7597 }
7598
7599 if (ix86_varargs_fpr_size)
7600 {
7601 enum machine_mode smode;
7602 rtx label, test;
7603
7604       /* Now emit code to save SSE registers.  The AX parameter contains the
7605 	 number of SSE parameter registers used to call this function, though
7606 	 all we actually check here is the zero/non-zero status.  */
7607
7608 label = gen_label_rtx ();
7609 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7610 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7611 label));
7612
7613 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7614 we used movdqa (i.e. TImode) instead? Perhaps even better would
7615 be if we could determine the real mode of the data, via a hook
7616 into pass_stdarg. Ignore all that for now. */
7617 smode = V4SFmode;
7618 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7619 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7620
7621 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7622 if (max > X86_64_SSE_REGPARM_MAX)
7623 max = X86_64_SSE_REGPARM_MAX;
7624
7625 for (i = cum->sse_regno; i < max; ++i)
7626 {
7627 mem = plus_constant (Pmode, save_area,
7628 i * 16 + ix86_varargs_gpr_size);
7629 mem = gen_rtx_MEM (smode, mem);
7630 MEM_NOTRAP_P (mem) = 1;
7631 set_mem_alias_set (mem, set);
7632 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7633
7634 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7635 }
7636
7637 emit_label (label);
7638 }
7639 }
7640
7641 static void
7642 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7643 {
7644 alias_set_type set = get_varargs_alias_set ();
7645 int i;
7646
7647   /* Reset to zero, as there might have been a SysV va_arg use
7648      before.  */
7649 ix86_varargs_gpr_size = 0;
7650 ix86_varargs_fpr_size = 0;
7651
7652 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7653 {
7654 rtx reg, mem;
7655
7656 mem = gen_rtx_MEM (Pmode,
7657 plus_constant (Pmode, virtual_incoming_args_rtx,
7658 i * UNITS_PER_WORD));
7659 MEM_NOTRAP_P (mem) = 1;
7660 set_mem_alias_set (mem, set);
7661
7662 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7663 emit_move_insn (mem, reg);
7664 }
7665 }
7666
7667 static void
7668 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7669 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7670 int no_rtl)
7671 {
7672 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7673 CUMULATIVE_ARGS next_cum;
7674 tree fntype;
7675
7676   /* This argument doesn't appear to be used anymore, which is good,
7677      because the old code here didn't suppress rtl generation.  */
7678 gcc_assert (!no_rtl);
7679
7680 if (!TARGET_64BIT)
7681 return;
7682
7683 fntype = TREE_TYPE (current_function_decl);
7684
7685 /* For varargs, we do not want to skip the dummy va_dcl argument.
7686 For stdargs, we do want to skip the last named argument. */
7687 next_cum = *cum;
7688 if (stdarg_p (fntype))
7689 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7690 true);
7691
7692 if (cum->call_abi == MS_ABI)
7693 setup_incoming_varargs_ms_64 (&next_cum);
7694 else
7695 setup_incoming_varargs_64 (&next_cum);
7696 }
7697
7698 /* Check whether TYPE is a char * flavor of va_list.  */
7699
7700 static bool
7701 is_va_list_char_pointer (tree type)
7702 {
7703 tree canonic;
7704
7705 /* For 32-bit it is always true. */
7706 if (!TARGET_64BIT)
7707 return true;
7708 canonic = ix86_canonical_va_list_type (type);
7709 return (canonic == ms_va_list_type_node
7710 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7711 }
7712
7713 /* Implement va_start. */
7714
7715 static void
7716 ix86_va_start (tree valist, rtx nextarg)
7717 {
7718 HOST_WIDE_INT words, n_gpr, n_fpr;
7719 tree f_gpr, f_fpr, f_ovf, f_sav;
7720 tree gpr, fpr, ovf, sav, t;
7721 tree type;
7722 rtx ovf_rtx;
7723
7724 if (flag_split_stack
7725 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7726 {
7727 unsigned int scratch_regno;
7728
7729 /* When we are splitting the stack, we can't refer to the stack
7730 arguments using internal_arg_pointer, because they may be on
7731 the old stack. The split stack prologue will arrange to
7732 leave a pointer to the old stack arguments in a scratch
7733 register, which we here copy to a pseudo-register. The split
7734 stack prologue can't set the pseudo-register directly because
7735 it (the prologue) runs before any registers have been saved. */
7736
7737 scratch_regno = split_stack_prologue_scratch_regno ();
7738 if (scratch_regno != INVALID_REGNUM)
7739 {
7740 rtx reg, seq;
7741
7742 reg = gen_reg_rtx (Pmode);
7743 cfun->machine->split_stack_varargs_pointer = reg;
7744
7745 start_sequence ();
7746 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7747 seq = get_insns ();
7748 end_sequence ();
7749
7750 push_topmost_sequence ();
7751 emit_insn_after (seq, entry_of_function ());
7752 pop_topmost_sequence ();
7753 }
7754 }
7755
7756 /* Only 64bit target needs something special. */
7757 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7758 {
7759 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7760 std_expand_builtin_va_start (valist, nextarg);
7761 else
7762 {
7763 rtx va_r, next;
7764
7765 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7766 next = expand_binop (ptr_mode, add_optab,
7767 cfun->machine->split_stack_varargs_pointer,
7768 crtl->args.arg_offset_rtx,
7769 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7770 convert_move (va_r, next, 0);
7771 }
7772 return;
7773 }
7774
7775 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7776 f_fpr = DECL_CHAIN (f_gpr);
7777 f_ovf = DECL_CHAIN (f_fpr);
7778 f_sav = DECL_CHAIN (f_ovf);
7779
7780 valist = build_simple_mem_ref (valist);
7781 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7782 /* The following should be folded into the MEM_REF offset. */
7783 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7784 f_gpr, NULL_TREE);
7785 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7786 f_fpr, NULL_TREE);
7787 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7788 f_ovf, NULL_TREE);
7789 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7790 f_sav, NULL_TREE);
7791
7792   /* Count the number of GP and FP argument registers used.  */
7793 words = crtl->args.info.words;
7794 n_gpr = crtl->args.info.regno;
7795 n_fpr = crtl->args.info.sse_regno;
7796
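  /* In the SysV register save area the six GP argument registers occupy
     the first 6 * 8 == 48 bytes and the SSE registers follow at 16 bytes
     each, so gp_offset is n_gpr * 8 and fp_offset is 48 + n_fpr * 16,
     exactly as computed below.  */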
7797 if (cfun->va_list_gpr_size)
7798 {
7799 type = TREE_TYPE (gpr);
7800 t = build2 (MODIFY_EXPR, type,
7801 gpr, build_int_cst (type, n_gpr * 8));
7802 TREE_SIDE_EFFECTS (t) = 1;
7803 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7804 }
7805
7806 if (TARGET_SSE && cfun->va_list_fpr_size)
7807 {
7808 type = TREE_TYPE (fpr);
7809 t = build2 (MODIFY_EXPR, type, fpr,
7810 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7811 TREE_SIDE_EFFECTS (t) = 1;
7812 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7813 }
7814
7815 /* Find the overflow area. */
7816 type = TREE_TYPE (ovf);
7817 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7818 ovf_rtx = crtl->args.internal_arg_pointer;
7819 else
7820 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7821 t = make_tree (type, ovf_rtx);
7822 if (words != 0)
7823 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7824 t = build2 (MODIFY_EXPR, type, ovf, t);
7825 TREE_SIDE_EFFECTS (t) = 1;
7826 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7827
7828 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7829 {
7830 /* Find the register save area.
7831 	 The function prologue saves it right above the stack frame.  */
7832 type = TREE_TYPE (sav);
7833 t = make_tree (type, frame_pointer_rtx);
7834 if (!ix86_varargs_gpr_size)
7835 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7836 t = build2 (MODIFY_EXPR, type, sav, t);
7837 TREE_SIDE_EFFECTS (t) = 1;
7838 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7839 }
7840 }
7841
7842 /* Implement va_arg. */
7843
7844 static tree
7845 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7846 gimple_seq *post_p)
7847 {
7848 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7849 tree f_gpr, f_fpr, f_ovf, f_sav;
7850 tree gpr, fpr, ovf, sav, t;
7851 int size, rsize;
7852 tree lab_false, lab_over = NULL_TREE;
7853 tree addr, t2;
7854 rtx container;
7855 int indirect_p = 0;
7856 tree ptrtype;
7857 enum machine_mode nat_mode;
7858 unsigned int arg_boundary;
7859
7860 /* Only 64bit target needs something special. */
7861 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7862 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7863
7864 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7865 f_fpr = DECL_CHAIN (f_gpr);
7866 f_ovf = DECL_CHAIN (f_fpr);
7867 f_sav = DECL_CHAIN (f_ovf);
7868
7869 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7870 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7871 valist = build_va_arg_indirect_ref (valist);
7872 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7873 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7874 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7875
7876 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7877 if (indirect_p)
7878 type = build_pointer_type (type);
7879 size = int_size_in_bytes (type);
7880 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7881
7882 nat_mode = type_natural_mode (type, NULL);
7883 switch (nat_mode)
7884 {
7885 case V8SFmode:
7886 case V8SImode:
7887 case V32QImode:
7888 case V16HImode:
7889 case V4DFmode:
7890 case V4DImode:
7891 /* Unnamed 256bit vector mode parameters are passed on stack. */
7892 if (!TARGET_64BIT_MS_ABI)
7893 {
7894 container = NULL;
7895 break;
7896 }
7897
7898 default:
7899 container = construct_container (nat_mode, TYPE_MODE (type),
7900 type, 0, X86_64_REGPARM_MAX,
7901 X86_64_SSE_REGPARM_MAX, intreg,
7902 0);
7903 break;
7904 }
7905
7906 /* Pull the value out of the saved registers. */
7907
7908 addr = create_tmp_var (ptr_type_node, "addr");
7909
7910 if (container)
7911 {
7912 int needed_intregs, needed_sseregs;
7913 bool need_temp;
7914 tree int_addr, sse_addr;
7915
7916 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7917 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7918
7919 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7920
7921 need_temp = (!REG_P (container)
7922 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7923 || TYPE_ALIGN (type) > 128));
7924
7925       /* If we are passing a structure, verify that it is a consecutive block
7926 	 in the register save area.  If not, we need to do moves.  */
7927 if (!need_temp && !REG_P (container))
7928 {
7929 	  /* Verify that all registers are strictly consecutive.  */
7930 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7931 {
7932 int i;
7933
7934 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7935 {
7936 rtx slot = XVECEXP (container, 0, i);
7937 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7938 || INTVAL (XEXP (slot, 1)) != i * 16)
7939 need_temp = 1;
7940 }
7941 }
7942 else
7943 {
7944 int i;
7945
7946 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7947 {
7948 rtx slot = XVECEXP (container, 0, i);
7949 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7950 || INTVAL (XEXP (slot, 1)) != i * 8)
7951 need_temp = 1;
7952 }
7953 }
7954 }
7955 if (!need_temp)
7956 {
7957 int_addr = addr;
7958 sse_addr = addr;
7959 }
7960 else
7961 {
7962 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7963 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7964 }
7965
7966 /* First ensure that we fit completely in registers. */
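      /* For example, with needed_intregs == 2 the GOTO below is taken once
	 gp_offset reaches (6 - 2 + 1) * 8 == 40, i.e. when fewer than two
	 GP argument registers remain unread.  */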
7967 if (needed_intregs)
7968 {
7969 t = build_int_cst (TREE_TYPE (gpr),
7970 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7971 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7972 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7973 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7974 gimplify_and_add (t, pre_p);
7975 }
7976 if (needed_sseregs)
7977 {
7978 t = build_int_cst (TREE_TYPE (fpr),
7979 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7980 + X86_64_REGPARM_MAX * 8);
7981 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7982 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7983 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7984 gimplify_and_add (t, pre_p);
7985 }
7986
7987 /* Compute index to start of area used for integer regs. */
7988 if (needed_intregs)
7989 {
7990 /* int_addr = gpr + sav; */
7991 t = fold_build_pointer_plus (sav, gpr);
7992 gimplify_assign (int_addr, t, pre_p);
7993 }
7994 if (needed_sseregs)
7995 {
7996 /* sse_addr = fpr + sav; */
7997 t = fold_build_pointer_plus (sav, fpr);
7998 gimplify_assign (sse_addr, t, pre_p);
7999 }
8000 if (need_temp)
8001 {
8002 int i, prev_size = 0;
8003 tree temp = create_tmp_var (type, "va_arg_tmp");
8004
8005 /* addr = &temp; */
8006 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8007 gimplify_assign (addr, t, pre_p);
8008
8009 for (i = 0; i < XVECLEN (container, 0); i++)
8010 {
8011 rtx slot = XVECEXP (container, 0, i);
8012 rtx reg = XEXP (slot, 0);
8013 enum machine_mode mode = GET_MODE (reg);
8014 tree piece_type;
8015 tree addr_type;
8016 tree daddr_type;
8017 tree src_addr, src;
8018 int src_offset;
8019 tree dest_addr, dest;
8020 int cur_size = GET_MODE_SIZE (mode);
8021
8022 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8023 prev_size = INTVAL (XEXP (slot, 1));
8024 if (prev_size + cur_size > size)
8025 {
8026 cur_size = size - prev_size;
8027 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8028 if (mode == BLKmode)
8029 mode = QImode;
8030 }
8031 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8032 if (mode == GET_MODE (reg))
8033 addr_type = build_pointer_type (piece_type);
8034 else
8035 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8036 true);
8037 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8038 true);
8039
8040 if (SSE_REGNO_P (REGNO (reg)))
8041 {
8042 src_addr = sse_addr;
8043 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8044 }
8045 else
8046 {
8047 src_addr = int_addr;
8048 src_offset = REGNO (reg) * 8;
8049 }
8050 src_addr = fold_convert (addr_type, src_addr);
8051 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8052
8053 dest_addr = fold_convert (daddr_type, addr);
8054 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8055 if (cur_size == GET_MODE_SIZE (mode))
8056 {
8057 src = build_va_arg_indirect_ref (src_addr);
8058 dest = build_va_arg_indirect_ref (dest_addr);
8059
8060 gimplify_assign (dest, src, pre_p);
8061 }
8062 else
8063 {
8064 tree copy
8065 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8066 3, dest_addr, src_addr,
8067 size_int (cur_size));
8068 gimplify_and_add (copy, pre_p);
8069 }
8070 prev_size += cur_size;
8071 }
8072 }
8073
8074 if (needed_intregs)
8075 {
8076 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8077 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8078 gimplify_assign (gpr, t, pre_p);
8079 }
8080
8081 if (needed_sseregs)
8082 {
8083 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8084 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8085 gimplify_assign (fpr, t, pre_p);
8086 }
8087
8088 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8089
8090 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8091 }
8092
8093 /* ... otherwise out of the overflow area. */
8094
8095 /* When we align parameter on stack for caller, if the parameter
8096 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8097 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We will match callee
8098 here with caller. */
8099 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8100 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8101 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8102
8103 /* Care for on-stack alignment if needed. */
8104 if (arg_boundary <= 64 || size == 0)
8105 t = ovf;
8106 else
8107 {
8108 HOST_WIDE_INT align = arg_boundary / 8;
8109 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8110 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8111 build_int_cst (TREE_TYPE (t), -align));
8112 }
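  /* At this point T is the overflow-area pointer rounded up to the
     argument's alignment, e.g. (ovf + 15) & -16 for a 16-byte-aligned
     type.  */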
8113
8114 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8115 gimplify_assign (addr, t, pre_p);
8116
8117 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8118 gimplify_assign (unshare_expr (ovf), t, pre_p);
8119
8120 if (container)
8121 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8122
8123 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8124 addr = fold_convert (ptrtype, addr);
8125
8126 if (indirect_p)
8127 addr = build_va_arg_indirect_ref (addr);
8128 return build_va_arg_indirect_ref (addr);
8129 }
8130 \f
8131 /* Return true if OPNUM's MEM should be matched
8132 in movabs* patterns. */
8133
8134 bool
8135 ix86_check_movabs (rtx insn, int opnum)
8136 {
8137 rtx set, mem;
8138
8139 set = PATTERN (insn);
8140 if (GET_CODE (set) == PARALLEL)
8141 set = XVECEXP (set, 0, 0);
8142 gcc_assert (GET_CODE (set) == SET);
8143 mem = XEXP (set, opnum);
8144 while (GET_CODE (mem) == SUBREG)
8145 mem = SUBREG_REG (mem);
8146 gcc_assert (MEM_P (mem));
8147 return volatile_ok || !MEM_VOLATILE_P (mem);
8148 }
8149 \f
8150 /* Initialize the table of extra 80387 mathematical constants. */
8151
8152 static void
8153 init_ext_80387_constants (void)
8154 {
8155 static const char * cst[5] =
8156 {
8157 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8158 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8159 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8160 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8161 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8162 };
8163 int i;
8164
8165 for (i = 0; i < 5; i++)
8166 {
8167 real_from_string (&ext_80387_constants_table[i], cst[i]);
8168 /* Ensure each constant is rounded to XFmode precision. */
8169 real_convert (&ext_80387_constants_table[i],
8170 XFmode, &ext_80387_constants_table[i]);
8171 }
8172
8173 ext_80387_constants_init = 1;
8174 }
8175
8176 /* Return non-zero if the constant is something that
8177 can be loaded with a special instruction. */
8178
8179 int
8180 standard_80387_constant_p (rtx x)
8181 {
8182 enum machine_mode mode = GET_MODE (x);
8183
8184 REAL_VALUE_TYPE r;
8185
8186 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8187 return -1;
8188
8189 if (x == CONST0_RTX (mode))
8190 return 1;
8191 if (x == CONST1_RTX (mode))
8192 return 2;
8193
8194 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8195
8196 /* For XFmode constants, try to find a special 80387 instruction when
8197 optimizing for size or on those CPUs that benefit from them. */
8198 if (mode == XFmode
8199 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8200 {
8201 int i;
8202
8203 if (! ext_80387_constants_init)
8204 init_ext_80387_constants ();
8205
8206 for (i = 0; i < 5; i++)
8207 if (real_identical (&r, &ext_80387_constants_table[i]))
8208 return i + 3;
8209 }
8210
8211   /* A load of the constant -0.0 or -1.0 will be split into an
8212      fldz;fchs or fld1;fchs sequence.  */
8213 if (real_isnegzero (&r))
8214 return 8;
8215 if (real_identical (&r, &dconstm1))
8216 return 9;
8217
8218 return 0;
8219 }
8220
8221 /* Return the opcode of the special instruction to be used to load
8222 the constant X. */
8223
8224 const char *
8225 standard_80387_constant_opcode (rtx x)
8226 {
8227 switch (standard_80387_constant_p (x))
8228 {
8229 case 1:
8230 return "fldz";
8231 case 2:
8232 return "fld1";
8233 case 3:
8234 return "fldlg2";
8235 case 4:
8236 return "fldln2";
8237 case 5:
8238 return "fldl2e";
8239 case 6:
8240 return "fldl2t";
8241 case 7:
8242 return "fldpi";
8243 case 8:
8244 case 9:
8245 return "#";
8246 default:
8247 gcc_unreachable ();
8248 }
8249 }
8250
8251 /* Return the CONST_DOUBLE representing the 80387 constant that is
8252 loaded by the specified special instruction. The argument IDX
8253 matches the return value from standard_80387_constant_p. */
8254
8255 rtx
8256 standard_80387_constant_rtx (int idx)
8257 {
8258 int i;
8259
8260 if (! ext_80387_constants_init)
8261 init_ext_80387_constants ();
8262
8263 switch (idx)
8264 {
8265 case 3:
8266 case 4:
8267 case 5:
8268 case 6:
8269 case 7:
8270 i = idx - 3;
8271 break;
8272
8273 default:
8274 gcc_unreachable ();
8275 }
8276
8277 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8278 XFmode);
8279 }
8280
8281 /* Return 1 if X is all zeros and 2 if X is all ones
8282    in a supported SSE/AVX vector mode.  */
8283
8284 int
8285 standard_sse_constant_p (rtx x)
8286 {
8287 enum machine_mode mode = GET_MODE (x);
8288
8289 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8290 return 1;
8291 if (vector_all_ones_operand (x, mode))
8292 switch (mode)
8293 {
8294 case V16QImode:
8295 case V8HImode:
8296 case V4SImode:
8297 case V2DImode:
8298 if (TARGET_SSE2)
8299 return 2;
8300 case V32QImode:
8301 case V16HImode:
8302 case V8SImode:
8303 case V4DImode:
8304 if (TARGET_AVX2)
8305 return 2;
8306 default:
8307 break;
8308 }
8309
8310 return 0;
8311 }
8312
8313 /* Return the opcode of the special instruction to be used to load
8314 the constant X. */
8315
8316 const char *
8317 standard_sse_constant_opcode (rtx insn, rtx x)
8318 {
8319 switch (standard_sse_constant_p (x))
8320 {
8321 case 1:
8322 switch (get_attr_mode (insn))
8323 {
8324 case MODE_TI:
8325 return "%vpxor\t%0, %d0";
8326 case MODE_V2DF:
8327 return "%vxorpd\t%0, %d0";
8328 case MODE_V4SF:
8329 return "%vxorps\t%0, %d0";
8330
8331 case MODE_OI:
8332 return "vpxor\t%x0, %x0, %x0";
8333 case MODE_V4DF:
8334 return "vxorpd\t%x0, %x0, %x0";
8335 case MODE_V8SF:
8336 return "vxorps\t%x0, %x0, %x0";
8337
8338 default:
8339 break;
8340 }
8341
8342 case 2:
8343 if (TARGET_AVX)
8344 return "vpcmpeqd\t%0, %0, %0";
8345 else
8346 return "pcmpeqd\t%0, %0";
8347
8348 default:
8349 break;
8350 }
8351 gcc_unreachable ();
8352 }
8353
8354 /* Returns true if OP contains a symbol reference.  */
8355
8356 bool
8357 symbolic_reference_mentioned_p (rtx op)
8358 {
8359 const char *fmt;
8360 int i;
8361
8362 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8363 return true;
8364
8365 fmt = GET_RTX_FORMAT (GET_CODE (op));
8366 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8367 {
8368 if (fmt[i] == 'E')
8369 {
8370 int j;
8371
8372 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8373 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8374 return true;
8375 }
8376
8377 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8378 return true;
8379 }
8380
8381 return false;
8382 }
8383
8384 /* Return true if it is appropriate to emit `ret' instructions in the
8385 body of a function. Do this only if the epilogue is simple, needing a
8386 couple of insns. Prior to reloading, we can't tell how many registers
8387 must be saved, so return false then. Return false if there is no frame
8388 marker to de-allocate. */
8389
8390 bool
8391 ix86_can_use_return_insn_p (void)
8392 {
8393 struct ix86_frame frame;
8394
8395 if (! reload_completed || frame_pointer_needed)
8396 return 0;
8397
8398 /* Don't allow more than 32k pop, since that's all we can do
8399 with one instruction. */
8400 if (crtl->args.pops_args && crtl->args.size >= 32768)
8401 return 0;
8402
8403 ix86_compute_frame_layout (&frame);
8404 return (frame.stack_pointer_offset == UNITS_PER_WORD
8405 && (frame.nregs + frame.nsseregs) == 0);
8406 }
8407 \f
8408 /* Value should be nonzero if functions must have frame pointers.
8409 Zero means the frame pointer need not be set up (and parms may
8410 be accessed via the stack pointer) in functions that seem suitable. */
8411
8412 static bool
8413 ix86_frame_pointer_required (void)
8414 {
8415 /* If we accessed previous frames, then the generated code expects
8416 to be able to access the saved ebp value in our frame. */
8417 if (cfun->machine->accesses_prev_frame)
8418 return true;
8419
8420   /* Several x86 OSes need a frame pointer for other reasons,
8421 usually pertaining to setjmp. */
8422 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8423 return true;
8424
8425   /* For older 32-bit runtimes, setjmp requires a valid frame pointer.  */
8426 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8427 return true;
8428
8429   /* With Win64 SEH, very large frames need a frame pointer, as the
8430      maximum stack allocation is 4GB.  */
8431 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8432 return true;
8433
8434 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8435 turns off the frame pointer by default. Turn it back on now if
8436      we do not have a leaf function.  */
8437 if (TARGET_OMIT_LEAF_FRAME_POINTER
8438 && (!crtl->is_leaf
8439 || ix86_current_function_calls_tls_descriptor))
8440 return true;
8441
8442 if (crtl->profile && !flag_fentry)
8443 return true;
8444
8445 return false;
8446 }
8447
8448 /* Record that the current function accesses previous call frames. */
8449
8450 void
8451 ix86_setup_frame_addresses (void)
8452 {
8453 cfun->machine->accesses_prev_frame = 1;
8454 }
8455 \f
8456 #ifndef USE_HIDDEN_LINKONCE
8457 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8458 # define USE_HIDDEN_LINKONCE 1
8459 # else
8460 # define USE_HIDDEN_LINKONCE 0
8461 # endif
8462 #endif
8463
8464 static int pic_labels_used;
8465
8466 /* Fills in the label name that should be used for a pc thunk for
8467 the given register. */
8468
8469 static void
8470 get_pc_thunk_name (char name[32], unsigned int regno)
8471 {
8472 gcc_assert (!TARGET_64BIT);
8473
8474 if (USE_HIDDEN_LINKONCE)
8475 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8476 else
8477 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8478 }
8479
8480
8481 /* This function generates the -fpic pc thunks, each of which loads
8482    its register with the return address of the caller and then returns.  */
8483
8484 static void
8485 ix86_code_end (void)
8486 {
8487 rtx xops[2];
8488 int regno;
8489
8490 for (regno = AX_REG; regno <= SP_REG; regno++)
8491 {
8492 char name[32];
8493 tree decl;
8494
8495 if (!(pic_labels_used & (1 << regno)))
8496 continue;
8497
8498 get_pc_thunk_name (name, regno);
8499
8500 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8501 get_identifier (name),
8502 build_function_type_list (void_type_node, NULL_TREE));
8503 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8504 NULL_TREE, void_type_node);
8505 TREE_PUBLIC (decl) = 1;
8506 TREE_STATIC (decl) = 1;
8507 DECL_IGNORED_P (decl) = 1;
8508
8509 #if TARGET_MACHO
8510 if (TARGET_MACHO)
8511 {
8512 switch_to_section (darwin_sections[text_coal_section]);
8513 fputs ("\t.weak_definition\t", asm_out_file);
8514 assemble_name (asm_out_file, name);
8515 fputs ("\n\t.private_extern\t", asm_out_file);
8516 assemble_name (asm_out_file, name);
8517 putc ('\n', asm_out_file);
8518 ASM_OUTPUT_LABEL (asm_out_file, name);
8519 DECL_WEAK (decl) = 1;
8520 }
8521 else
8522 #endif
8523 if (USE_HIDDEN_LINKONCE)
8524 {
8525 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8526
8527 targetm.asm_out.unique_section (decl, 0);
8528 switch_to_section (get_named_section (decl, NULL, 0));
8529
8530 targetm.asm_out.globalize_label (asm_out_file, name);
8531 fputs ("\t.hidden\t", asm_out_file);
8532 assemble_name (asm_out_file, name);
8533 putc ('\n', asm_out_file);
8534 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8535 }
8536 else
8537 {
8538 switch_to_section (text_section);
8539 ASM_OUTPUT_LABEL (asm_out_file, name);
8540 }
8541
8542 DECL_INITIAL (decl) = make_node (BLOCK);
8543 current_function_decl = decl;
8544 init_function_start (decl);
8545 first_function_block_is_cold = false;
8546 /* Make sure unwind info is emitted for the thunk if needed. */
8547 final_start_function (emit_barrier (), asm_out_file, 1);
8548
8549 /* Pad stack IP move with 4 instructions (two NOPs count
8550 as one instruction). */
8551 if (TARGET_PAD_SHORT_FUNCTION)
8552 {
8553 int i = 8;
8554
8555 while (i--)
8556 fputs ("\tnop\n", asm_out_file);
8557 }
8558
8559 xops[0] = gen_rtx_REG (Pmode, regno);
8560 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8561 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8562 fputs ("\tret\n", asm_out_file);
8563 final_end_function ();
8564 init_insn_lengths ();
8565 free_after_compilation (cfun);
8566 set_cfun (NULL);
8567 current_function_decl = NULL;
8568 }
8569
8570 if (flag_split_stack)
8571 file_end_indicate_split_stack ();
8572 }
8573
8574 /* Emit code for the SET_GOT patterns. */
8575
8576 const char *
8577 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8578 {
8579 rtx xops[3];
8580
8581 xops[0] = dest;
8582
8583 if (TARGET_VXWORKS_RTP && flag_pic)
8584 {
8585 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8586 xops[2] = gen_rtx_MEM (Pmode,
8587 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8588 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8589
8590 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8591 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8592 an unadorned address. */
8593 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8594 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8595 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8596 return "";
8597 }
8598
8599 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8600
8601 if (!flag_pic)
8602 {
8603 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8604
8605 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8606
8607 #if TARGET_MACHO
8608 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8609 is what will be referenced by the Mach-O PIC subsystem. */
8610 if (!label)
8611 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8612 #endif
8613
8614 targetm.asm_out.internal_label (asm_out_file, "L",
8615 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8616 }
8617 else
8618 {
8619 char name[32];
8620 get_pc_thunk_name (name, REGNO (dest));
8621 pic_labels_used |= 1 << REGNO (dest);
8622
8623 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8624 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8625 output_asm_insn ("call\t%X2", xops);
8626 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8627 is what will be referenced by the Mach-O PIC subsystem. */
8628 #if TARGET_MACHO
8629 if (!label)
8630 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8631 else
8632 targetm.asm_out.internal_label (asm_out_file, "L",
8633 CODE_LABEL_NUMBER (label));
8634 #endif
8635 }
8636
8637 if (!TARGET_MACHO)
8638 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8639
8640 return "";
8641 }
8642
8643 /* Generate a "push" pattern for input ARG. */
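/* The pattern returned below is, e.g. on 64-bit targets, of the form
   (set (mem:DI (pre_dec:DI (reg:DI sp))) (reg:DI ...)); the frame state
   bookkeeping in M->FS is updated as a side effect.  */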
8644
8645 static rtx
8646 gen_push (rtx arg)
8647 {
8648 struct machine_function *m = cfun->machine;
8649
8650 if (m->fs.cfa_reg == stack_pointer_rtx)
8651 m->fs.cfa_offset += UNITS_PER_WORD;
8652 m->fs.sp_offset += UNITS_PER_WORD;
8653
8654 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8655 arg = gen_rtx_REG (word_mode, REGNO (arg));
8656
8657 return gen_rtx_SET (VOIDmode,
8658 gen_rtx_MEM (word_mode,
8659 gen_rtx_PRE_DEC (Pmode,
8660 stack_pointer_rtx)),
8661 arg);
8662 }
8663
8664 /* Generate a "pop" pattern for input ARG. */
8665
8666 static rtx
8667 gen_pop (rtx arg)
8668 {
8669 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8670 arg = gen_rtx_REG (word_mode, REGNO (arg));
8671
8672 return gen_rtx_SET (VOIDmode,
8673 arg,
8674 gen_rtx_MEM (word_mode,
8675 gen_rtx_POST_INC (Pmode,
8676 stack_pointer_rtx)));
8677 }
8678
8679 /* Return >= 0 if there is an unused call-clobbered register available
8680 for the entire function. */
8681
8682 static unsigned int
8683 ix86_select_alt_pic_regnum (void)
8684 {
8685 if (crtl->is_leaf
8686 && !crtl->profile
8687 && !ix86_current_function_calls_tls_descriptor)
8688 {
8689 int i, drap;
8690 /* Can't use the same register for both PIC and DRAP. */
8691 if (crtl->drap_reg)
8692 drap = REGNO (crtl->drap_reg);
8693 else
8694 drap = -1;
8695 for (i = 2; i >= 0; --i)
8696 if (i != drap && !df_regs_ever_live_p (i))
8697 return i;
8698 }
8699
8700 return INVALID_REGNUM;
8701 }
8702
8703 /* Return TRUE if we need to save REGNO. */
8704
8705 static bool
8706 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8707 {
8708 if (pic_offset_table_rtx
8709 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8710 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8711 || crtl->profile
8712 || crtl->calls_eh_return
8713 || crtl->uses_const_pool))
8714 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8715
8716 if (crtl->calls_eh_return && maybe_eh_return)
8717 {
8718 unsigned i;
8719 for (i = 0; ; i++)
8720 {
8721 unsigned test = EH_RETURN_DATA_REGNO (i);
8722 if (test == INVALID_REGNUM)
8723 break;
8724 if (test == regno)
8725 return true;
8726 }
8727 }
8728
8729 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8730 return true;
8731
8732 return (df_regs_ever_live_p (regno)
8733 && !call_used_regs[regno]
8734 && !fixed_regs[regno]
8735 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8736 }
8737
8738 /* Return the number of saved general purpose registers. */
8739
8740 static int
8741 ix86_nsaved_regs (void)
8742 {
8743 int nregs = 0;
8744 int regno;
8745
8746 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8747 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8748 nregs ++;
8749 return nregs;
8750 }
8751
8752 /* Return the number of saved SSE registers. */
8753
8754 static int
8755 ix86_nsaved_sseregs (void)
8756 {
8757 int nregs = 0;
8758 int regno;
8759
8760 if (!TARGET_64BIT_MS_ABI)
8761 return 0;
8762 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8763 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8764 nregs ++;
8765 return nregs;
8766 }
8767
8768 /* Given FROM and TO register numbers, say whether this elimination is
8769 allowed. If stack alignment is needed, we can only replace argument
8770 pointer with hard frame pointer, or replace frame pointer with stack
8771 pointer. Otherwise, frame pointer elimination is automatically
8772 handled and all other eliminations are valid. */
8773
8774 static bool
8775 ix86_can_eliminate (const int from, const int to)
8776 {
8777 if (stack_realign_fp)
8778 return ((from == ARG_POINTER_REGNUM
8779 && to == HARD_FRAME_POINTER_REGNUM)
8780 || (from == FRAME_POINTER_REGNUM
8781 && to == STACK_POINTER_REGNUM));
8782 else
8783 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8784 }
8785
8786 /* Return the offset between two registers, one to be eliminated, and the other
8787 its replacement, at the start of a routine. */
8788
8789 HOST_WIDE_INT
8790 ix86_initial_elimination_offset (int from, int to)
8791 {
8792 struct ix86_frame frame;
8793 ix86_compute_frame_layout (&frame);
8794
8795 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8796 return frame.hard_frame_pointer_offset;
8797 else if (from == FRAME_POINTER_REGNUM
8798 && to == HARD_FRAME_POINTER_REGNUM)
8799 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8800 else
8801 {
8802 gcc_assert (to == STACK_POINTER_REGNUM);
8803
8804 if (from == ARG_POINTER_REGNUM)
8805 return frame.stack_pointer_offset;
8806
8807 gcc_assert (from == FRAME_POINTER_REGNUM);
8808 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8809 }
8810 }
8811
8812 /* In a dynamically-aligned function, we can't know the offset from
8813 stack pointer to frame pointer, so we must ensure that setjmp
8814 eliminates fp against the hard fp (%ebp) rather than trying to
8815 index from %esp up to the top of the frame across a gap that is
8816 of unknown (at compile-time) size. */
8817 static rtx
8818 ix86_builtin_setjmp_frame_value (void)
8819 {
8820 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8821 }
8822
8823 /* When using -fsplit-stack, the allocation routines set a field in
8824 the TCB to the bottom of the stack plus this much space, measured
8825 in bytes. */
8826
8827 #define SPLIT_STACK_AVAILABLE 256
8828
8829 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
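/* A rough sketch of the layout computed below, from the CFA (high addresses)
   towards the stack pointer (low addresses); each area is optional and the
   alignment padding between them depends on the target:

       return address
       pushed static chain
       saved frame pointer          <- hard_frame_pointer_offset (traditional)
       GP register save area        <- reg_save_offset
       SSE register save area       <- sse_reg_save_offset (16-byte aligned)
       va_arg save area
       local variables              <- frame_pointer_offset is just above them
       outgoing arguments area
                                    <- stack_pointer_offset

   Stack realignment, SEH and the red zone adjust this picture further, as
   handled in the code below.  */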
8830
8831 static void
8832 ix86_compute_frame_layout (struct ix86_frame *frame)
8833 {
8834 unsigned HOST_WIDE_INT stack_alignment_needed;
8835 HOST_WIDE_INT offset;
8836 unsigned HOST_WIDE_INT preferred_alignment;
8837 HOST_WIDE_INT size = get_frame_size ();
8838 HOST_WIDE_INT to_allocate;
8839
8840 frame->nregs = ix86_nsaved_regs ();
8841 frame->nsseregs = ix86_nsaved_sseregs ();
8842
8843 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8844 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8845
8846 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8847 except for function prologues and leaf functions. */
8848 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8849 && (!crtl->is_leaf || cfun->calls_alloca != 0
8850 || ix86_current_function_calls_tls_descriptor))
8851 {
8852 preferred_alignment = 16;
8853 stack_alignment_needed = 16;
8854 crtl->preferred_stack_boundary = 128;
8855 crtl->stack_alignment_needed = 128;
8856 }
8857
8858 gcc_assert (!size || stack_alignment_needed);
8859 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8860 gcc_assert (preferred_alignment <= stack_alignment_needed);
8861
8862 /* For SEH we have to limit the amount of code movement into the prologue.
8863 At present we do this via a BLOCKAGE, at which point there's very little
8864 scheduling that can be done, which means that there's very little point
8865 in doing anything except PUSHs. */
8866 if (TARGET_SEH)
8867 cfun->machine->use_fast_prologue_epilogue = false;
8868
8869 /* During reload iterations the number of registers saved can change.
8870 Recompute the value as needed. Do not recompute when the number of
8871 registers didn't change, as reload makes multiple calls to this function
8872 and does not expect the decision to change within a single iteration. */
8873 else if (!optimize_function_for_size_p (cfun)
8874 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8875 {
8876 int count = frame->nregs;
8877 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8878
8879 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8880
8881 /* The fast prologue uses moves instead of pushes to save registers. This
8882 is significantly longer, but also executes faster, as modern hardware
8883 can execute the moves in parallel but can't do that for push/pop.
8884
8885 Be careful about choosing which prologue to emit: when the function
8886 takes many instructions to execute we may use the slow version, as well
8887 as when the function is known to be outside a hot spot (this is known
8888 with profile feedback only). Weight the size of the function by the
8889 number of registers to save, as it is cheap to use one or two push
8890 instructions but very slow to use many of them. */
8891 if (count)
8892 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8893 if (node->frequency < NODE_FREQUENCY_NORMAL
8894 || (flag_branch_probabilities
8895 && node->frequency < NODE_FREQUENCY_HOT))
8896 cfun->machine->use_fast_prologue_epilogue = false;
8897 else
8898 cfun->machine->use_fast_prologue_epilogue
8899 = !expensive_function_p (count);
8900 }
8901
8902 frame->save_regs_using_mov
8903 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
8904 /* If static stack checking is enabled and done with probes,
8905 the registers need to be saved before allocating the frame. */
8906 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
8907
8908 /* Skip return address. */
8909 offset = UNITS_PER_WORD;
8910
8911 /* Skip pushed static chain. */
8912 if (ix86_static_chain_on_stack)
8913 offset += UNITS_PER_WORD;
8914
8915 /* Skip saved base pointer. */
8916 if (frame_pointer_needed)
8917 offset += UNITS_PER_WORD;
8918 frame->hfp_save_offset = offset;
8919
8920 /* The traditional frame pointer location is at the top of the frame. */
8921 frame->hard_frame_pointer_offset = offset;
8922
8923 /* Register save area */
8924 offset += frame->nregs * UNITS_PER_WORD;
8925 frame->reg_save_offset = offset;
8926
8927 /* On SEH target, registers are pushed just before the frame pointer
8928 location. */
8929 if (TARGET_SEH)
8930 frame->hard_frame_pointer_offset = offset;
8931
8932 /* Align and set SSE register save area. */
8933 if (frame->nsseregs)
8934 {
8935 /* The only ABI that has saved SSE registers (Win64) also has a
8936 16-byte aligned default stack, and thus we don't need to be
8937 within the re-aligned local stack frame to save them. */
8938 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8939 offset = (offset + 16 - 1) & -16;
8940 offset += frame->nsseregs * 16;
8941 }
8942 frame->sse_reg_save_offset = offset;
8943
8944 /* The re-aligned stack starts here. Values before this point are not
8945 directly comparable with values below this point. In order to make
8946 sure that no value happens to be the same before and after, force
8947 the alignment computation below to add a non-zero value. */
8948 if (stack_realign_fp)
8949 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8950
8951 /* Va-arg area */
8952 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8953 offset += frame->va_arg_size;
8954
8955 /* Align start of frame for local function. */
8956 if (stack_realign_fp
8957 || offset != frame->sse_reg_save_offset
8958 || size != 0
8959 || !crtl->is_leaf
8960 || cfun->calls_alloca
8961 || ix86_current_function_calls_tls_descriptor)
8962 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8963
8964 /* Frame pointer points here. */
8965 frame->frame_pointer_offset = offset;
8966
8967 offset += size;
8968
8969 /* Add the outgoing arguments area. It can be skipped if we eliminated
8970 all the function calls as dead code.
8971 Skipping is, however, impossible when the function calls alloca, as the
8972 alloca expander assumes that the last crtl->outgoing_args_size bytes
8973 of the stack frame are unused. */
8974 if (ACCUMULATE_OUTGOING_ARGS
8975 && (!crtl->is_leaf || cfun->calls_alloca
8976 || ix86_current_function_calls_tls_descriptor))
8977 {
8978 offset += crtl->outgoing_args_size;
8979 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8980 }
8981 else
8982 frame->outgoing_arguments_size = 0;
8983
8984 /* Align stack boundary. Only needed if we're calling another function
8985 or using alloca. */
8986 if (!crtl->is_leaf || cfun->calls_alloca
8987 || ix86_current_function_calls_tls_descriptor)
8988 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8989
8990 /* We've reached end of stack frame. */
8991 frame->stack_pointer_offset = offset;
8992
8993 /* Size prologue needs to allocate. */
8994 to_allocate = offset - frame->sse_reg_save_offset;
8995
8996 if ((!to_allocate && frame->nregs <= 1)
8997 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8998 frame->save_regs_using_mov = false;
8999
9000 if (ix86_using_red_zone ()
9001 && crtl->sp_is_unchanging
9002 && crtl->is_leaf
9003 && !ix86_current_function_calls_tls_descriptor)
9004 {
9005 frame->red_zone_size = to_allocate;
9006 if (frame->save_regs_using_mov)
9007 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9008 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9009 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9010 }
9011 else
9012 frame->red_zone_size = 0;
9013 frame->stack_pointer_offset -= frame->red_zone_size;
9014
9015 /* The SEH frame pointer location is near the bottom of the frame.
9016 This is enforced by the fact that the difference between the
9017 stack pointer and the frame pointer is limited to 240 bytes in
9018 the unwind data structure. */
9019 if (TARGET_SEH)
9020 {
9021 HOST_WIDE_INT diff;
9022
9023 /* If we can leave the frame pointer where it is, do so. Also, returns
9024 the establisher frame for __builtin_frame_address (0). */
9025 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9026 if (diff <= SEH_MAX_FRAME_SIZE
9027 && (diff > 240 || (diff & 15) != 0)
9028 && !crtl->accesses_prior_frames)
9029 {
9030 /* Ideally we'd determine what portion of the local stack frame
9031 (within the constraint of the lowest 240) is most heavily used.
9032 But without that complication, simply bias the frame pointer
9033 by 128 bytes so as to maximize the amount of the local stack
9034 frame that is addressable with 8-bit offsets. */
9035 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9036 }
9037 }
9038 }
9039
9040 /* This is semi-inlined memory_address_length, but simplified
9041 since we know that we're always dealing with reg+offset, and
9042 to avoid having to create and discard all that rtl. */
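/* For example (counting only displacement and SIB bytes, as the function
   below does): (%eax) with a zero offset needs 0 bytes, (%ebp) needs 1
   because EBP cannot be encoded without a displacement, 8(%esp) needs
   1 + 1 for the mandatory SIB byte, and 1024(%eax) needs 4.  */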
9043
9044 static inline int
9045 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9046 {
9047 int len = 4;
9048
9049 if (offset == 0)
9050 {
9051 /* EBP and R13 cannot be encoded without an offset. */
9052 len = (regno == BP_REG || regno == R13_REG);
9053 }
9054 else if (IN_RANGE (offset, -128, 127))
9055 len = 1;
9056
9057 /* ESP and R12 must be encoded with a SIB byte. */
9058 if (regno == SP_REG || regno == R12_REG)
9059 len++;
9060
9061 return len;
9062 }
9063
9064 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9065 The valid base registers are taken from CFUN->MACHINE->FS. */
9066
9067 static rtx
9068 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9069 {
9070 const struct machine_function *m = cfun->machine;
9071 rtx base_reg = NULL;
9072 HOST_WIDE_INT base_offset = 0;
9073
9074 if (m->use_fast_prologue_epilogue)
9075 {
9076 /* Choose the base register most likely to allow the most scheduling
9077 opportunities. Generally FP is valid throughout the function,
9078 while DRAP must be reloaded within the epilogue. But choose either
9079 over the SP due to increased encoding size. */
9080
9081 if (m->fs.fp_valid)
9082 {
9083 base_reg = hard_frame_pointer_rtx;
9084 base_offset = m->fs.fp_offset - cfa_offset;
9085 }
9086 else if (m->fs.drap_valid)
9087 {
9088 base_reg = crtl->drap_reg;
9089 base_offset = 0 - cfa_offset;
9090 }
9091 else if (m->fs.sp_valid)
9092 {
9093 base_reg = stack_pointer_rtx;
9094 base_offset = m->fs.sp_offset - cfa_offset;
9095 }
9096 }
9097 else
9098 {
9099 HOST_WIDE_INT toffset;
9100 int len = 16, tlen;
9101
9102 /* Choose the base register with the smallest address encoding.
9103 With a tie, choose FP > DRAP > SP. */
9104 if (m->fs.sp_valid)
9105 {
9106 base_reg = stack_pointer_rtx;
9107 base_offset = m->fs.sp_offset - cfa_offset;
9108 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9109 }
9110 if (m->fs.drap_valid)
9111 {
9112 toffset = 0 - cfa_offset;
9113 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9114 if (tlen <= len)
9115 {
9116 base_reg = crtl->drap_reg;
9117 base_offset = toffset;
9118 len = tlen;
9119 }
9120 }
9121 if (m->fs.fp_valid)
9122 {
9123 toffset = m->fs.fp_offset - cfa_offset;
9124 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9125 if (tlen <= len)
9126 {
9127 base_reg = hard_frame_pointer_rtx;
9128 base_offset = toffset;
9129 len = tlen;
9130 }
9131 }
9132 }
9133 gcc_assert (base_reg != NULL);
9134
9135 return plus_constant (Pmode, base_reg, base_offset);
9136 }
9137
9138 /* Emit code to save registers in the prologue. */
9139
9140 static void
9141 ix86_emit_save_regs (void)
9142 {
9143 unsigned int regno;
9144 rtx insn;
9145
9146 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9147 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9148 {
9149 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9150 RTX_FRAME_RELATED_P (insn) = 1;
9151 }
9152 }
9153
9154 /* Emit a single register save at CFA - CFA_OFFSET. */
9155
9156 static void
9157 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9158 HOST_WIDE_INT cfa_offset)
9159 {
9160 struct machine_function *m = cfun->machine;
9161 rtx reg = gen_rtx_REG (mode, regno);
9162 rtx mem, addr, base, insn;
9163
9164 addr = choose_baseaddr (cfa_offset);
9165 mem = gen_frame_mem (mode, addr);
9166
9167 /* For SSE saves, we need to indicate the 128-bit alignment. */
9168 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9169
9170 insn = emit_move_insn (mem, reg);
9171 RTX_FRAME_RELATED_P (insn) = 1;
9172
9173 base = addr;
9174 if (GET_CODE (base) == PLUS)
9175 base = XEXP (base, 0);
9176 gcc_checking_assert (REG_P (base));
9177
9178 /* When saving registers into a re-aligned local stack frame, avoid
9179 any tricky guessing by dwarf2out. */
9180 if (m->fs.realigned)
9181 {
9182 gcc_checking_assert (stack_realign_drap);
9183
9184 if (regno == REGNO (crtl->drap_reg))
9185 {
9186 /* A bit of a hack. We force the DRAP register to be saved in
9187 the re-aligned stack frame, which provides us with a copy
9188 of the CFA that will last past the prologue. Install it. */
9189 gcc_checking_assert (cfun->machine->fs.fp_valid);
9190 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9191 cfun->machine->fs.fp_offset - cfa_offset);
9192 mem = gen_rtx_MEM (mode, addr);
9193 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9194 }
9195 else
9196 {
9197 /* The frame pointer is a stable reference within the
9198 aligned frame. Use it. */
9199 gcc_checking_assert (cfun->machine->fs.fp_valid);
9200 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9201 cfun->machine->fs.fp_offset - cfa_offset);
9202 mem = gen_rtx_MEM (mode, addr);
9203 add_reg_note (insn, REG_CFA_EXPRESSION,
9204 gen_rtx_SET (VOIDmode, mem, reg));
9205 }
9206 }
9207
9208 /* The memory may not be relative to the current CFA register,
9209 which means that we may need to generate a new pattern for
9210 use by the unwind info. */
9211 else if (base != m->fs.cfa_reg)
9212 {
9213 addr = plus_constant (Pmode, m->fs.cfa_reg,
9214 m->fs.cfa_offset - cfa_offset);
9215 mem = gen_rtx_MEM (mode, addr);
9216 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9217 }
9218 }
9219
9220 /* Emit code to save registers using MOV insns.
9221 First register is stored at CFA - CFA_OFFSET. */
9222 static void
9223 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9224 {
9225 unsigned int regno;
9226
9227 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9228 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9229 {
9230 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9231 cfa_offset -= UNITS_PER_WORD;
9232 }
9233 }
9234
9235 /* Emit code to save SSE registers using MOV insns.
9236 First register is stored at CFA - CFA_OFFSET. */
9237 static void
9238 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9239 {
9240 unsigned int regno;
9241
9242 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9243 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9244 {
9245 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9246 cfa_offset -= 16;
9247 }
9248 }
9249
9250 static GTY(()) rtx queued_cfa_restores;
9251
9252 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9253 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9254 Don't add the note if the previously saved value will be left untouched
9255 within the stack red zone until return, as unwinders can find the same
9256 value in the register and on the stack. */
9257
9258 static void
9259 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9260 {
9261 if (!crtl->shrink_wrapped
9262 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9263 return;
9264
9265 if (insn)
9266 {
9267 add_reg_note (insn, REG_CFA_RESTORE, reg);
9268 RTX_FRAME_RELATED_P (insn) = 1;
9269 }
9270 else
9271 queued_cfa_restores
9272 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9273 }
9274
9275 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9276
9277 static void
9278 ix86_add_queued_cfa_restore_notes (rtx insn)
9279 {
9280 rtx last;
9281 if (!queued_cfa_restores)
9282 return;
9283 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9284 ;
9285 XEXP (last, 1) = REG_NOTES (insn);
9286 REG_NOTES (insn) = queued_cfa_restores;
9287 queued_cfa_restores = NULL_RTX;
9288 RTX_FRAME_RELATED_P (insn) = 1;
9289 }
9290
9291 /* Expand a prologue or epilogue stack adjustment.
9292 The pattern exists to put a dependency on all ebp-based memory accesses.
9293 STYLE should be negative if instructions should be marked as frame
9294 related, zero if the %r11 register is live and cannot be freely used,
9295 and positive otherwise. */
9296
9297 static void
9298 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9299 int style, bool set_cfa)
9300 {
9301 struct machine_function *m = cfun->machine;
9302 rtx insn;
9303 bool add_frame_related_expr = false;
9304
9305 if (Pmode == SImode)
9306 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9307 else if (x86_64_immediate_operand (offset, DImode))
9308 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9309 else
9310 {
9311 rtx tmp;
9312 /* r11 is used by indirect sibcall return as well, set before the
9313 epilogue and used after the epilogue. */
9314 if (style)
9315 tmp = gen_rtx_REG (DImode, R11_REG);
9316 else
9317 {
9318 gcc_assert (src != hard_frame_pointer_rtx
9319 && dest != hard_frame_pointer_rtx);
9320 tmp = hard_frame_pointer_rtx;
9321 }
9322 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9323 if (style < 0)
9324 add_frame_related_expr = true;
9325
9326 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9327 }
9328
9329 insn = emit_insn (insn);
9330 if (style >= 0)
9331 ix86_add_queued_cfa_restore_notes (insn);
9332
9333 if (set_cfa)
9334 {
9335 rtx r;
9336
9337 gcc_assert (m->fs.cfa_reg == src);
9338 m->fs.cfa_offset += INTVAL (offset);
9339 m->fs.cfa_reg = dest;
9340
9341 r = gen_rtx_PLUS (Pmode, src, offset);
9342 r = gen_rtx_SET (VOIDmode, dest, r);
9343 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9344 RTX_FRAME_RELATED_P (insn) = 1;
9345 }
9346 else if (style < 0)
9347 {
9348 RTX_FRAME_RELATED_P (insn) = 1;
9349 if (add_frame_related_expr)
9350 {
9351 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9352 r = gen_rtx_SET (VOIDmode, dest, r);
9353 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9354 }
9355 }
9356
9357 if (dest == stack_pointer_rtx)
9358 {
9359 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9360 bool valid = m->fs.sp_valid;
9361
9362 if (src == hard_frame_pointer_rtx)
9363 {
9364 valid = m->fs.fp_valid;
9365 ooffset = m->fs.fp_offset;
9366 }
9367 else if (src == crtl->drap_reg)
9368 {
9369 valid = m->fs.drap_valid;
9370 ooffset = 0;
9371 }
9372 else
9373 {
9374 /* Else there are two possibilities: SP itself, which we set
9375 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9376 taken care of by hand along the eh_return path. */
9377 gcc_checking_assert (src == stack_pointer_rtx
9378 || offset == const0_rtx);
9379 }
9380
9381 m->fs.sp_offset = ooffset - INTVAL (offset);
9382 m->fs.sp_valid = valid;
9383 }
9384 }
9385
9386 /* Find an available register to be used as the dynamic realign argument
9387 pointer register. Such a register will be written in the prologue and
9388 used at the beginning of the body, so it must not be
9389 1. a parameter passing register.
9390 2. the GOT pointer.
9391 We reuse the static-chain register if it is available. Otherwise, we
9392 use DI for i386 and R13 for x86-64. We chose R13 since it has a
9393 shorter encoding.
9394
9395 Return: the regno of the chosen register. */
9396
9397 static unsigned int
9398 find_drap_reg (void)
9399 {
9400 tree decl = cfun->decl;
9401
9402 if (TARGET_64BIT)
9403 {
9404 /* Use R13 for a nested function or a function needing a static
9405 chain. Since a function with a tail call may use any caller-saved
9406 register in the epilogue, DRAP must not use a caller-saved
9407 register in that case. */
9408 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9409 return R13_REG;
9410
9411 return R10_REG;
9412 }
9413 else
9414 {
9415 /* Use DI for a nested function or a function needing a static
9416 chain. Since a function with a tail call may use any caller-saved
9417 register in the epilogue, DRAP must not use a caller-saved
9418 register in that case. */
9419 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9420 return DI_REG;
9421
9422 /* Reuse static chain register if it isn't used for parameter
9423 passing. */
9424 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9425 {
9426 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9427 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9428 return CX_REG;
9429 }
9430 return DI_REG;
9431 }
9432 }
9433
9434 /* Return minimum incoming stack alignment. */
9435
9436 static unsigned int
9437 ix86_minimum_incoming_stack_boundary (bool sibcall)
9438 {
9439 unsigned int incoming_stack_boundary;
9440
9441 /* Prefer the one specified at command line. */
9442 if (ix86_user_incoming_stack_boundary)
9443 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9444 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9445 if -mstackrealign is used, this isn't a sibcall check, and the
9446 estimated stack alignment is 128 bits. */
9447 else if (!sibcall
9448 && !TARGET_64BIT
9449 && ix86_force_align_arg_pointer
9450 && crtl->stack_alignment_estimated == 128)
9451 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9452 else
9453 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9454
9455 /* Incoming stack alignment can be changed on individual functions
9456 via force_align_arg_pointer attribute. We use the smallest
9457 incoming stack boundary. */
9458 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9459 && lookup_attribute (ix86_force_align_arg_pointer_string,
9460 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9461 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9462
9463 /* The incoming stack frame has to be aligned at least at
9464 parm_stack_boundary. */
9465 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9466 incoming_stack_boundary = crtl->parm_stack_boundary;
9467
9468 /* The stack at the entry of main is aligned by the runtime. We use the
9469 smallest incoming stack boundary. */
9470 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9471 && DECL_NAME (current_function_decl)
9472 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9473 && DECL_FILE_SCOPE_P (current_function_decl))
9474 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9475
9476 return incoming_stack_boundary;
9477 }
9478
9479 /* Update incoming stack boundary and estimated stack alignment. */
9480
9481 static void
9482 ix86_update_stack_boundary (void)
9483 {
9484 ix86_incoming_stack_boundary
9485 = ix86_minimum_incoming_stack_boundary (false);
9486
9487 /* x86_64 vararg needs 16byte stack alignment for register save
9488 area. */
9489 if (TARGET_64BIT
9490 && cfun->stdarg
9491 && crtl->stack_alignment_estimated < 128)
9492 crtl->stack_alignment_estimated = 128;
9493 }
9494
9495 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9496 needed or an rtx for DRAP otherwise. */
9497
9498 static rtx
9499 ix86_get_drap_rtx (void)
9500 {
9501 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9502 crtl->need_drap = true;
9503
9504 if (stack_realign_drap)
9505 {
9506 /* Assign DRAP to vDRAP and return vDRAP. */
9507 unsigned int regno = find_drap_reg ();
9508 rtx drap_vreg;
9509 rtx arg_ptr;
9510 rtx seq, insn;
9511
9512 arg_ptr = gen_rtx_REG (Pmode, regno);
9513 crtl->drap_reg = arg_ptr;
9514
9515 start_sequence ();
9516 drap_vreg = copy_to_reg (arg_ptr);
9517 seq = get_insns ();
9518 end_sequence ();
9519
9520 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9521 if (!optimize)
9522 {
9523 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9524 RTX_FRAME_RELATED_P (insn) = 1;
9525 }
9526 return drap_vreg;
9527 }
9528 else
9529 return NULL;
9530 }
9531
9532 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9533
9534 static rtx
9535 ix86_internal_arg_pointer (void)
9536 {
9537 return virtual_incoming_args_rtx;
9538 }
9539
9540 struct scratch_reg {
9541 rtx reg;
9542 bool saved;
9543 };
9544
9545 /* Return a short-lived scratch register for use on function entry.
9546 In 32-bit mode, it is valid only after the registers are saved
9547 in the prologue. This register must be released by means of
9548 release_scratch_register_on_entry once it is dead. */
9549
9550 static void
9551 get_scratch_register_on_entry (struct scratch_reg *sr)
9552 {
9553 int regno;
9554
9555 sr->saved = false;
9556
9557 if (TARGET_64BIT)
9558 {
9559 /* We always use R11 in 64-bit mode. */
9560 regno = R11_REG;
9561 }
9562 else
9563 {
9564 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9565 bool fastcall_p
9566 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9567 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9568 int regparm = ix86_function_regparm (fntype, decl);
9569 int drap_regno
9570 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9571
9572 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9573 for the static chain register. */
9574 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9575 && drap_regno != AX_REG)
9576 regno = AX_REG;
9577 else if (regparm < 2 && drap_regno != DX_REG)
9578 regno = DX_REG;
9579 /* ecx is the static chain register. */
9580 else if (regparm < 3 && !fastcall_p && !static_chain_p
9581 && drap_regno != CX_REG)
9582 regno = CX_REG;
9583 else if (ix86_save_reg (BX_REG, true))
9584 regno = BX_REG;
9585 /* esi is the static chain register. */
9586 else if (!(regparm == 3 && static_chain_p)
9587 && ix86_save_reg (SI_REG, true))
9588 regno = SI_REG;
9589 else if (ix86_save_reg (DI_REG, true))
9590 regno = DI_REG;
9591 else
9592 {
9593 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9594 sr->saved = true;
9595 }
9596 }
9597
9598 sr->reg = gen_rtx_REG (Pmode, regno);
9599 if (sr->saved)
9600 {
9601 rtx insn = emit_insn (gen_push (sr->reg));
9602 RTX_FRAME_RELATED_P (insn) = 1;
9603 }
9604 }
9605
9606 /* Release a scratch register obtained from the preceding function. */
9607
9608 static void
9609 release_scratch_register_on_entry (struct scratch_reg *sr)
9610 {
9611 if (sr->saved)
9612 {
9613 struct machine_function *m = cfun->machine;
9614 rtx x, insn = emit_insn (gen_pop (sr->reg));
9615
9616 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9617 RTX_FRAME_RELATED_P (insn) = 1;
9618 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9619 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9620 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9621 m->fs.sp_offset -= UNITS_PER_WORD;
9622 }
9623 }
9624
9625 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9626
9627 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9628
9629 static void
9630 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9631 {
9632 /* We skip the probe for the first interval + a small dope of 4 words and
9633 probe that many bytes past the specified size to maintain a protection
9634 area at the bottom of the stack. */
9635 const int dope = 4 * UNITS_PER_WORD;
9636 rtx size_rtx = GEN_INT (size), last;
9637
9638 /* See if we have a constant small number of probes to generate. If so,
9639 that's the easy case. The run-time loop is made up of 11 insns in the
9640 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9641 for n # of intervals. */
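/* As a worked example (assuming PROBE_INTERVAL is 4096 and a 64-bit
   target, so dope = 32): for size = 10000 the constant case below
   decrements SP by 8224 and probes, by 4096 and probes, by 1808 and
   probes, then adds 4128 back, for a net adjustment of 10000.  */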
9642 if (size <= 5 * PROBE_INTERVAL)
9643 {
9644 HOST_WIDE_INT i, adjust;
9645 bool first_probe = true;
9646
9647 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9648 values of N from 1 until it exceeds SIZE. If only one probe is
9649 needed, this will not generate any code. Then adjust and probe
9650 to PROBE_INTERVAL + SIZE. */
9651 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9652 {
9653 if (first_probe)
9654 {
9655 adjust = 2 * PROBE_INTERVAL + dope;
9656 first_probe = false;
9657 }
9658 else
9659 adjust = PROBE_INTERVAL;
9660
9661 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9662 plus_constant (Pmode, stack_pointer_rtx,
9663 -adjust)));
9664 emit_stack_probe (stack_pointer_rtx);
9665 }
9666
9667 if (first_probe)
9668 adjust = size + PROBE_INTERVAL + dope;
9669 else
9670 adjust = size + PROBE_INTERVAL - i;
9671
9672 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9673 plus_constant (Pmode, stack_pointer_rtx,
9674 -adjust)));
9675 emit_stack_probe (stack_pointer_rtx);
9676
9677 /* Adjust back to account for the additional first interval. */
9678 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9679 plus_constant (Pmode, stack_pointer_rtx,
9680 PROBE_INTERVAL + dope)));
9681 }
9682
9683 /* Otherwise, do the same as above, but in a loop. Note that we must be
9684 extra careful with variables wrapping around because we might be at
9685 the very top (or the very bottom) of the address space and we have
9686 to be able to handle this case properly; in particular, we use an
9687 equality test for the loop condition. */
9688 else
9689 {
9690 HOST_WIDE_INT rounded_size;
9691 struct scratch_reg sr;
9692
9693 get_scratch_register_on_entry (&sr);
9694
9695
9696 /* Step 1: round SIZE to the previous multiple of the interval. */
9697
9698 rounded_size = size & -PROBE_INTERVAL;
9699
9700
9701 /* Step 2: compute initial and final value of the loop counter. */
9702
9703 /* SP = SP_0 + PROBE_INTERVAL. */
9704 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9705 plus_constant (Pmode, stack_pointer_rtx,
9706 - (PROBE_INTERVAL + dope))));
9707
9708 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9709 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9710 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9711 gen_rtx_PLUS (Pmode, sr.reg,
9712 stack_pointer_rtx)));
9713
9714
9715 /* Step 3: the loop
9716
9717 while (SP != LAST_ADDR)
9718 {
9719 SP = SP + PROBE_INTERVAL
9720 probe at SP
9721 }
9722
9723 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9724 values of N from 1 until it is equal to ROUNDED_SIZE. */
9725
9726 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9727
9728
9729 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9730 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9731
9732 if (size != rounded_size)
9733 {
9734 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9735 plus_constant (Pmode, stack_pointer_rtx,
9736 rounded_size - size)));
9737 emit_stack_probe (stack_pointer_rtx);
9738 }
9739
9740 /* Adjust back to account for the additional first interval. */
9741 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9742 plus_constant (Pmode, stack_pointer_rtx,
9743 PROBE_INTERVAL + dope)));
9744
9745 release_scratch_register_on_entry (&sr);
9746 }
9747
9748 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9749
9750 /* Even if the stack pointer isn't the CFA register, we need to correctly
9751 describe the adjustments made to it, in particular differentiate the
9752 frame-related ones from the frame-unrelated ones. */
9753 if (size > 0)
9754 {
9755 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9756 XVECEXP (expr, 0, 0)
9757 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9758 plus_constant (Pmode, stack_pointer_rtx, -size));
9759 XVECEXP (expr, 0, 1)
9760 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9761 plus_constant (Pmode, stack_pointer_rtx,
9762 PROBE_INTERVAL + dope + size));
9763 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9764 RTX_FRAME_RELATED_P (last) = 1;
9765
9766 cfun->machine->fs.sp_offset += size;
9767 }
9768
9769 /* Make sure nothing is scheduled before we are done. */
9770 emit_insn (gen_blockage ());
9771 }
9772
9773 /* Adjust the stack pointer up to REG while probing it. */
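/* For illustration, assuming PROBE_INTERVAL is 4096 and the scratch
   register holding LAST_ADDR is %r11, the loop emitted below looks
   roughly like this in AT&T syntax (label names are internal labels):

   .LPSRL0:
       cmpq    %r11, %rsp
       je      .LPSRE0
       subq    $4096, %rsp
       orq     $0, (%rsp)
       jmp     .LPSRL0
   .LPSRE0:
*/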
9774
9775 const char *
9776 output_adjust_stack_and_probe (rtx reg)
9777 {
9778 static int labelno = 0;
9779 char loop_lab[32], end_lab[32];
9780 rtx xops[2];
9781
9782 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9783 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9784
9785 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9786
9787 /* Jump to END_LAB if SP == LAST_ADDR. */
9788 xops[0] = stack_pointer_rtx;
9789 xops[1] = reg;
9790 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9791 fputs ("\tje\t", asm_out_file);
9792 assemble_name_raw (asm_out_file, end_lab);
9793 fputc ('\n', asm_out_file);
9794
9795 /* SP = SP + PROBE_INTERVAL. */
9796 xops[1] = GEN_INT (PROBE_INTERVAL);
9797 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9798
9799 /* Probe at SP. */
9800 xops[1] = const0_rtx;
9801 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9802
9803 fprintf (asm_out_file, "\tjmp\t");
9804 assemble_name_raw (asm_out_file, loop_lab);
9805 fputc ('\n', asm_out_file);
9806
9807 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9808
9809 return "";
9810 }
9811
9812 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9813 inclusive. These are offsets from the current stack pointer. */
9814
9815 static void
9816 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9817 {
9818 /* See if we have a constant small number of probes to generate. If so,
9819 that's the easy case. The run-time loop is made up of 7 insns in the
9820 generic case while the compile-time loop is made up of n insns for n #
9821 of intervals. */
9822 if (size <= 7 * PROBE_INTERVAL)
9823 {
9824 HOST_WIDE_INT i;
9825
9826 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9827 it exceeds SIZE. If only one probe is needed, this will not
9828 generate any code. Then probe at FIRST + SIZE. */
9829 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9830 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9831 -(first + i)));
9832
9833 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9834 -(first + size)));
9835 }
9836
9837 /* Otherwise, do the same as above, but in a loop. Note that we must be
9838 extra careful with variables wrapping around because we might be at
9839 the very top (or the very bottom) of the address space and we have
9840 to be able to handle this case properly; in particular, we use an
9841 equality test for the loop condition. */
9842 else
9843 {
9844 HOST_WIDE_INT rounded_size, last;
9845 struct scratch_reg sr;
9846
9847 get_scratch_register_on_entry (&sr);
9848
9849
9850 /* Step 1: round SIZE to the previous multiple of the interval. */
9851
9852 rounded_size = size & -PROBE_INTERVAL;
9853
9854
9855 /* Step 2: compute initial and final value of the loop counter. */
9856
9857 /* TEST_OFFSET = FIRST. */
9858 emit_move_insn (sr.reg, GEN_INT (-first));
9859
9860 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9861 last = first + rounded_size;
9862
9863
9864 /* Step 3: the loop
9865
9866 while (TEST_ADDR != LAST_ADDR)
9867 {
9868 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9869 probe at TEST_ADDR
9870 }
9871
9872 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9873 until it is equal to ROUNDED_SIZE. */
9874
9875 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9876
9877
9878 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9879 that SIZE is equal to ROUNDED_SIZE. */
9880
9881 if (size != rounded_size)
9882 emit_stack_probe (plus_constant (Pmode,
9883 gen_rtx_PLUS (Pmode,
9884 stack_pointer_rtx,
9885 sr.reg),
9886 rounded_size - size));
9887
9888 release_scratch_register_on_entry (&sr);
9889 }
9890
9891 /* Make sure nothing is scheduled before we are done. */
9892 emit_insn (gen_blockage ());
9893 }
9894
9895 /* Probe a range of stack addresses from REG to END, inclusive. These are
9896 offsets from the current stack pointer. */
9897
9898 const char *
9899 output_probe_stack_range (rtx reg, rtx end)
9900 {
9901 static int labelno = 0;
9902 char loop_lab[32], end_lab[32];
9903 rtx xops[3];
9904
9905 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9906 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9907
9908 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9909
9910 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9911 xops[0] = reg;
9912 xops[1] = end;
9913 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9914 fputs ("\tje\t", asm_out_file);
9915 assemble_name_raw (asm_out_file, end_lab);
9916 fputc ('\n', asm_out_file);
9917
9918 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9919 xops[1] = GEN_INT (PROBE_INTERVAL);
9920 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9921
9922 /* Probe at TEST_ADDR. */
9923 xops[0] = stack_pointer_rtx;
9924 xops[1] = reg;
9925 xops[2] = const0_rtx;
9926 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9927
9928 fprintf (asm_out_file, "\tjmp\t");
9929 assemble_name_raw (asm_out_file, loop_lab);
9930 fputc ('\n', asm_out_file);
9931
9932 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9933
9934 return "";
9935 }
9936
9937 /* Finalize the stack_realign_needed flag, which guides the prologue/epilogue
9938 so it is generated in the correct form. */
9939 static void
9940 ix86_finalize_stack_realign_flags (void)
9941 {
9942 /* Check whether stack realignment is really needed after reload, and
9943 store the result in cfun. */
9944 unsigned int incoming_stack_boundary
9945 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9946 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9947 unsigned int stack_realign = (incoming_stack_boundary
9948 < (crtl->is_leaf
9949 ? crtl->max_used_stack_slot_alignment
9950 : crtl->stack_alignment_needed));
9951
9952 if (crtl->stack_realign_finalized)
9953 {
9954 /* After stack_realign_needed is finalized, we can no longer
9955 change it. */
9956 gcc_assert (crtl->stack_realign_needed == stack_realign);
9957 return;
9958 }
9959
9960 /* If the only reason for frame_pointer_needed is that we conservatively
9961 assumed stack realignment might be needed, but in the end nothing that
9962 needed the stack alignment had been spilled, clear frame_pointer_needed
9963 and say we don't need stack realignment. */
9964 if (stack_realign
9965 && !crtl->need_drap
9966 && frame_pointer_needed
9967 && crtl->is_leaf
9968 && flag_omit_frame_pointer
9969 && crtl->sp_is_unchanging
9970 && !ix86_current_function_calls_tls_descriptor
9971 && !crtl->accesses_prior_frames
9972 && !cfun->calls_alloca
9973 && !crtl->calls_eh_return
9974 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
9975 && !ix86_frame_pointer_required ()
9976 && get_frame_size () == 0
9977 && ix86_nsaved_sseregs () == 0
9978 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
9979 {
9980 HARD_REG_SET set_up_by_prologue, prologue_used;
9981 basic_block bb;
9982
9983 CLEAR_HARD_REG_SET (prologue_used);
9984 CLEAR_HARD_REG_SET (set_up_by_prologue);
9985 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
9986 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
9987 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
9988 HARD_FRAME_POINTER_REGNUM);
9989 FOR_EACH_BB (bb)
9990 {
9991 rtx insn;
9992 FOR_BB_INSNS (bb, insn)
9993 if (NONDEBUG_INSN_P (insn)
9994 && requires_stack_frame_p (insn, prologue_used,
9995 set_up_by_prologue))
9996 {
9997 crtl->stack_realign_needed = stack_realign;
9998 crtl->stack_realign_finalized = true;
9999 return;
10000 }
10001 }
10002
10003 frame_pointer_needed = false;
10004 stack_realign = false;
10005 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10006 crtl->stack_alignment_needed = incoming_stack_boundary;
10007 crtl->stack_alignment_estimated = incoming_stack_boundary;
10008 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10009 crtl->preferred_stack_boundary = incoming_stack_boundary;
10010 df_finish_pass (true);
10011 df_scan_alloc (NULL);
10012 df_scan_blocks ();
10013 df_compute_regs_ever_live (true);
10014 df_analyze ();
10015 }
10016
10017 crtl->stack_realign_needed = stack_realign;
10018 crtl->stack_realign_finalized = true;
10019 }
10020
10021 /* Expand the prologue into a bunch of separate insns. */
10022
10023 void
10024 ix86_expand_prologue (void)
10025 {
10026 struct machine_function *m = cfun->machine;
10027 rtx insn, t;
10028 bool pic_reg_used;
10029 struct ix86_frame frame;
10030 HOST_WIDE_INT allocate;
10031 bool int_registers_saved;
10032 bool sse_registers_saved;
10033
10034 ix86_finalize_stack_realign_flags ();
10035
10036 /* DRAP should not coexist with stack_realign_fp */
10037 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10038
10039 memset (&m->fs, 0, sizeof (m->fs));
10040
10041 /* Initialize CFA state for before the prologue. */
10042 m->fs.cfa_reg = stack_pointer_rtx;
10043 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10044
10045 /* Track SP offset to the CFA. We continue tracking this after we've
10046 swapped the CFA register away from SP. In the case of re-alignment
10047 this is fudged; we're interested in offsets within the local frame. */
10048 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10049 m->fs.sp_valid = true;
10050
10051 ix86_compute_frame_layout (&frame);
10052
10053 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10054 {
10055 /* We should have already generated an error for any use of
10056 ms_hook on a nested function. */
10057 gcc_checking_assert (!ix86_static_chain_on_stack);
10058
10059 /* Check whether profiling is active and we shall use the profiling
10060 before prologue variant. If so, sorry. */
10061 if (crtl->profile && flag_fentry != 0)
10062 sorry ("ms_hook_prologue attribute isn%'t compatible "
10063 "with -mfentry for 32-bit");
10064
10065 /* In ix86_asm_output_function_label we emitted:
10066 8b ff movl.s %edi,%edi
10067 55 push %ebp
10068 8b ec movl.s %esp,%ebp
10069
10070 This matches the hookable function prologue in Win32 API
10071 functions in Microsoft Windows XP Service Pack 2 and newer.
10072 Wine uses this to enable Windows apps to hook the Win32 API
10073 functions provided by Wine.
10074
10075 What that means is that we've already set up the frame pointer. */
10076
10077 if (frame_pointer_needed
10078 && !(crtl->drap_reg && crtl->stack_realign_needed))
10079 {
10080 rtx push, mov;
10081
10082 /* We've decided to use the frame pointer already set up.
10083 Describe this to the unwinder by pretending that both
10084 push and mov insns happen right here.
10085
10086 Putting the unwind info here at the end of the ms_hook
10087 is done so that we can make absolutely certain we get
10088 the required byte sequence at the start of the function,
10089 rather than relying on an assembler that can produce
10090 the exact encoding required.
10091
10092 However it does mean (in the unpatched case) that we have
10093 a 1 insn window where the asynchronous unwind info is
10094 incorrect. However, if we placed the unwind info at
10095 its correct location we would have incorrect unwind info
10096 in the patched case. Which is probably all moot since
10097 I don't expect Wine to generate dwarf2 unwind info for the
10098 system libraries that use this feature. */
10099
10100 insn = emit_insn (gen_blockage ());
10101
10102 push = gen_push (hard_frame_pointer_rtx);
10103 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10104 stack_pointer_rtx);
10105 RTX_FRAME_RELATED_P (push) = 1;
10106 RTX_FRAME_RELATED_P (mov) = 1;
10107
10108 RTX_FRAME_RELATED_P (insn) = 1;
10109 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10110 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10111
10112 /* Note that gen_push incremented m->fs.cfa_offset, even
10113 though we didn't emit the push insn here. */
10114 m->fs.cfa_reg = hard_frame_pointer_rtx;
10115 m->fs.fp_offset = m->fs.cfa_offset;
10116 m->fs.fp_valid = true;
10117 }
10118 else
10119 {
10120 /* The frame pointer is not needed so pop %ebp again.
10121 This leaves us with a pristine state. */
10122 emit_insn (gen_pop (hard_frame_pointer_rtx));
10123 }
10124 }
10125
10126 /* The first insn of a function that accepts its static chain on the
10127 stack is to push the register that would be filled in by a direct
10128 call. This insn will be skipped by the trampoline. */
10129 else if (ix86_static_chain_on_stack)
10130 {
10131 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10132 emit_insn (gen_blockage ());
10133
10134 /* We don't want to interpret this push insn as a register save,
10135 only as a stack adjustment. The real copy of the register as
10136 a save will be done later, if needed. */
10137 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10138 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10139 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10140 RTX_FRAME_RELATED_P (insn) = 1;
10141 }
10142
10143 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10144 DRAP is needed and stack realignment is really needed after reload. */
10145 if (stack_realign_drap)
10146 {
10147 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10148
10149 /* Only need to push the parameter pointer reg if it must be preserved, i.e., is not a call-used register. */
10150 if (!call_used_regs[REGNO (crtl->drap_reg)])
10151 {
10152 /* Push arg pointer reg */
10153 insn = emit_insn (gen_push (crtl->drap_reg));
10154 RTX_FRAME_RELATED_P (insn) = 1;
10155 }
10156
10157 /* Grab the argument pointer. */
10158 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10159 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10160 RTX_FRAME_RELATED_P (insn) = 1;
10161 m->fs.cfa_reg = crtl->drap_reg;
10162 m->fs.cfa_offset = 0;
10163
10164 /* Align the stack. */
10165 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10166 stack_pointer_rtx,
10167 GEN_INT (-align_bytes)));
10168 RTX_FRAME_RELATED_P (insn) = 1;
10169
10170 /* Replicate the return address on the stack so that return
10171 address can be reached via (argp - 1) slot. This is needed
10172 to implement macro RETURN_ADDR_RTX and intrinsic function
10173 expand_builtin_return_addr etc. */
10174 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10175 t = gen_frame_mem (word_mode, t);
10176 insn = emit_insn (gen_push (t));
10177 RTX_FRAME_RELATED_P (insn) = 1;
10178
10179 /* For the purposes of frame and register save area addressing,
10180 we've started over with a new frame. */
10181 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10182 m->fs.realigned = true;
10183 }
10184
10185 int_registers_saved = (frame.nregs == 0);
10186 sse_registers_saved = (frame.nsseregs == 0);
10187
10188 if (frame_pointer_needed && !m->fs.fp_valid)
10189 {
10190 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10191 slower on all targets. Also sdb doesn't like it. */
10192 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10193 RTX_FRAME_RELATED_P (insn) = 1;
10194
10195 /* Push registers now, before setting the frame pointer
10196 on SEH target. */
10197 if (!int_registers_saved
10198 && TARGET_SEH
10199 && !frame.save_regs_using_mov)
10200 {
10201 ix86_emit_save_regs ();
10202 int_registers_saved = true;
10203 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10204 }
10205
10206 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10207 {
10208 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10209 RTX_FRAME_RELATED_P (insn) = 1;
10210
10211 if (m->fs.cfa_reg == stack_pointer_rtx)
10212 m->fs.cfa_reg = hard_frame_pointer_rtx;
10213 m->fs.fp_offset = m->fs.sp_offset;
10214 m->fs.fp_valid = true;
10215 }
10216 }
10217
10218 if (!int_registers_saved)
10219 {
10220 /* If saving registers via PUSH, do so now. */
10221 if (!frame.save_regs_using_mov)
10222 {
10223 ix86_emit_save_regs ();
10224 int_registers_saved = true;
10225 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10226 }
10227
10228 /* When using the red zone we may start register saving before allocating
10229 the stack frame, saving one cycle of the prologue. However, avoid
10230 doing this if we have to probe the stack; at least on x86_64 the
10231 stack probe can turn into a call that clobbers a red zone location. */
10232 else if (ix86_using_red_zone ()
10233 && (! TARGET_STACK_PROBE
10234 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10235 {
10236 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10237 int_registers_saved = true;
10238 }
10239 }
10240
10241 if (stack_realign_fp)
10242 {
10243 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10244 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10245
10246 /* The computation of the size of the re-aligned stack frame means
10247 that we must allocate the size of the register save area before
10248 performing the actual alignment. Otherwise we cannot guarantee
10249 that there's enough storage above the realignment point. */
10250 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10251 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10252 GEN_INT (m->fs.sp_offset
10253 - frame.sse_reg_save_offset),
10254 -1, false);
10255
10256 /* Align the stack. */
10257 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10258 stack_pointer_rtx,
10259 GEN_INT (-align_bytes)));
10260
10261 /* For the purposes of register save area addressing, the stack
10262 pointer is no longer valid. As for the value of sp_offset,
10263 see ix86_compute_frame_layout, which we need to match in order
10264 to pass verification of stack_pointer_offset at the end. */
10265 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10266 m->fs.sp_valid = false;
10267 }
10268
10269 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10270
10271 if (flag_stack_usage_info)
10272 {
10273 /* We start to count from ARG_POINTER. */
10274 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10275
10276 /* If it was realigned, take into account the fake frame. */
10277 if (stack_realign_drap)
10278 {
10279 if (ix86_static_chain_on_stack)
10280 stack_size += UNITS_PER_WORD;
10281
10282 if (!call_used_regs[REGNO (crtl->drap_reg)])
10283 stack_size += UNITS_PER_WORD;
10284
10285 /* This over-estimates by 1 minimal-stack-alignment-unit but
10286 mitigates that by counting in the new return address slot. */
10287 current_function_dynamic_stack_size
10288 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10289 }
10290
10291 current_function_static_stack_size = stack_size;
10292 }
10293
10294 /* On SEH target with very large frame size, allocate an area to save
10295 SSE registers (as the very large allocation won't be described). */
10296 if (TARGET_SEH
10297 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10298 && !sse_registers_saved)
10299 {
10300 HOST_WIDE_INT sse_size =
10301 frame.sse_reg_save_offset - frame.reg_save_offset;
10302
10303 gcc_assert (int_registers_saved);
10304
10305 /* No need to do stack checking as the area will be immediately
10306 written. */
10307 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10308 GEN_INT (-sse_size), -1,
10309 m->fs.cfa_reg == stack_pointer_rtx);
10310 allocate -= sse_size;
10311 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10312 sse_registers_saved = true;
10313 }
10314
10315 /* The stack has already been decremented by the instruction calling us
10316 so probe if the size is non-negative to preserve the protection area. */
10317 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10318 {
10319 /* We expect the registers to be saved when probes are used. */
10320 gcc_assert (int_registers_saved);
10321
10322 if (STACK_CHECK_MOVING_SP)
10323 {
10324 ix86_adjust_stack_and_probe (allocate);
10325 allocate = 0;
10326 }
10327 else
10328 {
10329 HOST_WIDE_INT size = allocate;
10330
10331 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10332 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10333
10334 if (TARGET_STACK_PROBE)
10335 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10336 else
10337 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10338 }
10339 }
10340
10341 if (allocate == 0)
10342 ;
10343 else if (!ix86_target_stack_probe ()
10344 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10345 {
10346 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10347 GEN_INT (-allocate), -1,
10348 m->fs.cfa_reg == stack_pointer_rtx);
10349 }
10350 else
10351 {
10352 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10353 rtx r10 = NULL;
10354 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10355 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10356 bool eax_live = false;
10357 bool r10_live = false;
10358
10359 if (TARGET_64BIT)
10360 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10361 if (!TARGET_64BIT_MS_ABI)
10362 eax_live = ix86_eax_live_at_start_p ();
10363
10364 /* Note that SEH directives need to continue tracking the stack
10365 pointer even after the frame pointer has been set up. */
10366 if (eax_live)
10367 {
10368 insn = emit_insn (gen_push (eax));
10369 allocate -= UNITS_PER_WORD;
10370 if (sp_is_cfa_reg || TARGET_SEH)
10371 {
10372 if (sp_is_cfa_reg)
10373 m->fs.cfa_offset += UNITS_PER_WORD;
10374 RTX_FRAME_RELATED_P (insn) = 1;
10375 }
10376 }
10377
10378 if (r10_live)
10379 {
10380 r10 = gen_rtx_REG (Pmode, R10_REG);
10381 insn = emit_insn (gen_push (r10));
10382 allocate -= UNITS_PER_WORD;
10383 if (sp_is_cfa_reg || TARGET_SEH)
10384 {
10385 if (sp_is_cfa_reg)
10386 m->fs.cfa_offset += UNITS_PER_WORD;
10387 RTX_FRAME_RELATED_P (insn) = 1;
10388 }
10389 }
10390
10391 emit_move_insn (eax, GEN_INT (allocate));
10392 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10393
10394 /* Use the fact that AX still contains ALLOCATE. */
10395 adjust_stack_insn = (Pmode == DImode
10396 ? gen_pro_epilogue_adjust_stack_di_sub
10397 : gen_pro_epilogue_adjust_stack_si_sub);
10398
10399 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10400 stack_pointer_rtx, eax));
10401
10402 if (sp_is_cfa_reg || TARGET_SEH)
10403 {
10404 if (sp_is_cfa_reg)
10405 m->fs.cfa_offset += allocate;
10406 RTX_FRAME_RELATED_P (insn) = 1;
10407 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10408 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10409 plus_constant (Pmode, stack_pointer_rtx,
10410 -allocate)));
10411 }
10412 m->fs.sp_offset += allocate;
10413
10414 if (r10_live && eax_live)
10415 {
10416 t = choose_baseaddr (m->fs.sp_offset - allocate);
10417 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10418 gen_frame_mem (word_mode, t));
10419 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10420 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10421 gen_frame_mem (word_mode, t));
10422 }
10423 else if (eax_live || r10_live)
10424 {
10425 t = choose_baseaddr (m->fs.sp_offset - allocate);
10426 emit_move_insn (gen_rtx_REG (word_mode,
10427 (eax_live ? AX_REG : R10_REG)),
10428 gen_frame_mem (word_mode, t));
10429 }
10430 }
10431 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10432
10433 /* If we haven't already set up the frame pointer, do so now. */
10434 if (frame_pointer_needed && !m->fs.fp_valid)
10435 {
10436 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10437 GEN_INT (frame.stack_pointer_offset
10438 - frame.hard_frame_pointer_offset));
10439 insn = emit_insn (insn);
10440 RTX_FRAME_RELATED_P (insn) = 1;
10441 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10442
10443 if (m->fs.cfa_reg == stack_pointer_rtx)
10444 m->fs.cfa_reg = hard_frame_pointer_rtx;
10445 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10446 m->fs.fp_valid = true;
10447 }
10448
10449 if (!int_registers_saved)
10450 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10451 if (!sse_registers_saved)
10452 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10453
10454 pic_reg_used = false;
10455 if (pic_offset_table_rtx
10456 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10457 || crtl->profile))
10458 {
10459 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10460
10461 if (alt_pic_reg_used != INVALID_REGNUM)
10462 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10463
10464 pic_reg_used = true;
10465 }
10466
10467 if (pic_reg_used)
10468 {
10469 if (TARGET_64BIT)
10470 {
10471 if (ix86_cmodel == CM_LARGE_PIC)
10472 {
10473 rtx label, tmp_reg;
10474
10475 gcc_assert (Pmode == DImode);
10476 label = gen_label_rtx ();
10477 emit_label (label);
10478 LABEL_PRESERVE_P (label) = 1;
10479 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10480 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10481 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10482 label));
10483 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10484 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10485 pic_offset_table_rtx, tmp_reg));
10486 }
10487 else
10488 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10489 }
10490 else
10491 {
10492 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10493 RTX_FRAME_RELATED_P (insn) = 1;
10494 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10495 }
10496 }
10497
10498 /* In the pic_reg_used case, make sure that the got load isn't deleted
10499 when mcount needs it. A blockage to avoid call movement across the mcount
10500 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10501 note. */
10502 if (crtl->profile && !flag_fentry && pic_reg_used)
10503 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10504
10505 if (crtl->drap_reg && !crtl->stack_realign_needed)
10506 {
10507 /* vDRAP is set up, but after reload it turns out that stack realignment
10508 isn't necessary; here we emit the prologue to set up DRAP
10509 without the stack realignment adjustment. */
10510 t = choose_baseaddr (0);
10511 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10512 }
10513
10514 /* Prevent instructions from being scheduled into register save push
10515 sequence when access to the redzone area is done through frame pointer.
10516 The offset between the frame pointer and the stack pointer is calculated
10517 relative to the value of the stack pointer at the end of the function
10518 prologue, and moving instructions that access redzone area via frame
10519 pointer inside push sequence violates this assumption. */
10520 if (frame_pointer_needed && frame.red_zone_size)
10521 emit_insn (gen_memory_blockage ());
10522
10523 /* Emit cld instruction if stringops are used in the function. */
10524 if (TARGET_CLD && ix86_current_function_needs_cld)
10525 emit_insn (gen_cld ());
10526
10527 /* SEH requires that the prologue end within 256 bytes of the start of
10528 the function. Prevent instruction schedules that would extend that.
10529 Further, prevent alloca modifications to the stack pointer from being
10530 combined with prologue modifications. */
10531 if (TARGET_SEH)
10532 emit_insn (gen_prologue_use (stack_pointer_rtx));
10533 }
10534
10535 /* Emit code to restore REG using a POP insn. */
10536
10537 static void
10538 ix86_emit_restore_reg_using_pop (rtx reg)
10539 {
10540 struct machine_function *m = cfun->machine;
10541 rtx insn = emit_insn (gen_pop (reg));
10542
10543 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10544 m->fs.sp_offset -= UNITS_PER_WORD;
10545
10546 if (m->fs.cfa_reg == crtl->drap_reg
10547 && REGNO (reg) == REGNO (crtl->drap_reg))
10548 {
10549 /* Previously we'd represented the CFA as an expression
10550 like *(%ebp - 8). We've just popped that value from
10551 the stack, which means we need to reset the CFA to
10552 the drap register. This will remain until we restore
10553 the stack pointer. */
10554 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10555 RTX_FRAME_RELATED_P (insn) = 1;
10556
10557 /* This means that the DRAP register is valid for addressing too. */
10558 m->fs.drap_valid = true;
10559 return;
10560 }
10561
10562 if (m->fs.cfa_reg == stack_pointer_rtx)
10563 {
10564 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10565 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10566 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10567 RTX_FRAME_RELATED_P (insn) = 1;
10568
10569 m->fs.cfa_offset -= UNITS_PER_WORD;
10570 }
10571
10572 /* When the frame pointer is the CFA, and we pop it, we are
10573 swapping back to the stack pointer as the CFA. This happens
10574 for stack frames that don't allocate other data, so we assume
10575 the stack pointer is now pointing at the return address, i.e.
10576 the function entry state, which makes the offset be 1 word. */
10577 if (reg == hard_frame_pointer_rtx)
10578 {
10579 m->fs.fp_valid = false;
10580 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10581 {
10582 m->fs.cfa_reg = stack_pointer_rtx;
10583 m->fs.cfa_offset -= UNITS_PER_WORD;
10584
10585 add_reg_note (insn, REG_CFA_DEF_CFA,
10586 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10587 GEN_INT (m->fs.cfa_offset)));
10588 RTX_FRAME_RELATED_P (insn) = 1;
10589 }
10590 }
10591 }
10592
10593 /* Emit code to restore saved registers using POP insns. */
10594
10595 static void
10596 ix86_emit_restore_regs_using_pop (void)
10597 {
10598 unsigned int regno;
10599
10600 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10601 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10602 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10603 }
10604
10605 /* Emit code and notes for the LEAVE instruction. */
10606
10607 static void
10608 ix86_emit_leave (void)
10609 {
10610 struct machine_function *m = cfun->machine;
10611 rtx insn = emit_insn (ix86_gen_leave ());
10612
10613 ix86_add_queued_cfa_restore_notes (insn);
10614
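/* The leave instruction restores %esp from %ebp and then pops the saved
   %ebp, so afterwards the stack pointer is valid again, one word above
   where the frame pointer pointed, and the frame pointer is not.  */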
10615 gcc_assert (m->fs.fp_valid);
10616 m->fs.sp_valid = true;
10617 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10618 m->fs.fp_valid = false;
10619
10620 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10621 {
10622 m->fs.cfa_reg = stack_pointer_rtx;
10623 m->fs.cfa_offset = m->fs.sp_offset;
10624
10625 add_reg_note (insn, REG_CFA_DEF_CFA,
10626 plus_constant (Pmode, stack_pointer_rtx,
10627 m->fs.sp_offset));
10628 RTX_FRAME_RELATED_P (insn) = 1;
10629 }
10630 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10631 m->fs.fp_offset);
10632 }
10633
10634 /* Emit code to restore saved registers using MOV insns.
10635 First register is restored from CFA - CFA_OFFSET. */
10636 static void
10637 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10638 bool maybe_eh_return)
10639 {
10640 struct machine_function *m = cfun->machine;
10641 unsigned int regno;
10642
10643 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10644 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10645 {
10646 rtx reg = gen_rtx_REG (word_mode, regno);
10647 rtx insn, mem;
10648
10649 mem = choose_baseaddr (cfa_offset);
10650 mem = gen_frame_mem (word_mode, mem);
10651 insn = emit_move_insn (reg, mem);
10652
10653 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10654 {
10655 /* Previously we'd represented the CFA as an expression
10656 like *(%ebp - 8). We've just loaded that value from
10657 the stack, which means we need to reset the CFA to
10658 the drap register. This will remain until we restore
10659 the stack pointer. */
10660 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10661 RTX_FRAME_RELATED_P (insn) = 1;
10662
10663 /* This means that the DRAP register is valid for addressing. */
10664 m->fs.drap_valid = true;
10665 }
10666 else
10667 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10668
10669 cfa_offset -= UNITS_PER_WORD;
10670 }
10671 }
10672
10673 /* Emit code to restore saved registers using MOV insns.
10674 First register is restored from CFA - CFA_OFFSET. */
10675 static void
10676 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10677 bool maybe_eh_return)
10678 {
10679 unsigned int regno;
10680
10681 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10682 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10683 {
10684 rtx reg = gen_rtx_REG (V4SFmode, regno);
10685 rtx mem;
10686
10687 mem = choose_baseaddr (cfa_offset);
10688 mem = gen_rtx_MEM (V4SFmode, mem);
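/* The SSE save area is 16-byte aligned; set_mem_align takes the
   alignment in bits.  */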
10689 set_mem_align (mem, 128);
10690 emit_move_insn (reg, mem);
10691
10692 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10693
10694 cfa_offset -= 16;
10695 }
10696 }
10697
10698 /* Restore function stack, frame, and registers. */
10699
10700 void
10701 ix86_expand_epilogue (int style)
10702 {
10703 struct machine_function *m = cfun->machine;
10704 struct machine_frame_state frame_state_save = m->fs;
10705 struct ix86_frame frame;
10706 bool restore_regs_via_mov;
10707 bool using_drap;
10708
10709 ix86_finalize_stack_realign_flags ();
10710 ix86_compute_frame_layout (&frame);
10711
10712 m->fs.sp_valid = (!frame_pointer_needed
10713 || (crtl->sp_is_unchanging
10714 && !stack_realign_fp));
10715 gcc_assert (!m->fs.sp_valid
10716 || m->fs.sp_offset == frame.stack_pointer_offset);
10717
10718 /* The FP must be valid if the frame pointer is present. */
10719 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10720 gcc_assert (!m->fs.fp_valid
10721 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10722
10723 /* We must have *some* valid pointer to the stack frame. */
10724 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10725
10726 /* The DRAP is never valid at this point. */
10727 gcc_assert (!m->fs.drap_valid);
10728
10729 /* See the comment about red zone and frame
10730 pointer usage in ix86_expand_prologue. */
10731 if (frame_pointer_needed && frame.red_zone_size)
10732 emit_insn (gen_memory_blockage ());
10733
10734 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10735 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10736
10737 /* Determine the CFA offset of the end of the red-zone. */
10738 m->fs.red_zone_offset = 0;
10739 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10740 {
10741 /* The red-zone begins below the return address. */
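/* Under the x86-64 SysV ABI, RED_ZONE_SIZE is 128 bytes.  */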
10742 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10743
10744 /* When the register save area is in the aligned portion of
10745 the stack, determine the maximum runtime displacement that
10746 matches up with the aligned frame. */
10747 if (stack_realign_drap)
10748 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10749 + UNITS_PER_WORD);
10750 }
10751
10752 /* Special care must be taken for the normal return case of a function
10753 using eh_return: the eax and edx registers are marked as saved, but
10754 not restored along this path. Adjust the save location to match. */
10755 if (crtl->calls_eh_return && style != 2)
10756 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10757
10758 /* EH_RETURN requires the use of moves to function properly. */
10759 if (crtl->calls_eh_return)
10760 restore_regs_via_mov = true;
10761 /* SEH requires the use of pops to identify the epilogue. */
10762 else if (TARGET_SEH)
10763 restore_regs_via_mov = false;
10764 /* If we're only restoring one register and sp is not valid, then
10765 use a move instruction to restore the register, since that's
10766 less work than reloading sp and popping the register. */
10767 else if (!m->fs.sp_valid && frame.nregs <= 1)
10768 restore_regs_via_mov = true;
10769 else if (TARGET_EPILOGUE_USING_MOVE
10770 && cfun->machine->use_fast_prologue_epilogue
10771 && (frame.nregs > 1
10772 || m->fs.sp_offset != frame.reg_save_offset))
10773 restore_regs_via_mov = true;
10774 else if (frame_pointer_needed
10775 && !frame.nregs
10776 && m->fs.sp_offset != frame.reg_save_offset)
10777 restore_regs_via_mov = true;
10778 else if (frame_pointer_needed
10779 && TARGET_USE_LEAVE
10780 && cfun->machine->use_fast_prologue_epilogue
10781 && frame.nregs == 1)
10782 restore_regs_via_mov = true;
10783 else
10784 restore_regs_via_mov = false;
10785
10786 if (restore_regs_via_mov || frame.nsseregs)
10787 {
10788 /* Ensure that the entire register save area is addressable via
10789 the stack pointer, if we will restore via sp. */
10790 if (TARGET_64BIT
10791 && m->fs.sp_offset > 0x7fffffff
10792 && !(m->fs.fp_valid || m->fs.drap_valid)
10793 && (frame.nsseregs + frame.nregs) != 0)
10794 {
10795 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10796 GEN_INT (m->fs.sp_offset
10797 - frame.sse_reg_save_offset),
10798 style,
10799 m->fs.cfa_reg == stack_pointer_rtx);
10800 }
10801 }
10802
10803 /* If there are any SSE registers to restore, then we have to do it
10804 via moves, since there's obviously no pop for SSE regs. */
10805 if (frame.nsseregs)
10806 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10807 style == 2);
10808
10809 if (restore_regs_via_mov)
10810 {
10811 rtx t;
10812
10813 if (frame.nregs)
10814 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10815
10816 /* eh_return epilogues need %ecx added to the stack pointer. */
10817 if (style == 2)
10818 {
10819 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10820
10821 /* Stack align doesn't work with eh_return. */
10822 gcc_assert (!stack_realign_drap);
10823 /* Neither do regparm nested functions. */
10824 gcc_assert (!ix86_static_chain_on_stack);
10825
10826 if (frame_pointer_needed)
10827 {
10828 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10829 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10830 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10831
10832 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10833 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10834
10835 /* Note that we use SA as a temporary CFA, as the return
10836 address is at the proper place relative to it. We
10837 pretend this happens at the FP restore insn because
10838 prior to this insn the FP would be stored at the wrong
10839 offset relative to SA, and after this insn we have no
10840 other reasonable register to use for the CFA. We don't
10841 bother resetting the CFA to the SP for the duration of
10842 the return insn. */
10843 add_reg_note (insn, REG_CFA_DEF_CFA,
10844 plus_constant (Pmode, sa, UNITS_PER_WORD));
10845 ix86_add_queued_cfa_restore_notes (insn);
10846 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10847 RTX_FRAME_RELATED_P (insn) = 1;
10848
10849 m->fs.cfa_reg = sa;
10850 m->fs.cfa_offset = UNITS_PER_WORD;
10851 m->fs.fp_valid = false;
10852
10853 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10854 const0_rtx, style, false);
10855 }
10856 else
10857 {
10858 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10859 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10860 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10861 ix86_add_queued_cfa_restore_notes (insn);
10862
10863 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10864 if (m->fs.cfa_offset != UNITS_PER_WORD)
10865 {
10866 m->fs.cfa_offset = UNITS_PER_WORD;
10867 add_reg_note (insn, REG_CFA_DEF_CFA,
10868 plus_constant (Pmode, stack_pointer_rtx,
10869 UNITS_PER_WORD));
10870 RTX_FRAME_RELATED_P (insn) = 1;
10871 }
10872 }
10873 m->fs.sp_offset = UNITS_PER_WORD;
10874 m->fs.sp_valid = true;
10875 }
10876 }
10877 else
10878 {
10879 /* SEH requires that the function end with (1) a stack adjustment
10880 if necessary, (2) a sequence of pops, and (3) a return or
10881 jump instruction. Prevent insns from the function body from
10882 being scheduled into this sequence. */
10883 if (TARGET_SEH)
10884 {
10885 /* Prevent a catch region from being adjacent to the standard
10886 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
10887 several other flags that would be interesting to test are
10888 set up yet. */
10889 if (flag_non_call_exceptions)
10890 emit_insn (gen_nops (const1_rtx));
10891 else
10892 emit_insn (gen_blockage ());
10893 }
10894
10895 /* The first step is to deallocate the stack frame so that we can
10896 pop the registers. Also do it on SEH targets for very large
10897 frames, as the emitted instructions aren't allowed by the ABI in
10898 epilogues. */
10899 if (!m->fs.sp_valid
10900 || (TARGET_SEH
10901 && (m->fs.sp_offset - frame.reg_save_offset
10902 >= SEH_MAX_FRAME_SIZE)))
10903 {
10904 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10905 GEN_INT (m->fs.fp_offset
10906 - frame.reg_save_offset),
10907 style, false);
10908 }
10909 else if (m->fs.sp_offset != frame.reg_save_offset)
10910 {
10911 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10912 GEN_INT (m->fs.sp_offset
10913 - frame.reg_save_offset),
10914 style,
10915 m->fs.cfa_reg == stack_pointer_rtx);
10916 }
10917
10918 ix86_emit_restore_regs_using_pop ();
10919 }
10920
10921 /* If we used a frame pointer and haven't already got rid of it,
10922 then do so now. */
10923 if (m->fs.fp_valid)
10924 {
10925 /* If the stack pointer is valid and pointing at the frame
10926 pointer store address, then we only need a pop. */
10927 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10928 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10929 /* Leave results in shorter dependency chains on CPUs that are
10930 able to grok it fast. */
10931 else if (TARGET_USE_LEAVE
10932 || optimize_function_for_size_p (cfun)
10933 || !cfun->machine->use_fast_prologue_epilogue)
10934 ix86_emit_leave ();
10935 else
10936 {
10937 pro_epilogue_adjust_stack (stack_pointer_rtx,
10938 hard_frame_pointer_rtx,
10939 const0_rtx, style, !using_drap);
10940 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10941 }
10942 }
10943
10944 if (using_drap)
10945 {
10946 int param_ptr_offset = UNITS_PER_WORD;
10947 rtx insn;
10948
10949 gcc_assert (stack_realign_drap);
10950
10951 if (ix86_static_chain_on_stack)
10952 param_ptr_offset += UNITS_PER_WORD;
10953 if (!call_used_regs[REGNO (crtl->drap_reg)])
10954 param_ptr_offset += UNITS_PER_WORD;
10955
10956 insn = emit_insn (gen_rtx_SET
10957 (VOIDmode, stack_pointer_rtx,
10958 gen_rtx_PLUS (Pmode,
10959 crtl->drap_reg,
10960 GEN_INT (-param_ptr_offset))));
10961 m->fs.cfa_reg = stack_pointer_rtx;
10962 m->fs.cfa_offset = param_ptr_offset;
10963 m->fs.sp_offset = param_ptr_offset;
10964 m->fs.realigned = false;
10965
10966 add_reg_note (insn, REG_CFA_DEF_CFA,
10967 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10968 GEN_INT (param_ptr_offset)));
10969 RTX_FRAME_RELATED_P (insn) = 1;
10970
10971 if (!call_used_regs[REGNO (crtl->drap_reg)])
10972 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10973 }
10974
10975 /* At this point the stack pointer must be valid, and we must have
10976 restored all of the registers. We may not have deallocated the
10977 entire stack frame. We've delayed this until now because it may
10978 be possible to merge the local stack deallocation with the
10979 deallocation forced by ix86_static_chain_on_stack. */
10980 gcc_assert (m->fs.sp_valid);
10981 gcc_assert (!m->fs.fp_valid);
10982 gcc_assert (!m->fs.realigned);
10983 if (m->fs.sp_offset != UNITS_PER_WORD)
10984 {
10985 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10986 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10987 style, true);
10988 }
10989 else
10990 ix86_add_queued_cfa_restore_notes (get_last_insn ());
10991
10992 /* Sibcall epilogues don't want a return instruction. */
10993 if (style == 0)
10994 {
10995 m->fs = frame_state_save;
10996 return;
10997 }
10998
10999 if (crtl->args.pops_args && crtl->args.size)
11000 {
11001 rtx popc = GEN_INT (crtl->args.pops_args);
11002
11003 /* i386 can only pop 64K bytes. If asked to pop more, pop the return
11004 address, do an explicit add, and jump indirectly to the caller. */
11005
11006 if (crtl->args.pops_args >= 65536)
11007 {
11008 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11009 rtx insn;
11010
11011 /* There is no "pascal" calling convention in any 64bit ABI. */
11012 gcc_assert (!TARGET_64BIT);
11013
11014 insn = emit_insn (gen_pop (ecx));
11015 m->fs.cfa_offset -= UNITS_PER_WORD;
11016 m->fs.sp_offset -= UNITS_PER_WORD;
11017
11018 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11019 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11020 add_reg_note (insn, REG_CFA_REGISTER,
11021 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11022 RTX_FRAME_RELATED_P (insn) = 1;
11023
11024 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11025 popc, -1, true);
11026 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11027 }
11028 else
11029 emit_jump_insn (gen_simple_return_pop_internal (popc));
11030 }
11031 else
11032 emit_jump_insn (gen_simple_return_internal ());
11033
11034 /* Restore the state back to the state from the prologue,
11035 so that it's correct for the next epilogue. */
11036 m->fs = frame_state_save;
11037 }
11038
11039 /* Reset from the function's potential modifications. */
11040
11041 static void
11042 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11043 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11044 {
11045 if (pic_offset_table_rtx)
11046 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11047 #if TARGET_MACHO
11048 /* Mach-O doesn't support labels at the end of objects, so if
11049 it looks like we might want one, insert a NOP. */
11050 {
11051 rtx insn = get_last_insn ();
11052 rtx deleted_debug_label = NULL_RTX;
11053 while (insn
11054 && NOTE_P (insn)
11055 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11056 {
11057 /* Don't insert a nop just for NOTE_INSN_DELETED_DEBUG_LABEL
11058 notes; instead set their CODE_LABEL_NUMBER to -1,
11059 otherwise there would be code generation differences
11060 between -g and -g0. */
11061 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11062 deleted_debug_label = insn;
11063 insn = PREV_INSN (insn);
11064 }
11065 if (insn
11066 && (LABEL_P (insn)
11067 || (NOTE_P (insn)
11068 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11069 fputs ("\tnop\n", file);
11070 else if (deleted_debug_label)
11071 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11072 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11073 CODE_LABEL_NUMBER (insn) = -1;
11074 }
11075 #endif
11076
11077 }
11078
11079 /* Return a scratch register to use in the split stack prologue. The
11080 split stack prologue is used for -fsplit-stack. It is the first
11081 instructions in the function, even before the regular prologue.
11082 The scratch register can be any caller-saved register which is not
11083 used for parameters or for the static chain. */
11084
11085 static unsigned int
11086 split_stack_prologue_scratch_regno (void)
11087 {
11088 if (TARGET_64BIT)
11089 return R11_REG;
11090 else
11091 {
11092 bool is_fastcall;
11093 int regparm;
11094
11095 is_fastcall = (lookup_attribute ("fastcall",
11096 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11097 != NULL);
11098 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11099
11100 if (is_fastcall)
11101 {
11102 if (DECL_STATIC_CHAIN (cfun->decl))
11103 {
11104 sorry ("-fsplit-stack does not support fastcall with "
11105 "nested function");
11106 return INVALID_REGNUM;
11107 }
11108 return AX_REG;
11109 }
11110 else if (regparm < 3)
11111 {
11112 if (!DECL_STATIC_CHAIN (cfun->decl))
11113 return CX_REG;
11114 else
11115 {
11116 if (regparm >= 2)
11117 {
11118 sorry ("-fsplit-stack does not support 2 register "
11119 " parameters for a nested function");
11120 return INVALID_REGNUM;
11121 }
11122 return DX_REG;
11123 }
11124 }
11125 else
11126 {
11127 /* FIXME: We could make this work by pushing a register
11128 around the addition and comparison. */
11129 sorry ("-fsplit-stack does not support 3 register parameters");
11130 return INVALID_REGNUM;
11131 }
11132 }
11133 }
11134
11135 /* A SYMBOL_REF for the function which allocates new stack space for
11136 -fsplit-stack. */
11137
11138 static GTY(()) rtx split_stack_fn;
11139
11140 /* A SYMBOL_REF for the more-stack function used with the large
11141 code model. */
11142
11143 static GTY(()) rtx split_stack_fn_large;
11144
11145 /* Handle -fsplit-stack. These are the first instructions in the
11146 function, even before the regular prologue. */
11147
11148 void
11149 ix86_expand_split_stack_prologue (void)
11150 {
11151 struct ix86_frame frame;
11152 HOST_WIDE_INT allocate;
11153 unsigned HOST_WIDE_INT args_size;
11154 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11155 rtx scratch_reg = NULL_RTX;
11156 rtx varargs_label = NULL_RTX;
11157 rtx fn;
11158
11159 gcc_assert (flag_split_stack && reload_completed);
11160
11161 ix86_finalize_stack_realign_flags ();
11162 ix86_compute_frame_layout (&frame);
11163 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11164
11165 /* This is the label we will branch to if we have enough stack
11166 space. We expect the basic block reordering pass to reverse this
11167 branch if optimizing, so that we branch in the unlikely case. */
11168 label = gen_label_rtx ();
11169
11170 /* We need to compare the stack pointer minus the frame size with
11171 the stack boundary in the TCB. The stack boundary always gives
11172 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11173 can compare directly. Otherwise we need to do an addition. */
11174
11175 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11176 UNSPEC_STACK_CHECK);
11177 limit = gen_rtx_CONST (Pmode, limit);
11178 limit = gen_rtx_MEM (Pmode, limit);
11179 if (allocate < SPLIT_STACK_AVAILABLE)
11180 current = stack_pointer_rtx;
11181 else
11182 {
11183 unsigned int scratch_regno;
11184 rtx offset;
11185
11186 /* We need a scratch register to hold the stack pointer minus
11187 the required frame size. Since this is the very start of the
11188 function, the scratch register can be any caller-saved
11189 register which is not used for parameters. */
11190 offset = GEN_INT (- allocate);
11191 scratch_regno = split_stack_prologue_scratch_regno ();
11192 if (scratch_regno == INVALID_REGNUM)
11193 return;
11194 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11195 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11196 {
11197 /* We don't use ix86_gen_add3 in this case because it will
11198 want to split to lea, but when not optimizing the insn
11199 will not be split after this point. */
11200 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11201 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11202 offset)));
11203 }
11204 else
11205 {
11206 emit_move_insn (scratch_reg, offset);
11207 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11208 stack_pointer_rtx));
11209 }
11210 current = scratch_reg;
11211 }
11212
11213 ix86_expand_branch (GEU, current, limit, label);
11214 jump_insn = get_last_insn ();
11215 JUMP_LABEL (jump_insn) = label;
11216
11217 /* Mark the jump as very likely to be taken. */
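/* REG_BR_PROB_BASE is 10000, so this records a 99% branch probability.  */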
11218 add_reg_note (jump_insn, REG_BR_PROB,
11219 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11220
11221 if (split_stack_fn == NULL_RTX)
11222 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11223 fn = split_stack_fn;
11224
11225 /* Get more stack space. We pass in the desired stack space and the
11226 size of the arguments to copy to the new stack. In 32-bit mode
11227 we push the parameters; __morestack will return on a new stack
11228 anyhow. In 64-bit mode we pass the parameters in r10 and
11229 r11. */
11230 allocate_rtx = GEN_INT (allocate);
11231 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11232 call_fusage = NULL_RTX;
11233 if (TARGET_64BIT)
11234 {
11235 rtx reg10, reg11;
11236
11237 reg10 = gen_rtx_REG (Pmode, R10_REG);
11238 reg11 = gen_rtx_REG (Pmode, R11_REG);
11239
11240 /* If this function uses a static chain, it will be in %r10.
11241 Preserve it across the call to __morestack. */
11242 if (DECL_STATIC_CHAIN (cfun->decl))
11243 {
11244 rtx rax;
11245
11246 rax = gen_rtx_REG (word_mode, AX_REG);
11247 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11248 use_reg (&call_fusage, rax);
11249 }
11250
11251 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11252 {
11253 HOST_WIDE_INT argval;
11254
11255 gcc_assert (Pmode == DImode);
11256 /* When using the large model we need to load the address
11257 into a register, and we've run out of registers. So we
11258 switch to a different calling convention, and we call a
11259 different function: __morestack_large. We pass the
11260 argument size in the upper 32 bits of r10 and pass the
11261 frame size in the lower 32 bits. */
11262 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11263 gcc_assert ((args_size & 0xffffffff) == args_size);
11264
11265 if (split_stack_fn_large == NULL_RTX)
11266 split_stack_fn_large =
11267 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11268
11269 if (ix86_cmodel == CM_LARGE_PIC)
11270 {
11271 rtx label, x;
11272
11273 label = gen_label_rtx ();
11274 emit_label (label);
11275 LABEL_PRESERVE_P (label) = 1;
11276 emit_insn (gen_set_rip_rex64 (reg10, label));
11277 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11278 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11279 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11280 UNSPEC_GOT);
11281 x = gen_rtx_CONST (Pmode, x);
11282 emit_move_insn (reg11, x);
11283 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11284 x = gen_const_mem (Pmode, x);
11285 emit_move_insn (reg11, x);
11286 }
11287 else
11288 emit_move_insn (reg11, split_stack_fn_large);
11289
11290 fn = reg11;
11291
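/* Pack the argument size into the upper 32 bits of r10 and the frame
   size into the lower 32 bits, as described above.  (The shift is
   written as two 16-bit shifts, presumably so the shift count never
   reaches the width of a 32-bit HOST_WIDE_INT.)  */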
11292 argval = ((args_size << 16) << 16) + allocate;
11293 emit_move_insn (reg10, GEN_INT (argval));
11294 }
11295 else
11296 {
11297 emit_move_insn (reg10, allocate_rtx);
11298 emit_move_insn (reg11, GEN_INT (args_size));
11299 use_reg (&call_fusage, reg11);
11300 }
11301
11302 use_reg (&call_fusage, reg10);
11303 }
11304 else
11305 {
11306 emit_insn (gen_push (GEN_INT (args_size)));
11307 emit_insn (gen_push (allocate_rtx));
11308 }
11309 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11310 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11311 NULL_RTX, false);
11312 add_function_usage_to (call_insn, call_fusage);
11313
11314 /* In order to make call/return prediction work right, we now need
11315 to execute a return instruction. See
11316 libgcc/config/i386/morestack.S for the details on how this works.
11317
11318 For flow purposes gcc must not see this as a return
11319 instruction--we need control flow to continue at the subsequent
11320 label. Therefore, we use an unspec. */
11321 gcc_assert (crtl->args.pops_args < 65536);
11322 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11323
11324 /* If we are in 64-bit mode and this function uses a static chain,
11325 we saved %r10 in %rax before calling __morestack. */
11326 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11327 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11328 gen_rtx_REG (word_mode, AX_REG));
11329
11330 /* If this function calls va_start, we need to store a pointer to
11331 the arguments on the old stack, because they may not have been
11332 all copied to the new stack. At this point the old stack can be
11333 found at the frame pointer value used by __morestack, because
11334 __morestack has set that up before calling back to us. Here we
11335 store that pointer in a scratch register, and in
11336 ix86_expand_prologue we store the scratch register in a stack
11337 slot. */
11338 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11339 {
11340 unsigned int scratch_regno;
11341 rtx frame_reg;
11342 int words;
11343
11344 scratch_regno = split_stack_prologue_scratch_regno ();
11345 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11346 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11347
11348 /* 64-bit:
11349 fp -> old fp value
11350 return address within this function
11351 return address of caller of this function
11352 stack arguments
11353 So we add three words to get to the stack arguments.
11354
11355 32-bit:
11356 fp -> old fp value
11357 return address within this function
11358 first argument to __morestack
11359 second argument to __morestack
11360 return address of caller of this function
11361 stack arguments
11362 So we add five words to get to the stack arguments.
11363 */
11364 words = TARGET_64BIT ? 3 : 5;
11365 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11366 gen_rtx_PLUS (Pmode, frame_reg,
11367 GEN_INT (words * UNITS_PER_WORD))));
11368
11369 varargs_label = gen_label_rtx ();
11370 emit_jump_insn (gen_jump (varargs_label));
11371 JUMP_LABEL (get_last_insn ()) = varargs_label;
11372
11373 emit_barrier ();
11374 }
11375
11376 emit_label (label);
11377 LABEL_NUSES (label) = 1;
11378
11379 /* If this function calls va_start, we now have to set the scratch
11380 register for the case where we do not call __morestack. In this
11381 case we need to set it based on the stack pointer. */
11382 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11383 {
11384 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11385 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11386 GEN_INT (UNITS_PER_WORD))));
11387
11388 emit_label (varargs_label);
11389 LABEL_NUSES (varargs_label) = 1;
11390 }
11391 }
11392
11393 /* We may have to tell the dataflow pass that the split stack prologue
11394 is initializing a scratch register. */
11395
11396 static void
11397 ix86_live_on_entry (bitmap regs)
11398 {
11399 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11400 {
11401 gcc_assert (flag_split_stack);
11402 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11403 }
11404 }
11405 \f
11406 /* Determine if op is a suitable SUBREG RTX for an address. */
11407
11408 static bool
11409 ix86_address_subreg_operand (rtx op)
11410 {
11411 enum machine_mode mode;
11412
11413 if (!REG_P (op))
11414 return false;
11415
11416 mode = GET_MODE (op);
11417
11418 if (GET_MODE_CLASS (mode) != MODE_INT)
11419 return false;
11420
11421 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11422 failures when the register is one word out of a two word structure. */
11423 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11424 return false;
11425
11426 /* Allow only SUBREGs of non-eliminable hard registers. */
11427 return register_no_elim_operand (op, mode);
11428 }
11429
11430 /* Extract the parts of an RTL expression that is a valid memory address
11431 for an instruction. Return 0 if the structure of the address is
11432 grossly off. Return -1 if the address contains ASHIFT, so it is not
11433 strictly valid, but still used for computing length of lea instruction. */
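/* For example, (plus (plus (mult (reg A) (const_int 4)) (reg B)) (const_int 8))
   decomposes into base = B, index = A, scale = 4 and disp = 8.  */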
11434
11435 int
11436 ix86_decompose_address (rtx addr, struct ix86_address *out)
11437 {
11438 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11439 rtx base_reg, index_reg;
11440 HOST_WIDE_INT scale = 1;
11441 rtx scale_rtx = NULL_RTX;
11442 rtx tmp;
11443 int retval = 1;
11444 enum ix86_address_seg seg = SEG_DEFAULT;
11445
11446 /* Allow zero-extended SImode addresses,
11447 they will be emitted with addr32 prefix. */
11448 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11449 {
11450 if (GET_CODE (addr) == ZERO_EXTEND
11451 && GET_MODE (XEXP (addr, 0)) == SImode)
11452 {
11453 addr = XEXP (addr, 0);
11454 if (CONST_INT_P (addr))
11455 return 0;
11456 }
11457 else if (GET_CODE (addr) == AND
11458 && const_32bit_mask (XEXP (addr, 1), DImode))
11459 {
11460 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11461 if (addr == NULL_RTX)
11462 return 0;
11463
11464 if (CONST_INT_P (addr))
11465 return 0;
11466 }
11467 }
11468
11469 /* Allow SImode subregs of DImode addresses,
11470 they will be emitted with addr32 prefix. */
11471 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11472 {
11473 if (GET_CODE (addr) == SUBREG
11474 && GET_MODE (SUBREG_REG (addr)) == DImode)
11475 {
11476 addr = SUBREG_REG (addr);
11477 if (CONST_INT_P (addr))
11478 return 0;
11479 }
11480 }
11481
11482 if (REG_P (addr))
11483 base = addr;
11484 else if (GET_CODE (addr) == SUBREG)
11485 {
11486 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11487 base = addr;
11488 else
11489 return 0;
11490 }
11491 else if (GET_CODE (addr) == PLUS)
11492 {
11493 rtx addends[4], op;
11494 int n = 0, i;
11495
11496 op = addr;
11497 do
11498 {
11499 if (n >= 4)
11500 return 0;
11501 addends[n++] = XEXP (op, 1);
11502 op = XEXP (op, 0);
11503 }
11504 while (GET_CODE (op) == PLUS);
11505 if (n >= 4)
11506 return 0;
11507 addends[n] = op;
11508
11509 for (i = n; i >= 0; --i)
11510 {
11511 op = addends[i];
11512 switch (GET_CODE (op))
11513 {
11514 case MULT:
11515 if (index)
11516 return 0;
11517 index = XEXP (op, 0);
11518 scale_rtx = XEXP (op, 1);
11519 break;
11520
11521 case ASHIFT:
11522 if (index)
11523 return 0;
11524 index = XEXP (op, 0);
11525 tmp = XEXP (op, 1);
11526 if (!CONST_INT_P (tmp))
11527 return 0;
11528 scale = INTVAL (tmp);
11529 if ((unsigned HOST_WIDE_INT) scale > 3)
11530 return 0;
11531 scale = 1 << scale;
11532 break;
11533
11534 case ZERO_EXTEND:
11535 op = XEXP (op, 0);
11536 if (GET_CODE (op) != UNSPEC)
11537 return 0;
11538 /* FALLTHRU */
11539
11540 case UNSPEC:
11541 if (XINT (op, 1) == UNSPEC_TP
11542 && TARGET_TLS_DIRECT_SEG_REFS
11543 && seg == SEG_DEFAULT)
11544 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11545 else
11546 return 0;
11547 break;
11548
11549 case SUBREG:
11550 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11551 return 0;
11552 /* FALLTHRU */
11553
11554 case REG:
11555 if (!base)
11556 base = op;
11557 else if (!index)
11558 index = op;
11559 else
11560 return 0;
11561 break;
11562
11563 case CONST:
11564 case CONST_INT:
11565 case SYMBOL_REF:
11566 case LABEL_REF:
11567 if (disp)
11568 return 0;
11569 disp = op;
11570 break;
11571
11572 default:
11573 return 0;
11574 }
11575 }
11576 }
11577 else if (GET_CODE (addr) == MULT)
11578 {
11579 index = XEXP (addr, 0); /* index*scale */
11580 scale_rtx = XEXP (addr, 1);
11581 }
11582 else if (GET_CODE (addr) == ASHIFT)
11583 {
11584 /* We're called for lea too, which implements ashift on occasion. */
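/* A shift count of 0..3 corresponds to an index scale of 1, 2, 4 or 8.  */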
11585 index = XEXP (addr, 0);
11586 tmp = XEXP (addr, 1);
11587 if (!CONST_INT_P (tmp))
11588 return 0;
11589 scale = INTVAL (tmp);
11590 if ((unsigned HOST_WIDE_INT) scale > 3)
11591 return 0;
11592 scale = 1 << scale;
11593 retval = -1;
11594 }
11595 else if (CONST_INT_P (addr))
11596 {
11597 if (!x86_64_immediate_operand (addr, VOIDmode))
11598 return 0;
11599
11600 /* Constant addresses are sign extended to 64bit; we have to
11601 prevent addresses from 0x80000000 to 0xffffffff in x32 mode. */
11602 if (TARGET_X32
11603 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11604 return 0;
11605
11606 disp = addr;
11607 }
11608 else
11609 disp = addr; /* displacement */
11610
11611 if (index)
11612 {
11613 if (REG_P (index))
11614 ;
11615 else if (GET_CODE (index) == SUBREG
11616 && ix86_address_subreg_operand (SUBREG_REG (index)))
11617 ;
11618 else
11619 return 0;
11620 }
11621
11622 /* Address override works only on the (%reg) part of %fs:(%reg). */
11623 if (seg != SEG_DEFAULT
11624 && ((base && GET_MODE (base) != word_mode)
11625 || (index && GET_MODE (index) != word_mode)))
11626 return 0;
11627
11628 /* Extract the integral value of scale. */
11629 if (scale_rtx)
11630 {
11631 if (!CONST_INT_P (scale_rtx))
11632 return 0;
11633 scale = INTVAL (scale_rtx);
11634 }
11635
11636 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11637 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11638
11639 /* Avoid useless 0 displacement. */
11640 if (disp == const0_rtx && (base || index))
11641 disp = NULL_RTX;
11642
11643 /* Allow the arg pointer and stack pointer as the index if there is no scaling. */
11644 if (base_reg && index_reg && scale == 1
11645 && (index_reg == arg_pointer_rtx
11646 || index_reg == frame_pointer_rtx
11647 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11648 {
11649 rtx tmp;
11650 tmp = base, base = index, index = tmp;
11651 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11652 }
11653
11654 /* Special case: %ebp cannot be encoded as a base without a displacement.
11655 Similarly %r13. */
11656 if (!disp
11657 && base_reg
11658 && (base_reg == hard_frame_pointer_rtx
11659 || base_reg == frame_pointer_rtx
11660 || base_reg == arg_pointer_rtx
11661 || (REG_P (base_reg)
11662 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11663 || REGNO (base_reg) == R13_REG))))
11664 disp = const0_rtx;
11665
11666 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11667 Avoid this by transforming to [%esi+0].
11668 Reload calls address legitimization without cfun defined, so we need
11669 to test cfun for being non-NULL. */
11670 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11671 && base_reg && !index_reg && !disp
11672 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11673 disp = const0_rtx;
11674
11675 /* Special case: encode reg+reg instead of reg*2. */
11676 if (!base && index && scale == 2)
11677 base = index, base_reg = index_reg, scale = 1;
11678
11679 /* Special case: scaling cannot be encoded without base or displacement. */
11680 if (!base && !disp && index && scale != 1)
11681 disp = const0_rtx;
11682
11683 out->base = base;
11684 out->index = index;
11685 out->disp = disp;
11686 out->scale = scale;
11687 out->seg = seg;
11688
11689 return retval;
11690 }
11691 \f
11692 /* Return cost of the memory address x.
11693 For i386, it is better to use a complex address than let gcc copy
11694 the address into a reg and make a new pseudo. But not if the address
11695 requires two regs - that would mean more pseudos with longer
11696 lifetimes. */
11697 static int
11698 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11699 addr_space_t as ATTRIBUTE_UNUSED,
11700 bool speed ATTRIBUTE_UNUSED)
11701 {
11702 struct ix86_address parts;
11703 int cost = 1;
11704 int ok = ix86_decompose_address (x, &parts);
11705
11706 gcc_assert (ok);
11707
11708 if (parts.base && GET_CODE (parts.base) == SUBREG)
11709 parts.base = SUBREG_REG (parts.base);
11710 if (parts.index && GET_CODE (parts.index) == SUBREG)
11711 parts.index = SUBREG_REG (parts.index);
11712
11713 /* Attempt to minimize number of registers in the address. */
11714 if ((parts.base
11715 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11716 || (parts.index
11717 && (!REG_P (parts.index)
11718 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11719 cost++;
11720
11721 if (parts.base
11722 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11723 && parts.index
11724 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11725 && parts.base != parts.index)
11726 cost++;
11727
11728 /* The AMD-K6 doesn't like addresses with the ModR/M byte set to 00_xxx_100b,
11729 since its predecode logic can't detect the length of such instructions
11730 and they degenerate to vector decoding. Increase the cost of such
11731 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11732 to split such addresses or even refuse them entirely.
11733
11734 Following addressing modes are affected:
11735 [base+scale*index]
11736 [scale*index+disp]
11737 [base+index]
11738
11739 The first and last cases may be avoidable by explicitly coding the zero into
11740 the memory address, but I don't have an AMD-K6 machine handy to check this
11741 theory. */
11742
11743 if (TARGET_K6
11744 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11745 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11746 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11747 cost += 10;
11748
11749 return cost;
11750 }
11751 \f
11752 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O, as
11753 this is used to form addresses to local data when -fPIC is in
11754 use. */
11755
11756 static bool
11757 darwin_local_data_pic (rtx disp)
11758 {
11759 return (GET_CODE (disp) == UNSPEC
11760 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11761 }
11762
11763 /* Determine if a given RTX is a valid constant. We already know this
11764 satisfies CONSTANT_P. */
11765
11766 static bool
11767 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11768 {
11769 switch (GET_CODE (x))
11770 {
11771 case CONST:
11772 x = XEXP (x, 0);
11773
11774 if (GET_CODE (x) == PLUS)
11775 {
11776 if (!CONST_INT_P (XEXP (x, 1)))
11777 return false;
11778 x = XEXP (x, 0);
11779 }
11780
11781 if (TARGET_MACHO && darwin_local_data_pic (x))
11782 return true;
11783
11784 /* Only some unspecs are valid as "constants". */
11785 if (GET_CODE (x) == UNSPEC)
11786 switch (XINT (x, 1))
11787 {
11788 case UNSPEC_GOT:
11789 case UNSPEC_GOTOFF:
11790 case UNSPEC_PLTOFF:
11791 return TARGET_64BIT;
11792 case UNSPEC_TPOFF:
11793 case UNSPEC_NTPOFF:
11794 x = XVECEXP (x, 0, 0);
11795 return (GET_CODE (x) == SYMBOL_REF
11796 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11797 case UNSPEC_DTPOFF:
11798 x = XVECEXP (x, 0, 0);
11799 return (GET_CODE (x) == SYMBOL_REF
11800 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11801 default:
11802 return false;
11803 }
11804
11805 /* We must have drilled down to a symbol. */
11806 if (GET_CODE (x) == LABEL_REF)
11807 return true;
11808 if (GET_CODE (x) != SYMBOL_REF)
11809 return false;
11810 /* FALLTHRU */
11811
11812 case SYMBOL_REF:
11813 /* TLS symbols are never valid. */
11814 if (SYMBOL_REF_TLS_MODEL (x))
11815 return false;
11816
11817 /* DLLIMPORT symbols are never valid. */
11818 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11819 && SYMBOL_REF_DLLIMPORT_P (x))
11820 return false;
11821
11822 #if TARGET_MACHO
11823 /* mdynamic-no-pic */
11824 if (MACHO_DYNAMIC_NO_PIC_P)
11825 return machopic_symbol_defined_p (x);
11826 #endif
11827 break;
11828
11829 case CONST_DOUBLE:
11830 if (GET_MODE (x) == TImode
11831 && x != CONST0_RTX (TImode)
11832 && !TARGET_64BIT)
11833 return false;
11834 break;
11835
11836 case CONST_VECTOR:
11837 if (!standard_sse_constant_p (x))
11838 return false;
11839
11840 default:
11841 break;
11842 }
11843
11844 /* Otherwise we handle everything else in the move patterns. */
11845 return true;
11846 }
11847
11848 /* Determine if it's legal to put X into the constant pool. This
11849 is not possible for the address of thread-local symbols, which
11850 is checked above. */
11851
11852 static bool
11853 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11854 {
11855 /* We can always put integral constants and vectors in memory. */
11856 switch (GET_CODE (x))
11857 {
11858 case CONST_INT:
11859 case CONST_DOUBLE:
11860 case CONST_VECTOR:
11861 return false;
11862
11863 default:
11864 break;
11865 }
11866 return !ix86_legitimate_constant_p (mode, x);
11867 }
11868
11869
11870 /* Nonzero if the constant value X is a legitimate general operand
11871 when generating PIC code. It is given that flag_pic is on and
11872 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11873
11874 bool
11875 legitimate_pic_operand_p (rtx x)
11876 {
11877 rtx inner;
11878
11879 switch (GET_CODE (x))
11880 {
11881 case CONST:
11882 inner = XEXP (x, 0);
11883 if (GET_CODE (inner) == PLUS
11884 && CONST_INT_P (XEXP (inner, 1)))
11885 inner = XEXP (inner, 0);
11886
11887 /* Only some unspecs are valid as "constants". */
11888 if (GET_CODE (inner) == UNSPEC)
11889 switch (XINT (inner, 1))
11890 {
11891 case UNSPEC_GOT:
11892 case UNSPEC_GOTOFF:
11893 case UNSPEC_PLTOFF:
11894 return TARGET_64BIT;
11895 case UNSPEC_TPOFF:
11896 x = XVECEXP (inner, 0, 0);
11897 return (GET_CODE (x) == SYMBOL_REF
11898 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11899 case UNSPEC_MACHOPIC_OFFSET:
11900 return legitimate_pic_address_disp_p (x);
11901 default:
11902 return false;
11903 }
11904 /* FALLTHRU */
11905
11906 case SYMBOL_REF:
11907 case LABEL_REF:
11908 return legitimate_pic_address_disp_p (x);
11909
11910 default:
11911 return true;
11912 }
11913 }
11914
11915 /* Determine if a given CONST RTX is a valid memory displacement
11916 in PIC mode. */
11917
11918 bool
11919 legitimate_pic_address_disp_p (rtx disp)
11920 {
11921 bool saw_plus;
11922
11923 /* In 64bit mode we can allow direct addresses of symbols and labels
11924 when they are not dynamic symbols. */
11925 if (TARGET_64BIT)
11926 {
11927 rtx op0 = disp, op1;
11928
11929 switch (GET_CODE (disp))
11930 {
11931 case LABEL_REF:
11932 return true;
11933
11934 case CONST:
11935 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11936 break;
11937 op0 = XEXP (XEXP (disp, 0), 0);
11938 op1 = XEXP (XEXP (disp, 0), 1);
11939 if (!CONST_INT_P (op1)
11940 || INTVAL (op1) >= 16*1024*1024
11941 || INTVAL (op1) < -16*1024*1024)
11942 break;
11943 if (GET_CODE (op0) == LABEL_REF)
11944 return true;
11945 if (GET_CODE (op0) == CONST
11946 && GET_CODE (XEXP (op0, 0)) == UNSPEC
11947 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
11948 return true;
11949 if (GET_CODE (op0) == UNSPEC
11950 && XINT (op0, 1) == UNSPEC_PCREL)
11951 return true;
11952 if (GET_CODE (op0) != SYMBOL_REF)
11953 break;
11954 /* FALLTHRU */
11955
11956 case SYMBOL_REF:
11957 /* TLS references should always be enclosed in UNSPEC. */
11958 if (SYMBOL_REF_TLS_MODEL (op0))
11959 return false;
11960 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11961 && ix86_cmodel != CM_LARGE_PIC)
11962 return true;
11963 break;
11964
11965 default:
11966 break;
11967 }
11968 }
11969 if (GET_CODE (disp) != CONST)
11970 return false;
11971 disp = XEXP (disp, 0);
11972
11973 if (TARGET_64BIT)
11974 {
11975 /* It is unsafe to allow PLUS expressions; they could exceed the allowed
11976 distance of GOT table references. We should not need these anyway. */
11977 if (GET_CODE (disp) != UNSPEC
11978 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11979 && XINT (disp, 1) != UNSPEC_GOTOFF
11980 && XINT (disp, 1) != UNSPEC_PCREL
11981 && XINT (disp, 1) != UNSPEC_PLTOFF))
11982 return false;
11983
11984 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11985 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11986 return false;
11987 return true;
11988 }
11989
11990 saw_plus = false;
11991 if (GET_CODE (disp) == PLUS)
11992 {
11993 if (!CONST_INT_P (XEXP (disp, 1)))
11994 return false;
11995 disp = XEXP (disp, 0);
11996 saw_plus = true;
11997 }
11998
11999 if (TARGET_MACHO && darwin_local_data_pic (disp))
12000 return true;
12001
12002 if (GET_CODE (disp) != UNSPEC)
12003 return false;
12004
12005 switch (XINT (disp, 1))
12006 {
12007 case UNSPEC_GOT:
12008 if (saw_plus)
12009 return false;
12010 /* We need to check for both symbols and labels because VxWorks loads
12011 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12012 details. */
12013 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12014 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12015 case UNSPEC_GOTOFF:
12016 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
12017 While the ABI also specifies a 32bit relocation, we don't produce it in
12018 the small PIC model at all. */
12019 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12020 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12021 && !TARGET_64BIT)
12022 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12023 return false;
12024 case UNSPEC_GOTTPOFF:
12025 case UNSPEC_GOTNTPOFF:
12026 case UNSPEC_INDNTPOFF:
12027 if (saw_plus)
12028 return false;
12029 disp = XVECEXP (disp, 0, 0);
12030 return (GET_CODE (disp) == SYMBOL_REF
12031 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12032 case UNSPEC_NTPOFF:
12033 disp = XVECEXP (disp, 0, 0);
12034 return (GET_CODE (disp) == SYMBOL_REF
12035 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12036 case UNSPEC_DTPOFF:
12037 disp = XVECEXP (disp, 0, 0);
12038 return (GET_CODE (disp) == SYMBOL_REF
12039 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12040 }
12041
12042 return false;
12043 }
12044
12045 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12046 replace the input X, or the original X if no replacement is called for.
12047 The output parameter *WIN is 1 if the calling macro should goto WIN,
12048 0 if it should not. */
12049
12050 bool
12051 ix86_legitimize_reload_address (rtx x,
12052 enum machine_mode mode ATTRIBUTE_UNUSED,
12053 int opnum, int type,
12054 int ind_levels ATTRIBUTE_UNUSED)
12055 {
12056 /* Reload can generate:
12057
12058 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12059 (reg:DI 97))
12060 (reg:DI 2 cx))
12061
12062 This RTX is rejected by ix86_legitimate_address_p because base
12063 register 97 does not satisfy the strict register check. Following
12064 this rejection, reload pushes all three components into separate
12065 registers, creating an invalid memory address RTX.
12066
12067 The code below reloads only the invalid part of the
12068 memory address RTX. */
12069
12070 if (GET_CODE (x) == PLUS
12071 && REG_P (XEXP (x, 1))
12072 && GET_CODE (XEXP (x, 0)) == PLUS
12073 && REG_P (XEXP (XEXP (x, 0), 1)))
12074 {
12075 rtx base, index;
12076 bool something_reloaded = false;
12077
12078 base = XEXP (XEXP (x, 0), 1);
12079 if (!REG_OK_FOR_BASE_STRICT_P (base))
12080 {
12081 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12082 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12083 opnum, (enum reload_type) type);
12084 something_reloaded = true;
12085 }
12086
12087 index = XEXP (x, 1);
12088 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12089 {
12090 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12091 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12092 opnum, (enum reload_type) type);
12093 something_reloaded = true;
12094 }
12095
12096 gcc_assert (something_reloaded);
12097 return true;
12098 }
12099
12100 return false;
12101 }
12102
12103 /* Recognizes RTL expressions that are valid memory addresses for an
12104 instruction. The MODE argument is the machine mode for the MEM
12105 expression that wants to use this address.
12106
12107 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12108 convert common non-canonical forms to canonical form so that they will
12109 be recognized. */
12110
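/* For illustration (a hypothetical example, not taken from the sources
   above): the AT&T-syntax address in "movl 12(%ebx,%ecx,4), %eax"
   decomposes into base %ebx, index %ecx, scale 4 and displacement 12;
   each of these parts is validated separately below.  */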
12111 static bool
12112 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12113 rtx addr, bool strict)
12114 {
12115 struct ix86_address parts;
12116 rtx base, index, disp;
12117 HOST_WIDE_INT scale;
12118
12119 if (ix86_decompose_address (addr, &parts) <= 0)
12120 /* Decomposition failed. */
12121 return false;
12122
12123 base = parts.base;
12124 index = parts.index;
12125 disp = parts.disp;
12126 scale = parts.scale;
12127
12128 /* Validate base register. */
12129 if (base)
12130 {
12131 rtx reg;
12132
12133 if (REG_P (base))
12134 reg = base;
12135 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12136 reg = SUBREG_REG (base);
12137 else
12138 /* Base is not a register. */
12139 return false;
12140
12141 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12142 return false;
12143
12144 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12145 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12146 /* Base is not valid. */
12147 return false;
12148 }
12149
12150 /* Validate index register. */
12151 if (index)
12152 {
12153 rtx reg;
12154
12155 if (REG_P (index))
12156 reg = index;
12157 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12158 reg = SUBREG_REG (index);
12159 else
12160 /* Index is not a register. */
12161 return false;
12162
12163 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12164 return false;
12165
12166 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12167 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12168 /* Index is not valid. */
12169 return false;
12170 }
12171
12172 /* Index and base should have the same mode. */
12173 if (base && index
12174 && GET_MODE (base) != GET_MODE (index))
12175 return false;
12176
12177 /* Validate scale factor. */
12178 if (scale != 1)
12179 {
12180 if (!index)
12181 /* Scale without index. */
12182 return false;
12183
12184 if (scale != 2 && scale != 4 && scale != 8)
12185 /* Scale is not a valid multiplier. */
12186 return false;
12187 }
12188
12189 /* Validate displacement. */
12190 if (disp)
12191 {
12192 if (GET_CODE (disp) == CONST
12193 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12194 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12195 switch (XINT (XEXP (disp, 0), 1))
12196 {
12197 /* Refuse GOTOFF and GOT in 64bit mode since they are always 64bit when
12198 used. While the ABI also specifies 32bit relocations, we don't
12199 produce them at all and use IP-relative addressing instead. */
12200 case UNSPEC_GOT:
12201 case UNSPEC_GOTOFF:
12202 gcc_assert (flag_pic);
12203 if (!TARGET_64BIT)
12204 goto is_legitimate_pic;
12205
12206 /* 64bit address unspec. */
12207 return false;
12208
12209 case UNSPEC_GOTPCREL:
12210 case UNSPEC_PCREL:
12211 gcc_assert (flag_pic);
12212 goto is_legitimate_pic;
12213
12214 case UNSPEC_GOTTPOFF:
12215 case UNSPEC_GOTNTPOFF:
12216 case UNSPEC_INDNTPOFF:
12217 case UNSPEC_NTPOFF:
12218 case UNSPEC_DTPOFF:
12219 break;
12220
12221 case UNSPEC_STACK_CHECK:
12222 gcc_assert (flag_split_stack);
12223 break;
12224
12225 default:
12226 /* Invalid address unspec. */
12227 return false;
12228 }
12229
12230 else if (SYMBOLIC_CONST (disp)
12231 && (flag_pic
12232 || (TARGET_MACHO
12233 #if TARGET_MACHO
12234 && MACHOPIC_INDIRECT
12235 && !machopic_operand_p (disp)
12236 #endif
12237 )))
12238 {
12239
12240 is_legitimate_pic:
12241 if (TARGET_64BIT && (index || base))
12242 {
12243 /* foo@dtpoff(%rX) is ok. */
12244 if (GET_CODE (disp) != CONST
12245 || GET_CODE (XEXP (disp, 0)) != PLUS
12246 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12247 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12248 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12249 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12250 /* Non-constant pic memory reference. */
12251 return false;
12252 }
12253 else if ((!TARGET_MACHO || flag_pic)
12254 && ! legitimate_pic_address_disp_p (disp))
12255 /* Displacement is an invalid pic construct. */
12256 return false;
12257 #if TARGET_MACHO
12258 else if (MACHO_DYNAMIC_NO_PIC_P
12259 && !ix86_legitimate_constant_p (Pmode, disp))
12260 /* Displacement must be referenced via a non_lazy_pointer. */
12261 return false;
12262 #endif
12263
12264 /* This code used to verify that a symbolic pic displacement
12265 includes the pic_offset_table_rtx register.
12266
12267 While this is a good idea, unfortunately these constructs may
12268 be created by the "adds using lea" optimization for incorrect
12269 code like:
12270
12271 int a;
12272 int foo(int i)
12273 {
12274 return *(&a+i);
12275 }
12276
12277 This code is nonsensical, but results in addressing the
12278 GOT table with a pic_offset_table_rtx base. We can't
12279 just refuse it easily, since it gets matched by the
12280 "addsi3" pattern, which later gets split to lea when the
12281 output register differs from the input. While this
12282 could be handled by a separate addsi pattern for this case
12283 that never results in lea, disabling this test seems to be
12284 the easier and correct fix for the crash. */
12285 }
12286 else if (GET_CODE (disp) != LABEL_REF
12287 && !CONST_INT_P (disp)
12288 && (GET_CODE (disp) != CONST
12289 || !ix86_legitimate_constant_p (Pmode, disp))
12290 && (GET_CODE (disp) != SYMBOL_REF
12291 || !ix86_legitimate_constant_p (Pmode, disp)))
12292 /* Displacement is not constant. */
12293 return false;
12294 else if (TARGET_64BIT
12295 && !x86_64_immediate_operand (disp, VOIDmode))
12296 /* Displacement is out of range. */
12297 return false;
12298 }
12299
12300 /* Everything looks valid. */
12301 return true;
12302 }
12303
12304 /* Determine if a given RTX is a valid constant address. */
12305
12306 bool
12307 constant_address_p (rtx x)
12308 {
12309 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12310 }
12311 \f
12312 /* Return a unique alias set for the GOT. */
12313
12314 static alias_set_type
12315 ix86_GOT_alias_set (void)
12316 {
12317 static alias_set_type set = -1;
12318 if (set == -1)
12319 set = new_alias_set ();
12320 return set;
12321 }
12322
12323 /* Return a legitimate reference for ORIG (an address) using the
12324 register REG. If REG is 0, a new pseudo is generated.
12325
12326 There are two types of references that must be handled:
12327
12328 1. Global data references must load the address from the GOT, via
12329 the PIC reg. An insn is emitted to do this load, and the reg is
12330 returned.
12331
12332 2. Static data references, constant pool addresses, and code labels
12333 compute the address as an offset from the GOT, whose base is in
12334 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12335 differentiate them from global data objects. The returned
12336 address is the PIC reg + an unspec constant.
12337
12338 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12339 reg also appears in the address. */
12340
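/* Roughly, the two cases above correspond to RTL of the form

     (mem (plus pic_offset_table_rtx
                (const (unspec [symbol] UNSPEC_GOT))))

   for a 32-bit global data reference (a load from the GOT slot), and

     (plus pic_offset_table_rtx
           (const (unspec [symbol] UNSPEC_GOTOFF)))

   for local data, computed directly without a memory load.  */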
12341 static rtx
12342 legitimize_pic_address (rtx orig, rtx reg)
12343 {
12344 rtx addr = orig;
12345 rtx new_rtx = orig;
12346
12347 #if TARGET_MACHO
12348 if (TARGET_MACHO && !TARGET_64BIT)
12349 {
12350 if (reg == 0)
12351 reg = gen_reg_rtx (Pmode);
12352 /* Use the generic Mach-O PIC machinery. */
12353 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12354 }
12355 #endif
12356
12357 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12358 new_rtx = addr;
12359 else if (TARGET_64BIT
12360 && ix86_cmodel != CM_SMALL_PIC
12361 && gotoff_operand (addr, Pmode))
12362 {
12363 rtx tmpreg;
12364 /* This symbol may be referenced via a displacement from the PIC
12365 base address (@GOTOFF). */
12366
12367 if (reload_in_progress)
12368 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12369 if (GET_CODE (addr) == CONST)
12370 addr = XEXP (addr, 0);
12371 if (GET_CODE (addr) == PLUS)
12372 {
12373 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12374 UNSPEC_GOTOFF);
12375 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12376 }
12377 else
12378 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12379 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12380 if (!reg)
12381 tmpreg = gen_reg_rtx (Pmode);
12382 else
12383 tmpreg = reg;
12384 emit_move_insn (tmpreg, new_rtx);
12385
12386 if (reg != 0)
12387 {
12388 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12389 tmpreg, 1, OPTAB_DIRECT);
12390 new_rtx = reg;
12391 }
12392 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12393 }
12394 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12395 {
12396 /* This symbol may be referenced via a displacement from the PIC
12397 base address (@GOTOFF). */
12398
12399 if (reload_in_progress)
12400 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12401 if (GET_CODE (addr) == CONST)
12402 addr = XEXP (addr, 0);
12403 if (GET_CODE (addr) == PLUS)
12404 {
12405 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12406 UNSPEC_GOTOFF);
12407 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12408 }
12409 else
12410 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12411 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12412 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12413
12414 if (reg != 0)
12415 {
12416 emit_move_insn (reg, new_rtx);
12417 new_rtx = reg;
12418 }
12419 }
12420 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12421 /* We can't use @GOTOFF for text labels on VxWorks;
12422 see gotoff_operand. */
12423 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12424 {
12425 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12426 {
12427 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12428 return legitimize_dllimport_symbol (addr, true);
12429 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12430 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12431 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12432 {
12433 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12434 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12435 }
12436 }
12437
12438 /* For x64 PE-COFF there is no GOT table, so we use the address
12439 directly. */
12440 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12441 {
12442 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12443 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12444
12445 if (reg == 0)
12446 reg = gen_reg_rtx (Pmode);
12447 emit_move_insn (reg, new_rtx);
12448 new_rtx = reg;
12449 }
12450 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12451 {
12452 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12453 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12454 new_rtx = gen_const_mem (Pmode, new_rtx);
12455 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12456
12457 if (reg == 0)
12458 reg = gen_reg_rtx (Pmode);
12459 /* Use gen_movsi directly, otherwise the address is loaded
12460 into a register for CSE. We don't want to CSE this address;
12461 instead we CSE addresses loaded from the GOT table, so skip this. */
12462 emit_insn (gen_movsi (reg, new_rtx));
12463 new_rtx = reg;
12464 }
12465 else
12466 {
12467 /* This symbol must be referenced via a load from the
12468 Global Offset Table (@GOT). */
12469
12470 if (reload_in_progress)
12471 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12472 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12473 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12474 if (TARGET_64BIT)
12475 new_rtx = force_reg (Pmode, new_rtx);
12476 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12477 new_rtx = gen_const_mem (Pmode, new_rtx);
12478 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12479
12480 if (reg == 0)
12481 reg = gen_reg_rtx (Pmode);
12482 emit_move_insn (reg, new_rtx);
12483 new_rtx = reg;
12484 }
12485 }
12486 else
12487 {
12488 if (CONST_INT_P (addr)
12489 && !x86_64_immediate_operand (addr, VOIDmode))
12490 {
12491 if (reg)
12492 {
12493 emit_move_insn (reg, addr);
12494 new_rtx = reg;
12495 }
12496 else
12497 new_rtx = force_reg (Pmode, addr);
12498 }
12499 else if (GET_CODE (addr) == CONST)
12500 {
12501 addr = XEXP (addr, 0);
12502
12503 /* We must match what we generated earlier. Assume the only
12504 unspecs that can get here are ours; not that we could do
12505 anything with them anyway.... */
12506 if (GET_CODE (addr) == UNSPEC
12507 || (GET_CODE (addr) == PLUS
12508 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12509 return orig;
12510 gcc_assert (GET_CODE (addr) == PLUS);
12511 }
12512 if (GET_CODE (addr) == PLUS)
12513 {
12514 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12515
12516 /* Check first to see if this is a constant offset from a @GOTOFF
12517 symbol reference. */
12518 if (gotoff_operand (op0, Pmode)
12519 && CONST_INT_P (op1))
12520 {
12521 if (!TARGET_64BIT)
12522 {
12523 if (reload_in_progress)
12524 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12525 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12526 UNSPEC_GOTOFF);
12527 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12528 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12529 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12530
12531 if (reg != 0)
12532 {
12533 emit_move_insn (reg, new_rtx);
12534 new_rtx = reg;
12535 }
12536 }
12537 else
12538 {
12539 if (INTVAL (op1) < -16*1024*1024
12540 || INTVAL (op1) >= 16*1024*1024)
12541 {
12542 if (!x86_64_immediate_operand (op1, Pmode))
12543 op1 = force_reg (Pmode, op1);
12544 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12545 }
12546 }
12547 }
12548 else
12549 {
12550 rtx base = legitimize_pic_address (op0, reg);
12551 enum machine_mode mode = GET_MODE (base);
12552 new_rtx
12553 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12554
12555 if (CONST_INT_P (new_rtx))
12556 {
12557 if (INTVAL (new_rtx) < -16*1024*1024
12558 || INTVAL (new_rtx) >= 16*1024*1024)
12559 {
12560 if (!x86_64_immediate_operand (new_rtx, mode))
12561 new_rtx = force_reg (mode, new_rtx);
12562 new_rtx
12563 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12564 }
12565 else
12566 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12567 }
12568 else
12569 {
12570 if (GET_CODE (new_rtx) == PLUS
12571 && CONSTANT_P (XEXP (new_rtx, 1)))
12572 {
12573 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12574 new_rtx = XEXP (new_rtx, 1);
12575 }
12576 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12577 }
12578 }
12579 }
12580 }
12581 return new_rtx;
12582 }
12583 \f
12584 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12585
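/* On GNU/Linux, for example, the thread pointer is the %fs segment
   base in 64-bit mode and the %gs segment base in 32-bit mode;
   UNSPEC_TP stands for that base.  */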
12586 static rtx
12587 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12588 {
12589 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12590
12591 if (GET_MODE (tp) != tp_mode)
12592 {
12593 gcc_assert (GET_MODE (tp) == SImode);
12594 gcc_assert (tp_mode == DImode);
12595
12596 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12597 }
12598
12599 if (to_reg)
12600 tp = copy_to_mode_reg (tp_mode, tp);
12601
12602 return tp;
12603 }
12604
12605 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12606
12607 static GTY(()) rtx ix86_tls_symbol;
12608
12609 static rtx
12610 ix86_tls_get_addr (void)
12611 {
12612 if (!ix86_tls_symbol)
12613 {
12614 const char *sym
12615 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12616 ? "___tls_get_addr" : "__tls_get_addr");
12617
12618 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12619 }
12620
12621 return ix86_tls_symbol;
12622 }
12623
12624 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12625
12626 static GTY(()) rtx ix86_tls_module_base_symbol;
12627
12628 rtx
12629 ix86_tls_module_base (void)
12630 {
12631 if (!ix86_tls_module_base_symbol)
12632 {
12633 ix86_tls_module_base_symbol
12634 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12635
12636 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12637 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12638 }
12639
12640 return ix86_tls_module_base_symbol;
12641 }
12642
12643 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12644 false if we expect this to be used for a memory address and true if
12645 we expect to load the address into a register. */
12646
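/* As a rough summary of the models handled below:
   - global dynamic: call __tls_get_addr (or use the GNU2 descriptor
     sequence) for each symbol;
   - local dynamic: one call for _TLS_MODULE_BASE_, then per-symbol
     @dtpoff offsets are added;
   - initial exec: load the TP offset from the GOT (@gottpoff /
     @gotntpoff / @indntpoff) and combine it with the thread pointer;
   - local exec: add a link-time constant (@tpoff / @ntpoff) to the
     thread pointer directly.  */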
12647 static rtx
12648 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12649 {
12650 rtx dest, base, off;
12651 rtx pic = NULL_RTX, tp = NULL_RTX;
12652 enum machine_mode tp_mode = Pmode;
12653 int type;
12654
12655 switch (model)
12656 {
12657 case TLS_MODEL_GLOBAL_DYNAMIC:
12658 dest = gen_reg_rtx (Pmode);
12659
12660 if (!TARGET_64BIT)
12661 {
12662 if (flag_pic)
12663 pic = pic_offset_table_rtx;
12664 else
12665 {
12666 pic = gen_reg_rtx (Pmode);
12667 emit_insn (gen_set_got (pic));
12668 }
12669 }
12670
12671 if (TARGET_GNU2_TLS)
12672 {
12673 if (TARGET_64BIT)
12674 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12675 else
12676 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12677
12678 tp = get_thread_pointer (Pmode, true);
12679 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12680
12681 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12682 }
12683 else
12684 {
12685 rtx caddr = ix86_tls_get_addr ();
12686
12687 if (TARGET_64BIT)
12688 {
12689 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12690
12691 start_sequence ();
12692 emit_call_insn (ix86_gen_tls_global_dynamic_64 (rax, x,
12693 caddr));
12694 insns = get_insns ();
12695 end_sequence ();
12696
12697 RTL_CONST_CALL_P (insns) = 1;
12698 emit_libcall_block (insns, dest, rax, x);
12699 }
12700 else
12701 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12702 }
12703 break;
12704
12705 case TLS_MODEL_LOCAL_DYNAMIC:
12706 base = gen_reg_rtx (Pmode);
12707
12708 if (!TARGET_64BIT)
12709 {
12710 if (flag_pic)
12711 pic = pic_offset_table_rtx;
12712 else
12713 {
12714 pic = gen_reg_rtx (Pmode);
12715 emit_insn (gen_set_got (pic));
12716 }
12717 }
12718
12719 if (TARGET_GNU2_TLS)
12720 {
12721 rtx tmp = ix86_tls_module_base ();
12722
12723 if (TARGET_64BIT)
12724 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12725 else
12726 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12727
12728 tp = get_thread_pointer (Pmode, true);
12729 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12730 gen_rtx_MINUS (Pmode, tmp, tp));
12731 }
12732 else
12733 {
12734 rtx caddr = ix86_tls_get_addr ();
12735
12736 if (TARGET_64BIT)
12737 {
12738 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12739
12740 start_sequence ();
12741 emit_call_insn (ix86_gen_tls_local_dynamic_base_64 (rax,
12742 caddr));
12743 insns = get_insns ();
12744 end_sequence ();
12745
12746 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12747 share the LD_BASE result with other LD model accesses. */
12748 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12749 UNSPEC_TLS_LD_BASE);
12750
12751 RTL_CONST_CALL_P (insns) = 1;
12752 emit_libcall_block (insns, base, rax, eqv);
12753 }
12754 else
12755 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12756 }
12757
12758 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12759 off = gen_rtx_CONST (Pmode, off);
12760
12761 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12762
12763 if (TARGET_GNU2_TLS)
12764 {
12765 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12766
12767 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12768 }
12769 break;
12770
12771 case TLS_MODEL_INITIAL_EXEC:
12772 if (TARGET_64BIT)
12773 {
12774 if (TARGET_SUN_TLS && !TARGET_X32)
12775 {
12776 /* The Sun linker took the AMD64 TLS spec literally
12777 and can only handle %rax as the destination of the
12778 initial executable code sequence. */
12779
12780 dest = gen_reg_rtx (DImode);
12781 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12782 return dest;
12783 }
12784
12785 /* Generate DImode references to avoid %fs:(%reg32)
12786 problems and linker IE->LE relaxation bug. */
12787 tp_mode = DImode;
12788 pic = NULL;
12789 type = UNSPEC_GOTNTPOFF;
12790 }
12791 else if (flag_pic)
12792 {
12793 if (reload_in_progress)
12794 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12795 pic = pic_offset_table_rtx;
12796 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12797 }
12798 else if (!TARGET_ANY_GNU_TLS)
12799 {
12800 pic = gen_reg_rtx (Pmode);
12801 emit_insn (gen_set_got (pic));
12802 type = UNSPEC_GOTTPOFF;
12803 }
12804 else
12805 {
12806 pic = NULL;
12807 type = UNSPEC_INDNTPOFF;
12808 }
12809
12810 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12811 off = gen_rtx_CONST (tp_mode, off);
12812 if (pic)
12813 off = gen_rtx_PLUS (tp_mode, pic, off);
12814 off = gen_const_mem (tp_mode, off);
12815 set_mem_alias_set (off, ix86_GOT_alias_set ());
12816
12817 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12818 {
12819 base = get_thread_pointer (tp_mode,
12820 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12821 off = force_reg (tp_mode, off);
12822 return gen_rtx_PLUS (tp_mode, base, off);
12823 }
12824 else
12825 {
12826 base = get_thread_pointer (Pmode, true);
12827 dest = gen_reg_rtx (Pmode);
12828 emit_insn (ix86_gen_sub3 (dest, base, off));
12829 }
12830 break;
12831
12832 case TLS_MODEL_LOCAL_EXEC:
12833 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12834 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12835 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12836 off = gen_rtx_CONST (Pmode, off);
12837
12838 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12839 {
12840 base = get_thread_pointer (Pmode,
12841 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12842 return gen_rtx_PLUS (Pmode, base, off);
12843 }
12844 else
12845 {
12846 base = get_thread_pointer (Pmode, true);
12847 dest = gen_reg_rtx (Pmode);
12848 emit_insn (ix86_gen_sub3 (dest, base, off));
12849 }
12850 break;
12851
12852 default:
12853 gcc_unreachable ();
12854 }
12855
12856 return dest;
12857 }
12858
12859 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12860 to symbol DECL. */
12861
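/* E.g. for a dllimported symbol "foo" (an illustrative name) this
   creates an artificial read-only decl whose RTL is a memory
   reference through "__imp__foo" (or "__imp_foo" when no extra
   underscore prefix is needed), i.e. a load from the import address
   table slot.  */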
12862 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12863 htab_t dllimport_map;
12864
12865 static tree
12866 get_dllimport_decl (tree decl)
12867 {
12868 struct tree_map *h, in;
12869 void **loc;
12870 const char *name;
12871 const char *prefix;
12872 size_t namelen, prefixlen;
12873 char *imp_name;
12874 tree to;
12875 rtx rtl;
12876
12877 if (!dllimport_map)
12878 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12879
12880 in.hash = htab_hash_pointer (decl);
12881 in.base.from = decl;
12882 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12883 h = (struct tree_map *) *loc;
12884 if (h)
12885 return h->to;
12886
12887 *loc = h = ggc_alloc_tree_map ();
12888 h->hash = in.hash;
12889 h->base.from = decl;
12890 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12891 VAR_DECL, NULL, ptr_type_node);
12892 DECL_ARTIFICIAL (to) = 1;
12893 DECL_IGNORED_P (to) = 1;
12894 DECL_EXTERNAL (to) = 1;
12895 TREE_READONLY (to) = 1;
12896
12897 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12898 name = targetm.strip_name_encoding (name);
12899 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12900 ? "*__imp_" : "*__imp__";
12901 namelen = strlen (name);
12902 prefixlen = strlen (prefix);
12903 imp_name = (char *) alloca (namelen + prefixlen + 1);
12904 memcpy (imp_name, prefix, prefixlen);
12905 memcpy (imp_name + prefixlen, name, namelen + 1);
12906
12907 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12908 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12909 SET_SYMBOL_REF_DECL (rtl, to);
12910 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12911
12912 rtl = gen_const_mem (Pmode, rtl);
12913 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12914
12915 SET_DECL_RTL (to, rtl);
12916 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
12917
12918 return to;
12919 }
12920
12921 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12922 true if we require the result to be a register. */
12923
12924 static rtx
12925 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12926 {
12927 tree imp_decl;
12928 rtx x;
12929
12930 gcc_assert (SYMBOL_REF_DECL (symbol));
12931 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12932
12933 x = DECL_RTL (imp_decl);
12934 if (want_reg)
12935 x = force_reg (Pmode, x);
12936 return x;
12937 }
12938
12939 /* Try machine-dependent ways of modifying an illegitimate address
12940 to be legitimate. If we find one, return the new, valid address.
12941 This macro is used in only one place: `memory_address' in explow.c.
12942
12943 OLDX is the address as it was before break_out_memory_refs was called.
12944 In some cases it is useful to look at this to decide what needs to be done.
12945
12946 It is always safe for this macro to do nothing. It exists to recognize
12947 opportunities to optimize the output.
12948
12949 For the 80386, we handle X+REG by loading X into a register R and
12950 using R+REG. R will go in a general reg and indexing will be used.
12951 However, if REG is a broken-out memory address or multiplication,
12952 nothing needs to be done because REG can certainly go in a general reg.
12953
12954 When -fpic is used, special handling is needed for symbolic references.
12955 See comments by legitimize_pic_address in i386.c for details. */
12956
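/* For instance, an address of the form
     (plus (reg) (ashift (reg) (const_int 2)))
   is rewritten below into
     (plus (mult (reg) (const_int 4)) (reg))
   so that it matches the canonical base + index*scale shape.  */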
12957 static rtx
12958 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12959 enum machine_mode mode)
12960 {
12961 int changed = 0;
12962 unsigned log;
12963
12964 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12965 if (log)
12966 return legitimize_tls_address (x, (enum tls_model) log, false);
12967 if (GET_CODE (x) == CONST
12968 && GET_CODE (XEXP (x, 0)) == PLUS
12969 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12970 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12971 {
12972 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12973 (enum tls_model) log, false);
12974 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12975 }
12976
12977 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12978 {
12979 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12980 return legitimize_dllimport_symbol (x, true);
12981 if (GET_CODE (x) == CONST
12982 && GET_CODE (XEXP (x, 0)) == PLUS
12983 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12984 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12985 {
12986 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12987 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12988 }
12989 }
12990
12991 if (flag_pic && SYMBOLIC_CONST (x))
12992 return legitimize_pic_address (x, 0);
12993
12994 #if TARGET_MACHO
12995 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12996 return machopic_indirect_data_reference (x, 0);
12997 #endif
12998
12999 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13000 if (GET_CODE (x) == ASHIFT
13001 && CONST_INT_P (XEXP (x, 1))
13002 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13003 {
13004 changed = 1;
13005 log = INTVAL (XEXP (x, 1));
13006 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13007 GEN_INT (1 << log));
13008 }
13009
13010 if (GET_CODE (x) == PLUS)
13011 {
13012 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13013
13014 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13015 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13016 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13017 {
13018 changed = 1;
13019 log = INTVAL (XEXP (XEXP (x, 0), 1));
13020 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13021 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13022 GEN_INT (1 << log));
13023 }
13024
13025 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13026 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13027 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13028 {
13029 changed = 1;
13030 log = INTVAL (XEXP (XEXP (x, 1), 1));
13031 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13032 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13033 GEN_INT (1 << log));
13034 }
13035
13036 /* Put multiply first if it isn't already. */
13037 if (GET_CODE (XEXP (x, 1)) == MULT)
13038 {
13039 rtx tmp = XEXP (x, 0);
13040 XEXP (x, 0) = XEXP (x, 1);
13041 XEXP (x, 1) = tmp;
13042 changed = 1;
13043 }
13044
13045 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13046 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13047 created by virtual register instantiation, register elimination, and
13048 similar optimizations. */
13049 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13050 {
13051 changed = 1;
13052 x = gen_rtx_PLUS (Pmode,
13053 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13054 XEXP (XEXP (x, 1), 0)),
13055 XEXP (XEXP (x, 1), 1));
13056 }
13057
13058 /* Canonicalize
13059 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13060 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13061 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13062 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13063 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13064 && CONSTANT_P (XEXP (x, 1)))
13065 {
13066 rtx constant;
13067 rtx other = NULL_RTX;
13068
13069 if (CONST_INT_P (XEXP (x, 1)))
13070 {
13071 constant = XEXP (x, 1);
13072 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13073 }
13074 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13075 {
13076 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13077 other = XEXP (x, 1);
13078 }
13079 else
13080 constant = 0;
13081
13082 if (constant)
13083 {
13084 changed = 1;
13085 x = gen_rtx_PLUS (Pmode,
13086 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13087 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13088 plus_constant (Pmode, other,
13089 INTVAL (constant)));
13090 }
13091 }
13092
13093 if (changed && ix86_legitimate_address_p (mode, x, false))
13094 return x;
13095
13096 if (GET_CODE (XEXP (x, 0)) == MULT)
13097 {
13098 changed = 1;
13099 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13100 }
13101
13102 if (GET_CODE (XEXP (x, 1)) == MULT)
13103 {
13104 changed = 1;
13105 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13106 }
13107
13108 if (changed
13109 && REG_P (XEXP (x, 1))
13110 && REG_P (XEXP (x, 0)))
13111 return x;
13112
13113 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13114 {
13115 changed = 1;
13116 x = legitimize_pic_address (x, 0);
13117 }
13118
13119 if (changed && ix86_legitimate_address_p (mode, x, false))
13120 return x;
13121
13122 if (REG_P (XEXP (x, 0)))
13123 {
13124 rtx temp = gen_reg_rtx (Pmode);
13125 rtx val = force_operand (XEXP (x, 1), temp);
13126 if (val != temp)
13127 {
13128 if (GET_MODE (val) != Pmode)
13129 val = convert_to_mode (Pmode, val, 1);
13130 emit_move_insn (temp, val);
13131 }
13132
13133 XEXP (x, 1) = temp;
13134 return x;
13135 }
13136
13137 else if (REG_P (XEXP (x, 1)))
13138 {
13139 rtx temp = gen_reg_rtx (Pmode);
13140 rtx val = force_operand (XEXP (x, 0), temp);
13141 if (val != temp)
13142 {
13143 if (GET_MODE (val) != Pmode)
13144 val = convert_to_mode (Pmode, val, 1);
13145 emit_move_insn (temp, val);
13146 }
13147
13148 XEXP (x, 0) = temp;
13149 return x;
13150 }
13151 }
13152
13153 return x;
13154 }
13155 \f
13156 /* Print an integer constant expression in assembler syntax. Addition
13157 and subtraction are the only arithmetic that may appear in these
13158 expressions. FILE is the stdio stream to write to, X is the rtx, and
13159 CODE is the operand print code from the output string. */
13160
13161 static void
13162 output_pic_addr_const (FILE *file, rtx x, int code)
13163 {
13164 char buf[256];
13165
13166 switch (GET_CODE (x))
13167 {
13168 case PC:
13169 gcc_assert (flag_pic);
13170 putc ('.', file);
13171 break;
13172
13173 case SYMBOL_REF:
13174 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13175 output_addr_const (file, x);
13176 else
13177 {
13178 const char *name = XSTR (x, 0);
13179
13180 /* Mark the decl as referenced so that cgraph will
13181 output the function. */
13182 if (SYMBOL_REF_DECL (x))
13183 mark_decl_referenced (SYMBOL_REF_DECL (x));
13184
13185 #if TARGET_MACHO
13186 if (MACHOPIC_INDIRECT
13187 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13188 name = machopic_indirection_name (x, /*stub_p=*/true);
13189 #endif
13190 assemble_name (file, name);
13191 }
13192 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13193 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13194 fputs ("@PLT", file);
13195 break;
13196
13197 case LABEL_REF:
13198 x = XEXP (x, 0);
13199 /* FALLTHRU */
13200 case CODE_LABEL:
13201 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13202 assemble_name (asm_out_file, buf);
13203 break;
13204
13205 case CONST_INT:
13206 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13207 break;
13208
13209 case CONST:
13210 /* This used to output parentheses around the expression,
13211 but that does not work on the 386 (either ATT or BSD assembler). */
13212 output_pic_addr_const (file, XEXP (x, 0), code);
13213 break;
13214
13215 case CONST_DOUBLE:
13216 if (GET_MODE (x) == VOIDmode)
13217 {
13218 /* We can use %d if the number is <32 bits and positive. */
13219 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13220 fprintf (file, "0x%lx%08lx",
13221 (unsigned long) CONST_DOUBLE_HIGH (x),
13222 (unsigned long) CONST_DOUBLE_LOW (x));
13223 else
13224 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13225 }
13226 else
13227 /* We can't handle floating point constants;
13228 TARGET_PRINT_OPERAND must handle them. */
13229 output_operand_lossage ("floating constant misused");
13230 break;
13231
13232 case PLUS:
13233 /* Some assemblers need integer constants to appear first. */
13234 if (CONST_INT_P (XEXP (x, 0)))
13235 {
13236 output_pic_addr_const (file, XEXP (x, 0), code);
13237 putc ('+', file);
13238 output_pic_addr_const (file, XEXP (x, 1), code);
13239 }
13240 else
13241 {
13242 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13243 output_pic_addr_const (file, XEXP (x, 1), code);
13244 putc ('+', file);
13245 output_pic_addr_const (file, XEXP (x, 0), code);
13246 }
13247 break;
13248
13249 case MINUS:
13250 if (!TARGET_MACHO)
13251 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13252 output_pic_addr_const (file, XEXP (x, 0), code);
13253 putc ('-', file);
13254 output_pic_addr_const (file, XEXP (x, 1), code);
13255 if (!TARGET_MACHO)
13256 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13257 break;
13258
13259 case UNSPEC:
13260 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13261 {
13262 bool f = i386_asm_output_addr_const_extra (file, x);
13263 gcc_assert (f);
13264 break;
13265 }
13266
13267 gcc_assert (XVECLEN (x, 0) == 1);
13268 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13269 switch (XINT (x, 1))
13270 {
13271 case UNSPEC_GOT:
13272 fputs ("@GOT", file);
13273 break;
13274 case UNSPEC_GOTOFF:
13275 fputs ("@GOTOFF", file);
13276 break;
13277 case UNSPEC_PLTOFF:
13278 fputs ("@PLTOFF", file);
13279 break;
13280 case UNSPEC_PCREL:
13281 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13282 "(%rip)" : "[rip]", file);
13283 break;
13284 case UNSPEC_GOTPCREL:
13285 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13286 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13287 break;
13288 case UNSPEC_GOTTPOFF:
13289 /* FIXME: This might be @TPOFF in Sun ld too. */
13290 fputs ("@gottpoff", file);
13291 break;
13292 case UNSPEC_TPOFF:
13293 fputs ("@tpoff", file);
13294 break;
13295 case UNSPEC_NTPOFF:
13296 if (TARGET_64BIT)
13297 fputs ("@tpoff", file);
13298 else
13299 fputs ("@ntpoff", file);
13300 break;
13301 case UNSPEC_DTPOFF:
13302 fputs ("@dtpoff", file);
13303 break;
13304 case UNSPEC_GOTNTPOFF:
13305 if (TARGET_64BIT)
13306 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13307 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13308 else
13309 fputs ("@gotntpoff", file);
13310 break;
13311 case UNSPEC_INDNTPOFF:
13312 fputs ("@indntpoff", file);
13313 break;
13314 #if TARGET_MACHO
13315 case UNSPEC_MACHOPIC_OFFSET:
13316 putc ('-', file);
13317 machopic_output_function_base_name (file);
13318 break;
13319 #endif
13320 default:
13321 output_operand_lossage ("invalid UNSPEC as operand");
13322 break;
13323 }
13324 break;
13325
13326 default:
13327 output_operand_lossage ("invalid expression as operand");
13328 }
13329 }
13330
13331 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13332 We need to emit DTP-relative relocations. */
13333
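/* For a 4-byte entry this emits roughly ".long foo@dtpoff" (with
   "foo" as an illustrative symbol); for an 8-byte entry a zero upper
   word is appended, giving ".long foo@dtpoff, 0".  */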
13334 static void ATTRIBUTE_UNUSED
13335 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13336 {
13337 fputs (ASM_LONG, file);
13338 output_addr_const (file, x);
13339 fputs ("@dtpoff", file);
13340 switch (size)
13341 {
13342 case 4:
13343 break;
13344 case 8:
13345 fputs (", 0", file);
13346 break;
13347 default:
13348 gcc_unreachable ();
13349 }
13350 }
13351
13352 /* Return true if X is a representation of the PIC register. This copes
13353 with calls from ix86_find_base_term, where the register might have
13354 been replaced by a cselib value. */
13355
13356 static bool
13357 ix86_pic_register_p (rtx x)
13358 {
13359 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13360 return (pic_offset_table_rtx
13361 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13362 else
13363 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13364 }
13365
13366 /* Helper function for ix86_delegitimize_address.
13367 Attempt to delegitimize TLS local-exec accesses. */
13368
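/* E.g. a local-exec access such as %fs:foo@tpoff(%reg) reaches this
   point as (plus (reg) (const (unspec [foo] UNSPEC_NTPOFF))) with an
   FS/GS segment override, and is turned back into plain "foo" plus
   any base/index terms.  */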
13369 static rtx
13370 ix86_delegitimize_tls_address (rtx orig_x)
13371 {
13372 rtx x = orig_x, unspec;
13373 struct ix86_address addr;
13374
13375 if (!TARGET_TLS_DIRECT_SEG_REFS)
13376 return orig_x;
13377 if (MEM_P (x))
13378 x = XEXP (x, 0);
13379 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13380 return orig_x;
13381 if (ix86_decompose_address (x, &addr) == 0
13382 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13383 || addr.disp == NULL_RTX
13384 || GET_CODE (addr.disp) != CONST)
13385 return orig_x;
13386 unspec = XEXP (addr.disp, 0);
13387 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13388 unspec = XEXP (unspec, 0);
13389 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13390 return orig_x;
13391 x = XVECEXP (unspec, 0, 0);
13392 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13393 if (unspec != XEXP (addr.disp, 0))
13394 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13395 if (addr.index)
13396 {
13397 rtx idx = addr.index;
13398 if (addr.scale != 1)
13399 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13400 x = gen_rtx_PLUS (Pmode, idx, x);
13401 }
13402 if (addr.base)
13403 x = gen_rtx_PLUS (Pmode, addr.base, x);
13404 if (MEM_P (orig_x))
13405 x = replace_equiv_address_nv (orig_x, x);
13406 return x;
13407 }
13408
13409 /* In the name of slightly smaller debug output, and to cater to
13410 general assembler lossage, recognize PIC+GOTOFF and turn it back
13411 into a direct symbol reference.
13412
13413 On Darwin, this is necessary to avoid a crash, because Darwin
13414 has a different PIC label for each routine but the DWARF debugging
13415 information is not associated with any particular routine, so it's
13416 necessary to remove references to the PIC label from RTL stored by
13417 the DWARF output code. */
13418
13419 static rtx
13420 ix86_delegitimize_address (rtx x)
13421 {
13422 rtx orig_x = delegitimize_mem_from_attrs (x);
13423 /* addend is NULL or some rtx if x is something+GOTOFF where
13424 something doesn't include the PIC register. */
13425 rtx addend = NULL_RTX;
13426 /* reg_addend is NULL or a multiple of some register. */
13427 rtx reg_addend = NULL_RTX;
13428 /* const_addend is NULL or a const_int. */
13429 rtx const_addend = NULL_RTX;
13430 /* This is the result, or NULL. */
13431 rtx result = NULL_RTX;
13432
13433 x = orig_x;
13434
13435 if (MEM_P (x))
13436 x = XEXP (x, 0);
13437
13438 if (TARGET_64BIT)
13439 {
13440 if (GET_CODE (x) == CONST
13441 && GET_CODE (XEXP (x, 0)) == PLUS
13442 && GET_MODE (XEXP (x, 0)) == Pmode
13443 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13444 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13445 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13446 {
13447 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13448 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13449 if (MEM_P (orig_x))
13450 x = replace_equiv_address_nv (orig_x, x);
13451 return x;
13452 }
13453 if (GET_CODE (x) != CONST
13454 || GET_CODE (XEXP (x, 0)) != UNSPEC
13455 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
13456 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
13457 || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
13458 return ix86_delegitimize_tls_address (orig_x);
13459 x = XVECEXP (XEXP (x, 0), 0, 0);
13460 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13461 {
13462 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13463 GET_MODE (x), 0);
13464 if (x == NULL_RTX)
13465 return orig_x;
13466 }
13467 return x;
13468 }
13469
13470 if (GET_CODE (x) != PLUS
13471 || GET_CODE (XEXP (x, 1)) != CONST)
13472 return ix86_delegitimize_tls_address (orig_x);
13473
13474 if (ix86_pic_register_p (XEXP (x, 0)))
13475 /* %ebx + GOT/GOTOFF */
13476 ;
13477 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13478 {
13479 /* %ebx + %reg * scale + GOT/GOTOFF */
13480 reg_addend = XEXP (x, 0);
13481 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13482 reg_addend = XEXP (reg_addend, 1);
13483 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13484 reg_addend = XEXP (reg_addend, 0);
13485 else
13486 {
13487 reg_addend = NULL_RTX;
13488 addend = XEXP (x, 0);
13489 }
13490 }
13491 else
13492 addend = XEXP (x, 0);
13493
13494 x = XEXP (XEXP (x, 1), 0);
13495 if (GET_CODE (x) == PLUS
13496 && CONST_INT_P (XEXP (x, 1)))
13497 {
13498 const_addend = XEXP (x, 1);
13499 x = XEXP (x, 0);
13500 }
13501
13502 if (GET_CODE (x) == UNSPEC
13503 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13504 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
13505 result = XVECEXP (x, 0, 0);
13506
13507 if (TARGET_MACHO && darwin_local_data_pic (x)
13508 && !MEM_P (orig_x))
13509 result = XVECEXP (x, 0, 0);
13510
13511 if (! result)
13512 return ix86_delegitimize_tls_address (orig_x);
13513
13514 if (const_addend)
13515 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13516 if (reg_addend)
13517 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13518 if (addend)
13519 {
13520 /* If the rest of original X doesn't involve the PIC register, add
13521 addend and subtract pic_offset_table_rtx. This can happen e.g.
13522 for code like:
13523 leal (%ebx, %ecx, 4), %ecx
13524 ...
13525 movl foo@GOTOFF(%ecx), %edx
13526 in which case we return (%ecx - %ebx) + foo. */
13527 if (pic_offset_table_rtx)
13528 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13529 pic_offset_table_rtx),
13530 result);
13531 else
13532 return orig_x;
13533 }
13534 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13535 {
13536 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13537 if (result == NULL_RTX)
13538 return orig_x;
13539 }
13540 return result;
13541 }
13542
13543 /* If X is a machine specific address (i.e. a symbol or label being
13544 referenced as a displacement from the GOT implemented using an
13545 UNSPEC), then return the base term. Otherwise return X. */
13546
13547 rtx
13548 ix86_find_base_term (rtx x)
13549 {
13550 rtx term;
13551
13552 if (TARGET_64BIT)
13553 {
13554 if (GET_CODE (x) != CONST)
13555 return x;
13556 term = XEXP (x, 0);
13557 if (GET_CODE (term) == PLUS
13558 && (CONST_INT_P (XEXP (term, 1))
13559 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13560 term = XEXP (term, 0);
13561 if (GET_CODE (term) != UNSPEC
13562 || (XINT (term, 1) != UNSPEC_GOTPCREL
13563 && XINT (term, 1) != UNSPEC_PCREL))
13564 return x;
13565
13566 return XVECEXP (term, 0, 0);
13567 }
13568
13569 return ix86_delegitimize_address (x);
13570 }
13571 \f
13572 static void
13573 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13574 bool fp, FILE *file)
13575 {
13576 const char *suffix;
13577
13578 if (mode == CCFPmode || mode == CCFPUmode)
13579 {
13580 code = ix86_fp_compare_code_to_integer (code);
13581 mode = CCmode;
13582 }
13583 if (reverse)
13584 code = reverse_condition (code);
13585
13586 switch (code)
13587 {
13588 case EQ:
13589 switch (mode)
13590 {
13591 case CCAmode:
13592 suffix = "a";
13593 break;
13594
13595 case CCCmode:
13596 suffix = "c";
13597 break;
13598
13599 case CCOmode:
13600 suffix = "o";
13601 break;
13602
13603 case CCSmode:
13604 suffix = "s";
13605 break;
13606
13607 default:
13608 suffix = "e";
13609 }
13610 break;
13611 case NE:
13612 switch (mode)
13613 {
13614 case CCAmode:
13615 suffix = "na";
13616 break;
13617
13618 case CCCmode:
13619 suffix = "nc";
13620 break;
13621
13622 case CCOmode:
13623 suffix = "no";
13624 break;
13625
13626 case CCSmode:
13627 suffix = "ns";
13628 break;
13629
13630 default:
13631 suffix = "ne";
13632 }
13633 break;
13634 case GT:
13635 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13636 suffix = "g";
13637 break;
13638 case GTU:
13639 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13640 Those same assemblers have the same but opposite lossage on cmov. */
13641 if (mode == CCmode)
13642 suffix = fp ? "nbe" : "a";
13643 else if (mode == CCCmode)
13644 suffix = "b";
13645 else
13646 gcc_unreachable ();
13647 break;
13648 case LT:
13649 switch (mode)
13650 {
13651 case CCNOmode:
13652 case CCGOCmode:
13653 suffix = "s";
13654 break;
13655
13656 case CCmode:
13657 case CCGCmode:
13658 suffix = "l";
13659 break;
13660
13661 default:
13662 gcc_unreachable ();
13663 }
13664 break;
13665 case LTU:
13666 gcc_assert (mode == CCmode || mode == CCCmode);
13667 suffix = "b";
13668 break;
13669 case GE:
13670 switch (mode)
13671 {
13672 case CCNOmode:
13673 case CCGOCmode:
13674 suffix = "ns";
13675 break;
13676
13677 case CCmode:
13678 case CCGCmode:
13679 suffix = "ge";
13680 break;
13681
13682 default:
13683 gcc_unreachable ();
13684 }
13685 break;
13686 case GEU:
13687 /* ??? As above. */
13688 gcc_assert (mode == CCmode || mode == CCCmode);
13689 suffix = fp ? "nb" : "ae";
13690 break;
13691 case LE:
13692 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13693 suffix = "le";
13694 break;
13695 case LEU:
13696 /* ??? As above. */
13697 if (mode == CCmode)
13698 suffix = "be";
13699 else if (mode == CCCmode)
13700 suffix = fp ? "nb" : "ae";
13701 else
13702 gcc_unreachable ();
13703 break;
13704 case UNORDERED:
13705 suffix = fp ? "u" : "p";
13706 break;
13707 case ORDERED:
13708 suffix = fp ? "nu" : "np";
13709 break;
13710 default:
13711 gcc_unreachable ();
13712 }
13713 fputs (suffix, file);
13714 }
13715
13716 /* Print the name of register X to FILE based on its machine mode and number.
13717 If CODE is 'w', pretend the mode is HImode.
13718 If CODE is 'b', pretend the mode is QImode.
13719 If CODE is 'k', pretend the mode is SImode.
13720 If CODE is 'q', pretend the mode is DImode.
13721 If CODE is 'x', pretend the mode is V4SFmode.
13722 If CODE is 't', pretend the mode is V8SFmode.
13723 If CODE is 'h', pretend the reg is the 'high' byte register.
13724 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13725 If CODE is 'd', duplicate the operand for AVX instruction.
13726 */
13727
13728 void
13729 print_reg (rtx x, int code, FILE *file)
13730 {
13731 const char *reg;
13732 unsigned int regno;
13733 bool duplicated = code == 'd' && TARGET_AVX;
13734
13735 if (ASSEMBLER_DIALECT == ASM_ATT)
13736 putc ('%', file);
13737
13738 if (x == pc_rtx)
13739 {
13740 gcc_assert (TARGET_64BIT);
13741 fputs ("rip", file);
13742 return;
13743 }
13744
13745 regno = true_regnum (x);
13746 gcc_assert (regno != ARG_POINTER_REGNUM
13747 && regno != FRAME_POINTER_REGNUM
13748 && regno != FLAGS_REG
13749 && regno != FPSR_REG
13750 && regno != FPCR_REG);
13751
13752 if (code == 'w' || MMX_REG_P (x))
13753 code = 2;
13754 else if (code == 'b')
13755 code = 1;
13756 else if (code == 'k')
13757 code = 4;
13758 else if (code == 'q')
13759 code = 8;
13760 else if (code == 'y')
13761 code = 3;
13762 else if (code == 'h')
13763 code = 0;
13764 else if (code == 'x')
13765 code = 16;
13766 else if (code == 't')
13767 code = 32;
13768 else
13769 code = GET_MODE_SIZE (GET_MODE (x));
13770
13771 /* Irritatingly, AMD extended registers use a different naming convention
13772 from the normal registers: "r%d[bwd]". */
13773 if (REX_INT_REGNO_P (regno))
13774 {
13775 gcc_assert (TARGET_64BIT);
13776 putc ('r', file);
13777 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13778 switch (code)
13779 {
13780 case 0:
13781 error ("extended registers have no high halves");
13782 break;
13783 case 1:
13784 putc ('b', file);
13785 break;
13786 case 2:
13787 putc ('w', file);
13788 break;
13789 case 4:
13790 putc ('d', file);
13791 break;
13792 case 8:
13793 /* no suffix */
13794 break;
13795 default:
13796 error ("unsupported operand size for extended register");
13797 break;
13798 }
13799 return;
13800 }
13801
13802 reg = NULL;
13803 switch (code)
13804 {
13805 case 3:
13806 if (STACK_TOP_P (x))
13807 {
13808 reg = "st(0)";
13809 break;
13810 }
13811 /* FALLTHRU */
13812 case 8:
13813 case 4:
13814 case 12:
13815 if (! ANY_FP_REG_P (x))
13816 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13817 /* FALLTHRU */
13818 case 16:
13819 case 2:
13820 normal:
13821 reg = hi_reg_name[regno];
13822 break;
13823 case 1:
13824 if (regno >= ARRAY_SIZE (qi_reg_name))
13825 goto normal;
13826 reg = qi_reg_name[regno];
13827 break;
13828 case 0:
13829 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13830 goto normal;
13831 reg = qi_high_reg_name[regno];
13832 break;
13833 case 32:
13834 if (SSE_REG_P (x))
13835 {
13836 gcc_assert (!duplicated);
13837 putc ('y', file);
13838 fputs (hi_reg_name[regno] + 1, file);
13839 return;
13840 }
13841 break;
13842 default:
13843 gcc_unreachable ();
13844 }
13845
13846 fputs (reg, file);
13847 if (duplicated)
13848 {
13849 if (ASSEMBLER_DIALECT == ASM_ATT)
13850 fprintf (file, ", %%%s", reg);
13851 else
13852 fprintf (file, ", %s", reg);
13853 }
13854 }
13855
13856 /* Locate some local-dynamic symbol still in use by this function
13857 so that we can print its name in some tls_local_dynamic_base
13858 pattern. */
13859
13860 static int
13861 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13862 {
13863 rtx x = *px;
13864
13865 if (GET_CODE (x) == SYMBOL_REF
13866 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13867 {
13868 cfun->machine->some_ld_name = XSTR (x, 0);
13869 return 1;
13870 }
13871
13872 return 0;
13873 }
13874
13875 static const char *
13876 get_some_local_dynamic_name (void)
13877 {
13878 rtx insn;
13879
13880 if (cfun->machine->some_ld_name)
13881 return cfun->machine->some_ld_name;
13882
13883 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13884 if (NONDEBUG_INSN_P (insn)
13885 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13886 return cfun->machine->some_ld_name;
13887
13888 return NULL;
13889 }
13890
13891 /* Meaning of CODE:
13892 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13893 C -- print opcode suffix for set/cmov insn.
13894 c -- like C, but print reversed condition
13895 F,f -- likewise, but for floating-point.
13896 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13897 otherwise nothing
13898 R -- print the prefix for register names.
13899 z -- print the opcode suffix for the size of the current operand.
13900 Z -- likewise, with special suffixes for x87 instructions.
13901 * -- print a star (in certain assembler syntax)
13902 A -- print an absolute memory reference.
13903 E -- print address with DImode register names if TARGET_64BIT.
13904 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13905 s -- print a shift double count, followed by the assembler's argument
13906 delimiter.
13907 b -- print the QImode name of the register for the indicated operand.
13908 %b0 would print %al if operands[0] is reg 0.
13909 w -- likewise, print the HImode name of the register.
13910 k -- likewise, print the SImode name of the register.
13911 q -- likewise, print the DImode name of the register.
13912 x -- likewise, print the V4SFmode name of the register.
13913 t -- likewise, print the V8SFmode name of the register.
13914 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13915 y -- print "st(0)" instead of "st" as a register.
13916 d -- print duplicated register operand for AVX instruction.
13917 D -- print condition for SSE cmp instruction.
13918 P -- if PIC, print an @PLT suffix.
13919 p -- print raw symbol name.
13920 X -- don't print any sort of PIC '@' suffix for a symbol.
13921 & -- print some in-use local-dynamic symbol name.
13922 H -- print a memory address offset by 8; used for sse high-parts
13923 Y -- print condition for XOP pcom* instruction.
13924 + -- print a branch hint as 'cs' or 'ds' prefix
13925 ; -- print a semicolon (after prefixes due to bug in older gas).
13926 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
13927 @ -- print a segment register of thread base pointer load
13928 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
13929 */
13930
13931 void
13932 ix86_print_operand (FILE *file, rtx x, int code)
13933 {
13934 if (code)
13935 {
13936 switch (code)
13937 {
13938 case 'A':
13939 switch (ASSEMBLER_DIALECT)
13940 {
13941 case ASM_ATT:
13942 putc ('*', file);
13943 break;
13944
13945 case ASM_INTEL:
13946 /* Intel syntax. For absolute addresses, registers should not
13947 be surrounded by brackets. */
13948 if (!REG_P (x))
13949 {
13950 putc ('[', file);
13951 ix86_print_operand (file, x, 0);
13952 putc (']', file);
13953 return;
13954 }
13955 break;
13956
13957 default:
13958 gcc_unreachable ();
13959 }
13960
13961 ix86_print_operand (file, x, 0);
13962 return;
13963
13964 case 'E':
13965 /* Wrap address in an UNSPEC to declare special handling. */
13966 if (TARGET_64BIT)
13967 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
13968
13969 output_address (x);
13970 return;
13971
13972 case 'L':
13973 if (ASSEMBLER_DIALECT == ASM_ATT)
13974 putc ('l', file);
13975 return;
13976
13977 case 'W':
13978 if (ASSEMBLER_DIALECT == ASM_ATT)
13979 putc ('w', file);
13980 return;
13981
13982 case 'B':
13983 if (ASSEMBLER_DIALECT == ASM_ATT)
13984 putc ('b', file);
13985 return;
13986
13987 case 'Q':
13988 if (ASSEMBLER_DIALECT == ASM_ATT)
13989 putc ('l', file);
13990 return;
13991
13992 case 'S':
13993 if (ASSEMBLER_DIALECT == ASM_ATT)
13994 putc ('s', file);
13995 return;
13996
13997 case 'T':
13998 if (ASSEMBLER_DIALECT == ASM_ATT)
13999 putc ('t', file);
14000 return;
14001
14002 case 'O':
14003 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14004 if (ASSEMBLER_DIALECT != ASM_ATT)
14005 return;
14006
14007 switch (GET_MODE_SIZE (GET_MODE (x)))
14008 {
14009 case 2:
14010 putc ('w', file);
14011 break;
14012
14013 case 4:
14014 putc ('l', file);
14015 break;
14016
14017 case 8:
14018 putc ('q', file);
14019 break;
14020
14021 default:
14022 output_operand_lossage
14023 ("invalid operand size for operand code 'O'");
14024 return;
14025 }
14026
14027 putc ('.', file);
14028 #endif
14029 return;
14030
14031 case 'z':
14032 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14033 {
14034 /* Opcodes don't get size suffixes if using Intel syntax.  */
14035 if (ASSEMBLER_DIALECT == ASM_INTEL)
14036 return;
14037
14038 switch (GET_MODE_SIZE (GET_MODE (x)))
14039 {
14040 case 1:
14041 putc ('b', file);
14042 return;
14043
14044 case 2:
14045 putc ('w', file);
14046 return;
14047
14048 case 4:
14049 putc ('l', file);
14050 return;
14051
14052 case 8:
14053 putc ('q', file);
14054 return;
14055
14056 default:
14057 output_operand_lossage
14058 ("invalid operand size for operand code 'z'");
14059 return;
14060 }
14061 }
14062
14063 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14064 warning
14065 (0, "non-integer operand used with operand code 'z'");
14066 /* FALLTHRU */
14067
14068 case 'Z':
14069 /* 387 opcodes don't get size suffixes if using Intel syntax.  */
14070 if (ASSEMBLER_DIALECT == ASM_INTEL)
14071 return;
14072
14073 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14074 {
14075 switch (GET_MODE_SIZE (GET_MODE (x)))
14076 {
14077 case 2:
14078 #ifdef HAVE_AS_IX86_FILDS
14079 putc ('s', file);
14080 #endif
14081 return;
14082
14083 case 4:
14084 putc ('l', file);
14085 return;
14086
14087 case 8:
14088 #ifdef HAVE_AS_IX86_FILDQ
14089 putc ('q', file);
14090 #else
14091 fputs ("ll", file);
14092 #endif
14093 return;
14094
14095 default:
14096 break;
14097 }
14098 }
14099 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14100 {
14101 /* 387 opcodes don't get size suffixes
14102 if the operands are registers. */
14103 if (STACK_REG_P (x))
14104 return;
14105
14106 switch (GET_MODE_SIZE (GET_MODE (x)))
14107 {
14108 case 4:
14109 putc ('s', file);
14110 return;
14111
14112 case 8:
14113 putc ('l', file);
14114 return;
14115
14116 case 12:
14117 case 16:
14118 putc ('t', file);
14119 return;
14120
14121 default:
14122 break;
14123 }
14124 }
14125 else
14126 {
14127 output_operand_lossage
14128 ("invalid operand type used with operand code 'Z'");
14129 return;
14130 }
14131
14132 output_operand_lossage
14133 ("invalid operand size for operand code 'Z'");
14134 return;
14135
14136 case 'd':
14137 case 'b':
14138 case 'w':
14139 case 'k':
14140 case 'q':
14141 case 'h':
14142 case 't':
14143 case 'y':
14144 case 'x':
14145 case 'X':
14146 case 'P':
14147 case 'p':
14148 break;
14149
14150 case 's':
14151 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14152 {
14153 ix86_print_operand (file, x, 0);
14154 fputs (", ", file);
14155 }
14156 return;
14157
14158 case 'Y':
14159 switch (GET_CODE (x))
14160 {
14161 case NE:
14162 fputs ("neq", file);
14163 break;
14164 case EQ:
14165 fputs ("eq", file);
14166 break;
14167 case GE:
14168 case GEU:
14169 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14170 break;
14171 case GT:
14172 case GTU:
14173 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14174 break;
14175 case LE:
14176 case LEU:
14177 fputs ("le", file);
14178 break;
14179 case LT:
14180 case LTU:
14181 fputs ("lt", file);
14182 break;
14183 case UNORDERED:
14184 fputs ("unord", file);
14185 break;
14186 case ORDERED:
14187 fputs ("ord", file);
14188 break;
14189 case UNEQ:
14190 fputs ("ueq", file);
14191 break;
14192 case UNGE:
14193 fputs ("nlt", file);
14194 break;
14195 case UNGT:
14196 fputs ("nle", file);
14197 break;
14198 case UNLE:
14199 fputs ("ule", file);
14200 break;
14201 case UNLT:
14202 fputs ("ult", file);
14203 break;
14204 case LTGT:
14205 fputs ("une", file);
14206 break;
14207 default:
14208 output_operand_lossage ("operand is not a condition code, "
14209 "invalid operand code 'Y'");
14210 return;
14211 }
14212 return;
14213
14214 case 'D':
14215 /* A little bit of braindamage here.  The SSE compare instructions
14216 use completely different names for the comparisons than the
14217 fp conditional moves do.  */
14218 switch (GET_CODE (x))
14219 {
14220 case UNEQ:
14221 if (TARGET_AVX)
14222 {
14223 fputs ("eq_us", file);
14224 break;
14225 }
14226 case EQ:
14227 fputs ("eq", file);
14228 break;
14229 case UNLT:
14230 if (TARGET_AVX)
14231 {
14232 fputs ("nge", file);
14233 break;
14234 }
14235 case LT:
14236 fputs ("lt", file);
14237 break;
14238 case UNLE:
14239 if (TARGET_AVX)
14240 {
14241 fputs ("ngt", file);
14242 break;
14243 }
14244 case LE:
14245 fputs ("le", file);
14246 break;
14247 case UNORDERED:
14248 fputs ("unord", file);
14249 break;
14250 case LTGT:
14251 if (TARGET_AVX)
14252 {
14253 fputs ("neq_oq", file);
14254 break;
14255 }
14256 case NE:
14257 fputs ("neq", file);
14258 break;
14259 case GE:
14260 if (TARGET_AVX)
14261 {
14262 fputs ("ge", file);
14263 break;
14264 }
14265 case UNGE:
14266 fputs ("nlt", file);
14267 break;
14268 case GT:
14269 if (TARGET_AVX)
14270 {
14271 fputs ("gt", file);
14272 break;
14273 }
14274 case UNGT:
14275 fputs ("nle", file);
14276 break;
14277 case ORDERED:
14278 fputs ("ord", file);
14279 break;
14280 default:
14281 output_operand_lossage ("operand is not a condition code, "
14282 "invalid operand code 'D'");
14283 return;
14284 }
14285 return;
14286
14287 case 'F':
14288 case 'f':
14289 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14290 if (ASSEMBLER_DIALECT == ASM_ATT)
14291 putc ('.', file);
14292 #endif
14293
14294 case 'C':
14295 case 'c':
14296 if (!COMPARISON_P (x))
14297 {
14298 output_operand_lossage ("operand is not a condition code, "
14299 "invalid operand code '%c'", code);
14300 return;
14301 }
14302 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14303 code == 'c' || code == 'f',
14304 code == 'F' || code == 'f',
14305 file);
14306 return;
14307
14308 case 'H':
14309 if (!offsettable_memref_p (x))
14310 {
14311 output_operand_lossage ("operand is not an offsettable memory "
14312 "reference, invalid operand code 'H'");
14313 return;
14314 }
14315 /* It doesn't actually matter what mode we use here, as we're
14316 only going to use this for printing. */
14317 x = adjust_address_nv (x, DImode, 8);
14318 break;
14319
14320 case 'K':
14321 gcc_assert (CONST_INT_P (x));
14322
14323 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14324 #ifdef HAVE_AS_IX86_HLE
14325 fputs ("xacquire ", file);
14326 #else
14327 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14328 #endif
14329 else if (INTVAL (x) & IX86_HLE_RELEASE)
14330 #ifdef HAVE_AS_IX86_HLE
14331 fputs ("xrelease ", file);
14332 #else
14333 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14334 #endif
14335 /* We do not want to print the value of the operand.  */
14336 return;
14337
14338 case '*':
14339 if (ASSEMBLER_DIALECT == ASM_ATT)
14340 putc ('*', file);
14341 return;
14342
14343 case '&':
14344 {
14345 const char *name = get_some_local_dynamic_name ();
14346 if (name == NULL)
14347 output_operand_lossage ("'%%&' used without any "
14348 "local dynamic TLS references");
14349 else
14350 assemble_name (file, name);
14351 return;
14352 }
14353
14354 case '+':
14355 {
14356 rtx x;
14357
14358 if (!optimize
14359 || optimize_function_for_size_p (cfun)
14360 || !TARGET_BRANCH_PREDICTION_HINTS)
14361 return;
14362
14363 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14364 if (x)
14365 {
14366 int pred_val = INTVAL (XEXP (x, 0));
14367
14368 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14369 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14370 {
14371 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14372 bool cputaken
14373 = final_forward_branch_p (current_output_insn) == 0;
14374
14375 /* Emit hints only in the case where the default branch
14376 prediction heuristics would fail.  */
14377 if (taken != cputaken)
14378 {
14379 /* We use 3e (DS) prefix for taken branches and
14380 2e (CS) prefix for not taken branches. */
14381 if (taken)
14382 fputs ("ds ; ", file);
14383 else
14384 fputs ("cs ; ", file);
14385 }
14386 }
14387 }
14388 return;
14389 }
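	/* Illustrative example (assuming -O2 and a REG_BR_PROB note on the
	   jump): a branch that the static predictor would guess wrongly gets
	   a segment-override prefix, e.g. "ds ; jne .L3" for a predicted-taken
	   forward branch, or "cs ; jne .L3" for a predicted-not-taken backward
	   branch.  */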
14390
14391 case ';':
14392 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14393 putc (';', file);
14394 #endif
14395 return;
14396
14397 case '@':
14398 if (ASSEMBLER_DIALECT == ASM_ATT)
14399 putc ('%', file);
14400
14401 /* The kernel uses a different segment register for performance
14402 reasons: this way a system call does not have to trash the userspace
14403 segment register, which would be expensive.  */
14404 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14405 fputs ("fs", file);
14406 else
14407 fputs ("gs", file);
14408 return;
14409
14410 case '~':
14411 putc (TARGET_AVX2 ? 'i' : 'f', file);
14412 return;
14413
14414 case '^':
14415 if (TARGET_64BIT && Pmode != word_mode)
14416 fputs ("addr32 ", file);
14417 return;
14418
14419 default:
14420 output_operand_lossage ("invalid operand code '%c'", code);
14421 }
14422 }
14423
14424 if (REG_P (x))
14425 print_reg (x, code, file);
14426
14427 else if (MEM_P (x))
14428 {
14429 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14430 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14431 && GET_MODE (x) != BLKmode)
14432 {
14433 const char * size;
14434 switch (GET_MODE_SIZE (GET_MODE (x)))
14435 {
14436 case 1: size = "BYTE"; break;
14437 case 2: size = "WORD"; break;
14438 case 4: size = "DWORD"; break;
14439 case 8: size = "QWORD"; break;
14440 case 12: size = "TBYTE"; break;
14441 case 16:
14442 if (GET_MODE (x) == XFmode)
14443 size = "TBYTE";
14444 else
14445 size = "XMMWORD";
14446 break;
14447 case 32: size = "YMMWORD"; break;
14448 default:
14449 gcc_unreachable ();
14450 }
14451
14452 /* Check for explicit size override (codes 'b', 'w', 'k',
14453 'q' and 'x') */
14454 if (code == 'b')
14455 size = "BYTE";
14456 else if (code == 'w')
14457 size = "WORD";
14458 else if (code == 'k')
14459 size = "DWORD";
14460 else if (code == 'q')
14461 size = "QWORD";
14462 else if (code == 'x')
14463 size = "XMMWORD";
14464
14465 fputs (size, file);
14466 fputs (" PTR ", file);
14467 }
14468
14469 x = XEXP (x, 0);
14470 /* Avoid (%rip) for call operands. */
14471 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14472 && !CONST_INT_P (x))
14473 output_addr_const (file, x);
14474 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14475 output_operand_lossage ("invalid constraints for operand");
14476 else
14477 output_address (x);
14478 }
14479
14480 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14481 {
14482 REAL_VALUE_TYPE r;
14483 long l;
14484
14485 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14486 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14487
14488 if (ASSEMBLER_DIALECT == ASM_ATT)
14489 putc ('$', file);
14490 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14491 if (code == 'q')
14492 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
14493 else
14494 fprintf (file, "0x%08x", (unsigned int) l);
14495 }
14496
14497 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14498 {
14499 REAL_VALUE_TYPE r;
14500 long l[2];
14501
14502 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14503 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14504
14505 if (ASSEMBLER_DIALECT == ASM_ATT)
14506 putc ('$', file);
14507 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14508 }
14509
14510 /* These float cases don't actually occur as immediate operands. */
14511 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14512 {
14513 char dstr[30];
14514
14515 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14516 fputs (dstr, file);
14517 }
14518
14519 else
14520 {
14521 /* We have patterns that allow zero sets of memory, for instance.
14522 In 64-bit mode, we should probably support all 8-byte vectors,
14523 since we can in fact encode that into an immediate. */
14524 if (GET_CODE (x) == CONST_VECTOR)
14525 {
14526 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14527 x = const0_rtx;
14528 }
14529
14530 if (code != 'P' && code != 'p')
14531 {
14532 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14533 {
14534 if (ASSEMBLER_DIALECT == ASM_ATT)
14535 putc ('$', file);
14536 }
14537 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14538 || GET_CODE (x) == LABEL_REF)
14539 {
14540 if (ASSEMBLER_DIALECT == ASM_ATT)
14541 putc ('$', file);
14542 else
14543 fputs ("OFFSET FLAT:", file);
14544 }
14545 }
14546 if (CONST_INT_P (x))
14547 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14548 else if (flag_pic || MACHOPIC_INDIRECT)
14549 output_pic_addr_const (file, x, code);
14550 else
14551 output_addr_const (file, x);
14552 }
14553 }
14554
14555 static bool
14556 ix86_print_operand_punct_valid_p (unsigned char code)
14557 {
14558 return (code == '@' || code == '*' || code == '+' || code == '&'
14559 || code == ';' || code == '~' || code == '^');
14560 }
14561 \f
14562 /* Print a memory operand whose address is ADDR. */
14563
14564 static void
14565 ix86_print_operand_address (FILE *file, rtx addr)
14566 {
14567 struct ix86_address parts;
14568 rtx base, index, disp;
14569 int scale;
14570 int ok;
14571 bool vsib = false;
14572 int code = 0;
14573
14574 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14575 {
14576 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14577 gcc_assert (parts.index == NULL_RTX);
14578 parts.index = XVECEXP (addr, 0, 1);
14579 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14580 addr = XVECEXP (addr, 0, 0);
14581 vsib = true;
14582 }
14583 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14584 {
14585 gcc_assert (TARGET_64BIT);
14586 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14587 code = 'q';
14588 }
14589 else
14590 ok = ix86_decompose_address (addr, &parts);
14591
14592 gcc_assert (ok);
14593
14594 base = parts.base;
14595 index = parts.index;
14596 disp = parts.disp;
14597 scale = parts.scale;
14598
14599 switch (parts.seg)
14600 {
14601 case SEG_DEFAULT:
14602 break;
14603 case SEG_FS:
14604 case SEG_GS:
14605 if (ASSEMBLER_DIALECT == ASM_ATT)
14606 putc ('%', file);
14607 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14608 break;
14609 default:
14610 gcc_unreachable ();
14611 }
14612
14613 /* Use one-byte-shorter RIP-relative addressing for 64bit mode.  */
14614 if (TARGET_64BIT && !base && !index)
14615 {
14616 rtx symbol = disp;
14617
14618 if (GET_CODE (disp) == CONST
14619 && GET_CODE (XEXP (disp, 0)) == PLUS
14620 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14621 symbol = XEXP (XEXP (disp, 0), 0);
14622
14623 if (GET_CODE (symbol) == LABEL_REF
14624 || (GET_CODE (symbol) == SYMBOL_REF
14625 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14626 base = pc_rtx;
14627 }
14628 if (!base && !index)
14629 {
14630 /* A displacement-only address requires special attention.  */
14631
14632 if (CONST_INT_P (disp))
14633 {
14634 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14635 fputs ("ds:", file);
14636 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14637 }
14638 else if (flag_pic)
14639 output_pic_addr_const (file, disp, 0);
14640 else
14641 output_addr_const (file, disp);
14642 }
14643 else
14644 {
14645 /* Print SImode register names to force addr32 prefix. */
14646 if (SImode_address_operand (addr, VOIDmode))
14647 {
14648 #ifdef ENABLE_CHECKING
14649 gcc_assert (TARGET_64BIT);
14650 switch (GET_CODE (addr))
14651 {
14652 case SUBREG:
14653 gcc_assert (GET_MODE (addr) == SImode);
14654 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14655 break;
14656 case ZERO_EXTEND:
14657 case AND:
14658 gcc_assert (GET_MODE (addr) == DImode);
14659 break;
14660 default:
14661 gcc_unreachable ();
14662 }
14663 #endif
14664 gcc_assert (!code);
14665 code = 'k';
14666 }
14667 else if (code == 0
14668 && TARGET_X32
14669 && disp
14670 && CONST_INT_P (disp)
14671 && INTVAL (disp) < -16*1024*1024)
14672 {
14673 /* X32 runs in 64-bit mode, where displacement, DISP, in
14674 address DISP(%r64), is encoded as 32-bit immediate sign-
14675 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14676 address is %r64 + 0xffffffffbffffd00. When %r64 <
14677 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14678 which is invalid for x32. The correct address is %r64
14679 - 0x40000300 == 0xf7ffdd64. To properly encode
14680 -0x40000300(%r64) for x32, we zero-extend negative
14681 displacement by forcing addr32 prefix which truncates
14682 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14683 zero-extend all negative displacements, including -1(%rsp).
14684 However, for small negative displacements, sign-extension
14685 won't cause overflow. We only zero-extend negative
14686 displacements if they are < -16*1024*1024, which is also used
14687 to check legitimate address displacements for PIC. */
14688 code = 'k';
14689 }
14690
14691 if (ASSEMBLER_DIALECT == ASM_ATT)
14692 {
14693 if (disp)
14694 {
14695 if (flag_pic)
14696 output_pic_addr_const (file, disp, 0);
14697 else if (GET_CODE (disp) == LABEL_REF)
14698 output_asm_label (disp);
14699 else
14700 output_addr_const (file, disp);
14701 }
14702
14703 putc ('(', file);
14704 if (base)
14705 print_reg (base, code, file);
14706 if (index)
14707 {
14708 putc (',', file);
14709 print_reg (index, vsib ? 0 : code, file);
14710 if (scale != 1 || vsib)
14711 fprintf (file, ",%d", scale);
14712 }
14713 putc (')', file);
14714 }
14715 else
14716 {
14717 rtx offset = NULL_RTX;
14718
14719 if (disp)
14720 {
14721 /* Pull out the offset of a symbol; print any symbol itself. */
14722 if (GET_CODE (disp) == CONST
14723 && GET_CODE (XEXP (disp, 0)) == PLUS
14724 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14725 {
14726 offset = XEXP (XEXP (disp, 0), 1);
14727 disp = gen_rtx_CONST (VOIDmode,
14728 XEXP (XEXP (disp, 0), 0));
14729 }
14730
14731 if (flag_pic)
14732 output_pic_addr_const (file, disp, 0);
14733 else if (GET_CODE (disp) == LABEL_REF)
14734 output_asm_label (disp);
14735 else if (CONST_INT_P (disp))
14736 offset = disp;
14737 else
14738 output_addr_const (file, disp);
14739 }
14740
14741 putc ('[', file);
14742 if (base)
14743 {
14744 print_reg (base, code, file);
14745 if (offset)
14746 {
14747 if (INTVAL (offset) >= 0)
14748 putc ('+', file);
14749 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14750 }
14751 }
14752 else if (offset)
14753 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14754 else
14755 putc ('0', file);
14756
14757 if (index)
14758 {
14759 putc ('+', file);
14760 print_reg (index, vsib ? 0 : code, file);
14761 if (scale != 1 || vsib)
14762 fprintf (file, "*%d", scale);
14763 }
14764 putc (']', file);
14765 }
14766 }
14767 }
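/* Illustrative example of the two dialects handled above: the address
   (plus (reg rbp) (plus (mult (reg rax) (const_int 4)) (const_int -4)))
   prints as "-4(%rbp,%rax,4)" in AT&T syntax and as "[rbp+rax*4-4]" in
   Intel syntax.  */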
14768
14769 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14770
14771 static bool
14772 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14773 {
14774 rtx op;
14775
14776 if (GET_CODE (x) != UNSPEC)
14777 return false;
14778
14779 op = XVECEXP (x, 0, 0);
14780 switch (XINT (x, 1))
14781 {
14782 case UNSPEC_GOTTPOFF:
14783 output_addr_const (file, op);
14784 /* FIXME: This might be @TPOFF in Sun ld. */
14785 fputs ("@gottpoff", file);
14786 break;
14787 case UNSPEC_TPOFF:
14788 output_addr_const (file, op);
14789 fputs ("@tpoff", file);
14790 break;
14791 case UNSPEC_NTPOFF:
14792 output_addr_const (file, op);
14793 if (TARGET_64BIT)
14794 fputs ("@tpoff", file);
14795 else
14796 fputs ("@ntpoff", file);
14797 break;
14798 case UNSPEC_DTPOFF:
14799 output_addr_const (file, op);
14800 fputs ("@dtpoff", file);
14801 break;
14802 case UNSPEC_GOTNTPOFF:
14803 output_addr_const (file, op);
14804 if (TARGET_64BIT)
14805 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14806 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14807 else
14808 fputs ("@gotntpoff", file);
14809 break;
14810 case UNSPEC_INDNTPOFF:
14811 output_addr_const (file, op);
14812 fputs ("@indntpoff", file);
14813 break;
14814 #if TARGET_MACHO
14815 case UNSPEC_MACHOPIC_OFFSET:
14816 output_addr_const (file, op);
14817 putc ('-', file);
14818 machopic_output_function_base_name (file);
14819 break;
14820 #endif
14821
14822 case UNSPEC_STACK_CHECK:
14823 {
14824 int offset;
14825
14826 gcc_assert (flag_split_stack);
14827
14828 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14829 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14830 #else
14831 gcc_unreachable ();
14832 #endif
14833
14834 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14835 }
14836 break;
14837
14838 default:
14839 return false;
14840 }
14841
14842 return true;
14843 }
14844 \f
14845 /* Split one or more double-mode RTL references into pairs of half-mode
14846 references. The RTL can be REG, offsettable MEM, integer constant, or
14847 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14848 split and "num" is its length. lo_half and hi_half are output arrays
14849 that parallel "operands". */
14850
14851 void
14852 split_double_mode (enum machine_mode mode, rtx operands[],
14853 int num, rtx lo_half[], rtx hi_half[])
14854 {
14855 enum machine_mode half_mode;
14856 unsigned int byte;
14857
14858 switch (mode)
14859 {
14860 case TImode:
14861 half_mode = DImode;
14862 break;
14863 case DImode:
14864 half_mode = SImode;
14865 break;
14866 default:
14867 gcc_unreachable ();
14868 }
14869
14870 byte = GET_MODE_SIZE (half_mode);
14871
14872 while (num--)
14873 {
14874 rtx op = operands[num];
14875
14876 /* simplify_subreg refuses to split volatile memory addresses,
14877 but we still have to handle them.  */
14878 if (MEM_P (op))
14879 {
14880 lo_half[num] = adjust_address (op, half_mode, 0);
14881 hi_half[num] = adjust_address (op, half_mode, byte);
14882 }
14883 else
14884 {
14885 lo_half[num] = simplify_gen_subreg (half_mode, op,
14886 GET_MODE (op) == VOIDmode
14887 ? mode : GET_MODE (op), 0);
14888 hi_half[num] = simplify_gen_subreg (half_mode, op,
14889 GET_MODE (op) == VOIDmode
14890 ? mode : GET_MODE (op), byte);
14891 }
14892 }
14893 }
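/* Illustrative use (a sketch, not taken from a caller in this file): on
   little-endian x86, splitting a DImode operand yields the low SImode word
   at offset 0 and the high word at offset 4, e.g.

     rtx lo[1], hi[1];
     split_double_mode (DImode, &operands[0], 1, lo, hi);

   after which lo[0] and hi[0] can be moved with two SImode instructions.  */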
14894 \f
14895 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14896 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14897 is the expression of the binary operation. The output may either be
14898 emitted here, or returned to the caller, like all output_* functions.
14899
14900 There is no guarantee that the operands are the same mode, as they
14901 might be within FLOAT or FLOAT_EXTEND expressions. */
14902
14903 #ifndef SYSV386_COMPAT
14904 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14905 wants to fix the assemblers because that causes incompatibility
14906 with gcc. No-one wants to fix gcc because that causes
14907 incompatibility with assemblers... You can use the option of
14908 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14909 #define SYSV386_COMPAT 1
14910 #endif
14911
14912 const char *
14913 output_387_binary_op (rtx insn, rtx *operands)
14914 {
14915 static char buf[40];
14916 const char *p;
14917 const char *ssep;
14918 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14919
14920 #ifdef ENABLE_CHECKING
14921 /* Even if we do not want to check the inputs, this documents input
14922 constraints, which helps in understanding the following code.  */
14923 if (STACK_REG_P (operands[0])
14924 && ((REG_P (operands[1])
14925 && REGNO (operands[0]) == REGNO (operands[1])
14926 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14927 || (REG_P (operands[2])
14928 && REGNO (operands[0]) == REGNO (operands[2])
14929 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14930 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14931 ; /* ok */
14932 else
14933 gcc_assert (is_sse);
14934 #endif
14935
14936 switch (GET_CODE (operands[3]))
14937 {
14938 case PLUS:
14939 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14940 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14941 p = "fiadd";
14942 else
14943 p = "fadd";
14944 ssep = "vadd";
14945 break;
14946
14947 case MINUS:
14948 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14949 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14950 p = "fisub";
14951 else
14952 p = "fsub";
14953 ssep = "vsub";
14954 break;
14955
14956 case MULT:
14957 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14958 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14959 p = "fimul";
14960 else
14961 p = "fmul";
14962 ssep = "vmul";
14963 break;
14964
14965 case DIV:
14966 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14967 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14968 p = "fidiv";
14969 else
14970 p = "fdiv";
14971 ssep = "vdiv";
14972 break;
14973
14974 default:
14975 gcc_unreachable ();
14976 }
14977
14978 if (is_sse)
14979 {
14980 if (TARGET_AVX)
14981 {
14982 strcpy (buf, ssep);
14983 if (GET_MODE (operands[0]) == SFmode)
14984 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14985 else
14986 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14987 }
14988 else
14989 {
14990 strcpy (buf, ssep + 1);
14991 if (GET_MODE (operands[0]) == SFmode)
14992 strcat (buf, "ss\t{%2, %0|%0, %2}");
14993 else
14994 strcat (buf, "sd\t{%2, %0|%0, %2}");
14995 }
14996 return buf;
14997 }
14998 strcpy (buf, p);
14999
15000 switch (GET_CODE (operands[3]))
15001 {
15002 case MULT:
15003 case PLUS:
15004 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15005 {
15006 rtx temp = operands[2];
15007 operands[2] = operands[1];
15008 operands[1] = temp;
15009 }
15010
15011 /* We now know operands[0] == operands[1].  */
15012
15013 if (MEM_P (operands[2]))
15014 {
15015 p = "%Z2\t%2";
15016 break;
15017 }
15018
15019 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15020 {
15021 if (STACK_TOP_P (operands[0]))
15022 /* How is it that we are storing to a dead operand[2]?
15023 Well, presumably operands[1] is dead too. We can't
15024 store the result to st(0) as st(0) gets popped on this
15025 instruction. Instead store to operands[2] (which I
15026 think has to be st(1)). st(1) will be popped later.
15027 gcc <= 2.8.1 didn't have this check and generated
15028 assembly code that the Unixware assembler rejected. */
15029 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15030 else
15031 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15032 break;
15033 }
15034
15035 if (STACK_TOP_P (operands[0]))
15036 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15037 else
15038 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15039 break;
15040
15041 case MINUS:
15042 case DIV:
15043 if (MEM_P (operands[1]))
15044 {
15045 p = "r%Z1\t%1";
15046 break;
15047 }
15048
15049 if (MEM_P (operands[2]))
15050 {
15051 p = "%Z2\t%2";
15052 break;
15053 }
15054
15055 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15056 {
15057 #if SYSV386_COMPAT
15058 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15059 derived assemblers, confusingly reverse the direction of
15060 the operation for fsub{r} and fdiv{r} when the
15061 destination register is not st(0). The Intel assembler
15062 doesn't have this brain damage. Read !SYSV386_COMPAT to
15063 figure out what the hardware really does. */
15064 if (STACK_TOP_P (operands[0]))
15065 p = "{p\t%0, %2|rp\t%2, %0}";
15066 else
15067 p = "{rp\t%2, %0|p\t%0, %2}";
15068 #else
15069 if (STACK_TOP_P (operands[0]))
15070 /* As above for fmul/fadd, we can't store to st(0). */
15071 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15072 else
15073 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15074 #endif
15075 break;
15076 }
15077
15078 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15079 {
15080 #if SYSV386_COMPAT
15081 if (STACK_TOP_P (operands[0]))
15082 p = "{rp\t%0, %1|p\t%1, %0}";
15083 else
15084 p = "{p\t%1, %0|rp\t%0, %1}";
15085 #else
15086 if (STACK_TOP_P (operands[0]))
15087 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15088 else
15089 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15090 #endif
15091 break;
15092 }
15093
15094 if (STACK_TOP_P (operands[0]))
15095 {
15096 if (STACK_TOP_P (operands[1]))
15097 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15098 else
15099 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15100 break;
15101 }
15102 else if (STACK_TOP_P (operands[1]))
15103 {
15104 #if SYSV386_COMPAT
15105 p = "{\t%1, %0|r\t%0, %1}";
15106 #else
15107 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15108 #endif
15109 }
15110 else
15111 {
15112 #if SYSV386_COMPAT
15113 p = "{r\t%2, %0|\t%0, %2}";
15114 #else
15115 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15116 #endif
15117 }
15118 break;
15119
15120 default:
15121 gcc_unreachable ();
15122 }
15123
15124 strcat (buf, p);
15125 return buf;
15126 }
15127
15128 /* Check if a 256bit AVX register is referenced inside of EXP. */
15129
15130 static int
15131 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15132 {
15133 rtx exp = *pexp;
15134
15135 if (GET_CODE (exp) == SUBREG)
15136 exp = SUBREG_REG (exp);
15137
15138 if (REG_P (exp)
15139 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15140 return 1;
15141
15142 return 0;
15143 }
15144
15145 /* Return needed mode for entity in optimize_mode_switching pass. */
15146
15147 static int
15148 ix86_avx_u128_mode_needed (rtx insn)
15149 {
15150 if (CALL_P (insn))
15151 {
15152 rtx link;
15153
15154 /* Needed mode is set to AVX_U128_CLEAN if there are
15155 no 256bit modes used in function arguments. */
15156 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15157 link;
15158 link = XEXP (link, 1))
15159 {
15160 if (GET_CODE (XEXP (link, 0)) == USE)
15161 {
15162 rtx arg = XEXP (XEXP (link, 0), 0);
15163
15164 if (ix86_check_avx256_register (&arg, NULL))
15165 return AVX_U128_ANY;
15166 }
15167 }
15168
15169 return AVX_U128_CLEAN;
15170 }
15171
15172 /* Require DIRTY mode if a 256bit AVX register is referenced. Hardware
15173 changes state only when a 256bit register is written to, but we need
15174 to prevent the compiler from moving the optimal insertion point above
15175 an eventual read from a 256bit register.  */
15176 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15177 return AVX_U128_DIRTY;
15178
15179 return AVX_U128_ANY;
15180 }
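/* Background note (informal): on AVX hardware a dirty upper-128 state can
   incur large transition penalties when legacy (non-VEX) SSE code is
   executed afterwards, e.g. a call into an SSE-only libm after 256bit
   vector code.  Tracking CLEAN/DIRTY here lets the mode-switching pass
   place vzeroupper only where it is actually needed.  */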
15181
15182 /* Return mode that i387 must be switched into
15183 prior to the execution of insn. */
15184
15185 static int
15186 ix86_i387_mode_needed (int entity, rtx insn)
15187 {
15188 enum attr_i387_cw mode;
15189
15190 /* The mode UNINITIALIZED is used to store the control word after a
15191 function call or ASM pattern.  The mode ANY specifies that the insn
15192 has no requirements on the control word and makes no changes in the
15193 bits we are interested in.  */
15194
15195 if (CALL_P (insn)
15196 || (NONJUMP_INSN_P (insn)
15197 && (asm_noperands (PATTERN (insn)) >= 0
15198 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15199 return I387_CW_UNINITIALIZED;
15200
15201 if (recog_memoized (insn) < 0)
15202 return I387_CW_ANY;
15203
15204 mode = get_attr_i387_cw (insn);
15205
15206 switch (entity)
15207 {
15208 case I387_TRUNC:
15209 if (mode == I387_CW_TRUNC)
15210 return mode;
15211 break;
15212
15213 case I387_FLOOR:
15214 if (mode == I387_CW_FLOOR)
15215 return mode;
15216 break;
15217
15218 case I387_CEIL:
15219 if (mode == I387_CW_CEIL)
15220 return mode;
15221 break;
15222
15223 case I387_MASK_PM:
15224 if (mode == I387_CW_MASK_PM)
15225 return mode;
15226 break;
15227
15228 default:
15229 gcc_unreachable ();
15230 }
15231
15232 return I387_CW_ANY;
15233 }
15234
15235 /* Return mode that entity must be switched into
15236 prior to the execution of insn. */
15237
15238 int
15239 ix86_mode_needed (int entity, rtx insn)
15240 {
15241 switch (entity)
15242 {
15243 case AVX_U128:
15244 return ix86_avx_u128_mode_needed (insn);
15245 case I387_TRUNC:
15246 case I387_FLOOR:
15247 case I387_CEIL:
15248 case I387_MASK_PM:
15249 return ix86_i387_mode_needed (entity, insn);
15250 default:
15251 gcc_unreachable ();
15252 }
15253 return 0;
15254 }
15255
15256 /* Check if a 256bit AVX register is referenced in stores. */
15257
15258 static void
15259 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15260 {
15261 if (ix86_check_avx256_register (&dest, NULL))
15262 {
15263 bool *used = (bool *) data;
15264 *used = true;
15265 }
15266 }
15267
15268 /* Calculate mode of upper 128bit AVX registers after the insn. */
15269
15270 static int
15271 ix86_avx_u128_mode_after (int mode, rtx insn)
15272 {
15273 rtx pat = PATTERN (insn);
15274
15275 if (vzeroupper_operation (pat, VOIDmode)
15276 || vzeroall_operation (pat, VOIDmode))
15277 return AVX_U128_CLEAN;
15278
15279 /* We know that the state is clean after a CALL insn if the function
15280 return value does not use a 256bit register.  */
15281 if (CALL_P (insn))
15282 {
15283 bool avx_reg256_found = false;
15284 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15285 if (!avx_reg256_found)
15286 return AVX_U128_CLEAN;
15287 }
15288
15289 /* Otherwise, return the current mode.  Remember that if the insn
15290 references AVX 256bit registers, the mode was already changed
15291 to DIRTY by MODE_NEEDED.  */
15292 return mode;
15293 }
15294
15295 /* Return the mode that an insn results in. */
15296
15297 int
15298 ix86_mode_after (int entity, int mode, rtx insn)
15299 {
15300 switch (entity)
15301 {
15302 case AVX_U128:
15303 return ix86_avx_u128_mode_after (mode, insn);
15304 case I387_TRUNC:
15305 case I387_FLOOR:
15306 case I387_CEIL:
15307 case I387_MASK_PM:
15308 return mode;
15309 default:
15310 gcc_unreachable ();
15311 }
15312 }
15313
15314 static int
15315 ix86_avx_u128_mode_entry (void)
15316 {
15317 tree arg;
15318
15319 /* Entry mode is set to AVX_U128_DIRTY if there are
15320 256bit modes used in function arguments. */
15321 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15322 arg = TREE_CHAIN (arg))
15323 {
15324 rtx incoming = DECL_INCOMING_RTL (arg);
15325
15326 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15327 return AVX_U128_DIRTY;
15328 }
15329
15330 return AVX_U128_CLEAN;
15331 }
15332
15333 /* Return a mode that ENTITY is assumed to be
15334 switched to at function entry. */
15335
15336 int
15337 ix86_mode_entry (int entity)
15338 {
15339 switch (entity)
15340 {
15341 case AVX_U128:
15342 return ix86_avx_u128_mode_entry ();
15343 case I387_TRUNC:
15344 case I387_FLOOR:
15345 case I387_CEIL:
15346 case I387_MASK_PM:
15347 return I387_CW_ANY;
15348 default:
15349 gcc_unreachable ();
15350 }
15351 }
15352
15353 static int
15354 ix86_avx_u128_mode_exit (void)
15355 {
15356 rtx reg = crtl->return_rtx;
15357
15358 /* Exit mode is set to AVX_U128_DIRTY if there are
15359 256bit modes used in the function return register. */
15360 if (reg && ix86_check_avx256_register (&reg, NULL))
15361 return AVX_U128_DIRTY;
15362
15363 return AVX_U128_CLEAN;
15364 }
15365
15366 /* Return a mode that ENTITY is assumed to be
15367 switched to at function exit. */
15368
15369 int
15370 ix86_mode_exit (int entity)
15371 {
15372 switch (entity)
15373 {
15374 case AVX_U128:
15375 return ix86_avx_u128_mode_exit ();
15376 case I387_TRUNC:
15377 case I387_FLOOR:
15378 case I387_CEIL:
15379 case I387_MASK_PM:
15380 return I387_CW_ANY;
15381 default:
15382 gcc_unreachable ();
15383 }
15384 }
15385
15386 /* Output code to initialize the control word copies used by the trunc?f?i
15387 and rounding patterns.  MODE selects the required rounding or masking
15388 mode; the adjusted control word is stored in its dedicated stack slot.  */
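/* Worked example of the bit manipulation below (x87 control word layout,
   rounding-control field in bits 11:10): OR-ing in 0x0c00 selects
   truncation (11b), while AND ~0x0c00 followed by OR 0x0400 selects
   round-down (01b) and OR 0x0800 selects round-up (10b); OR 0x0020 sets
   the PM bit, masking the precision exception for nearbyint().  */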
15389
15390 static void
15391 emit_i387_cw_initialization (int mode)
15392 {
15393 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15394 rtx new_mode;
15395
15396 enum ix86_stack_slot slot;
15397
15398 rtx reg = gen_reg_rtx (HImode);
15399
15400 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15401 emit_move_insn (reg, copy_rtx (stored_mode));
15402
15403 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15404 || optimize_function_for_size_p (cfun))
15405 {
15406 switch (mode)
15407 {
15408 case I387_CW_TRUNC:
15409 /* round toward zero (truncate) */
15410 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15411 slot = SLOT_CW_TRUNC;
15412 break;
15413
15414 case I387_CW_FLOOR:
15415 /* round down toward -oo */
15416 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15417 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15418 slot = SLOT_CW_FLOOR;
15419 break;
15420
15421 case I387_CW_CEIL:
15422 /* round up toward +oo */
15423 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15424 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15425 slot = SLOT_CW_CEIL;
15426 break;
15427
15428 case I387_CW_MASK_PM:
15429 /* mask precision exception for nearbyint() */
15430 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15431 slot = SLOT_CW_MASK_PM;
15432 break;
15433
15434 default:
15435 gcc_unreachable ();
15436 }
15437 }
15438 else
15439 {
15440 switch (mode)
15441 {
15442 case I387_CW_TRUNC:
15443 /* round toward zero (truncate) */
15444 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15445 slot = SLOT_CW_TRUNC;
15446 break;
15447
15448 case I387_CW_FLOOR:
15449 /* round down toward -oo */
15450 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15451 slot = SLOT_CW_FLOOR;
15452 break;
15453
15454 case I387_CW_CEIL:
15455 /* round up toward +oo */
15456 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15457 slot = SLOT_CW_CEIL;
15458 break;
15459
15460 case I387_CW_MASK_PM:
15461 /* mask precision exception for nearbyint() */
15462 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15463 slot = SLOT_CW_MASK_PM;
15464 break;
15465
15466 default:
15467 gcc_unreachable ();
15468 }
15469 }
15470
15471 gcc_assert (slot < MAX_386_STACK_LOCALS);
15472
15473 new_mode = assign_386_stack_local (HImode, slot);
15474 emit_move_insn (new_mode, reg);
15475 }
15476
15477 /* Emit vzeroupper. */
15478
15479 void
15480 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15481 {
15482 int i;
15483
15484 /* Cancel automatic vzeroupper insertion if there are
15485 live call-saved SSE registers at the insertion point. */
15486
15487 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15488 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15489 return;
15490
15491 if (TARGET_64BIT)
15492 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15493 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15494 return;
15495
15496 emit_insn (gen_avx_vzeroupper ());
15497 }
15498
15499 /* Generate one or more insns to set ENTITY to MODE. */
15500
15501 void
15502 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15503 {
15504 switch (entity)
15505 {
15506 case AVX_U128:
15507 if (mode == AVX_U128_CLEAN)
15508 ix86_avx_emit_vzeroupper (regs_live);
15509 break;
15510 case I387_TRUNC:
15511 case I387_FLOOR:
15512 case I387_CEIL:
15513 case I387_MASK_PM:
15514 if (mode != I387_CW_ANY
15515 && mode != I387_CW_UNINITIALIZED)
15516 emit_i387_cw_initialization (mode);
15517 break;
15518 default:
15519 gcc_unreachable ();
15520 }
15521 }
15522
15523 /* Output code for INSN to convert a float to a signed int. OPERANDS
15524 are the insn operands. The output may be [HSD]Imode and the input
15525 operand may be [SDX]Fmode. */
15526
15527 const char *
15528 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15529 {
15530 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15531 int dimode_p = GET_MODE (operands[0]) == DImode;
15532 int round_mode = get_attr_i387_cw (insn);
15533
15534 /* Jump through a hoop or two for DImode, since the hardware has no
15535 non-popping instruction. We used to do this a different way, but
15536 that was somewhat fragile and broke with post-reload splitters. */
15537 if ((dimode_p || fisttp) && !stack_top_dies)
15538 output_asm_insn ("fld\t%y1", operands);
15539
15540 gcc_assert (STACK_TOP_P (operands[1]));
15541 gcc_assert (MEM_P (operands[0]));
15542 gcc_assert (GET_MODE (operands[1]) != TFmode);
15543
15544 if (fisttp)
15545 output_asm_insn ("fisttp%Z0\t%0", operands);
15546 else
15547 {
15548 if (round_mode != I387_CW_ANY)
15549 output_asm_insn ("fldcw\t%3", operands);
15550 if (stack_top_dies || dimode_p)
15551 output_asm_insn ("fistp%Z0\t%0", operands);
15552 else
15553 output_asm_insn ("fist%Z0\t%0", operands);
15554 if (round_mode != I387_CW_ANY)
15555 output_asm_insn ("fldcw\t%2", operands);
15556 }
15557
15558 return "";
15559 }
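/* Illustrative output for a DImode truncation without SSE3 fisttp (a
   sketch; the exact operands depend on the insn, and the control-word
   slot names are hypothetical):

     fldcw   truncating_cw
     fistpll mem
     fldcw   saved_cw

   The popping fistp form is forced because the non-popping fist has no
   64-bit variant.  */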
15560
15561 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15562 have the values zero or one, indicates the ffreep insn's operand
15563 from the OPERANDS array. */
15564
15565 static const char *
15566 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15567 {
15568 if (TARGET_USE_FFREEP)
15569 #ifdef HAVE_AS_IX86_FFREEP
15570 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15571 #else
15572 {
15573 static char retval[32];
15574 int regno = REGNO (operands[opno]);
15575
15576 gcc_assert (STACK_REGNO_P (regno));
15577
15578 regno -= FIRST_STACK_REG;
15579
15580 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
15581 return retval;
15582 }
15583 #endif
15584
15585 return opno ? "fstp\t%y1" : "fstp\t%y0";
15586 }
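/* Note on the raw-byte fallback above: ffreep %st(i) encodes as the two
   bytes 0xdf, 0xc0+i, so emitting the little-endian 16-bit value 0xc<i>df
   with ASM_SHORT produces exactly that byte sequence for assemblers that
   do not know the mnemonic.  */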
15587
15588
15589 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15590 should be used. UNORDERED_P is true when fucom should be used. */
15591
15592 const char *
15593 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15594 {
15595 int stack_top_dies;
15596 rtx cmp_op0, cmp_op1;
15597 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15598
15599 if (eflags_p)
15600 {
15601 cmp_op0 = operands[0];
15602 cmp_op1 = operands[1];
15603 }
15604 else
15605 {
15606 cmp_op0 = operands[1];
15607 cmp_op1 = operands[2];
15608 }
15609
15610 if (is_sse)
15611 {
15612 if (GET_MODE (operands[0]) == SFmode)
15613 if (unordered_p)
15614 return "%vucomiss\t{%1, %0|%0, %1}";
15615 else
15616 return "%vcomiss\t{%1, %0|%0, %1}";
15617 else
15618 if (unordered_p)
15619 return "%vucomisd\t{%1, %0|%0, %1}";
15620 else
15621 return "%vcomisd\t{%1, %0|%0, %1}";
15622 }
15623
15624 gcc_assert (STACK_TOP_P (cmp_op0));
15625
15626 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15627
15628 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15629 {
15630 if (stack_top_dies)
15631 {
15632 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15633 return output_387_ffreep (operands, 1);
15634 }
15635 else
15636 return "ftst\n\tfnstsw\t%0";
15637 }
15638
15639 if (STACK_REG_P (cmp_op1)
15640 && stack_top_dies
15641 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15642 && REGNO (cmp_op1) != FIRST_STACK_REG)
15643 {
15644 /* If the top of the 387 stack dies, and the other operand
15645 is also a stack register that dies, then this must be a
15646 `fcompp' float compare.  */
15647
15648 if (eflags_p)
15649 {
15650 /* There is no double popping fcomi variant. Fortunately,
15651 eflags is immune from the fstp's cc clobbering. */
15652 if (unordered_p)
15653 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15654 else
15655 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15656 return output_387_ffreep (operands, 0);
15657 }
15658 else
15659 {
15660 if (unordered_p)
15661 return "fucompp\n\tfnstsw\t%0";
15662 else
15663 return "fcompp\n\tfnstsw\t%0";
15664 }
15665 }
15666 else
15667 {
15668 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15669
15670 static const char * const alt[16] =
15671 {
15672 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15673 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15674 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15675 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15676
15677 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15678 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15679 NULL,
15680 NULL,
15681
15682 "fcomi\t{%y1, %0|%0, %y1}",
15683 "fcomip\t{%y1, %0|%0, %y1}",
15684 "fucomi\t{%y1, %0|%0, %y1}",
15685 "fucomip\t{%y1, %0|%0, %y1}",
15686
15687 NULL,
15688 NULL,
15689 NULL,
15690 NULL
15691 };
15692
15693 int mask;
15694 const char *ret;
15695
15696 mask = eflags_p << 3;
15697 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15698 mask |= unordered_p << 1;
15699 mask |= stack_top_dies;
15700
15701 gcc_assert (mask < 16);
15702 ret = alt[mask];
15703 gcc_assert (ret);
15704
15705 return ret;
15706 }
15707 }
15708
15709 void
15710 ix86_output_addr_vec_elt (FILE *file, int value)
15711 {
15712 const char *directive = ASM_LONG;
15713
15714 #ifdef ASM_QUAD
15715 if (TARGET_LP64)
15716 directive = ASM_QUAD;
15717 #else
15718 gcc_assert (!TARGET_64BIT);
15719 #endif
15720
15721 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15722 }
15723
15724 void
15725 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15726 {
15727 const char *directive = ASM_LONG;
15728
15729 #ifdef ASM_QUAD
15730 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15731 directive = ASM_QUAD;
15732 #else
15733 gcc_assert (!TARGET_64BIT);
15734 #endif
15735 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15736 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15737 fprintf (file, "%s%s%d-%s%d\n",
15738 directive, LPREFIX, value, LPREFIX, rel);
15739 else if (HAVE_AS_GOTOFF_IN_DATA)
15740 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15741 #if TARGET_MACHO
15742 else if (TARGET_MACHO)
15743 {
15744 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15745 machopic_output_function_base_name (file);
15746 putc ('\n', file);
15747 }
15748 #endif
15749 else
15750 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15751 GOT_SYMBOL_NAME, LPREFIX, value);
15752 }
15753 \f
15754 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15755 for the target. */
15756
15757 void
15758 ix86_expand_clear (rtx dest)
15759 {
15760 rtx tmp;
15761
15762 /* We play register width games, which are only valid after reload. */
15763 gcc_assert (reload_completed);
15764
15765 /* Avoid HImode and its attendant prefix byte. */
15766 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15767 dest = gen_rtx_REG (SImode, REGNO (dest));
15768 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15769
15770 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15771 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15772 {
15773 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15774 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
15775 }
15776
15777 emit_insn (tmp);
15778 }
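/* Rationale, informally: "xorl %eax, %eax" is a 2-byte encoding versus
   5 bytes for "movl $0, %eax", but it clobbers the flags, which is why
   the xor form above is wrapped in a PARALLEL with a FLAGS_REG clobber.  */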
15779
15780 /* X is an unchanging MEM. If it is a constant pool reference, return
15781 the constant pool rtx, else NULL. */
15782
15783 rtx
15784 maybe_get_pool_constant (rtx x)
15785 {
15786 x = ix86_delegitimize_address (XEXP (x, 0));
15787
15788 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15789 return get_pool_constant (x);
15790
15791 return NULL_RTX;
15792 }
15793
15794 void
15795 ix86_expand_move (enum machine_mode mode, rtx operands[])
15796 {
15797 rtx op0, op1;
15798 enum tls_model model;
15799
15800 op0 = operands[0];
15801 op1 = operands[1];
15802
15803 if (GET_CODE (op1) == SYMBOL_REF)
15804 {
15805 model = SYMBOL_REF_TLS_MODEL (op1);
15806 if (model)
15807 {
15808 op1 = legitimize_tls_address (op1, model, true);
15809 op1 = force_operand (op1, op0);
15810 if (op1 == op0)
15811 return;
15812 if (GET_MODE (op1) != mode)
15813 op1 = convert_to_mode (mode, op1, 1);
15814 }
15815 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15816 && SYMBOL_REF_DLLIMPORT_P (op1))
15817 op1 = legitimize_dllimport_symbol (op1, false);
15818 }
15819 else if (GET_CODE (op1) == CONST
15820 && GET_CODE (XEXP (op1, 0)) == PLUS
15821 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15822 {
15823 rtx addend = XEXP (XEXP (op1, 0), 1);
15824 rtx symbol = XEXP (XEXP (op1, 0), 0);
15825 rtx tmp = NULL;
15826
15827 model = SYMBOL_REF_TLS_MODEL (symbol);
15828 if (model)
15829 tmp = legitimize_tls_address (symbol, model, true);
15830 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15831 && SYMBOL_REF_DLLIMPORT_P (symbol))
15832 tmp = legitimize_dllimport_symbol (symbol, true);
15833
15834 if (tmp)
15835 {
15836 tmp = force_operand (tmp, NULL);
15837 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15838 op0, 1, OPTAB_DIRECT);
15839 if (tmp == op0)
15840 return;
15841 if (GET_MODE (tmp) != mode)
15842 op1 = convert_to_mode (mode, tmp, 1);
15843 }
15844 }
15845
15846 if ((flag_pic || MACHOPIC_INDIRECT)
15847 && symbolic_operand (op1, mode))
15848 {
15849 if (TARGET_MACHO && !TARGET_64BIT)
15850 {
15851 #if TARGET_MACHO
15852 /* dynamic-no-pic */
15853 if (MACHOPIC_INDIRECT)
15854 {
15855 rtx temp = ((reload_in_progress
15856 || ((op0 && REG_P (op0))
15857 && mode == Pmode))
15858 ? op0 : gen_reg_rtx (Pmode));
15859 op1 = machopic_indirect_data_reference (op1, temp);
15860 if (MACHOPIC_PURE)
15861 op1 = machopic_legitimize_pic_address (op1, mode,
15862 temp == op1 ? 0 : temp);
15863 }
15864 if (op0 != op1 && GET_CODE (op0) != MEM)
15865 {
15866 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15867 emit_insn (insn);
15868 return;
15869 }
15870 if (GET_CODE (op0) == MEM)
15871 op1 = force_reg (Pmode, op1);
15872 else
15873 {
15874 rtx temp = op0;
15875 if (GET_CODE (temp) != REG)
15876 temp = gen_reg_rtx (Pmode);
15877 temp = legitimize_pic_address (op1, temp);
15878 if (temp == op0)
15879 return;
15880 op1 = temp;
15881 }
15882 /* dynamic-no-pic */
15883 #endif
15884 }
15885 else
15886 {
15887 if (MEM_P (op0))
15888 op1 = force_reg (mode, op1);
15889 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
15890 {
15891 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15892 op1 = legitimize_pic_address (op1, reg);
15893 if (op0 == op1)
15894 return;
15895 if (GET_MODE (op1) != mode)
15896 op1 = convert_to_mode (mode, op1, 1);
15897 }
15898 }
15899 }
15900 else
15901 {
15902 if (MEM_P (op0)
15903 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15904 || !push_operand (op0, mode))
15905 && MEM_P (op1))
15906 op1 = force_reg (mode, op1);
15907
15908 if (push_operand (op0, mode)
15909 && ! general_no_elim_operand (op1, mode))
15910 op1 = copy_to_mode_reg (mode, op1);
15911
15912 /* Force large constants in 64bit compilation into a register
15913 to get them CSEed. */
15914 if (can_create_pseudo_p ()
15915 && (mode == DImode) && TARGET_64BIT
15916 && immediate_operand (op1, mode)
15917 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15918 && !register_operand (op0, mode)
15919 && optimize)
15920 op1 = copy_to_mode_reg (mode, op1);
15921
15922 if (can_create_pseudo_p ()
15923 && FLOAT_MODE_P (mode)
15924 && GET_CODE (op1) == CONST_DOUBLE)
15925 {
15926 /* If we are loading a floating point constant to a register,
15927 force the value to memory now, since we'll get better code
15928 out of the back end.  */
15929
15930 op1 = validize_mem (force_const_mem (mode, op1));
15931 if (!register_operand (op0, mode))
15932 {
15933 rtx temp = gen_reg_rtx (mode);
15934 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15935 emit_move_insn (op0, temp);
15936 return;
15937 }
15938 }
15939 }
15940
15941 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15942 }
15943
15944 void
15945 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15946 {
15947 rtx op0 = operands[0], op1 = operands[1];
15948 unsigned int align = GET_MODE_ALIGNMENT (mode);
15949
15950 /* Force constants other than zero into memory. We do not know how
15951 the instructions used to build constants modify the upper 64 bits
15952 of the register; once we have that information we may be able
15953 to handle some of them more efficiently. */
15954 if (can_create_pseudo_p ()
15955 && register_operand (op0, mode)
15956 && (CONSTANT_P (op1)
15957 || (GET_CODE (op1) == SUBREG
15958 && CONSTANT_P (SUBREG_REG (op1))))
15959 && !standard_sse_constant_p (op1))
15960 op1 = validize_mem (force_const_mem (mode, op1));
15961
15962 /* We need to check memory alignment for SSE modes since attributes
15963 can make operands unaligned.  */
15964 if (can_create_pseudo_p ()
15965 && SSE_REG_MODE_P (mode)
15966 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15967 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15968 {
15969 rtx tmp[2];
15970
15971 /* ix86_expand_vector_move_misalign() does not like constants ... */
15972 if (CONSTANT_P (op1)
15973 || (GET_CODE (op1) == SUBREG
15974 && CONSTANT_P (SUBREG_REG (op1))))
15975 op1 = validize_mem (force_const_mem (mode, op1));
15976
15977 /* ... nor both arguments in memory. */
15978 if (!register_operand (op0, mode)
15979 && !register_operand (op1, mode))
15980 op1 = force_reg (mode, op1);
15981
15982 tmp[0] = op0; tmp[1] = op1;
15983 ix86_expand_vector_move_misalign (mode, tmp);
15984 return;
15985 }
15986
15987 /* If neither operand is a register, force operand1 into a register.  */
15988 if (can_create_pseudo_p ()
15989 && !register_operand (op0, mode)
15990 && !register_operand (op1, mode))
15991 {
15992 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15993 return;
15994 }
15995
15996 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15997 }
15998
15999 /* Split 32-byte AVX unaligned load and store if needed. */
16000
16001 static void
16002 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16003 {
16004 rtx m;
16005 rtx (*extract) (rtx, rtx, rtx);
16006 rtx (*load_unaligned) (rtx, rtx);
16007 rtx (*store_unaligned) (rtx, rtx);
16008 enum machine_mode mode;
16009
16010 switch (GET_MODE (op0))
16011 {
16012 default:
16013 gcc_unreachable ();
16014 case V32QImode:
16015 extract = gen_avx_vextractf128v32qi;
16016 load_unaligned = gen_avx_loaddqu256;
16017 store_unaligned = gen_avx_storedqu256;
16018 mode = V16QImode;
16019 break;
16020 case V8SFmode:
16021 extract = gen_avx_vextractf128v8sf;
16022 load_unaligned = gen_avx_loadups256;
16023 store_unaligned = gen_avx_storeups256;
16024 mode = V4SFmode;
16025 break;
16026 case V4DFmode:
16027 extract = gen_avx_vextractf128v4df;
16028 load_unaligned = gen_avx_loadupd256;
16029 store_unaligned = gen_avx_storeupd256;
16030 mode = V2DFmode;
16031 break;
16032 }
16033
16034 if (MEM_P (op1))
16035 {
16036 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16037 {
16038 rtx r = gen_reg_rtx (mode);
16039 m = adjust_address (op1, mode, 0);
16040 emit_move_insn (r, m);
16041 m = adjust_address (op1, mode, 16);
16042 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16043 emit_move_insn (op0, r);
16044 }
16045 else
16046 emit_insn (load_unaligned (op0, op1));
16047 }
16048 else if (MEM_P (op0))
16049 {
16050 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16051 {
16052 m = adjust_address (op0, mode, 0);
16053 emit_insn (extract (m, op1, const0_rtx));
16054 m = adjust_address (op0, mode, 16);
16055 emit_insn (extract (m, op1, const1_rtx));
16056 }
16057 else
16058 emit_insn (store_unaligned (op0, op1));
16059 }
16060 else
16061 gcc_unreachable ();
16062 }
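/* Illustrative result of the split-load path above (a sketch of the
   expected assembly, not a literal template used here): for a 32-byte
   unaligned V8SF load it amounts to

     vmovups     (mem), %xmm0
     vinsertf128 $1, 16(mem), %ymm0, %ymm0

   i.e. two 16-byte accesses instead of a single 32-byte vmovups.  */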
16063
16064 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16065 straight to ix86_expand_vector_move. */
16066 /* Code generation for scalar reg-reg moves of single and double precision data:
16067 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16068 movaps reg, reg
16069 else
16070 movss reg, reg
16071 if (x86_sse_partial_reg_dependency == true)
16072 movapd reg, reg
16073 else
16074 movsd reg, reg
16075
16076 Code generation for scalar loads of double precision data:
16077 if (x86_sse_split_regs == true)
16078 movlpd mem, reg (gas syntax)
16079 else
16080 movsd mem, reg
16081
16082 Code generation for unaligned packed loads of single precision data
16083 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16084 if (x86_sse_unaligned_move_optimal)
16085 movups mem, reg
16086
16087 if (x86_sse_partial_reg_dependency == true)
16088 {
16089 xorps reg, reg
16090 movlps mem, reg
16091 movhps mem+8, reg
16092 }
16093 else
16094 {
16095 movlps mem, reg
16096 movhps mem+8, reg
16097 }
16098
16099 Code generation for unaligned packed loads of double precision data
16100 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16101 if (x86_sse_unaligned_move_optimal)
16102 movupd mem, reg
16103
16104 if (x86_sse_split_regs == true)
16105 {
16106 movlpd mem, reg
16107 movhpd mem+8, reg
16108 }
16109 else
16110 {
16111 movsd mem, reg
16112 movhpd mem+8, reg
16113 }
16114 */
16115
16116 void
16117 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16118 {
16119 rtx op0, op1, m;
16120
16121 op0 = operands[0];
16122 op1 = operands[1];
16123
16124 if (TARGET_AVX
16125 && GET_MODE_SIZE (mode) == 32)
16126 {
16127 switch (GET_MODE_CLASS (mode))
16128 {
16129 case MODE_VECTOR_INT:
16130 case MODE_INT:
16131 op0 = gen_lowpart (V32QImode, op0);
16132 op1 = gen_lowpart (V32QImode, op1);
16133 /* FALLTHRU */
16134
16135 case MODE_VECTOR_FLOAT:
16136 ix86_avx256_split_vector_move_misalign (op0, op1);
16137 break;
16138
16139 default:
16140 gcc_unreachable ();
16141 }
16142
16143 return;
16144 }
16145
16146 if (MEM_P (op1))
16147 {
16148 /* ??? If we have typed data, then it would appear that using
16149 movdqu is the only way to get unaligned data loaded with
16150 integer type. */
16151 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16152 {
16153 op0 = gen_lowpart (V16QImode, op0);
16154 op1 = gen_lowpart (V16QImode, op1);
16155 /* We will eventually emit movups based on insn attributes. */
16156 emit_insn (gen_sse2_loaddqu (op0, op1));
16157 }
16158 else if (TARGET_SSE2 && mode == V2DFmode)
16159 {
16160 rtx zero;
16161
16162 if (TARGET_AVX
16163 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16164 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16165 || optimize_function_for_size_p (cfun))
16166 {
16167 /* We will eventually emit movups based on insn attributes. */
16168 emit_insn (gen_sse2_loadupd (op0, op1));
16169 return;
16170 }
16171
16172 /* When SSE registers are split into halves, we can avoid
16173 writing to the top half twice. */
16174 if (TARGET_SSE_SPLIT_REGS)
16175 {
16176 emit_clobber (op0);
16177 zero = op0;
16178 }
16179 else
16180 {
16181 /* ??? Not sure about the best option for the Intel chips.
16182 The following would seem to satisfy; the register is
16183 entirely cleared, breaking the dependency chain. We
16184 then store to the upper half, with a dependency depth
16185 of one. A rumor has it that Intel recommends two movsd
16186 followed by an unpacklpd, but this is unconfirmed. And
16187 given that the dependency depth of the unpacklpd would
16188 still be one, I'm not sure why this would be better. */
16189 zero = CONST0_RTX (V2DFmode);
16190 }
16191
16192 m = adjust_address (op1, DFmode, 0);
16193 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16194 m = adjust_address (op1, DFmode, 8);
16195 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16196 }
16197 else
16198 {
16199 if (TARGET_AVX
16200 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16201 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16202 || optimize_function_for_size_p (cfun))
16203 {
16204 op0 = gen_lowpart (V4SFmode, op0);
16205 op1 = gen_lowpart (V4SFmode, op1);
16206 emit_insn (gen_sse_loadups (op0, op1));
16207 return;
16208 }
16209
16210 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16211 emit_move_insn (op0, CONST0_RTX (mode));
16212 else
16213 emit_clobber (op0);
16214
16215 if (mode != V4SFmode)
16216 op0 = gen_lowpart (V4SFmode, op0);
16217
16218 m = adjust_address (op1, V2SFmode, 0);
16219 emit_insn (gen_sse_loadlps (op0, op0, m));
16220 m = adjust_address (op1, V2SFmode, 8);
16221 emit_insn (gen_sse_loadhps (op0, op0, m));
16222 }
16223 }
16224 else if (MEM_P (op0))
16225 {
16226 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16227 {
16228 op0 = gen_lowpart (V16QImode, op0);
16229 op1 = gen_lowpart (V16QImode, op1);
16230 /* We will eventually emit movups based on insn attributes. */
16231 emit_insn (gen_sse2_storedqu (op0, op1));
16232 }
16233 else if (TARGET_SSE2 && mode == V2DFmode)
16234 {
16235 if (TARGET_AVX
16236 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16237 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16238 || optimize_function_for_size_p (cfun))
16239 /* We will eventually emit movups based on insn attributes. */
16240 emit_insn (gen_sse2_storeupd (op0, op1));
16241 else
16242 {
16243 m = adjust_address (op0, DFmode, 0);
16244 emit_insn (gen_sse2_storelpd (m, op1));
16245 m = adjust_address (op0, DFmode, 8);
16246 emit_insn (gen_sse2_storehpd (m, op1));
16247 }
16248 }
16249 else
16250 {
16251 if (mode != V4SFmode)
16252 op1 = gen_lowpart (V4SFmode, op1);
16253
16254 if (TARGET_AVX
16255 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16256 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16257 || optimize_function_for_size_p (cfun))
16258 {
16259 op0 = gen_lowpart (V4SFmode, op0);
16260 emit_insn (gen_sse_storeups (op0, op1));
16261 }
16262 else
16263 {
16264 m = adjust_address (op0, V2SFmode, 0);
16265 emit_insn (gen_sse_storelps (m, op1));
16266 m = adjust_address (op0, V2SFmode, 8);
16267 emit_insn (gen_sse_storehps (m, op1));
16268 }
16269 }
16270 }
16271 else
16272 gcc_unreachable ();
16273 }
16274
16275 /* Expand a push in MODE. This is some mode for which we do not support
16276 proper push instructions, at least from the registers that we expect
16277 the value to live in. */
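/* Added comment: a minimal sketch of the expansion, assuming a 16-byte mode --

	sub	$16, sp
	mov	value, (sp)

   i.e. an explicit stack pointer adjustment followed by an ordinary move of
   the value into the new stack slot.  */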
16278
16279 void
16280 ix86_expand_push (enum machine_mode mode, rtx x)
16281 {
16282 rtx tmp;
16283
16284 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16285 GEN_INT (-GET_MODE_SIZE (mode)),
16286 stack_pointer_rtx, 1, OPTAB_DIRECT);
16287 if (tmp != stack_pointer_rtx)
16288 emit_move_insn (stack_pointer_rtx, tmp);
16289
16290 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16291
16292 /* When we push an operand onto the stack, it has to be aligned at least
16293 at the function argument boundary. However, since we don't have
16294 the argument type, we can't determine the actual argument
16295 boundary. */
16296 emit_move_insn (tmp, x);
16297 }
16298
16299 /* Helper function of ix86_fixup_binary_operands to canonicalize
16300 operand order. Returns true if the operands should be swapped. */
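/* Added comment, for illustration: for "a = b + a" the operands are swapped
   so that src1 matches the destination; for "a = 1 + b" they are swapped so
   that the immediate becomes the second source; and a memory operand is
   likewise pushed to the second position, matching the two-address
   "dst = dst op src" form of most x86 instructions.  */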
16301
16302 static bool
16303 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16304 rtx operands[])
16305 {
16306 rtx dst = operands[0];
16307 rtx src1 = operands[1];
16308 rtx src2 = operands[2];
16309
16310 /* If the operation is not commutative, we can't do anything. */
16311 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16312 return false;
16313
16314 /* Highest priority is that src1 should match dst. */
16315 if (rtx_equal_p (dst, src1))
16316 return false;
16317 if (rtx_equal_p (dst, src2))
16318 return true;
16319
16320 /* Next highest priority is that immediate constants come second. */
16321 if (immediate_operand (src2, mode))
16322 return false;
16323 if (immediate_operand (src1, mode))
16324 return true;
16325
16326 /* Lowest priority is that memory references should come second. */
16327 if (MEM_P (src2))
16328 return false;
16329 if (MEM_P (src1))
16330 return true;
16331
16332 return false;
16333 }
16334
16335
16336 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16337 destination to use for the operation. If different from the true
16338 destination in operands[0], a copy operation will be required. */
16339
16340 rtx
16341 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16342 rtx operands[])
16343 {
16344 rtx dst = operands[0];
16345 rtx src1 = operands[1];
16346 rtx src2 = operands[2];
16347
16348 /* Canonicalize operand order. */
16349 if (ix86_swap_binary_operands_p (code, mode, operands))
16350 {
16351 rtx temp;
16352
16353 /* It is invalid to swap operands of different modes. */
16354 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16355
16356 temp = src1;
16357 src1 = src2;
16358 src2 = temp;
16359 }
16360
16361 /* The source operands cannot both be in memory. */
16362 if (MEM_P (src1) && MEM_P (src2))
16363 {
16364 /* Optimization: Only read from memory once. */
16365 if (rtx_equal_p (src1, src2))
16366 {
16367 src2 = force_reg (mode, src2);
16368 src1 = src2;
16369 }
16370 else
16371 src2 = force_reg (mode, src2);
16372 }
16373
16374 /* If the destination is memory, and we do not have matching source
16375 operands, do things in registers. */
16376 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16377 dst = gen_reg_rtx (mode);
16378
16379 /* Source 1 cannot be a constant. */
16380 if (CONSTANT_P (src1))
16381 src1 = force_reg (mode, src1);
16382
16383 /* Source 1 cannot be a non-matching memory. */
16384 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16385 src1 = force_reg (mode, src1);
16386
16387 /* Improve address combine. */
16388 if (code == PLUS
16389 && GET_MODE_CLASS (mode) == MODE_INT
16390 && MEM_P (src2))
16391 src2 = force_reg (mode, src2);
16392
16393 operands[1] = src1;
16394 operands[2] = src2;
16395 return dst;
16396 }
16397
16398 /* Similarly, but assume that the destination has already been
16399 set up properly. */
16400
16401 void
16402 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16403 enum machine_mode mode, rtx operands[])
16404 {
16405 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16406 gcc_assert (dst == operands[0]);
16407 }
16408
16409 /* Attempt to expand a binary operator. Make the expansion closer to the
16410 actual machine than just general_operand, which would allow 3 separate
16411 memory references (one output, two input) in a single insn. */
16412
16413 void
16414 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16415 rtx operands[])
16416 {
16417 rtx src1, src2, dst, op, clob;
16418
16419 dst = ix86_fixup_binary_operands (code, mode, operands);
16420 src1 = operands[1];
16421 src2 = operands[2];
16422
16423 /* Emit the instruction. */
16424
16425 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16426 if (reload_in_progress)
16427 {
16428 /* Reload doesn't know about the flags register, and doesn't know that
16429 it doesn't want to clobber it. We can only do this with PLUS. */
16430 gcc_assert (code == PLUS);
16431 emit_insn (op);
16432 }
16433 else if (reload_completed
16434 && code == PLUS
16435 && !rtx_equal_p (dst, src1))
16436 {
16437 /* This is going to be an LEA; avoid splitting it later. */
16438 emit_insn (op);
16439 }
16440 else
16441 {
16442 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16443 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16444 }
16445
16446 /* Fix up the destination if needed. */
16447 if (dst != operands[0])
16448 emit_move_insn (operands[0], dst);
16449 }
16450
16451 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16452 the given OPERANDS. */
16453
16454 void
16455 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16456 rtx operands[])
16457 {
16458 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16459 if (GET_CODE (operands[1]) == SUBREG)
16460 {
16461 op1 = operands[1];
16462 op2 = operands[2];
16463 }
16464 else if (GET_CODE (operands[2]) == SUBREG)
16465 {
16466 op1 = operands[2];
16467 op2 = operands[1];
16468 }
16469 /* Optimize (__m128i) d | (__m128i) e and similar code
16470 when d and e are float vectors into float vector logical
16471 insn. In C/C++ without using intrinsics there is no other way
16472 to express vector logical operation on float vectors than
16473 to cast them temporarily to integer vectors. */
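/* Added comment, for instance: source such as

	__m128 a, b;
	__m128i x = (__m128i) a | (__m128i) b;

   can then be emitted as the float-domain orps instead of the integer-domain
   por, avoiding a SIMD domain crossing on processors where that matters.  */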
16474 if (op1
16475 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16476 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16477 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16478 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16479 && SUBREG_BYTE (op1) == 0
16480 && (GET_CODE (op2) == CONST_VECTOR
16481 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16482 && SUBREG_BYTE (op2) == 0))
16483 && can_create_pseudo_p ())
16484 {
16485 rtx dst;
16486 switch (GET_MODE (SUBREG_REG (op1)))
16487 {
16488 case V4SFmode:
16489 case V8SFmode:
16490 case V2DFmode:
16491 case V4DFmode:
16492 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16493 if (GET_CODE (op2) == CONST_VECTOR)
16494 {
16495 op2 = gen_lowpart (GET_MODE (dst), op2);
16496 op2 = force_reg (GET_MODE (dst), op2);
16497 }
16498 else
16499 {
16500 op1 = operands[1];
16501 op2 = SUBREG_REG (operands[2]);
16502 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16503 op2 = force_reg (GET_MODE (dst), op2);
16504 }
16505 op1 = SUBREG_REG (op1);
16506 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16507 op1 = force_reg (GET_MODE (dst), op1);
16508 emit_insn (gen_rtx_SET (VOIDmode, dst,
16509 gen_rtx_fmt_ee (code, GET_MODE (dst),
16510 op1, op2)));
16511 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16512 return;
16513 default:
16514 break;
16515 }
16516 }
16517 if (!nonimmediate_operand (operands[1], mode))
16518 operands[1] = force_reg (mode, operands[1]);
16519 if (!nonimmediate_operand (operands[2], mode))
16520 operands[2] = force_reg (mode, operands[2]);
16521 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16522 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16523 gen_rtx_fmt_ee (code, mode, operands[1],
16524 operands[2])));
16525 }
16526
16527 /* Return TRUE or FALSE depending on whether the binary operator meets the
16528 appropriate constraints. */
16529
16530 bool
16531 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16532 rtx operands[3])
16533 {
16534 rtx dst = operands[0];
16535 rtx src1 = operands[1];
16536 rtx src2 = operands[2];
16537
16538 /* The source operands cannot both be in memory. */
16539 if (MEM_P (src1) && MEM_P (src2))
16540 return false;
16541
16542 /* Canonicalize operand order for commutative operators. */
16543 if (ix86_swap_binary_operands_p (code, mode, operands))
16544 {
16545 rtx temp = src1;
16546 src1 = src2;
16547 src2 = temp;
16548 }
16549
16550 /* If the destination is memory, we must have a matching source operand. */
16551 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16552 return false;
16553
16554 /* Source 1 cannot be a constant. */
16555 if (CONSTANT_P (src1))
16556 return false;
16557
16558 /* Source 1 cannot be a non-matching memory. */
16559 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16560 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16561 return (code == AND
16562 && (mode == HImode
16563 || mode == SImode
16564 || (TARGET_64BIT && mode == DImode))
16565 && satisfies_constraint_L (src2));
16566
16567 return true;
16568 }
16569
16570 /* Attempt to expand a unary operator. Make the expansion closer to the
16571 actual machine than just general_operand, which would allow 2 separate
16572 memory references (one output, one input) in a single insn. */
16573
16574 void
16575 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16576 rtx operands[])
16577 {
16578 int matching_memory;
16579 rtx src, dst, op, clob;
16580
16581 dst = operands[0];
16582 src = operands[1];
16583
16584 /* If the destination is memory, and we do not have matching source
16585 operands, do things in registers. */
16586 matching_memory = 0;
16587 if (MEM_P (dst))
16588 {
16589 if (rtx_equal_p (dst, src))
16590 matching_memory = 1;
16591 else
16592 dst = gen_reg_rtx (mode);
16593 }
16594
16595 /* When source operand is memory, destination must match. */
16596 if (MEM_P (src) && !matching_memory)
16597 src = force_reg (mode, src);
16598
16599 /* Emit the instruction. */
16600
16601 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16602 if (reload_in_progress || code == NOT)
16603 {
16604 /* Reload doesn't know about the flags register, and doesn't know that
16605 it doesn't want to clobber it. */
16606 gcc_assert (code == NOT);
16607 emit_insn (op);
16608 }
16609 else
16610 {
16611 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16612 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16613 }
16614
16615 /* Fix up the destination if needed. */
16616 if (dst != operands[0])
16617 emit_move_insn (operands[0], dst);
16618 }
16619
16620 /* Split a 32-bit/64-bit divmod with an 8-bit unsigned divmod if the dividend
16621 and divisor are within the range [0, 255]. */
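/* Added comment: a rough sketch of the emitted sequence, assuming SImode
   operands and gas syntax --

	mov	dividend, scratch
	or	divisor, scratch
	test	$-0x100, scratch
	je	.Lqimode
	  ...original full-width signed/unsigned divide...
	jmp	.Ldone
   .Lqimode:
	divb	divisor		  AL = quotient, AH = remainder
	  ...zero-extend AL into the quotient, copy AH into the remainder...
   .Ldone:

   The extra test and branch are cheap compared with the full-width divide
   that is avoided when both operands fit in a byte.  */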
16622
16623 void
16624 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16625 bool signed_p)
16626 {
16627 rtx end_label, qimode_label;
16628 rtx insn, div, mod;
16629 rtx scratch, tmp0, tmp1, tmp2;
16630 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16631 rtx (*gen_zero_extend) (rtx, rtx);
16632 rtx (*gen_test_ccno_1) (rtx, rtx);
16633
16634 switch (mode)
16635 {
16636 case SImode:
16637 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16638 gen_test_ccno_1 = gen_testsi_ccno_1;
16639 gen_zero_extend = gen_zero_extendqisi2;
16640 break;
16641 case DImode:
16642 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16643 gen_test_ccno_1 = gen_testdi_ccno_1;
16644 gen_zero_extend = gen_zero_extendqidi2;
16645 break;
16646 default:
16647 gcc_unreachable ();
16648 }
16649
16650 end_label = gen_label_rtx ();
16651 qimode_label = gen_label_rtx ();
16652
16653 scratch = gen_reg_rtx (mode);
16654
16655 /* Use 8-bit unsigned divmod if the dividend and divisor are within
16656 the range [0, 255]. */
16657 emit_move_insn (scratch, operands[2]);
16658 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16659 scratch, 1, OPTAB_DIRECT);
16660 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
16661 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16662 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16663 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16664 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16665 pc_rtx);
16666 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16667 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16668 JUMP_LABEL (insn) = qimode_label;
16669
16670 /* Generate the original signed/unsigned divmod. */
16671 div = gen_divmod4_1 (operands[0], operands[1],
16672 operands[2], operands[3]);
16673 emit_insn (div);
16674
16675 /* Branch to the end. */
16676 emit_jump_insn (gen_jump (end_label));
16677 emit_barrier ();
16678
16679 /* Generate 8bit unsigned divide. */
16680 emit_label (qimode_label);
16681 /* Don't use operands[0] for the result of the 8-bit divide since not all
16682 registers support QImode ZERO_EXTRACT. */
16683 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16684 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16685 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16686 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16687
16688 if (signed_p)
16689 {
16690 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16691 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16692 }
16693 else
16694 {
16695 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16696 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16697 }
16698
16699 /* Extract remainder from AH. */
16700 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16701 if (REG_P (operands[1]))
16702 insn = emit_move_insn (operands[1], tmp1);
16703 else
16704 {
16705 /* Need a new scratch register since the old one has result
16706 of 8bit divide. */
16707 scratch = gen_reg_rtx (mode);
16708 emit_move_insn (scratch, tmp1);
16709 insn = emit_move_insn (operands[1], scratch);
16710 }
16711 set_unique_reg_note (insn, REG_EQUAL, mod);
16712
16713 /* Zero extend quotient from AL. */
16714 tmp1 = gen_lowpart (QImode, tmp0);
16715 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16716 set_unique_reg_note (insn, REG_EQUAL, div);
16717
16718 emit_label (end_label);
16719 }
16720
16721 #define LEA_MAX_STALL (3)
16722 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16723
16724 /* Increase the given DISTANCE in half-cycles according to
16725 dependencies between the PREV and NEXT instructions.
16726 Add 1 half-cycle if there is no dependency and
16727 go to the next cycle if there is some dependency. */
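/* Added comment, for example: two adjacent independent instructions each add
   a single half-cycle because they can issue in the same cycle, whereas a
   dependent pair rounds the distance up to the start of the next full cycle
   (the "distance + (distance & 1) + 2" case below).  */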
16728
16729 static unsigned int
16730 increase_distance (rtx prev, rtx next, unsigned int distance)
16731 {
16732 df_ref *use_rec;
16733 df_ref *def_rec;
16734
16735 if (!prev || !next)
16736 return distance + (distance & 1) + 2;
16737
16738 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16739 return distance + 1;
16740
16741 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16742 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16743 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16744 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16745 return distance + (distance & 1) + 2;
16746
16747 return distance + 1;
16748 }
16749
16750 /* Function checks if instruction INSN defines register number
16751 REGNO1 or REGNO2. */
16752
16753 static bool
16754 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16755 rtx insn)
16756 {
16757 df_ref *def_rec;
16758
16759 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16760 if (DF_REF_REG_DEF_P (*def_rec)
16761 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16762 && (regno1 == DF_REF_REGNO (*def_rec)
16763 || regno2 == DF_REF_REGNO (*def_rec)))
16764 {
16765 return true;
16766 }
16767
16768 return false;
16769 }
16770
16771 /* Function checks if instruction INSN uses register number
16772 REGNO as a part of address expression. */
16773
16774 static bool
16775 insn_uses_reg_mem (unsigned int regno, rtx insn)
16776 {
16777 df_ref *use_rec;
16778
16779 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16780 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16781 return true;
16782
16783 return false;
16784 }
16785
16786 /* Search backward for non-agu definition of register number REGNO1
16787 or register number REGNO2 in basic block starting from instruction
16788 START up to head of basic block or instruction INSN.
16789
16790 Function puts true value into *FOUND var if definition was found
16791 and false otherwise.
16792
16793 Distance in half-cycles between START and found instruction or head
16794 of BB is added to DISTANCE and returned. */
16795
16796 static int
16797 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16798 rtx insn, int distance,
16799 rtx start, bool *found)
16800 {
16801 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16802 rtx prev = start;
16803 rtx next = NULL;
16804
16805 *found = false;
16806
16807 while (prev
16808 && prev != insn
16809 && distance < LEA_SEARCH_THRESHOLD)
16810 {
16811 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16812 {
16813 distance = increase_distance (prev, next, distance);
16814 if (insn_defines_reg (regno1, regno2, prev))
16815 {
16816 if (recog_memoized (prev) < 0
16817 || get_attr_type (prev) != TYPE_LEA)
16818 {
16819 *found = true;
16820 return distance;
16821 }
16822 }
16823
16824 next = prev;
16825 }
16826 if (prev == BB_HEAD (bb))
16827 break;
16828
16829 prev = PREV_INSN (prev);
16830 }
16831
16832 return distance;
16833 }
16834
16835 /* Search backward for non-agu definition of register number REGNO1
16836 or register number REGNO2 in INSN's basic block until
16837 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16838 2. Reach neighbour BBs boundary, or
16839 3. Reach agu definition.
16840 Returns the distance between the non-agu definition point and INSN.
16841 If no definition point, returns -1. */
16842
16843 static int
16844 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16845 rtx insn)
16846 {
16847 basic_block bb = BLOCK_FOR_INSN (insn);
16848 int distance = 0;
16849 bool found = false;
16850
16851 if (insn != BB_HEAD (bb))
16852 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16853 distance, PREV_INSN (insn),
16854 &found);
16855
16856 if (!found && distance < LEA_SEARCH_THRESHOLD)
16857 {
16858 edge e;
16859 edge_iterator ei;
16860 bool simple_loop = false;
16861
16862 FOR_EACH_EDGE (e, ei, bb->preds)
16863 if (e->src == bb)
16864 {
16865 simple_loop = true;
16866 break;
16867 }
16868
16869 if (simple_loop)
16870 distance = distance_non_agu_define_in_bb (regno1, regno2,
16871 insn, distance,
16872 BB_END (bb), &found);
16873 else
16874 {
16875 int shortest_dist = -1;
16876 bool found_in_bb = false;
16877
16878 FOR_EACH_EDGE (e, ei, bb->preds)
16879 {
16880 int bb_dist
16881 = distance_non_agu_define_in_bb (regno1, regno2,
16882 insn, distance,
16883 BB_END (e->src),
16884 &found_in_bb);
16885 if (found_in_bb)
16886 {
16887 if (shortest_dist < 0)
16888 shortest_dist = bb_dist;
16889 else if (bb_dist > 0)
16890 shortest_dist = MIN (bb_dist, shortest_dist);
16891
16892 found = true;
16893 }
16894 }
16895
16896 distance = shortest_dist;
16897 }
16898 }
16899
16900 /* get_attr_type may modify recog data. We want to make sure
16901 that recog data is valid for instruction INSN, on which
16902 distance_non_agu_define is called. INSN is unchanged here. */
16903 extract_insn_cached (insn);
16904
16905 if (!found)
16906 return -1;
16907
16908 return distance >> 1;
16909 }
16910
16911 /* Return the distance in half-cycles between INSN and the next
16912 insn that uses register number REGNO in a memory address, added
16913 to DISTANCE. Return -1 if REGNO is set.
16914
16915 Put true value into *FOUND if register usage was found and
16916 false otherwise.
16917 Put true value into *REDEFINED if register redefinition was
16918 found and false otherwise. */
16919
16920 static int
16921 distance_agu_use_in_bb (unsigned int regno,
16922 rtx insn, int distance, rtx start,
16923 bool *found, bool *redefined)
16924 {
16925 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16926 rtx next = start;
16927 rtx prev = NULL;
16928
16929 *found = false;
16930 *redefined = false;
16931
16932 while (next
16933 && next != insn
16934 && distance < LEA_SEARCH_THRESHOLD)
16935 {
16936 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
16937 {
16938 distance = increase_distance(prev, next, distance);
16939 if (insn_uses_reg_mem (regno, next))
16940 {
16941 /* Return DISTANCE if OP0 is used in memory
16942 address in NEXT. */
16943 *found = true;
16944 return distance;
16945 }
16946
16947 if (insn_defines_reg (regno, INVALID_REGNUM, next))
16948 {
16949 /* Return -1 if OP0 is set in NEXT. */
16950 *redefined = true;
16951 return -1;
16952 }
16953
16954 prev = next;
16955 }
16956
16957 if (next == BB_END (bb))
16958 break;
16959
16960 next = NEXT_INSN (next);
16961 }
16962
16963 return distance;
16964 }
16965
16966 /* Return the distance between INSN and the next insn that uses
16967 register number REGNO0 in a memory address. Return -1 if no such
16968 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
16969
16970 static int
16971 distance_agu_use (unsigned int regno0, rtx insn)
16972 {
16973 basic_block bb = BLOCK_FOR_INSN (insn);
16974 int distance = 0;
16975 bool found = false;
16976 bool redefined = false;
16977
16978 if (insn != BB_END (bb))
16979 distance = distance_agu_use_in_bb (regno0, insn, distance,
16980 NEXT_INSN (insn),
16981 &found, &redefined);
16982
16983 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
16984 {
16985 edge e;
16986 edge_iterator ei;
16987 bool simple_loop = false;
16988
16989 FOR_EACH_EDGE (e, ei, bb->succs)
16990 if (e->dest == bb)
16991 {
16992 simple_loop = true;
16993 break;
16994 }
16995
16996 if (simple_loop)
16997 distance = distance_agu_use_in_bb (regno0, insn,
16998 distance, BB_HEAD (bb),
16999 &found, &redefined);
17000 else
17001 {
17002 int shortest_dist = -1;
17003 bool found_in_bb = false;
17004 bool redefined_in_bb = false;
17005
17006 FOR_EACH_EDGE (e, ei, bb->succs)
17007 {
17008 int bb_dist
17009 = distance_agu_use_in_bb (regno0, insn,
17010 distance, BB_HEAD (e->dest),
17011 &found_in_bb, &redefined_in_bb);
17012 if (found_in_bb)
17013 {
17014 if (shortest_dist < 0)
17015 shortest_dist = bb_dist;
17016 else if (bb_dist > 0)
17017 shortest_dist = MIN (bb_dist, shortest_dist);
17018
17019 found = true;
17020 }
17021 }
17022
17023 distance = shortest_dist;
17024 }
17025 }
17026
17027 if (!found || redefined)
17028 return -1;
17029
17030 return distance >> 1;
17031 }
17032
17033 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17034 there is a dilemma of choosing between LEA and ADD.
17035 Negative value: ADD is preferred over LEA
17036 Zero: Neutral
17037 Positive value: LEA is preferred over ADD. */
17038 #define IX86_LEA_PRIORITY 0
17039
17040 /* Return true if using the lea INSN has a performance advantage
17041 over a sequence of instructions. The instruction sequence has
17042 SPLIT_COST cycles higher latency than the lea. */
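/* Added comment, rough intuition: on AGU-constrained cores such as Atom an
   lea is kept when its inputs were defined long enough ago (dist_define) to
   hide the AGU stall, or when its backward stall is no worse than the
   forward distance to the next address use (dist_use); otherwise the split
   ALU sequence is preferred even though it adds SPLIT_COST cycles.  */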
17043
17044 static bool
17045 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17046 unsigned int regno2, int split_cost)
17047 {
17048 int dist_define, dist_use;
17049
17050 dist_define = distance_non_agu_define (regno1, regno2, insn);
17051 dist_use = distance_agu_use (regno0, insn);
17052
17053 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17054 {
17055 /* If there is no non-AGU operand definition, no AGU
17056 operand usage and the split cost is 0, then both the lea
17057 and non-lea variants have the same priority. Currently
17058 we prefer lea for 64-bit code and non-lea for 32-bit
17059 code. */
17060 if (dist_use < 0 && split_cost == 0)
17061 return TARGET_64BIT || IX86_LEA_PRIORITY;
17062 else
17063 return true;
17064 }
17065
17066 /* With a longer definition distance, lea is preferable.
17067 Here we adjust the distance to take into account the splitting
17068 cost and the lea priority. */
17069 dist_define += split_cost + IX86_LEA_PRIORITY;
17070
17071 /* If there is no use in a memory address then we just check
17072 that the split cost exceeds the AGU stall. */
17073 if (dist_use < 0)
17074 return dist_define > LEA_MAX_STALL;
17075
17076 /* If this insn has both backward non-agu dependence and forward
17077 agu dependence, the one with short distance takes effect. */
17078 return dist_define >= dist_use;
17079 }
17080
17081 /* Return true if it is legal to clobber flags by INSN and
17082 false otherwise. */
17083
17084 static bool
17085 ix86_ok_to_clobber_flags (rtx insn)
17086 {
17087 basic_block bb = BLOCK_FOR_INSN (insn);
17088 df_ref *use;
17089 bitmap live;
17090
17091 while (insn)
17092 {
17093 if (NONDEBUG_INSN_P (insn))
17094 {
17095 for (use = DF_INSN_USES (insn); *use; use++)
17096 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17097 return false;
17098
17099 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17100 return true;
17101 }
17102
17103 if (insn == BB_END (bb))
17104 break;
17105
17106 insn = NEXT_INSN (insn);
17107 }
17108
17109 live = df_get_live_out(bb);
17110 return !REGNO_REG_SET_P (live, FLAGS_REG);
17111 }
17112
17113 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17114 move and add to avoid AGU stalls. */
17115
17116 bool
17117 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17118 {
17119 unsigned int regno0, regno1, regno2;
17120
17121 /* Check if we need to optimize. */
17122 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17123 return false;
17124
17125 /* Check it is correct to split here. */
17126 if (!ix86_ok_to_clobber_flags(insn))
17127 return false;
17128
17129 regno0 = true_regnum (operands[0]);
17130 regno1 = true_regnum (operands[1]);
17131 regno2 = true_regnum (operands[2]);
17132
17133 /* We only need to split adds with a non-destructive
17134 destination operand. */
17135 if (regno0 == regno1 || regno0 == regno2)
17136 return false;
17137 else
17138 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17139 }
17140
17141 /* Return true if we should emit an lea instruction instead of a mov
17142 instruction. */
17143
17144 bool
17145 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17146 {
17147 unsigned int regno0, regno1;
17148
17149 /* Check if we need to optimize. */
17150 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17151 return false;
17152
17153 /* Use lea for reg to reg moves only. */
17154 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17155 return false;
17156
17157 regno0 = true_regnum (operands[0]);
17158 regno1 = true_regnum (operands[1]);
17159
17160 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17161 }
17162
17163 /* Return true if we need to split lea into a sequence of
17164 instructions to avoid AGU stalls. */
17165
17166 bool
17167 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17168 {
17169 unsigned int regno0, regno1, regno2;
17170 int split_cost;
17171 struct ix86_address parts;
17172 int ok;
17173
17174 /* Check we need to optimize. */
17175 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17176 return false;
17177
17178 /* Check it is correct to split here. */
17179 if (!ix86_ok_to_clobber_flags(insn))
17180 return false;
17181
17182 ok = ix86_decompose_address (operands[1], &parts);
17183 gcc_assert (ok);
17184
17185 /* There should be at least two components in the address. */
17186 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17187 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17188 return false;
17189
17190 /* We should not split into add if a non-legitimate PIC
17191 operand is used as the displacement. */
17192 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17193 return false;
17194
17195 regno0 = true_regnum (operands[0]) ;
17196 regno1 = INVALID_REGNUM;
17197 regno2 = INVALID_REGNUM;
17198
17199 if (parts.base)
17200 regno1 = true_regnum (parts.base);
17201 if (parts.index)
17202 regno2 = true_regnum (parts.index);
17203
17204 split_cost = 0;
17205
17206 /* Compute how many cycles we will add to the execution time
17207 if we split the lea into a sequence of instructions. */
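/* Added comment, worked example with hypothetical operands: for
   "lea 4(%ebx,%ecx,2), %eax" the split needs a mov, a shift, and two adds,
   so split_cost ends up as 1 + 1 + 1 + 1 - 1 = 3 extra cycles over the
   single lea.  */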
17208 if (parts.base || parts.index)
17209 {
17210 /* Have to use a mov instruction if the non-destructive
17211 destination form is used. */
17212 if (regno1 != regno0 && regno2 != regno0)
17213 split_cost += 1;
17214
17215 /* Have to add index to base if both exist. */
17216 if (parts.base && parts.index)
17217 split_cost += 1;
17218
17219 /* Have to use shift and adds if scale is 2 or greater. */
17220 if (parts.scale > 1)
17221 {
17222 if (regno0 != regno1)
17223 split_cost += 1;
17224 else if (regno2 == regno0)
17225 split_cost += 4;
17226 else
17227 split_cost += parts.scale;
17228 }
17229
17230 /* Have to use add instruction with immediate if
17231 disp is non zero. */
17232 if (parts.disp && parts.disp != const0_rtx)
17233 split_cost += 1;
17234
17235 /* Subtract the price of lea. */
17236 split_cost -= 1;
17237 }
17238
17239 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17240 }
17241
17242 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17243 matches destination. RTX includes clobber of FLAGS_REG. */
17244
17245 static void
17246 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17247 rtx dst, rtx src)
17248 {
17249 rtx op, clob;
17250
17251 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17252 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17253
17254 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17255 }
17256
17257 /* Return true if regno1 def is nearest to the insn. */
17258
17259 static bool
17260 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17261 {
17262 rtx prev = insn;
17263 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17264
17265 if (insn == start)
17266 return false;
17267 while (prev && prev != start)
17268 {
17269 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17270 {
17271 prev = PREV_INSN (prev);
17272 continue;
17273 }
17274 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17275 return true;
17276 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17277 return false;
17278 prev = PREV_INSN (prev);
17279 }
17280
17281 /* None of the regs is defined in the bb. */
17282 return false;
17283 }
17284
17285 /* Split lea instructions into a sequence of instructions
17286 which are executed on the ALU to avoid AGU stalls.
17287 It is assumed that it is allowed to clobber the flags register
17288 at the lea position. */
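/* Added comment, a possible split with hypothetical operands (gas syntax):
   "lea 4(%ebx,%ecx,2), %eax" becomes roughly

	mov	%ecx, %eax
	shl	$1, %eax
	add	%ebx, %eax
	add	$4, %eax

   every step of which executes on the ALU rather than the AGU.  */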
17289
17290 void
17291 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17292 {
17293 unsigned int regno0, regno1, regno2;
17294 struct ix86_address parts;
17295 rtx target, tmp;
17296 int ok, adds;
17297
17298 ok = ix86_decompose_address (operands[1], &parts);
17299 gcc_assert (ok);
17300
17301 target = gen_lowpart (mode, operands[0]);
17302
17303 regno0 = true_regnum (target);
17304 regno1 = INVALID_REGNUM;
17305 regno2 = INVALID_REGNUM;
17306
17307 if (parts.base)
17308 {
17309 parts.base = gen_lowpart (mode, parts.base);
17310 regno1 = true_regnum (parts.base);
17311 }
17312
17313 if (parts.index)
17314 {
17315 parts.index = gen_lowpart (mode, parts.index);
17316 regno2 = true_regnum (parts.index);
17317 }
17318
17319 if (parts.disp)
17320 parts.disp = gen_lowpart (mode, parts.disp);
17321
17322 if (parts.scale > 1)
17323 {
17324 /* Case r1 = r1 + ... */
17325 if (regno1 == regno0)
17326 {
17327 /* If we have the case r1 = r1 + C * r1 then we
17328 should use multiplication, which is very
17329 expensive. Assume the cost model is wrong if we
17330 reach such a case here. */
17331 gcc_assert (regno2 != regno0);
17332
17333 for (adds = parts.scale; adds > 0; adds--)
17334 ix86_emit_binop (PLUS, mode, target, parts.index);
17335 }
17336 else
17337 {
17338 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17339 if (regno0 != regno2)
17340 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17341
17342 /* Use shift for scaling. */
17343 ix86_emit_binop (ASHIFT, mode, target,
17344 GEN_INT (exact_log2 (parts.scale)));
17345
17346 if (parts.base)
17347 ix86_emit_binop (PLUS, mode, target, parts.base);
17348
17349 if (parts.disp && parts.disp != const0_rtx)
17350 ix86_emit_binop (PLUS, mode, target, parts.disp);
17351 }
17352 }
17353 else if (!parts.base && !parts.index)
17354 {
17355 gcc_assert(parts.disp);
17356 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17357 }
17358 else
17359 {
17360 if (!parts.base)
17361 {
17362 if (regno0 != regno2)
17363 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17364 }
17365 else if (!parts.index)
17366 {
17367 if (regno0 != regno1)
17368 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17369 }
17370 else
17371 {
17372 if (regno0 == regno1)
17373 tmp = parts.index;
17374 else if (regno0 == regno2)
17375 tmp = parts.base;
17376 else
17377 {
17378 rtx tmp1;
17379
17380 /* Find better operand for SET instruction, depending
17381 on which definition is farther from the insn. */
17382 if (find_nearest_reg_def (insn, regno1, regno2))
17383 tmp = parts.index, tmp1 = parts.base;
17384 else
17385 tmp = parts.base, tmp1 = parts.index;
17386
17387 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17388
17389 if (parts.disp && parts.disp != const0_rtx)
17390 ix86_emit_binop (PLUS, mode, target, parts.disp);
17391
17392 ix86_emit_binop (PLUS, mode, target, tmp1);
17393 return;
17394 }
17395
17396 ix86_emit_binop (PLUS, mode, target, tmp);
17397 }
17398
17399 if (parts.disp && parts.disp != const0_rtx)
17400 ix86_emit_binop (PLUS, mode, target, parts.disp);
17401 }
17402 }
17403
17404 /* Return true if it is ok to optimize an ADD operation to an LEA
17405 operation to avoid flag register consumption. For most processors,
17406 ADD is faster than LEA. For processors like Atom, if the
17407 destination register of the LEA holds an actual address which will
17408 be used soon, LEA is better; otherwise ADD is better. */
17409
17410 bool
17411 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17412 {
17413 unsigned int regno0 = true_regnum (operands[0]);
17414 unsigned int regno1 = true_regnum (operands[1]);
17415 unsigned int regno2 = true_regnum (operands[2]);
17416
17417 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17418 if (regno0 != regno1 && regno0 != regno2)
17419 return true;
17420
17421 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17422 return false;
17423
17424 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17425 }
17426
17427 /* Return true if destination reg of SET_BODY is shift count of
17428 USE_BODY. */
17429
17430 static bool
17431 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17432 {
17433 rtx set_dest;
17434 rtx shift_rtx;
17435 int i;
17436
17437 /* Retrieve destination of SET_BODY. */
17438 switch (GET_CODE (set_body))
17439 {
17440 case SET:
17441 set_dest = SET_DEST (set_body);
17442 if (!set_dest || !REG_P (set_dest))
17443 return false;
17444 break;
17445 case PARALLEL:
17446 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17447 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17448 use_body))
17449 return true;
17450 default:
17451 return false;
17452 break;
17453 }
17454
17455 /* Retrieve shift count of USE_BODY. */
17456 switch (GET_CODE (use_body))
17457 {
17458 case SET:
17459 shift_rtx = XEXP (use_body, 1);
17460 break;
17461 case PARALLEL:
17462 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17463 if (ix86_dep_by_shift_count_body (set_body,
17464 XVECEXP (use_body, 0, i)))
17465 return true;
17466 default:
17467 return false;
17468 break;
17469 }
17470
17471 if (shift_rtx
17472 && (GET_CODE (shift_rtx) == ASHIFT
17473 || GET_CODE (shift_rtx) == LSHIFTRT
17474 || GET_CODE (shift_rtx) == ASHIFTRT
17475 || GET_CODE (shift_rtx) == ROTATE
17476 || GET_CODE (shift_rtx) == ROTATERT))
17477 {
17478 rtx shift_count = XEXP (shift_rtx, 1);
17479
17480 /* Return true if shift count is dest of SET_BODY. */
17481 if (REG_P (shift_count))
17482 {
17483 /* Add this check since the function can be invoked before
17484 register allocation by the pre-reload scheduler. */
17485 if (reload_completed
17486 && true_regnum (set_dest) == true_regnum (shift_count))
17487 return true;
17488 else if (REGNO(set_dest) == REGNO(shift_count))
17489 return true;
17490 }
17491 }
17492
17493 return false;
17494 }
17495
17496 /* Return true if destination reg of SET_INSN is shift count of
17497 USE_INSN. */
17498
17499 bool
17500 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17501 {
17502 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17503 PATTERN (use_insn));
17504 }
17505
17506 /* Return TRUE or FALSE depending on whether the unary operator meets the
17507 appropriate constraints. */
17508
17509 bool
17510 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17511 enum machine_mode mode ATTRIBUTE_UNUSED,
17512 rtx operands[2] ATTRIBUTE_UNUSED)
17513 {
17514 /* If one of operands is memory, source and destination must match. */
17515 if ((MEM_P (operands[0])
17516 || MEM_P (operands[1]))
17517 && ! rtx_equal_p (operands[0], operands[1]))
17518 return false;
17519 return true;
17520 }
17521
17522 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17523 are ok, keeping in mind the possible movddup alternative. */
17524
17525 bool
17526 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17527 {
17528 if (MEM_P (operands[0]))
17529 return rtx_equal_p (operands[0], operands[1 + high]);
17530 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17531 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17532 return true;
17533 }
17534
17535 /* Post-reload splitter for converting an SF or DFmode value in an
17536 SSE register into an unsigned SImode. */
17537
17538 void
17539 ix86_split_convert_uns_si_sse (rtx operands[])
17540 {
17541 enum machine_mode vecmode;
17542 rtx value, large, zero_or_two31, input, two31, x;
17543
17544 large = operands[1];
17545 zero_or_two31 = operands[2];
17546 input = operands[3];
17547 two31 = operands[4];
17548 vecmode = GET_MODE (large);
17549 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17550
17551 /* Load up the value into the low element. We must ensure that the other
17552 elements are valid floats -- zero is the easiest such value. */
17553 if (MEM_P (input))
17554 {
17555 if (vecmode == V4SFmode)
17556 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17557 else
17558 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17559 }
17560 else
17561 {
17562 input = gen_rtx_REG (vecmode, REGNO (input));
17563 emit_move_insn (value, CONST0_RTX (vecmode));
17564 if (vecmode == V4SFmode)
17565 emit_insn (gen_sse_movss (value, value, input));
17566 else
17567 emit_insn (gen_sse2_movsd (value, value, input));
17568 }
17569
17570 emit_move_insn (large, two31);
17571 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17572
17573 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17574 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17575
17576 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17577 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17578
17579 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17580 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17581
17582 large = gen_rtx_REG (V4SImode, REGNO (large));
17583 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17584
17585 x = gen_rtx_REG (V4SImode, REGNO (value));
17586 if (vecmode == V4SFmode)
17587 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17588 else
17589 emit_insn (gen_sse2_cvttpd2dq (x, value));
17590 value = x;
17591
17592 emit_insn (gen_xorv4si3 (value, value, large));
17593 }
17594
17595 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17596 Expects the 64-bit DImode to be supplied in a pair of integral
17597 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17598 -mfpmath=sse, !optimize_size only. */
17599
17600 void
17601 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17602 {
17603 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17604 rtx int_xmm, fp_xmm;
17605 rtx biases, exponents;
17606 rtx x;
17607
17608 int_xmm = gen_reg_rtx (V4SImode);
17609 if (TARGET_INTER_UNIT_MOVES)
17610 emit_insn (gen_movdi_to_sse (int_xmm, input));
17611 else if (TARGET_SSE_SPLIT_REGS)
17612 {
17613 emit_clobber (int_xmm);
17614 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17615 }
17616 else
17617 {
17618 x = gen_reg_rtx (V2DImode);
17619 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17620 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17621 }
17622
17623 x = gen_rtx_CONST_VECTOR (V4SImode,
17624 gen_rtvec (4, GEN_INT (0x43300000UL),
17625 GEN_INT (0x45300000UL),
17626 const0_rtx, const0_rtx));
17627 exponents = validize_mem (force_const_mem (V4SImode, x));
17628
17629 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17630 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17631
17632 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
17633 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17634 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17635 (0x1.0p84 + double(fp_value_hi_xmm)).
17636 Note these exponents differ by 32. */
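/* Added worked example: for the input 0x0000000300000002 the low half yields
   0x1.0p52 + 2.0 and the high half 0x1.0p84 + 3.0 * 0x1.0p32; after the bias
   subtraction and the final addition we get 3.0 * 0x1.0p32 + 2.0
   = 12884901890.0, the DFmode value of the unsigned 64-bit input.  */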
17637
17638 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17639
17640 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17641 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17642 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17643 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17644 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17645 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17646 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17647 biases = validize_mem (force_const_mem (V2DFmode, biases));
17648 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17649
17650 /* Add the upper and lower DFmode values together. */
17651 if (TARGET_SSE3)
17652 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17653 else
17654 {
17655 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17656 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17657 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17658 }
17659
17660 ix86_expand_vector_extract (false, target, fp_xmm, 0);
17661 }
17662
17663 /* Not used, but eases macroization of patterns. */
17664 void
17665 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17666 rtx input ATTRIBUTE_UNUSED)
17667 {
17668 gcc_unreachable ();
17669 }
17670
17671 /* Convert an unsigned SImode value into a DFmode. Only currently used
17672 for SSE, but applicable anywhere. */
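/* Added comment: adding -2147483648 reinterprets the unsigned input as a
   signed value biased by -0x1.0p31; converting that signed value and adding
   0x1.0p31 back recovers the original, e.g. 0xffffffff ->
   2147483647.0 + 2147483648.0 = 4294967295.0.  */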
17673
17674 void
17675 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17676 {
17677 REAL_VALUE_TYPE TWO31r;
17678 rtx x, fp;
17679
17680 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17681 NULL, 1, OPTAB_DIRECT);
17682
17683 fp = gen_reg_rtx (DFmode);
17684 emit_insn (gen_floatsidf2 (fp, x));
17685
17686 real_ldexp (&TWO31r, &dconst1, 31);
17687 x = const_double_from_real_value (TWO31r, DFmode);
17688
17689 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17690 if (x != target)
17691 emit_move_insn (target, x);
17692 }
17693
17694 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17695 32-bit mode; otherwise we have a direct convert instruction. */
17696
17697 void
17698 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17699 {
17700 REAL_VALUE_TYPE TWO32r;
17701 rtx fp_lo, fp_hi, x;
17702
17703 fp_lo = gen_reg_rtx (DFmode);
17704 fp_hi = gen_reg_rtx (DFmode);
17705
17706 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17707
17708 real_ldexp (&TWO32r, &dconst1, 32);
17709 x = const_double_from_real_value (TWO32r, DFmode);
17710 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17711
17712 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17713
17714 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17715 0, OPTAB_DIRECT);
17716 if (x != target)
17717 emit_move_insn (target, x);
17718 }
17719
17720 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17721 For x86_32, -mfpmath=sse, !optimize_size only. */
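/* Added comment: the input is split into 16-bit halves, each of which
   converts to SFmode exactly, and the result is hi * 0x1.0p16 + lo;
   e.g. 0x12345678 -> 0x1234 * 65536.0 + 0x5678 = 305419896.0, exact up to
   the final SFmode rounding of the sum.  */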
17722 void
17723 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17724 {
17725 REAL_VALUE_TYPE ONE16r;
17726 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17727
17728 real_ldexp (&ONE16r, &dconst1, 16);
17729 x = const_double_from_real_value (ONE16r, SFmode);
17730 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17731 NULL, 0, OPTAB_DIRECT);
17732 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17733 NULL, 0, OPTAB_DIRECT);
17734 fp_hi = gen_reg_rtx (SFmode);
17735 fp_lo = gen_reg_rtx (SFmode);
17736 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17737 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17738 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17739 0, OPTAB_DIRECT);
17740 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17741 0, OPTAB_DIRECT);
17742 if (!rtx_equal_p (target, fp_hi))
17743 emit_move_insn (target, fp_hi);
17744 }
17745
17746 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17747 a vector of unsigned ints VAL to vector of floats TARGET. */
17748
17749 void
17750 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17751 {
17752 rtx tmp[8];
17753 REAL_VALUE_TYPE TWO16r;
17754 enum machine_mode intmode = GET_MODE (val);
17755 enum machine_mode fltmode = GET_MODE (target);
17756 rtx (*cvt) (rtx, rtx);
17757
17758 if (intmode == V4SImode)
17759 cvt = gen_floatv4siv4sf2;
17760 else
17761 cvt = gen_floatv8siv8sf2;
17762 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17763 tmp[0] = force_reg (intmode, tmp[0]);
17764 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17765 OPTAB_DIRECT);
17766 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17767 NULL_RTX, 1, OPTAB_DIRECT);
17768 tmp[3] = gen_reg_rtx (fltmode);
17769 emit_insn (cvt (tmp[3], tmp[1]));
17770 tmp[4] = gen_reg_rtx (fltmode);
17771 emit_insn (cvt (tmp[4], tmp[2]));
17772 real_ldexp (&TWO16r, &dconst1, 16);
17773 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17774 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17775 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17776 OPTAB_DIRECT);
17777 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17778 OPTAB_DIRECT);
17779 if (tmp[7] != target)
17780 emit_move_insn (target, tmp[7]);
17781 }
17782
17783 /* Adjust a V*SFmode/V*DFmode value VAL so that the *sfix_trunc* resp. fix_trunc*
17784 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17785 This is done by doing just a signed conversion if the value is < 0x1p31, and
17786 otherwise by subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
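/* Added comment, for example: a value such as 3e9 is first reduced to
   3e9 - 0x1.0p31 = 852516352.0, converted with the signed pattern, and then
   has bit 31 restored via the mask returned in *XORP, giving the unsigned
   result 3000000000; values below 0x1.0p31 pass through unchanged.  */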
17787
17788 rtx
17789 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17790 {
17791 REAL_VALUE_TYPE TWO31r;
17792 rtx two31r, tmp[4];
17793 enum machine_mode mode = GET_MODE (val);
17794 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17795 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17796 rtx (*cmp) (rtx, rtx, rtx, rtx);
17797 int i;
17798
17799 for (i = 0; i < 3; i++)
17800 tmp[i] = gen_reg_rtx (mode);
17801 real_ldexp (&TWO31r, &dconst1, 31);
17802 two31r = const_double_from_real_value (TWO31r, scalarmode);
17803 two31r = ix86_build_const_vector (mode, 1, two31r);
17804 two31r = force_reg (mode, two31r);
17805 switch (mode)
17806 {
17807 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17808 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17809 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17810 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17811 default: gcc_unreachable ();
17812 }
17813 tmp[3] = gen_rtx_LE (mode, two31r, val);
17814 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17815 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17816 0, OPTAB_DIRECT);
17817 if (intmode == V4SImode || TARGET_AVX2)
17818 *xorp = expand_simple_binop (intmode, ASHIFT,
17819 gen_lowpart (intmode, tmp[0]),
17820 GEN_INT (31), NULL_RTX, 0,
17821 OPTAB_DIRECT);
17822 else
17823 {
17824 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17825 two31 = ix86_build_const_vector (intmode, 1, two31);
17826 *xorp = expand_simple_binop (intmode, AND,
17827 gen_lowpart (intmode, tmp[0]),
17828 two31, NULL_RTX, 0,
17829 OPTAB_DIRECT);
17830 }
17831 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17832 0, OPTAB_DIRECT);
17833 }
17834
17835 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17836 then replicate the value for all elements of the vector
17837 register. */
17838
17839 rtx
17840 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17841 {
17842 int i, n_elt;
17843 rtvec v;
17844 enum machine_mode scalar_mode;
17845
17846 switch (mode)
17847 {
17848 case V32QImode:
17849 case V16QImode:
17850 case V16HImode:
17851 case V8HImode:
17852 case V8SImode:
17853 case V4SImode:
17854 case V4DImode:
17855 case V2DImode:
17856 gcc_assert (vect);
17857 case V8SFmode:
17858 case V4SFmode:
17859 case V4DFmode:
17860 case V2DFmode:
17861 n_elt = GET_MODE_NUNITS (mode);
17862 v = rtvec_alloc (n_elt);
17863 scalar_mode = GET_MODE_INNER (mode);
17864
17865 RTVEC_ELT (v, 0) = value;
17866
17867 for (i = 1; i < n_elt; ++i)
17868 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
17869
17870 return gen_rtx_CONST_VECTOR (mode, v);
17871
17872 default:
17873 gcc_unreachable ();
17874 }
17875 }
17876
17877 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
17878 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
17879 for an SSE register. If VECT is true, then replicate the mask for
17880 all elements of the vector register. If INVERT is true, then create
17881 a mask excluding the sign bit. */
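/* Added comment: for SFmode elements the resulting constant is 0x80000000
   per element (0x7fffffff when INVERT), and for DFmode elements it is
   0x8000000000000000 (0x7fffffffffffffff), replicated across the vector
   when VECT is true.  */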
17882
17883 rtx
17884 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
17885 {
17886 enum machine_mode vec_mode, imode;
17887 HOST_WIDE_INT hi, lo;
17888 int shift = 63;
17889 rtx v;
17890 rtx mask;
17891
17892 /* Find the sign bit, sign extended to 2*HWI. */
17893 switch (mode)
17894 {
17895 case V8SImode:
17896 case V4SImode:
17897 case V8SFmode:
17898 case V4SFmode:
17899 vec_mode = mode;
17900 mode = GET_MODE_INNER (mode);
17901 imode = SImode;
17902 lo = 0x80000000, hi = lo < 0;
17903 break;
17904
17905 case V4DImode:
17906 case V2DImode:
17907 case V4DFmode:
17908 case V2DFmode:
17909 vec_mode = mode;
17910 mode = GET_MODE_INNER (mode);
17911 imode = DImode;
17912 if (HOST_BITS_PER_WIDE_INT >= 64)
17913 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
17914 else
17915 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17916 break;
17917
17918 case TImode:
17919 case TFmode:
17920 vec_mode = VOIDmode;
17921 if (HOST_BITS_PER_WIDE_INT >= 64)
17922 {
17923 imode = TImode;
17924 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
17925 }
17926 else
17927 {
17928 rtvec vec;
17929
17930 imode = DImode;
17931 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
17932
17933 if (invert)
17934 {
17935 lo = ~lo, hi = ~hi;
17936 v = constm1_rtx;
17937 }
17938 else
17939 v = const0_rtx;
17940
17941 mask = immed_double_const (lo, hi, imode);
17942
17943 vec = gen_rtvec (2, v, mask);
17944 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
17945 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
17946
17947 return v;
17948 }
17949 break;
17950
17951 default:
17952 gcc_unreachable ();
17953 }
17954
17955 if (invert)
17956 lo = ~lo, hi = ~hi;
17957
17958 /* Force this value into the low part of a fp vector constant. */
17959 mask = immed_double_const (lo, hi, imode);
17960 mask = gen_lowpart (mode, mask);
17961
17962 if (vec_mode == VOIDmode)
17963 return force_reg (mode, mask);
17964
17965 v = ix86_build_const_vector (vec_mode, vect, mask);
17966 return force_reg (vec_mode, v);
17967 }
17968
17969 /* Generate code for floating point ABS or NEG. */
17970
17971 void
17972 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
17973 rtx operands[])
17974 {
17975 rtx mask, set, dst, src;
17976 bool use_sse = false;
17977 bool vector_mode = VECTOR_MODE_P (mode);
17978 enum machine_mode vmode = mode;
17979
17980 if (vector_mode)
17981 use_sse = true;
17982 else if (mode == TFmode)
17983 use_sse = true;
17984 else if (TARGET_SSE_MATH)
17985 {
17986 use_sse = SSE_FLOAT_MODE_P (mode);
17987 if (mode == SFmode)
17988 vmode = V4SFmode;
17989 else if (mode == DFmode)
17990 vmode = V2DFmode;
17991 }
17992
17993 /* NEG and ABS performed with SSE use bitwise mask operations.
17994 Create the appropriate mask now. */
17995 if (use_sse)
17996 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
17997 else
17998 mask = NULL_RTX;
17999
18000 dst = operands[0];
18001 src = operands[1];
18002
18003 set = gen_rtx_fmt_e (code, mode, src);
18004 set = gen_rtx_SET (VOIDmode, dst, set);
18005
18006 if (mask)
18007 {
18008 rtx use, clob;
18009 rtvec par;
18010
18011 use = gen_rtx_USE (VOIDmode, mask);
18012 if (vector_mode)
18013 par = gen_rtvec (2, set, use);
18014 else
18015 {
18016 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18017 par = gen_rtvec (3, set, use, clob);
18018 }
18019 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18020 }
18021 else
18022 emit_insn (set);
18023 }
18024
18025 /* Expand a copysign operation. Special case operand 0 being a constant. */
18026
18027 void
18028 ix86_expand_copysign (rtx operands[])
18029 {
18030 enum machine_mode mode, vmode;
18031 rtx dest, op0, op1, mask, nmask;
18032
18033 dest = operands[0];
18034 op0 = operands[1];
18035 op1 = operands[2];
18036
18037 mode = GET_MODE (dest);
18038
18039 if (mode == SFmode)
18040 vmode = V4SFmode;
18041 else if (mode == DFmode)
18042 vmode = V2DFmode;
18043 else
18044 vmode = mode;
18045
18046 if (GET_CODE (op0) == CONST_DOUBLE)
18047 {
18048 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18049
18050 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18051 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18052
18053 if (mode == SFmode || mode == DFmode)
18054 {
18055 if (op0 == CONST0_RTX (mode))
18056 op0 = CONST0_RTX (vmode);
18057 else
18058 {
18059 rtx v = ix86_build_const_vector (vmode, false, op0);
18060
18061 op0 = force_reg (vmode, v);
18062 }
18063 }
18064 else if (op0 != CONST0_RTX (mode))
18065 op0 = force_reg (mode, op0);
18066
18067 mask = ix86_build_signbit_mask (vmode, 0, 0);
18068
18069 if (mode == SFmode)
18070 copysign_insn = gen_copysignsf3_const;
18071 else if (mode == DFmode)
18072 copysign_insn = gen_copysigndf3_const;
18073 else
18074 copysign_insn = gen_copysigntf3_const;
18075
18076 emit_insn (copysign_insn (dest, op0, op1, mask));
18077 }
18078 else
18079 {
18080 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18081
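      /* MASK selects just the sign bit; NMASK is its complement and keeps
	 everything except the sign bit.  The splitters below combine the
	 magnitude of op0 with the sign of op1.  */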
18082 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18083 mask = ix86_build_signbit_mask (vmode, 0, 0);
18084
18085 if (mode == SFmode)
18086 copysign_insn = gen_copysignsf3_var;
18087 else if (mode == DFmode)
18088 copysign_insn = gen_copysigndf3_var;
18089 else
18090 copysign_insn = gen_copysigntf3_var;
18091
18092 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18093 }
18094 }
18095
18096 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18097 be a constant, and so has already been expanded into a vector constant. */
18098
18099 void
18100 ix86_split_copysign_const (rtx operands[])
18101 {
18102 enum machine_mode mode, vmode;
18103 rtx dest, op0, mask, x;
18104
18105 dest = operands[0];
18106 op0 = operands[1];
18107 mask = operands[3];
18108
18109 mode = GET_MODE (dest);
18110 vmode = GET_MODE (mask);
18111
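  /* Keep only the sign bit of DEST, then OR in the constant magnitude
     bits (skipped when the constant is all zeros).  */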
18112 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18113 x = gen_rtx_AND (vmode, dest, mask);
18114 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18115
18116 if (op0 != CONST0_RTX (vmode))
18117 {
18118 x = gen_rtx_IOR (vmode, dest, op0);
18119 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18120 }
18121 }
18122
18123 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18124 so we have to do two masks. */
18125
18126 void
18127 ix86_split_copysign_var (rtx operands[])
18128 {
18129 enum machine_mode mode, vmode;
18130 rtx dest, scratch, op0, op1, mask, nmask, x;
18131
18132 dest = operands[0];
18133 scratch = operands[1];
18134 op0 = operands[2];
18135 op1 = operands[3];
18136 nmask = operands[4];
18137 mask = operands[5];
18138
18139 mode = GET_MODE (dest);
18140 vmode = GET_MODE (mask);
18141
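  /* The end result is DEST = (op0 & nmask) | (op1 & mask): magnitude from
     op0, sign from op1.  The register-number checks below discover which
     operands the chosen insn alternative has tied together.  */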
18142 if (rtx_equal_p (op0, op1))
18143 {
18144 /* Shouldn't happen often (it's useless, obviously), but when it does
18145 we'd generate incorrect code if we continue below. */
18146 emit_move_insn (dest, op0);
18147 return;
18148 }
18149
18150 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18151 {
18152 gcc_assert (REGNO (op1) == REGNO (scratch));
18153
18154 x = gen_rtx_AND (vmode, scratch, mask);
18155 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18156
18157 dest = mask;
18158 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18159 x = gen_rtx_NOT (vmode, dest);
18160 x = gen_rtx_AND (vmode, x, op0);
18161 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18162 }
18163 else
18164 {
18165 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18166 {
18167 x = gen_rtx_AND (vmode, scratch, mask);
18168 }
18169 else /* alternative 2,4 */
18170 {
18171 gcc_assert (REGNO (mask) == REGNO (scratch));
18172 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18173 x = gen_rtx_AND (vmode, scratch, op1);
18174 }
18175 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18176
18177 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18178 {
18179 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18180 x = gen_rtx_AND (vmode, dest, nmask);
18181 }
18182 else /* alternative 3,4 */
18183 {
18184 gcc_assert (REGNO (nmask) == REGNO (dest));
18185 dest = nmask;
18186 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18187 x = gen_rtx_AND (vmode, dest, op0);
18188 }
18189 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18190 }
18191
18192 x = gen_rtx_IOR (vmode, dest, scratch);
18193 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18194 }
18195
18196 /* Return TRUE or FALSE depending on whether the first SET in INSN
18197    has source and destination with matching CC modes, and whether the
18198    CC mode is at least as constrained as REQ_MODE.  */
18199
18200 bool
18201 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18202 {
18203 rtx set;
18204 enum machine_mode set_mode;
18205
18206 set = PATTERN (insn);
18207 if (GET_CODE (set) == PARALLEL)
18208 set = XVECEXP (set, 0, 0);
18209 gcc_assert (GET_CODE (set) == SET);
18210 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18211
18212 set_mode = GET_MODE (SET_DEST (set));
18213 switch (set_mode)
18214 {
18215 case CCNOmode:
18216 if (req_mode != CCNOmode
18217 && (req_mode != CCmode
18218 || XEXP (SET_SRC (set), 1) != const0_rtx))
18219 return false;
18220 break;
18221 case CCmode:
18222 if (req_mode == CCGCmode)
18223 return false;
18224 /* FALLTHRU */
18225 case CCGCmode:
18226 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18227 return false;
18228 /* FALLTHRU */
18229 case CCGOCmode:
18230 if (req_mode == CCZmode)
18231 return false;
18232 /* FALLTHRU */
18233 case CCZmode:
18234 break;
18235
18236 case CCAmode:
18237 case CCCmode:
18238 case CCOmode:
18239 case CCSmode:
18240 if (set_mode != req_mode)
18241 return false;
18242 break;
18243
18244 default:
18245 gcc_unreachable ();
18246 }
18247
18248 return GET_MODE (SET_SRC (set)) == set_mode;
18249 }
18250
18251 /* Generate insn patterns to do an integer compare of OPERANDS. */
18252
18253 static rtx
18254 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18255 {
18256 enum machine_mode cmpmode;
18257 rtx tmp, flags;
18258
18259 cmpmode = SELECT_CC_MODE (code, op0, op1);
18260 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18261
18262 /* This is very simple, but making the interface the same as in the
18263 FP case makes the rest of the code easier. */
18264 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18265 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18266
18267 /* Return the test that should be put into the flags user, i.e.
18268 the bcc, scc, or cmov instruction. */
18269 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18270 }
18271
18272 /* Figure out whether to use ordered or unordered fp comparisons.
18273 Return the appropriate mode to use. */
18274
18275 enum machine_mode
18276 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18277 {
18278   /* ??? In order to make all comparisons reversible, we do all comparisons
18279      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
18280      all forms of trapping and nontrapping comparisons, we can make inequality
18281      comparisons trapping again, since that results in better code when using
18282      FCOM based compares.  */
18283 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18284 }
18285
18286 enum machine_mode
18287 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18288 {
18289 enum machine_mode mode = GET_MODE (op0);
18290
18291 if (SCALAR_FLOAT_MODE_P (mode))
18292 {
18293 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18294 return ix86_fp_compare_mode (code);
18295 }
18296
18297 switch (code)
18298 {
18299 /* Only zero flag is needed. */
18300 case EQ: /* ZF=0 */
18301 case NE: /* ZF!=0 */
18302 return CCZmode;
18303 /* Codes needing carry flag. */
18304 case GEU: /* CF=0 */
18305 case LTU: /* CF=1 */
18306 /* Detect overflow checks. They need just the carry flag. */
18307 if (GET_CODE (op0) == PLUS
18308 && rtx_equal_p (op1, XEXP (op0, 0)))
18309 return CCCmode;
18310 else
18311 return CCmode;
18312 case GTU: /* CF=0 & ZF=0 */
18313 case LEU: /* CF=1 | ZF=1 */
18314 /* Detect overflow checks. They need just the carry flag. */
18315 if (GET_CODE (op0) == MINUS
18316 && rtx_equal_p (op1, XEXP (op0, 0)))
18317 return CCCmode;
18318 else
18319 return CCmode;
18320 /* Codes possibly doable only with sign flag when
18321 comparing against zero. */
18322 case GE: /* SF=OF or SF=0 */
18323 case LT: /* SF<>OF or SF=1 */
18324 if (op1 == const0_rtx)
18325 return CCGOCmode;
18326 else
18327 /* For other cases Carry flag is not required. */
18328 return CCGCmode;
18329       /* Codes doable only with the sign flag when comparing
18330          against zero, but we lack a jump instruction for that,
18331          so we need to use relational tests against overflow,
18332          which thus needs to be zero.  */
18333 case GT: /* ZF=0 & SF=OF */
18334 case LE: /* ZF=1 | SF<>OF */
18335 if (op1 == const0_rtx)
18336 return CCNOmode;
18337 else
18338 return CCGCmode;
18339       /* The strcmp pattern does (use flags), and combine may ask us for the
18340          proper mode.  */
18341 case USE:
18342 return CCmode;
18343 default:
18344 gcc_unreachable ();
18345 }
18346 }
18347
18348 /* Return the fixed registers used for condition codes. */
18349
18350 static bool
18351 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18352 {
18353 *p1 = FLAGS_REG;
18354 *p2 = FPSR_REG;
18355 return true;
18356 }
18357
18358 /* If two condition code modes are compatible, return a condition code
18359 mode which is compatible with both. Otherwise, return
18360 VOIDmode. */
18361
18362 static enum machine_mode
18363 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18364 {
18365 if (m1 == m2)
18366 return m1;
18367
18368 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18369 return VOIDmode;
18370
18371 if ((m1 == CCGCmode && m2 == CCGOCmode)
18372 || (m1 == CCGOCmode && m2 == CCGCmode))
18373 return CCGCmode;
18374
18375 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18376 return m2;
18377 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18378 return m1;
18379
18380 switch (m1)
18381 {
18382 default:
18383 gcc_unreachable ();
18384
18385 case CCmode:
18386 case CCGCmode:
18387 case CCGOCmode:
18388 case CCNOmode:
18389 case CCAmode:
18390 case CCCmode:
18391 case CCOmode:
18392 case CCSmode:
18393 case CCZmode:
18394 switch (m2)
18395 {
18396 default:
18397 return VOIDmode;
18398
18399 case CCmode:
18400 case CCGCmode:
18401 case CCGOCmode:
18402 case CCNOmode:
18403 case CCAmode:
18404 case CCCmode:
18405 case CCOmode:
18406 case CCSmode:
18407 case CCZmode:
18408 return CCmode;
18409 }
18410
18411 case CCFPmode:
18412 case CCFPUmode:
18413 /* These are only compatible with themselves, which we already
18414 checked above. */
18415 return VOIDmode;
18416 }
18417 }
18418
18419
18420 /* Return a comparison we can do that is equivalent to
18421    swap_condition (code), except possibly for orderedness.
18422    Never change orderedness if TARGET_IEEE_FP; return
18423    UNKNOWN in that case if necessary.  */
18424
18425 static enum rtx_code
18426 ix86_fp_swap_condition (enum rtx_code code)
18427 {
18428 switch (code)
18429 {
18430 case GT: /* GTU - CF=0 & ZF=0 */
18431 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18432 case GE: /* GEU - CF=0 */
18433 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18434 case UNLT: /* LTU - CF=1 */
18435 return TARGET_IEEE_FP ? UNKNOWN : GT;
18436 case UNLE: /* LEU - CF=1 | ZF=1 */
18437 return TARGET_IEEE_FP ? UNKNOWN : GE;
18438 default:
18439 return swap_condition (code);
18440 }
18441 }
18442
18443 /* Return the cost of comparison CODE using the best strategy for performance.
18444    All the following functions use the number of instructions as the cost metric.
18445    In the future this should be tweaked to compute bytes for optimize_size and
18446    take into account the performance of various instructions on various CPUs.  */
18447
18448 static int
18449 ix86_fp_comparison_cost (enum rtx_code code)
18450 {
18451 int arith_cost;
18452
18453 /* The cost of code using bit-twiddling on %ah. */
18454 switch (code)
18455 {
18456 case UNLE:
18457 case UNLT:
18458 case LTGT:
18459 case GT:
18460 case GE:
18461 case UNORDERED:
18462 case ORDERED:
18463 case UNEQ:
18464 arith_cost = 4;
18465 break;
18466 case LT:
18467 case NE:
18468 case EQ:
18469 case UNGE:
18470 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18471 break;
18472 case LE:
18473 case UNGT:
18474 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18475 break;
18476 default:
18477 gcc_unreachable ();
18478 }
18479
18480 switch (ix86_fp_comparison_strategy (code))
18481 {
18482 case IX86_FPCMP_COMI:
18483 return arith_cost > 4 ? 3 : 2;
18484 case IX86_FPCMP_SAHF:
18485 return arith_cost > 4 ? 4 : 3;
18486 default:
18487 return arith_cost;
18488 }
18489 }
18490
18491 /* Return the strategy to use for floating-point comparisons.  We assume that
18492    fcomi is always preferable where available, since that is also true for size
18493    (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test).  */
18494
18495 enum ix86_fpcmp_strategy
18496 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18497 {
18498 /* Do fcomi/sahf based test when profitable. */
18499
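  /* fcomi/fucomi appeared together with cmov (i686 and later), so
     TARGET_CMOVE doubles as the availability test here.  */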
18500 if (TARGET_CMOVE)
18501 return IX86_FPCMP_COMI;
18502
18503 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18504 return IX86_FPCMP_SAHF;
18505
18506 return IX86_FPCMP_ARITH;
18507 }
18508
18509 /* Swap, force into registers, or otherwise massage the two operands
18510 to a fp comparison. The operands are updated in place; the new
18511 comparison code is returned. */
18512
18513 static enum rtx_code
18514 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18515 {
18516 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18517 rtx op0 = *pop0, op1 = *pop1;
18518 enum machine_mode op_mode = GET_MODE (op0);
18519 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18520
18521 /* All of the unordered compare instructions only work on registers.
18522 The same is true of the fcomi compare instructions. The XFmode
18523 compare instructions require registers except when comparing
18524 against zero or when converting operand 1 from fixed point to
18525 floating point. */
18526
18527 if (!is_sse
18528 && (fpcmp_mode == CCFPUmode
18529 || (op_mode == XFmode
18530 && ! (standard_80387_constant_p (op0) == 1
18531 || standard_80387_constant_p (op1) == 1)
18532 && GET_CODE (op1) != FLOAT)
18533 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18534 {
18535 op0 = force_reg (op_mode, op0);
18536 op1 = force_reg (op_mode, op1);
18537 }
18538 else
18539 {
18540 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18541 things around if they appear profitable, otherwise force op0
18542 into a register. */
18543
18544 if (standard_80387_constant_p (op0) == 0
18545 || (MEM_P (op0)
18546 && ! (standard_80387_constant_p (op1) == 0
18547 || MEM_P (op1))))
18548 {
18549 enum rtx_code new_code = ix86_fp_swap_condition (code);
18550 if (new_code != UNKNOWN)
18551 {
18552 rtx tmp;
18553 tmp = op0, op0 = op1, op1 = tmp;
18554 code = new_code;
18555 }
18556 }
18557
18558 if (!REG_P (op0))
18559 op0 = force_reg (op_mode, op0);
18560
18561 if (CONSTANT_P (op1))
18562 {
18563 int tmp = standard_80387_constant_p (op1);
18564 if (tmp == 0)
18565 op1 = validize_mem (force_const_mem (op_mode, op1));
18566 else if (tmp == 1)
18567 {
18568 if (TARGET_CMOVE)
18569 op1 = force_reg (op_mode, op1);
18570 }
18571 else
18572 op1 = force_reg (op_mode, op1);
18573 }
18574 }
18575
18576 /* Try to rearrange the comparison to make it cheaper. */
18577 if (ix86_fp_comparison_cost (code)
18578 > ix86_fp_comparison_cost (swap_condition (code))
18579 && (REG_P (op1) || can_create_pseudo_p ()))
18580 {
18581 rtx tmp;
18582 tmp = op0, op0 = op1, op1 = tmp;
18583 code = swap_condition (code);
18584 if (!REG_P (op0))
18585 op0 = force_reg (op_mode, op0);
18586 }
18587
18588 *pop0 = op0;
18589 *pop1 = op1;
18590 return code;
18591 }
18592
18593 /* Convert comparison codes we use to represent FP comparison to integer
18594 code that will result in proper branch. Return UNKNOWN if no such code
18595 is available. */
18596
18597 enum rtx_code
18598 ix86_fp_compare_code_to_integer (enum rtx_code code)
18599 {
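  /* After fcomi or fnstsw/sahf, CF/ZF behave like the flags of an unsigned
     integer compare (with NaNs setting CF, ZF and PF), which is why the
     ordered codes map to their unsigned counterparts here.  */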
18600 switch (code)
18601 {
18602 case GT:
18603 return GTU;
18604 case GE:
18605 return GEU;
18606 case ORDERED:
18607 case UNORDERED:
18608 return code;
18609 break;
18610 case UNEQ:
18611 return EQ;
18612 break;
18613 case UNLT:
18614 return LTU;
18615 break;
18616 case UNLE:
18617 return LEU;
18618 break;
18619 case LTGT:
18620 return NE;
18621 break;
18622 default:
18623 return UNKNOWN;
18624 }
18625 }
18626
18627 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18628
18629 static rtx
18630 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18631 {
18632 enum machine_mode fpcmp_mode, intcmp_mode;
18633 rtx tmp, tmp2;
18634
18635 fpcmp_mode = ix86_fp_compare_mode (code);
18636 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18637
18638 /* Do fcomi/sahf based test when profitable. */
18639 switch (ix86_fp_comparison_strategy (code))
18640 {
18641 case IX86_FPCMP_COMI:
18642 intcmp_mode = fpcmp_mode;
18643 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18644 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18645 tmp);
18646 emit_insn (tmp);
18647 break;
18648
18649 case IX86_FPCMP_SAHF:
18650 intcmp_mode = fpcmp_mode;
18651 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18652 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18653 tmp);
18654
18655 if (!scratch)
18656 scratch = gen_reg_rtx (HImode);
18657 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18658 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18659 break;
18660
18661 case IX86_FPCMP_ARITH:
18662 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18663 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18664 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18665 if (!scratch)
18666 scratch = gen_reg_rtx (HImode);
18667 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18668
18669 /* In the unordered case, we have to check C2 for NaN's, which
18670 doesn't happen to work out to anything nice combination-wise.
18671 So do some bit twiddling on the value we've got in AH to come
18672 up with an appropriate set of condition codes. */
18673
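      /* After fnstsw, AH holds the FPU condition bits: C0 is bit 0 (0x01),
	 C2 is bit 2 (0x04) and C3 is bit 6 (0x40); 0x45 therefore tests
	 C3, C2 and C0 at once.  */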
18674 intcmp_mode = CCNOmode;
18675 switch (code)
18676 {
18677 case GT:
18678 case UNGT:
18679 if (code == GT || !TARGET_IEEE_FP)
18680 {
18681 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18682 code = EQ;
18683 }
18684 else
18685 {
18686 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18687 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18688 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18689 intcmp_mode = CCmode;
18690 code = GEU;
18691 }
18692 break;
18693 case LT:
18694 case UNLT:
18695 if (code == LT && TARGET_IEEE_FP)
18696 {
18697 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18698 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18699 intcmp_mode = CCmode;
18700 code = EQ;
18701 }
18702 else
18703 {
18704 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18705 code = NE;
18706 }
18707 break;
18708 case GE:
18709 case UNGE:
18710 if (code == GE || !TARGET_IEEE_FP)
18711 {
18712 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18713 code = EQ;
18714 }
18715 else
18716 {
18717 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18718 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18719 code = NE;
18720 }
18721 break;
18722 case LE:
18723 case UNLE:
18724 if (code == LE && TARGET_IEEE_FP)
18725 {
18726 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18727 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18728 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18729 intcmp_mode = CCmode;
18730 code = LTU;
18731 }
18732 else
18733 {
18734 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18735 code = NE;
18736 }
18737 break;
18738 case EQ:
18739 case UNEQ:
18740 if (code == EQ && TARGET_IEEE_FP)
18741 {
18742 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18743 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18744 intcmp_mode = CCmode;
18745 code = EQ;
18746 }
18747 else
18748 {
18749 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18750 code = NE;
18751 }
18752 break;
18753 case NE:
18754 case LTGT:
18755 if (code == NE && TARGET_IEEE_FP)
18756 {
18757 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18758 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18759 GEN_INT (0x40)));
18760 code = NE;
18761 }
18762 else
18763 {
18764 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18765 code = EQ;
18766 }
18767 break;
18768
18769 case UNORDERED:
18770 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18771 code = NE;
18772 break;
18773 case ORDERED:
18774 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18775 code = EQ;
18776 break;
18777
18778 default:
18779 gcc_unreachable ();
18780 }
18781 break;
18782
18783 default:
18784 gcc_unreachable();
18785 }
18786
18787 /* Return the test that should be put into the flags user, i.e.
18788 the bcc, scc, or cmov instruction. */
18789 return gen_rtx_fmt_ee (code, VOIDmode,
18790 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18791 const0_rtx);
18792 }
18793
18794 static rtx
18795 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18796 {
18797 rtx ret;
18798
18799 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18800 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18801
18802 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18803 {
18804 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18805 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18806 }
18807 else
18808 ret = ix86_expand_int_compare (code, op0, op1);
18809
18810 return ret;
18811 }
18812
18813 void
18814 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18815 {
18816 enum machine_mode mode = GET_MODE (op0);
18817 rtx tmp;
18818
18819 switch (mode)
18820 {
18821 case SFmode:
18822 case DFmode:
18823 case XFmode:
18824 case QImode:
18825 case HImode:
18826 case SImode:
18827 simple:
18828 tmp = ix86_expand_compare (code, op0, op1);
18829 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18830 gen_rtx_LABEL_REF (VOIDmode, label),
18831 pc_rtx);
18832 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18833 return;
18834
18835 case DImode:
18836 if (TARGET_64BIT)
18837 goto simple;
18838 case TImode:
18839 /* Expand DImode branch into multiple compare+branch. */
18840 {
18841 rtx lo[2], hi[2], label2;
18842 enum rtx_code code1, code2, code3;
18843 enum machine_mode submode;
18844
18845 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18846 {
18847 tmp = op0, op0 = op1, op1 = tmp;
18848 code = swap_condition (code);
18849 }
18850
18851 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18852 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18853
18854 submode = mode == DImode ? SImode : DImode;
18855
18856 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18857 avoid two branches. This costs one extra insn, so disable when
18858 optimizing for size. */
18859
18860 if ((code == EQ || code == NE)
18861 && (!optimize_insn_for_size_p ()
18862 || hi[1] == const0_rtx || lo[1] == const0_rtx))
18863 {
18864 rtx xor0, xor1;
18865
18866 xor1 = hi[0];
18867 if (hi[1] != const0_rtx)
18868 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
18869 NULL_RTX, 0, OPTAB_WIDEN);
18870
18871 xor0 = lo[0];
18872 if (lo[1] != const0_rtx)
18873 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
18874 NULL_RTX, 0, OPTAB_WIDEN);
18875
18876 tmp = expand_binop (submode, ior_optab, xor1, xor0,
18877 NULL_RTX, 0, OPTAB_WIDEN);
18878
18879 ix86_expand_branch (code, tmp, const0_rtx, label);
18880 return;
18881 }
18882
18883         /* Otherwise, if we are doing a less-than or greater-than-or-equal
18884            comparison, op1 is a constant and the low word is zero, then we
18885            can just examine the high word.  Similarly for a low word of -1
18886            and less-than-or-equal or greater-than.  */
18887
18888 if (CONST_INT_P (hi[1]))
18889 switch (code)
18890 {
18891 case LT: case LTU: case GE: case GEU:
18892 if (lo[1] == const0_rtx)
18893 {
18894 ix86_expand_branch (code, hi[0], hi[1], label);
18895 return;
18896 }
18897 break;
18898 case LE: case LEU: case GT: case GTU:
18899 if (lo[1] == constm1_rtx)
18900 {
18901 ix86_expand_branch (code, hi[0], hi[1], label);
18902 return;
18903 }
18904 break;
18905 default:
18906 break;
18907 }
18908
18909 /* Otherwise, we need two or three jumps. */
18910
18911 label2 = gen_label_rtx ();
18912
18913 code1 = code;
18914 code2 = swap_condition (code);
18915 code3 = unsigned_condition (code);
18916
18917 switch (code)
18918 {
18919 case LT: case GT: case LTU: case GTU:
18920 break;
18921
18922 case LE: code1 = LT; code2 = GT; break;
18923 case GE: code1 = GT; code2 = LT; break;
18924 case LEU: code1 = LTU; code2 = GTU; break;
18925 case GEU: code1 = GTU; code2 = LTU; break;
18926
18927 case EQ: code1 = UNKNOWN; code2 = NE; break;
18928 case NE: code2 = UNKNOWN; break;
18929
18930 default:
18931 gcc_unreachable ();
18932 }
18933
18934 /*
18935 * a < b =>
18936 * if (hi(a) < hi(b)) goto true;
18937 * if (hi(a) > hi(b)) goto false;
18938 * if (lo(a) < lo(b)) goto true;
18939 * false:
18940 */
18941
18942 if (code1 != UNKNOWN)
18943 ix86_expand_branch (code1, hi[0], hi[1], label);
18944 if (code2 != UNKNOWN)
18945 ix86_expand_branch (code2, hi[0], hi[1], label2);
18946
18947 ix86_expand_branch (code3, lo[0], lo[1], label);
18948
18949 if (code2 != UNKNOWN)
18950 emit_label (label2);
18951 return;
18952 }
18953
18954 default:
18955 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
18956 goto simple;
18957 }
18958 }
18959
18960 /* Split branch based on floating point condition. */
18961 void
18962 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
18963 rtx target1, rtx target2, rtx tmp, rtx pushed)
18964 {
18965 rtx condition;
18966 rtx i;
18967
18968 if (target2 != pc_rtx)
18969 {
18970 rtx tmp = target2;
18971 code = reverse_condition_maybe_unordered (code);
18972 target2 = target1;
18973 target1 = tmp;
18974 }
18975
18976 condition = ix86_expand_fp_compare (code, op1, op2,
18977 tmp);
18978
18979 /* Remove pushed operand from stack. */
18980 if (pushed)
18981 ix86_free_from_memory (GET_MODE (pushed));
18982
18983 i = emit_jump_insn (gen_rtx_SET
18984 (VOIDmode, pc_rtx,
18985 gen_rtx_IF_THEN_ELSE (VOIDmode,
18986 condition, target1, target2)));
18987 if (split_branch_probability >= 0)
18988 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
18989 }
18990
18991 void
18992 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
18993 {
18994 rtx ret;
18995
18996 gcc_assert (GET_MODE (dest) == QImode);
18997
18998 ret = ix86_expand_compare (code, op0, op1);
18999 PUT_MODE (ret, QImode);
19000 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19001 }
19002
19003 /* Expand comparison setting or clearing carry flag. Return true when
19004 successful and set pop for the operation. */
19005 static bool
19006 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19007 {
19008 enum machine_mode mode =
19009 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19010
19011   /* Do not handle double-mode compares, which go through the special path.  */
19012 if (mode == (TARGET_64BIT ? TImode : DImode))
19013 return false;
19014
19015 if (SCALAR_FLOAT_MODE_P (mode))
19016 {
19017 rtx compare_op, compare_seq;
19018
19019 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19020
19021       /* Shortcut:  the following common codes never translate
19022          into carry flag compares.  */
19023 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19024 || code == ORDERED || code == UNORDERED)
19025 return false;
19026
19027       /* These comparisons require the zero flag; swap the operands so they won't need it.  */
19028 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19029 && !TARGET_IEEE_FP)
19030 {
19031 rtx tmp = op0;
19032 op0 = op1;
19033 op1 = tmp;
19034 code = swap_condition (code);
19035 }
19036
19037       /* Try to expand the comparison and verify that we end up with
19038          a carry flag based comparison.  This fails only when we decide
19039          to expand the comparison using arithmetic, which is not a
19040          common scenario.  */
19041 start_sequence ();
19042 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19043 compare_seq = get_insns ();
19044 end_sequence ();
19045
19046 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19047 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19048 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19049 else
19050 code = GET_CODE (compare_op);
19051
19052 if (code != LTU && code != GEU)
19053 return false;
19054
19055 emit_insn (compare_seq);
19056 *pop = compare_op;
19057 return true;
19058 }
19059
19060 if (!INTEGRAL_MODE_P (mode))
19061 return false;
19062
19063 switch (code)
19064 {
19065 case LTU:
19066 case GEU:
19067 break;
19068
19069 /* Convert a==0 into (unsigned)a<1. */
19070 case EQ:
19071 case NE:
19072 if (op1 != const0_rtx)
19073 return false;
19074 op1 = const1_rtx;
19075 code = (code == EQ ? LTU : GEU);
19076 break;
19077
19078     /* Convert a>b into b<a or a>=b+1.  */
19079 case GTU:
19080 case LEU:
19081 if (CONST_INT_P (op1))
19082 {
19083 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19084           /* Bail out on overflow.  We could still swap the operands, but
19085              that would force loading of the constant into a register.  */
19086 if (op1 == const0_rtx
19087 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19088 return false;
19089 code = (code == GTU ? GEU : LTU);
19090 }
19091 else
19092 {
19093 rtx tmp = op1;
19094 op1 = op0;
19095 op0 = tmp;
19096 code = (code == GTU ? LTU : GEU);
19097 }
19098 break;
19099
19100 /* Convert a>=0 into (unsigned)a<0x80000000. */
19101 case LT:
19102 case GE:
19103 if (mode == DImode || op1 != const0_rtx)
19104 return false;
19105 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19106 code = (code == LT ? GEU : LTU);
19107 break;
19108 case LE:
19109 case GT:
19110 if (mode == DImode || op1 != constm1_rtx)
19111 return false;
19112 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19113 code = (code == LE ? GEU : LTU);
19114 break;
19115
19116 default:
19117 return false;
19118 }
19119   /* Swapping operands may cause a constant to appear as the first operand.  */
19120 if (!nonimmediate_operand (op0, VOIDmode))
19121 {
19122 if (!can_create_pseudo_p ())
19123 return false;
19124 op0 = force_reg (mode, op0);
19125 }
19126 *pop = ix86_expand_compare (code, op0, op1);
19127 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19128 return true;
19129 }
19130
19131 bool
19132 ix86_expand_int_movcc (rtx operands[])
19133 {
19134 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19135 rtx compare_seq, compare_op;
19136 enum machine_mode mode = GET_MODE (operands[0]);
19137 bool sign_bit_compare_p = false;
19138 rtx op0 = XEXP (operands[1], 0);
19139 rtx op1 = XEXP (operands[1], 1);
19140
19141 if (GET_MODE (op0) == TImode
19142 || (GET_MODE (op0) == DImode
19143 && !TARGET_64BIT))
19144 return false;
19145
19146 start_sequence ();
19147 compare_op = ix86_expand_compare (code, op0, op1);
19148 compare_seq = get_insns ();
19149 end_sequence ();
19150
19151 compare_code = GET_CODE (compare_op);
19152
19153 if ((op1 == const0_rtx && (code == GE || code == LT))
19154 || (op1 == constm1_rtx && (code == GT || code == LE)))
19155 sign_bit_compare_p = true;
19156
19157 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19158 HImode insns, we'd be swallowed in word prefix ops. */
19159
19160 if ((mode != HImode || TARGET_FAST_PREFIX)
19161 && (mode != (TARGET_64BIT ? TImode : DImode))
19162 && CONST_INT_P (operands[2])
19163 && CONST_INT_P (operands[3]))
19164 {
19165 rtx out = operands[0];
19166 HOST_WIDE_INT ct = INTVAL (operands[2]);
19167 HOST_WIDE_INT cf = INTVAL (operands[3]);
19168 HOST_WIDE_INT diff;
19169
19170 diff = ct - cf;
19171       /* Sign bit compares are better done using shifts than by using
19172          sbb.  */
19173 if (sign_bit_compare_p
19174 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19175 {
19176 /* Detect overlap between destination and compare sources. */
19177 rtx tmp = out;
19178
19179 if (!sign_bit_compare_p)
19180 {
19181 rtx flags;
19182 bool fpcmp = false;
19183
19184 compare_code = GET_CODE (compare_op);
19185
19186 flags = XEXP (compare_op, 0);
19187
19188 if (GET_MODE (flags) == CCFPmode
19189 || GET_MODE (flags) == CCFPUmode)
19190 {
19191 fpcmp = true;
19192 compare_code
19193 = ix86_fp_compare_code_to_integer (compare_code);
19194 }
19195
19196 /* To simplify rest of code, restrict to the GEU case. */
19197 if (compare_code == LTU)
19198 {
19199 HOST_WIDE_INT tmp = ct;
19200 ct = cf;
19201 cf = tmp;
19202 compare_code = reverse_condition (compare_code);
19203 code = reverse_condition (code);
19204 }
19205 else
19206 {
19207 if (fpcmp)
19208 PUT_CODE (compare_op,
19209 reverse_condition_maybe_unordered
19210 (GET_CODE (compare_op)));
19211 else
19212 PUT_CODE (compare_op,
19213 reverse_condition (GET_CODE (compare_op)));
19214 }
19215 diff = ct - cf;
19216
19217 if (reg_overlap_mentioned_p (out, op0)
19218 || reg_overlap_mentioned_p (out, op1))
19219 tmp = gen_reg_rtx (mode);
19220
19221 if (mode == DImode)
19222 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19223 else
19224 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19225 flags, compare_op));
19226 }
19227 else
19228 {
19229 if (code == GT || code == GE)
19230 code = reverse_condition (code);
19231 else
19232 {
19233 HOST_WIDE_INT tmp = ct;
19234 ct = cf;
19235 cf = tmp;
19236 diff = ct - cf;
19237 }
19238 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19239 }
19240
19241 if (diff == 1)
19242 {
19243 /*
19244 * cmpl op0,op1
19245 * sbbl dest,dest
19246 * [addl dest, ct]
19247 *
19248 * Size 5 - 8.
19249 */
19250 if (ct)
19251 tmp = expand_simple_binop (mode, PLUS,
19252 tmp, GEN_INT (ct),
19253 copy_rtx (tmp), 1, OPTAB_DIRECT);
19254 }
19255 else if (cf == -1)
19256 {
19257 /*
19258 * cmpl op0,op1
19259 * sbbl dest,dest
19260 * orl $ct, dest
19261 *
19262 * Size 8.
19263 */
19264 tmp = expand_simple_binop (mode, IOR,
19265 tmp, GEN_INT (ct),
19266 copy_rtx (tmp), 1, OPTAB_DIRECT);
19267 }
19268 else if (diff == -1 && ct)
19269 {
19270 /*
19271 * cmpl op0,op1
19272 * sbbl dest,dest
19273 * notl dest
19274 * [addl dest, cf]
19275 *
19276 * Size 8 - 11.
19277 */
19278 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19279 if (cf)
19280 tmp = expand_simple_binop (mode, PLUS,
19281 copy_rtx (tmp), GEN_INT (cf),
19282 copy_rtx (tmp), 1, OPTAB_DIRECT);
19283 }
19284 else
19285 {
19286 /*
19287 * cmpl op0,op1
19288 * sbbl dest,dest
19289 * [notl dest]
19290 * andl cf - ct, dest
19291 * [addl dest, ct]
19292 *
19293 * Size 8 - 11.
19294 */
19295
19296 if (cf == 0)
19297 {
19298 cf = ct;
19299 ct = 0;
19300 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19301 }
19302
19303 tmp = expand_simple_binop (mode, AND,
19304 copy_rtx (tmp),
19305 gen_int_mode (cf - ct, mode),
19306 copy_rtx (tmp), 1, OPTAB_DIRECT);
19307 if (ct)
19308 tmp = expand_simple_binop (mode, PLUS,
19309 copy_rtx (tmp), GEN_INT (ct),
19310 copy_rtx (tmp), 1, OPTAB_DIRECT);
19311 }
19312
19313 if (!rtx_equal_p (tmp, out))
19314 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19315
19316 return true;
19317 }
19318
19319 if (diff < 0)
19320 {
19321 enum machine_mode cmp_mode = GET_MODE (op0);
19322
19323 HOST_WIDE_INT tmp;
19324 tmp = ct, ct = cf, cf = tmp;
19325 diff = -diff;
19326
19327 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19328 {
19329 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19330
19331             /* We may be reversing an unordered compare to a normal compare,
19332                which is not valid in general (we may convert a non-trapping
19333                condition to a trapping one); however, on i386 we currently
19334                emit all comparisons unordered.  */
19335 compare_code = reverse_condition_maybe_unordered (compare_code);
19336 code = reverse_condition_maybe_unordered (code);
19337 }
19338 else
19339 {
19340 compare_code = reverse_condition (compare_code);
19341 code = reverse_condition (code);
19342 }
19343 }
19344
19345 compare_code = UNKNOWN;
19346 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19347 && CONST_INT_P (op1))
19348 {
19349 if (op1 == const0_rtx
19350 && (code == LT || code == GE))
19351 compare_code = code;
19352 else if (op1 == constm1_rtx)
19353 {
19354 if (code == LE)
19355 compare_code = LT;
19356 else if (code == GT)
19357 compare_code = GE;
19358 }
19359 }
19360
19361 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19362 if (compare_code != UNKNOWN
19363 && GET_MODE (op0) == GET_MODE (out)
19364 && (cf == -1 || ct == -1))
19365 {
19366 /* If lea code below could be used, only optimize
19367 if it results in a 2 insn sequence. */
19368
19369 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19370 || diff == 3 || diff == 5 || diff == 9)
19371 || (compare_code == LT && ct == -1)
19372 || (compare_code == GE && cf == -1))
19373 {
19374 /*
19375 * notl op1 (if necessary)
19376 * sarl $31, op1
19377 * orl cf, op1
19378 */
19379 if (ct != -1)
19380 {
19381 cf = ct;
19382 ct = -1;
19383 code = reverse_condition (code);
19384 }
19385
19386 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19387
19388 out = expand_simple_binop (mode, IOR,
19389 out, GEN_INT (cf),
19390 out, 1, OPTAB_DIRECT);
19391 if (out != operands[0])
19392 emit_move_insn (operands[0], out);
19393
19394 return true;
19395 }
19396 }
19397
19398
19399 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19400 || diff == 3 || diff == 5 || diff == 9)
19401 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19402 && (mode != DImode
19403 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19404 {
19405 /*
19406 * xorl dest,dest
19407 * cmpl op1,op2
19408 * setcc dest
19409 * lea cf(dest*(ct-cf)),dest
19410 *
19411 * Size 14.
19412 *
19413 * This also catches the degenerate setcc-only case.
19414 */
19415
19416 rtx tmp;
19417 int nops;
19418
19419 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19420
19421 nops = 0;
19422       /* On x86_64 the lea instruction operates on Pmode, so we need
19423          to get the arithmetic done in the proper mode to match.  */
19424 if (diff == 1)
19425 tmp = copy_rtx (out);
19426 else
19427 {
19428 rtx out1;
19429 out1 = copy_rtx (out);
19430 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19431 nops++;
19432 if (diff & 1)
19433 {
19434 tmp = gen_rtx_PLUS (mode, tmp, out1);
19435 nops++;
19436 }
19437 }
19438 if (cf != 0)
19439 {
19440 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19441 nops++;
19442 }
19443 if (!rtx_equal_p (tmp, out))
19444 {
19445 if (nops == 1)
19446 out = force_operand (tmp, copy_rtx (out));
19447 else
19448 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19449 }
19450 if (!rtx_equal_p (out, operands[0]))
19451 emit_move_insn (operands[0], copy_rtx (out));
19452
19453 return true;
19454 }
19455
19456 /*
19457 * General case: Jumpful:
19458 * xorl dest,dest cmpl op1, op2
19459 * cmpl op1, op2 movl ct, dest
19460 * setcc dest jcc 1f
19461 * decl dest movl cf, dest
19462 * andl (cf-ct),dest 1:
19463 * addl ct,dest
19464 *
19465 * Size 20. Size 14.
19466 *
19467 * This is reasonably steep, but branch mispredict costs are
19468 * high on modern cpus, so consider failing only if optimizing
19469 * for space.
19470 */
19471
19472 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19473 && BRANCH_COST (optimize_insn_for_speed_p (),
19474 false) >= 2)
19475 {
19476 if (cf == 0)
19477 {
19478 enum machine_mode cmp_mode = GET_MODE (op0);
19479
19480 cf = ct;
19481 ct = 0;
19482
19483 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19484 {
19485 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19486
19487               /* We may be reversing an unordered compare to a normal compare,
19488                  which is not valid in general (we may convert a non-trapping
19489                  condition to a trapping one); however, on i386 we currently
19490                  emit all comparisons unordered.  */
19491 code = reverse_condition_maybe_unordered (code);
19492 }
19493 else
19494 {
19495 code = reverse_condition (code);
19496 if (compare_code != UNKNOWN)
19497 compare_code = reverse_condition (compare_code);
19498 }
19499 }
19500
19501 if (compare_code != UNKNOWN)
19502 {
19503 /* notl op1 (if needed)
19504 sarl $31, op1
19505 andl (cf-ct), op1
19506 addl ct, op1
19507
19508 For x < 0 (resp. x <= -1) there will be no notl,
19509 so if possible swap the constants to get rid of the
19510 complement.
19511 True/false will be -1/0 while code below (store flag
19512 followed by decrement) is 0/-1, so the constants need
19513 to be exchanged once more. */
19514
19515 if (compare_code == GE || !cf)
19516 {
19517 code = reverse_condition (code);
19518 compare_code = LT;
19519 }
19520 else
19521 {
19522 HOST_WIDE_INT tmp = cf;
19523 cf = ct;
19524 ct = tmp;
19525 }
19526
19527 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19528 }
19529 else
19530 {
19531 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19532
19533 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19534 constm1_rtx,
19535 copy_rtx (out), 1, OPTAB_DIRECT);
19536 }
19537
19538 out = expand_simple_binop (mode, AND, copy_rtx (out),
19539 gen_int_mode (cf - ct, mode),
19540 copy_rtx (out), 1, OPTAB_DIRECT);
19541 if (ct)
19542 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19543 copy_rtx (out), 1, OPTAB_DIRECT);
19544 if (!rtx_equal_p (out, operands[0]))
19545 emit_move_insn (operands[0], copy_rtx (out));
19546
19547 return true;
19548 }
19549 }
19550
19551 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19552 {
19553 /* Try a few things more with specific constants and a variable. */
19554
19555 optab op;
19556 rtx var, orig_out, out, tmp;
19557
19558 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19559 return false;
19560
19561 /* If one of the two operands is an interesting constant, load a
19562 constant with the above and mask it in with a logical operation. */
19563
19564 if (CONST_INT_P (operands[2]))
19565 {
19566 var = operands[3];
19567 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19568 operands[3] = constm1_rtx, op = and_optab;
19569 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19570 operands[3] = const0_rtx, op = ior_optab;
19571 else
19572 return false;
19573 }
19574 else if (CONST_INT_P (operands[3]))
19575 {
19576 var = operands[2];
19577 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19578 operands[2] = constm1_rtx, op = and_optab;
19579           else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19580 operands[2] = const0_rtx, op = ior_optab;
19581 else
19582 return false;
19583 }
19584 else
19585 return false;
19586
19587 orig_out = operands[0];
19588 tmp = gen_reg_rtx (mode);
19589 operands[0] = tmp;
19590
19591 /* Recurse to get the constant loaded. */
19592 if (ix86_expand_int_movcc (operands) == 0)
19593 return false;
19594
19595 /* Mask in the interesting variable. */
19596 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19597 OPTAB_WIDEN);
19598 if (!rtx_equal_p (out, orig_out))
19599 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19600
19601 return true;
19602 }
19603
19604 /*
19605 * For comparison with above,
19606 *
19607 * movl cf,dest
19608 * movl ct,tmp
19609 * cmpl op1,op2
19610 * cmovcc tmp,dest
19611 *
19612 * Size 15.
19613 */
19614
19615 if (! nonimmediate_operand (operands[2], mode))
19616 operands[2] = force_reg (mode, operands[2]);
19617 if (! nonimmediate_operand (operands[3], mode))
19618 operands[3] = force_reg (mode, operands[3]);
19619
19620 if (! register_operand (operands[2], VOIDmode)
19621 && (mode == QImode
19622 || ! register_operand (operands[3], VOIDmode)))
19623 operands[2] = force_reg (mode, operands[2]);
19624
19625 if (mode == QImode
19626 && ! register_operand (operands[3], VOIDmode))
19627 operands[3] = force_reg (mode, operands[3]);
19628
19629 emit_insn (compare_seq);
19630 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19631 gen_rtx_IF_THEN_ELSE (mode,
19632 compare_op, operands[2],
19633 operands[3])));
19634 return true;
19635 }
19636
19637 /* Swap, force into registers, or otherwise massage the two operands
19638 to an sse comparison with a mask result. Thus we differ a bit from
19639 ix86_prepare_fp_compare_args which expects to produce a flags result.
19640
19641 The DEST operand exists to help determine whether to commute commutative
19642 operators. The POP0/POP1 operands are updated in place. The new
19643 comparison code is returned, or UNKNOWN if not implementable. */
19644
19645 static enum rtx_code
19646 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19647 rtx *pop0, rtx *pop1)
19648 {
19649 rtx tmp;
19650
19651 switch (code)
19652 {
19653 case LTGT:
19654 case UNEQ:
19655 /* AVX supports all the needed comparisons. */
19656 if (TARGET_AVX)
19657 break;
19658 /* We have no LTGT as an operator. We could implement it with
19659 NE & ORDERED, but this requires an extra temporary. It's
19660 not clear that it's worth it. */
19661 return UNKNOWN;
19662
19663 case LT:
19664 case LE:
19665 case UNGT:
19666 case UNGE:
19667 /* These are supported directly. */
19668 break;
19669
19670 case EQ:
19671 case NE:
19672 case UNORDERED:
19673 case ORDERED:
19674 /* AVX has 3 operand comparisons, no need to swap anything. */
19675 if (TARGET_AVX)
19676 break;
19677 /* For commutative operators, try to canonicalize the destination
19678 operand to be first in the comparison - this helps reload to
19679 avoid extra moves. */
19680 if (!dest || !rtx_equal_p (dest, *pop1))
19681 break;
19682 /* FALLTHRU */
19683
19684 case GE:
19685 case GT:
19686 case UNLE:
19687 case UNLT:
19688 /* These are not supported directly before AVX, and furthermore
19689 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19690 comparison operands to transform into something that is
19691 supported. */
19692 tmp = *pop0;
19693 *pop0 = *pop1;
19694 *pop1 = tmp;
19695 code = swap_condition (code);
19696 break;
19697
19698 default:
19699 gcc_unreachable ();
19700 }
19701
19702 return code;
19703 }
19704
19705 /* Detect conditional moves that exactly match min/max operational
19706 semantics. Note that this is IEEE safe, as long as we don't
19707 interchange the operands.
19708
19709 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19710 and TRUE if the operation is successful and instructions are emitted. */
19711
19712 static bool
19713 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19714 rtx cmp_op1, rtx if_true, rtx if_false)
19715 {
19716 enum machine_mode mode;
19717 bool is_min;
19718 rtx tmp;
19719
19720 if (code == LT)
19721 ;
19722 else if (code == UNGE)
19723 {
19724 tmp = if_true;
19725 if_true = if_false;
19726 if_false = tmp;
19727 }
19728 else
19729 return false;
19730
19731 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19732 is_min = true;
19733 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19734 is_min = false;
19735 else
19736 return false;
19737
19738 mode = GET_MODE (dest);
19739
19740 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19741 but MODE may be a vector mode and thus not appropriate. */
19742 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19743 {
19744 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19745 rtvec v;
19746
19747 if_true = force_reg (mode, if_true);
19748 v = gen_rtvec (2, if_true, if_false);
19749 tmp = gen_rtx_UNSPEC (mode, v, u);
19750 }
19751 else
19752 {
19753 code = is_min ? SMIN : SMAX;
19754 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19755 }
19756
19757 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19758 return true;
19759 }
19760
19761 /* Expand an sse vector comparison. Return the register with the result. */
19762
19763 static rtx
19764 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19765 rtx op_true, rtx op_false)
19766 {
19767 enum machine_mode mode = GET_MODE (dest);
19768 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19769 rtx x;
19770
19771 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19772 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19773 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19774
19775 if (optimize
19776 || reg_overlap_mentioned_p (dest, op_true)
19777 || reg_overlap_mentioned_p (dest, op_false))
19778 dest = gen_reg_rtx (mode);
19779
19780 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19781 if (cmp_mode != mode)
19782 {
19783 x = force_reg (cmp_mode, x);
19784 convert_move (dest, x, false);
19785 }
19786 else
19787 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19788
19789 return dest;
19790 }
19791
19792 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19793 operations. This is used for both scalar and vector conditional moves. */
19794
19795 static void
19796 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19797 {
19798 enum machine_mode mode = GET_MODE (dest);
19799 rtx t2, t3, x;
19800
19801 if (vector_all_ones_operand (op_true, mode)
19802 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19803 {
19804 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19805 }
19806 else if (op_false == CONST0_RTX (mode))
19807 {
19808 op_true = force_reg (mode, op_true);
19809 x = gen_rtx_AND (mode, cmp, op_true);
19810 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19811 }
19812 else if (op_true == CONST0_RTX (mode))
19813 {
19814 op_false = force_reg (mode, op_false);
19815 x = gen_rtx_NOT (mode, cmp);
19816 x = gen_rtx_AND (mode, x, op_false);
19817 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19818 }
19819 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19820 {
19821 op_false = force_reg (mode, op_false);
19822 x = gen_rtx_IOR (mode, cmp, op_false);
19823 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19824 }
19825 else if (TARGET_XOP)
19826 {
19827 op_true = force_reg (mode, op_true);
19828
19829 if (!nonimmediate_operand (op_false, mode))
19830 op_false = force_reg (mode, op_false);
19831
19832 emit_insn (gen_rtx_SET (mode, dest,
19833 gen_rtx_IF_THEN_ELSE (mode, cmp,
19834 op_true,
19835 op_false)));
19836 }
19837 else
19838 {
19839 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19840
19841 if (!nonimmediate_operand (op_true, mode))
19842 op_true = force_reg (mode, op_true);
19843
19844 op_false = force_reg (mode, op_false);
19845
19846 switch (mode)
19847 {
19848 case V4SFmode:
19849 if (TARGET_SSE4_1)
19850 gen = gen_sse4_1_blendvps;
19851 break;
19852 case V2DFmode:
19853 if (TARGET_SSE4_1)
19854 gen = gen_sse4_1_blendvpd;
19855 break;
19856 case V16QImode:
19857 case V8HImode:
19858 case V4SImode:
19859 case V2DImode:
19860 if (TARGET_SSE4_1)
19861 {
19862 gen = gen_sse4_1_pblendvb;
19863 dest = gen_lowpart (V16QImode, dest);
19864 op_false = gen_lowpart (V16QImode, op_false);
19865 op_true = gen_lowpart (V16QImode, op_true);
19866 cmp = gen_lowpart (V16QImode, cmp);
19867 }
19868 break;
19869 case V8SFmode:
19870 if (TARGET_AVX)
19871 gen = gen_avx_blendvps256;
19872 break;
19873 case V4DFmode:
19874 if (TARGET_AVX)
19875 gen = gen_avx_blendvpd256;
19876 break;
19877 case V32QImode:
19878 case V16HImode:
19879 case V8SImode:
19880 case V4DImode:
19881 if (TARGET_AVX2)
19882 {
19883 gen = gen_avx2_pblendvb;
19884 dest = gen_lowpart (V32QImode, dest);
19885 op_false = gen_lowpart (V32QImode, op_false);
19886 op_true = gen_lowpart (V32QImode, op_true);
19887 cmp = gen_lowpart (V32QImode, cmp);
19888 }
19889 break;
19890 default:
19891 break;
19892 }
19893
19894 if (gen != NULL)
19895 emit_insn (gen (dest, op_false, op_true, cmp));
19896 else
19897 {
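	  /* No blend instruction available: compute
	     dest = (op_true & cmp) | (op_false & ~cmp) with three
	     logical operations.  */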
19898 op_true = force_reg (mode, op_true);
19899
19900 t2 = gen_reg_rtx (mode);
19901 if (optimize)
19902 t3 = gen_reg_rtx (mode);
19903 else
19904 t3 = dest;
19905
19906 x = gen_rtx_AND (mode, op_true, cmp);
19907 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
19908
19909 x = gen_rtx_NOT (mode, cmp);
19910 x = gen_rtx_AND (mode, x, op_false);
19911 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
19912
19913 x = gen_rtx_IOR (mode, t3, t2);
19914 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19915 }
19916 }
19917 }
19918
19919 /* Expand a floating-point conditional move. Return true if successful. */
19920
19921 bool
19922 ix86_expand_fp_movcc (rtx operands[])
19923 {
19924 enum machine_mode mode = GET_MODE (operands[0]);
19925 enum rtx_code code = GET_CODE (operands[1]);
19926 rtx tmp, compare_op;
19927 rtx op0 = XEXP (operands[1], 0);
19928 rtx op1 = XEXP (operands[1], 1);
19929
19930 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
19931 {
19932 enum machine_mode cmode;
19933
19934 /* Since we've no cmove for sse registers, don't force bad register
19935 allocation just to gain access to it. Deny movcc when the
19936 comparison mode doesn't match the move mode. */
19937 cmode = GET_MODE (op0);
19938 if (cmode == VOIDmode)
19939 cmode = GET_MODE (op1);
19940 if (cmode != mode)
19941 return false;
19942
19943 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
19944 if (code == UNKNOWN)
19945 return false;
19946
19947 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
19948 operands[2], operands[3]))
19949 return true;
19950
19951 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
19952 operands[2], operands[3]);
19953 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
19954 return true;
19955 }
19956
19957 if (GET_MODE (op0) == TImode
19958 || (GET_MODE (op0) == DImode
19959 && !TARGET_64BIT))
19960 return false;
19961
19962 /* The floating point conditional move instructions don't directly
19963 support conditions resulting from a signed integer comparison. */
19964
19965 compare_op = ix86_expand_compare (code, op0, op1);
19966 if (!fcmov_comparison_operator (compare_op, VOIDmode))
19967 {
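      /* Materialize the comparison into a QImode 0/1 value with setcc and
	 emit the conditional move on TMP != 0 instead.  */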
19968 tmp = gen_reg_rtx (QImode);
19969 ix86_expand_setcc (tmp, code, op0, op1);
19970
19971 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
19972 }
19973
19974 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19975 gen_rtx_IF_THEN_ELSE (mode, compare_op,
19976 operands[2], operands[3])));
19977
19978 return true;
19979 }
19980
19981 /* Expand a floating-point vector conditional move; a vcond operation
19982 rather than a movcc operation. */
19983
19984 bool
19985 ix86_expand_fp_vcond (rtx operands[])
19986 {
19987 enum rtx_code code = GET_CODE (operands[3]);
19988 rtx cmp;
19989
19990 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
19991 &operands[4], &operands[5]);
19992 if (code == UNKNOWN)
19993 {
19994 rtx temp;
19995 switch (GET_CODE (operands[3]))
19996 {
19997 case LTGT:
19998 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
19999 operands[5], operands[0], operands[0]);
20000 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20001 operands[5], operands[1], operands[2]);
20002 code = AND;
20003 break;
20004 case UNEQ:
20005 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20006 operands[5], operands[0], operands[0]);
20007 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20008 operands[5], operands[1], operands[2]);
20009 code = IOR;
20010 break;
20011 default:
20012 gcc_unreachable ();
20013 }
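      /* LTGT is computed here as ORDERED & NE, and UNEQ as UNORDERED | EQ;
	 the two comparison masks built above are combined accordingly.  */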
20014 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20015 OPTAB_DIRECT);
20016 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20017 return true;
20018 }
20019
20020 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20021 operands[5], operands[1], operands[2]))
20022 return true;
20023
20024 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20025 operands[1], operands[2]);
20026 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20027 return true;
20028 }
20029
20030 /* Expand a signed/unsigned integral vector conditional move. */
20031
20032 bool
20033 ix86_expand_int_vcond (rtx operands[])
20034 {
20035 enum machine_mode data_mode = GET_MODE (operands[0]);
20036 enum machine_mode mode = GET_MODE (operands[4]);
20037 enum rtx_code code = GET_CODE (operands[3]);
20038 bool negate = false;
20039 rtx x, cop0, cop1;
20040
20041 cop0 = operands[4];
20042 cop1 = operands[5];
20043
20044 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20045 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20046 if ((code == LT || code == GE)
20047 && data_mode == mode
20048 && cop1 == CONST0_RTX (mode)
20049 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20050 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20051 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20052 && (GET_MODE_SIZE (data_mode) == 16
20053 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20054 {
20055 rtx negop = operands[2 - (code == LT)];
20056 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20057 if (negop == CONST1_RTX (data_mode))
20058 {
20059 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20060 operands[0], 1, OPTAB_DIRECT);
20061 if (res != operands[0])
20062 emit_move_insn (operands[0], res);
20063 return true;
20064 }
20065 else if (GET_MODE_INNER (data_mode) != DImode
20066 && vector_all_ones_operand (negop, data_mode))
20067 {
20068 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20069 operands[0], 0, OPTAB_DIRECT);
20070 if (res != operands[0])
20071 emit_move_insn (operands[0], res);
20072 return true;
20073 }
20074 }
20075
20076 if (!nonimmediate_operand (cop1, mode))
20077 cop1 = force_reg (mode, cop1);
20078 if (!general_operand (operands[1], data_mode))
20079 operands[1] = force_reg (data_mode, operands[1]);
20080 if (!general_operand (operands[2], data_mode))
20081 operands[2] = force_reg (data_mode, operands[2]);
20082
20083 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20084 if (TARGET_XOP
20085 && (mode == V16QImode || mode == V8HImode
20086 || mode == V4SImode || mode == V2DImode))
20087 ;
20088 else
20089 {
20090 /* Canonicalize the comparison to EQ, GT, GTU. */
20091 switch (code)
20092 {
20093 case EQ:
20094 case GT:
20095 case GTU:
20096 break;
20097
20098 case NE:
20099 case LE:
20100 case LEU:
20101 code = reverse_condition (code);
20102 negate = true;
20103 break;
20104
20105 case GE:
20106 case GEU:
20107 code = reverse_condition (code);
20108 negate = true;
20109 /* FALLTHRU */
20110
20111 case LT:
20112 case LTU:
20113 code = swap_condition (code);
20114 x = cop0, cop0 = cop1, cop1 = x;
20115 break;
20116
20117 default:
20118 gcc_unreachable ();
20119 }
20120
20121 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20122 if (mode == V2DImode)
20123 {
20124 switch (code)
20125 {
20126 case EQ:
20127 /* SSE4.1 supports EQ. */
20128 if (!TARGET_SSE4_1)
20129 return false;
20130 break;
20131
20132 case GT:
20133 case GTU:
20134 /* SSE4.2 supports GT/GTU. */
20135 if (!TARGET_SSE4_2)
20136 return false;
20137 break;
20138
20139 default:
20140 gcc_unreachable ();
20141 }
20142 }
20143
20144 	  /* Unsigned parallel compare is not supported by the hardware.
20145 	     Play some tricks to turn this into a signed comparison or a
20146 	     comparison against 0.  */
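	  /* For example, with 32-bit elements a >u b is equivalent to
	     (a - 0x80000000) >s (b - 0x80000000): subtracting the sign-bit
	     mask (equivalently, flipping the sign bit) maps unsigned order
	     onto signed order.  */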
20147 if (code == GTU)
20148 {
20149 cop0 = force_reg (mode, cop0);
20150
20151 switch (mode)
20152 {
20153 case V8SImode:
20154 case V4DImode:
20155 case V4SImode:
20156 case V2DImode:
20157 {
20158 rtx t1, t2, mask;
20159 rtx (*gen_sub3) (rtx, rtx, rtx);
20160
20161 switch (mode)
20162 {
20163 case V8SImode: gen_sub3 = gen_subv8si3; break;
20164 case V4DImode: gen_sub3 = gen_subv4di3; break;
20165 case V4SImode: gen_sub3 = gen_subv4si3; break;
20166 case V2DImode: gen_sub3 = gen_subv2di3; break;
20167 default:
20168 gcc_unreachable ();
20169 }
20170 /* Subtract (-(INT MAX) - 1) from both operands to make
20171 them signed. */
20172 mask = ix86_build_signbit_mask (mode, true, false);
20173 t1 = gen_reg_rtx (mode);
20174 emit_insn (gen_sub3 (t1, cop0, mask));
20175
20176 t2 = gen_reg_rtx (mode);
20177 emit_insn (gen_sub3 (t2, cop1, mask));
20178
20179 cop0 = t1;
20180 cop1 = t2;
20181 code = GT;
20182 }
20183 break;
20184
20185 case V32QImode:
20186 case V16HImode:
20187 case V16QImode:
20188 case V8HImode:
20189 /* Perform a parallel unsigned saturating subtraction. */
20190 x = gen_reg_rtx (mode);
20191 emit_insn (gen_rtx_SET (VOIDmode, x,
20192 gen_rtx_US_MINUS (mode, cop0, cop1)));
20193
20194 cop0 = x;
20195 cop1 = CONST0_RTX (mode);
20196 code = EQ;
20197 negate = !negate;
20198 break;
20199
20200 default:
20201 gcc_unreachable ();
20202 }
20203 }
20204 }
20205
20206 /* Allow the comparison to be done in one mode, but the movcc to
20207 happen in another mode. */
20208 if (data_mode == mode)
20209 {
20210 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20211 operands[1+negate], operands[2-negate]);
20212 }
20213 else
20214 {
20215 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20216 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20217 code, cop0, cop1,
20218 operands[1+negate], operands[2-negate]);
20219 x = gen_lowpart (data_mode, x);
20220 }
20221
20222 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20223 operands[2-negate]);
20224 return true;
20225 }
20226
20227 /* Expand a variable vector permutation. */
20228
20229 void
20230 ix86_expand_vec_perm (rtx operands[])
20231 {
20232 rtx target = operands[0];
20233 rtx op0 = operands[1];
20234 rtx op1 = operands[2];
20235 rtx mask = operands[3];
20236 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20237 enum machine_mode mode = GET_MODE (op0);
20238 enum machine_mode maskmode = GET_MODE (mask);
20239 int w, e, i;
20240 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20241
20242 /* Number of elements in the vector. */
20243 w = GET_MODE_NUNITS (mode);
20244 e = GET_MODE_UNIT_SIZE (mode);
20245 gcc_assert (w <= 32);
20246
20247 if (TARGET_AVX2)
20248 {
20249 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20250 {
20251 	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20252 	     a constant shuffle operand.  With a tiny bit of effort we can
20253 	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
20254 	     unfortunate but there's no avoiding it.
20255 	     Similarly, for V16HImode we don't have instructions for variable
20256 	     shuffling, while for V32QImode we can use vpshufb; vpshufb;
20257 	     vpermq; vpor after preparing suitable masks.  */
20258
20259 if (mode == V16HImode)
20260 {
20261 maskmode = mode = V32QImode;
20262 w = 32;
20263 e = 1;
20264 }
20265 else
20266 {
20267 maskmode = mode = V8SImode;
20268 w = 8;
20269 e = 4;
20270 }
20271 t1 = gen_reg_rtx (maskmode);
20272
20273 /* Replicate the low bits of the V4DImode mask into V8SImode:
20274 mask = { A B C D }
20275 t1 = { A A B B C C D D }. */
20276 for (i = 0; i < w / 2; ++i)
20277 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20278 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20279 vt = force_reg (maskmode, vt);
20280 mask = gen_lowpart (maskmode, mask);
20281 if (maskmode == V8SImode)
20282 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20283 else
20284 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20285
20286 	  /* Multiply the shuffle indices by two.  */
20287 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20288 OPTAB_DIRECT);
20289
20290 	  /* Add one to the odd shuffle indices:
20291 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20292 for (i = 0; i < w / 2; ++i)
20293 {
20294 vec[i * 2] = const0_rtx;
20295 vec[i * 2 + 1] = const1_rtx;
20296 }
20297 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20298 vt = force_const_mem (maskmode, vt);
20299 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20300 OPTAB_DIRECT);
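	  /* For instance, a V4DImode mask of { 1 3 0 2 } has by now become
	     the V8SImode control { 2 3 6 7 0 1 4 5 }, which makes VPERMD
	     pick up both SImode halves of each requested DImode element.  */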
20301
20302 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20303 operands[3] = mask = t1;
20304 target = gen_lowpart (mode, target);
20305 op0 = gen_lowpart (mode, op0);
20306 op1 = gen_lowpart (mode, op1);
20307 }
20308
20309 switch (mode)
20310 {
20311 case V8SImode:
20312 /* The VPERMD and VPERMPS instructions already properly ignore
20313 the high bits of the shuffle elements. No need for us to
20314 perform an AND ourselves. */
20315 if (one_operand_shuffle)
20316 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20317 else
20318 {
20319 t1 = gen_reg_rtx (V8SImode);
20320 t2 = gen_reg_rtx (V8SImode);
20321 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20322 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20323 goto merge_two;
20324 }
20325 return;
20326
20327 case V8SFmode:
20328 mask = gen_lowpart (V8SFmode, mask);
20329 if (one_operand_shuffle)
20330 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20331 else
20332 {
20333 t1 = gen_reg_rtx (V8SFmode);
20334 t2 = gen_reg_rtx (V8SFmode);
20335 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20336 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20337 goto merge_two;
20338 }
20339 return;
20340
20341 case V4SImode:
20342 /* By combining the two 128-bit input vectors into one 256-bit
20343 input vector, we can use VPERMD and VPERMPS for the full
20344 two-operand shuffle. */
20345 t1 = gen_reg_rtx (V8SImode);
20346 t2 = gen_reg_rtx (V8SImode);
20347 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20348 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20349 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20350 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20351 return;
20352
20353 case V4SFmode:
20354 t1 = gen_reg_rtx (V8SFmode);
20355 t2 = gen_reg_rtx (V8SImode);
20356 mask = gen_lowpart (V4SImode, mask);
20357 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20358 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20359 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20360 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20361 return;
20362
20363 case V32QImode:
20364 t1 = gen_reg_rtx (V32QImode);
20365 t2 = gen_reg_rtx (V32QImode);
20366 t3 = gen_reg_rtx (V32QImode);
20367 vt2 = GEN_INT (128);
20368 for (i = 0; i < 32; i++)
20369 vec[i] = vt2;
20370 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20371 vt = force_reg (V32QImode, vt);
20372 for (i = 0; i < 32; i++)
20373 vec[i] = i < 16 ? vt2 : const0_rtx;
20374 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20375 vt2 = force_reg (V32QImode, vt2);
20376 /* From mask create two adjusted masks, which contain the same
20377 bits as mask in the low 7 bits of each vector element.
20378 The first mask will have the most significant bit clear
20379 if it requests element from the same 128-bit lane
20380 and MSB set if it requests element from the other 128-bit lane.
20381 The second mask will have the opposite values of the MSB,
20382 and additionally will have its 128-bit lanes swapped.
20383 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20384 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20385 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20386 stands for other 12 bytes. */
20387 	  /* The bit that says whether an element comes from the same lane or
20388 	     the other lane is bit 4, so shift it up by 3 to the MSB position.  */
20389 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20390 gen_lowpart (V4DImode, mask),
20391 GEN_INT (3)));
20392 /* Clear MSB bits from the mask just in case it had them set. */
20393 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20394 	  /* After this t1 will have the MSB set for elements from the other lane.  */
20395 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20396 /* Clear bits other than MSB. */
20397 emit_insn (gen_andv32qi3 (t1, t1, vt));
20398 /* Or in the lower bits from mask into t3. */
20399 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20400 /* And invert MSB bits in t1, so MSB is set for elements from the same
20401 lane. */
20402 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20403 /* Swap 128-bit lanes in t3. */
20404 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20405 gen_lowpart (V4DImode, t3),
20406 const2_rtx, GEN_INT (3),
20407 const0_rtx, const1_rtx));
20408 /* And or in the lower bits from mask into t1. */
20409 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20410 if (one_operand_shuffle)
20411 {
20412 	      /* Each of these shuffles will put 0s in places where an
20413 		 element from the other 128-bit lane is needed, and otherwise
20414 		 will shuffle in the requested value.  */
20415 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20416 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20417 /* For t3 the 128-bit lanes are swapped again. */
20418 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20419 gen_lowpart (V4DImode, t3),
20420 const2_rtx, GEN_INT (3),
20421 const0_rtx, const1_rtx));
20422 	      /* And ORing both together yields the result.  */
20423 emit_insn (gen_iorv32qi3 (target, t1, t3));
20424 return;
20425 }
20426
20427 t4 = gen_reg_rtx (V32QImode);
20428 	  /* Similar to the one_operand_shuffle code above, just
20429 	     repeated for each of the two operands.  The merge_two:
20430 	     code below will merge the two results together.  */
20431 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20432 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20433 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20434 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20435 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20436 gen_lowpart (V4DImode, t4),
20437 const2_rtx, GEN_INT (3),
20438 const0_rtx, const1_rtx));
20439 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20440 gen_lowpart (V4DImode, t3),
20441 const2_rtx, GEN_INT (3),
20442 const0_rtx, const1_rtx));
20443 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20444 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20445 t1 = t4;
20446 t2 = t3;
20447 goto merge_two;
20448
20449 default:
20450 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20451 break;
20452 }
20453 }
20454
20455 if (TARGET_XOP)
20456 {
20457 /* The XOP VPPERM insn supports three inputs. By ignoring the
20458 one_operand_shuffle special case, we avoid creating another
20459 set of constant vectors in memory. */
20460 one_operand_shuffle = false;
20461
20462 /* mask = mask & {2*w-1, ...} */
20463 vt = GEN_INT (2*w - 1);
20464 }
20465 else
20466 {
20467 /* mask = mask & {w-1, ...} */
20468 vt = GEN_INT (w - 1);
20469 }
20470
20471 for (i = 0; i < w; i++)
20472 vec[i] = vt;
20473 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20474 mask = expand_simple_binop (maskmode, AND, mask, vt,
20475 NULL_RTX, 0, OPTAB_DIRECT);
20476
20477 /* For non-QImode operations, convert the word permutation control
20478 into a byte permutation control. */
20479 if (mode != V16QImode)
20480 {
20481 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20482 GEN_INT (exact_log2 (e)),
20483 NULL_RTX, 0, OPTAB_DIRECT);
20484
20485 /* Convert mask to vector of chars. */
20486 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20487
20488 /* Replicate each of the input bytes into byte positions:
20489 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20490 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20491 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20492 for (i = 0; i < 16; ++i)
20493 vec[i] = GEN_INT (i/e * e);
20494 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20495 vt = force_const_mem (V16QImode, vt);
20496 if (TARGET_XOP)
20497 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20498 else
20499 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20500
20501       /* Convert it into byte positions by doing
20502 	 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}.  */
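      /* E.g. for V4SImode, an element index k has become 4*k in every byte
	 of its element after the shift and pshufb above; the addition below
	 turns that into the byte offsets { 4k, 4k+1, 4k+2, 4k+3 }.  */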
20503 for (i = 0; i < 16; ++i)
20504 vec[i] = GEN_INT (i % e);
20505 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20506 vt = force_const_mem (V16QImode, vt);
20507 emit_insn (gen_addv16qi3 (mask, mask, vt));
20508 }
20509
20510 /* The actual shuffle operations all operate on V16QImode. */
20511 op0 = gen_lowpart (V16QImode, op0);
20512 op1 = gen_lowpart (V16QImode, op1);
20513 target = gen_lowpart (V16QImode, target);
20514
20515 if (TARGET_XOP)
20516 {
20517 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20518 }
20519 else if (one_operand_shuffle)
20520 {
20521 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20522 }
20523 else
20524 {
20525 rtx xops[6];
20526 bool ok;
20527
20528 /* Shuffle the two input vectors independently. */
20529 t1 = gen_reg_rtx (V16QImode);
20530 t2 = gen_reg_rtx (V16QImode);
20531 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20532 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20533
20534 merge_two:
20535 /* Then merge them together. The key is whether any given control
20536 element contained a bit set that indicates the second word. */
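      /* Each element of the result is (mask & w) ? t2 : t1, i.e. it is taken
	 from the shuffle of op1 when the control element had the
	 second-operand bit set; this is computed below via
	 ix86_expand_int_vcond on the comparison (mask & w) == w.  */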
20537 mask = operands[3];
20538 vt = GEN_INT (w);
20539 if (maskmode == V2DImode && !TARGET_SSE4_1)
20540 {
20541 	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
20542 	     more shuffle to convert the V2DI input mask into a V4SI
20543 	     input mask.  At that point the masking that
20544 	     ix86_expand_int_vcond performs will work as desired.  */
20545 rtx t3 = gen_reg_rtx (V4SImode);
20546 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20547 const0_rtx, const0_rtx,
20548 const2_rtx, const2_rtx));
20549 mask = t3;
20550 maskmode = V4SImode;
20551 e = w = 4;
20552 }
20553
20554 for (i = 0; i < w; i++)
20555 vec[i] = vt;
20556 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20557 vt = force_reg (maskmode, vt);
20558 mask = expand_simple_binop (maskmode, AND, mask, vt,
20559 NULL_RTX, 0, OPTAB_DIRECT);
20560
20561 xops[0] = gen_lowpart (mode, operands[0]);
20562 xops[1] = gen_lowpart (mode, t2);
20563 xops[2] = gen_lowpart (mode, t1);
20564 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20565 xops[4] = mask;
20566 xops[5] = vt;
20567 ok = ix86_expand_int_vcond (xops);
20568 gcc_assert (ok);
20569 }
20570 }
20571
20572 /* Unpack SRC into the next wider integer vector type, storing it in
20573 DEST.  UNSIGNED_P is true if we should do zero extension, else sign
20574 extension.  HIGH_P is true if we want the N/2 high elements, else the low elements.  */
20575
20576 void
20577 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20578 {
20579 enum machine_mode imode = GET_MODE (src);
20580 rtx tmp;
20581
20582 if (TARGET_SSE4_1)
20583 {
20584 rtx (*unpack)(rtx, rtx);
20585 rtx (*extract)(rtx, rtx) = NULL;
20586 enum machine_mode halfmode = BLKmode;
20587
20588 switch (imode)
20589 {
20590 case V32QImode:
20591 if (unsigned_p)
20592 unpack = gen_avx2_zero_extendv16qiv16hi2;
20593 else
20594 unpack = gen_avx2_sign_extendv16qiv16hi2;
20595 halfmode = V16QImode;
20596 extract
20597 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20598 break;
20599 case V16HImode:
20600 if (unsigned_p)
20601 unpack = gen_avx2_zero_extendv8hiv8si2;
20602 else
20603 unpack = gen_avx2_sign_extendv8hiv8si2;
20604 halfmode = V8HImode;
20605 extract
20606 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20607 break;
20608 case V8SImode:
20609 if (unsigned_p)
20610 unpack = gen_avx2_zero_extendv4siv4di2;
20611 else
20612 unpack = gen_avx2_sign_extendv4siv4di2;
20613 halfmode = V4SImode;
20614 extract
20615 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20616 break;
20617 case V16QImode:
20618 if (unsigned_p)
20619 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20620 else
20621 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20622 break;
20623 case V8HImode:
20624 if (unsigned_p)
20625 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20626 else
20627 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20628 break;
20629 case V4SImode:
20630 if (unsigned_p)
20631 unpack = gen_sse4_1_zero_extendv2siv2di2;
20632 else
20633 unpack = gen_sse4_1_sign_extendv2siv2di2;
20634 break;
20635 default:
20636 gcc_unreachable ();
20637 }
20638
20639 if (GET_MODE_SIZE (imode) == 32)
20640 {
20641 tmp = gen_reg_rtx (halfmode);
20642 emit_insn (extract (tmp, src));
20643 }
20644 else if (high_p)
20645 {
20646 /* Shift higher 8 bytes to lower 8 bytes. */
20647 tmp = gen_reg_rtx (imode);
20648 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20649 gen_lowpart (V1TImode, src),
20650 GEN_INT (64)));
20651 }
20652 else
20653 tmp = src;
20654
20655 emit_insn (unpack (dest, tmp));
20656 }
20657 else
20658 {
20659 rtx (*unpack)(rtx, rtx, rtx);
20660
20661 switch (imode)
20662 {
20663 case V16QImode:
20664 if (high_p)
20665 unpack = gen_vec_interleave_highv16qi;
20666 else
20667 unpack = gen_vec_interleave_lowv16qi;
20668 break;
20669 case V8HImode:
20670 if (high_p)
20671 unpack = gen_vec_interleave_highv8hi;
20672 else
20673 unpack = gen_vec_interleave_lowv8hi;
20674 break;
20675 case V4SImode:
20676 if (high_p)
20677 unpack = gen_vec_interleave_highv4si;
20678 else
20679 unpack = gen_vec_interleave_lowv4si;
20680 break;
20681 default:
20682 gcc_unreachable ();
20683 }
20684
20685 if (unsigned_p)
20686 tmp = force_reg (imode, CONST0_RTX (imode));
20687 else
20688 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20689 src, pc_rtx, pc_rtx);
20690
20691 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20692 }
20693 }
20694
20695 /* Expand conditional increment or decrement using adc/sbb instructions.
20696 The default case using setcc followed by the conditional move can be
20697 done by generic code.  */
20698 bool
20699 ix86_expand_int_addcc (rtx operands[])
20700 {
20701 enum rtx_code code = GET_CODE (operands[1]);
20702 rtx flags;
20703 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20704 rtx compare_op;
20705 rtx val = const0_rtx;
20706 bool fpcmp = false;
20707 enum machine_mode mode;
20708 rtx op0 = XEXP (operands[1], 0);
20709 rtx op1 = XEXP (operands[1], 1);
20710
20711 if (operands[3] != const1_rtx
20712 && operands[3] != constm1_rtx)
20713 return false;
20714 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20715 return false;
20716 code = GET_CODE (compare_op);
20717
20718 flags = XEXP (compare_op, 0);
20719
20720 if (GET_MODE (flags) == CCFPmode
20721 || GET_MODE (flags) == CCFPUmode)
20722 {
20723 fpcmp = true;
20724 code = ix86_fp_compare_code_to_integer (code);
20725 }
20726
20727 if (code != LTU)
20728 {
20729 val = constm1_rtx;
20730 if (fpcmp)
20731 PUT_CODE (compare_op,
20732 reverse_condition_maybe_unordered
20733 (GET_CODE (compare_op)));
20734 else
20735 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20736 }
20737
20738 mode = GET_MODE (operands[0]);
20739
20740 /* Construct either adc or sbb insn. */
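  /* For instance, x += (a < b) with an unsigned comparison is emitted as
     cmp a, b followed by adc x, 0, while x -= (a < b) is emitted as
     cmp a, b followed by sbb x, 0.  */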
20741 if ((code == LTU) == (operands[3] == constm1_rtx))
20742 {
20743 switch (mode)
20744 {
20745 case QImode:
20746 insn = gen_subqi3_carry;
20747 break;
20748 case HImode:
20749 insn = gen_subhi3_carry;
20750 break;
20751 case SImode:
20752 insn = gen_subsi3_carry;
20753 break;
20754 case DImode:
20755 insn = gen_subdi3_carry;
20756 break;
20757 default:
20758 gcc_unreachable ();
20759 }
20760 }
20761 else
20762 {
20763 switch (mode)
20764 {
20765 case QImode:
20766 insn = gen_addqi3_carry;
20767 break;
20768 case HImode:
20769 insn = gen_addhi3_carry;
20770 break;
20771 case SImode:
20772 insn = gen_addsi3_carry;
20773 break;
20774 case DImode:
20775 insn = gen_adddi3_carry;
20776 break;
20777 default:
20778 gcc_unreachable ();
20779 }
20780 }
20781 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20782
20783 return true;
20784 }
20785
20786
20787 /* Split OPERAND into parts (at most four), stored in PARTS; return the
20788 number of parts.  Similar to split_double_mode, but works for floating
20789 point parameters and non-offsettable memories.  For pushes, it returns
20790 just stack offsets; the values will be saved in the right order.  */
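/* For example, in 32-bit mode a DFmode operand splits into two SImode parts
   and an XFmode operand into three, while in 64-bit mode an XFmode operand
   becomes a DImode part plus an SImode part.  */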
20791
20792 static int
20793 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20794 {
20795 int size;
20796
20797 if (!TARGET_64BIT)
20798 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20799 else
20800 size = (GET_MODE_SIZE (mode) + 4) / 8;
20801
20802 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20803 gcc_assert (size >= 2 && size <= 4);
20804
20805 /* Optimize constant pool references to immediates.  This is used by fp
20806 moves, which force all constants to memory to allow combining.  */
20807 if (MEM_P (operand) && MEM_READONLY_P (operand))
20808 {
20809 rtx tmp = maybe_get_pool_constant (operand);
20810 if (tmp)
20811 operand = tmp;
20812 }
20813
20814 if (MEM_P (operand) && !offsettable_memref_p (operand))
20815 {
20816 /* The only non-offsettable memories we handle are pushes.  */
20817 int ok = push_operand (operand, VOIDmode);
20818
20819 gcc_assert (ok);
20820
20821 operand = copy_rtx (operand);
20822 PUT_MODE (operand, word_mode);
20823 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20824 return size;
20825 }
20826
20827 if (GET_CODE (operand) == CONST_VECTOR)
20828 {
20829 enum machine_mode imode = int_mode_for_mode (mode);
20830 /* Caution: if we looked through a constant pool memory above,
20831 the operand may actually have a different mode now. That's
20832 ok, since we want to pun this all the way back to an integer. */
20833 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20834 gcc_assert (operand != NULL);
20835 mode = imode;
20836 }
20837
20838 if (!TARGET_64BIT)
20839 {
20840 if (mode == DImode)
20841 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20842 else
20843 {
20844 int i;
20845
20846 if (REG_P (operand))
20847 {
20848 gcc_assert (reload_completed);
20849 for (i = 0; i < size; i++)
20850 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20851 }
20852 else if (offsettable_memref_p (operand))
20853 {
20854 operand = adjust_address (operand, SImode, 0);
20855 parts[0] = operand;
20856 for (i = 1; i < size; i++)
20857 parts[i] = adjust_address (operand, SImode, 4 * i);
20858 }
20859 else if (GET_CODE (operand) == CONST_DOUBLE)
20860 {
20861 REAL_VALUE_TYPE r;
20862 long l[4];
20863
20864 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20865 switch (mode)
20866 {
20867 case TFmode:
20868 real_to_target (l, &r, mode);
20869 parts[3] = gen_int_mode (l[3], SImode);
20870 parts[2] = gen_int_mode (l[2], SImode);
20871 break;
20872 case XFmode:
20873 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
20874 long double may not be 80-bit. */
20875 real_to_target (l, &r, mode);
20876 parts[2] = gen_int_mode (l[2], SImode);
20877 break;
20878 case DFmode:
20879 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
20880 break;
20881 default:
20882 gcc_unreachable ();
20883 }
20884 parts[1] = gen_int_mode (l[1], SImode);
20885 parts[0] = gen_int_mode (l[0], SImode);
20886 }
20887 else
20888 gcc_unreachable ();
20889 }
20890 }
20891 else
20892 {
20893 if (mode == TImode)
20894 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20895 if (mode == XFmode || mode == TFmode)
20896 {
20897 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
20898 if (REG_P (operand))
20899 {
20900 gcc_assert (reload_completed);
20901 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
20902 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
20903 }
20904 else if (offsettable_memref_p (operand))
20905 {
20906 operand = adjust_address (operand, DImode, 0);
20907 parts[0] = operand;
20908 parts[1] = adjust_address (operand, upper_mode, 8);
20909 }
20910 else if (GET_CODE (operand) == CONST_DOUBLE)
20911 {
20912 REAL_VALUE_TYPE r;
20913 long l[4];
20914
20915 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
20916 real_to_target (l, &r, mode);
20917
20918 /* Do not use shift by 32 to avoid warning on 32bit systems. */
20919 if (HOST_BITS_PER_WIDE_INT >= 64)
20920 parts[0]
20921 = gen_int_mode
20922 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
20923 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
20924 DImode);
20925 else
20926 parts[0] = immed_double_const (l[0], l[1], DImode);
20927
20928 if (upper_mode == SImode)
20929 parts[1] = gen_int_mode (l[2], SImode);
20930 else if (HOST_BITS_PER_WIDE_INT >= 64)
20931 parts[1]
20932 = gen_int_mode
20933 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
20934 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
20935 DImode);
20936 else
20937 parts[1] = immed_double_const (l[2], l[3], DImode);
20938 }
20939 else
20940 gcc_unreachable ();
20941 }
20942 }
20943
20944 return size;
20945 }
20946
20947 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
20948 The operands are split into parts and the individual moves or pushes
20949 are emitted directly, ordered so that no source part is overwritten
20950 before it has been copied.  */
20951
20952 void
20953 ix86_split_long_move (rtx operands[])
20954 {
20955 rtx part[2][4];
20956 int nparts, i, j;
20957 int push = 0;
20958 int collisions = 0;
20959 enum machine_mode mode = GET_MODE (operands[0]);
20960 bool collisionparts[4];
20961
20962 /* The DFmode expanders may ask us to move doubles.
20963 For 64-bit targets this is a single move.  By hiding that fact
20964 here we simplify the i386.md splitters.  */
20965 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
20966 {
20967 /* Optimize constant pool references to immediates.  This is used by
20968 fp moves, which force all constants to memory to allow combining.  */
20969
20970 if (MEM_P (operands[1])
20971 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
20972 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
20973 operands[1] = get_pool_constant (XEXP (operands[1], 0));
20974 if (push_operand (operands[0], VOIDmode))
20975 {
20976 operands[0] = copy_rtx (operands[0]);
20977 PUT_MODE (operands[0], word_mode);
20978 }
20979 else
20980 operands[0] = gen_lowpart (DImode, operands[0]);
20981 operands[1] = gen_lowpart (DImode, operands[1]);
20982 emit_move_insn (operands[0], operands[1]);
20983 return;
20984 }
20985
20986 /* The only non-offsettable memory we handle is a push.  */
20987 if (push_operand (operands[0], VOIDmode))
20988 push = 1;
20989 else
20990 gcc_assert (!MEM_P (operands[0])
20991 || offsettable_memref_p (operands[0]));
20992
20993 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
20994 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
20995
20996 /* When emitting a push, take care with source operands on the stack.  */
20997 if (push && MEM_P (operands[1])
20998 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
20999 {
21000 rtx src_base = XEXP (part[1][nparts - 1], 0);
21001
21002 /* Compensate for the stack decrement by 4. */
21003 if (!TARGET_64BIT && nparts == 3
21004 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21005 src_base = plus_constant (Pmode, src_base, 4);
21006
21007 /* src_base refers to the stack pointer and is
21008 automatically decreased by emitted push. */
21009 for (i = 0; i < nparts; i++)
21010 part[1][i] = change_address (part[1][i],
21011 GET_MODE (part[1][i]), src_base);
21012 }
21013
21014 /* We need to do the copy in the right order in case an address register
21015 of the source overlaps the destination.  */
21016 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21017 {
21018 rtx tmp;
21019
21020 for (i = 0; i < nparts; i++)
21021 {
21022 collisionparts[i]
21023 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21024 if (collisionparts[i])
21025 collisions++;
21026 }
21027
21028 /* Collision in the middle part can be handled by reordering. */
21029 if (collisions == 1 && nparts == 3 && collisionparts [1])
21030 {
21031 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21032 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21033 }
21034 else if (collisions == 1
21035 && nparts == 4
21036 && (collisionparts [1] || collisionparts [2]))
21037 {
21038 if (collisionparts [1])
21039 {
21040 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21041 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21042 }
21043 else
21044 {
21045 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21046 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21047 }
21048 }
21049
21050 /* If there are more collisions, we can't handle them by reordering.
21051 Do an lea to the last part and use only one colliding move.  */
21052 else if (collisions > 1)
21053 {
21054 rtx base;
21055
21056 collisions = 1;
21057
21058 base = part[0][nparts - 1];
21059
21060 /* Handle the case when the last part isn't valid for lea.
21061 Happens in 64-bit mode storing the 12-byte XFmode. */
21062 if (GET_MODE (base) != Pmode)
21063 base = gen_rtx_REG (Pmode, REGNO (base));
21064
21065 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21066 part[1][0] = replace_equiv_address (part[1][0], base);
21067 for (i = 1; i < nparts; i++)
21068 {
21069 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21070 part[1][i] = replace_equiv_address (part[1][i], tmp);
21071 }
21072 }
21073 }
21074
21075 if (push)
21076 {
21077 if (!TARGET_64BIT)
21078 {
21079 if (nparts == 3)
21080 {
21081 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21082 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21083 stack_pointer_rtx, GEN_INT (-4)));
21084 emit_move_insn (part[0][2], part[1][2]);
21085 }
21086 else if (nparts == 4)
21087 {
21088 emit_move_insn (part[0][3], part[1][3]);
21089 emit_move_insn (part[0][2], part[1][2]);
21090 }
21091 }
21092 else
21093 {
21094 /* In 64-bit mode we don't have a 32-bit push available.  If this is a
21095 register, that is OK - we will just use the larger counterpart.  We also
21096 retype memory - this comes from an attempt to avoid the REX prefix on
21097 the move of the second half of a TFmode value.  */
21098 if (GET_MODE (part[1][1]) == SImode)
21099 {
21100 switch (GET_CODE (part[1][1]))
21101 {
21102 case MEM:
21103 part[1][1] = adjust_address (part[1][1], DImode, 0);
21104 break;
21105
21106 case REG:
21107 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21108 break;
21109
21110 default:
21111 gcc_unreachable ();
21112 }
21113
21114 if (GET_MODE (part[1][0]) == SImode)
21115 part[1][0] = part[1][1];
21116 }
21117 }
21118 emit_move_insn (part[0][1], part[1][1]);
21119 emit_move_insn (part[0][0], part[1][0]);
21120 return;
21121 }
21122
21123 /* Choose correct order to not overwrite the source before it is copied. */
21124 if ((REG_P (part[0][0])
21125 && REG_P (part[1][1])
21126 && (REGNO (part[0][0]) == REGNO (part[1][1])
21127 || (nparts == 3
21128 && REGNO (part[0][0]) == REGNO (part[1][2]))
21129 || (nparts == 4
21130 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21131 || (collisions > 0
21132 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21133 {
21134 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21135 {
21136 operands[2 + i] = part[0][j];
21137 operands[6 + i] = part[1][j];
21138 }
21139 }
21140 else
21141 {
21142 for (i = 0; i < nparts; i++)
21143 {
21144 operands[2 + i] = part[0][i];
21145 operands[6 + i] = part[1][i];
21146 }
21147 }
21148
21149 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21150 if (optimize_insn_for_size_p ())
21151 {
21152 for (j = 0; j < nparts - 1; j++)
21153 if (CONST_INT_P (operands[6 + j])
21154 && operands[6 + j] != const0_rtx
21155 && REG_P (operands[2 + j]))
21156 for (i = j; i < nparts - 1; i++)
21157 if (CONST_INT_P (operands[7 + i])
21158 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21159 operands[7 + i] = operands[2 + j];
21160 }
21161
21162 for (i = 0; i < nparts; i++)
21163 emit_move_insn (operands[2 + i], operands[6 + i]);
21164
21165 return;
21166 }
21167
21168 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21169 left shift by a constant, either using a single shift or
21170 a sequence of add instructions. */
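/* For example, a shift by 1 is always emitted as a single add of the operand
   to itself, and a shift by 2 becomes two such adds when that is no more
   costly than a constant shift on the processor being tuned for.  */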
21171
21172 static void
21173 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21174 {
21175 rtx (*insn)(rtx, rtx, rtx);
21176
21177 if (count == 1
21178 || (count * ix86_cost->add <= ix86_cost->shift_const
21179 && !optimize_insn_for_size_p ()))
21180 {
21181 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21182 while (count-- > 0)
21183 emit_insn (insn (operand, operand, operand));
21184 }
21185 else
21186 {
21187 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21188 emit_insn (insn (operand, operand, GEN_INT (count)));
21189 }
21190 }
21191
21192 void
21193 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21194 {
21195 rtx (*gen_ashl3)(rtx, rtx, rtx);
21196 rtx (*gen_shld)(rtx, rtx, rtx);
21197 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21198
21199 rtx low[2], high[2];
21200 int count;
21201
21202 if (CONST_INT_P (operands[2]))
21203 {
21204 split_double_mode (mode, operands, 2, low, high);
21205 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21206
21207 if (count >= half_width)
21208 {
21209 emit_move_insn (high[0], low[1]);
21210 emit_move_insn (low[0], const0_rtx);
21211
21212 if (count > half_width)
21213 ix86_expand_ashl_const (high[0], count - half_width, mode);
21214 }
21215 else
21216 {
21217 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21218
21219 if (!rtx_equal_p (operands[0], operands[1]))
21220 emit_move_insn (operands[0], operands[1]);
21221
21222 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21223 ix86_expand_ashl_const (low[0], count, mode);
21224 }
21225 return;
21226 }
21227
21228 split_double_mode (mode, operands, 1, low, high);
21229
21230 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21231
21232 if (operands[1] == const1_rtx)
21233 {
21234 /* Assuming we've chosen QImode-capable registers, then 1 << N
21235 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
21236 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21237 {
21238 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21239
21240 ix86_expand_clear (low[0]);
21241 ix86_expand_clear (high[0]);
21242 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21243
21244 d = gen_lowpart (QImode, low[0]);
21245 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21246 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21247 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21248
21249 d = gen_lowpart (QImode, high[0]);
21250 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21251 s = gen_rtx_NE (QImode, flags, const0_rtx);
21252 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21253 }
21254
21255 /* Otherwise, we can get the same results by manually performing
21256 a bit extract operation on bit 5/6, and then performing the two
21257 shifts. The two methods of getting 0/1 into low/high are exactly
21258 the same size. Avoiding the shift in the bit extract case helps
21259 pentium4 a bit; no one else seems to care much either way. */
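      /* For example, shifting 1 by a DImode count of 40: bit 5 of the count
	 is set, so HIGH becomes 1 and LOW becomes 0; the final SImode shifts
	 by (40 & 31) == 8 then leave LOW at 0 and HIGH at 1 << 8, i.e. the
	 DImode value 1 << 40.  */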
21260 else
21261 {
21262 enum machine_mode half_mode;
21263 rtx (*gen_lshr3)(rtx, rtx, rtx);
21264 rtx (*gen_and3)(rtx, rtx, rtx);
21265 rtx (*gen_xor3)(rtx, rtx, rtx);
21266 HOST_WIDE_INT bits;
21267 rtx x;
21268
21269 if (mode == DImode)
21270 {
21271 half_mode = SImode;
21272 gen_lshr3 = gen_lshrsi3;
21273 gen_and3 = gen_andsi3;
21274 gen_xor3 = gen_xorsi3;
21275 bits = 5;
21276 }
21277 else
21278 {
21279 half_mode = DImode;
21280 gen_lshr3 = gen_lshrdi3;
21281 gen_and3 = gen_anddi3;
21282 gen_xor3 = gen_xordi3;
21283 bits = 6;
21284 }
21285
21286 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21287 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21288 else
21289 x = gen_lowpart (half_mode, operands[2]);
21290 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21291
21292 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21293 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21294 emit_move_insn (low[0], high[0]);
21295 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21296 }
21297
21298 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21299 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21300 return;
21301 }
21302
21303 if (operands[1] == constm1_rtx)
21304 {
21305 /* For -1 << N, we can avoid the shld instruction, because we
21306 know that we're shifting 0...31/63 ones into a -1. */
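      /* An shld here would only shift further ones from LOW into HIGH, which
	 already holds all ones, so it can simply be omitted.  */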
21307 emit_move_insn (low[0], constm1_rtx);
21308 if (optimize_insn_for_size_p ())
21309 emit_move_insn (high[0], low[0]);
21310 else
21311 emit_move_insn (high[0], constm1_rtx);
21312 }
21313 else
21314 {
21315 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21316
21317 if (!rtx_equal_p (operands[0], operands[1]))
21318 emit_move_insn (operands[0], operands[1]);
21319
21320 split_double_mode (mode, operands, 1, low, high);
21321 emit_insn (gen_shld (high[0], low[0], operands[2]));
21322 }
21323
21324 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21325
21326 if (TARGET_CMOVE && scratch)
21327 {
21328 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21329 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21330
21331 ix86_expand_clear (scratch);
21332 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21333 }
21334 else
21335 {
21336 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21337 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21338
21339 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21340 }
21341 }
21342
21343 void
21344 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21345 {
21346 rtx (*gen_ashr3)(rtx, rtx, rtx)
21347 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21348 rtx (*gen_shrd)(rtx, rtx, rtx);
21349 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21350
21351 rtx low[2], high[2];
21352 int count;
21353
21354 if (CONST_INT_P (operands[2]))
21355 {
21356 split_double_mode (mode, operands, 2, low, high);
21357 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21358
21359 if (count == GET_MODE_BITSIZE (mode) - 1)
21360 {
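	  /* Shifting right by all bits but one just broadcasts the sign bit:
	     e.g. a DImode x >> 63 leaves 0 or -1 in both SImode halves.  */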
21361 emit_move_insn (high[0], high[1]);
21362 emit_insn (gen_ashr3 (high[0], high[0],
21363 GEN_INT (half_width - 1)));
21364 emit_move_insn (low[0], high[0]);
21365
21366 }
21367 else if (count >= half_width)
21368 {
21369 emit_move_insn (low[0], high[1]);
21370 emit_move_insn (high[0], low[0]);
21371 emit_insn (gen_ashr3 (high[0], high[0],
21372 GEN_INT (half_width - 1)));
21373
21374 if (count > half_width)
21375 emit_insn (gen_ashr3 (low[0], low[0],
21376 GEN_INT (count - half_width)));
21377 }
21378 else
21379 {
21380 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21381
21382 if (!rtx_equal_p (operands[0], operands[1]))
21383 emit_move_insn (operands[0], operands[1]);
21384
21385 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21386 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21387 }
21388 }
21389 else
21390 {
21391 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21392
21393 if (!rtx_equal_p (operands[0], operands[1]))
21394 emit_move_insn (operands[0], operands[1]);
21395
21396 split_double_mode (mode, operands, 1, low, high);
21397
21398 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21399 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21400
21401 if (TARGET_CMOVE && scratch)
21402 {
21403 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21404 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21405
21406 emit_move_insn (scratch, high[0]);
21407 emit_insn (gen_ashr3 (scratch, scratch,
21408 GEN_INT (half_width - 1)));
21409 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21410 scratch));
21411 }
21412 else
21413 {
21414 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21415 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21416
21417 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21418 }
21419 }
21420 }
21421
21422 void
21423 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21424 {
21425 rtx (*gen_lshr3)(rtx, rtx, rtx)
21426 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21427 rtx (*gen_shrd)(rtx, rtx, rtx);
21428 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21429
21430 rtx low[2], high[2];
21431 int count;
21432
21433 if (CONST_INT_P (operands[2]))
21434 {
21435 split_double_mode (mode, operands, 2, low, high);
21436 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21437
21438 if (count >= half_width)
21439 {
21440 emit_move_insn (low[0], high[1]);
21441 ix86_expand_clear (high[0]);
21442
21443 if (count > half_width)
21444 emit_insn (gen_lshr3 (low[0], low[0],
21445 GEN_INT (count - half_width)));
21446 }
21447 else
21448 {
21449 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21450
21451 if (!rtx_equal_p (operands[0], operands[1]))
21452 emit_move_insn (operands[0], operands[1]);
21453
21454 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21455 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21456 }
21457 }
21458 else
21459 {
21460 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21461
21462 if (!rtx_equal_p (operands[0], operands[1]))
21463 emit_move_insn (operands[0], operands[1]);
21464
21465 split_double_mode (mode, operands, 1, low, high);
21466
21467 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21468 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21469
21470 if (TARGET_CMOVE && scratch)
21471 {
21472 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21473 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21474
21475 ix86_expand_clear (scratch);
21476 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21477 scratch));
21478 }
21479 else
21480 {
21481 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21482 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21483
21484 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21485 }
21486 }
21487 }
21488
21489 /* Predict the just-emitted jump instruction to be taken with probability PROB.  */
21490 static void
21491 predict_jump (int prob)
21492 {
21493 rtx insn = get_last_insn ();
21494 gcc_assert (JUMP_P (insn));
21495 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21496 }
21497
21498 /* Helper function for the string operations below.  Test the VALUE bits of
21499 VARIABLE; if they are all clear, jump to the label that is returned.  */
21500 static rtx
21501 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21502 {
21503 rtx label = gen_label_rtx ();
21504 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21505 if (GET_MODE (variable) == DImode)
21506 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21507 else
21508 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21509 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21510 1, label);
21511 if (epilogue)
21512 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21513 else
21514 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21515 return label;
21516 }
21517
21518 /* Decrease COUNTREG by VALUE.  */
21519 static void
21520 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21521 {
21522 rtx (*gen_add)(rtx, rtx, rtx)
21523 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21524
21525 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21526 }
21527
21528 /* Zero-extend EXP, which may be SImode, into a Pmode register.  */
21529 rtx
21530 ix86_zero_extend_to_Pmode (rtx exp)
21531 {
21532 if (GET_MODE (exp) != Pmode)
21533 exp = convert_to_mode (Pmode, exp, 1);
21534 return force_reg (Pmode, exp);
21535 }
21536
21537 /* Divide COUNTREG by SCALE. */
21538 static rtx
21539 scale_counter (rtx countreg, int scale)
21540 {
21541 rtx sc;
21542
21543 if (scale == 1)
21544 return countreg;
21545 if (CONST_INT_P (countreg))
21546 return GEN_INT (INTVAL (countreg) / scale);
21547 gcc_assert (REG_P (countreg));
21548
21549 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21550 GEN_INT (exact_log2 (scale)),
21551 NULL, 1, OPTAB_DIRECT);
21552 return sc;
21553 }
21554
21555 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21556 DImode for constant loop counts. */
21557
21558 static enum machine_mode
21559 counter_mode (rtx count_exp)
21560 {
21561 if (GET_MODE (count_exp) != VOIDmode)
21562 return GET_MODE (count_exp);
21563 if (!CONST_INT_P (count_exp))
21564 return Pmode;
21565 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21566 return DImode;
21567 return SImode;
21568 }
21569
21570 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21571 to by SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times;
21572 the overall size is COUNT, specified in bytes.  When SRCPTR is NULL,
21573 output an equivalent loop that sets memory to VALUE (expected to be in MODE).
21574
21575 The size is rounded down to a whole number of chunks moved at once.
21576 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.  */
21577
21578
21579 static void
21580 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21581 rtx destptr, rtx srcptr, rtx value,
21582 rtx count, enum machine_mode mode, int unroll,
21583 int expected_size)
21584 {
21585 rtx out_label, top_label, iter, tmp;
21586 enum machine_mode iter_mode = counter_mode (count);
21587 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21588 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21589 rtx size;
21590 rtx x_addr;
21591 rtx y_addr;
21592 int i;
21593
21594 top_label = gen_label_rtx ();
21595 out_label = gen_label_rtx ();
21596 iter = gen_reg_rtx (iter_mode);
21597
21598 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21599 NULL, 1, OPTAB_DIRECT);
21600 /* Those two should combine. */
21601 if (piece_size == const1_rtx)
21602 {
21603 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21604 true, out_label);
21605 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21606 }
21607 emit_move_insn (iter, const0_rtx);
21608
21609 emit_label (top_label);
21610
21611 tmp = convert_modes (Pmode, iter_mode, iter, true);
21612 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21613 destmem = change_address (destmem, mode, x_addr);
21614
21615 if (srcmem)
21616 {
21617 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21618 srcmem = change_address (srcmem, mode, y_addr);
21619
21620 /* When unrolling for chips that reorder memory reads and writes,
21621 we can save registers by using a single temporary.
21622 Also, using 4 temporaries is overkill in 32-bit mode.  */
21623 if (!TARGET_64BIT && 0)
21624 {
21625 for (i = 0; i < unroll; i++)
21626 {
21627 if (i)
21628 {
21629 destmem =
21630 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21631 srcmem =
21632 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21633 }
21634 emit_move_insn (destmem, srcmem);
21635 }
21636 }
21637 else
21638 {
21639 rtx tmpreg[4];
21640 gcc_assert (unroll <= 4);
21641 for (i = 0; i < unroll; i++)
21642 {
21643 tmpreg[i] = gen_reg_rtx (mode);
21644 if (i)
21645 {
21646 srcmem =
21647 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21648 }
21649 emit_move_insn (tmpreg[i], srcmem);
21650 }
21651 for (i = 0; i < unroll; i++)
21652 {
21653 if (i)
21654 {
21655 destmem =
21656 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21657 }
21658 emit_move_insn (destmem, tmpreg[i]);
21659 }
21660 }
21661 }
21662 else
21663 for (i = 0; i < unroll; i++)
21664 {
21665 if (i)
21666 destmem =
21667 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21668 emit_move_insn (destmem, value);
21669 }
21670
21671 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21672 true, OPTAB_LIB_WIDEN);
21673 if (tmp != iter)
21674 emit_move_insn (iter, tmp);
21675
21676 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21677 true, top_label);
21678 if (expected_size != -1)
21679 {
21680 expected_size /= GET_MODE_SIZE (mode) * unroll;
21681 if (expected_size == 0)
21682 predict_jump (0);
21683 else if (expected_size > REG_BR_PROB_BASE)
21684 predict_jump (REG_BR_PROB_BASE - 1);
21685 else
21686 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
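	/* The formula above rounds 1/expected_size to the nearest branch
	   probability: e.g. with REG_BR_PROB_BASE of 10000 and four expected
	   iterations, the loop-back branch is predicted taken with
	   probability 10000 - 2500 = 7500, i.e. 75%.  */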
21687 }
21688 else
21689 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21690 iter = ix86_zero_extend_to_Pmode (iter);
21691 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21692 true, OPTAB_LIB_WIDEN);
21693 if (tmp != destptr)
21694 emit_move_insn (destptr, tmp);
21695 if (srcptr)
21696 {
21697 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21698 true, OPTAB_LIB_WIDEN);
21699 if (tmp != srcptr)
21700 emit_move_insn (srcptr, tmp);
21701 }
21702 emit_label (out_label);
21703 }
21704
21705 /* Output a "rep; movs" instruction.
21706 Arguments have the same meaning as for the previous function.  */
21707 static void
21708 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21709 rtx destptr, rtx srcptr,
21710 rtx count,
21711 enum machine_mode mode)
21712 {
21713 rtx destexp;
21714 rtx srcexp;
21715 rtx countreg;
21716 HOST_WIDE_INT rounded_count;
21717
21718 /* If the size is known and a multiple of 4, it is cheaper to use an
SImode "rep movs" than a QImode one.  */
21719 if (mode == QImode && CONST_INT_P (count)
21720 && !(INTVAL (count) & 3))
21721 mode = SImode;
21722
21723 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21724 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21725 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21726 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21727 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21728 if (mode != QImode)
21729 {
21730 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21731 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21732 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21733 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21734 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21735 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21736 }
21737 else
21738 {
21739 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21740 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21741 }
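  /* DESTEXP and SRCEXP describe the final values of the pointers after the
     copy, i.e. pointer + count * chunk size, as expected by the rep_mov
     pattern.  */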
21742 if (CONST_INT_P (count))
21743 {
21744 rounded_count = (INTVAL (count)
21745 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21746 destmem = shallow_copy_rtx (destmem);
21747 srcmem = shallow_copy_rtx (srcmem);
21748 set_mem_size (destmem, rounded_count);
21749 set_mem_size (srcmem, rounded_count);
21750 }
21751 else
21752 {
21753 if (MEM_SIZE_KNOWN_P (destmem))
21754 clear_mem_size (destmem);
21755 if (MEM_SIZE_KNOWN_P (srcmem))
21756 clear_mem_size (srcmem);
21757 }
21758 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21759 destexp, srcexp));
21760 }
21761
21762 /* Output a "rep; stos" instruction.
21763 Arguments have the same meaning as for the previous function.  */
21764 static void
21765 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21766 rtx count, enum machine_mode mode,
21767 rtx orig_value)
21768 {
21769 rtx destexp;
21770 rtx countreg;
21771 HOST_WIDE_INT rounded_count;
21772
21773 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21774 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21775 value = force_reg (mode, gen_lowpart (mode, value));
21776 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21777 if (mode != QImode)
21778 {
21779 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21780 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21781 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21782 }
21783 else
21784 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21785 if (orig_value == const0_rtx && CONST_INT_P (count))
21786 {
21787 rounded_count = (INTVAL (count)
21788 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21789 destmem = shallow_copy_rtx (destmem);
21790 set_mem_size (destmem, rounded_count);
21791 }
21792 else if (MEM_SIZE_KNOWN_P (destmem))
21793 clear_mem_size (destmem);
21794 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21795 }
21796
21797 static void
21798 emit_strmov (rtx destmem, rtx srcmem,
21799 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21800 {
21801 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21802 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21803 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21804 }
21805
21806 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
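/* For example (illustrative only): if the low bits of a constant COUNT are
   13 (binary 1101) and MAX_SIZE is 16, the code below emits a straight-line
   tail of an 8-byte move at offset 0 (two 4-byte moves on 32-bit targets),
   a 4-byte move at offset 8 and a 1-byte move at offset 12, driven directly
   by the bits of the count.  */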
21807 static void
21808 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21809 rtx destptr, rtx srcptr, rtx count, int max_size)
21810 {
21811 rtx src, dest;
21812 if (CONST_INT_P (count))
21813 {
21814 HOST_WIDE_INT countval = INTVAL (count);
21815 int offset = 0;
21816
21817 if ((countval & 0x10) && max_size > 16)
21818 {
21819 if (TARGET_64BIT)
21820 {
21821 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21822 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21823 }
21824 else
21825 gcc_unreachable ();
21826 offset += 16;
21827 }
21828 if ((countval & 0x08) && max_size > 8)
21829 {
21830 if (TARGET_64BIT)
21831 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21832 else
21833 {
21834 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21835 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21836 }
21837 offset += 8;
21838 }
21839 if ((countval & 0x04) && max_size > 4)
21840 {
21841 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21842 offset += 4;
21843 }
21844 if ((countval & 0x02) && max_size > 2)
21845 {
21846 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
21847 offset += 2;
21848 }
21849 if ((countval & 0x01) && max_size > 1)
21850 {
21851 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
21852 offset += 1;
21853 }
21854 return;
21855 }
21856 if (max_size > 8)
21857 {
21858 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
21859 count, 1, OPTAB_DIRECT);
21860 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
21861 count, QImode, 1, 4);
21862 return;
21863 }
21864
21865 /* When single stringop instructions can be used, we can cheaply advance the
21866 dest and src pointers. Otherwise we save code size by maintaining an
21867 offset (zero is readily available from the preceding rep operation) and
21868 using x86 addressing modes. */
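/* Roughly (an illustrative sketch, not the emitted RTL): with single
   stringops the tail is "if (count & 4) movsl; if (count & 2) movsw;
   if (count & 1) movsb", each insn advancing both pointers, while the
   fallback keeps the pointers fixed and performs "dest[off] = src[off];
   off += n" through an offset register that starts at zero.  */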
21869 if (TARGET_SINGLE_STRINGOP)
21870 {
21871 if (max_size > 4)
21872 {
21873 rtx label = ix86_expand_aligntest (count, 4, true);
21874 src = change_address (srcmem, SImode, srcptr);
21875 dest = change_address (destmem, SImode, destptr);
21876 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21877 emit_label (label);
21878 LABEL_NUSES (label) = 1;
21879 }
21880 if (max_size > 2)
21881 {
21882 rtx label = ix86_expand_aligntest (count, 2, true);
21883 src = change_address (srcmem, HImode, srcptr);
21884 dest = change_address (destmem, HImode, destptr);
21885 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21886 emit_label (label);
21887 LABEL_NUSES (label) = 1;
21888 }
21889 if (max_size > 1)
21890 {
21891 rtx label = ix86_expand_aligntest (count, 1, true);
21892 src = change_address (srcmem, QImode, srcptr);
21893 dest = change_address (destmem, QImode, destptr);
21894 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21895 emit_label (label);
21896 LABEL_NUSES (label) = 1;
21897 }
21898 }
21899 else
21900 {
21901 rtx offset = force_reg (Pmode, const0_rtx);
21902 rtx tmp;
21903
21904 if (max_size > 4)
21905 {
21906 rtx label = ix86_expand_aligntest (count, 4, true);
21907 src = change_address (srcmem, SImode, srcptr);
21908 dest = change_address (destmem, SImode, destptr);
21909 emit_move_insn (dest, src);
21910 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
21911 true, OPTAB_LIB_WIDEN);
21912 if (tmp != offset)
21913 emit_move_insn (offset, tmp);
21914 emit_label (label);
21915 LABEL_NUSES (label) = 1;
21916 }
21917 if (max_size > 2)
21918 {
21919 rtx label = ix86_expand_aligntest (count, 2, true);
21920 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21921 src = change_address (srcmem, HImode, tmp);
21922 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21923 dest = change_address (destmem, HImode, tmp);
21924 emit_move_insn (dest, src);
21925 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
21926 true, OPTAB_LIB_WIDEN);
21927 if (tmp != offset)
21928 emit_move_insn (offset, tmp);
21929 emit_label (label);
21930 LABEL_NUSES (label) = 1;
21931 }
21932 if (max_size > 1)
21933 {
21934 rtx label = ix86_expand_aligntest (count, 1, true);
21935 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
21936 src = change_address (srcmem, QImode, tmp);
21937 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
21938 dest = change_address (destmem, QImode, tmp);
21939 emit_move_insn (dest, src);
21940 emit_label (label);
21941 LABEL_NUSES (label) = 1;
21942 }
21943 }
21944 }
21945
21946 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21947 static void
21948 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
21949 rtx count, int max_size)
21950 {
21951 count =
21952 expand_simple_binop (counter_mode (count), AND, count,
21953 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
21954 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
21955 gen_lowpart (QImode, value), count, QImode,
21956 1, max_size / 2);
21957 }
21958
21959 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
21960 static void
21961 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
21962 {
21963 rtx dest;
21964
21965 if (CONST_INT_P (count))
21966 {
21967 HOST_WIDE_INT countval = INTVAL (count);
21968 int offset = 0;
21969
21970 if ((countval & 0x10) && max_size > 16)
21971 {
21972 if (TARGET_64BIT)
21973 {
21974 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21975 emit_insn (gen_strset (destptr, dest, value));
21976 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
21977 emit_insn (gen_strset (destptr, dest, value));
21978 }
21979 else
21980 gcc_unreachable ();
21981 offset += 16;
21982 }
21983 if ((countval & 0x08) && max_size > 8)
21984 {
21985 if (TARGET_64BIT)
21986 {
21987 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
21988 emit_insn (gen_strset (destptr, dest, value));
21989 }
21990 else
21991 {
21992 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
21993 emit_insn (gen_strset (destptr, dest, value));
21994 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
21995 emit_insn (gen_strset (destptr, dest, value));
21996 }
21997 offset += 8;
21998 }
21999 if ((countval & 0x04) && max_size > 4)
22000 {
22001 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22002 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22003 offset += 4;
22004 }
22005 if ((countval & 0x02) && max_size > 2)
22006 {
22007 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22008 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22009 offset += 2;
22010 }
22011 if ((countval & 0x01) && max_size > 1)
22012 {
22013 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22014 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22015 offset += 1;
22016 }
22017 return;
22018 }
22019 if (max_size > 32)
22020 {
22021 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22022 return;
22023 }
22024 if (max_size > 16)
22025 {
22026 rtx label = ix86_expand_aligntest (count, 16, true);
22027 if (TARGET_64BIT)
22028 {
22029 dest = change_address (destmem, DImode, destptr);
22030 emit_insn (gen_strset (destptr, dest, value));
22031 emit_insn (gen_strset (destptr, dest, value));
22032 }
22033 else
22034 {
22035 dest = change_address (destmem, SImode, destptr);
22036 emit_insn (gen_strset (destptr, dest, value));
22037 emit_insn (gen_strset (destptr, dest, value));
22038 emit_insn (gen_strset (destptr, dest, value));
22039 emit_insn (gen_strset (destptr, dest, value));
22040 }
22041 emit_label (label);
22042 LABEL_NUSES (label) = 1;
22043 }
22044 if (max_size > 8)
22045 {
22046 rtx label = ix86_expand_aligntest (count, 8, true);
22047 if (TARGET_64BIT)
22048 {
22049 dest = change_address (destmem, DImode, destptr);
22050 emit_insn (gen_strset (destptr, dest, value));
22051 }
22052 else
22053 {
22054 dest = change_address (destmem, SImode, destptr);
22055 emit_insn (gen_strset (destptr, dest, value));
22056 emit_insn (gen_strset (destptr, dest, value));
22057 }
22058 emit_label (label);
22059 LABEL_NUSES (label) = 1;
22060 }
22061 if (max_size > 4)
22062 {
22063 rtx label = ix86_expand_aligntest (count, 4, true);
22064 dest = change_address (destmem, SImode, destptr);
22065 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22066 emit_label (label);
22067 LABEL_NUSES (label) = 1;
22068 }
22069 if (max_size > 2)
22070 {
22071 rtx label = ix86_expand_aligntest (count, 2, true);
22072 dest = change_address (destmem, HImode, destptr);
22073 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22074 emit_label (label);
22075 LABEL_NUSES (label) = 1;
22076 }
22077 if (max_size > 1)
22078 {
22079 rtx label = ix86_expand_aligntest (count, 1, true);
22080 dest = change_address (destmem, QImode, destptr);
22081 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22082 emit_label (label);
22083 LABEL_NUSES (label) = 1;
22084 }
22085 }
22086
22087 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22088 to DESIRED_ALIGNMENT. */
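/* For example (illustrative only): with ALIGN == 1 and DESIRED_ALIGNMENT == 8
   the code below tests the low bits of DESTPTR and conditionally copies one
   byte, then one 2-byte chunk, then one 4-byte chunk, so at most
   1 + 2 + 4 = 7 bytes are moved and COUNT is decreased accordingly before
   the aligned main loop runs.  */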
22089 static void
22090 expand_movmem_prologue (rtx destmem, rtx srcmem,
22091 rtx destptr, rtx srcptr, rtx count,
22092 int align, int desired_alignment)
22093 {
22094 if (align <= 1 && desired_alignment > 1)
22095 {
22096 rtx label = ix86_expand_aligntest (destptr, 1, false);
22097 srcmem = change_address (srcmem, QImode, srcptr);
22098 destmem = change_address (destmem, QImode, destptr);
22099 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22100 ix86_adjust_counter (count, 1);
22101 emit_label (label);
22102 LABEL_NUSES (label) = 1;
22103 }
22104 if (align <= 2 && desired_alignment > 2)
22105 {
22106 rtx label = ix86_expand_aligntest (destptr, 2, false);
22107 srcmem = change_address (srcmem, HImode, srcptr);
22108 destmem = change_address (destmem, HImode, destptr);
22109 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22110 ix86_adjust_counter (count, 2);
22111 emit_label (label);
22112 LABEL_NUSES (label) = 1;
22113 }
22114 if (align <= 4 && desired_alignment > 4)
22115 {
22116 rtx label = ix86_expand_aligntest (destptr, 4, false);
22117 srcmem = change_address (srcmem, SImode, srcptr);
22118 destmem = change_address (destmem, SImode, destptr);
22119 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22120 ix86_adjust_counter (count, 4);
22121 emit_label (label);
22122 LABEL_NUSES (label) = 1;
22123 }
22124 gcc_assert (desired_alignment <= 8);
22125 }
22126
22127 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22128 ALIGN_BYTES is how many bytes need to be copied. */
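/* For example (illustrative only): with DESIRED_ALIGN == 8 and
   ALIGN_BYTES == 5 the code below emits a 1-byte and then a 4-byte copy
   (5 = 1 + 4), records the higher alignment of DST as each step is proven,
   and raises the recorded alignment of SRC only when the source offset
   happens to agree with the destination offset modulo the step size.  */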
22129 static rtx
22130 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22131 int desired_align, int align_bytes)
22132 {
22133 rtx src = *srcp;
22134 rtx orig_dst = dst;
22135 rtx orig_src = src;
22136 int off = 0;
22137 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22138 if (src_align_bytes >= 0)
22139 src_align_bytes = desired_align - src_align_bytes;
22140 if (align_bytes & 1)
22141 {
22142 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22143 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22144 off = 1;
22145 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22146 }
22147 if (align_bytes & 2)
22148 {
22149 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22150 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22151 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22152 set_mem_align (dst, 2 * BITS_PER_UNIT);
22153 if (src_align_bytes >= 0
22154 && (src_align_bytes & 1) == (align_bytes & 1)
22155 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22156 set_mem_align (src, 2 * BITS_PER_UNIT);
22157 off = 2;
22158 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22159 }
22160 if (align_bytes & 4)
22161 {
22162 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22163 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22164 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22165 set_mem_align (dst, 4 * BITS_PER_UNIT);
22166 if (src_align_bytes >= 0)
22167 {
22168 unsigned int src_align = 0;
22169 if ((src_align_bytes & 3) == (align_bytes & 3))
22170 src_align = 4;
22171 else if ((src_align_bytes & 1) == (align_bytes & 1))
22172 src_align = 2;
22173 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22174 set_mem_align (src, src_align * BITS_PER_UNIT);
22175 }
22176 off = 4;
22177 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22178 }
22179 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22180 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22181 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22182 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22183 if (src_align_bytes >= 0)
22184 {
22185 unsigned int src_align = 0;
22186 if ((src_align_bytes & 7) == (align_bytes & 7))
22187 src_align = 8;
22188 else if ((src_align_bytes & 3) == (align_bytes & 3))
22189 src_align = 4;
22190 else if ((src_align_bytes & 1) == (align_bytes & 1))
22191 src_align = 2;
22192 if (src_align > (unsigned int) desired_align)
22193 src_align = desired_align;
22194 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22195 set_mem_align (src, src_align * BITS_PER_UNIT);
22196 }
22197 if (MEM_SIZE_KNOWN_P (orig_dst))
22198 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22199 if (MEM_SIZE_KNOWN_P (orig_src))
22200 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22201 *srcp = src;
22202 return dst;
22203 }
22204
22205 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
22206 to DESIRED_ALIGNMENT. */
22207 static void
22208 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22209 int align, int desired_alignment)
22210 {
22211 if (align <= 1 && desired_alignment > 1)
22212 {
22213 rtx label = ix86_expand_aligntest (destptr, 1, false);
22214 destmem = change_address (destmem, QImode, destptr);
22215 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22216 ix86_adjust_counter (count, 1);
22217 emit_label (label);
22218 LABEL_NUSES (label) = 1;
22219 }
22220 if (align <= 2 && desired_alignment > 2)
22221 {
22222 rtx label = ix86_expand_aligntest (destptr, 2, false);
22223 destmem = change_address (destmem, HImode, destptr);
22224 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22225 ix86_adjust_counter (count, 2);
22226 emit_label (label);
22227 LABEL_NUSES (label) = 1;
22228 }
22229 if (align <= 4 && desired_alignment > 4)
22230 {
22231 rtx label = ix86_expand_aligntest (destptr, 4, false);
22232 destmem = change_address (destmem, SImode, destptr);
22233 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22234 ix86_adjust_counter (count, 4);
22235 emit_label (label);
22236 LABEL_NUSES (label) = 1;
22237 }
22238 gcc_assert (desired_alignment <= 8);
22239 }
22240
22241 /* Store enough into DST to align DST, known to be aligned by ALIGN, to
22242 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22243 static rtx
22244 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22245 int desired_align, int align_bytes)
22246 {
22247 int off = 0;
22248 rtx orig_dst = dst;
22249 if (align_bytes & 1)
22250 {
22251 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22252 off = 1;
22253 emit_insn (gen_strset (destreg, dst,
22254 gen_lowpart (QImode, value)));
22255 }
22256 if (align_bytes & 2)
22257 {
22258 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22259 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22260 set_mem_align (dst, 2 * BITS_PER_UNIT);
22261 off = 2;
22262 emit_insn (gen_strset (destreg, dst,
22263 gen_lowpart (HImode, value)));
22264 }
22265 if (align_bytes & 4)
22266 {
22267 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22268 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22269 set_mem_align (dst, 4 * BITS_PER_UNIT);
22270 off = 4;
22271 emit_insn (gen_strset (destreg, dst,
22272 gen_lowpart (SImode, value)));
22273 }
22274 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22275 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22276 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22277 if (MEM_SIZE_KNOWN_P (orig_dst))
22278 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22279 return dst;
22280 }
22281
22282 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22283 static enum stringop_alg
22284 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22285 int *dynamic_check)
22286 {
22287 const struct stringop_algs * algs;
22288 bool optimize_for_speed;
22289 /* Algorithms using the rep prefix want at least edi and ecx;
22290 additionally, memset wants eax and memcpy wants esi. Don't
22291 consider such algorithms if the user has appropriated those
22292 registers for their own purposes. */
22293 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22294 || (memset
22295 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22296
22297 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22298 || (alg != rep_prefix_1_byte \
22299 && alg != rep_prefix_4_byte \
22300 && alg != rep_prefix_8_byte))
22301 const struct processor_costs *cost;
22302
22303 /* Even if the string operation call is cold, we still might spend a lot
22304 of time processing large blocks. */
22305 if (optimize_function_for_size_p (cfun)
22306 || (optimize_insn_for_size_p ()
22307 && expected_size != -1 && expected_size < 256))
22308 optimize_for_speed = false;
22309 else
22310 optimize_for_speed = true;
22311
22312 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22313
22314 *dynamic_check = -1;
22315 if (memset)
22316 algs = &cost->memset[TARGET_64BIT != 0];
22317 else
22318 algs = &cost->memcpy[TARGET_64BIT != 0];
22319 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22320 return ix86_stringop_alg;
22321 /* rep; movq or rep; movl is the smallest variant. */
22322 else if (!optimize_for_speed)
22323 {
22324 if (!count || (count & 3))
22325 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22326 else
22327 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22328 }
22329 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22330 */
22331 else if (expected_size != -1 && expected_size < 4)
22332 return loop_1_byte;
22333 else if (expected_size != -1)
22334 {
22335 unsigned int i;
22336 enum stringop_alg alg = libcall;
22337 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22338 {
22339 /* We get here if the algorithms that were not libcall-based
22340 were rep-prefix based and we are unable to use rep prefixes
22341 based on global register usage. Break out of the loop and
22342 use the heuristic below. */
22343 if (algs->size[i].max == 0)
22344 break;
22345 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22346 {
22347 enum stringop_alg candidate = algs->size[i].alg;
22348
22349 if (candidate != libcall && ALG_USABLE_P (candidate))
22350 alg = candidate;
22351 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22352 last non-libcall inline algorithm. */
22353 if (TARGET_INLINE_ALL_STRINGOPS)
22354 {
22355 /* When the current size is best copied by a libcall, but we
22356 are still forced to inline, run the heuristic below that
22357 will pick code for medium-sized blocks. */
22358 if (alg != libcall)
22359 return alg;
22360 break;
22361 }
22362 else if (ALG_USABLE_P (candidate))
22363 return candidate;
22364 }
22365 }
22366 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22367 }
22368 /* When asked to inline the call anyway, try to pick a meaningful choice.
22369 We look for the maximal size of block that is faster to copy by hand and
22370 take blocks of at most that size, guessing that the average size will
22371 be roughly half of the block.
22372 
22373 If this turns out to be bad, we might simply specify the preferred
22374 choice in ix86_costs. */
22375 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22376 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22377 {
22378 int max = -1;
22379 enum stringop_alg alg;
22380 int i;
22381 bool any_alg_usable_p = true;
22382
22383 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22384 {
22385 enum stringop_alg candidate = algs->size[i].alg;
22386 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22387
22388 if (candidate != libcall && candidate
22389 && ALG_USABLE_P (candidate))
22390 max = algs->size[i].max;
22391 }
22392 /* If there aren't any usable algorithms, then recursing on
22393 smaller sizes isn't going to find anything. Just return the
22394 simple byte-at-a-time copy loop. */
22395 if (!any_alg_usable_p)
22396 {
22397 /* Pick something reasonable. */
22398 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22399 *dynamic_check = 128;
22400 return loop_1_byte;
22401 }
22402 if (max == -1)
22403 max = 4096;
22404 alg = decide_alg (count, max / 2, memset, dynamic_check);
22405 gcc_assert (*dynamic_check == -1);
22406 gcc_assert (alg != libcall);
22407 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22408 *dynamic_check = max;
22409 return alg;
22410 }
22411 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22412 #undef ALG_USABLE_P
22413 }
22414
22415 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22416 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22417 static int
22418 decide_alignment (int align,
22419 enum stringop_alg alg,
22420 int expected_size)
22421 {
22422 int desired_align = 0;
22423 switch (alg)
22424 {
22425 case no_stringop:
22426 gcc_unreachable ();
22427 case loop:
22428 case unrolled_loop:
22429 desired_align = GET_MODE_SIZE (Pmode);
22430 break;
22431 case rep_prefix_8_byte:
22432 desired_align = 8;
22433 break;
22434 case rep_prefix_4_byte:
22435 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22436 copying a whole cache line at once. */
22437 if (TARGET_PENTIUMPRO)
22438 desired_align = 8;
22439 else
22440 desired_align = 4;
22441 break;
22442 case rep_prefix_1_byte:
22443 /* PentiumPro has special logic triggering for 8-byte-aligned blocks,
22444 copying a whole cache line at once. */
22445 if (TARGET_PENTIUMPRO)
22446 desired_align = 8;
22447 else
22448 desired_align = 1;
22449 break;
22450 case loop_1_byte:
22451 desired_align = 1;
22452 break;
22453 case libcall:
22454 return 0;
22455 }
22456
22457 if (optimize_size)
22458 desired_align = 1;
22459 if (desired_align < align)
22460 desired_align = align;
22461 if (expected_size != -1 && expected_size < 4)
22462 desired_align = align;
22463 return desired_align;
22464 }
22465
22466 /* Return the smallest power of 2 greater than VAL. */
22467 static int
22468 smallest_pow2_greater_than (int val)
22469 {
22470 int ret = 1;
22471 while (ret <= val)
22472 ret <<= 1;
22473 return ret;
22474 }
22475
22476 /* Expand string move (memcpy) operation. Use i386 string operations
22477 when profitable. expand_setmem contains similar code. The code
22478 depends upon architecture, block size and alignment, but always has
22479 the same overall structure:
22480
22481 1) Prologue guard: a conditional that jumps ahead to the epilogue for small
22482 blocks that can be handled by the epilogue alone. This is faster
22483 but also needed for correctness, since the prologue assumes the block
22484 is larger than the desired alignment.
22485 
22486 An optional dynamic check for size, and a libcall for large
22487 blocks, is emitted here too, with -minline-stringops-dynamically.
22488 
22489 2) Prologue: copy the first few bytes in order to get the destination
22490 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22491 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22492 copied. We emit either a jump tree (on power-of-two sized
22493 blocks) or a byte loop.
22494 
22495 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22496 with the specified algorithm.
22497 
22498 4) Epilogue: code copying the tail of the block that is too small to be
22499 handled by the main body (or up to the size guarded by the prologue guard). */
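/* As a rough sketch (illustrative pseudo-code, not the exact RTL emitted),
   the generated code has this shape:

     if (count < epilogue_size_needed)
       goto epilogue;                                (step 1)
     while (dest not aligned to desired_align)       (step 2)
       copy 1/2/4 bytes and decrease count;
     copy size_needed bytes per iteration            (step 3)
       with a loop or a rep-prefixed insn;
   epilogue:                                         (step 4)
     copy the remaining count & (epilogue_size_needed - 1) bytes.  */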
22500
22501 bool
22502 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22503 rtx expected_align_exp, rtx expected_size_exp)
22504 {
22505 rtx destreg;
22506 rtx srcreg;
22507 rtx label = NULL;
22508 rtx tmp;
22509 rtx jump_around_label = NULL;
22510 HOST_WIDE_INT align = 1;
22511 unsigned HOST_WIDE_INT count = 0;
22512 HOST_WIDE_INT expected_size = -1;
22513 int size_needed = 0, epilogue_size_needed;
22514 int desired_align = 0, align_bytes = 0;
22515 enum stringop_alg alg;
22516 int dynamic_check;
22517 bool need_zero_guard = false;
22518
22519 if (CONST_INT_P (align_exp))
22520 align = INTVAL (align_exp);
22521 /* i386 can do misaligned access at a reasonably increased cost. */
22522 if (CONST_INT_P (expected_align_exp)
22523 && INTVAL (expected_align_exp) > align)
22524 align = INTVAL (expected_align_exp);
22525 /* ALIGN is the minimum of destination and source alignment, but we care here
22526 just about destination alignment. */
22527 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22528 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22529
22530 if (CONST_INT_P (count_exp))
22531 count = expected_size = INTVAL (count_exp);
22532 if (CONST_INT_P (expected_size_exp) && count == 0)
22533 expected_size = INTVAL (expected_size_exp);
22534
22535 /* Make sure we don't need to care about overflow later on. */
22536 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22537 return false;
22538
22539 /* Step 0: Decide on preferred algorithm, desired alignment and
22540 size of chunks to be copied by main loop. */
22541
22542 alg = decide_alg (count, expected_size, false, &dynamic_check);
22543 desired_align = decide_alignment (align, alg, expected_size);
22544
22545 if (!TARGET_ALIGN_STRINGOPS)
22546 align = desired_align;
22547
22548 if (alg == libcall)
22549 return false;
22550 gcc_assert (alg != no_stringop);
22551 if (!count)
22552 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22553 destreg = copy_addr_to_reg (XEXP (dst, 0));
22554 srcreg = copy_addr_to_reg (XEXP (src, 0));
22555 switch (alg)
22556 {
22557 case libcall:
22558 case no_stringop:
22559 gcc_unreachable ();
22560 case loop:
22561 need_zero_guard = true;
22562 size_needed = GET_MODE_SIZE (word_mode);
22563 break;
22564 case unrolled_loop:
22565 need_zero_guard = true;
22566 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22567 break;
22568 case rep_prefix_8_byte:
22569 size_needed = 8;
22570 break;
22571 case rep_prefix_4_byte:
22572 size_needed = 4;
22573 break;
22574 case rep_prefix_1_byte:
22575 size_needed = 1;
22576 break;
22577 case loop_1_byte:
22578 need_zero_guard = true;
22579 size_needed = 1;
22580 break;
22581 }
22582
22583 epilogue_size_needed = size_needed;
22584
22585 /* Step 1: Prologue guard. */
22586
22587 /* Alignment code needs count to be in register. */
22588 if (CONST_INT_P (count_exp) && desired_align > align)
22589 {
22590 if (INTVAL (count_exp) > desired_align
22591 && INTVAL (count_exp) > size_needed)
22592 {
22593 align_bytes
22594 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22595 if (align_bytes <= 0)
22596 align_bytes = 0;
22597 else
22598 align_bytes = desired_align - align_bytes;
22599 }
22600 if (align_bytes == 0)
22601 count_exp = force_reg (counter_mode (count_exp), count_exp);
22602 }
22603 gcc_assert (desired_align >= 1 && align >= 1);
22604
22605 /* Ensure that alignment prologue won't copy past end of block. */
22606 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22607 {
22608 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22609 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
22610 Make sure it is a power of 2. */
22611 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22612
22613 if (count)
22614 {
22615 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22616 {
22617 /* If main algorithm works on QImode, no epilogue is needed.
22618 For small sizes just don't align anything. */
22619 if (size_needed == 1)
22620 desired_align = align;
22621 else
22622 goto epilogue;
22623 }
22624 }
22625 else
22626 {
22627 label = gen_label_rtx ();
22628 emit_cmp_and_jump_insns (count_exp,
22629 GEN_INT (epilogue_size_needed),
22630 LTU, 0, counter_mode (count_exp), 1, label);
22631 if (expected_size == -1 || expected_size < epilogue_size_needed)
22632 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22633 else
22634 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22635 }
22636 }
22637
22638 /* Emit code to decide at runtime whether a library call or inline code
22639 should be used. */
22640 if (dynamic_check != -1)
22641 {
22642 if (CONST_INT_P (count_exp))
22643 {
22644 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22645 {
22646 emit_block_move_via_libcall (dst, src, count_exp, false);
22647 count_exp = const0_rtx;
22648 goto epilogue;
22649 }
22650 }
22651 else
22652 {
22653 rtx hot_label = gen_label_rtx ();
22654 jump_around_label = gen_label_rtx ();
22655 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22656 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22657 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22658 emit_block_move_via_libcall (dst, src, count_exp, false);
22659 emit_jump (jump_around_label);
22660 emit_label (hot_label);
22661 }
22662 }
22663
22664 /* Step 2: Alignment prologue. */
22665
22666 if (desired_align > align)
22667 {
22668 if (align_bytes == 0)
22669 {
22670 /* Except for the first move in the epilogue, we no longer know
22671 the constant offset in the aliasing info. It doesn't seem worth
22672 the pain to maintain it for the first move, so throw away
22673 the info early. */
22674 src = change_address (src, BLKmode, srcreg);
22675 dst = change_address (dst, BLKmode, destreg);
22676 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22677 desired_align);
22678 }
22679 else
22680 {
22681 /* If we know how many bytes need to be stored before dst is
22682 sufficiently aligned, maintain aliasing info accurately. */
22683 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22684 desired_align, align_bytes);
22685 count_exp = plus_constant (counter_mode (count_exp),
22686 count_exp, -align_bytes);
22687 count -= align_bytes;
22688 }
22689 if (need_zero_guard
22690 && (count < (unsigned HOST_WIDE_INT) size_needed
22691 || (align_bytes == 0
22692 && count < ((unsigned HOST_WIDE_INT) size_needed
22693 + desired_align - align))))
22694 {
22695 /* It is possible that we copied enough so the main loop will not
22696 execute. */
22697 gcc_assert (size_needed > 1);
22698 if (label == NULL_RTX)
22699 label = gen_label_rtx ();
22700 emit_cmp_and_jump_insns (count_exp,
22701 GEN_INT (size_needed),
22702 LTU, 0, counter_mode (count_exp), 1, label);
22703 if (expected_size == -1
22704 || expected_size < (desired_align - align) / 2 + size_needed)
22705 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22706 else
22707 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22708 }
22709 }
22710 if (label && size_needed == 1)
22711 {
22712 emit_label (label);
22713 LABEL_NUSES (label) = 1;
22714 label = NULL;
22715 epilogue_size_needed = 1;
22716 }
22717 else if (label == NULL_RTX)
22718 epilogue_size_needed = size_needed;
22719
22720 /* Step 3: Main loop. */
22721
22722 switch (alg)
22723 {
22724 case libcall:
22725 case no_stringop:
22726 gcc_unreachable ();
22727 case loop_1_byte:
22728 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22729 count_exp, QImode, 1, expected_size);
22730 break;
22731 case loop:
22732 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22733 count_exp, word_mode, 1, expected_size);
22734 break;
22735 case unrolled_loop:
22736 /* Unroll only by a factor of 2 in 32-bit mode, since we don't have enough
22737 registers for 4 temporaries anyway. */
22738 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22739 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22740 expected_size);
22741 break;
22742 case rep_prefix_8_byte:
22743 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22744 DImode);
22745 break;
22746 case rep_prefix_4_byte:
22747 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22748 SImode);
22749 break;
22750 case rep_prefix_1_byte:
22751 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22752 QImode);
22753 break;
22754 }
22755 /* Properly adjust the offsets of the src and dest memory for aliasing. */
22756 if (CONST_INT_P (count_exp))
22757 {
22758 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22759 (count / size_needed) * size_needed);
22760 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22761 (count / size_needed) * size_needed);
22762 }
22763 else
22764 {
22765 src = change_address (src, BLKmode, srcreg);
22766 dst = change_address (dst, BLKmode, destreg);
22767 }
22768
22769 /* Step 4: Epilogue to copy the remaining bytes. */
22770 epilogue:
22771 if (label)
22772 {
22773 /* When the main loop is done, COUNT_EXP might hold the original count,
22774 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22775 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22776 bytes. Compensate if needed. */
22777
22778 if (size_needed < epilogue_size_needed)
22779 {
22780 tmp =
22781 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22782 GEN_INT (size_needed - 1), count_exp, 1,
22783 OPTAB_DIRECT);
22784 if (tmp != count_exp)
22785 emit_move_insn (count_exp, tmp);
22786 }
22787 emit_label (label);
22788 LABEL_NUSES (label) = 1;
22789 }
22790
22791 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22792 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22793 epilogue_size_needed);
22794 if (jump_around_label)
22795 emit_label (jump_around_label);
22796 return true;
22797 }
22798
22799 /* Helper function for memset. For the QImode value 0xXY produce
22800 0xXYXYXYXY of the width specified by MODE. This is essentially
22801 a * 0x01010101, but we can do slightly better than
22802 synth_mult by unwinding the sequence by hand on CPUs with
22803 slow multiply. */
22804 static rtx
22805 promote_duplicated_reg (enum machine_mode mode, rtx val)
22806 {
22807 enum machine_mode valmode = GET_MODE (val);
22808 rtx tmp;
22809 int nops = mode == DImode ? 3 : 2;
22810
22811 gcc_assert (mode == SImode || mode == DImode);
22812 if (val == const0_rtx)
22813 return copy_to_mode_reg (mode, const0_rtx);
22814 if (CONST_INT_P (val))
22815 {
22816 HOST_WIDE_INT v = INTVAL (val) & 255;
22817
22818 v |= v << 8;
22819 v |= v << 16;
22820 if (mode == DImode)
22821 v |= (v << 16) << 16;
22822 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22823 }
22824
22825 if (valmode == VOIDmode)
22826 valmode = QImode;
22827 if (valmode != QImode)
22828 val = gen_lowpart (QImode, val);
22829 if (mode == QImode)
22830 return val;
22831 if (!TARGET_PARTIAL_REG_STALL)
22832 nops--;
22833 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22834 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22835 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22836 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22837 {
22838 rtx reg = convert_modes (mode, QImode, val, true);
22839 tmp = promote_duplicated_reg (mode, const1_rtx);
22840 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
22841 OPTAB_DIRECT);
22842 }
22843 else
22844 {
22845 rtx reg = convert_modes (mode, QImode, val, true);
22846
22847 if (!TARGET_PARTIAL_REG_STALL)
22848 if (mode == SImode)
22849 emit_insn (gen_movsi_insv_1 (reg, reg));
22850 else
22851 emit_insn (gen_movdi_insv_1 (reg, reg));
22852 else
22853 {
22854 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
22855 NULL, 1, OPTAB_DIRECT);
22856 reg =
22857 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22858 }
22859 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
22860 NULL, 1, OPTAB_DIRECT);
22861 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22862 if (mode == SImode)
22863 return reg;
22864 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
22865 NULL, 1, OPTAB_DIRECT);
22866 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
22867 return reg;
22868 }
22869 }
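
/* A minimal host-side sketch (not part of GCC) of the duplication expanded
   above: replicate the low byte of VAL across a 32-bit or 64-bit register,
   either with one multiply or with the shift/or sequence used on CPUs where
   multiply is slow.  */
#if 0
static unsigned long long
duplicate_byte_sketch (unsigned char byte, int use_mult, int sixty_four_bit)
{
  unsigned long long v = byte;
  if (use_mult)
    return v * (sixty_four_bit ? 0x0101010101010101ULL : 0x01010101ULL);
  v |= v << 8;                  /* 0x000000XY -> 0x0000XYXY */
  v |= v << 16;                 /* 0x0000XYXY -> 0xXYXYXYXY */
  if (sixty_four_bit)
    v |= v << 32;               /* widen to 0xXYXYXYXYXYXYXYXY */
  return v;
}
#endif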
22870
22871 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
22872 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
22873 getting alignment from ALIGN to DESIRED_ALIGN. */
22874 static rtx
22875 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
22876 {
22877 rtx promoted_val;
22878
22879 if (TARGET_64BIT
22880 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
22881 promoted_val = promote_duplicated_reg (DImode, val);
22882 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
22883 promoted_val = promote_duplicated_reg (SImode, val);
22884 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
22885 promoted_val = promote_duplicated_reg (HImode, val);
22886 else
22887 promoted_val = val;
22888
22889 return promoted_val;
22890 }
22891
22892 /* Expand string set operation (memset). Use i386 string operations when
22893 profitable. See the ix86_expand_movmem comment for an explanation of the
22894 individual steps performed. */
22895 bool
22896 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
22897 rtx expected_align_exp, rtx expected_size_exp)
22898 {
22899 rtx destreg;
22900 rtx label = NULL;
22901 rtx tmp;
22902 rtx jump_around_label = NULL;
22903 HOST_WIDE_INT align = 1;
22904 unsigned HOST_WIDE_INT count = 0;
22905 HOST_WIDE_INT expected_size = -1;
22906 int size_needed = 0, epilogue_size_needed;
22907 int desired_align = 0, align_bytes = 0;
22908 enum stringop_alg alg;
22909 rtx promoted_val = NULL;
22910 bool force_loopy_epilogue = false;
22911 int dynamic_check;
22912 bool need_zero_guard = false;
22913
22914 if (CONST_INT_P (align_exp))
22915 align = INTVAL (align_exp);
22916 /* i386 can do misaligned access at a reasonably increased cost. */
22917 if (CONST_INT_P (expected_align_exp)
22918 && INTVAL (expected_align_exp) > align)
22919 align = INTVAL (expected_align_exp);
22920 if (CONST_INT_P (count_exp))
22921 count = expected_size = INTVAL (count_exp);
22922 if (CONST_INT_P (expected_size_exp) && count == 0)
22923 expected_size = INTVAL (expected_size_exp);
22924
22925 /* Make sure we don't need to care about overflow later on. */
22926 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22927 return false;
22928
22929 /* Step 0: Decide on preferred algorithm, desired alignment and
22930 size of chunks to be copied by main loop. */
22931
22932 alg = decide_alg (count, expected_size, true, &dynamic_check);
22933 desired_align = decide_alignment (align, alg, expected_size);
22934
22935 if (!TARGET_ALIGN_STRINGOPS)
22936 align = desired_align;
22937
22938 if (alg == libcall)
22939 return false;
22940 gcc_assert (alg != no_stringop);
22941 if (!count)
22942 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
22943 destreg = copy_addr_to_reg (XEXP (dst, 0));
22944 switch (alg)
22945 {
22946 case libcall:
22947 case no_stringop:
22948 gcc_unreachable ();
22949 case loop:
22950 need_zero_guard = true;
22951 size_needed = GET_MODE_SIZE (word_mode);
22952 break;
22953 case unrolled_loop:
22954 need_zero_guard = true;
22955 size_needed = GET_MODE_SIZE (word_mode) * 4;
22956 break;
22957 case rep_prefix_8_byte:
22958 size_needed = 8;
22959 break;
22960 case rep_prefix_4_byte:
22961 size_needed = 4;
22962 break;
22963 case rep_prefix_1_byte:
22964 size_needed = 1;
22965 break;
22966 case loop_1_byte:
22967 need_zero_guard = true;
22968 size_needed = 1;
22969 break;
22970 }
22971 epilogue_size_needed = size_needed;
22972
22973 /* Step 1: Prologue guard. */
22974
22975 /* Alignment code needs count to be in register. */
22976 if (CONST_INT_P (count_exp) && desired_align > align)
22977 {
22978 if (INTVAL (count_exp) > desired_align
22979 && INTVAL (count_exp) > size_needed)
22980 {
22981 align_bytes
22982 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22983 if (align_bytes <= 0)
22984 align_bytes = 0;
22985 else
22986 align_bytes = desired_align - align_bytes;
22987 }
22988 if (align_bytes == 0)
22989 {
22990 enum machine_mode mode = SImode;
22991 if (TARGET_64BIT && (count & ~0xffffffff))
22992 mode = DImode;
22993 count_exp = force_reg (mode, count_exp);
22994 }
22995 }
22996 /* Do the cheap promotion to allow better CSE across the
22997 main loop and epilogue (i.e. one load of the big constant in
22998 front of all the code). */
22999 if (CONST_INT_P (val_exp))
23000 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23001 desired_align, align);
23002 /* Ensure that alignment prologue won't copy past end of block. */
23003 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23004 {
23005 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23006 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23007 Make sure it is a power of 2. */
23008 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23009
23010 /* To improve performance of small blocks, we jump around the VAL
23011 promoting code. This means that if the promoted VAL is not a constant,
23012 we might not use it in the epilogue and have to use the byte
23013 loop variant. */
23014 if (epilogue_size_needed > 2 && !promoted_val)
23015 force_loopy_epilogue = true;
23016 if (count)
23017 {
23018 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23019 {
23020 /* If main algorithm works on QImode, no epilogue is needed.
23021 For small sizes just don't align anything. */
23022 if (size_needed == 1)
23023 desired_align = align;
23024 else
23025 goto epilogue;
23026 }
23027 }
23028 else
23029 {
23030 label = gen_label_rtx ();
23031 emit_cmp_and_jump_insns (count_exp,
23032 GEN_INT (epilogue_size_needed),
23033 LTU, 0, counter_mode (count_exp), 1, label);
23034 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23035 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23036 else
23037 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23038 }
23039 }
23040 if (dynamic_check != -1)
23041 {
23042 rtx hot_label = gen_label_rtx ();
23043 jump_around_label = gen_label_rtx ();
23044 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23045 LEU, 0, counter_mode (count_exp), 1, hot_label);
23046 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23047 set_storage_via_libcall (dst, count_exp, val_exp, false);
23048 emit_jump (jump_around_label);
23049 emit_label (hot_label);
23050 }
23051
23052 /* Step 2: Alignment prologue. */
23053
23054 /* Do the expensive promotion once we branched off the small blocks. */
23055 if (!promoted_val)
23056 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23057 desired_align, align);
23058 gcc_assert (desired_align >= 1 && align >= 1);
23059
23060 if (desired_align > align)
23061 {
23062 if (align_bytes == 0)
23063 {
23064 /* Except for the first move in the epilogue, we no longer know
23065 the constant offset in the aliasing info. It doesn't seem worth
23066 the pain to maintain it for the first move, so throw away
23067 the info early. */
23068 dst = change_address (dst, BLKmode, destreg);
23069 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23070 desired_align);
23071 }
23072 else
23073 {
23074 /* If we know how many bytes need to be stored before dst is
23075 sufficiently aligned, maintain aliasing info accurately. */
23076 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23077 desired_align, align_bytes);
23078 count_exp = plus_constant (counter_mode (count_exp),
23079 count_exp, -align_bytes);
23080 count -= align_bytes;
23081 }
23082 if (need_zero_guard
23083 && (count < (unsigned HOST_WIDE_INT) size_needed
23084 || (align_bytes == 0
23085 && count < ((unsigned HOST_WIDE_INT) size_needed
23086 + desired_align - align))))
23087 {
23088 /* It is possible that we copied enough so the main loop will not
23089 execute. */
23090 gcc_assert (size_needed > 1);
23091 if (label == NULL_RTX)
23092 label = gen_label_rtx ();
23093 emit_cmp_and_jump_insns (count_exp,
23094 GEN_INT (size_needed),
23095 LTU, 0, counter_mode (count_exp), 1, label);
23096 if (expected_size == -1
23097 || expected_size < (desired_align - align) / 2 + size_needed)
23098 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23099 else
23100 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23101 }
23102 }
23103 if (label && size_needed == 1)
23104 {
23105 emit_label (label);
23106 LABEL_NUSES (label) = 1;
23107 label = NULL;
23108 promoted_val = val_exp;
23109 epilogue_size_needed = 1;
23110 }
23111 else if (label == NULL_RTX)
23112 epilogue_size_needed = size_needed;
23113
23114 /* Step 3: Main loop. */
23115
23116 switch (alg)
23117 {
23118 case libcall:
23119 case no_stringop:
23120 gcc_unreachable ();
23121 case loop_1_byte:
23122 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23123 count_exp, QImode, 1, expected_size);
23124 break;
23125 case loop:
23126 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23127 count_exp, word_mode, 1, expected_size);
23128 break;
23129 case unrolled_loop:
23130 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23131 count_exp, word_mode, 4, expected_size);
23132 break;
23133 case rep_prefix_8_byte:
23134 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23135 DImode, val_exp);
23136 break;
23137 case rep_prefix_4_byte:
23138 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23139 SImode, val_exp);
23140 break;
23141 case rep_prefix_1_byte:
23142 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23143 QImode, val_exp);
23144 break;
23145 }
23146 /* Properly adjust the offset of the dest memory for aliasing. */
23147 if (CONST_INT_P (count_exp))
23148 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23149 (count / size_needed) * size_needed);
23150 else
23151 dst = change_address (dst, BLKmode, destreg);
23152
23153 /* Step 4: Epilogue to copy the remaining bytes. */
23154
23155 if (label)
23156 {
23157 /* When the main loop is done, COUNT_EXP might hold the original count,
23158 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23159 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23160 bytes. Compensate if needed. */
23161
23162 if (size_needed < epilogue_size_needed)
23163 {
23164 tmp =
23165 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23166 GEN_INT (size_needed - 1), count_exp, 1,
23167 OPTAB_DIRECT);
23168 if (tmp != count_exp)
23169 emit_move_insn (count_exp, tmp);
23170 }
23171 emit_label (label);
23172 LABEL_NUSES (label) = 1;
23173 }
23174 epilogue:
23175 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23176 {
23177 if (force_loopy_epilogue)
23178 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23179 epilogue_size_needed);
23180 else
23181 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23182 epilogue_size_needed);
23183 }
23184 if (jump_around_label)
23185 emit_label (jump_around_label);
23186 return true;
23187 }
23188
23189 /* Expand the appropriate insns for doing strlen if not just doing
23190 repnz; scasb
23191
23192 out = result, initialized with the start address
23193 align_rtx = alignment of the address.
23194 scratch = scratch register, initialized with the start address when
23195 not aligned, otherwise undefined
23196
23197 This is just the body. It needs the initializations mentioned above and
23198 some address computing at the end. These things are done in i386.md. */
23199
23200 static void
23201 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23202 {
23203 int align;
23204 rtx tmp;
23205 rtx align_2_label = NULL_RTX;
23206 rtx align_3_label = NULL_RTX;
23207 rtx align_4_label = gen_label_rtx ();
23208 rtx end_0_label = gen_label_rtx ();
23209 rtx mem;
23210 rtx tmpreg = gen_reg_rtx (SImode);
23211 rtx scratch = gen_reg_rtx (SImode);
23212 rtx cmp;
23213
23214 align = 0;
23215 if (CONST_INT_P (align_rtx))
23216 align = INTVAL (align_rtx);
23217
23218 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23219
23220 /* Is there a known alignment and is it less than 4? */
23221 if (align < 4)
23222 {
23223 rtx scratch1 = gen_reg_rtx (Pmode);
23224 emit_move_insn (scratch1, out);
23225 /* Is there a known alignment and is it not 2? */
23226 if (align != 2)
23227 {
23228 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23229 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23230
23231 /* Leave just the 3 lower bits. */
23232 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23233 NULL_RTX, 0, OPTAB_WIDEN);
23234
23235 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23236 Pmode, 1, align_4_label);
23237 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23238 Pmode, 1, align_2_label);
23239 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23240 Pmode, 1, align_3_label);
23241 }
23242 else
23243 {
23244 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23245 check whether it is aligned to 4 bytes. */
23246
23247 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23248 NULL_RTX, 0, OPTAB_WIDEN);
23249
23250 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23251 Pmode, 1, align_4_label);
23252 }
23253
23254 mem = change_address (src, QImode, out);
23255
23256 /* Now compare the bytes. */
23257
23258 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
23259 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23260 QImode, 1, end_0_label);
23261
23262 /* Increment the address. */
23263 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23264
23265 /* Not needed with an alignment of 2 */
23266 if (align != 2)
23267 {
23268 emit_label (align_2_label);
23269
23270 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23271 end_0_label);
23272
23273 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23274
23275 emit_label (align_3_label);
23276 }
23277
23278 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23279 end_0_label);
23280
23281 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23282 }
23283
23284 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23285 align this loop; it only makes the program larger and does not help to
23286 speed it up. */
23287 emit_label (align_4_label);
23288
23289 mem = change_address (src, SImode, out);
23290 emit_move_insn (scratch, mem);
23291 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23292
23293 /* This formula yields a nonzero result iff one of the bytes is zero.
23294 This saves three branches inside the loop and many cycles. */
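/* In C terms (an illustrative sketch, not GCC code) the test below is
     has_zero (w) = ((w - 0x01010101) & ~w) & 0x80808080
   which is nonzero exactly when some byte of the 32-bit word W is zero;
   e.g. 0x41004242 yields 0x00800000 while 0x41424344 yields 0.  */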
23295
23296 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23297 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23298 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23299 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23300 gen_int_mode (0x80808080, SImode)));
23301 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23302 align_4_label);
23303
23304 if (TARGET_CMOVE)
23305 {
23306 rtx reg = gen_reg_rtx (SImode);
23307 rtx reg2 = gen_reg_rtx (Pmode);
23308 emit_move_insn (reg, tmpreg);
23309 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23310
23311 /* If zero is not in the first two bytes, move two bytes forward. */
23312 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23313 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23314 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23315 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23316 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23317 reg,
23318 tmpreg)));
23319 /* Emit lea manually to avoid clobbering of flags. */
23320 emit_insn (gen_rtx_SET (SImode, reg2,
23321 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23322
23323 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23324 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23325 emit_insn (gen_rtx_SET (VOIDmode, out,
23326 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23327 reg2,
23328 out)));
23329 }
23330 else
23331 {
23332 rtx end_2_label = gen_label_rtx ();
23333 /* Is zero in the first two bytes? */
23334
23335 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23336 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23337 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23338 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23339 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23340 pc_rtx);
23341 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23342 JUMP_LABEL (tmp) = end_2_label;
23343
23344 /* Not in the first two. Move two bytes forward. */
23345 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23346 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23347
23348 emit_label (end_2_label);
23349
23350 }
23351
23352 /* Avoid a branch when fixing up the final byte position. */
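/* Illustrative note: OUT now points 4 bytes past the start of the two-byte
   window selected above as containing the zero, and bit 7 of the low byte
   of TMPREG is set iff the zero is the first byte of that window.  Doubling
   TMPREG copies that bit into the carry flag, so the sbb below computes
   OUT - 3 - carry, i.e. OUT - 4 or OUT - 3, landing exactly on the zero
   byte without a branch.  */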
23353 tmpreg = gen_lowpart (QImode, tmpreg);
23354 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23355 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23356 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23357 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23358
23359 emit_label (end_0_label);
23360 }
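
/* A minimal host-side sketch (not GCC code) of the word-at-a-time strlen
   expanded above; like the generated code it assumes that reading the whole
   aligned word containing the terminating zero is safe.  */
#if 0
static unsigned long
strlen_sketch (const char *s)
{
  const char *p = s;

  /* Prologue: byte checks until P is 4-byte aligned.  */
  while (((unsigned long) p & 3) != 0)
    {
      if (*p == 0)
        return p - s;
      p++;
    }

  /* Main loop: test four bytes at a time with the zero-byte formula.  */
  for (;;)
    {
      unsigned int w = *(const unsigned int *) p;
      if (((w - 0x01010101U) & ~w & 0x80808080U) != 0)
        break;
      p += 4;
    }

  /* Epilogue: locate the zero byte within the final word.  */
  while (*p != 0)
    p++;
  return p - s;
}
#endif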
23361
23362 /* Expand strlen. */
23363
23364 bool
23365 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23366 {
23367 rtx addr, scratch1, scratch2, scratch3, scratch4;
23368
23369 /* The generic case of the strlen expander is long. Avoid expanding
23370 it unless TARGET_INLINE_ALL_STRINGOPS. */
23371
23372 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23373 && !TARGET_INLINE_ALL_STRINGOPS
23374 && !optimize_insn_for_size_p ()
23375 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23376 return false;
23377
23378 addr = force_reg (Pmode, XEXP (src, 0));
23379 scratch1 = gen_reg_rtx (Pmode);
23380
23381 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23382 && !optimize_insn_for_size_p ())
23383 {
23384 /* Well, it seems that some optimizer does not combine a call like
23385 foo (strlen (bar), strlen (bar));
23386 when the move and the subtraction are done here. It does calculate
23387 the length just once when these instructions are done inside
23388 output_strlen_unroll(). But I think that since &bar[strlen(bar)] is
23389 often used and I use one fewer register for the lifetime of
23390 output_strlen_unroll(), this is better. */
23391
23392 emit_move_insn (out, addr);
23393
23394 ix86_expand_strlensi_unroll_1 (out, src, align);
23395
23396 /* strlensi_unroll_1 returns the address of the zero at the end of
23397 the string, like memchr(), so compute the length by subtracting
23398 the start address. */
23399 emit_insn (ix86_gen_sub3 (out, out, addr));
23400 }
23401 else
23402 {
23403 rtx unspec;
23404
23405 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23406 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23407 return false;
23408
23409 scratch2 = gen_reg_rtx (Pmode);
23410 scratch3 = gen_reg_rtx (Pmode);
23411 scratch4 = force_reg (Pmode, constm1_rtx);
23412
23413 emit_move_insn (scratch3, addr);
23414 eoschar = force_reg (QImode, eoschar);
23415
23416 src = replace_equiv_address_nv (src, scratch3);
23417
23418 /* If .md starts supporting :P, this can be done in .md. */
23419 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23420 scratch4), UNSPEC_SCAS);
23421 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23422 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23423 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23424 }
23425 return true;
23426 }
23427
23428 /* For a given symbol (function), construct code to compute the address of its
23429 PLT entry in the large x86-64 PIC model. */
23430 static rtx
23431 construct_plt_address (rtx symbol)
23432 {
23433 rtx tmp, unspec;
23434
23435 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23436 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23437 gcc_assert (Pmode == DImode);
23438
23439 tmp = gen_reg_rtx (Pmode);
23440 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23441
23442 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23443 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23444 return tmp;
23445 }
23446
23447 rtx
23448 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23449 rtx callarg2,
23450 rtx pop, bool sibcall)
23451 {
23452 /* We need to represent that the SI, DI and XMM6-XMM15 registers (call-saved
23453 under the MS ABI) are clobbered by SysV calls. */
23454 static int clobbered_registers[] = {
23455 XMM6_REG, XMM7_REG, XMM8_REG,
23456 XMM9_REG, XMM10_REG, XMM11_REG,
23457 XMM12_REG, XMM13_REG, XMM14_REG,
23458 XMM15_REG, SI_REG, DI_REG
23459 };
23460 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23461 rtx use = NULL, call;
23462 unsigned int vec_len;
23463
23464 if (pop == const0_rtx)
23465 pop = NULL;
23466 gcc_assert (!TARGET_64BIT || !pop);
23467
23468 if (TARGET_MACHO && !TARGET_64BIT)
23469 {
23470 #if TARGET_MACHO
23471 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23472 fnaddr = machopic_indirect_call_target (fnaddr);
23473 #endif
23474 }
23475 else
23476 {
23477 /* Static functions and indirect calls don't need the pic register. */
23478 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23479 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23480 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23481 use_reg (&use, pic_offset_table_rtx);
23482 }
23483
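/* The 64-bit SysV ABI passes, in AL, an upper bound on the number of vector
   registers used by a varargs call; a non-negative CALLARG2 carries that
   count.  */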
23484 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23485 {
23486 rtx al = gen_rtx_REG (QImode, AX_REG);
23487 emit_move_insn (al, callarg2);
23488 use_reg (&use, al);
23489 }
23490
23491 if (ix86_cmodel == CM_LARGE_PIC
23492 && MEM_P (fnaddr)
23493 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23494 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23495 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23496 else if (sibcall
23497 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23498 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23499 {
23500 fnaddr = XEXP (fnaddr, 0);
23501 if (GET_MODE (fnaddr) != word_mode)
23502 fnaddr = convert_to_mode (word_mode, fnaddr, 1);
23503 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23504 }
23505
23506 vec_len = 0;
23507 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23508 if (retval)
23509 call = gen_rtx_SET (VOIDmode, retval, call);
23510 vec[vec_len++] = call;
23511
23512 if (pop)
23513 {
23514 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23515 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23516 vec[vec_len++] = pop;
23517 }
23518
23519 if (TARGET_64BIT_MS_ABI
23520 && (!callarg2 || INTVAL (callarg2) != -2))
23521 {
23522 unsigned i;
23523
23524 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23525 UNSPEC_MS_TO_SYSV_CALL);
23526
23527 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23528 vec[vec_len++]
23529 = gen_rtx_CLOBBER (VOIDmode,
23530 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23531 ? TImode : DImode,
23532 clobbered_registers[i]));
23533 }
23534
23535 if (vec_len > 1)
23536 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23537 call = emit_call_insn (call);
23538 if (use)
23539 CALL_INSN_FUNCTION_USAGE (call) = use;
23540
23541 return call;
23542 }
23543
23544 /* Output the assembly for a call instruction. */
23545
23546 const char *
23547 ix86_output_call_insn (rtx insn, rtx call_op)
23548 {
23549 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23550 bool seh_nop_p = false;
23551 const char *xasm;
23552
23553 if (SIBLING_CALL_P (insn))
23554 {
23555 if (direct_p)
23556 xasm = "jmp\t%P0";
23557 /* SEH epilogue detection requires the indirect branch case
23558 to include REX.W. */
23559 else if (TARGET_SEH)
23560 xasm = "rex.W jmp %A0";
23561 else
23562 xasm = "jmp\t%A0";
23563
23564 output_asm_insn (xasm, &call_op);
23565 return "";
23566 }
23567
23568 /* SEH unwinding can require an extra nop to be emitted in several
23569 circumstances. Determine if we have one of those. */
23570 if (TARGET_SEH)
23571 {
23572 rtx i;
23573
23574 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23575 {
23576 /* If we get to another real insn, we don't need the nop. */
23577 if (INSN_P (i))
23578 break;
23579
23580 /* If we get to the epilogue note, prevent a catch region from
23581 being adjacent to the standard epilogue sequence.  With non-call
23582 exceptions, we'll have done this during epilogue emission. */
23583 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23584 && !flag_non_call_exceptions
23585 && !can_throw_internal (insn))
23586 {
23587 seh_nop_p = true;
23588 break;
23589 }
23590 }
23591
23592 /* If we didn't find a real insn following the call, prevent the
23593 unwinder from looking into the next function. */
23594 if (i == NULL)
23595 seh_nop_p = true;
23596 }
23597
23598 if (direct_p)
23599 xasm = "call\t%P0";
23600 else
23601 xasm = "call\t%A0";
23602
23603 output_asm_insn (xasm, &call_op);
23604
23605 if (seh_nop_p)
23606 return "nop";
23607
23608 return "";
23609 }
23610 \f
23611 /* Clear stack slot assignments remembered from previous functions.
23612 This is called from INIT_EXPANDERS once before RTL is emitted for each
23613 function. */
23614
23615 static struct machine_function *
23616 ix86_init_machine_status (void)
23617 {
23618 struct machine_function *f;
23619
23620 f = ggc_alloc_cleared_machine_function ();
23621 f->use_fast_prologue_epilogue_nregs = -1;
23622 f->call_abi = ix86_abi;
23623
23624 return f;
23625 }
23626
23627 /* Return a MEM corresponding to a stack slot with mode MODE.
23628 Allocate a new slot if necessary.
23629
23630 The RTL for a function can have several slots available: N is
23631 which slot to use. */
23632
23633 rtx
23634 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23635 {
23636 struct stack_local_entry *s;
23637
23638 gcc_assert (n < MAX_386_STACK_LOCALS);
23639
23640 for (s = ix86_stack_locals; s; s = s->next)
23641 if (s->mode == mode && s->n == n)
23642 return validize_mem (copy_rtx (s->rtl));
23643
23644 s = ggc_alloc_stack_local_entry ();
23645 s->n = n;
23646 s->mode = mode;
23647 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23648
23649 s->next = ix86_stack_locals;
23650 ix86_stack_locals = s;
23651 return validize_mem (s->rtl);
23652 }
23653
23654 static void
23655 ix86_instantiate_decls (void)
23656 {
23657 struct stack_local_entry *s;
23658
23659 for (s = ix86_stack_locals; s; s = s->next)
23660 if (s->rtl != NULL_RTX)
23661 instantiate_decl_rtl (s->rtl);
23662 }
23663 \f
23664 /* Calculate the length of the memory address in the instruction encoding.
23665 Includes the addr32 prefix but not the one-byte modrm, opcode,
23666 or other prefixes. We never generate the addr32 prefix for LEA insns. */
23667
23668 int
23669 memory_address_length (rtx addr, bool lea)
23670 {
23671 struct ix86_address parts;
23672 rtx base, index, disp;
23673 int len;
23674 int ok;
23675
23676 if (GET_CODE (addr) == PRE_DEC
23677 || GET_CODE (addr) == POST_INC
23678 || GET_CODE (addr) == PRE_MODIFY
23679 || GET_CODE (addr) == POST_MODIFY)
23680 return 0;
23681
23682 ok = ix86_decompose_address (addr, &parts);
23683 gcc_assert (ok);
23684
23685 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23686
23687 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
23688 if (TARGET_64BIT && !lea
23689 && (SImode_address_operand (addr, VOIDmode)
23690 || (parts.base && GET_MODE (parts.base) == SImode)
23691 || (parts.index && GET_MODE (parts.index) == SImode)))
23692 len++;
23693
23694 base = parts.base;
23695 index = parts.index;
23696 disp = parts.disp;
23697
23698 if (base && GET_CODE (base) == SUBREG)
23699 base = SUBREG_REG (base);
23700 if (index && GET_CODE (index) == SUBREG)
23701 index = SUBREG_REG (index);
23702
23703 gcc_assert (base == NULL_RTX || REG_P (base));
23704 gcc_assert (index == NULL_RTX || REG_P (index));
23705
23706 /* Rule of thumb:
23707 - esp as the base always wants an index,
23708 - ebp as the base always wants a displacement,
23709 - r12 as the base always wants an index,
23710 - r13 as the base always wants a displacement. */
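/* For example, a plain (%esp) base must be encoded with a SIB byte because
   its modrm slot is the SIB escape, and a plain (%ebp) base needs an
   explicit zero displacement because its slot encodes disp32.  */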
23711
23712 /* Register Indirect. */
23713 if (base && !index && !disp)
23714 {
23715 /* esp (for its index) and ebp (for its displacement) need
23716 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23717 code. */
23718 if (base == arg_pointer_rtx
23719 || base == frame_pointer_rtx
23720 || REGNO (base) == SP_REG
23721 || REGNO (base) == BP_REG
23722 || REGNO (base) == R12_REG
23723 || REGNO (base) == R13_REG)
23724 len++;
23725 }
23726
23727 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23728 is not disp32, but disp32(%rip), so for plain disp32 a
23729 SIB byte is needed, unless print_operand_address
23730 optimizes it into disp32(%rip) or (%rip) is implied
23731 by UNSPEC. */
23732 else if (disp && !base && !index)
23733 {
23734 len += 4;
23735 if (TARGET_64BIT)
23736 {
23737 rtx symbol = disp;
23738
23739 if (GET_CODE (disp) == CONST)
23740 symbol = XEXP (disp, 0);
23741 if (GET_CODE (symbol) == PLUS
23742 && CONST_INT_P (XEXP (symbol, 1)))
23743 symbol = XEXP (symbol, 0);
23744
23745 if (GET_CODE (symbol) != LABEL_REF
23746 && (GET_CODE (symbol) != SYMBOL_REF
23747 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23748 && (GET_CODE (symbol) != UNSPEC
23749 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23750 && XINT (symbol, 1) != UNSPEC_PCREL
23751 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23752 len++;
23753 }
23754 }
23755 else
23756 {
23757 /* Find the length of the displacement constant. */
23758 if (disp)
23759 {
23760 if (base && satisfies_constraint_K (disp))
23761 len += 1;
23762 else
23763 len += 4;
23764 }
23765 /* ebp always wants a displacement. Similarly r13. */
23766 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23767 len++;
23768
23769 /* An index requires the two-byte modrm form.... */
23770 if (index
23771 /* ...like esp (or r12), which always wants an index. */
23772 || base == arg_pointer_rtx
23773 || base == frame_pointer_rtx
23774 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23775 len++;
23776 }
23777
23778 return len;
23779 }
23780
23781 /* Compute the default value for the "length_immediate" attribute. When SHORTFORM
23782 is set, expect that the insn has an 8-bit immediate alternative. */
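/* For example, an add whose immediate fits in [-128, 127] can use the
   sign-extended imm8 form (opcode group 0x83), so the immediate then
   contributes only one byte.  */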
23783 int
23784 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23785 {
23786 int len = 0;
23787 int i;
23788 extract_insn_cached (insn);
23789 for (i = recog_data.n_operands - 1; i >= 0; --i)
23790 if (CONSTANT_P (recog_data.operand[i]))
23791 {
23792 enum attr_mode mode = get_attr_mode (insn);
23793
23794 gcc_assert (!len);
23795 if (shortform && CONST_INT_P (recog_data.operand[i]))
23796 {
23797 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23798 switch (mode)
23799 {
23800 case MODE_QI:
23801 len = 1;
23802 continue;
23803 case MODE_HI:
23804 ival = trunc_int_for_mode (ival, HImode);
23805 break;
23806 case MODE_SI:
23807 ival = trunc_int_for_mode (ival, SImode);
23808 break;
23809 default:
23810 break;
23811 }
23812 if (IN_RANGE (ival, -128, 127))
23813 {
23814 len = 1;
23815 continue;
23816 }
23817 }
23818 switch (mode)
23819 {
23820 case MODE_QI:
23821 len = 1;
23822 break;
23823 case MODE_HI:
23824 len = 2;
23825 break;
23826 case MODE_SI:
23827 len = 4;
23828 break;
23829 /* Immediates for DImode instructions are encoded
23830 as 32bit sign extended values. */
23831 case MODE_DI:
23832 len = 4;
23833 break;
23834 default:
23835 fatal_insn ("unknown insn mode", insn);
23836 }
23837 }
23838 return len;
23839 }
23840
23841 /* Compute default value for "length_address" attribute. */
23842 int
23843 ix86_attr_length_address_default (rtx insn)
23844 {
23845 int i;
23846
23847 if (get_attr_type (insn) == TYPE_LEA)
23848 {
23849 rtx set = PATTERN (insn), addr;
23850
23851 if (GET_CODE (set) == PARALLEL)
23852 set = XVECEXP (set, 0, 0);
23853
23854 gcc_assert (GET_CODE (set) == SET);
23855
23856 addr = SET_SRC (set);
23857
23858 return memory_address_length (addr, true);
23859 }
23860
23861 extract_insn_cached (insn);
23862 for (i = recog_data.n_operands - 1; i >= 0; --i)
23863 if (MEM_P (recog_data.operand[i]))
23864 {
23865 constrain_operands_cached (reload_completed);
23866 if (which_alternative != -1)
23867 {
23868 const char *constraints = recog_data.constraints[i];
23869 int alt = which_alternative;
23870
23871 while (*constraints == '=' || *constraints == '+')
23872 constraints++;
23873 while (alt-- > 0)
23874 while (*constraints++ != ',')
23875 ;
23876 /* Skip ignored operands. */
23877 if (*constraints == 'X')
23878 continue;
23879 }
23880 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
23881 }
23882 return 0;
23883 }
23884
23885 /* Compute default value for "length_vex" attribute. It includes
23886 2 or 3 byte VEX prefix and 1 opcode byte. */
23887
23888 int
23889 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
23890 {
23891 int i;
23892
23893 /* Only insns with a 0f opcode map can use the 2-byte VEX prefix; the VEX.W
23894 bit requires the 3-byte VEX prefix. */
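/* (The 2-byte form uses the 0xC5 escape byte; the 3-byte form uses 0xC4.)  */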
23895 if (!has_0f_opcode || has_vex_w)
23896 return 3 + 1;
23897
23898 /* We can always use 2 byte VEX prefix in 32bit. */
23899 if (!TARGET_64BIT)
23900 return 2 + 1;
23901
23902 extract_insn_cached (insn);
23903
23904 for (i = recog_data.n_operands - 1; i >= 0; --i)
23905 if (REG_P (recog_data.operand[i]))
23906 {
23907 /* REX.W bit uses 3 byte VEX prefix. */
23908 if (GET_MODE (recog_data.operand[i]) == DImode
23909 && GENERAL_REG_P (recog_data.operand[i]))
23910 return 3 + 1;
23911 }
23912 else
23913 {
23914 /* REX.X or REX.B bits use 3 byte VEX prefix. */
23915 if (MEM_P (recog_data.operand[i])
23916 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
23917 return 3 + 1;
23918 }
23919
23920 return 2 + 1;
23921 }
23922 \f
23923 /* Return the maximum number of instructions a cpu can issue. */
23924
23925 static int
23926 ix86_issue_rate (void)
23927 {
23928 switch (ix86_tune)
23929 {
23930 case PROCESSOR_PENTIUM:
23931 case PROCESSOR_ATOM:
23932 case PROCESSOR_K6:
23933 case PROCESSOR_BTVER2:
23934 return 2;
23935
23936 case PROCESSOR_PENTIUMPRO:
23937 case PROCESSOR_PENTIUM4:
23938 case PROCESSOR_CORE2_32:
23939 case PROCESSOR_CORE2_64:
23940 case PROCESSOR_COREI7_32:
23941 case PROCESSOR_COREI7_64:
23942 case PROCESSOR_ATHLON:
23943 case PROCESSOR_K8:
23944 case PROCESSOR_AMDFAM10:
23945 case PROCESSOR_NOCONA:
23946 case PROCESSOR_GENERIC32:
23947 case PROCESSOR_GENERIC64:
23948 case PROCESSOR_BDVER1:
23949 case PROCESSOR_BDVER2:
23950 case PROCESSOR_BDVER3:
23951 case PROCESSOR_BTVER1:
23952 return 3;
23953
23954 default:
23955 return 1;
23956 }
23957 }
23958
23959 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags set
23960 by DEP_INSN and nothing else set by DEP_INSN. */
23961
23962 static bool
23963 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
23964 {
23965 rtx set, set2;
23966
23967 /* Simplify the test for uninteresting insns. */
23968 if (insn_type != TYPE_SETCC
23969 && insn_type != TYPE_ICMOV
23970 && insn_type != TYPE_FCMOV
23971 && insn_type != TYPE_IBR)
23972 return false;
23973
23974 if ((set = single_set (dep_insn)) != 0)
23975 {
23976 set = SET_DEST (set);
23977 set2 = NULL_RTX;
23978 }
23979 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
23980 && XVECLEN (PATTERN (dep_insn), 0) == 2
23981 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
23982 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
23983 {
23984 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
23985 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
23986 }
23987 else
23988 return false;
23989
23990 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
23991 return false;
23992
23993 /* This test is true if the dependent insn reads the flags but
23994 not any other potentially set register. */
23995 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
23996 return false;
23997
23998 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
23999 return false;
24000
24001 return true;
24002 }
24003
24004 /* Return true iff USE_INSN has a memory address with operands set by
24005 SET_INSN. */
24006
24007 bool
24008 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24009 {
24010 int i;
24011 extract_insn_cached (use_insn);
24012 for (i = recog_data.n_operands - 1; i >= 0; --i)
24013 if (MEM_P (recog_data.operand[i]))
24014 {
24015 rtx addr = XEXP (recog_data.operand[i], 0);
24016 return modified_in_p (addr, set_insn) != 0;
24017 }
24018 return false;
24019 }
24020
24021 static int
24022 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24023 {
24024 enum attr_type insn_type, dep_insn_type;
24025 enum attr_memory memory;
24026 rtx set, set2;
24027 int dep_insn_code_number;
24028
24029 /* Anti and output dependencies have zero cost on all CPUs. */
24030 if (REG_NOTE_KIND (link) != 0)
24031 return 0;
24032
24033 dep_insn_code_number = recog_memoized (dep_insn);
24034
24035 /* If we can't recognize the insns, we can't really do anything. */
24036 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24037 return cost;
24038
24039 insn_type = get_attr_type (insn);
24040 dep_insn_type = get_attr_type (dep_insn);
24041
24042 switch (ix86_tune)
24043 {
24044 case PROCESSOR_PENTIUM:
24045 /* Address Generation Interlock adds a cycle of latency. */
24046 if (insn_type == TYPE_LEA)
24047 {
24048 rtx addr = PATTERN (insn);
24049
24050 if (GET_CODE (addr) == PARALLEL)
24051 addr = XVECEXP (addr, 0, 0);
24052
24053 gcc_assert (GET_CODE (addr) == SET);
24054
24055 addr = SET_SRC (addr);
24056 if (modified_in_p (addr, dep_insn))
24057 cost += 1;
24058 }
24059 else if (ix86_agi_dependent (dep_insn, insn))
24060 cost += 1;
24061
24062 /* ??? Compares pair with jump/setcc. */
24063 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24064 cost = 0;
24065
24066 /* Floating point stores require value to be ready one cycle earlier. */
24067 if (insn_type == TYPE_FMOV
24068 && get_attr_memory (insn) == MEMORY_STORE
24069 && !ix86_agi_dependent (dep_insn, insn))
24070 cost += 1;
24071 break;
24072
24073 case PROCESSOR_PENTIUMPRO:
24074 memory = get_attr_memory (insn);
24075
24076 /* INT->FP conversion is expensive. */
24077 if (get_attr_fp_int_src (dep_insn))
24078 cost += 5;
24079
24080 /* There is one cycle extra latency between an FP op and a store. */
24081 if (insn_type == TYPE_FMOV
24082 && (set = single_set (dep_insn)) != NULL_RTX
24083 && (set2 = single_set (insn)) != NULL_RTX
24084 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24085 && MEM_P (SET_DEST (set2)))
24086 cost += 1;
24087
24088 /* Show the ability of the reorder buffer to hide the latency of a load by
24089 executing it in parallel with the previous instruction, provided the
24090 previous instruction is not needed to compute the address. */
24091 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24092 && !ix86_agi_dependent (dep_insn, insn))
24093 {
24094 /* Claim moves to take one cycle, as the core can issue one load
24095 at a time and the next load can start a cycle later. */
24096 if (dep_insn_type == TYPE_IMOV
24097 || dep_insn_type == TYPE_FMOV)
24098 cost = 1;
24099 else if (cost > 1)
24100 cost--;
24101 }
24102 break;
24103
24104 case PROCESSOR_K6:
24105 memory = get_attr_memory (insn);
24106
24107 /* The esp dependency is resolved before the instruction is really
24108 finished. */
24109 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24110 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24111 return 1;
24112
24113 /* INT->FP conversion is expensive. */
24114 if (get_attr_fp_int_src (dep_insn))
24115 cost += 5;
24116
24117 /* Show the ability of the reorder buffer to hide the latency of a load by
24118 executing it in parallel with the previous instruction, provided the
24119 previous instruction is not needed to compute the address. */
24120 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24121 && !ix86_agi_dependent (dep_insn, insn))
24122 {
24123 /* Claim moves to take one cycle, as the core can issue one load
24124 at a time and the next load can start a cycle later. */
24125 if (dep_insn_type == TYPE_IMOV
24126 || dep_insn_type == TYPE_FMOV)
24127 cost = 1;
24128 else if (cost > 2)
24129 cost -= 2;
24130 else
24131 cost = 1;
24132 }
24133 break;
24134
24135 case PROCESSOR_ATHLON:
24136 case PROCESSOR_K8:
24137 case PROCESSOR_AMDFAM10:
24138 case PROCESSOR_BDVER1:
24139 case PROCESSOR_BDVER2:
24140 case PROCESSOR_BDVER3:
24141 case PROCESSOR_BTVER1:
24142 case PROCESSOR_BTVER2:
24143 case PROCESSOR_ATOM:
24144 case PROCESSOR_GENERIC32:
24145 case PROCESSOR_GENERIC64:
24146 memory = get_attr_memory (insn);
24147
24148 /* Show the ability of the reorder buffer to hide the latency of a load by
24149 executing it in parallel with the previous instruction, provided the
24150 previous instruction is not needed to compute the address. */
24151 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24152 && !ix86_agi_dependent (dep_insn, insn))
24153 {
24154 enum attr_unit unit = get_attr_unit (insn);
24155 int loadcost = 3;
24156
24157 /* Because of the difference between the length of integer and
24158 floating unit pipeline preparation stages, the memory operands
24159 for floating point are cheaper.
24160
24161 ??? For Athlon the difference is most probably 2. */
24162 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24163 loadcost = 3;
24164 else
24165 loadcost = TARGET_ATHLON ? 2 : 0;
24166
24167 if (cost >= loadcost)
24168 cost -= loadcost;
24169 else
24170 cost = 0;
24171 }
24172
24173 default:
24174 break;
24175 }
24176
24177 return cost;
24178 }
24179
24180 /* How many alternative schedules to try. This should be as wide as the
24181 scheduling freedom in the DFA, but no wider. Making this value too
24182 large results in extra work for the scheduler. */
24183
24184 static int
24185 ia32_multipass_dfa_lookahead (void)
24186 {
24187 switch (ix86_tune)
24188 {
24189 case PROCESSOR_PENTIUM:
24190 return 2;
24191
24192 case PROCESSOR_PENTIUMPRO:
24193 case PROCESSOR_K6:
24194 return 1;
24195
24196 case PROCESSOR_CORE2_32:
24197 case PROCESSOR_CORE2_64:
24198 case PROCESSOR_COREI7_32:
24199 case PROCESSOR_COREI7_64:
24200 case PROCESSOR_ATOM:
24201 /* Generally, we want haifa-sched:max_issue() to look ahead as far as
24202 the number of instructions that can be executed in one cycle, i.e.,
24203 issue_rate.  I wonder why tuning for many CPUs does not do this. */
24204 if (reload_completed)
24205 return ix86_issue_rate ();
24206 /* Don't use lookahead for pre-reload schedule to save compile time. */
24207 return 0;
24208
24209 default:
24210 return 0;
24211 }
24212 }
24213
24214 /* Try to reorder the ready list to take advantage of Atom pipelined IMUL
24215 execution. It is applied if
24216 (1) an IMUL instruction is at the top of the ready list;
24217 (2) the ready list contains exactly one producer of an independent
24218 IMUL instruction;
24219 in which case that producer is moved to the top of the ready list.
24220 Returns the issue rate. */
24221
24222 static int
24223 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24224 int clock_var ATTRIBUTE_UNUSED)
24225 {
24226 static int issue_rate = -1;
24227 int n_ready = *pn_ready;
24228 rtx insn, insn1, insn2;
24229 int i;
24230 sd_iterator_def sd_it;
24231 dep_t dep;
24232 int index = -1;
24233
24234 /* Set up issue rate. */
24235 issue_rate = ix86_issue_rate();
24236
24237 /* Do reordering for Atom only. */
24238 if (ix86_tune != PROCESSOR_ATOM)
24239 return issue_rate;
24240 /* Do not perform ready list reordering for the pre-reload scheduling pass. */
24241 if (!reload_completed)
24242 return issue_rate;
24243 /* Nothing to do if ready list contains only 1 instruction. */
24244 if (n_ready <= 1)
24245 return issue_rate;
24246
24247 /* Check that IMUL instruction is on the top of ready list. */
24248 insn = ready[n_ready - 1];
24249 if (!NONDEBUG_INSN_P (insn))
24250 return issue_rate;
24251 insn = PATTERN (insn);
24252 if (GET_CODE (insn) == PARALLEL)
24253 insn = XVECEXP (insn, 0, 0);
24254 if (GET_CODE (insn) != SET)
24255 return issue_rate;
24256 if (!(GET_CODE (SET_SRC (insn)) == MULT
24257 && GET_MODE (SET_SRC (insn)) == SImode))
24258 return issue_rate;
24259
24260 /* Search for producer of independent IMUL instruction. */
24261 for (i = n_ready - 2; i >= 0; i--)
24262 {
24263 insn = ready[i];
24264 if (!NONDEBUG_INSN_P (insn))
24265 continue;
24266 /* Skip IMUL instruction. */
24267 insn2 = PATTERN (insn);
24268 if (GET_CODE (insn2) == PARALLEL)
24269 insn2 = XVECEXP (insn2, 0, 0);
24270 if (GET_CODE (insn2) == SET
24271 && GET_CODE (SET_SRC (insn2)) == MULT
24272 && GET_MODE (SET_SRC (insn2)) == SImode)
24273 continue;
24274
24275 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24276 {
24277 rtx con;
24278 con = DEP_CON (dep);
24279 if (!NONDEBUG_INSN_P (con))
24280 continue;
24281 insn1 = PATTERN (con);
24282 if (GET_CODE (insn1) == PARALLEL)
24283 insn1 = XVECEXP (insn1, 0, 0);
24284
24285 if (GET_CODE (insn1) == SET
24286 && GET_CODE (SET_SRC (insn1)) == MULT
24287 && GET_MODE (SET_SRC (insn1)) == SImode)
24288 {
24289 sd_iterator_def sd_it1;
24290 dep_t dep1;
24291 /* Check that INSN is the only producer of this IMUL. */
24292 index = i;
24293 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24294 {
24295 rtx pro;
24296 pro = DEP_PRO (dep1);
24297 if (!NONDEBUG_INSN_P (pro))
24298 continue;
24299 if (pro != insn)
24300 index = -1;
24301 }
24302 if (index >= 0)
24303 break;
24304 }
24305 }
24306 if (index >= 0)
24307 break;
24308 }
24309 if (index < 0)
24310 return issue_rate; /* Didn't find IMUL producer. */
24311
24312 if (sched_verbose > 1)
24313 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24314 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24315
24316 /* Put IMUL producer (ready[index]) at the top of ready list. */
24317 insn1 = ready[index];
24318 for (i = index; i < n_ready - 1; i++)
24319 ready[i] = ready[i + 1];
24320 ready[n_ready - 1] = insn1;
24321
24322 return issue_rate;
24323 }
24324
24325 static bool
24326 ix86_class_likely_spilled_p (reg_class_t);
24327
24328 /* Return true if the lhs of INSN is a hard function-argument register; set
24329 *IS_SPILLED to true if that register is likely to be spilled. */
24330 static bool
24331 insn_is_function_arg (rtx insn, bool* is_spilled)
24332 {
24333 rtx dst;
24334
24335 if (!NONDEBUG_INSN_P (insn))
24336 return false;
24337 /* Call instructions are not movable; ignore them. */
24338 if (CALL_P (insn))
24339 return false;
24340 insn = PATTERN (insn);
24341 if (GET_CODE (insn) == PARALLEL)
24342 insn = XVECEXP (insn, 0, 0);
24343 if (GET_CODE (insn) != SET)
24344 return false;
24345 dst = SET_DEST (insn);
24346 if (REG_P (dst) && HARD_REGISTER_P (dst)
24347 && ix86_function_arg_regno_p (REGNO (dst)))
24348 {
24349 /* Is it likely spilled HW register? */
24350 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24351 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24352 *is_spilled = true;
24353 return true;
24354 }
24355 return false;
24356 }
24357
24358 /* Add output dependencies for a chain of adjacent function-argument moves,
24359 but only if the chain contains a move to a likely-spilled HW register.
24360 Return the first argument if at least one dependence was added, NULL otherwise. */
24361 static rtx
24362 add_parameter_dependencies (rtx call, rtx head)
24363 {
24364 rtx insn;
24365 rtx last = call;
24366 rtx first_arg = NULL;
24367 bool is_spilled = false;
24368
24369 head = PREV_INSN (head);
24370
24371 /* Find nearest to call argument passing instruction. */
24372 while (true)
24373 {
24374 last = PREV_INSN (last);
24375 if (last == head)
24376 return NULL;
24377 if (!NONDEBUG_INSN_P (last))
24378 continue;
24379 if (insn_is_function_arg (last, &is_spilled))
24380 break;
24381 return NULL;
24382 }
24383
24384 first_arg = last;
24385 while (true)
24386 {
24387 insn = PREV_INSN (last);
24388 if (!INSN_P (insn))
24389 break;
24390 if (insn == head)
24391 break;
24392 if (!NONDEBUG_INSN_P (insn))
24393 {
24394 last = insn;
24395 continue;
24396 }
24397 if (insn_is_function_arg (insn, &is_spilled))
24398 {
24399 /* Add an output dependence between two function arguments if the chain
24400 of output arguments contains likely spilled HW registers. */
24401 if (is_spilled)
24402 add_dependence (last, insn, REG_DEP_OUTPUT);
24403 first_arg = last = insn;
24404 }
24405 else
24406 break;
24407 }
24408 if (!is_spilled)
24409 return NULL;
24410 return first_arg;
24411 }
24412
24413 /* Add output or anti dependency from insn to first_arg to restrict its code
24414 motion. */
24415 static void
24416 avoid_func_arg_motion (rtx first_arg, rtx insn)
24417 {
24418 rtx set;
24419 rtx tmp;
24420
24421 set = single_set (insn);
24422 if (!set)
24423 return;
24424 tmp = SET_DEST (set);
24425 if (REG_P (tmp))
24426 {
24427 /* Add output dependency to the first function argument. */
24428 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24429 return;
24430 }
24431 /* Add anti dependency. */
24432 add_dependence (first_arg, insn, REG_DEP_ANTI);
24433 }
24434
24435 /* Avoid cross-block motion of a function argument by adding a dependency
24436 from the first non-jump instruction in BB. */
24437 static void
24438 add_dependee_for_func_arg (rtx arg, basic_block bb)
24439 {
24440 rtx insn = BB_END (bb);
24441
24442 while (insn)
24443 {
24444 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24445 {
24446 rtx set = single_set (insn);
24447 if (set)
24448 {
24449 avoid_func_arg_motion (arg, insn);
24450 return;
24451 }
24452 }
24453 if (insn == BB_HEAD (bb))
24454 return;
24455 insn = PREV_INSN (insn);
24456 }
24457 }
24458
24459 /* Hook for pre-reload schedule - avoid motion of function arguments
24460 passed in likely spilled HW registers. */
24461 static void
24462 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24463 {
24464 rtx insn;
24465 rtx first_arg = NULL;
24466 if (reload_completed)
24467 return;
24468 while (head != tail && DEBUG_INSN_P (head))
24469 head = NEXT_INSN (head);
24470 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24471 if (INSN_P (insn) && CALL_P (insn))
24472 {
24473 first_arg = add_parameter_dependencies (insn, head);
24474 if (first_arg)
24475 {
24476 /* Add dependencies on the first argument in predecessor blocks, but
24477 only if the region contains more than one block. */
24478 basic_block bb = BLOCK_FOR_INSN (insn);
24479 int rgn = CONTAINING_RGN (bb->index);
24480 int nr_blks = RGN_NR_BLOCKS (rgn);
24481 /* Skip trivial regions and region head blocks that can have
24482 predecessors outside of region. */
24483 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24484 {
24485 edge e;
24486 edge_iterator ei;
24487 /* Assume that region is SCC, i.e. all immediate predecessors
24488 of non-head block are in the same region. */
24489 FOR_EACH_EDGE (e, ei, bb->preds)
24490 {
24491 /* Avoid creating loop-carried dependencies by using
24492 the topological ordering of the region. */
24493 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24494 add_dependee_for_func_arg (first_arg, e->src);
24495 }
24496 }
24497 insn = first_arg;
24498 if (insn == head)
24499 break;
24500 }
24501 }
24502 else if (first_arg)
24503 avoid_func_arg_motion (first_arg, insn);
24504 }
24505
24506 /* Hook for pre-reload scheduling - set the priority of moves from likely-spilled
24507 HW registers to the maximum, to schedule them as soon as possible. These are
24508 moves from function argument registers at the top of the function entry
24509 and moves from function return value registers after a call. */
24510 static int
24511 ix86_adjust_priority (rtx insn, int priority)
24512 {
24513 rtx set;
24514
24515 if (reload_completed)
24516 return priority;
24517
24518 if (!NONDEBUG_INSN_P (insn))
24519 return priority;
24520
24521 set = single_set (insn);
24522 if (set)
24523 {
24524 rtx tmp = SET_SRC (set);
24525 if (REG_P (tmp)
24526 && HARD_REGISTER_P (tmp)
24527 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24528 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24529 return current_sched_info->sched_max_insns_priority;
24530 }
24531
24532 return priority;
24533 }
24534
24535 /* Model decoder of Core 2/i7.
24536 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24537 track the instruction fetch block boundaries and make sure that long
24538 (9+ bytes) instructions are assigned to D0. */
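/* (D0 is the complex decoder; the secondary decoders handle only short,
   simple instructions, which is why long insns must be steered to D0.)  */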
24539
24540 /* Maximum length of an insn that can be handled by
24541 a secondary decoder unit. '8' for Core 2/i7. */
24542 static int core2i7_secondary_decoder_max_insn_size;
24543
24544 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24545 '16' for Core 2/i7. */
24546 static int core2i7_ifetch_block_size;
24547
24548 /* Maximum number of instructions decoder can handle per cycle.
24549 '6' for Core 2/i7. */
24550 static int core2i7_ifetch_block_max_insns;
24551
24552 typedef struct ix86_first_cycle_multipass_data_ *
24553 ix86_first_cycle_multipass_data_t;
24554 typedef const struct ix86_first_cycle_multipass_data_ *
24555 const_ix86_first_cycle_multipass_data_t;
24556
24557 /* A variable to store target state across calls to max_issue within
24558 one cycle. */
24559 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24560 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24561
24562 /* Initialize DATA. */
24563 static void
24564 core2i7_first_cycle_multipass_init (void *_data)
24565 {
24566 ix86_first_cycle_multipass_data_t data
24567 = (ix86_first_cycle_multipass_data_t) _data;
24568
24569 data->ifetch_block_len = 0;
24570 data->ifetch_block_n_insns = 0;
24571 data->ready_try_change = NULL;
24572 data->ready_try_change_size = 0;
24573 }
24574
24575 /* Advancing the cycle; reset ifetch block counts. */
24576 static void
24577 core2i7_dfa_post_advance_cycle (void)
24578 {
24579 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24580
24581 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24582
24583 data->ifetch_block_len = 0;
24584 data->ifetch_block_n_insns = 0;
24585 }
24586
24587 static int min_insn_size (rtx);
24588
24589 /* Filter out insns from ready_try that the core will not be able to issue
24590 on current cycle due to decoder. */
24591 static void
24592 core2i7_first_cycle_multipass_filter_ready_try
24593 (const_ix86_first_cycle_multipass_data_t data,
24594 char *ready_try, int n_ready, bool first_cycle_insn_p)
24595 {
24596 while (n_ready--)
24597 {
24598 rtx insn;
24599 int insn_size;
24600
24601 if (ready_try[n_ready])
24602 continue;
24603
24604 insn = get_ready_element (n_ready);
24605 insn_size = min_insn_size (insn);
24606
24607 if (/* If this is too long an insn for a secondary decoder ... */
24608 (!first_cycle_insn_p
24609 && insn_size > core2i7_secondary_decoder_max_insn_size)
24610 /* ... or it would not fit into the ifetch block ... */
24611 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24612 /* ... or the decoder is full already ... */
24613 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24614 /* ... mask the insn out. */
24615 {
24616 ready_try[n_ready] = 1;
24617
24618 if (data->ready_try_change)
24619 bitmap_set_bit (data->ready_try_change, n_ready);
24620 }
24621 }
24622 }
24623
24624 /* Prepare for a new round of multipass lookahead scheduling. */
24625 static void
24626 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24627 bool first_cycle_insn_p)
24628 {
24629 ix86_first_cycle_multipass_data_t data
24630 = (ix86_first_cycle_multipass_data_t) _data;
24631 const_ix86_first_cycle_multipass_data_t prev_data
24632 = ix86_first_cycle_multipass_data;
24633
24634 /* Restore the state from the end of the previous round. */
24635 data->ifetch_block_len = prev_data->ifetch_block_len;
24636 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24637
24638 /* Filter instructions that cannot be issued on current cycle due to
24639 decoder restrictions. */
24640 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24641 first_cycle_insn_p);
24642 }
24643
24644 /* INSN is being issued in current solution. Account for its impact on
24645 the decoder model. */
24646 static void
24647 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24648 rtx insn, const void *_prev_data)
24649 {
24650 ix86_first_cycle_multipass_data_t data
24651 = (ix86_first_cycle_multipass_data_t) _data;
24652 const_ix86_first_cycle_multipass_data_t prev_data
24653 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24654
24655 int insn_size = min_insn_size (insn);
24656
24657 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24658 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24659 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24660 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24661
24662 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24663 if (!data->ready_try_change)
24664 {
24665 data->ready_try_change = sbitmap_alloc (n_ready);
24666 data->ready_try_change_size = n_ready;
24667 }
24668 else if (data->ready_try_change_size < n_ready)
24669 {
24670 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24671 n_ready, 0);
24672 data->ready_try_change_size = n_ready;
24673 }
24674 bitmap_clear (data->ready_try_change);
24675
24676 /* Filter out insns from ready_try that the core will not be able to issue
24677 on current cycle due to decoder. */
24678 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24679 false);
24680 }
24681
24682 /* Revert the effect on ready_try. */
24683 static void
24684 core2i7_first_cycle_multipass_backtrack (const void *_data,
24685 char *ready_try,
24686 int n_ready ATTRIBUTE_UNUSED)
24687 {
24688 const_ix86_first_cycle_multipass_data_t data
24689 = (const_ix86_first_cycle_multipass_data_t) _data;
24690 unsigned int i = 0;
24691 sbitmap_iterator sbi;
24692
24693 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24694 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24695 {
24696 ready_try[i] = 0;
24697 }
24698 }
24699
24700 /* Save the result of multipass lookahead scheduling for the next round. */
24701 static void
24702 core2i7_first_cycle_multipass_end (const void *_data)
24703 {
24704 const_ix86_first_cycle_multipass_data_t data
24705 = (const_ix86_first_cycle_multipass_data_t) _data;
24706 ix86_first_cycle_multipass_data_t next_data
24707 = ix86_first_cycle_multipass_data;
24708
24709 if (data != NULL)
24710 {
24711 next_data->ifetch_block_len = data->ifetch_block_len;
24712 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24713 }
24714 }
24715
24716 /* Deallocate target data. */
24717 static void
24718 core2i7_first_cycle_multipass_fini (void *_data)
24719 {
24720 ix86_first_cycle_multipass_data_t data
24721 = (ix86_first_cycle_multipass_data_t) _data;
24722
24723 if (data->ready_try_change)
24724 {
24725 sbitmap_free (data->ready_try_change);
24726 data->ready_try_change = NULL;
24727 data->ready_try_change_size = 0;
24728 }
24729 }
24730
24731 /* Prepare for scheduling pass. */
24732 static void
24733 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24734 int verbose ATTRIBUTE_UNUSED,
24735 int max_uid ATTRIBUTE_UNUSED)
24736 {
24737 /* Install scheduling hooks for current CPU. Some of these hooks are used
24738 in time-critical parts of the scheduler, so we only set them up when
24739 they are actually used. */
24740 switch (ix86_tune)
24741 {
24742 case PROCESSOR_CORE2_32:
24743 case PROCESSOR_CORE2_64:
24744 case PROCESSOR_COREI7_32:
24745 case PROCESSOR_COREI7_64:
24746 /* Do not perform multipass scheduling for pre-reload schedule
24747 to save compile time. */
24748 if (reload_completed)
24749 {
24750 targetm.sched.dfa_post_advance_cycle
24751 = core2i7_dfa_post_advance_cycle;
24752 targetm.sched.first_cycle_multipass_init
24753 = core2i7_first_cycle_multipass_init;
24754 targetm.sched.first_cycle_multipass_begin
24755 = core2i7_first_cycle_multipass_begin;
24756 targetm.sched.first_cycle_multipass_issue
24757 = core2i7_first_cycle_multipass_issue;
24758 targetm.sched.first_cycle_multipass_backtrack
24759 = core2i7_first_cycle_multipass_backtrack;
24760 targetm.sched.first_cycle_multipass_end
24761 = core2i7_first_cycle_multipass_end;
24762 targetm.sched.first_cycle_multipass_fini
24763 = core2i7_first_cycle_multipass_fini;
24764
24765 /* Set decoder parameters. */
24766 core2i7_secondary_decoder_max_insn_size = 8;
24767 core2i7_ifetch_block_size = 16;
24768 core2i7_ifetch_block_max_insns = 6;
24769 break;
24770 }
24771 /* ... Fall through ... */
24772 default:
24773 targetm.sched.dfa_post_advance_cycle = NULL;
24774 targetm.sched.first_cycle_multipass_init = NULL;
24775 targetm.sched.first_cycle_multipass_begin = NULL;
24776 targetm.sched.first_cycle_multipass_issue = NULL;
24777 targetm.sched.first_cycle_multipass_backtrack = NULL;
24778 targetm.sched.first_cycle_multipass_end = NULL;
24779 targetm.sched.first_cycle_multipass_fini = NULL;
24780 break;
24781 }
24782 }
24783
24784 \f
24785 /* Compute the alignment given to a constant that is being placed in memory.
24786 EXP is the constant and ALIGN is the alignment that the object would
24787 ordinarily have.
24788 The value of this function is used instead of that alignment to align
24789 the object. */
24790
24791 int
24792 ix86_constant_alignment (tree exp, int align)
24793 {
24794 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24795 || TREE_CODE (exp) == INTEGER_CST)
24796 {
24797 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24798 return 64;
24799 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24800 return 128;
24801 }
24802 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24803 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24804 return BITS_PER_WORD;
24805
24806 return align;
24807 }
24808
24809 /* Compute the alignment for a static variable.
24810 TYPE is the data type, and ALIGN is the alignment that
24811 the object would ordinarily have. The value of this function is used
24812 instead of that alignment to align the object. */
24813
24814 int
24815 ix86_data_alignment (tree type, int align)
24816 {
24817 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24818
24819 if (AGGREGATE_TYPE_P (type)
24820 && TYPE_SIZE (type)
24821 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24822 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24823 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24824 && align < max_align)
24825 align = max_align;
24826
24827 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24828 to a 16-byte boundary. */
24829 if (TARGET_64BIT)
24830 {
24831 if (AGGREGATE_TYPE_P (type)
24832 && TYPE_SIZE (type)
24833 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24834 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24835 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24836 return 128;
24837 }
24838
24839 if (TREE_CODE (type) == ARRAY_TYPE)
24840 {
24841 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24842 return 64;
24843 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24844 return 128;
24845 }
24846 else if (TREE_CODE (type) == COMPLEX_TYPE)
24847 {
24848
24849 if (TYPE_MODE (type) == DCmode && align < 64)
24850 return 64;
24851 if ((TYPE_MODE (type) == XCmode
24852 || TYPE_MODE (type) == TCmode) && align < 128)
24853 return 128;
24854 }
24855 else if ((TREE_CODE (type) == RECORD_TYPE
24856 || TREE_CODE (type) == UNION_TYPE
24857 || TREE_CODE (type) == QUAL_UNION_TYPE)
24858 && TYPE_FIELDS (type))
24859 {
24860 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24861 return 64;
24862 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24863 return 128;
24864 }
24865 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24866 || TREE_CODE (type) == INTEGER_TYPE)
24867 {
24868 if (TYPE_MODE (type) == DFmode && align < 64)
24869 return 64;
24870 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24871 return 128;
24872 }
24873
24874 return align;
24875 }
24876
24877 /* Compute the alignment for a local variable or a stack slot. EXP is
24878 the data type or decl itself, MODE is the widest mode available and
24879 ALIGN is the alignment that the object would ordinarily have. The
24880 value of this macro is used instead of that alignment to align the
24881 object. */
24882
24883 unsigned int
24884 ix86_local_alignment (tree exp, enum machine_mode mode,
24885 unsigned int align)
24886 {
24887 tree type, decl;
24888
24889 if (exp && DECL_P (exp))
24890 {
24891 type = TREE_TYPE (exp);
24892 decl = exp;
24893 }
24894 else
24895 {
24896 type = exp;
24897 decl = NULL;
24898 }
24899
24900 /* Don't do dynamic stack realignment for long long objects with
24901 -mpreferred-stack-boundary=2. */
24902 if (!TARGET_64BIT
24903 && align == 64
24904 && ix86_preferred_stack_boundary < 64
24905 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
24906 && (!type || !TYPE_USER_ALIGN (type))
24907 && (!decl || !DECL_USER_ALIGN (decl)))
24908 align = 32;
24909
24910 /* If TYPE is NULL, we are allocating a stack slot for caller-save
24911 register in MODE. We will return the largest alignment of XF
24912 and DF. */
24913 if (!type)
24914 {
24915 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
24916 align = GET_MODE_ALIGNMENT (DFmode);
24917 return align;
24918 }
24919
24920 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
24921 to a 16-byte boundary. The exact wording is:
24922
24923 An array uses the same alignment as its elements, except that a local or
24924 global array variable of length at least 16 bytes or
24925 a C99 variable-length array variable always has alignment of at least 16 bytes.
24926
24927 This was added to allow use of aligned SSE instructions on arrays. The
24928 rule is meant for static storage (where the compiler cannot do the
24929 analysis by itself). We follow it for automatic variables only when
24930 convenient: we fully control everything in the function being compiled,
24931 and functions from other units cannot rely on the alignment.
24932
24933 Exclude the va_list type. It is the common case of a local array where
24934 we cannot benefit from the alignment. */
24935 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
24936 && TARGET_SSE)
24937 {
24938 if (AGGREGATE_TYPE_P (type)
24939 && (va_list_type_node == NULL_TREE
24940 || (TYPE_MAIN_VARIANT (type)
24941 != TYPE_MAIN_VARIANT (va_list_type_node)))
24942 && TYPE_SIZE (type)
24943 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24944 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
24945 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24946 return 128;
24947 }
24948 if (TREE_CODE (type) == ARRAY_TYPE)
24949 {
24950 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
24951 return 64;
24952 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
24953 return 128;
24954 }
24955 else if (TREE_CODE (type) == COMPLEX_TYPE)
24956 {
24957 if (TYPE_MODE (type) == DCmode && align < 64)
24958 return 64;
24959 if ((TYPE_MODE (type) == XCmode
24960 || TYPE_MODE (type) == TCmode) && align < 128)
24961 return 128;
24962 }
24963 else if ((TREE_CODE (type) == RECORD_TYPE
24964 || TREE_CODE (type) == UNION_TYPE
24965 || TREE_CODE (type) == QUAL_UNION_TYPE)
24966 && TYPE_FIELDS (type))
24967 {
24968 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
24969 return 64;
24970 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
24971 return 128;
24972 }
24973 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
24974 || TREE_CODE (type) == INTEGER_TYPE)
24975 {
24976
24977 if (TYPE_MODE (type) == DFmode && align < 64)
24978 return 64;
24979 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
24980 return 128;
24981 }
24982 return align;
24983 }
24984
24985 /* Compute the minimum required alignment for dynamic stack realignment
24986 purposes for a local variable, parameter or a stack slot. EXP is
24987 the data type or decl itself, MODE is its mode and ALIGN is the
24988 alignment that the object would ordinarily have. */
24989
24990 unsigned int
24991 ix86_minimum_alignment (tree exp, enum machine_mode mode,
24992 unsigned int align)
24993 {
24994 tree type, decl;
24995
24996 if (exp && DECL_P (exp))
24997 {
24998 type = TREE_TYPE (exp);
24999 decl = exp;
25000 }
25001 else
25002 {
25003 type = exp;
25004 decl = NULL;
25005 }
25006
25007 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25008 return align;
25009
25010 /* Don't do dynamic stack realignment for long long objects with
25011 -mpreferred-stack-boundary=2. */
25012 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25013 && (!type || !TYPE_USER_ALIGN (type))
25014 && (!decl || !DECL_USER_ALIGN (decl)))
25015 return 32;
25016
25017 return align;
25018 }
25019 \f
25020 /* Find a location for the static chain incoming to a nested function.
25021 This is a register, unless all free registers are used by arguments. */
25022
25023 static rtx
25024 ix86_static_chain (const_tree fndecl, bool incoming_p)
25025 {
25026 unsigned regno;
25027
25028 if (!DECL_STATIC_CHAIN (fndecl))
25029 return NULL;
25030
25031 if (TARGET_64BIT)
25032 {
25033 /* We always use R10 in 64-bit mode. */
25034 regno = R10_REG;
25035 }
25036 else
25037 {
25038 tree fntype;
25039 unsigned int ccvt;
25040
25041 /* By default in 32-bit mode we use ECX to pass the static chain. */
25042 regno = CX_REG;
25043
25044 fntype = TREE_TYPE (fndecl);
25045 ccvt = ix86_get_callcvt (fntype);
25046 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
25047 {
25048 /* Fastcall functions use ecx/edx for arguments, which leaves
25049 us with EAX for the static chain.
25050 Thiscall functions use ecx for arguments, which also
25051 leaves us with EAX for the static chain. */
25052 regno = AX_REG;
25053 }
25054 else if (ix86_function_regparm (fntype, fndecl) == 3)
25055 {
25056 /* For regparm 3, we have no free call-clobbered registers in
25057 which to store the static chain. In order to implement this,
25058 we have the trampoline push the static chain to the stack.
25059 However, we can't push a value below the return address when
25060 we call the nested function directly, so we have to use an
25061 alternate entry point. For this we use ESI, and have the
25062 alternate entry point push ESI, so that things appear the
25063 same once we're executing the nested function. */
25064 if (incoming_p)
25065 {
25066 if (fndecl == current_function_decl)
25067 ix86_static_chain_on_stack = true;
25068 return gen_frame_mem (SImode,
25069 plus_constant (Pmode,
25070 arg_pointer_rtx, -8));
25071 }
25072 regno = SI_REG;
25073 }
25074 }
25075
25076 return gen_rtx_REG (Pmode, regno);
25077 }
25078
25079 /* Emit RTL insns to initialize the variable parts of a trampoline.
25080 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25081 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25082 to be passed to the target function. */
25083
25084 static void
25085 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25086 {
25087 rtx mem, fnaddr;
25088 int opcode;
25089 int offset = 0;
25090
25091 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25092
25093 if (TARGET_64BIT)
25094 {
25095 int size;
25096
25097 /* Load the function address to r11. Try to load address using
25098 the shorter movl instead of movabs. We may want to support
25099 movq for kernel mode, but kernel does not use trampolines at
25100 the moment. FNADDR is a 32bit address and may not be in
25101 DImode when ptr_mode == SImode. Always use movl in this
25102 case. */
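/* Stored little-endian, 0xbb41 is the byte sequence 41 bb, i.e.
   movl $imm32, %r11d; 0xbb49 is 49 bb, i.e. movabs $imm64, %r11.  */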
25103 if (ptr_mode == SImode
25104 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25105 {
25106 fnaddr = copy_addr_to_reg (fnaddr);
25107
25108 mem = adjust_address (m_tramp, HImode, offset);
25109 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25110
25111 mem = adjust_address (m_tramp, SImode, offset + 2);
25112 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25113 offset += 6;
25114 }
25115 else
25116 {
25117 mem = adjust_address (m_tramp, HImode, offset);
25118 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25119
25120 mem = adjust_address (m_tramp, DImode, offset + 2);
25121 emit_move_insn (mem, fnaddr);
25122 offset += 10;
25123 }
25124
25125 /* Load static chain using movabs to r10. Use the shorter movl
25126 instead of movabs when ptr_mode == SImode. */
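/* Little-endian, 0xba41 is 41 ba, i.e. movl $imm32, %r10d;
   0xba49 is 49 ba, i.e. movabs $imm64, %r10.  */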
25127 if (ptr_mode == SImode)
25128 {
25129 opcode = 0xba41;
25130 size = 6;
25131 }
25132 else
25133 {
25134 opcode = 0xba49;
25135 size = 10;
25136 }
25137
25138 mem = adjust_address (m_tramp, HImode, offset);
25139 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25140
25141 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25142 emit_move_insn (mem, chain_value);
25143 offset += size;
25144
25145 /* Jump to r11; the last (unused) byte is a nop, only there to
25146 pad the write out to a single 32-bit store. */
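/* Little-endian, 0x90e3ff49 is the byte sequence 49 ff e3 90, i.e.
   rex.W jmp *%r11 followed by a nop.  */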
25147 mem = adjust_address (m_tramp, SImode, offset);
25148 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25149 offset += 4;
25150 }
25151 else
25152 {
25153 rtx disp, chain;
25154
25155 /* Depending on the static chain location, either load a register
25156 with a constant, or push the constant to the stack. All of the
25157 instructions are the same size. */
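/* Opcode 0xb8/0xb9 is movl $imm32 into %eax/%ecx, 0x68 is pushl $imm32,
   and the 0xe9 emitted below is jmp rel32.  */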
25158 chain = ix86_static_chain (fndecl, true);
25159 if (REG_P (chain))
25160 {
25161 switch (REGNO (chain))
25162 {
25163 case AX_REG:
25164 opcode = 0xb8; break;
25165 case CX_REG:
25166 opcode = 0xb9; break;
25167 default:
25168 gcc_unreachable ();
25169 }
25170 }
25171 else
25172 opcode = 0x68;
25173
25174 mem = adjust_address (m_tramp, QImode, offset);
25175 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25176
25177 mem = adjust_address (m_tramp, SImode, offset + 1);
25178 emit_move_insn (mem, chain_value);
25179 offset += 5;
25180
25181 mem = adjust_address (m_tramp, QImode, offset);
25182 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25183
25184 mem = adjust_address (m_tramp, SImode, offset + 1);
25185
25186 /* Compute offset from the end of the jmp to the target function.
25187 In the case in which the trampoline stores the static chain on
25188 the stack, we need to skip the first insn which pushes the
25189 (call-saved) register static chain; this push is 1 byte. */
25190 offset += 5;
25191 disp = expand_binop (SImode, sub_optab, fnaddr,
25192 plus_constant (Pmode, XEXP (m_tramp, 0),
25193 offset - (MEM_P (chain) ? 1 : 0)),
25194 NULL_RTX, 1, OPTAB_DIRECT);
25195 emit_move_insn (mem, disp);
25196 }
25197
25198 gcc_assert (offset <= TRAMPOLINE_SIZE);
25199
25200 #ifdef HAVE_ENABLE_EXECUTE_STACK
25201 #ifdef CHECK_EXECUTE_STACK_ENABLED
25202 if (CHECK_EXECUTE_STACK_ENABLED)
25203 #endif
25204 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25205 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25206 #endif
25207 }
25208 \f
25209 /* The following file contains several enumerations and data structures
25210 built from the definitions in i386-builtin-types.def. */
25211
25212 #include "i386-builtin-types.inc"
25213
25214 /* Table for the ix86 builtin non-function types. */
25215 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25216
25217 /* Retrieve an element from the above table, building some of
25218 the types lazily. */
25219
25220 static tree
25221 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25222 {
25223 unsigned int index;
25224 tree type, itype;
25225
25226 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25227
25228 type = ix86_builtin_type_tab[(int) tcode];
25229 if (type != NULL)
25230 return type;
25231
25232 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25233 if (tcode <= IX86_BT_LAST_VECT)
25234 {
25235 enum machine_mode mode;
25236
25237 index = tcode - IX86_BT_LAST_PRIM - 1;
25238 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25239 mode = ix86_builtin_type_vect_mode[index];
25240
25241 type = build_vector_type_for_mode (itype, mode);
25242 }
25243 else
25244 {
25245 int quals;
25246
25247 index = tcode - IX86_BT_LAST_VECT - 1;
25248 if (tcode <= IX86_BT_LAST_PTR)
25249 quals = TYPE_UNQUALIFIED;
25250 else
25251 quals = TYPE_QUAL_CONST;
25252
25253 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25254 if (quals != TYPE_UNQUALIFIED)
25255 itype = build_qualified_type (itype, quals);
25256
25257 type = build_pointer_type (itype);
25258 }
25259
25260 ix86_builtin_type_tab[(int) tcode] = type;
25261 return type;
25262 }
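/* Illustrative use of the lazy type table above (a sketch; IX86_BT_V4SF
   is assumed to be one of the vector codes generated into
   i386-builtin-types.inc).  The first such call builds and caches the
   V4SFmode vector type; later calls return the cached node:

     tree v4sf = ix86_get_builtin_type (IX86_BT_V4SF);  */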
25263
25264 /* Table for the ix86 builtin function types. */
25265 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25266
25267 /* Retrieve an element from the above table, building some of
25268 the types lazily. */
25269
25270 static tree
25271 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25272 {
25273 tree type;
25274
25275 gcc_assert ((unsigned) tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25276
25277 type = ix86_builtin_func_type_tab[(int) tcode];
25278 if (type != NULL)
25279 return type;
25280
25281 if (tcode <= IX86_BT_LAST_FUNC)
25282 {
25283 unsigned start = ix86_builtin_func_start[(int) tcode];
25284 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25285 tree rtype, atype, args = void_list_node;
25286 unsigned i;
25287
25288 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25289 for (i = after - 1; i > start; --i)
25290 {
25291 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25292 args = tree_cons (NULL, atype, args);
25293 }
25294
25295 type = build_function_type (rtype, args);
25296 }
25297 else
25298 {
25299 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25300 enum ix86_builtin_func_type icode;
25301
25302 icode = ix86_builtin_func_alias_base[index];
25303 type = ix86_get_builtin_func_type (icode);
25304 }
25305
25306 ix86_builtin_func_type_tab[(int) tcode] = type;
25307 return type;
25308 }
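/* Sketch of how a function-type code expands (the specific names are
   assumptions based on the generated i386-builtin-types.inc): for a
   code such as V4SF_FTYPE_V4SF_V4SF, ix86_builtin_func_args[] holds
   the return type first, followed by the argument types, and the loop
   above walks the arguments backwards, so the result is equivalent to

     tree v4sf = ix86_get_builtin_type (IX86_BT_V4SF);
     tree fn   = build_function_type_list (v4sf, v4sf, v4sf, NULL_TREE);  */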
25309
25310
25311 /* Codes for all the SSE/MMX builtins. */
25312 enum ix86_builtins
25313 {
25314 IX86_BUILTIN_ADDPS,
25315 IX86_BUILTIN_ADDSS,
25316 IX86_BUILTIN_DIVPS,
25317 IX86_BUILTIN_DIVSS,
25318 IX86_BUILTIN_MULPS,
25319 IX86_BUILTIN_MULSS,
25320 IX86_BUILTIN_SUBPS,
25321 IX86_BUILTIN_SUBSS,
25322
25323 IX86_BUILTIN_CMPEQPS,
25324 IX86_BUILTIN_CMPLTPS,
25325 IX86_BUILTIN_CMPLEPS,
25326 IX86_BUILTIN_CMPGTPS,
25327 IX86_BUILTIN_CMPGEPS,
25328 IX86_BUILTIN_CMPNEQPS,
25329 IX86_BUILTIN_CMPNLTPS,
25330 IX86_BUILTIN_CMPNLEPS,
25331 IX86_BUILTIN_CMPNGTPS,
25332 IX86_BUILTIN_CMPNGEPS,
25333 IX86_BUILTIN_CMPORDPS,
25334 IX86_BUILTIN_CMPUNORDPS,
25335 IX86_BUILTIN_CMPEQSS,
25336 IX86_BUILTIN_CMPLTSS,
25337 IX86_BUILTIN_CMPLESS,
25338 IX86_BUILTIN_CMPNEQSS,
25339 IX86_BUILTIN_CMPNLTSS,
25340 IX86_BUILTIN_CMPNLESS,
25341 IX86_BUILTIN_CMPNGTSS,
25342 IX86_BUILTIN_CMPNGESS,
25343 IX86_BUILTIN_CMPORDSS,
25344 IX86_BUILTIN_CMPUNORDSS,
25345
25346 IX86_BUILTIN_COMIEQSS,
25347 IX86_BUILTIN_COMILTSS,
25348 IX86_BUILTIN_COMILESS,
25349 IX86_BUILTIN_COMIGTSS,
25350 IX86_BUILTIN_COMIGESS,
25351 IX86_BUILTIN_COMINEQSS,
25352 IX86_BUILTIN_UCOMIEQSS,
25353 IX86_BUILTIN_UCOMILTSS,
25354 IX86_BUILTIN_UCOMILESS,
25355 IX86_BUILTIN_UCOMIGTSS,
25356 IX86_BUILTIN_UCOMIGESS,
25357 IX86_BUILTIN_UCOMINEQSS,
25358
25359 IX86_BUILTIN_CVTPI2PS,
25360 IX86_BUILTIN_CVTPS2PI,
25361 IX86_BUILTIN_CVTSI2SS,
25362 IX86_BUILTIN_CVTSI642SS,
25363 IX86_BUILTIN_CVTSS2SI,
25364 IX86_BUILTIN_CVTSS2SI64,
25365 IX86_BUILTIN_CVTTPS2PI,
25366 IX86_BUILTIN_CVTTSS2SI,
25367 IX86_BUILTIN_CVTTSS2SI64,
25368
25369 IX86_BUILTIN_MAXPS,
25370 IX86_BUILTIN_MAXSS,
25371 IX86_BUILTIN_MINPS,
25372 IX86_BUILTIN_MINSS,
25373
25374 IX86_BUILTIN_LOADUPS,
25375 IX86_BUILTIN_STOREUPS,
25376 IX86_BUILTIN_MOVSS,
25377
25378 IX86_BUILTIN_MOVHLPS,
25379 IX86_BUILTIN_MOVLHPS,
25380 IX86_BUILTIN_LOADHPS,
25381 IX86_BUILTIN_LOADLPS,
25382 IX86_BUILTIN_STOREHPS,
25383 IX86_BUILTIN_STORELPS,
25384
25385 IX86_BUILTIN_MASKMOVQ,
25386 IX86_BUILTIN_MOVMSKPS,
25387 IX86_BUILTIN_PMOVMSKB,
25388
25389 IX86_BUILTIN_MOVNTPS,
25390 IX86_BUILTIN_MOVNTQ,
25391
25392 IX86_BUILTIN_LOADDQU,
25393 IX86_BUILTIN_STOREDQU,
25394
25395 IX86_BUILTIN_PACKSSWB,
25396 IX86_BUILTIN_PACKSSDW,
25397 IX86_BUILTIN_PACKUSWB,
25398
25399 IX86_BUILTIN_PADDB,
25400 IX86_BUILTIN_PADDW,
25401 IX86_BUILTIN_PADDD,
25402 IX86_BUILTIN_PADDQ,
25403 IX86_BUILTIN_PADDSB,
25404 IX86_BUILTIN_PADDSW,
25405 IX86_BUILTIN_PADDUSB,
25406 IX86_BUILTIN_PADDUSW,
25407 IX86_BUILTIN_PSUBB,
25408 IX86_BUILTIN_PSUBW,
25409 IX86_BUILTIN_PSUBD,
25410 IX86_BUILTIN_PSUBQ,
25411 IX86_BUILTIN_PSUBSB,
25412 IX86_BUILTIN_PSUBSW,
25413 IX86_BUILTIN_PSUBUSB,
25414 IX86_BUILTIN_PSUBUSW,
25415
25416 IX86_BUILTIN_PAND,
25417 IX86_BUILTIN_PANDN,
25418 IX86_BUILTIN_POR,
25419 IX86_BUILTIN_PXOR,
25420
25421 IX86_BUILTIN_PAVGB,
25422 IX86_BUILTIN_PAVGW,
25423
25424 IX86_BUILTIN_PCMPEQB,
25425 IX86_BUILTIN_PCMPEQW,
25426 IX86_BUILTIN_PCMPEQD,
25427 IX86_BUILTIN_PCMPGTB,
25428 IX86_BUILTIN_PCMPGTW,
25429 IX86_BUILTIN_PCMPGTD,
25430
25431 IX86_BUILTIN_PMADDWD,
25432
25433 IX86_BUILTIN_PMAXSW,
25434 IX86_BUILTIN_PMAXUB,
25435 IX86_BUILTIN_PMINSW,
25436 IX86_BUILTIN_PMINUB,
25437
25438 IX86_BUILTIN_PMULHUW,
25439 IX86_BUILTIN_PMULHW,
25440 IX86_BUILTIN_PMULLW,
25441
25442 IX86_BUILTIN_PSADBW,
25443 IX86_BUILTIN_PSHUFW,
25444
25445 IX86_BUILTIN_PSLLW,
25446 IX86_BUILTIN_PSLLD,
25447 IX86_BUILTIN_PSLLQ,
25448 IX86_BUILTIN_PSRAW,
25449 IX86_BUILTIN_PSRAD,
25450 IX86_BUILTIN_PSRLW,
25451 IX86_BUILTIN_PSRLD,
25452 IX86_BUILTIN_PSRLQ,
25453 IX86_BUILTIN_PSLLWI,
25454 IX86_BUILTIN_PSLLDI,
25455 IX86_BUILTIN_PSLLQI,
25456 IX86_BUILTIN_PSRAWI,
25457 IX86_BUILTIN_PSRADI,
25458 IX86_BUILTIN_PSRLWI,
25459 IX86_BUILTIN_PSRLDI,
25460 IX86_BUILTIN_PSRLQI,
25461
25462 IX86_BUILTIN_PUNPCKHBW,
25463 IX86_BUILTIN_PUNPCKHWD,
25464 IX86_BUILTIN_PUNPCKHDQ,
25465 IX86_BUILTIN_PUNPCKLBW,
25466 IX86_BUILTIN_PUNPCKLWD,
25467 IX86_BUILTIN_PUNPCKLDQ,
25468
25469 IX86_BUILTIN_SHUFPS,
25470
25471 IX86_BUILTIN_RCPPS,
25472 IX86_BUILTIN_RCPSS,
25473 IX86_BUILTIN_RSQRTPS,
25474 IX86_BUILTIN_RSQRTPS_NR,
25475 IX86_BUILTIN_RSQRTSS,
25476 IX86_BUILTIN_RSQRTF,
25477 IX86_BUILTIN_SQRTPS,
25478 IX86_BUILTIN_SQRTPS_NR,
25479 IX86_BUILTIN_SQRTSS,
25480
25481 IX86_BUILTIN_UNPCKHPS,
25482 IX86_BUILTIN_UNPCKLPS,
25483
25484 IX86_BUILTIN_ANDPS,
25485 IX86_BUILTIN_ANDNPS,
25486 IX86_BUILTIN_ORPS,
25487 IX86_BUILTIN_XORPS,
25488
25489 IX86_BUILTIN_EMMS,
25490 IX86_BUILTIN_LDMXCSR,
25491 IX86_BUILTIN_STMXCSR,
25492 IX86_BUILTIN_SFENCE,
25493
25494 IX86_BUILTIN_FXSAVE,
25495 IX86_BUILTIN_FXRSTOR,
25496 IX86_BUILTIN_FXSAVE64,
25497 IX86_BUILTIN_FXRSTOR64,
25498
25499 IX86_BUILTIN_XSAVE,
25500 IX86_BUILTIN_XRSTOR,
25501 IX86_BUILTIN_XSAVE64,
25502 IX86_BUILTIN_XRSTOR64,
25503
25504 IX86_BUILTIN_XSAVEOPT,
25505 IX86_BUILTIN_XSAVEOPT64,
25506
25507 /* 3DNow! Original */
25508 IX86_BUILTIN_FEMMS,
25509 IX86_BUILTIN_PAVGUSB,
25510 IX86_BUILTIN_PF2ID,
25511 IX86_BUILTIN_PFACC,
25512 IX86_BUILTIN_PFADD,
25513 IX86_BUILTIN_PFCMPEQ,
25514 IX86_BUILTIN_PFCMPGE,
25515 IX86_BUILTIN_PFCMPGT,
25516 IX86_BUILTIN_PFMAX,
25517 IX86_BUILTIN_PFMIN,
25518 IX86_BUILTIN_PFMUL,
25519 IX86_BUILTIN_PFRCP,
25520 IX86_BUILTIN_PFRCPIT1,
25521 IX86_BUILTIN_PFRCPIT2,
25522 IX86_BUILTIN_PFRSQIT1,
25523 IX86_BUILTIN_PFRSQRT,
25524 IX86_BUILTIN_PFSUB,
25525 IX86_BUILTIN_PFSUBR,
25526 IX86_BUILTIN_PI2FD,
25527 IX86_BUILTIN_PMULHRW,
25528
25529 /* 3DNow! Athlon Extensions */
25530 IX86_BUILTIN_PF2IW,
25531 IX86_BUILTIN_PFNACC,
25532 IX86_BUILTIN_PFPNACC,
25533 IX86_BUILTIN_PI2FW,
25534 IX86_BUILTIN_PSWAPDSI,
25535 IX86_BUILTIN_PSWAPDSF,
25536
25537 /* SSE2 */
25538 IX86_BUILTIN_ADDPD,
25539 IX86_BUILTIN_ADDSD,
25540 IX86_BUILTIN_DIVPD,
25541 IX86_BUILTIN_DIVSD,
25542 IX86_BUILTIN_MULPD,
25543 IX86_BUILTIN_MULSD,
25544 IX86_BUILTIN_SUBPD,
25545 IX86_BUILTIN_SUBSD,
25546
25547 IX86_BUILTIN_CMPEQPD,
25548 IX86_BUILTIN_CMPLTPD,
25549 IX86_BUILTIN_CMPLEPD,
25550 IX86_BUILTIN_CMPGTPD,
25551 IX86_BUILTIN_CMPGEPD,
25552 IX86_BUILTIN_CMPNEQPD,
25553 IX86_BUILTIN_CMPNLTPD,
25554 IX86_BUILTIN_CMPNLEPD,
25555 IX86_BUILTIN_CMPNGTPD,
25556 IX86_BUILTIN_CMPNGEPD,
25557 IX86_BUILTIN_CMPORDPD,
25558 IX86_BUILTIN_CMPUNORDPD,
25559 IX86_BUILTIN_CMPEQSD,
25560 IX86_BUILTIN_CMPLTSD,
25561 IX86_BUILTIN_CMPLESD,
25562 IX86_BUILTIN_CMPNEQSD,
25563 IX86_BUILTIN_CMPNLTSD,
25564 IX86_BUILTIN_CMPNLESD,
25565 IX86_BUILTIN_CMPORDSD,
25566 IX86_BUILTIN_CMPUNORDSD,
25567
25568 IX86_BUILTIN_COMIEQSD,
25569 IX86_BUILTIN_COMILTSD,
25570 IX86_BUILTIN_COMILESD,
25571 IX86_BUILTIN_COMIGTSD,
25572 IX86_BUILTIN_COMIGESD,
25573 IX86_BUILTIN_COMINEQSD,
25574 IX86_BUILTIN_UCOMIEQSD,
25575 IX86_BUILTIN_UCOMILTSD,
25576 IX86_BUILTIN_UCOMILESD,
25577 IX86_BUILTIN_UCOMIGTSD,
25578 IX86_BUILTIN_UCOMIGESD,
25579 IX86_BUILTIN_UCOMINEQSD,
25580
25581 IX86_BUILTIN_MAXPD,
25582 IX86_BUILTIN_MAXSD,
25583 IX86_BUILTIN_MINPD,
25584 IX86_BUILTIN_MINSD,
25585
25586 IX86_BUILTIN_ANDPD,
25587 IX86_BUILTIN_ANDNPD,
25588 IX86_BUILTIN_ORPD,
25589 IX86_BUILTIN_XORPD,
25590
25591 IX86_BUILTIN_SQRTPD,
25592 IX86_BUILTIN_SQRTSD,
25593
25594 IX86_BUILTIN_UNPCKHPD,
25595 IX86_BUILTIN_UNPCKLPD,
25596
25597 IX86_BUILTIN_SHUFPD,
25598
25599 IX86_BUILTIN_LOADUPD,
25600 IX86_BUILTIN_STOREUPD,
25601 IX86_BUILTIN_MOVSD,
25602
25603 IX86_BUILTIN_LOADHPD,
25604 IX86_BUILTIN_LOADLPD,
25605
25606 IX86_BUILTIN_CVTDQ2PD,
25607 IX86_BUILTIN_CVTDQ2PS,
25608
25609 IX86_BUILTIN_CVTPD2DQ,
25610 IX86_BUILTIN_CVTPD2PI,
25611 IX86_BUILTIN_CVTPD2PS,
25612 IX86_BUILTIN_CVTTPD2DQ,
25613 IX86_BUILTIN_CVTTPD2PI,
25614
25615 IX86_BUILTIN_CVTPI2PD,
25616 IX86_BUILTIN_CVTSI2SD,
25617 IX86_BUILTIN_CVTSI642SD,
25618
25619 IX86_BUILTIN_CVTSD2SI,
25620 IX86_BUILTIN_CVTSD2SI64,
25621 IX86_BUILTIN_CVTSD2SS,
25622 IX86_BUILTIN_CVTSS2SD,
25623 IX86_BUILTIN_CVTTSD2SI,
25624 IX86_BUILTIN_CVTTSD2SI64,
25625
25626 IX86_BUILTIN_CVTPS2DQ,
25627 IX86_BUILTIN_CVTPS2PD,
25628 IX86_BUILTIN_CVTTPS2DQ,
25629
25630 IX86_BUILTIN_MOVNTI,
25631 IX86_BUILTIN_MOVNTI64,
25632 IX86_BUILTIN_MOVNTPD,
25633 IX86_BUILTIN_MOVNTDQ,
25634
25635 IX86_BUILTIN_MOVQ128,
25636
25637 /* SSE2 MMX */
25638 IX86_BUILTIN_MASKMOVDQU,
25639 IX86_BUILTIN_MOVMSKPD,
25640 IX86_BUILTIN_PMOVMSKB128,
25641
25642 IX86_BUILTIN_PACKSSWB128,
25643 IX86_BUILTIN_PACKSSDW128,
25644 IX86_BUILTIN_PACKUSWB128,
25645
25646 IX86_BUILTIN_PADDB128,
25647 IX86_BUILTIN_PADDW128,
25648 IX86_BUILTIN_PADDD128,
25649 IX86_BUILTIN_PADDQ128,
25650 IX86_BUILTIN_PADDSB128,
25651 IX86_BUILTIN_PADDSW128,
25652 IX86_BUILTIN_PADDUSB128,
25653 IX86_BUILTIN_PADDUSW128,
25654 IX86_BUILTIN_PSUBB128,
25655 IX86_BUILTIN_PSUBW128,
25656 IX86_BUILTIN_PSUBD128,
25657 IX86_BUILTIN_PSUBQ128,
25658 IX86_BUILTIN_PSUBSB128,
25659 IX86_BUILTIN_PSUBSW128,
25660 IX86_BUILTIN_PSUBUSB128,
25661 IX86_BUILTIN_PSUBUSW128,
25662
25663 IX86_BUILTIN_PAND128,
25664 IX86_BUILTIN_PANDN128,
25665 IX86_BUILTIN_POR128,
25666 IX86_BUILTIN_PXOR128,
25667
25668 IX86_BUILTIN_PAVGB128,
25669 IX86_BUILTIN_PAVGW128,
25670
25671 IX86_BUILTIN_PCMPEQB128,
25672 IX86_BUILTIN_PCMPEQW128,
25673 IX86_BUILTIN_PCMPEQD128,
25674 IX86_BUILTIN_PCMPGTB128,
25675 IX86_BUILTIN_PCMPGTW128,
25676 IX86_BUILTIN_PCMPGTD128,
25677
25678 IX86_BUILTIN_PMADDWD128,
25679
25680 IX86_BUILTIN_PMAXSW128,
25681 IX86_BUILTIN_PMAXUB128,
25682 IX86_BUILTIN_PMINSW128,
25683 IX86_BUILTIN_PMINUB128,
25684
25685 IX86_BUILTIN_PMULUDQ,
25686 IX86_BUILTIN_PMULUDQ128,
25687 IX86_BUILTIN_PMULHUW128,
25688 IX86_BUILTIN_PMULHW128,
25689 IX86_BUILTIN_PMULLW128,
25690
25691 IX86_BUILTIN_PSADBW128,
25692 IX86_BUILTIN_PSHUFHW,
25693 IX86_BUILTIN_PSHUFLW,
25694 IX86_BUILTIN_PSHUFD,
25695
25696 IX86_BUILTIN_PSLLDQI128,
25697 IX86_BUILTIN_PSLLWI128,
25698 IX86_BUILTIN_PSLLDI128,
25699 IX86_BUILTIN_PSLLQI128,
25700 IX86_BUILTIN_PSRAWI128,
25701 IX86_BUILTIN_PSRADI128,
25702 IX86_BUILTIN_PSRLDQI128,
25703 IX86_BUILTIN_PSRLWI128,
25704 IX86_BUILTIN_PSRLDI128,
25705 IX86_BUILTIN_PSRLQI128,
25706
25707 IX86_BUILTIN_PSLLDQ128,
25708 IX86_BUILTIN_PSLLW128,
25709 IX86_BUILTIN_PSLLD128,
25710 IX86_BUILTIN_PSLLQ128,
25711 IX86_BUILTIN_PSRAW128,
25712 IX86_BUILTIN_PSRAD128,
25713 IX86_BUILTIN_PSRLW128,
25714 IX86_BUILTIN_PSRLD128,
25715 IX86_BUILTIN_PSRLQ128,
25716
25717 IX86_BUILTIN_PUNPCKHBW128,
25718 IX86_BUILTIN_PUNPCKHWD128,
25719 IX86_BUILTIN_PUNPCKHDQ128,
25720 IX86_BUILTIN_PUNPCKHQDQ128,
25721 IX86_BUILTIN_PUNPCKLBW128,
25722 IX86_BUILTIN_PUNPCKLWD128,
25723 IX86_BUILTIN_PUNPCKLDQ128,
25724 IX86_BUILTIN_PUNPCKLQDQ128,
25725
25726 IX86_BUILTIN_CLFLUSH,
25727 IX86_BUILTIN_MFENCE,
25728 IX86_BUILTIN_LFENCE,
25729 IX86_BUILTIN_PAUSE,
25730
25731 IX86_BUILTIN_BSRSI,
25732 IX86_BUILTIN_BSRDI,
25733 IX86_BUILTIN_RDPMC,
25734 IX86_BUILTIN_RDTSC,
25735 IX86_BUILTIN_RDTSCP,
25736 IX86_BUILTIN_ROLQI,
25737 IX86_BUILTIN_ROLHI,
25738 IX86_BUILTIN_RORQI,
25739 IX86_BUILTIN_RORHI,
25740
25741 /* SSE3. */
25742 IX86_BUILTIN_ADDSUBPS,
25743 IX86_BUILTIN_HADDPS,
25744 IX86_BUILTIN_HSUBPS,
25745 IX86_BUILTIN_MOVSHDUP,
25746 IX86_BUILTIN_MOVSLDUP,
25747 IX86_BUILTIN_ADDSUBPD,
25748 IX86_BUILTIN_HADDPD,
25749 IX86_BUILTIN_HSUBPD,
25750 IX86_BUILTIN_LDDQU,
25751
25752 IX86_BUILTIN_MONITOR,
25753 IX86_BUILTIN_MWAIT,
25754
25755 /* SSSE3. */
25756 IX86_BUILTIN_PHADDW,
25757 IX86_BUILTIN_PHADDD,
25758 IX86_BUILTIN_PHADDSW,
25759 IX86_BUILTIN_PHSUBW,
25760 IX86_BUILTIN_PHSUBD,
25761 IX86_BUILTIN_PHSUBSW,
25762 IX86_BUILTIN_PMADDUBSW,
25763 IX86_BUILTIN_PMULHRSW,
25764 IX86_BUILTIN_PSHUFB,
25765 IX86_BUILTIN_PSIGNB,
25766 IX86_BUILTIN_PSIGNW,
25767 IX86_BUILTIN_PSIGND,
25768 IX86_BUILTIN_PALIGNR,
25769 IX86_BUILTIN_PABSB,
25770 IX86_BUILTIN_PABSW,
25771 IX86_BUILTIN_PABSD,
25772
25773 IX86_BUILTIN_PHADDW128,
25774 IX86_BUILTIN_PHADDD128,
25775 IX86_BUILTIN_PHADDSW128,
25776 IX86_BUILTIN_PHSUBW128,
25777 IX86_BUILTIN_PHSUBD128,
25778 IX86_BUILTIN_PHSUBSW128,
25779 IX86_BUILTIN_PMADDUBSW128,
25780 IX86_BUILTIN_PMULHRSW128,
25781 IX86_BUILTIN_PSHUFB128,
25782 IX86_BUILTIN_PSIGNB128,
25783 IX86_BUILTIN_PSIGNW128,
25784 IX86_BUILTIN_PSIGND128,
25785 IX86_BUILTIN_PALIGNR128,
25786 IX86_BUILTIN_PABSB128,
25787 IX86_BUILTIN_PABSW128,
25788 IX86_BUILTIN_PABSD128,
25789
25790 /* AMDFAM10 - SSE4A New Instructions. */
25791 IX86_BUILTIN_MOVNTSD,
25792 IX86_BUILTIN_MOVNTSS,
25793 IX86_BUILTIN_EXTRQI,
25794 IX86_BUILTIN_EXTRQ,
25795 IX86_BUILTIN_INSERTQI,
25796 IX86_BUILTIN_INSERTQ,
25797
25798 /* SSE4.1. */
25799 IX86_BUILTIN_BLENDPD,
25800 IX86_BUILTIN_BLENDPS,
25801 IX86_BUILTIN_BLENDVPD,
25802 IX86_BUILTIN_BLENDVPS,
25803 IX86_BUILTIN_PBLENDVB128,
25804 IX86_BUILTIN_PBLENDW128,
25805
25806 IX86_BUILTIN_DPPD,
25807 IX86_BUILTIN_DPPS,
25808
25809 IX86_BUILTIN_INSERTPS128,
25810
25811 IX86_BUILTIN_MOVNTDQA,
25812 IX86_BUILTIN_MPSADBW128,
25813 IX86_BUILTIN_PACKUSDW128,
25814 IX86_BUILTIN_PCMPEQQ,
25815 IX86_BUILTIN_PHMINPOSUW128,
25816
25817 IX86_BUILTIN_PMAXSB128,
25818 IX86_BUILTIN_PMAXSD128,
25819 IX86_BUILTIN_PMAXUD128,
25820 IX86_BUILTIN_PMAXUW128,
25821
25822 IX86_BUILTIN_PMINSB128,
25823 IX86_BUILTIN_PMINSD128,
25824 IX86_BUILTIN_PMINUD128,
25825 IX86_BUILTIN_PMINUW128,
25826
25827 IX86_BUILTIN_PMOVSXBW128,
25828 IX86_BUILTIN_PMOVSXBD128,
25829 IX86_BUILTIN_PMOVSXBQ128,
25830 IX86_BUILTIN_PMOVSXWD128,
25831 IX86_BUILTIN_PMOVSXWQ128,
25832 IX86_BUILTIN_PMOVSXDQ128,
25833
25834 IX86_BUILTIN_PMOVZXBW128,
25835 IX86_BUILTIN_PMOVZXBD128,
25836 IX86_BUILTIN_PMOVZXBQ128,
25837 IX86_BUILTIN_PMOVZXWD128,
25838 IX86_BUILTIN_PMOVZXWQ128,
25839 IX86_BUILTIN_PMOVZXDQ128,
25840
25841 IX86_BUILTIN_PMULDQ128,
25842 IX86_BUILTIN_PMULLD128,
25843
25844 IX86_BUILTIN_ROUNDSD,
25845 IX86_BUILTIN_ROUNDSS,
25846
25847 IX86_BUILTIN_ROUNDPD,
25848 IX86_BUILTIN_ROUNDPS,
25849
25850 IX86_BUILTIN_FLOORPD,
25851 IX86_BUILTIN_CEILPD,
25852 IX86_BUILTIN_TRUNCPD,
25853 IX86_BUILTIN_RINTPD,
25854 IX86_BUILTIN_ROUNDPD_AZ,
25855
25856 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
25857 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
25858 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
25859
25860 IX86_BUILTIN_FLOORPS,
25861 IX86_BUILTIN_CEILPS,
25862 IX86_BUILTIN_TRUNCPS,
25863 IX86_BUILTIN_RINTPS,
25864 IX86_BUILTIN_ROUNDPS_AZ,
25865
25866 IX86_BUILTIN_FLOORPS_SFIX,
25867 IX86_BUILTIN_CEILPS_SFIX,
25868 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
25869
25870 IX86_BUILTIN_PTESTZ,
25871 IX86_BUILTIN_PTESTC,
25872 IX86_BUILTIN_PTESTNZC,
25873
25874 IX86_BUILTIN_VEC_INIT_V2SI,
25875 IX86_BUILTIN_VEC_INIT_V4HI,
25876 IX86_BUILTIN_VEC_INIT_V8QI,
25877 IX86_BUILTIN_VEC_EXT_V2DF,
25878 IX86_BUILTIN_VEC_EXT_V2DI,
25879 IX86_BUILTIN_VEC_EXT_V4SF,
25880 IX86_BUILTIN_VEC_EXT_V4SI,
25881 IX86_BUILTIN_VEC_EXT_V8HI,
25882 IX86_BUILTIN_VEC_EXT_V2SI,
25883 IX86_BUILTIN_VEC_EXT_V4HI,
25884 IX86_BUILTIN_VEC_EXT_V16QI,
25885 IX86_BUILTIN_VEC_SET_V2DI,
25886 IX86_BUILTIN_VEC_SET_V4SF,
25887 IX86_BUILTIN_VEC_SET_V4SI,
25888 IX86_BUILTIN_VEC_SET_V8HI,
25889 IX86_BUILTIN_VEC_SET_V4HI,
25890 IX86_BUILTIN_VEC_SET_V16QI,
25891
25892 IX86_BUILTIN_VEC_PACK_SFIX,
25893 IX86_BUILTIN_VEC_PACK_SFIX256,
25894
25895 /* SSE4.2. */
25896 IX86_BUILTIN_CRC32QI,
25897 IX86_BUILTIN_CRC32HI,
25898 IX86_BUILTIN_CRC32SI,
25899 IX86_BUILTIN_CRC32DI,
25900
25901 IX86_BUILTIN_PCMPESTRI128,
25902 IX86_BUILTIN_PCMPESTRM128,
25903 IX86_BUILTIN_PCMPESTRA128,
25904 IX86_BUILTIN_PCMPESTRC128,
25905 IX86_BUILTIN_PCMPESTRO128,
25906 IX86_BUILTIN_PCMPESTRS128,
25907 IX86_BUILTIN_PCMPESTRZ128,
25908 IX86_BUILTIN_PCMPISTRI128,
25909 IX86_BUILTIN_PCMPISTRM128,
25910 IX86_BUILTIN_PCMPISTRA128,
25911 IX86_BUILTIN_PCMPISTRC128,
25912 IX86_BUILTIN_PCMPISTRO128,
25913 IX86_BUILTIN_PCMPISTRS128,
25914 IX86_BUILTIN_PCMPISTRZ128,
25915
25916 IX86_BUILTIN_PCMPGTQ,
25917
25918 /* AES instructions */
25919 IX86_BUILTIN_AESENC128,
25920 IX86_BUILTIN_AESENCLAST128,
25921 IX86_BUILTIN_AESDEC128,
25922 IX86_BUILTIN_AESDECLAST128,
25923 IX86_BUILTIN_AESIMC128,
25924 IX86_BUILTIN_AESKEYGENASSIST128,
25925
25926 /* PCLMUL instruction */
25927 IX86_BUILTIN_PCLMULQDQ128,
25928
25929 /* AVX */
25930 IX86_BUILTIN_ADDPD256,
25931 IX86_BUILTIN_ADDPS256,
25932 IX86_BUILTIN_ADDSUBPD256,
25933 IX86_BUILTIN_ADDSUBPS256,
25934 IX86_BUILTIN_ANDPD256,
25935 IX86_BUILTIN_ANDPS256,
25936 IX86_BUILTIN_ANDNPD256,
25937 IX86_BUILTIN_ANDNPS256,
25938 IX86_BUILTIN_BLENDPD256,
25939 IX86_BUILTIN_BLENDPS256,
25940 IX86_BUILTIN_BLENDVPD256,
25941 IX86_BUILTIN_BLENDVPS256,
25942 IX86_BUILTIN_DIVPD256,
25943 IX86_BUILTIN_DIVPS256,
25944 IX86_BUILTIN_DPPS256,
25945 IX86_BUILTIN_HADDPD256,
25946 IX86_BUILTIN_HADDPS256,
25947 IX86_BUILTIN_HSUBPD256,
25948 IX86_BUILTIN_HSUBPS256,
25949 IX86_BUILTIN_MAXPD256,
25950 IX86_BUILTIN_MAXPS256,
25951 IX86_BUILTIN_MINPD256,
25952 IX86_BUILTIN_MINPS256,
25953 IX86_BUILTIN_MULPD256,
25954 IX86_BUILTIN_MULPS256,
25955 IX86_BUILTIN_ORPD256,
25956 IX86_BUILTIN_ORPS256,
25957 IX86_BUILTIN_SHUFPD256,
25958 IX86_BUILTIN_SHUFPS256,
25959 IX86_BUILTIN_SUBPD256,
25960 IX86_BUILTIN_SUBPS256,
25961 IX86_BUILTIN_XORPD256,
25962 IX86_BUILTIN_XORPS256,
25963 IX86_BUILTIN_CMPSD,
25964 IX86_BUILTIN_CMPSS,
25965 IX86_BUILTIN_CMPPD,
25966 IX86_BUILTIN_CMPPS,
25967 IX86_BUILTIN_CMPPD256,
25968 IX86_BUILTIN_CMPPS256,
25969 IX86_BUILTIN_CVTDQ2PD256,
25970 IX86_BUILTIN_CVTDQ2PS256,
25971 IX86_BUILTIN_CVTPD2PS256,
25972 IX86_BUILTIN_CVTPS2DQ256,
25973 IX86_BUILTIN_CVTPS2PD256,
25974 IX86_BUILTIN_CVTTPD2DQ256,
25975 IX86_BUILTIN_CVTPD2DQ256,
25976 IX86_BUILTIN_CVTTPS2DQ256,
25977 IX86_BUILTIN_EXTRACTF128PD256,
25978 IX86_BUILTIN_EXTRACTF128PS256,
25979 IX86_BUILTIN_EXTRACTF128SI256,
25980 IX86_BUILTIN_VZEROALL,
25981 IX86_BUILTIN_VZEROUPPER,
25982 IX86_BUILTIN_VPERMILVARPD,
25983 IX86_BUILTIN_VPERMILVARPS,
25984 IX86_BUILTIN_VPERMILVARPD256,
25985 IX86_BUILTIN_VPERMILVARPS256,
25986 IX86_BUILTIN_VPERMILPD,
25987 IX86_BUILTIN_VPERMILPS,
25988 IX86_BUILTIN_VPERMILPD256,
25989 IX86_BUILTIN_VPERMILPS256,
25990 IX86_BUILTIN_VPERMIL2PD,
25991 IX86_BUILTIN_VPERMIL2PS,
25992 IX86_BUILTIN_VPERMIL2PD256,
25993 IX86_BUILTIN_VPERMIL2PS256,
25994 IX86_BUILTIN_VPERM2F128PD256,
25995 IX86_BUILTIN_VPERM2F128PS256,
25996 IX86_BUILTIN_VPERM2F128SI256,
25997 IX86_BUILTIN_VBROADCASTSS,
25998 IX86_BUILTIN_VBROADCASTSD256,
25999 IX86_BUILTIN_VBROADCASTSS256,
26000 IX86_BUILTIN_VBROADCASTPD256,
26001 IX86_BUILTIN_VBROADCASTPS256,
26002 IX86_BUILTIN_VINSERTF128PD256,
26003 IX86_BUILTIN_VINSERTF128PS256,
26004 IX86_BUILTIN_VINSERTF128SI256,
26005 IX86_BUILTIN_LOADUPD256,
26006 IX86_BUILTIN_LOADUPS256,
26007 IX86_BUILTIN_STOREUPD256,
26008 IX86_BUILTIN_STOREUPS256,
26009 IX86_BUILTIN_LDDQU256,
26010 IX86_BUILTIN_MOVNTDQ256,
26011 IX86_BUILTIN_MOVNTPD256,
26012 IX86_BUILTIN_MOVNTPS256,
26013 IX86_BUILTIN_LOADDQU256,
26014 IX86_BUILTIN_STOREDQU256,
26015 IX86_BUILTIN_MASKLOADPD,
26016 IX86_BUILTIN_MASKLOADPS,
26017 IX86_BUILTIN_MASKSTOREPD,
26018 IX86_BUILTIN_MASKSTOREPS,
26019 IX86_BUILTIN_MASKLOADPD256,
26020 IX86_BUILTIN_MASKLOADPS256,
26021 IX86_BUILTIN_MASKSTOREPD256,
26022 IX86_BUILTIN_MASKSTOREPS256,
26023 IX86_BUILTIN_MOVSHDUP256,
26024 IX86_BUILTIN_MOVSLDUP256,
26025 IX86_BUILTIN_MOVDDUP256,
26026
26027 IX86_BUILTIN_SQRTPD256,
26028 IX86_BUILTIN_SQRTPS256,
26029 IX86_BUILTIN_SQRTPS_NR256,
26030 IX86_BUILTIN_RSQRTPS256,
26031 IX86_BUILTIN_RSQRTPS_NR256,
26032
26033 IX86_BUILTIN_RCPPS256,
26034
26035 IX86_BUILTIN_ROUNDPD256,
26036 IX86_BUILTIN_ROUNDPS256,
26037
26038 IX86_BUILTIN_FLOORPD256,
26039 IX86_BUILTIN_CEILPD256,
26040 IX86_BUILTIN_TRUNCPD256,
26041 IX86_BUILTIN_RINTPD256,
26042 IX86_BUILTIN_ROUNDPD_AZ256,
26043
26044 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26045 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26046 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26047
26048 IX86_BUILTIN_FLOORPS256,
26049 IX86_BUILTIN_CEILPS256,
26050 IX86_BUILTIN_TRUNCPS256,
26051 IX86_BUILTIN_RINTPS256,
26052 IX86_BUILTIN_ROUNDPS_AZ256,
26053
26054 IX86_BUILTIN_FLOORPS_SFIX256,
26055 IX86_BUILTIN_CEILPS_SFIX256,
26056 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26057
26058 IX86_BUILTIN_UNPCKHPD256,
26059 IX86_BUILTIN_UNPCKLPD256,
26060 IX86_BUILTIN_UNPCKHPS256,
26061 IX86_BUILTIN_UNPCKLPS256,
26062
26063 IX86_BUILTIN_SI256_SI,
26064 IX86_BUILTIN_PS256_PS,
26065 IX86_BUILTIN_PD256_PD,
26066 IX86_BUILTIN_SI_SI256,
26067 IX86_BUILTIN_PS_PS256,
26068 IX86_BUILTIN_PD_PD256,
26069
26070 IX86_BUILTIN_VTESTZPD,
26071 IX86_BUILTIN_VTESTCPD,
26072 IX86_BUILTIN_VTESTNZCPD,
26073 IX86_BUILTIN_VTESTZPS,
26074 IX86_BUILTIN_VTESTCPS,
26075 IX86_BUILTIN_VTESTNZCPS,
26076 IX86_BUILTIN_VTESTZPD256,
26077 IX86_BUILTIN_VTESTCPD256,
26078 IX86_BUILTIN_VTESTNZCPD256,
26079 IX86_BUILTIN_VTESTZPS256,
26080 IX86_BUILTIN_VTESTCPS256,
26081 IX86_BUILTIN_VTESTNZCPS256,
26082 IX86_BUILTIN_PTESTZ256,
26083 IX86_BUILTIN_PTESTC256,
26084 IX86_BUILTIN_PTESTNZC256,
26085
26086 IX86_BUILTIN_MOVMSKPD256,
26087 IX86_BUILTIN_MOVMSKPS256,
26088
26089 /* AVX2 */
26090 IX86_BUILTIN_MPSADBW256,
26091 IX86_BUILTIN_PABSB256,
26092 IX86_BUILTIN_PABSW256,
26093 IX86_BUILTIN_PABSD256,
26094 IX86_BUILTIN_PACKSSDW256,
26095 IX86_BUILTIN_PACKSSWB256,
26096 IX86_BUILTIN_PACKUSDW256,
26097 IX86_BUILTIN_PACKUSWB256,
26098 IX86_BUILTIN_PADDB256,
26099 IX86_BUILTIN_PADDW256,
26100 IX86_BUILTIN_PADDD256,
26101 IX86_BUILTIN_PADDQ256,
26102 IX86_BUILTIN_PADDSB256,
26103 IX86_BUILTIN_PADDSW256,
26104 IX86_BUILTIN_PADDUSB256,
26105 IX86_BUILTIN_PADDUSW256,
26106 IX86_BUILTIN_PALIGNR256,
26107 IX86_BUILTIN_AND256I,
26108 IX86_BUILTIN_ANDNOT256I,
26109 IX86_BUILTIN_PAVGB256,
26110 IX86_BUILTIN_PAVGW256,
26111 IX86_BUILTIN_PBLENDVB256,
26112 IX86_BUILTIN_PBLENDVW256,
26113 IX86_BUILTIN_PCMPEQB256,
26114 IX86_BUILTIN_PCMPEQW256,
26115 IX86_BUILTIN_PCMPEQD256,
26116 IX86_BUILTIN_PCMPEQQ256,
26117 IX86_BUILTIN_PCMPGTB256,
26118 IX86_BUILTIN_PCMPGTW256,
26119 IX86_BUILTIN_PCMPGTD256,
26120 IX86_BUILTIN_PCMPGTQ256,
26121 IX86_BUILTIN_PHADDW256,
26122 IX86_BUILTIN_PHADDD256,
26123 IX86_BUILTIN_PHADDSW256,
26124 IX86_BUILTIN_PHSUBW256,
26125 IX86_BUILTIN_PHSUBD256,
26126 IX86_BUILTIN_PHSUBSW256,
26127 IX86_BUILTIN_PMADDUBSW256,
26128 IX86_BUILTIN_PMADDWD256,
26129 IX86_BUILTIN_PMAXSB256,
26130 IX86_BUILTIN_PMAXSW256,
26131 IX86_BUILTIN_PMAXSD256,
26132 IX86_BUILTIN_PMAXUB256,
26133 IX86_BUILTIN_PMAXUW256,
26134 IX86_BUILTIN_PMAXUD256,
26135 IX86_BUILTIN_PMINSB256,
26136 IX86_BUILTIN_PMINSW256,
26137 IX86_BUILTIN_PMINSD256,
26138 IX86_BUILTIN_PMINUB256,
26139 IX86_BUILTIN_PMINUW256,
26140 IX86_BUILTIN_PMINUD256,
26141 IX86_BUILTIN_PMOVMSKB256,
26142 IX86_BUILTIN_PMOVSXBW256,
26143 IX86_BUILTIN_PMOVSXBD256,
26144 IX86_BUILTIN_PMOVSXBQ256,
26145 IX86_BUILTIN_PMOVSXWD256,
26146 IX86_BUILTIN_PMOVSXWQ256,
26147 IX86_BUILTIN_PMOVSXDQ256,
26148 IX86_BUILTIN_PMOVZXBW256,
26149 IX86_BUILTIN_PMOVZXBD256,
26150 IX86_BUILTIN_PMOVZXBQ256,
26151 IX86_BUILTIN_PMOVZXWD256,
26152 IX86_BUILTIN_PMOVZXWQ256,
26153 IX86_BUILTIN_PMOVZXDQ256,
26154 IX86_BUILTIN_PMULDQ256,
26155 IX86_BUILTIN_PMULHRSW256,
26156 IX86_BUILTIN_PMULHUW256,
26157 IX86_BUILTIN_PMULHW256,
26158 IX86_BUILTIN_PMULLW256,
26159 IX86_BUILTIN_PMULLD256,
26160 IX86_BUILTIN_PMULUDQ256,
26161 IX86_BUILTIN_POR256,
26162 IX86_BUILTIN_PSADBW256,
26163 IX86_BUILTIN_PSHUFB256,
26164 IX86_BUILTIN_PSHUFD256,
26165 IX86_BUILTIN_PSHUFHW256,
26166 IX86_BUILTIN_PSHUFLW256,
26167 IX86_BUILTIN_PSIGNB256,
26168 IX86_BUILTIN_PSIGNW256,
26169 IX86_BUILTIN_PSIGND256,
26170 IX86_BUILTIN_PSLLDQI256,
26171 IX86_BUILTIN_PSLLWI256,
26172 IX86_BUILTIN_PSLLW256,
26173 IX86_BUILTIN_PSLLDI256,
26174 IX86_BUILTIN_PSLLD256,
26175 IX86_BUILTIN_PSLLQI256,
26176 IX86_BUILTIN_PSLLQ256,
26177 IX86_BUILTIN_PSRAWI256,
26178 IX86_BUILTIN_PSRAW256,
26179 IX86_BUILTIN_PSRADI256,
26180 IX86_BUILTIN_PSRAD256,
26181 IX86_BUILTIN_PSRLDQI256,
26182 IX86_BUILTIN_PSRLWI256,
26183 IX86_BUILTIN_PSRLW256,
26184 IX86_BUILTIN_PSRLDI256,
26185 IX86_BUILTIN_PSRLD256,
26186 IX86_BUILTIN_PSRLQI256,
26187 IX86_BUILTIN_PSRLQ256,
26188 IX86_BUILTIN_PSUBB256,
26189 IX86_BUILTIN_PSUBW256,
26190 IX86_BUILTIN_PSUBD256,
26191 IX86_BUILTIN_PSUBQ256,
26192 IX86_BUILTIN_PSUBSB256,
26193 IX86_BUILTIN_PSUBSW256,
26194 IX86_BUILTIN_PSUBUSB256,
26195 IX86_BUILTIN_PSUBUSW256,
26196 IX86_BUILTIN_PUNPCKHBW256,
26197 IX86_BUILTIN_PUNPCKHWD256,
26198 IX86_BUILTIN_PUNPCKHDQ256,
26199 IX86_BUILTIN_PUNPCKHQDQ256,
26200 IX86_BUILTIN_PUNPCKLBW256,
26201 IX86_BUILTIN_PUNPCKLWD256,
26202 IX86_BUILTIN_PUNPCKLDQ256,
26203 IX86_BUILTIN_PUNPCKLQDQ256,
26204 IX86_BUILTIN_PXOR256,
26205 IX86_BUILTIN_MOVNTDQA256,
26206 IX86_BUILTIN_VBROADCASTSS_PS,
26207 IX86_BUILTIN_VBROADCASTSS_PS256,
26208 IX86_BUILTIN_VBROADCASTSD_PD256,
26209 IX86_BUILTIN_VBROADCASTSI256,
26210 IX86_BUILTIN_PBLENDD256,
26211 IX86_BUILTIN_PBLENDD128,
26212 IX86_BUILTIN_PBROADCASTB256,
26213 IX86_BUILTIN_PBROADCASTW256,
26214 IX86_BUILTIN_PBROADCASTD256,
26215 IX86_BUILTIN_PBROADCASTQ256,
26216 IX86_BUILTIN_PBROADCASTB128,
26217 IX86_BUILTIN_PBROADCASTW128,
26218 IX86_BUILTIN_PBROADCASTD128,
26219 IX86_BUILTIN_PBROADCASTQ128,
26220 IX86_BUILTIN_VPERMVARSI256,
26221 IX86_BUILTIN_VPERMDF256,
26222 IX86_BUILTIN_VPERMVARSF256,
26223 IX86_BUILTIN_VPERMDI256,
26224 IX86_BUILTIN_VPERMTI256,
26225 IX86_BUILTIN_VEXTRACT128I256,
26226 IX86_BUILTIN_VINSERT128I256,
26227 IX86_BUILTIN_MASKLOADD,
26228 IX86_BUILTIN_MASKLOADQ,
26229 IX86_BUILTIN_MASKLOADD256,
26230 IX86_BUILTIN_MASKLOADQ256,
26231 IX86_BUILTIN_MASKSTORED,
26232 IX86_BUILTIN_MASKSTOREQ,
26233 IX86_BUILTIN_MASKSTORED256,
26234 IX86_BUILTIN_MASKSTOREQ256,
26235 IX86_BUILTIN_PSLLVV4DI,
26236 IX86_BUILTIN_PSLLVV2DI,
26237 IX86_BUILTIN_PSLLVV8SI,
26238 IX86_BUILTIN_PSLLVV4SI,
26239 IX86_BUILTIN_PSRAVV8SI,
26240 IX86_BUILTIN_PSRAVV4SI,
26241 IX86_BUILTIN_PSRLVV4DI,
26242 IX86_BUILTIN_PSRLVV2DI,
26243 IX86_BUILTIN_PSRLVV8SI,
26244 IX86_BUILTIN_PSRLVV4SI,
26245
26246 IX86_BUILTIN_GATHERSIV2DF,
26247 IX86_BUILTIN_GATHERSIV4DF,
26248 IX86_BUILTIN_GATHERDIV2DF,
26249 IX86_BUILTIN_GATHERDIV4DF,
26250 IX86_BUILTIN_GATHERSIV4SF,
26251 IX86_BUILTIN_GATHERSIV8SF,
26252 IX86_BUILTIN_GATHERDIV4SF,
26253 IX86_BUILTIN_GATHERDIV8SF,
26254 IX86_BUILTIN_GATHERSIV2DI,
26255 IX86_BUILTIN_GATHERSIV4DI,
26256 IX86_BUILTIN_GATHERDIV2DI,
26257 IX86_BUILTIN_GATHERDIV4DI,
26258 IX86_BUILTIN_GATHERSIV4SI,
26259 IX86_BUILTIN_GATHERSIV8SI,
26260 IX86_BUILTIN_GATHERDIV4SI,
26261 IX86_BUILTIN_GATHERDIV8SI,
26262
26263 /* Alternate 4-element gathers for the vectorizer, where
26264 all operands are 32 bytes wide. */
26265 IX86_BUILTIN_GATHERALTSIV4DF,
26266 IX86_BUILTIN_GATHERALTDIV8SF,
26267 IX86_BUILTIN_GATHERALTSIV4DI,
26268 IX86_BUILTIN_GATHERALTDIV8SI,
26269
26270 /* TFmode support builtins. */
26271 IX86_BUILTIN_INFQ,
26272 IX86_BUILTIN_HUGE_VALQ,
26273 IX86_BUILTIN_FABSQ,
26274 IX86_BUILTIN_COPYSIGNQ,
26275
26276 /* Vectorizer support builtins. */
26277 IX86_BUILTIN_CPYSGNPS,
26278 IX86_BUILTIN_CPYSGNPD,
26279 IX86_BUILTIN_CPYSGNPS256,
26280 IX86_BUILTIN_CPYSGNPD256,
26281
26282 /* FMA4 instructions. */
26283 IX86_BUILTIN_VFMADDSS,
26284 IX86_BUILTIN_VFMADDSD,
26285 IX86_BUILTIN_VFMADDPS,
26286 IX86_BUILTIN_VFMADDPD,
26287 IX86_BUILTIN_VFMADDPS256,
26288 IX86_BUILTIN_VFMADDPD256,
26289 IX86_BUILTIN_VFMADDSUBPS,
26290 IX86_BUILTIN_VFMADDSUBPD,
26291 IX86_BUILTIN_VFMADDSUBPS256,
26292 IX86_BUILTIN_VFMADDSUBPD256,
26293
26294 /* FMA3 instructions. */
26295 IX86_BUILTIN_VFMADDSS3,
26296 IX86_BUILTIN_VFMADDSD3,
26297
26298 /* XOP instructions. */
26299 IX86_BUILTIN_VPCMOV,
26300 IX86_BUILTIN_VPCMOV_V2DI,
26301 IX86_BUILTIN_VPCMOV_V4SI,
26302 IX86_BUILTIN_VPCMOV_V8HI,
26303 IX86_BUILTIN_VPCMOV_V16QI,
26304 IX86_BUILTIN_VPCMOV_V4SF,
26305 IX86_BUILTIN_VPCMOV_V2DF,
26306 IX86_BUILTIN_VPCMOV256,
26307 IX86_BUILTIN_VPCMOV_V4DI256,
26308 IX86_BUILTIN_VPCMOV_V8SI256,
26309 IX86_BUILTIN_VPCMOV_V16HI256,
26310 IX86_BUILTIN_VPCMOV_V32QI256,
26311 IX86_BUILTIN_VPCMOV_V8SF256,
26312 IX86_BUILTIN_VPCMOV_V4DF256,
26313
26314 IX86_BUILTIN_VPPERM,
26315
26316 IX86_BUILTIN_VPMACSSWW,
26317 IX86_BUILTIN_VPMACSWW,
26318 IX86_BUILTIN_VPMACSSWD,
26319 IX86_BUILTIN_VPMACSWD,
26320 IX86_BUILTIN_VPMACSSDD,
26321 IX86_BUILTIN_VPMACSDD,
26322 IX86_BUILTIN_VPMACSSDQL,
26323 IX86_BUILTIN_VPMACSSDQH,
26324 IX86_BUILTIN_VPMACSDQL,
26325 IX86_BUILTIN_VPMACSDQH,
26326 IX86_BUILTIN_VPMADCSSWD,
26327 IX86_BUILTIN_VPMADCSWD,
26328
26329 IX86_BUILTIN_VPHADDBW,
26330 IX86_BUILTIN_VPHADDBD,
26331 IX86_BUILTIN_VPHADDBQ,
26332 IX86_BUILTIN_VPHADDWD,
26333 IX86_BUILTIN_VPHADDWQ,
26334 IX86_BUILTIN_VPHADDDQ,
26335 IX86_BUILTIN_VPHADDUBW,
26336 IX86_BUILTIN_VPHADDUBD,
26337 IX86_BUILTIN_VPHADDUBQ,
26338 IX86_BUILTIN_VPHADDUWD,
26339 IX86_BUILTIN_VPHADDUWQ,
26340 IX86_BUILTIN_VPHADDUDQ,
26341 IX86_BUILTIN_VPHSUBBW,
26342 IX86_BUILTIN_VPHSUBWD,
26343 IX86_BUILTIN_VPHSUBDQ,
26344
26345 IX86_BUILTIN_VPROTB,
26346 IX86_BUILTIN_VPROTW,
26347 IX86_BUILTIN_VPROTD,
26348 IX86_BUILTIN_VPROTQ,
26349 IX86_BUILTIN_VPROTB_IMM,
26350 IX86_BUILTIN_VPROTW_IMM,
26351 IX86_BUILTIN_VPROTD_IMM,
26352 IX86_BUILTIN_VPROTQ_IMM,
26353
26354 IX86_BUILTIN_VPSHLB,
26355 IX86_BUILTIN_VPSHLW,
26356 IX86_BUILTIN_VPSHLD,
26357 IX86_BUILTIN_VPSHLQ,
26358 IX86_BUILTIN_VPSHAB,
26359 IX86_BUILTIN_VPSHAW,
26360 IX86_BUILTIN_VPSHAD,
26361 IX86_BUILTIN_VPSHAQ,
26362
26363 IX86_BUILTIN_VFRCZSS,
26364 IX86_BUILTIN_VFRCZSD,
26365 IX86_BUILTIN_VFRCZPS,
26366 IX86_BUILTIN_VFRCZPD,
26367 IX86_BUILTIN_VFRCZPS256,
26368 IX86_BUILTIN_VFRCZPD256,
26369
26370 IX86_BUILTIN_VPCOMEQUB,
26371 IX86_BUILTIN_VPCOMNEUB,
26372 IX86_BUILTIN_VPCOMLTUB,
26373 IX86_BUILTIN_VPCOMLEUB,
26374 IX86_BUILTIN_VPCOMGTUB,
26375 IX86_BUILTIN_VPCOMGEUB,
26376 IX86_BUILTIN_VPCOMFALSEUB,
26377 IX86_BUILTIN_VPCOMTRUEUB,
26378
26379 IX86_BUILTIN_VPCOMEQUW,
26380 IX86_BUILTIN_VPCOMNEUW,
26381 IX86_BUILTIN_VPCOMLTUW,
26382 IX86_BUILTIN_VPCOMLEUW,
26383 IX86_BUILTIN_VPCOMGTUW,
26384 IX86_BUILTIN_VPCOMGEUW,
26385 IX86_BUILTIN_VPCOMFALSEUW,
26386 IX86_BUILTIN_VPCOMTRUEUW,
26387
26388 IX86_BUILTIN_VPCOMEQUD,
26389 IX86_BUILTIN_VPCOMNEUD,
26390 IX86_BUILTIN_VPCOMLTUD,
26391 IX86_BUILTIN_VPCOMLEUD,
26392 IX86_BUILTIN_VPCOMGTUD,
26393 IX86_BUILTIN_VPCOMGEUD,
26394 IX86_BUILTIN_VPCOMFALSEUD,
26395 IX86_BUILTIN_VPCOMTRUEUD,
26396
26397 IX86_BUILTIN_VPCOMEQUQ,
26398 IX86_BUILTIN_VPCOMNEUQ,
26399 IX86_BUILTIN_VPCOMLTUQ,
26400 IX86_BUILTIN_VPCOMLEUQ,
26401 IX86_BUILTIN_VPCOMGTUQ,
26402 IX86_BUILTIN_VPCOMGEUQ,
26403 IX86_BUILTIN_VPCOMFALSEUQ,
26404 IX86_BUILTIN_VPCOMTRUEUQ,
26405
26406 IX86_BUILTIN_VPCOMEQB,
26407 IX86_BUILTIN_VPCOMNEB,
26408 IX86_BUILTIN_VPCOMLTB,
26409 IX86_BUILTIN_VPCOMLEB,
26410 IX86_BUILTIN_VPCOMGTB,
26411 IX86_BUILTIN_VPCOMGEB,
26412 IX86_BUILTIN_VPCOMFALSEB,
26413 IX86_BUILTIN_VPCOMTRUEB,
26414
26415 IX86_BUILTIN_VPCOMEQW,
26416 IX86_BUILTIN_VPCOMNEW,
26417 IX86_BUILTIN_VPCOMLTW,
26418 IX86_BUILTIN_VPCOMLEW,
26419 IX86_BUILTIN_VPCOMGTW,
26420 IX86_BUILTIN_VPCOMGEW,
26421 IX86_BUILTIN_VPCOMFALSEW,
26422 IX86_BUILTIN_VPCOMTRUEW,
26423
26424 IX86_BUILTIN_VPCOMEQD,
26425 IX86_BUILTIN_VPCOMNED,
26426 IX86_BUILTIN_VPCOMLTD,
26427 IX86_BUILTIN_VPCOMLED,
26428 IX86_BUILTIN_VPCOMGTD,
26429 IX86_BUILTIN_VPCOMGED,
26430 IX86_BUILTIN_VPCOMFALSED,
26431 IX86_BUILTIN_VPCOMTRUED,
26432
26433 IX86_BUILTIN_VPCOMEQQ,
26434 IX86_BUILTIN_VPCOMNEQ,
26435 IX86_BUILTIN_VPCOMLTQ,
26436 IX86_BUILTIN_VPCOMLEQ,
26437 IX86_BUILTIN_VPCOMGTQ,
26438 IX86_BUILTIN_VPCOMGEQ,
26439 IX86_BUILTIN_VPCOMFALSEQ,
26440 IX86_BUILTIN_VPCOMTRUEQ,
26441
26442 /* LWP instructions. */
26443 IX86_BUILTIN_LLWPCB,
26444 IX86_BUILTIN_SLWPCB,
26445 IX86_BUILTIN_LWPVAL32,
26446 IX86_BUILTIN_LWPVAL64,
26447 IX86_BUILTIN_LWPINS32,
26448 IX86_BUILTIN_LWPINS64,
26449
26450 IX86_BUILTIN_CLZS,
26451
26452 /* RTM */
26453 IX86_BUILTIN_XBEGIN,
26454 IX86_BUILTIN_XEND,
26455 IX86_BUILTIN_XABORT,
26456 IX86_BUILTIN_XTEST,
26457
26458 /* BMI instructions. */
26459 IX86_BUILTIN_BEXTR32,
26460 IX86_BUILTIN_BEXTR64,
26461 IX86_BUILTIN_CTZS,
26462
26463 /* TBM instructions. */
26464 IX86_BUILTIN_BEXTRI32,
26465 IX86_BUILTIN_BEXTRI64,
26466
26467 /* BMI2 instructions. */
26468 IX86_BUILTIN_BZHI32,
26469 IX86_BUILTIN_BZHI64,
26470 IX86_BUILTIN_PDEP32,
26471 IX86_BUILTIN_PDEP64,
26472 IX86_BUILTIN_PEXT32,
26473 IX86_BUILTIN_PEXT64,
26474
26475 /* ADX instructions. */
26476 IX86_BUILTIN_ADDCARRYX32,
26477 IX86_BUILTIN_ADDCARRYX64,
26478
26479 /* FSGSBASE instructions. */
26480 IX86_BUILTIN_RDFSBASE32,
26481 IX86_BUILTIN_RDFSBASE64,
26482 IX86_BUILTIN_RDGSBASE32,
26483 IX86_BUILTIN_RDGSBASE64,
26484 IX86_BUILTIN_WRFSBASE32,
26485 IX86_BUILTIN_WRFSBASE64,
26486 IX86_BUILTIN_WRGSBASE32,
26487 IX86_BUILTIN_WRGSBASE64,
26488
26489 /* RDRND instructions. */
26490 IX86_BUILTIN_RDRAND16_STEP,
26491 IX86_BUILTIN_RDRAND32_STEP,
26492 IX86_BUILTIN_RDRAND64_STEP,
26493
26494 /* RDSEED instructions. */
26495 IX86_BUILTIN_RDSEED16_STEP,
26496 IX86_BUILTIN_RDSEED32_STEP,
26497 IX86_BUILTIN_RDSEED64_STEP,
26498
26499 /* F16C instructions. */
26500 IX86_BUILTIN_CVTPH2PS,
26501 IX86_BUILTIN_CVTPH2PS256,
26502 IX86_BUILTIN_CVTPS2PH,
26503 IX86_BUILTIN_CVTPS2PH256,
26504
26505 /* CFString built-in for Darwin. */
26506 IX86_BUILTIN_CFSTRING,
26507
26508 /* Builtins to get CPU type and supported features. */
26509 IX86_BUILTIN_CPU_INIT,
26510 IX86_BUILTIN_CPU_IS,
26511 IX86_BUILTIN_CPU_SUPPORTS,
26512
26513 IX86_BUILTIN_MAX
26514 };
26515
26516 /* Table for the ix86 builtin decls. */
26517 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26518
26519 /* Table of all of the builtin functions that are possible with different ISAs
26520 but are waiting to be built until a function is declared to use that
26521 ISA. */
26522 struct builtin_isa {
26523 const char *name; /* function name */
26524 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26525 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26526 bool const_p; /* true if the declaration is constant */
26527 bool set_and_not_built_p;
26528 };
26529
26530 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26531
26532
26533 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
26534 of which isa_flags to use in the ix86_builtins_isa array. Store the
26535 function decl in the ix86_builtins array. Return the function decl, or
26536 NULL_TREE if the builtin was not added.
26537
26538 If the front end has a special hook for builtin functions, delay adding
26539 builtin functions that aren't in the current ISA until the ISA is changed
26540 with function specific optimization. Doing so can save about 300K for the
26541 default compiler. When the builtin is expanded, check at that time whether
26542 it is valid.
26543
26544 If the front end doesn't have a special hook, record all builtins, even if
26545 they aren't in the current ISA, in case the user uses function specific
26546 options for a different ISA, so that we don't get scope errors if a builtin
26547 is added in the middle of a function scope. */
26548
26549 static inline tree
26550 def_builtin (HOST_WIDE_INT mask, const char *name,
26551 enum ix86_builtin_func_type tcode,
26552 enum ix86_builtins code)
26553 {
26554 tree decl = NULL_TREE;
26555
26556 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26557 {
26558 ix86_builtins_isa[(int) code].isa = mask;
26559
26560 mask &= ~OPTION_MASK_ISA_64BIT;
26561 if (mask == 0
26562 || (mask & ix86_isa_flags) != 0
26563 || (lang_hooks.builtin_function
26564 == lang_hooks.builtin_function_ext_scope))
26566 {
26567 tree type = ix86_get_builtin_func_type (tcode);
26568 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26569 NULL, NULL_TREE);
26570 ix86_builtins[(int) code] = decl;
26571 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26572 }
26573 else
26574 {
26575 ix86_builtins[(int) code] = NULL_TREE;
26576 ix86_builtins_isa[(int) code].tcode = tcode;
26577 ix86_builtins_isa[(int) code].name = name;
26578 ix86_builtins_isa[(int) code].const_p = false;
26579 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26580 }
26581 }
26582
26583 return decl;
26584 }
26585
26586 /* Like def_builtin, but also marks the function decl "const". */
26587
26588 static inline tree
26589 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26590 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26591 {
26592 tree decl = def_builtin (mask, name, tcode, code);
26593 if (decl)
26594 TREE_READONLY (decl) = 1;
26595 else
26596 ix86_builtins_isa[(int) code].const_p = true;
26597
26598 return decl;
26599 }
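/* Typical registration call (a sketch, not a quote of the actual
   initialization code; the mask/name/type/code combination varies per
   builtin, and V4SF_FTYPE_V4SF_V4SF is assumed to be one of the
   generated function-type codes):

     def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   If SSE is already enabled (or the front end supports extended-scope
   builtins), the decl is created immediately; otherwise the request is
   recorded in ix86_builtins_isa[] and built later by
   ix86_add_new_builtins once the ISA becomes available.  */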
26600
26601 /* Add any new builtin functions for a given ISA that may not have been
26602 declared. This saves a bit of space compared to adding all of the
26603 declarations to the tree up front, whether or not they are used. */
26604
26605 static void
26606 ix86_add_new_builtins (HOST_WIDE_INT isa)
26607 {
26608 int i;
26609
26610 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26611 {
26612 if ((ix86_builtins_isa[i].isa & isa) != 0
26613 && ix86_builtins_isa[i].set_and_not_built_p)
26614 {
26615 tree decl, type;
26616
26617 /* Don't define the builtin again. */
26618 ix86_builtins_isa[i].set_and_not_built_p = false;
26619
26620 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26621 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26622 type, i, BUILT_IN_MD, NULL,
26623 NULL_TREE);
26624
26625 ix86_builtins[i] = decl;
26626 if (ix86_builtins_isa[i].const_p)
26627 TREE_READONLY (decl) = 1;
26628 }
26629 }
26630 }
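/* This is reached when the effective ISA changes, e.g. via a
   target("...") attribute; a call along the lines of

     ix86_add_new_builtins (ix86_isa_flags);

   (a sketch of how the option-handling code uses it) materializes any
   deferred decls whose ISA bits are now enabled.  */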
26631
26632 /* Bits for builtin_description.flag. */
26633
26634 /* Set when we don't support the comparison natively, and should
26635 swap the operands of the comparison in order to support it. */
26636 #define BUILTIN_DESC_SWAP_OPERANDS 1
26637
26638 struct builtin_description
26639 {
26640 const HOST_WIDE_INT mask;
26641 const enum insn_code icode;
26642 const char *const name;
26643 const enum ix86_builtins code;
26644 const enum rtx_code comparison;
26645 const int flag;
26646 };
26647
26648 static const struct builtin_description bdesc_comi[] =
26649 {
26650 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26651 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26652 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26653 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26654 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26655 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26656 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26657 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26658 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26659 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26660 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26661 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26662 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26663 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26664 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26665 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26666 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26667 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26668 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26669 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26670 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26671 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26672 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26673 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26674 };
26675
26676 static const struct builtin_description bdesc_pcmpestr[] =
26677 {
26678 /* SSE4.2 */
26679 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26680 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26681 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26682 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26683 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26684 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26685 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26686 };
26687
26688 static const struct builtin_description bdesc_pcmpistr[] =
26689 {
26690 /* SSE4.2 */
26691 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26692 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26693 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26694 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26695 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26696 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26697 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26698 };
26699
26700 /* Special builtins with a variable number of arguments. */
26701 static const struct builtin_description bdesc_special_args[] =
26702 {
26703 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26704 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26705 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26706
26707 /* MMX */
26708 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26709
26710 /* 3DNow! */
26711 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26712
26713 /* FXSR, XSAVE and XSAVEOPT */
26714 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26715 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26716 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26717 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26718 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26719
26720 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26721 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26722 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26723 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26724 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26725
26726 /* SSE */
26727 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26728 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26729 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26730
26731 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26732 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26733 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26734 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26735
26736 /* SSE or 3DNow!A */
26737 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26738 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26739
26740 /* SSE2 */
26741 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26742 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26743 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26744 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26745 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26746 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26747 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26748 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26749 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26750 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26751
26752 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26753 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26754
26755 /* SSE3 */
26756 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26757
26758 /* SSE4.1 */
26759 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26760
26761 /* SSE4A */
26762 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26763 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26764
26765 /* AVX */
26766 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26767 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26768
26769 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26770 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26771 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26772 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26773 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26774
26775 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26776 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26777 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26778 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26779 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26780 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26781 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26782
26783 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26784 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26785 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26786
26787 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26788 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26789 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26790 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26791 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26792 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26793 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26794 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26795
26796 /* AVX2 */
26797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26798 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26799 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26800 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26801 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26802 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26803 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26804 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26805 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26806
26807 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26808 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26809 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26810 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26811 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26812 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26813
26814 /* FSGSBASE */
26815 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26816 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26817 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26818 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26819 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26820 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26821 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26822 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26823
26824 /* RTM */
26825 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26826 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26827 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26828 };
26829
26830 /* Builtins with variable number of arguments. */
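/* Each entry below pairs an ISA option mask with the insn pattern
   (CODE_FOR_*) used to expand the builtin, the builtin's user-visible
   name, its IX86_BUILTIN_* enumerator, an rtx comparison code (UNKNOWN
   when that field is unused), and the ix86_builtin_func_type describing
   its prototype, cast to int.  Entries whose mask is ~OPTION_MASK_ISA_64BIT
   do not depend on any ISA extension and are registered unconditionally
   by def_builtin.  */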
26831 static const struct builtin_description bdesc_args[] =
26832 {
26833 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
26834 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
26835 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
26836 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26837 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26838 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
26839 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
26840
26841 /* MMX */
26842 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26843 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26844 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26845 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26846 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26847 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26848
26849 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26850 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26851 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26852 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26853 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26854 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26855 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26856 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26857
26858 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26859 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26860
26861 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26862 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26863 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26864 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26865
26866 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26867 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26868 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26869 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26870 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26871 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26872
26873 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26874 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26875 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26876 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26877 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26878 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
26879
26880 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26881 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
26882 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
26883
26884 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
26885
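/* The MMX shift builtins come in pairs: the *_SI_COUNT prototypes take
   the shift count as a scalar integer (the psllwi/pslldi/psllqi forms),
   while the *_V*_COUNT prototypes take the count in a vector register
   (the psllw/pslld/psllq forms).  Both members of a pair expand through
   the same shift pattern; only the count operand handling differs.  */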
26886 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26887 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26888 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26889 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26890 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26891 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26892
26893 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26894 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26895 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
26896 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26897 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26898 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
26899
26900 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
26901 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
26902 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
26903 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
26904
26905 /* 3DNow! */
26906 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26907 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26908 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26909 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26910
26911 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
26912 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26913 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26914 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26915 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26916 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
26917 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26918 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26919 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26920 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26921 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26922 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26923 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26924 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26925 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
26926
26927 /* 3DNow!A */
26928 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
26929 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
26930 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
26931 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
26932 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26933 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
26934
26935 /* SSE */
26936 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
26937 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26938 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26939 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26940 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26941 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
26942 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26943 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26944 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26945 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
26946 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
26947 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
26948
26949 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
26950
26951 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26952 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26953 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26954 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26955 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26956 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26957 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26958 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26959
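/* The SSE compare builtins all expand through the same maskcmp pattern;
   the rtx code in each entry selects the condition.  cmpgt/cmpge and
   their negated forms have no pattern of their own, so they are encoded
   as cmplt/cmple (or their unordered counterparts) together with the
   *_SWAP prototypes, which exchange the two operands before expansion.  */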
26960 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26961 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26962 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26963 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26964 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26965 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26966 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26967 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26968 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26969 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26970 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26971 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26972 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
26973 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
26974 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
26975 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26976 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
26977 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
26978 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
26979 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26980 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
26981 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
26982
26983 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26984 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26985 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26986 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26987
26988 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26989 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26990 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26991 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26992
26993 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26994
26995 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26996 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26997 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26998 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
26999 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27000
27001 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27002 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27003 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27004
27005 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27006
27007 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27008 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27009 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27010
27011 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27012 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27013
27014 /* SSE MMX or 3DNow!A */
27015 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27016 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27017 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27018
27019 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27020 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27021 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27022 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27023
27024 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27025 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27026
27027 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27028
27029 /* SSE2 */
27030 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27031
27032 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27033 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27034 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27035 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27036 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27037
27038 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27039 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27040 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27041 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27042 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27043
27044 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27045
27046 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27047 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27048 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27049 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27050
27051 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27052 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27053 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27054
27055 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27056 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27057 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27058 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27059 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27060 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27061 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27062 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27063
27064 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27065 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27066 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27067 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27068 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27069 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27070 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27071 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27072 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27073 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27074 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27075 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27076 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27077 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27078 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27079 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27080 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27081 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27082 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27083 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27084
27085 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27086 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27087 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27088 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27089
27090 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27091 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27092 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27093 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27094
27095 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27096
27097 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27098 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27099 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27100
27101 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27102
27103 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27104 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27105 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27106 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27107 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27108 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27109 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27110 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27111
27112 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27115 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27117 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27119 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27120
27121 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27122 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27123
27124 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27125 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27126 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27127 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27128
27129 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27130 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27131
27132 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27133 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27134 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27135 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27136 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27137 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27138
27139 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27140 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27141 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27142 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27143
27144 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27145 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27146 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27147 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27148 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27149 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27150 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27151 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27152
27153 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27154 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27155 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27156
27157 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27158 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27159
27160 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27161 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27162
27163 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27164
27165 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27166 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27167 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27168 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
27169
27170 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27171 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27172 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27173 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27174 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27175 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27176 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27177
27178 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27179 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27180 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27181 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27182 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27183 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27184 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27185
27186 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27187 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27188 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27189 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27190
27191 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27192 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27193 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27194
27195 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27196
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27198
27199 /* SSE2 MMX */
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27202
27203 /* SSE3 */
27204 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27205 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27206
27207 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27208 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27209 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27210 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27211 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27212 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27213
27214 /* SSSE3 */
27215 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27216 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27217 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27218 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27219 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27220 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27221
27222 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27223 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27224 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27225 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27226 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27227 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27228 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27229 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27230 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27231 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27232 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27233 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27234 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27235 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27236 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27237 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27238 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27239 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27240 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27241 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27242 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27243 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27244 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27245 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27246
27247 /* SSSE3. */
27248 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27249 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27250
27251 /* SSE4.1 */
27252 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27253 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27254 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27255 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27256 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27257 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27258 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27259 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27260 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27261 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27262
27263 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27264 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27265 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27266 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27267 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27268 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27269 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27270 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27271 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27272 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27273 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27274 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27275 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27276
27277 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27278 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27279 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27280 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27281 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27282 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27283 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27284 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27285 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27286 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27287 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27288 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27289
27290 /* SSE4.1 */
27291 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27292 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27293 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27294 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27295
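/* For the floor/ceil/trunc/rint variants below, the comparison field is
   not a comparison at all: it carries the ROUND_* immediate (cast to
   enum rtx_code) that is supplied to the roundpd/roundps pattern when
   the builtin is expanded.  */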
27296 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27297 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27298 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27299 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27300
27301 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27302 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27303
27304 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27305 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27306
27307 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27308 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27309 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27310 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27311
27312 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27313 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27314
27315 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27316 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27317
27318 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27319 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27320 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27321
27322 /* SSE4.2 */
27323 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27324 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27325 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27326 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27327 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27328
27329 /* SSE4A */
27330 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27331 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27332 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27333 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27334
27335 /* AES */
27336 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27338
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27343
27344 /* PCLMUL */
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27346
27347 /* AVX */
27348 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27349 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27350 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27351 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27352 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27353 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27354 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27355 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27356 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27357 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27358 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27359 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27360 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27361 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27362 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27363 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27364 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27365 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27366 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27367 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27368 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27369 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27370 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27371 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27372 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27373 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27374
27375 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27376 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27377 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27378 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27379
27380 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27381 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27382 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27383 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27384 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27385 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27386 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27387 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27388 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27389 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27390 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27391 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27392 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27393 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27394 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27395 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27396 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27397 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27398 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27399 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27400 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27401 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27402 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27403 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27404 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27405 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27406 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27407 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27408 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27409 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27410 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27411 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27412 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27413 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27414
27415 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27416 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27417 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27418
27419 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27420 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27421 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27422 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27423 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27424
27425 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27426
27427 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27428 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27429
27430 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27431 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27432 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27433 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27434
27435 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27436 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27437
27438 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27439 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27440
27441 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27442 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27443 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27444 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27445
27446 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27447 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27448
27449 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27450 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27451
27452 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27453 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27454 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27455 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27456
27457 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27458 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27459 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27460 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27461 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27462 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27463
27464 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27465 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27466 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27467 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27468 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27469 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27470 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27471 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27472 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27473 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27474 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27475 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27476 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27477 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27478 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27479
27480 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27481 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27482
27483 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27484 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27485
27486   { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27487
27488 /* AVX2 */
27489 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27490 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27491 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27492 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27493 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27494 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27495 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27496 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27497 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27498 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27499 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27500 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27501 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27502 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27503 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27504 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27505 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27506 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27507 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27508 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27509 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27510 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27511 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27512 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27513 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27514 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27515 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27516 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27517 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27518 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27519 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27520 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27521 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27522 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27523 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27524 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27525 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27526 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27527 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27528 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27529 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27530 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27531 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27532 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27533 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27534 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27535 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27536 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27537 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27538 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27539 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27540 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27541 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27542 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27543 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27544 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27545 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27546 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27547 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27548 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27549 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27550 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27551 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27552 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27553 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27554 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27555 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27556 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27557 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27558 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27559 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27560 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27561 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27562 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27563 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27564 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27565 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27566 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27567 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27568 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27569 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27570 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27571 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27572 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27573 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27574 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27575 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27576 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27577 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27578 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27579 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27580 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27581 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27582 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27583 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27584 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27585 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27586 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27587 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27588 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27589 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27590 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27591 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27592 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27593 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27594 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27595 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27596 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27597 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27598 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27599 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27600 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27601 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27602 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27603 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27604 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27605 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27606 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27607 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27608 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27609 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27610 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27611 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27612 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27613 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27614 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27615 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27616 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27617 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27618 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27619 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27620 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27621 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27622 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27623 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27624 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27625 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27626 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27627 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27628 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27629 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27630 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27631 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27632 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27633 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27634 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27635
27636 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27637
27638 /* BMI */
27639 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27640 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27641 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27642
27643 /* TBM */
27644 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27645 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27646
27647 /* F16C */
27648 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27649 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27650 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27651 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27652
27653 /* BMI2 */
27654 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27655 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27656 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27657 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27658 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27659 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27660 };
27661
27662 /* FMA4 and XOP. */
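/* The MULTI_ARG_* macros below are shorthand for the function type
   codes used by the entries in bdesc_multi_arg; the suffix encodes the
   operand count and element modes (e.g. MULTI_ARG_3_SF is
   V4SF_FTYPE_V4SF_V4SF_V4SF).  They exist only to keep the table
   entries readable.  */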
27663 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27664 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27665 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27666 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27667 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27668 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27669 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27670 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27671 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27672 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27673 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27674 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27675 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27676 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27677 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27678 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27679 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27680 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27681 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27682 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27683 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27684 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27685 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27686 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27687 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27688 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27689 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27690 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27691 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27692 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27693 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27694 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27695 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27696 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27697 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27698 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27699 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27700 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27701 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27702 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27703 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27704 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27705 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27706 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27707 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27708 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27709 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27710 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27711 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27712 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27713 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27714 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
27715
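/* Builtins for FMA4, FMA and XOP instructions.  Each entry lists the
   ISA mask that must be enabled for the builtin to be defined, the
   insn code used to expand it, the user-visible __builtin_ia32_* name,
   its IX86_BUILTIN_* code, an rtx comparison code where one is needed
   (UNKNOWN otherwise), and one of the MULTI_ARG_* function types
   defined above.  */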
27716 static const struct builtin_description bdesc_multi_arg[] =
27717 {
27718 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27719 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27720 UNKNOWN, (int)MULTI_ARG_3_SF },
27721 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27722 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27723 UNKNOWN, (int)MULTI_ARG_3_DF },
27724
27725 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27726 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27727 UNKNOWN, (int)MULTI_ARG_3_SF },
27728 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27729 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27730 UNKNOWN, (int)MULTI_ARG_3_DF },
27731
27732 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27733 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27734 UNKNOWN, (int)MULTI_ARG_3_SF },
27735 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27736 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27737 UNKNOWN, (int)MULTI_ARG_3_DF },
27738 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27739 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27740 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27741 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27742 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27743 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27744
27745 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27746 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27747 UNKNOWN, (int)MULTI_ARG_3_SF },
27748 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27749 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27750 UNKNOWN, (int)MULTI_ARG_3_DF },
27751 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27752 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27753 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27754 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27755 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27756 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27757
27758 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27759 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27760 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27761 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27762 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27763 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27764 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27765
27766 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27767 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27768 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27769 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27770 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27771 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27772 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27773
27774 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27775
27776 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27777 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27778 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27779 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27780 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27781 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27782 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27783 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27784 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27785 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27786 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27787 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27788
27789 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27790 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27791 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27792 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27793 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27794 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27795 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27796 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27797 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27798 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27799 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27800 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27801 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27802 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27803 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27804 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27805
27806 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
27807 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
27808 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27809 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27810 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27811 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27812
27813 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27814 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27815 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27816 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27817 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27818 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27819 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27820 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27821 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27822 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27823 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27824 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27825 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27826 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27827 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27828
27829 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27830 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27831 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27832 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
27833 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
27834 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
27835 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
27836
27837 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
27838 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27839 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
27840 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
27841 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
27842 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
27843 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
27844
27845 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
27846 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27847 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
27848 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
27849 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
27850 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
27851 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
27852
27853 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27854 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27855 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
27856 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
27857 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
27858 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
27859 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
27860
27861 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
27862 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27863 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
27864 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
27865 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
27866 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
27867 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
27868
27869 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
27870 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27871 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
27872 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
27873 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
27874 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
27875 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
27876
27877 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
27878 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27879 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
27880 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
27881 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
27882 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
27883 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
27884
27885 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
27886 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27887 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
27888 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
27889 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
27890 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
27891 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
27892
27893 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27894 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27895 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27896 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27897 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
27898 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
27899 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
27900 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
27901
27902 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27903 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27904 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27905 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27906 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
27907 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
27908 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
27909 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
27910
27911 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
27912 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
27913 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
27914 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
27915
27916 };
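/* Editorial illustration -- not part of the original i386.c.  A minimal
   sketch of calling one of the XOP comparison builtins registered in the
   table above; the V16QI (V16QI, V16QI) signature is an assumption based
   on the MULTI_ARG_2_QI_CMP classification, and the snippet would need
   -mxop to be accepted.  */
#if 0
typedef char example_v16qi __attribute__ ((vector_size (16)));

static example_v16qi
example_byte_eq (example_v16qi a, example_v16qi b)
{
  /* Expands through CODE_FOR_xop_maskcmpv16qi3 with comparison code EQ.  */
  return __builtin_ia32_vpcomeqb (a, b);
}
#endif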
27917 \f
27918 /* TM vector builtins. */
27919
27920 /* Reuse the existing x86-specific `struct builtin_description' because
27921 we're lazy. Add casts to make them fit. */
27922 static const struct builtin_description bdesc_tm[] =
27923 {
27924 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27925 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27926 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
27927 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27928 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27929 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27930 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
27931
27932 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27933 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27934 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
27935 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27936 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27937 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27938 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
27939
27940 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27941 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27942 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
27943 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27944 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27945 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27946 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
27947
27948 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
27949 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
27950 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
27951 };
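/* Editorial note -- not part of the original i386.c.  These entries expose
   the x86 vector flavours of the libitm read/write barriers (e.g.
   __builtin__ITM_RM128 / __builtin__ITM_WM128), so that a vector load or
   store inside a __transaction_atomic region can be instrumented with a
   single wide barrier; ix86_builtin_tm_load and ix86_builtin_tm_store
   below pick the right one by vector size.  */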
27952
27953 /* TM callbacks. */
27954
27955 /* Return the builtin decl needed to load a vector of TYPE. */
27956
27957 static tree
27958 ix86_builtin_tm_load (tree type)
27959 {
27960 if (TREE_CODE (type) == VECTOR_TYPE)
27961 {
27962 switch (tree_low_cst (TYPE_SIZE (type), 1))
27963 {
27964 case 64:
27965 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
27966 case 128:
27967 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
27968 case 256:
27969 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
27970 }
27971 }
27972 return NULL_TREE;
27973 }
27974
27975 /* Return the builtin decl needed to store a vector of TYPE. */
27976
27977 static tree
27978 ix86_builtin_tm_store (tree type)
27979 {
27980 if (TREE_CODE (type) == VECTOR_TYPE)
27981 {
27982 switch (tree_low_cst (TYPE_SIZE (type), 1))
27983 {
27984 case 64:
27985 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
27986 case 128:
27987 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
27988 case 256:
27989 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
27990 }
27991 }
27992 return NULL_TREE;
27993 }
27994 \f
27995 /* Initialize the transactional memory vector load/store builtins. */
27996
27997 static void
27998 ix86_init_tm_builtins (void)
27999 {
28000 enum ix86_builtin_func_type ftype;
28001 const struct builtin_description *d;
28002 size_t i;
28003 tree decl;
28004 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28005 tree attrs_log, attrs_type_log;
28006
28007 if (!flag_tm)
28008 return;
28009
28010 /* If there are no builtins defined, we must be compiling in a
28011 language without trans-mem support. */
28012 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28013 return;
28014
28015 /* Use whatever attributes a normal TM load has. */
28016 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28017 attrs_load = DECL_ATTRIBUTES (decl);
28018 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28019 /* Use whatever attributes a normal TM store has. */
28020 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28021 attrs_store = DECL_ATTRIBUTES (decl);
28022 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28023 /* Use whatever attributes a normal TM log has. */
28024 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28025 attrs_log = DECL_ATTRIBUTES (decl);
28026 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28027
28028 for (i = 0, d = bdesc_tm;
28029 i < ARRAY_SIZE (bdesc_tm);
28030 i++, d++)
28031 {
28032 if ((d->mask & ix86_isa_flags) != 0
28033 || (lang_hooks.builtin_function
28034 == lang_hooks.builtin_function_ext_scope))
28035 {
28036 tree type, attrs, attrs_type;
28037 enum built_in_function code = (enum built_in_function) d->code;
28038
28039 ftype = (enum ix86_builtin_func_type) d->flag;
28040 type = ix86_get_builtin_func_type (ftype);
28041
28042 if (BUILTIN_TM_LOAD_P (code))
28043 {
28044 attrs = attrs_load;
28045 attrs_type = attrs_type_load;
28046 }
28047 else if (BUILTIN_TM_STORE_P (code))
28048 {
28049 attrs = attrs_store;
28050 attrs_type = attrs_type_store;
28051 }
28052 else
28053 {
28054 attrs = attrs_log;
28055 attrs_type = attrs_type_log;
28056 }
28057 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28058 /* The builtin without the prefix for
28059 calling it directly. */
28060 d->name + strlen ("__builtin_"),
28061 attrs);
28062 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
28063 set the TYPE_ATTRIBUTES. */
28064 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28065
28066 set_builtin_decl (code, decl, false);
28067 }
28068 }
28069 }
28070
28071 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
28072 not in the current target ISA, to allow the user to compile particular
28073 modules with target-specific options that differ from the command-line
28074 options. */
28075 static void
28076 ix86_init_mmx_sse_builtins (void)
28077 {
28078 const struct builtin_description * d;
28079 enum ix86_builtin_func_type ftype;
28080 size_t i;
28081
28082 /* Add all special builtins with variable number of operands. */
28083 for (i = 0, d = bdesc_special_args;
28084 i < ARRAY_SIZE (bdesc_special_args);
28085 i++, d++)
28086 {
28087 if (d->name == 0)
28088 continue;
28089
28090 ftype = (enum ix86_builtin_func_type) d->flag;
28091 def_builtin (d->mask, d->name, ftype, d->code);
28092 }
28093
28094 /* Add all builtins with variable number of operands. */
28095 for (i = 0, d = bdesc_args;
28096 i < ARRAY_SIZE (bdesc_args);
28097 i++, d++)
28098 {
28099 if (d->name == 0)
28100 continue;
28101
28102 ftype = (enum ix86_builtin_func_type) d->flag;
28103 def_builtin_const (d->mask, d->name, ftype, d->code);
28104 }
28105
28106 /* pcmpestr[im] insns. */
28107 for (i = 0, d = bdesc_pcmpestr;
28108 i < ARRAY_SIZE (bdesc_pcmpestr);
28109 i++, d++)
28110 {
28111 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28112 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28113 else
28114 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28115 def_builtin_const (d->mask, d->name, ftype, d->code);
28116 }
28117
28118 /* pcmpistr[im] insns. */
28119 for (i = 0, d = bdesc_pcmpistr;
28120 i < ARRAY_SIZE (bdesc_pcmpistr);
28121 i++, d++)
28122 {
28123 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28124 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28125 else
28126 ftype = INT_FTYPE_V16QI_V16QI_INT;
28127 def_builtin_const (d->mask, d->name, ftype, d->code);
28128 }
28129
28130 /* comi/ucomi insns. */
28131 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28132 {
28133 if (d->mask == OPTION_MASK_ISA_SSE2)
28134 ftype = INT_FTYPE_V2DF_V2DF;
28135 else
28136 ftype = INT_FTYPE_V4SF_V4SF;
28137 def_builtin_const (d->mask, d->name, ftype, d->code);
28138 }
28139
28140 /* SSE */
28141 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28142 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28143 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28144 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28145
28146 /* SSE or 3DNow!A */
28147 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28148 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28149 IX86_BUILTIN_MASKMOVQ);
28150
28151 /* SSE2 */
28152 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28153 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28154
28155 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28156 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28157 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28158 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28159
28160 /* SSE3. */
28161 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28162 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28163 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28164 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28165
28166 /* AES */
28167 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28168 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28169 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28170 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28171 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28172 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28173 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28174 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28175 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28176 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28177 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28178 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
28179
28180 /* PCLMUL */
28181 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28182 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28183
28184 /* RDRND */
28185 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28186 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28187 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28188 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28189 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28190 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28191 IX86_BUILTIN_RDRAND64_STEP);
28192
28193 /* AVX2 */
28194 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28195 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28196 IX86_BUILTIN_GATHERSIV2DF);
28197
28198 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28199 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28200 IX86_BUILTIN_GATHERSIV4DF);
28201
28202 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28203 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28204 IX86_BUILTIN_GATHERDIV2DF);
28205
28206 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28207 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28208 IX86_BUILTIN_GATHERDIV4DF);
28209
28210 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28211 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28212 IX86_BUILTIN_GATHERSIV4SF);
28213
28214 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28215 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28216 IX86_BUILTIN_GATHERSIV8SF);
28217
28218 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28219 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28220 IX86_BUILTIN_GATHERDIV4SF);
28221
28222 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28223 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28224 IX86_BUILTIN_GATHERDIV8SF);
28225
28226 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28227 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28228 IX86_BUILTIN_GATHERSIV2DI);
28229
28230 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28231 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28232 IX86_BUILTIN_GATHERSIV4DI);
28233
28234 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28235 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28236 IX86_BUILTIN_GATHERDIV2DI);
28237
28238 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28239 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28240 IX86_BUILTIN_GATHERDIV4DI);
28241
28242 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28243 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28244 IX86_BUILTIN_GATHERSIV4SI);
28245
28246 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28247 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28248 IX86_BUILTIN_GATHERSIV8SI);
28249
28250 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28251 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28252 IX86_BUILTIN_GATHERDIV4SI);
28253
28254 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28255 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28256 IX86_BUILTIN_GATHERDIV8SI);
28257
28258 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28259 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28260 IX86_BUILTIN_GATHERALTSIV4DF);
28261
28262 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28263 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28264 IX86_BUILTIN_GATHERALTDIV8SF);
28265
28266 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28267 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28268 IX86_BUILTIN_GATHERALTSIV4DI);
28269
28270 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28271 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28272 IX86_BUILTIN_GATHERALTDIV8SI);
28273
28274 /* RTM. */
28275 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28276 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28277
28278 /* MMX access to the vec_init patterns. */
28279 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28280 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28281
28282 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28283 V4HI_FTYPE_HI_HI_HI_HI,
28284 IX86_BUILTIN_VEC_INIT_V4HI);
28285
28286 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28287 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28288 IX86_BUILTIN_VEC_INIT_V8QI);
28289
28290 /* Access to the vec_extract patterns. */
28291 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28292 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28293 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28294 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28295 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28296 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28297 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28298 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28299 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28300 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28301
28302 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28303 "__builtin_ia32_vec_ext_v4hi",
28304 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28305
28306 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28307 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28308
28309 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28310 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28311
28312 /* Access to the vec_set patterns. */
28313 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28314 "__builtin_ia32_vec_set_v2di",
28315 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28316
28317 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28318 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28319
28320 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28321 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28322
28323 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28324 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28325
28326 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28327 "__builtin_ia32_vec_set_v4hi",
28328 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28329
28330 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28331 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28332
28333 /* RDSEED */
28334 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28335 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28336 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28337 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28338 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28339 "__builtin_ia32_rdseed_di_step",
28340 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28341
28342 /* ADCX */
28343 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28344 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28345 def_builtin (OPTION_MASK_ISA_64BIT,
28346 "__builtin_ia32_addcarryx_u64",
28347 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28348 IX86_BUILTIN_ADDCARRYX64);
28349
28350 /* Add the FMA4/XOP multi-arg builtin instructions. */
28351 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28352 {
28353 if (d->name == 0)
28354 continue;
28355
28356 ftype = (enum ix86_builtin_func_type) d->flag;
28357 def_builtin_const (d->mask, d->name, ftype, d->code);
28358 }
28359 }
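/* Editorial illustration -- not part of the original i386.c.  A minimal
   sketch of using one of the builtins defined above; per
   INT_FTYPE_PUNSIGNED, __builtin_ia32_rdrand32_step stores a random value
   through its pointer argument and returns nonzero on success.  Requires
   -mrdrnd.  */
#if 0
static unsigned int
example_rdrand (void)
{
  unsigned int value = 0;
  while (!__builtin_ia32_rdrand32_step (&value))
    ;				/* Retry until the hardware delivers a value.  */
  return value;
}
#endif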
28360
28361 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28362 to return a pointer to VERSION_DECL if the outcome of the expression
28363 formed by PREDICATE_CHAIN is true. This function will be called during
28364 version dispatch to decide which function version to execute. It returns
28365 the basic block at the end, to which more conditions can be added. */
28366
28367 static basic_block
28368 add_condition_to_bb (tree function_decl, tree version_decl,
28369 tree predicate_chain, basic_block new_bb)
28370 {
28371 gimple return_stmt;
28372 tree convert_expr, result_var;
28373 gimple convert_stmt;
28374 gimple call_cond_stmt;
28375 gimple if_else_stmt;
28376
28377 basic_block bb1, bb2, bb3;
28378 edge e12, e23;
28379
28380 tree cond_var, and_expr_var = NULL_TREE;
28381 gimple_seq gseq;
28382
28383 tree predicate_decl, predicate_arg;
28384
28385 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28386
28387 gcc_assert (new_bb != NULL);
28388 gseq = bb_seq (new_bb);
28389
28390
28391 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28392 build_fold_addr_expr (version_decl));
28393 result_var = create_tmp_var (ptr_type_node, NULL);
28394 convert_stmt = gimple_build_assign (result_var, convert_expr);
28395 return_stmt = gimple_build_return (result_var);
28396
28397 if (predicate_chain == NULL_TREE)
28398 {
28399 gimple_seq_add_stmt (&gseq, convert_stmt);
28400 gimple_seq_add_stmt (&gseq, return_stmt);
28401 set_bb_seq (new_bb, gseq);
28402 gimple_set_bb (convert_stmt, new_bb);
28403 gimple_set_bb (return_stmt, new_bb);
28404 pop_cfun ();
28405 return new_bb;
28406 }
28407
28408 while (predicate_chain != NULL)
28409 {
28410 cond_var = create_tmp_var (integer_type_node, NULL);
28411 predicate_decl = TREE_PURPOSE (predicate_chain);
28412 predicate_arg = TREE_VALUE (predicate_chain);
28413 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28414 gimple_call_set_lhs (call_cond_stmt, cond_var);
28415
28416 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28417 gimple_set_bb (call_cond_stmt, new_bb);
28418 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28419
28420 predicate_chain = TREE_CHAIN (predicate_chain);
28421
28422 if (and_expr_var == NULL)
28423 and_expr_var = cond_var;
28424 else
28425 {
28426 gimple assign_stmt;
28427 /* Use MIN_EXPR to check whether any integer is zero:
28428 and_expr_var = min_expr <cond_var, and_expr_var> */
28429 assign_stmt = gimple_build_assign (and_expr_var,
28430 build2 (MIN_EXPR, integer_type_node,
28431 cond_var, and_expr_var));
28432
28433 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28434 gimple_set_bb (assign_stmt, new_bb);
28435 gimple_seq_add_stmt (&gseq, assign_stmt);
28436 }
28437 }
28438
28439 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28440 integer_zero_node,
28441 NULL_TREE, NULL_TREE);
28442 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28443 gimple_set_bb (if_else_stmt, new_bb);
28444 gimple_seq_add_stmt (&gseq, if_else_stmt);
28445
28446 gimple_seq_add_stmt (&gseq, convert_stmt);
28447 gimple_seq_add_stmt (&gseq, return_stmt);
28448 set_bb_seq (new_bb, gseq);
28449
28450 bb1 = new_bb;
28451 e12 = split_block (bb1, if_else_stmt);
28452 bb2 = e12->dest;
28453 e12->flags &= ~EDGE_FALLTHRU;
28454 e12->flags |= EDGE_TRUE_VALUE;
28455
28456 e23 = split_block (bb2, return_stmt);
28457
28458 gimple_set_bb (convert_stmt, bb2);
28459 gimple_set_bb (return_stmt, bb2);
28460
28461 bb3 = e23->dest;
28462 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28463
28464 remove_edge (e23);
28465 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28466
28467 pop_cfun ();
28468
28469 return bb3;
28470 }
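/* Editorial illustration -- not part of the original i386.c.  For a version
   guarded by the predicate chain
     __builtin_cpu_is ("corei7"), __builtin_cpu_supports ("avx2")
   the statements built above correspond roughly to

     c = __builtin_cpu_is ("corei7");
     c = MIN (__builtin_cpu_supports ("avx2"), c);
     if (c > 0)
       return (void *) &<version_decl>;

   with the return placed in its own basic block and the false edge leading
   to the block returned for the next candidate version.  */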
28471
28472 /* This parses the attribute arguments to target in DECL and determines
28473 the right builtin to use to match the platform specification.
28474 It returns the priority value for this version decl. If PREDICATE_LIST
28475 is not NULL, it stores the list of cpu features that need to be checked
28476 before dispatching this function. */
28477
28478 static unsigned int
28479 get_builtin_code_for_version (tree decl, tree *predicate_list)
28480 {
28481 tree attrs;
28482 struct cl_target_option cur_target;
28483 tree target_node;
28484 struct cl_target_option *new_target;
28485 const char *arg_str = NULL;
28486 const char *attrs_str = NULL;
28487 char *tok_str = NULL;
28488 char *token;
28489
28490 /* Priority of i386 features, greater value is higher priority. This is
28491 used to decide the order in which function dispatch must happen. For
28492 instance, a version specialized for SSE4.2 should be checked for dispatch
28493 before a version for SSE3, as SSE4.2 implies SSE3. */
28494 enum feature_priority
28495 {
28496 P_ZERO = 0,
28497 P_MMX,
28498 P_SSE,
28499 P_SSE2,
28500 P_SSE3,
28501 P_SSSE3,
28502 P_PROC_SSSE3,
28503 P_SSE4_a,
28504 P_PROC_SSE4_a,
28505 P_SSE4_1,
28506 P_SSE4_2,
28507 P_PROC_SSE4_2,
28508 P_POPCNT,
28509 P_AVX,
28510 P_AVX2,
28511 P_FMA,
28512 P_PROC_FMA
28513 };
28514
28515 enum feature_priority priority = P_ZERO;
28516
28517 /* These are the target attribute strings for which a dispatcher is
28518 available, from fold_builtin_cpu. */
28519
28520 static struct _feature_list
28521 {
28522 const char *const name;
28523 const enum feature_priority priority;
28524 }
28525 const feature_list[] =
28526 {
28527 {"mmx", P_MMX},
28528 {"sse", P_SSE},
28529 {"sse2", P_SSE2},
28530 {"sse3", P_SSE3},
28531 {"ssse3", P_SSSE3},
28532 {"sse4.1", P_SSE4_1},
28533 {"sse4.2", P_SSE4_2},
28534 {"popcnt", P_POPCNT},
28535 {"avx", P_AVX},
28536 {"avx2", P_AVX2}
28537 };
28538
28539
28540 static unsigned int NUM_FEATURES
28541 = sizeof (feature_list) / sizeof (struct _feature_list);
28542
28543 unsigned int i;
28544
28545 tree predicate_chain = NULL_TREE;
28546 tree predicate_decl, predicate_arg;
28547
28548 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28549 gcc_assert (attrs != NULL);
28550
28551 attrs = TREE_VALUE (TREE_VALUE (attrs));
28552
28553 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28554 attrs_str = TREE_STRING_POINTER (attrs);
28555
28556
28557 /* Handle arch= if specified. For priority, set it to be 1 more than
28558 the best instruction set the processor can handle. For instance, if
28559 there is a version for atom and a version for ssse3 (the highest ISA
28560 priority for atom), the atom version must be checked for dispatch
28561 before the ssse3 version. */
28562 if (strstr (attrs_str, "arch=") != NULL)
28563 {
28564 cl_target_option_save (&cur_target, &global_options);
28565 target_node = ix86_valid_target_attribute_tree (attrs);
28566
28567 gcc_assert (target_node);
28568 new_target = TREE_TARGET_OPTION (target_node);
28569 gcc_assert (new_target);
28570
28571 if (new_target->arch_specified && new_target->arch > 0)
28572 {
28573 switch (new_target->arch)
28574 {
28575 case PROCESSOR_CORE2_32:
28576 case PROCESSOR_CORE2_64:
28577 arg_str = "core2";
28578 priority = P_PROC_SSSE3;
28579 break;
28580 case PROCESSOR_COREI7_32:
28581 case PROCESSOR_COREI7_64:
28582 arg_str = "corei7";
28583 priority = P_PROC_SSE4_2;
28584 break;
28585 case PROCESSOR_ATOM:
28586 arg_str = "atom";
28587 priority = P_PROC_SSSE3;
28588 break;
28589 case PROCESSOR_AMDFAM10:
28590 arg_str = "amdfam10h";
28591 priority = P_PROC_SSE4_a;
28592 break;
28593 case PROCESSOR_BDVER1:
28594 arg_str = "bdver1";
28595 priority = P_PROC_FMA;
28596 break;
28597 case PROCESSOR_BDVER2:
28598 arg_str = "bdver2";
28599 priority = P_PROC_FMA;
28600 break;
28601 }
28602 }
28603
28604 cl_target_option_restore (&global_options, &cur_target);
28605
28606 if (predicate_list && arg_str == NULL)
28607 {
28608 error_at (DECL_SOURCE_LOCATION (decl),
28609 "No dispatcher found for the versioning attributes");
28610 return 0;
28611 }
28612
28613 if (predicate_list)
28614 {
28615 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28616 /* For a C string literal the length includes the trailing NULL. */
28617 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28618 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28619 predicate_chain);
28620 }
28621 }
28622
28623 /* Process feature name. */
28624 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28625 strcpy (tok_str, attrs_str);
28626 token = strtok (tok_str, ",");
28627 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28628
28629 while (token != NULL)
28630 {
28631 /* Do not process "arch=". */
28632 if (strncmp (token, "arch=", 5) == 0)
28633 {
28634 token = strtok (NULL, ",");
28635 continue;
28636 }
28637 for (i = 0; i < NUM_FEATURES; ++i)
28638 {
28639 if (strcmp (token, feature_list[i].name) == 0)
28640 {
28641 if (predicate_list)
28642 {
28643 predicate_arg = build_string_literal (
28644 strlen (feature_list[i].name) + 1,
28645 feature_list[i].name);
28646 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28647 predicate_chain);
28648 }
28649 /* Find the maximum priority feature. */
28650 if (feature_list[i].priority > priority)
28651 priority = feature_list[i].priority;
28652
28653 break;
28654 }
28655 }
28656 if (predicate_list && i == NUM_FEATURES)
28657 {
28658 error_at (DECL_SOURCE_LOCATION (decl),
28659 "No dispatcher found for %s", token);
28660 return 0;
28661 }
28662 token = strtok (NULL, ",");
28663 }
28664 free (tok_str);
28665
28666 if (predicate_list && predicate_chain == NULL_TREE)
28667 {
28668 error_at (DECL_SOURCE_LOCATION (decl),
28669 "No dispatcher found for the versioning attributes : %s",
28670 attrs_str);
28671 return 0;
28672 }
28673 else if (predicate_list)
28674 {
28675 predicate_chain = nreverse (predicate_chain);
28676 *predicate_list = predicate_chain;
28677 }
28678
28679 return priority;
28680 }
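/* Editorial illustration -- not part of the original i386.c.  An attribute
   string of "arch=corei7" produces a predicate chain containing
   __builtin_cpu_is ("corei7") and priority P_PROC_SSE4_2, while
   "avx2,popcnt" produces __builtin_cpu_supports ("avx2") and
   __builtin_cpu_supports ("popcnt") with priority P_AVX2, the highest of
   the listed features.  */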
28681
28682 /* This compares the priority of target features in function DECL1
28683 and DECL2. It returns positive value if DECL1 is higher priority,
28684 negative value if DECL2 is higher priority and 0 if they are the
28685 same. */
28686
28687 static int
28688 ix86_compare_version_priority (tree decl1, tree decl2)
28689 {
28690 unsigned int priority1 = 0;
28691 unsigned int priority2 = 0;
28692
28693 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl1)) != NULL)
28694 priority1 = get_builtin_code_for_version (decl1, NULL);
28695
28696 if (lookup_attribute ("target", DECL_ATTRIBUTES (decl2)) != NULL)
28697 priority2 = get_builtin_code_for_version (decl2, NULL);
28698
28699 return (int)priority1 - (int)priority2;
28700 }
28701
28702 /* V1 and V2 point to function versions with different priorities
28703 based on the target ISA. This function compares their priorities. */
28704
28705 static int
28706 feature_compare (const void *v1, const void *v2)
28707 {
28708 typedef struct _function_version_info
28709 {
28710 tree version_decl;
28711 tree predicate_chain;
28712 unsigned int dispatch_priority;
28713 } function_version_info;
28714
28715 const function_version_info c1 = *(const function_version_info *)v1;
28716 const function_version_info c2 = *(const function_version_info *)v2;
28717 return (c2.dispatch_priority - c1.dispatch_priority);
28718 }
28719
28720 /* This function generates the dispatch function for
28721 multi-versioned functions. DISPATCH_DECL is the function which will
28722 contain the dispatch logic. FNDECLS are the function choices for
28723 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28724 in DISPATCH_DECL in which the dispatch code is generated. */
28725
28726 static int
28727 dispatch_function_versions (tree dispatch_decl,
28728 void *fndecls_p,
28729 basic_block *empty_bb)
28730 {
28731 tree default_decl;
28732 gimple ifunc_cpu_init_stmt;
28733 gimple_seq gseq;
28734 int ix;
28735 tree ele;
28736 vec<tree> *fndecls;
28737 unsigned int num_versions = 0;
28738 unsigned int actual_versions = 0;
28739 unsigned int i;
28740
28741 struct _function_version_info
28742 {
28743 tree version_decl;
28744 tree predicate_chain;
28745 unsigned int dispatch_priority;
28746 }*function_version_info;
28747
28748 gcc_assert (dispatch_decl != NULL
28749 && fndecls_p != NULL
28750 && empty_bb != NULL);
28751
28752 /* fndecls_p is actually a vector. */
28753 fndecls = static_cast<vec<tree> *> (fndecls_p);
28754
28755 /* At least one more version other than the default. */
28756 num_versions = fndecls->length ();
28757 gcc_assert (num_versions >= 2);
28758
28759 function_version_info = (struct _function_version_info *)
28760 XNEWVEC (struct _function_version_info, (num_versions - 1));
28761
28762 /* The first version in the vector is the default decl. */
28763 default_decl = (*fndecls)[0];
28764
28765 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28766
28767 gseq = bb_seq (*empty_bb);
28768 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28769 constructors, so explicitly call __builtin_cpu_init here. */
28770 ifunc_cpu_init_stmt = gimple_build_call_vec (
28771 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vec<tree>());
28772 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28773 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28774 set_bb_seq (*empty_bb, gseq);
28775
28776 pop_cfun ();
28777
28778
28779 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28780 {
28781 tree version_decl = ele;
28782 tree predicate_chain = NULL_TREE;
28783 unsigned int priority;
28784 /* Get attribute string, parse it and find the right predicate decl.
28785 The predicate function could be a lengthy combination of many
28786 features, like arch-type and various isa-variants. */
28787 priority = get_builtin_code_for_version (version_decl,
28788 &predicate_chain);
28789
28790 if (predicate_chain == NULL_TREE)
28791 continue;
28792
28793 actual_versions++;
28794 function_version_info [ix - 1].version_decl = version_decl;
28795 function_version_info [ix - 1].predicate_chain = predicate_chain;
28796 function_version_info [ix - 1].dispatch_priority = priority;
28797 }
28798
28799 /* Sort the versions according to descending order of dispatch priority. The
28800 priority is based on the ISA. This is not a perfect solution. There
28801 could still be ambiguity. If more than one function version is suitable
28802 to execute, which one should be dispatched? In future, allow the user
28803 to specify a dispatch priority next to the version. */
28804 qsort (function_version_info, actual_versions,
28805 sizeof (struct _function_version_info), feature_compare);
28806
28807 for (i = 0; i < actual_versions; ++i)
28808 *empty_bb = add_condition_to_bb (dispatch_decl,
28809 function_version_info[i].version_decl,
28810 function_version_info[i].predicate_chain,
28811 *empty_bb);
28812
28813 /* Dispatch the default version at the end. */
28814 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28815 NULL, *empty_bb);
28816
28817 free (function_version_info);
28818 return 0;
28819 }
28820
28821 /* This function returns true if FN1 and FN2 are versions of the same function,
28822 that is, the targets of the function decls are different. This assumes
28823 that FN1 and FN2 have the same signature. */
28824
28825 static bool
28826 ix86_function_versions (tree fn1, tree fn2)
28827 {
28828 tree attr1, attr2;
28829 struct cl_target_option *target1, *target2;
28830
28831 if (TREE_CODE (fn1) != FUNCTION_DECL
28832 || TREE_CODE (fn2) != FUNCTION_DECL)
28833 return false;
28834
28835 attr1 = DECL_FUNCTION_SPECIFIC_TARGET (fn1);
28836 attr2 = DECL_FUNCTION_SPECIFIC_TARGET (fn2);
28837
28838 /* At least one function decl should have the target attribute specified. */
28839 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
28840 return false;
28841
28842 if (attr1 == NULL_TREE)
28843 attr1 = target_option_default_node;
28844 else if (attr2 == NULL_TREE)
28845 attr2 = target_option_default_node;
28846
28847 target1 = TREE_TARGET_OPTION (attr1);
28848 target2 = TREE_TARGET_OPTION (attr2);
28849
28850 /* target1 and target2 must be different in some way. */
28851 if (target1->x_ix86_isa_flags == target2->x_ix86_isa_flags
28852 && target1->x_target_flags == target2->x_target_flags
28853 && target1->arch == target2->arch
28854 && target1->tune == target2->tune
28855 && target1->x_ix86_fpmath == target2->x_ix86_fpmath
28856 && target1->branch_cost == target2->branch_cost)
28857 return false;
28858
28859 return true;
28860 }
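/* Editorial illustration -- not part of the original i386.c.  Two
   declarations such as the following (C++ function multiversioning) are
   "versions" in the sense tested above: same signature, different target
   options.  */
#if 0
__attribute__ ((target ("default"))) int foo (void);
__attribute__ ((target ("avx2"))) int foo (void);
#endif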
28861
28862 /* Comparator function to be used in the qsort routine to sort attribute
28863 specification strings for "target". */
28864
28865 static int
28866 attr_strcmp (const void *v1, const void *v2)
28867 {
28868 const char *c1 = *(char *const*)v1;
28869 const char *c2 = *(char *const*)v2;
28870 return strcmp (c1, c2);
28871 }
28872
28873 /* STR is the argument to target attribute. This function tokenizes
28874 the comma separated arguments, sorts them and returns a string which
28875 is a unique identifier for the comma separated arguments. It also
28876 replaces non-identifier characters "=,-" with "_". */
28877
28878 static char *
28879 sorted_attr_string (const char *str)
28880 {
28881 char **args = NULL;
28882 char *attr_str, *ret_str;
28883 char *attr = NULL;
28884 unsigned int argnum = 1;
28885 unsigned int i;
28886
28887 for (i = 0; i < strlen (str); i++)
28888 if (str[i] == ',')
28889 argnum++;
28890
28891 attr_str = (char *)xmalloc (strlen (str) + 1);
28892 strcpy (attr_str, str);
28893
28894 /* Replace "=,-" with "_". */
28895 for (i = 0; i < strlen (attr_str); i++)
28896 if (attr_str[i] == '=' || attr_str[i]== '-')
28897 attr_str[i] = '_';
28898
28899 if (argnum == 1)
28900 return attr_str;
28901
28902 args = XNEWVEC (char *, argnum);
28903
28904 i = 0;
28905 attr = strtok (attr_str, ",");
28906 while (attr != NULL)
28907 {
28908 args[i] = attr;
28909 i++;
28910 attr = strtok (NULL, ",");
28911 }
28912
28913 qsort (args, argnum, sizeof (char*), attr_strcmp);
28914
28915 ret_str = (char *)xmalloc (strlen (str) + 1);
28916 strcpy (ret_str, args[0]);
28917 for (i = 1; i < argnum; i++)
28918 {
28919 strcat (ret_str, "_");
28920 strcat (ret_str, args[i]);
28921 }
28922
28923 free (args);
28924 free (attr_str);
28925 return ret_str;
28926 }
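/* Editorial illustration -- not part of the original i386.c.  For example,
   sorted_attr_string ("avx,arch=corei7") returns "arch_corei7_avx": the
   '=' is rewritten to '_', the comma-separated tokens are sorted, and the
   sorted tokens are re-joined with '_'.  */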
28927
28928 /* This function changes the assembler name for functions that are
28929 versions. If DECL is a function version and has a "target"
28930 attribute, it appends the attribute string to its assembler name. */
28931
28932 static tree
28933 ix86_mangle_function_version_assembler_name (tree decl, tree id)
28934 {
28935 tree version_attr;
28936 const char *orig_name, *version_string, *attr_str;
28937 char *assembler_name;
28938
28939 if (DECL_DECLARED_INLINE_P (decl)
28940 && lookup_attribute ("gnu_inline",
28941 DECL_ATTRIBUTES (decl)))
28942 error_at (DECL_SOURCE_LOCATION (decl),
28943 "Function versions cannot be marked as gnu_inline,"
28944 " bodies have to be generated");
28945
28946 if (DECL_VIRTUAL_P (decl)
28947 || DECL_VINDEX (decl))
28948 error_at (DECL_SOURCE_LOCATION (decl),
28949 "Virtual function versioning not supported\n");
28950
28951 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28952
28953 /* target attribute string is NULL for default functions. */
28954 if (version_attr == NULL_TREE)
28955 return id;
28956
28957 orig_name = IDENTIFIER_POINTER (id);
28958 version_string
28959 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
28960
28961 attr_str = sorted_attr_string (version_string);
28962 assembler_name = (char *) xmalloc (strlen (orig_name)
28963 + strlen (attr_str) + 2);
28964
28965 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
28966
28967 /* Allow assembler name to be modified if already set. */
28968 if (DECL_ASSEMBLER_NAME_SET_P (decl))
28969 SET_DECL_RTL (decl, NULL);
28970
28971 return get_identifier (assembler_name);
28972 }
28973
28974 static tree
28975 ix86_mangle_decl_assembler_name (tree decl, tree id)
28976 {
28977 /* For function version, add the target suffix to the assembler name. */
28978 if (TREE_CODE (decl) == FUNCTION_DECL
28979 && DECL_FUNCTION_VERSIONED (decl))
28980 id = ix86_mangle_function_version_assembler_name (decl, id);
28981 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
28982 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
28983 #endif
28984
28985 return id;
28986 }
28987
28988 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
28989 is true, append the full path name of the source file. */
28990
28991 static char *
28992 make_name (tree decl, const char *suffix, bool make_unique)
28993 {
28994 char *global_var_name;
28995 int name_len;
28996 const char *name;
28997 const char *unique_name = NULL;
28998
28999 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29000
29001 /* Get a unique name that can be used globally without any chances
29002 of collision at link time. */
29003 if (make_unique)
29004 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29005
29006 name_len = strlen (name) + strlen (suffix) + 2;
29007
29008 if (make_unique)
29009 name_len += strlen (unique_name) + 1;
29010 global_var_name = XNEWVEC (char, name_len);
29011
29012 /* Use '.' to concatenate names as it is demangler friendly. */
29013 if (make_unique)
29014 snprintf (global_var_name, name_len, "%s.%s.%s", name,
29015 unique_name, suffix);
29016 else
29017 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29018
29019 return global_var_name;
29020 }
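/* Editorial illustration -- not part of the original i386.c.  For a public
   function whose assembler name is "foo", make_name (decl, "ifunc", false)
   returns "foo.ifunc"; with MAKE_UNIQUE true a file-specific component is
   inserted as well, e.g. "foo.<unique>.resolver", where <unique> comes
   from get_file_function_name.  */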
29021
29022 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29023
29024 /* Make a dispatcher declaration for the multi-versioned function DECL.
29025 Calls to DECL function will be replaced with calls to the dispatcher
29026 by the front-end. Return the decl created. */
29027
29028 static tree
29029 make_dispatcher_decl (const tree decl)
29030 {
29031 tree func_decl;
29032 char *func_name, *resolver_name;
29033 tree fn_type, func_type;
29034 bool is_uniq = false;
29035
29036 if (TREE_PUBLIC (decl) == 0)
29037 is_uniq = true;
29038
29039 func_name = make_name (decl, "ifunc", is_uniq);
29040 resolver_name = make_name (decl, "resolver", is_uniq);
29041 gcc_assert (resolver_name);
29042
29043 fn_type = TREE_TYPE (decl);
29044 func_type = build_function_type (TREE_TYPE (fn_type),
29045 TYPE_ARG_TYPES (fn_type));
29046
29047 func_decl = build_fn_decl (func_name, func_type);
29048 TREE_USED (func_decl) = 1;
29049 DECL_CONTEXT (func_decl) = NULL_TREE;
29050 DECL_INITIAL (func_decl) = error_mark_node;
29051 DECL_ARTIFICIAL (func_decl) = 1;
29052   /* Mark this func as external; the resolver will flip it again if
29053 it gets generated. */
29054 DECL_EXTERNAL (func_decl) = 1;
29055   /* IFUNCs have to be externally visible. */
29056 TREE_PUBLIC (func_decl) = 1;
29057
29058 return func_decl;
29059 }
29060
29061 #endif
29062
29063 /* Returns true if DECL is multi-versioned and is the default function,
29064    that is, it is not tagged with a target-specific optimization attribute. */
29065
29066 static bool
29067 is_function_default_version (const tree decl)
29068 {
29069 return (TREE_CODE (decl) == FUNCTION_DECL
29070 && DECL_FUNCTION_VERSIONED (decl)
29071 && DECL_FUNCTION_SPECIFIC_TARGET (decl) == NULL_TREE);
29072 }
29073
29074 /* Make a dispatcher declaration for the multi-versioned function DECL.
29075    Calls to the DECL function will be replaced with calls to the dispatcher
29076 by the front-end. Returns the decl of the dispatcher function. */
29077
29078 static tree
29079 ix86_get_function_versions_dispatcher (void *decl)
29080 {
29081 tree fn = (tree) decl;
29082 struct cgraph_node *node = NULL;
29083 struct cgraph_node *default_node = NULL;
29084 struct cgraph_function_version_info *node_v = NULL;
29085 struct cgraph_function_version_info *first_v = NULL;
29086
29087 tree dispatch_decl = NULL;
29088
29089 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29090 struct cgraph_function_version_info *it_v = NULL;
29091 struct cgraph_node *dispatcher_node = NULL;
29092 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29093 #endif
29094
29095 struct cgraph_function_version_info *default_version_info = NULL;
29096
29097 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29098
29099 node = cgraph_get_node (fn);
29100 gcc_assert (node != NULL);
29101
29102 node_v = get_cgraph_node_version (node);
29103 gcc_assert (node_v != NULL);
29104
29105 if (node_v->dispatcher_resolver != NULL)
29106 return node_v->dispatcher_resolver;
29107
29108 /* Find the default version and make it the first node. */
29109 first_v = node_v;
29110   /* Go to the beginning of the chain. */
29111 while (first_v->prev != NULL)
29112 first_v = first_v->prev;
29113 default_version_info = first_v;
29114 while (default_version_info != NULL)
29115 {
29116 if (is_function_default_version
29117 (default_version_info->this_node->symbol.decl))
29118 break;
29119 default_version_info = default_version_info->next;
29120 }
29121
29122 /* If there is no default node, just return NULL. */
29123 if (default_version_info == NULL)
29124 return NULL;
29125
29126 /* Make default info the first node. */
29127 if (first_v != default_version_info)
29128 {
29129 default_version_info->prev->next = default_version_info->next;
29130 if (default_version_info->next)
29131 default_version_info->next->prev = default_version_info->prev;
29132 first_v->prev = default_version_info;
29133 default_version_info->next = first_v;
29134 default_version_info->prev = NULL;
29135 }
29136
29137 default_node = default_version_info->this_node;
29138
29139 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29140 /* Right now, the dispatching is done via ifunc. */
29141 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29142
29143 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29144 gcc_assert (dispatcher_node != NULL);
29145 dispatcher_node->dispatcher_function = 1;
29146 dispatcher_version_info
29147 = insert_new_cgraph_node_version (dispatcher_node);
29148 dispatcher_version_info->next = default_version_info;
29149 dispatcher_node->local.finalized = 1;
29150
29151 /* Set the dispatcher for all the versions. */
29152 it_v = default_version_info;
29153 while (it_v->next != NULL)
29154 {
29155 it_v->dispatcher_resolver = dispatch_decl;
29156 it_v = it_v->next;
29157 }
29158 #else
29159 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29160 "multiversioning needs ifunc which is not supported "
29161 "in this configuration");
29162 #endif
29163 return dispatch_decl;
29164 }
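
/* Illustrative source-level input (C++ front end, assuming ifunc support)
   for which the dispatcher above gets created:

     __attribute__ ((target ("default"))) int foo (void) { return 0; }
     __attribute__ ((target ("sse4.2"))) int foo (void) { return 1; }
     __attribute__ ((target ("avx2")))   int foo (void) { return 2; }

     int call_foo (void) { return foo (); }

   The call in call_foo is redirected to the ifunc dispatcher, which picks
   a version at load time via the resolver built below.  */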
29165
29166 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29167 it to CHAIN. */
29168
29169 static tree
29170 make_attribute (const char *name, const char *arg_name, tree chain)
29171 {
29172 tree attr_name;
29173 tree attr_arg_name;
29174 tree attr_args;
29175 tree attr;
29176
29177 attr_name = get_identifier (name);
29178 attr_arg_name = build_string (strlen (arg_name), arg_name);
29179 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29180 attr = tree_cons (attr_name, attr_args, chain);
29181 return attr;
29182 }
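
/* For example (illustrative): make_attribute ("ifunc", "foo.resolver",
   NULL_TREE) builds a tree equivalent to the source-level attribute
   __attribute__ ((ifunc ("foo.resolver"))).  */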
29183
29184 /* Make the resolver function decl to dispatch the versions of
29185 a multi-versioned function, DEFAULT_DECL. Create an
29186 empty basic block in the resolver and store the pointer in
29187 EMPTY_BB. Return the decl of the resolver function. */
29188
29189 static tree
29190 make_resolver_func (const tree default_decl,
29191 const tree dispatch_decl,
29192 basic_block *empty_bb)
29193 {
29194 char *resolver_name;
29195 tree decl, type, decl_name, t;
29196 bool is_uniq = false;
29197
29198   /* IFUNCs have to be globally visible. So, if the default_decl is
29199      not, then the name of the IFUNC should be made unique. */
29200 if (TREE_PUBLIC (default_decl) == 0)
29201 is_uniq = true;
29202
29203 /* Append the filename to the resolver function if the versions are
29204 not externally visible. This is because the resolver function has
29205 to be externally visible for the loader to find it. So, appending
29206 the filename will prevent conflicts with a resolver function from
29207 another module which is based on the same version name. */
29208 resolver_name = make_name (default_decl, "resolver", is_uniq);
29209
29210 /* The resolver function should return a (void *). */
29211 type = build_function_type_list (ptr_type_node, NULL_TREE);
29212
29213 decl = build_fn_decl (resolver_name, type);
29214 decl_name = get_identifier (resolver_name);
29215 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29216
29217 DECL_NAME (decl) = decl_name;
29218 TREE_USED (decl) = 1;
29219 DECL_ARTIFICIAL (decl) = 1;
29220 DECL_IGNORED_P (decl) = 0;
29221 /* IFUNC resolvers have to be externally visible. */
29222 TREE_PUBLIC (decl) = 1;
29223 DECL_UNINLINABLE (decl) = 0;
29224
29225 /* Resolver is not external, body is generated. */
29226 DECL_EXTERNAL (decl) = 0;
29227 DECL_EXTERNAL (dispatch_decl) = 0;
29228
29229 DECL_CONTEXT (decl) = NULL_TREE;
29230 DECL_INITIAL (decl) = make_node (BLOCK);
29231 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29232
29233 if (DECL_COMDAT_GROUP (default_decl)
29234 || TREE_PUBLIC (default_decl))
29235 {
29236 /* In this case, each translation unit with a call to this
29237 versioned function will put out a resolver. Ensure it
29238 is comdat to keep just one copy. */
29239 DECL_COMDAT (decl) = 1;
29240 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29241 }
29242 /* Build result decl and add to function_decl. */
29243 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29244 DECL_ARTIFICIAL (t) = 1;
29245 DECL_IGNORED_P (t) = 1;
29246 DECL_RESULT (decl) = t;
29247
29248 gimplify_function_tree (decl);
29249 push_cfun (DECL_STRUCT_FUNCTION (decl));
29250 *empty_bb = init_lowered_empty_function (decl, false);
29251
29252 cgraph_add_new_function (decl, true);
29253 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29254
29255 pop_cfun ();
29256
29257 gcc_assert (dispatch_decl != NULL);
29258 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29259 DECL_ATTRIBUTES (dispatch_decl)
29260 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29261
29262 /* Create the alias for dispatch to resolver here. */
29263 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29264 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29265 return decl;
29266 }
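
/* Illustrative result (assuming GNU as/ld with ifunc support): the
   dispatcher "foo.ifunc" is emitted roughly as

       .type foo.ifunc, @gnu_indirect_function
       .set  foo.ifunc, foo.resolver

   and "foo.resolver" is the function created above, whose body is later
   filled in by dispatch_function_versions to return the address of the
   version selected at load time.  */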
29267
29268 /* Generate the dispatching code body to dispatch multi-versioned function
29269 DECL. The target hook is called to process the "target" attributes and
29270 provide the code to dispatch the right function at run-time. NODE points
29271 to the dispatcher decl whose body will be created. */
29272
29273 static tree
29274 ix86_generate_version_dispatcher_body (void *node_p)
29275 {
29276 tree resolver_decl;
29277 basic_block empty_bb;
29278 vec<tree> fn_ver_vec = vec<tree>();
29279 tree default_ver_decl;
29280 struct cgraph_node *versn;
29281 struct cgraph_node *node;
29282
29283 struct cgraph_function_version_info *node_version_info = NULL;
29284 struct cgraph_function_version_info *versn_info = NULL;
29285
29286 node = (cgraph_node *)node_p;
29287
29288 node_version_info = get_cgraph_node_version (node);
29289 gcc_assert (node->dispatcher_function
29290 && node_version_info != NULL);
29291
29292 if (node_version_info->dispatcher_resolver)
29293 return node_version_info->dispatcher_resolver;
29294
29295 /* The first version in the chain corresponds to the default version. */
29296 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29297
29298 /* node is going to be an alias, so remove the finalized bit. */
29299 node->local.finalized = false;
29300
29301 resolver_decl = make_resolver_func (default_ver_decl,
29302 node->symbol.decl, &empty_bb);
29303
29304 node_version_info->dispatcher_resolver = resolver_decl;
29305
29306 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29307
29308 fn_ver_vec.create (2);
29309
29310 for (versn_info = node_version_info->next; versn_info;
29311 versn_info = versn_info->next)
29312 {
29313 versn = versn_info->this_node;
29314 /* Check for virtual functions here again, as by this time it should
29315 have been determined if this function needs a vtable index or
29316 not. This happens for methods in derived classes that override
29317 virtual methods in base classes but are not explicitly marked as
29318 virtual. */
29319 if (DECL_VINDEX (versn->symbol.decl))
29320 error_at (DECL_SOURCE_LOCATION (versn->symbol.decl),
29321 "Virtual function multiversioning not supported");
29322 fn_ver_vec.safe_push (versn->symbol.decl);
29323 }
29324
29325 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29326
29327 rebuild_cgraph_edges ();
29328 pop_cfun ();
29329 return resolver_decl;
29330 }
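
/* Conceptually (illustrative only), the resolver body produced through
   dispatch_function_versions looks like:

     void *foo.resolver (void)
     {
       __builtin_cpu_init ();
       if (<checks derived from the "avx2" version's target attribute>)
         return foo.avx2;
       if (<checks derived from the "sse4.2" version's target attribute>)
         return foo.sse4.2;
       return foo;        (the default version)
     }
*/
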
29331 /* This builds the processor_model struct type defined in
29332 libgcc/config/i386/cpuinfo.c */
29333
29334 static tree
29335 build_processor_model_struct (void)
29336 {
29337 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29338 "__cpu_features"};
29339 tree field = NULL_TREE, field_chain = NULL_TREE;
29340 int i;
29341 tree type = make_node (RECORD_TYPE);
29342
29343 /* The first 3 fields are unsigned int. */
29344 for (i = 0; i < 3; ++i)
29345 {
29346 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29347 get_identifier (field_name[i]), unsigned_type_node);
29348 if (field_chain != NULL_TREE)
29349 DECL_CHAIN (field) = field_chain;
29350 field_chain = field;
29351 }
29352
29353 /* The last field is an array of unsigned integers of size one. */
29354 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29355 get_identifier (field_name[3]),
29356 build_array_type (unsigned_type_node,
29357 build_index_type (size_one_node)));
29358 if (field_chain != NULL_TREE)
29359 DECL_CHAIN (field) = field_chain;
29360 field_chain = field;
29361
29362 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29363 return type;
29364 }
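
/* The layout built above mirrors the declaration in
   libgcc/config/i386/cpuinfo.c:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };  */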
29365
29366 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29367
29368 static tree
29369 make_var_decl (tree type, const char *name)
29370 {
29371 tree new_decl;
29372
29373 new_decl = build_decl (UNKNOWN_LOCATION,
29374 VAR_DECL,
29375 get_identifier(name),
29376 type);
29377
29378 DECL_EXTERNAL (new_decl) = 1;
29379 TREE_STATIC (new_decl) = 1;
29380 TREE_PUBLIC (new_decl) = 1;
29381 DECL_INITIAL (new_decl) = 0;
29382 DECL_ARTIFICIAL (new_decl) = 0;
29383 DECL_PRESERVE_P (new_decl) = 1;
29384
29385 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29386 assemble_variable (new_decl, 0, 0, 0);
29387
29388 return new_decl;
29389 }
29390
29391 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is
29392    folded into a check against __cpu_model from libgcc/config/i386/cpuinfo.c. */
29393
29394 static tree
29395 fold_builtin_cpu (tree fndecl, tree *args)
29396 {
29397 unsigned int i;
29398 enum ix86_builtins fn_code = (enum ix86_builtins)
29399 DECL_FUNCTION_CODE (fndecl);
29400 tree param_string_cst = NULL;
29401
29402 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29403 enum processor_features
29404 {
29405 F_CMOV = 0,
29406 F_MMX,
29407 F_POPCNT,
29408 F_SSE,
29409 F_SSE2,
29410 F_SSE3,
29411 F_SSSE3,
29412 F_SSE4_1,
29413 F_SSE4_2,
29414 F_AVX,
29415 F_AVX2,
29416 F_MAX
29417 };
29418
29419   /* These are the values for vendor types and cpu types and subtypes
29420      in cpuinfo.c.  The corresponding start value should be subtracted
29421      from cpu types and subtypes. */
29422 enum processor_model
29423 {
29424 M_INTEL = 1,
29425 M_AMD,
29426 M_CPU_TYPE_START,
29427 M_INTEL_ATOM,
29428 M_INTEL_CORE2,
29429 M_INTEL_COREI7,
29430 M_AMDFAM10H,
29431 M_AMDFAM15H,
29432 M_CPU_SUBTYPE_START,
29433 M_INTEL_COREI7_NEHALEM,
29434 M_INTEL_COREI7_WESTMERE,
29435 M_INTEL_COREI7_SANDYBRIDGE,
29436 M_AMDFAM10H_BARCELONA,
29437 M_AMDFAM10H_SHANGHAI,
29438 M_AMDFAM10H_ISTANBUL,
29439 M_AMDFAM15H_BDVER1,
29440 M_AMDFAM15H_BDVER2,
29441 M_AMDFAM15H_BDVER3
29442 };
29443
29444 static struct _arch_names_table
29445 {
29446 const char *const name;
29447 const enum processor_model model;
29448 }
29449 const arch_names_table[] =
29450 {
29451 {"amd", M_AMD},
29452 {"intel", M_INTEL},
29453 {"atom", M_INTEL_ATOM},
29454 {"core2", M_INTEL_CORE2},
29455 {"corei7", M_INTEL_COREI7},
29456 {"nehalem", M_INTEL_COREI7_NEHALEM},
29457 {"westmere", M_INTEL_COREI7_WESTMERE},
29458 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29459 {"amdfam10h", M_AMDFAM10H},
29460 {"barcelona", M_AMDFAM10H_BARCELONA},
29461 {"shanghai", M_AMDFAM10H_SHANGHAI},
29462 {"istanbul", M_AMDFAM10H_ISTANBUL},
29463 {"amdfam15h", M_AMDFAM15H},
29464 {"bdver1", M_AMDFAM15H_BDVER1},
29465 {"bdver2", M_AMDFAM15H_BDVER2},
29466 {"bdver3", M_AMDFAM15H_BDVER3},
29467 };
29468
29469 static struct _isa_names_table
29470 {
29471 const char *const name;
29472 const enum processor_features feature;
29473 }
29474 const isa_names_table[] =
29475 {
29476 {"cmov", F_CMOV},
29477 {"mmx", F_MMX},
29478 {"popcnt", F_POPCNT},
29479 {"sse", F_SSE},
29480 {"sse2", F_SSE2},
29481 {"sse3", F_SSE3},
29482 {"ssse3", F_SSSE3},
29483 {"sse4.1", F_SSE4_1},
29484 {"sse4.2", F_SSE4_2},
29485 {"avx", F_AVX},
29486 {"avx2", F_AVX2}
29487 };
29488
29489 static tree __processor_model_type = NULL_TREE;
29490 static tree __cpu_model_var = NULL_TREE;
29491
29492 if (__processor_model_type == NULL_TREE)
29493 __processor_model_type = build_processor_model_struct ();
29494
29495 if (__cpu_model_var == NULL_TREE)
29496 __cpu_model_var = make_var_decl (__processor_model_type,
29497 "__cpu_model");
29498
29499 gcc_assert ((args != NULL) && (*args != NULL));
29500
29501 param_string_cst = *args;
29502 while (param_string_cst
29503 && TREE_CODE (param_string_cst) != STRING_CST)
29504 {
29505       /* *args must be an expr that can contain other EXPRs leading to a
29506 	 STRING_CST.  */
29507 if (!EXPR_P (param_string_cst))
29508 {
29509 error ("Parameter to builtin must be a string constant or literal");
29510 return integer_zero_node;
29511 }
29512 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29513 }
29514
29515 gcc_assert (param_string_cst);
29516
29517 if (fn_code == IX86_BUILTIN_CPU_IS)
29518 {
29519 tree ref;
29520 tree field;
29521 tree final;
29522
29523 unsigned int field_val = 0;
29524 unsigned int NUM_ARCH_NAMES
29525 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29526
29527 for (i = 0; i < NUM_ARCH_NAMES; i++)
29528 if (strcmp (arch_names_table[i].name,
29529 TREE_STRING_POINTER (param_string_cst)) == 0)
29530 break;
29531
29532 if (i == NUM_ARCH_NAMES)
29533 {
29534 error ("Parameter to builtin not valid: %s",
29535 TREE_STRING_POINTER (param_string_cst));
29536 return integer_zero_node;
29537 }
29538
29539 field = TYPE_FIELDS (__processor_model_type);
29540 field_val = arch_names_table[i].model;
29541
29542 /* CPU types are stored in the next field. */
29543 if (field_val > M_CPU_TYPE_START
29544 && field_val < M_CPU_SUBTYPE_START)
29545 {
29546 field = DECL_CHAIN (field);
29547 field_val -= M_CPU_TYPE_START;
29548 }
29549
29550 /* CPU subtypes are stored in the next field. */
29551 if (field_val > M_CPU_SUBTYPE_START)
29552 {
29553 	  field = DECL_CHAIN (DECL_CHAIN (field));
29554 field_val -= M_CPU_SUBTYPE_START;
29555 }
29556
29557 /* Get the appropriate field in __cpu_model. */
29558 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29559 field, NULL_TREE);
29560
29561 /* Check the value. */
29562 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29563 build_int_cstu (unsigned_type_node, field_val));
29564 return build1 (CONVERT_EXPR, integer_type_node, final);
29565 }
29566 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29567 {
29568 tree ref;
29569 tree array_elt;
29570 tree field;
29571 tree final;
29572
29573 unsigned int field_val = 0;
29574 unsigned int NUM_ISA_NAMES
29575 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29576
29577 for (i = 0; i < NUM_ISA_NAMES; i++)
29578 if (strcmp (isa_names_table[i].name,
29579 TREE_STRING_POINTER (param_string_cst)) == 0)
29580 break;
29581
29582 if (i == NUM_ISA_NAMES)
29583 {
29584 error ("Parameter to builtin not valid: %s",
29585 TREE_STRING_POINTER (param_string_cst));
29586 return integer_zero_node;
29587 }
29588
29589 field = TYPE_FIELDS (__processor_model_type);
29590 /* Get the last field, which is __cpu_features. */
29591 while (DECL_CHAIN (field))
29592 field = DECL_CHAIN (field);
29593
29594 /* Get the appropriate field: __cpu_model.__cpu_features */
29595 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29596 field, NULL_TREE);
29597
29598 /* Access the 0th element of __cpu_features array. */
29599 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29600 integer_zero_node, NULL_TREE, NULL_TREE);
29601
29602 field_val = (1 << isa_names_table[i].feature);
29603 /* Return __cpu_model.__cpu_features[0] & field_val */
29604 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29605 build_int_cstu (unsigned_type_node, field_val));
29606 return build1 (CONVERT_EXPR, integer_type_node, final);
29607 }
29608 gcc_unreachable ();
29609 }
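
/* For example (illustrative), __builtin_cpu_is ("amd") folds to

     (int) (__cpu_model.__cpu_vendor == M_AMD)

   and __builtin_cpu_supports ("sse4.2") folds to

     (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))

   where __cpu_model is the libgcc variable referenced above.  */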
29610
29611 static tree
29612 ix86_fold_builtin (tree fndecl, int n_args,
29613 tree *args, bool ignore ATTRIBUTE_UNUSED)
29614 {
29615 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29616 {
29617 enum ix86_builtins fn_code = (enum ix86_builtins)
29618 DECL_FUNCTION_CODE (fndecl);
29619 if (fn_code == IX86_BUILTIN_CPU_IS
29620 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29621 {
29622 gcc_assert (n_args == 1);
29623 return fold_builtin_cpu (fndecl, args);
29624 }
29625 }
29626
29627 #ifdef SUBTARGET_FOLD_BUILTIN
29628 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29629 #endif
29630
29631 return NULL_TREE;
29632 }
29633
29634 /* Make builtins to detect cpu type and features supported. NAME is
29635 the builtin name, CODE is the builtin code, and FTYPE is the function
29636 type of the builtin. */
29637
29638 static void
29639 make_cpu_type_builtin (const char* name, int code,
29640 enum ix86_builtin_func_type ftype, bool is_const)
29641 {
29642 tree decl;
29643 tree type;
29644
29645 type = ix86_get_builtin_func_type (ftype);
29646 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29647 NULL, NULL_TREE);
29648 gcc_assert (decl != NULL_TREE);
29649 ix86_builtins[(int) code] = decl;
29650 TREE_READONLY (decl) = is_const;
29651 }
29652
29653 /* Make builtins to get CPU type and features supported. The created
29654    builtins are:
29655
29656 __builtin_cpu_init (), to detect cpu type and features,
29657 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29658 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29659 */
29660
29661 static void
29662 ix86_init_platform_type_builtins (void)
29663 {
29664 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29665 INT_FTYPE_VOID, false);
29666 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29667 INT_FTYPE_PCCHAR, true);
29668 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29669 INT_FTYPE_PCCHAR, true);
29670 }
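
/* Illustrative use of the builtins created above:

     if (__builtin_cpu_supports ("avx2"))
       do_avx2_version ();
     else if (__builtin_cpu_is ("corei7"))
       do_corei7_version ();

   (do_avx2_version and do_corei7_version are hypothetical.)  Code that can
   run before the libgcc constructor has initialized __cpu_model (an ifunc
   resolver or another constructor, for instance) should call
   __builtin_cpu_init () first.  */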
29671
29672 /* Internal method for ix86_init_builtins. */
29673
29674 static void
29675 ix86_init_builtins_va_builtins_abi (void)
29676 {
29677 tree ms_va_ref, sysv_va_ref;
29678 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29679 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29680 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29681 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29682
29683 if (!TARGET_64BIT)
29684 return;
29685 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29686 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29687 ms_va_ref = build_reference_type (ms_va_list_type_node);
29688 sysv_va_ref =
29689 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29690
29691 fnvoid_va_end_ms =
29692 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29693 fnvoid_va_start_ms =
29694 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29695 fnvoid_va_end_sysv =
29696 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29697 fnvoid_va_start_sysv =
29698 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29699 NULL_TREE);
29700 fnvoid_va_copy_ms =
29701 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29702 NULL_TREE);
29703 fnvoid_va_copy_sysv =
29704 build_function_type_list (void_type_node, sysv_va_ref,
29705 sysv_va_ref, NULL_TREE);
29706
29707 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29708 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29709 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29710 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29711 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29712 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29713 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29714 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29715 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29716 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29717 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29718 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29719 }
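
/* Illustrative use of the ABI-specific varargs builtins registered above
   (64-bit only); the __builtin_ms_va_list type name is assumed here:

     __attribute__ ((ms_abi)) void f (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ...
       __builtin_ms_va_end (ap);
     }
*/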
29720
29721 static void
29722 ix86_init_builtin_types (void)
29723 {
29724 tree float128_type_node, float80_type_node;
29725
29726 /* The __float80 type. */
29727 float80_type_node = long_double_type_node;
29728 if (TYPE_MODE (float80_type_node) != XFmode)
29729 {
29730 /* The __float80 type. */
29731 float80_type_node = make_node (REAL_TYPE);
29732
29733 TYPE_PRECISION (float80_type_node) = 80;
29734 layout_type (float80_type_node);
29735 }
29736 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29737
29738 /* The __float128 type. */
29739 float128_type_node = make_node (REAL_TYPE);
29740 TYPE_PRECISION (float128_type_node) = 128;
29741 layout_type (float128_type_node);
29742 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29743
29744 /* This macro is built by i386-builtin-types.awk. */
29745 DEFINE_BUILTIN_PRIMITIVE_TYPES;
29746 }
29747
29748 static void
29749 ix86_init_builtins (void)
29750 {
29751 tree t;
29752
29753 ix86_init_builtin_types ();
29754
29755 /* Builtins to get CPU type and features. */
29756 ix86_init_platform_type_builtins ();
29757
29758 /* TFmode support builtins. */
29759 def_builtin_const (0, "__builtin_infq",
29760 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29761 def_builtin_const (0, "__builtin_huge_valq",
29762 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29763
29764   /* We will expand them to a normal call if SSE isn't available, since
29765      they are used by libgcc. */
29766 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29767 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29768 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29769 TREE_READONLY (t) = 1;
29770 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29771
29772 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29773 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29774 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29775 TREE_READONLY (t) = 1;
29776 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29777
29778 ix86_init_tm_builtins ();
29779 ix86_init_mmx_sse_builtins ();
29780
29781 if (TARGET_LP64)
29782 ix86_init_builtins_va_builtins_abi ();
29783
29784 #ifdef SUBTARGET_INIT_BUILTINS
29785 SUBTARGET_INIT_BUILTINS;
29786 #endif
29787 }
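
/* Illustrative use of the TFmode builtins registered above:

     __float128 x = __builtin_fabsq (y);
     __float128 z = __builtin_copysignq (a, b);

   When SSE is not available these expand to calls to __fabstf2 and
   __copysigntf3 in libgcc, per the comment above.  */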
29788
29789 /* Return the ix86 builtin for CODE. */
29790
29791 static tree
29792 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
29793 {
29794 if (code >= IX86_BUILTIN_MAX)
29795 return error_mark_node;
29796
29797 return ix86_builtins[code];
29798 }
29799
29800 /* Errors in the source file can cause expand_expr to return const0_rtx
29801 where we expect a vector. To avoid crashing, use one of the vector
29802 clear instructions. */
29803 static rtx
29804 safe_vector_operand (rtx x, enum machine_mode mode)
29805 {
29806 if (x == const0_rtx)
29807 x = CONST0_RTX (mode);
29808 return x;
29809 }
29810
29811 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
29812
29813 static rtx
29814 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
29815 {
29816 rtx pat;
29817 tree arg0 = CALL_EXPR_ARG (exp, 0);
29818 tree arg1 = CALL_EXPR_ARG (exp, 1);
29819 rtx op0 = expand_normal (arg0);
29820 rtx op1 = expand_normal (arg1);
29821 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29822 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
29823 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
29824
29825 if (VECTOR_MODE_P (mode0))
29826 op0 = safe_vector_operand (op0, mode0);
29827 if (VECTOR_MODE_P (mode1))
29828 op1 = safe_vector_operand (op1, mode1);
29829
29830 if (optimize || !target
29831 || GET_MODE (target) != tmode
29832 || !insn_data[icode].operand[0].predicate (target, tmode))
29833 target = gen_reg_rtx (tmode);
29834
29835 if (GET_MODE (op1) == SImode && mode1 == TImode)
29836 {
29837 rtx x = gen_reg_rtx (V4SImode);
29838 emit_insn (gen_sse2_loadd (x, op1));
29839 op1 = gen_lowpart (TImode, x);
29840 }
29841
29842 if (!insn_data[icode].operand[1].predicate (op0, mode0))
29843 op0 = copy_to_mode_reg (mode0, op0);
29844 if (!insn_data[icode].operand[2].predicate (op1, mode1))
29845 op1 = copy_to_mode_reg (mode1, op1);
29846
29847 pat = GEN_FCN (icode) (target, op0, op1);
29848 if (! pat)
29849 return 0;
29850
29851 emit_insn (pat);
29852
29853 return target;
29854 }
29855
29856 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
29857
29858 static rtx
29859 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
29860 enum ix86_builtin_func_type m_type,
29861 enum rtx_code sub_code)
29862 {
29863 rtx pat;
29864 int i;
29865 int nargs;
29866 bool comparison_p = false;
29867 bool tf_p = false;
29868 bool last_arg_constant = false;
29869 int num_memory = 0;
29870 struct {
29871 rtx op;
29872 enum machine_mode mode;
29873 } args[4];
29874
29875 enum machine_mode tmode = insn_data[icode].operand[0].mode;
29876
29877 switch (m_type)
29878 {
29879 case MULTI_ARG_4_DF2_DI_I:
29880 case MULTI_ARG_4_DF2_DI_I1:
29881 case MULTI_ARG_4_SF2_SI_I:
29882 case MULTI_ARG_4_SF2_SI_I1:
29883 nargs = 4;
29884 last_arg_constant = true;
29885 break;
29886
29887 case MULTI_ARG_3_SF:
29888 case MULTI_ARG_3_DF:
29889 case MULTI_ARG_3_SF2:
29890 case MULTI_ARG_3_DF2:
29891 case MULTI_ARG_3_DI:
29892 case MULTI_ARG_3_SI:
29893 case MULTI_ARG_3_SI_DI:
29894 case MULTI_ARG_3_HI:
29895 case MULTI_ARG_3_HI_SI:
29896 case MULTI_ARG_3_QI:
29897 case MULTI_ARG_3_DI2:
29898 case MULTI_ARG_3_SI2:
29899 case MULTI_ARG_3_HI2:
29900 case MULTI_ARG_3_QI2:
29901 nargs = 3;
29902 break;
29903
29904 case MULTI_ARG_2_SF:
29905 case MULTI_ARG_2_DF:
29906 case MULTI_ARG_2_DI:
29907 case MULTI_ARG_2_SI:
29908 case MULTI_ARG_2_HI:
29909 case MULTI_ARG_2_QI:
29910 nargs = 2;
29911 break;
29912
29913 case MULTI_ARG_2_DI_IMM:
29914 case MULTI_ARG_2_SI_IMM:
29915 case MULTI_ARG_2_HI_IMM:
29916 case MULTI_ARG_2_QI_IMM:
29917 nargs = 2;
29918 last_arg_constant = true;
29919 break;
29920
29921 case MULTI_ARG_1_SF:
29922 case MULTI_ARG_1_DF:
29923 case MULTI_ARG_1_SF2:
29924 case MULTI_ARG_1_DF2:
29925 case MULTI_ARG_1_DI:
29926 case MULTI_ARG_1_SI:
29927 case MULTI_ARG_1_HI:
29928 case MULTI_ARG_1_QI:
29929 case MULTI_ARG_1_SI_DI:
29930 case MULTI_ARG_1_HI_DI:
29931 case MULTI_ARG_1_HI_SI:
29932 case MULTI_ARG_1_QI_DI:
29933 case MULTI_ARG_1_QI_SI:
29934 case MULTI_ARG_1_QI_HI:
29935 nargs = 1;
29936 break;
29937
29938 case MULTI_ARG_2_DI_CMP:
29939 case MULTI_ARG_2_SI_CMP:
29940 case MULTI_ARG_2_HI_CMP:
29941 case MULTI_ARG_2_QI_CMP:
29942 nargs = 2;
29943 comparison_p = true;
29944 break;
29945
29946 case MULTI_ARG_2_SF_TF:
29947 case MULTI_ARG_2_DF_TF:
29948 case MULTI_ARG_2_DI_TF:
29949 case MULTI_ARG_2_SI_TF:
29950 case MULTI_ARG_2_HI_TF:
29951 case MULTI_ARG_2_QI_TF:
29952 nargs = 2;
29953 tf_p = true;
29954 break;
29955
29956 default:
29957 gcc_unreachable ();
29958 }
29959
29960 if (optimize || !target
29961 || GET_MODE (target) != tmode
29962 || !insn_data[icode].operand[0].predicate (target, tmode))
29963 target = gen_reg_rtx (tmode);
29964
29965 gcc_assert (nargs <= 4);
29966
29967 for (i = 0; i < nargs; i++)
29968 {
29969 tree arg = CALL_EXPR_ARG (exp, i);
29970 rtx op = expand_normal (arg);
29971 int adjust = (comparison_p) ? 1 : 0;
29972 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
29973
29974 if (last_arg_constant && i == nargs - 1)
29975 {
29976 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
29977 {
29978 enum insn_code new_icode = icode;
29979 switch (icode)
29980 {
29981 case CODE_FOR_xop_vpermil2v2df3:
29982 case CODE_FOR_xop_vpermil2v4sf3:
29983 case CODE_FOR_xop_vpermil2v4df3:
29984 case CODE_FOR_xop_vpermil2v8sf3:
29985 error ("the last argument must be a 2-bit immediate");
29986 return gen_reg_rtx (tmode);
29987 case CODE_FOR_xop_rotlv2di3:
29988 new_icode = CODE_FOR_rotlv2di3;
29989 goto xop_rotl;
29990 case CODE_FOR_xop_rotlv4si3:
29991 new_icode = CODE_FOR_rotlv4si3;
29992 goto xop_rotl;
29993 case CODE_FOR_xop_rotlv8hi3:
29994 new_icode = CODE_FOR_rotlv8hi3;
29995 goto xop_rotl;
29996 case CODE_FOR_xop_rotlv16qi3:
29997 new_icode = CODE_FOR_rotlv16qi3;
29998 xop_rotl:
29999 if (CONST_INT_P (op))
30000 {
30001 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30002 op = GEN_INT (INTVAL (op) & mask);
30003 gcc_checking_assert
30004 (insn_data[icode].operand[i + 1].predicate (op, mode));
30005 }
30006 else
30007 {
30008 gcc_checking_assert
30009 (nargs == 2
30010 && insn_data[new_icode].operand[0].mode == tmode
30011 && insn_data[new_icode].operand[1].mode == tmode
30012 && insn_data[new_icode].operand[2].mode == mode
30013 && insn_data[new_icode].operand[0].predicate
30014 == insn_data[icode].operand[0].predicate
30015 && insn_data[new_icode].operand[1].predicate
30016 == insn_data[icode].operand[1].predicate);
30017 icode = new_icode;
30018 goto non_constant;
30019 }
30020 break;
30021 default:
30022 gcc_unreachable ();
30023 }
30024 }
30025 }
30026 else
30027 {
30028 non_constant:
30029 if (VECTOR_MODE_P (mode))
30030 op = safe_vector_operand (op, mode);
30031
30032 /* If we aren't optimizing, only allow one memory operand to be
30033 generated. */
30034 if (memory_operand (op, mode))
30035 num_memory++;
30036
30037 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30038
30039 if (optimize
30040 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30041 || num_memory > 1)
30042 op = force_reg (mode, op);
30043 }
30044
30045 args[i].op = op;
30046 args[i].mode = mode;
30047 }
30048
30049 switch (nargs)
30050 {
30051 case 1:
30052 pat = GEN_FCN (icode) (target, args[0].op);
30053 break;
30054
30055 case 2:
30056 if (tf_p)
30057 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30058 GEN_INT ((int)sub_code));
30059 else if (! comparison_p)
30060 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30061 else
30062 {
30063 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30064 args[0].op,
30065 args[1].op);
30066
30067 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30068 }
30069 break;
30070
30071 case 3:
30072 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30073 break;
30074
30075 case 4:
30076 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30077 break;
30078
30079 default:
30080 gcc_unreachable ();
30081 }
30082
30083 if (! pat)
30084 return 0;
30085
30086 emit_insn (pat);
30087 return target;
30088 }
30089
30090 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30091 insns with vec_merge. */
30092
30093 static rtx
30094 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30095 rtx target)
30096 {
30097 rtx pat;
30098 tree arg0 = CALL_EXPR_ARG (exp, 0);
30099 rtx op1, op0 = expand_normal (arg0);
30100 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30101 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30102
30103 if (optimize || !target
30104 || GET_MODE (target) != tmode
30105 || !insn_data[icode].operand[0].predicate (target, tmode))
30106 target = gen_reg_rtx (tmode);
30107
30108 if (VECTOR_MODE_P (mode0))
30109 op0 = safe_vector_operand (op0, mode0);
30110
30111 if ((optimize && !register_operand (op0, mode0))
30112 || !insn_data[icode].operand[1].predicate (op0, mode0))
30113 op0 = copy_to_mode_reg (mode0, op0);
30114
30115 op1 = op0;
30116 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30117 op1 = copy_to_mode_reg (mode0, op1);
30118
30119 pat = GEN_FCN (icode) (target, op0, op1);
30120 if (! pat)
30121 return 0;
30122 emit_insn (pat);
30123 return target;
30124 }
30125
30126 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30127
30128 static rtx
30129 ix86_expand_sse_compare (const struct builtin_description *d,
30130 tree exp, rtx target, bool swap)
30131 {
30132 rtx pat;
30133 tree arg0 = CALL_EXPR_ARG (exp, 0);
30134 tree arg1 = CALL_EXPR_ARG (exp, 1);
30135 rtx op0 = expand_normal (arg0);
30136 rtx op1 = expand_normal (arg1);
30137 rtx op2;
30138 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30139 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30140 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30141 enum rtx_code comparison = d->comparison;
30142
30143 if (VECTOR_MODE_P (mode0))
30144 op0 = safe_vector_operand (op0, mode0);
30145 if (VECTOR_MODE_P (mode1))
30146 op1 = safe_vector_operand (op1, mode1);
30147
30148 /* Swap operands if we have a comparison that isn't available in
30149 hardware. */
30150 if (swap)
30151 {
30152 rtx tmp = gen_reg_rtx (mode1);
30153 emit_move_insn (tmp, op1);
30154 op1 = op0;
30155 op0 = tmp;
30156 }
30157
30158 if (optimize || !target
30159 || GET_MODE (target) != tmode
30160 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30161 target = gen_reg_rtx (tmode);
30162
30163 if ((optimize && !register_operand (op0, mode0))
30164 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30165 op0 = copy_to_mode_reg (mode0, op0);
30166 if ((optimize && !register_operand (op1, mode1))
30167 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30168 op1 = copy_to_mode_reg (mode1, op1);
30169
30170 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30171 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30172 if (! pat)
30173 return 0;
30174 emit_insn (pat);
30175 return target;
30176 }
30177
30178 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30179
30180 static rtx
30181 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30182 rtx target)
30183 {
30184 rtx pat;
30185 tree arg0 = CALL_EXPR_ARG (exp, 0);
30186 tree arg1 = CALL_EXPR_ARG (exp, 1);
30187 rtx op0 = expand_normal (arg0);
30188 rtx op1 = expand_normal (arg1);
30189 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30190 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30191 enum rtx_code comparison = d->comparison;
30192
30193 if (VECTOR_MODE_P (mode0))
30194 op0 = safe_vector_operand (op0, mode0);
30195 if (VECTOR_MODE_P (mode1))
30196 op1 = safe_vector_operand (op1, mode1);
30197
30198 /* Swap operands if we have a comparison that isn't available in
30199 hardware. */
30200 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30201 {
30202 rtx tmp = op1;
30203 op1 = op0;
30204 op0 = tmp;
30205 }
30206
30207 target = gen_reg_rtx (SImode);
30208 emit_move_insn (target, const0_rtx);
30209 target = gen_rtx_SUBREG (QImode, target, 0);
30210
30211 if ((optimize && !register_operand (op0, mode0))
30212 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30213 op0 = copy_to_mode_reg (mode0, op0);
30214 if ((optimize && !register_operand (op1, mode1))
30215 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30216 op1 = copy_to_mode_reg (mode1, op1);
30217
30218 pat = GEN_FCN (d->icode) (op0, op1);
30219 if (! pat)
30220 return 0;
30221 emit_insn (pat);
30222 emit_insn (gen_rtx_SET (VOIDmode,
30223 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30224 gen_rtx_fmt_ee (comparison, QImode,
30225 SET_DEST (pat),
30226 const0_rtx)));
30227
30228 return SUBREG_REG (target);
30229 }
30230
30231 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30232
30233 static rtx
30234 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30235 rtx target)
30236 {
30237 rtx pat;
30238 tree arg0 = CALL_EXPR_ARG (exp, 0);
30239 rtx op1, op0 = expand_normal (arg0);
30240 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30241 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30242
30243 if (optimize || target == 0
30244 || GET_MODE (target) != tmode
30245 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30246 target = gen_reg_rtx (tmode);
30247
30248 if (VECTOR_MODE_P (mode0))
30249 op0 = safe_vector_operand (op0, mode0);
30250
30251 if ((optimize && !register_operand (op0, mode0))
30252 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30253 op0 = copy_to_mode_reg (mode0, op0);
30254
30255 op1 = GEN_INT (d->comparison);
30256
30257 pat = GEN_FCN (d->icode) (target, op0, op1);
30258 if (! pat)
30259 return 0;
30260 emit_insn (pat);
30261 return target;
30262 }
30263
30264 static rtx
30265 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30266 tree exp, rtx target)
30267 {
30268 rtx pat;
30269 tree arg0 = CALL_EXPR_ARG (exp, 0);
30270 tree arg1 = CALL_EXPR_ARG (exp, 1);
30271 rtx op0 = expand_normal (arg0);
30272 rtx op1 = expand_normal (arg1);
30273 rtx op2;
30274 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30275 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30276 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30277
30278 if (optimize || target == 0
30279 || GET_MODE (target) != tmode
30280 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30281 target = gen_reg_rtx (tmode);
30282
30283 op0 = safe_vector_operand (op0, mode0);
30284 op1 = safe_vector_operand (op1, mode1);
30285
30286 if ((optimize && !register_operand (op0, mode0))
30287 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30288 op0 = copy_to_mode_reg (mode0, op0);
30289 if ((optimize && !register_operand (op1, mode1))
30290 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30291 op1 = copy_to_mode_reg (mode1, op1);
30292
30293 op2 = GEN_INT (d->comparison);
30294
30295 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30296 if (! pat)
30297 return 0;
30298 emit_insn (pat);
30299 return target;
30300 }
30301
30302 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30303
30304 static rtx
30305 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30306 rtx target)
30307 {
30308 rtx pat;
30309 tree arg0 = CALL_EXPR_ARG (exp, 0);
30310 tree arg1 = CALL_EXPR_ARG (exp, 1);
30311 rtx op0 = expand_normal (arg0);
30312 rtx op1 = expand_normal (arg1);
30313 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30314 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30315 enum rtx_code comparison = d->comparison;
30316
30317 if (VECTOR_MODE_P (mode0))
30318 op0 = safe_vector_operand (op0, mode0);
30319 if (VECTOR_MODE_P (mode1))
30320 op1 = safe_vector_operand (op1, mode1);
30321
30322 target = gen_reg_rtx (SImode);
30323 emit_move_insn (target, const0_rtx);
30324 target = gen_rtx_SUBREG (QImode, target, 0);
30325
30326 if ((optimize && !register_operand (op0, mode0))
30327 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30328 op0 = copy_to_mode_reg (mode0, op0);
30329 if ((optimize && !register_operand (op1, mode1))
30330 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30331 op1 = copy_to_mode_reg (mode1, op1);
30332
30333 pat = GEN_FCN (d->icode) (op0, op1);
30334 if (! pat)
30335 return 0;
30336 emit_insn (pat);
30337 emit_insn (gen_rtx_SET (VOIDmode,
30338 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30339 gen_rtx_fmt_ee (comparison, QImode,
30340 SET_DEST (pat),
30341 const0_rtx)));
30342
30343 return SUBREG_REG (target);
30344 }
30345
30346 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30347
30348 static rtx
30349 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30350 tree exp, rtx target)
30351 {
30352 rtx pat;
30353 tree arg0 = CALL_EXPR_ARG (exp, 0);
30354 tree arg1 = CALL_EXPR_ARG (exp, 1);
30355 tree arg2 = CALL_EXPR_ARG (exp, 2);
30356 tree arg3 = CALL_EXPR_ARG (exp, 3);
30357 tree arg4 = CALL_EXPR_ARG (exp, 4);
30358 rtx scratch0, scratch1;
30359 rtx op0 = expand_normal (arg0);
30360 rtx op1 = expand_normal (arg1);
30361 rtx op2 = expand_normal (arg2);
30362 rtx op3 = expand_normal (arg3);
30363 rtx op4 = expand_normal (arg4);
30364 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30365
30366 tmode0 = insn_data[d->icode].operand[0].mode;
30367 tmode1 = insn_data[d->icode].operand[1].mode;
30368 modev2 = insn_data[d->icode].operand[2].mode;
30369 modei3 = insn_data[d->icode].operand[3].mode;
30370 modev4 = insn_data[d->icode].operand[4].mode;
30371 modei5 = insn_data[d->icode].operand[5].mode;
30372 modeimm = insn_data[d->icode].operand[6].mode;
30373
30374 if (VECTOR_MODE_P (modev2))
30375 op0 = safe_vector_operand (op0, modev2);
30376 if (VECTOR_MODE_P (modev4))
30377 op2 = safe_vector_operand (op2, modev4);
30378
30379 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30380 op0 = copy_to_mode_reg (modev2, op0);
30381 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30382 op1 = copy_to_mode_reg (modei3, op1);
30383 if ((optimize && !register_operand (op2, modev4))
30384 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30385 op2 = copy_to_mode_reg (modev4, op2);
30386 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30387 op3 = copy_to_mode_reg (modei5, op3);
30388
30389 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30390 {
30391 error ("the fifth argument must be an 8-bit immediate");
30392 return const0_rtx;
30393 }
30394
30395 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30396 {
30397 if (optimize || !target
30398 || GET_MODE (target) != tmode0
30399 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30400 target = gen_reg_rtx (tmode0);
30401
30402 scratch1 = gen_reg_rtx (tmode1);
30403
30404 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30405 }
30406 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30407 {
30408 if (optimize || !target
30409 || GET_MODE (target) != tmode1
30410 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30411 target = gen_reg_rtx (tmode1);
30412
30413 scratch0 = gen_reg_rtx (tmode0);
30414
30415 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30416 }
30417 else
30418 {
30419 gcc_assert (d->flag);
30420
30421 scratch0 = gen_reg_rtx (tmode0);
30422 scratch1 = gen_reg_rtx (tmode1);
30423
30424 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30425 }
30426
30427 if (! pat)
30428 return 0;
30429
30430 emit_insn (pat);
30431
30432 if (d->flag)
30433 {
30434 target = gen_reg_rtx (SImode);
30435 emit_move_insn (target, const0_rtx);
30436 target = gen_rtx_SUBREG (QImode, target, 0);
30437
30438 emit_insn
30439 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30440 gen_rtx_fmt_ee (EQ, QImode,
30441 gen_rtx_REG ((enum machine_mode) d->flag,
30442 FLAGS_REG),
30443 const0_rtx)));
30444 return SUBREG_REG (target);
30445 }
30446 else
30447 return target;
30448 }
30449
30450
30451 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30452
30453 static rtx
30454 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30455 tree exp, rtx target)
30456 {
30457 rtx pat;
30458 tree arg0 = CALL_EXPR_ARG (exp, 0);
30459 tree arg1 = CALL_EXPR_ARG (exp, 1);
30460 tree arg2 = CALL_EXPR_ARG (exp, 2);
30461 rtx scratch0, scratch1;
30462 rtx op0 = expand_normal (arg0);
30463 rtx op1 = expand_normal (arg1);
30464 rtx op2 = expand_normal (arg2);
30465 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30466
30467 tmode0 = insn_data[d->icode].operand[0].mode;
30468 tmode1 = insn_data[d->icode].operand[1].mode;
30469 modev2 = insn_data[d->icode].operand[2].mode;
30470 modev3 = insn_data[d->icode].operand[3].mode;
30471 modeimm = insn_data[d->icode].operand[4].mode;
30472
30473 if (VECTOR_MODE_P (modev2))
30474 op0 = safe_vector_operand (op0, modev2);
30475 if (VECTOR_MODE_P (modev3))
30476 op1 = safe_vector_operand (op1, modev3);
30477
30478 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30479 op0 = copy_to_mode_reg (modev2, op0);
30480 if ((optimize && !register_operand (op1, modev3))
30481 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30482 op1 = copy_to_mode_reg (modev3, op1);
30483
30484 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30485 {
30486 error ("the third argument must be an 8-bit immediate");
30487 return const0_rtx;
30488 }
30489
30490 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30491 {
30492 if (optimize || !target
30493 || GET_MODE (target) != tmode0
30494 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30495 target = gen_reg_rtx (tmode0);
30496
30497 scratch1 = gen_reg_rtx (tmode1);
30498
30499 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30500 }
30501 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30502 {
30503 if (optimize || !target
30504 || GET_MODE (target) != tmode1
30505 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30506 target = gen_reg_rtx (tmode1);
30507
30508 scratch0 = gen_reg_rtx (tmode0);
30509
30510 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30511 }
30512 else
30513 {
30514 gcc_assert (d->flag);
30515
30516 scratch0 = gen_reg_rtx (tmode0);
30517 scratch1 = gen_reg_rtx (tmode1);
30518
30519 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30520 }
30521
30522 if (! pat)
30523 return 0;
30524
30525 emit_insn (pat);
30526
30527 if (d->flag)
30528 {
30529 target = gen_reg_rtx (SImode);
30530 emit_move_insn (target, const0_rtx);
30531 target = gen_rtx_SUBREG (QImode, target, 0);
30532
30533 emit_insn
30534 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30535 gen_rtx_fmt_ee (EQ, QImode,
30536 gen_rtx_REG ((enum machine_mode) d->flag,
30537 FLAGS_REG),
30538 const0_rtx)));
30539 return SUBREG_REG (target);
30540 }
30541 else
30542 return target;
30543 }
30544
30545 /* Subroutine of ix86_expand_builtin to take care of insns with
30546 variable number of operands. */
30547
30548 static rtx
30549 ix86_expand_args_builtin (const struct builtin_description *d,
30550 tree exp, rtx target)
30551 {
30552 rtx pat, real_target;
30553 unsigned int i, nargs;
30554 unsigned int nargs_constant = 0;
30555 int num_memory = 0;
30556 struct
30557 {
30558 rtx op;
30559 enum machine_mode mode;
30560 } args[4];
30561 bool last_arg_count = false;
30562 enum insn_code icode = d->icode;
30563 const struct insn_data_d *insn_p = &insn_data[icode];
30564 enum machine_mode tmode = insn_p->operand[0].mode;
30565 enum machine_mode rmode = VOIDmode;
30566 bool swap = false;
30567 enum rtx_code comparison = d->comparison;
30568
30569 switch ((enum ix86_builtin_func_type) d->flag)
30570 {
30571 case V2DF_FTYPE_V2DF_ROUND:
30572 case V4DF_FTYPE_V4DF_ROUND:
30573 case V4SF_FTYPE_V4SF_ROUND:
30574 case V8SF_FTYPE_V8SF_ROUND:
30575 case V4SI_FTYPE_V4SF_ROUND:
30576 case V8SI_FTYPE_V8SF_ROUND:
30577 return ix86_expand_sse_round (d, exp, target);
30578 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30579 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30580 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30581 case INT_FTYPE_V8SF_V8SF_PTEST:
30582 case INT_FTYPE_V4DI_V4DI_PTEST:
30583 case INT_FTYPE_V4DF_V4DF_PTEST:
30584 case INT_FTYPE_V4SF_V4SF_PTEST:
30585 case INT_FTYPE_V2DI_V2DI_PTEST:
30586 case INT_FTYPE_V2DF_V2DF_PTEST:
30587 return ix86_expand_sse_ptest (d, exp, target);
30588 case FLOAT128_FTYPE_FLOAT128:
30589 case FLOAT_FTYPE_FLOAT:
30590 case INT_FTYPE_INT:
30591 case UINT64_FTYPE_INT:
30592 case UINT16_FTYPE_UINT16:
30593 case INT64_FTYPE_INT64:
30594 case INT64_FTYPE_V4SF:
30595 case INT64_FTYPE_V2DF:
30596 case INT_FTYPE_V16QI:
30597 case INT_FTYPE_V8QI:
30598 case INT_FTYPE_V8SF:
30599 case INT_FTYPE_V4DF:
30600 case INT_FTYPE_V4SF:
30601 case INT_FTYPE_V2DF:
30602 case INT_FTYPE_V32QI:
30603 case V16QI_FTYPE_V16QI:
30604 case V8SI_FTYPE_V8SF:
30605 case V8SI_FTYPE_V4SI:
30606 case V8HI_FTYPE_V8HI:
30607 case V8HI_FTYPE_V16QI:
30608 case V8QI_FTYPE_V8QI:
30609 case V8SF_FTYPE_V8SF:
30610 case V8SF_FTYPE_V8SI:
30611 case V8SF_FTYPE_V4SF:
30612 case V8SF_FTYPE_V8HI:
30613 case V4SI_FTYPE_V4SI:
30614 case V4SI_FTYPE_V16QI:
30615 case V4SI_FTYPE_V4SF:
30616 case V4SI_FTYPE_V8SI:
30617 case V4SI_FTYPE_V8HI:
30618 case V4SI_FTYPE_V4DF:
30619 case V4SI_FTYPE_V2DF:
30620 case V4HI_FTYPE_V4HI:
30621 case V4DF_FTYPE_V4DF:
30622 case V4DF_FTYPE_V4SI:
30623 case V4DF_FTYPE_V4SF:
30624 case V4DF_FTYPE_V2DF:
30625 case V4SF_FTYPE_V4SF:
30626 case V4SF_FTYPE_V4SI:
30627 case V4SF_FTYPE_V8SF:
30628 case V4SF_FTYPE_V4DF:
30629 case V4SF_FTYPE_V8HI:
30630 case V4SF_FTYPE_V2DF:
30631 case V2DI_FTYPE_V2DI:
30632 case V2DI_FTYPE_V16QI:
30633 case V2DI_FTYPE_V8HI:
30634 case V2DI_FTYPE_V4SI:
30635 case V2DF_FTYPE_V2DF:
30636 case V2DF_FTYPE_V4SI:
30637 case V2DF_FTYPE_V4DF:
30638 case V2DF_FTYPE_V4SF:
30639 case V2DF_FTYPE_V2SI:
30640 case V2SI_FTYPE_V2SI:
30641 case V2SI_FTYPE_V4SF:
30642 case V2SI_FTYPE_V2SF:
30643 case V2SI_FTYPE_V2DF:
30644 case V2SF_FTYPE_V2SF:
30645 case V2SF_FTYPE_V2SI:
30646 case V32QI_FTYPE_V32QI:
30647 case V32QI_FTYPE_V16QI:
30648 case V16HI_FTYPE_V16HI:
30649 case V16HI_FTYPE_V8HI:
30650 case V8SI_FTYPE_V8SI:
30651 case V16HI_FTYPE_V16QI:
30652 case V8SI_FTYPE_V16QI:
30653 case V4DI_FTYPE_V16QI:
30654 case V8SI_FTYPE_V8HI:
30655 case V4DI_FTYPE_V8HI:
30656 case V4DI_FTYPE_V4SI:
30657 case V4DI_FTYPE_V2DI:
30658 nargs = 1;
30659 break;
30660 case V4SF_FTYPE_V4SF_VEC_MERGE:
30661 case V2DF_FTYPE_V2DF_VEC_MERGE:
30662 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30663 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30664 case V16QI_FTYPE_V16QI_V16QI:
30665 case V16QI_FTYPE_V8HI_V8HI:
30666 case V8QI_FTYPE_V8QI_V8QI:
30667 case V8QI_FTYPE_V4HI_V4HI:
30668 case V8HI_FTYPE_V8HI_V8HI:
30669 case V8HI_FTYPE_V16QI_V16QI:
30670 case V8HI_FTYPE_V4SI_V4SI:
30671 case V8SF_FTYPE_V8SF_V8SF:
30672 case V8SF_FTYPE_V8SF_V8SI:
30673 case V4SI_FTYPE_V4SI_V4SI:
30674 case V4SI_FTYPE_V8HI_V8HI:
30675 case V4SI_FTYPE_V4SF_V4SF:
30676 case V4SI_FTYPE_V2DF_V2DF:
30677 case V4HI_FTYPE_V4HI_V4HI:
30678 case V4HI_FTYPE_V8QI_V8QI:
30679 case V4HI_FTYPE_V2SI_V2SI:
30680 case V4DF_FTYPE_V4DF_V4DF:
30681 case V4DF_FTYPE_V4DF_V4DI:
30682 case V4SF_FTYPE_V4SF_V4SF:
30683 case V4SF_FTYPE_V4SF_V4SI:
30684 case V4SF_FTYPE_V4SF_V2SI:
30685 case V4SF_FTYPE_V4SF_V2DF:
30686 case V4SF_FTYPE_V4SF_DI:
30687 case V4SF_FTYPE_V4SF_SI:
30688 case V2DI_FTYPE_V2DI_V2DI:
30689 case V2DI_FTYPE_V16QI_V16QI:
30690 case V2DI_FTYPE_V4SI_V4SI:
30691 case V2UDI_FTYPE_V4USI_V4USI:
30692 case V2DI_FTYPE_V2DI_V16QI:
30693 case V2DI_FTYPE_V2DF_V2DF:
30694 case V2SI_FTYPE_V2SI_V2SI:
30695 case V2SI_FTYPE_V4HI_V4HI:
30696 case V2SI_FTYPE_V2SF_V2SF:
30697 case V2DF_FTYPE_V2DF_V2DF:
30698 case V2DF_FTYPE_V2DF_V4SF:
30699 case V2DF_FTYPE_V2DF_V2DI:
30700 case V2DF_FTYPE_V2DF_DI:
30701 case V2DF_FTYPE_V2DF_SI:
30702 case V2SF_FTYPE_V2SF_V2SF:
30703 case V1DI_FTYPE_V1DI_V1DI:
30704 case V1DI_FTYPE_V8QI_V8QI:
30705 case V1DI_FTYPE_V2SI_V2SI:
30706 case V32QI_FTYPE_V16HI_V16HI:
30707 case V16HI_FTYPE_V8SI_V8SI:
30708 case V32QI_FTYPE_V32QI_V32QI:
30709 case V16HI_FTYPE_V32QI_V32QI:
30710 case V16HI_FTYPE_V16HI_V16HI:
30711 case V8SI_FTYPE_V4DF_V4DF:
30712 case V8SI_FTYPE_V8SI_V8SI:
30713 case V8SI_FTYPE_V16HI_V16HI:
30714 case V4DI_FTYPE_V4DI_V4DI:
30715 case V4DI_FTYPE_V8SI_V8SI:
30716 case V4UDI_FTYPE_V8USI_V8USI:
30717 if (comparison == UNKNOWN)
30718 return ix86_expand_binop_builtin (icode, exp, target);
30719 nargs = 2;
30720 break;
30721 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30722 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30723 gcc_assert (comparison != UNKNOWN);
30724 nargs = 2;
30725 swap = true;
30726 break;
30727 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30728 case V16HI_FTYPE_V16HI_SI_COUNT:
30729 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30730 case V8SI_FTYPE_V8SI_SI_COUNT:
30731 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30732 case V4DI_FTYPE_V4DI_INT_COUNT:
30733 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30734 case V8HI_FTYPE_V8HI_SI_COUNT:
30735 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30736 case V4SI_FTYPE_V4SI_SI_COUNT:
30737 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30738 case V4HI_FTYPE_V4HI_SI_COUNT:
30739 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30740 case V2DI_FTYPE_V2DI_SI_COUNT:
30741 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30742 case V2SI_FTYPE_V2SI_SI_COUNT:
30743 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30744 case V1DI_FTYPE_V1DI_SI_COUNT:
30745 nargs = 2;
30746 last_arg_count = true;
30747 break;
30748 case UINT64_FTYPE_UINT64_UINT64:
30749 case UINT_FTYPE_UINT_UINT:
30750 case UINT_FTYPE_UINT_USHORT:
30751 case UINT_FTYPE_UINT_UCHAR:
30752 case UINT16_FTYPE_UINT16_INT:
30753 case UINT8_FTYPE_UINT8_INT:
30754 nargs = 2;
30755 break;
30756 case V2DI_FTYPE_V2DI_INT_CONVERT:
30757 nargs = 2;
30758 rmode = V1TImode;
30759 nargs_constant = 1;
30760 break;
30761 case V4DI_FTYPE_V4DI_INT_CONVERT:
30762 nargs = 2;
30763 rmode = V2TImode;
30764 nargs_constant = 1;
30765 break;
30766 case V8HI_FTYPE_V8HI_INT:
30767 case V8HI_FTYPE_V8SF_INT:
30768 case V8HI_FTYPE_V4SF_INT:
30769 case V8SF_FTYPE_V8SF_INT:
30770 case V4SI_FTYPE_V4SI_INT:
30771 case V4SI_FTYPE_V8SI_INT:
30772 case V4HI_FTYPE_V4HI_INT:
30773 case V4DF_FTYPE_V4DF_INT:
30774 case V4SF_FTYPE_V4SF_INT:
30775 case V4SF_FTYPE_V8SF_INT:
30776 case V2DI_FTYPE_V2DI_INT:
30777 case V2DF_FTYPE_V2DF_INT:
30778 case V2DF_FTYPE_V4DF_INT:
30779 case V16HI_FTYPE_V16HI_INT:
30780 case V8SI_FTYPE_V8SI_INT:
30781 case V4DI_FTYPE_V4DI_INT:
30782 case V2DI_FTYPE_V4DI_INT:
30783 nargs = 2;
30784 nargs_constant = 1;
30785 break;
30786 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30787 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30788 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30789 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30790 case V2DF_FTYPE_V2DF_V2DF_V2DF:
30791 case V32QI_FTYPE_V32QI_V32QI_V32QI:
30792 nargs = 3;
30793 break;
30794 case V32QI_FTYPE_V32QI_V32QI_INT:
30795 case V16HI_FTYPE_V16HI_V16HI_INT:
30796 case V16QI_FTYPE_V16QI_V16QI_INT:
30797 case V4DI_FTYPE_V4DI_V4DI_INT:
30798 case V8HI_FTYPE_V8HI_V8HI_INT:
30799 case V8SI_FTYPE_V8SI_V8SI_INT:
30800 case V8SI_FTYPE_V8SI_V4SI_INT:
30801 case V8SF_FTYPE_V8SF_V8SF_INT:
30802 case V8SF_FTYPE_V8SF_V4SF_INT:
30803 case V4SI_FTYPE_V4SI_V4SI_INT:
30804 case V4DF_FTYPE_V4DF_V4DF_INT:
30805 case V4DF_FTYPE_V4DF_V2DF_INT:
30806 case V4SF_FTYPE_V4SF_V4SF_INT:
30807 case V2DI_FTYPE_V2DI_V2DI_INT:
30808 case V4DI_FTYPE_V4DI_V2DI_INT:
30809 case V2DF_FTYPE_V2DF_V2DF_INT:
30810 nargs = 3;
30811 nargs_constant = 1;
30812 break;
30813 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
30814 nargs = 3;
30815 rmode = V4DImode;
30816 nargs_constant = 1;
30817 break;
30818 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
30819 nargs = 3;
30820 rmode = V2DImode;
30821 nargs_constant = 1;
30822 break;
30823 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
30824 nargs = 3;
30825 rmode = DImode;
30826 nargs_constant = 1;
30827 break;
30828 case V2DI_FTYPE_V2DI_UINT_UINT:
30829 nargs = 3;
30830 nargs_constant = 2;
30831 break;
30832 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
30833 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
30834 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
30835 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
30836 nargs = 4;
30837 nargs_constant = 1;
30838 break;
30839 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
30840 nargs = 4;
30841 nargs_constant = 2;
30842 break;
30843 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
30844 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
30845 nargs = 4;
30846 break;
30847 default:
30848 gcc_unreachable ();
30849 }
30850
30851 gcc_assert (nargs <= ARRAY_SIZE (args));
30852
30853 if (comparison != UNKNOWN)
30854 {
30855 gcc_assert (nargs == 2);
30856 return ix86_expand_sse_compare (d, exp, target, swap);
30857 }
30858
30859 if (rmode == VOIDmode || rmode == tmode)
30860 {
30861 if (optimize
30862 || target == 0
30863 || GET_MODE (target) != tmode
30864 || !insn_p->operand[0].predicate (target, tmode))
30865 target = gen_reg_rtx (tmode);
30866 real_target = target;
30867 }
30868 else
30869 {
30870 target = gen_reg_rtx (rmode);
30871 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
30872 }
30873
30874 for (i = 0; i < nargs; i++)
30875 {
30876 tree arg = CALL_EXPR_ARG (exp, i);
30877 rtx op = expand_normal (arg);
30878 enum machine_mode mode = insn_p->operand[i + 1].mode;
30879 bool match = insn_p->operand[i + 1].predicate (op, mode);
30880
30881 if (last_arg_count && (i + 1) == nargs)
30882 {
30883 	  /* SIMD shift insns take either an 8-bit immediate or a register
30884 	     as the count.  But the builtin functions take an int as the
30885 	     count.  If the count doesn't match, we put it in a register. */
30886 if (!match)
30887 {
30888 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
30889 if (!insn_p->operand[i + 1].predicate (op, mode))
30890 op = copy_to_reg (op);
30891 }
30892 }
30893 else if ((nargs - i) <= nargs_constant)
30894 {
30895 if (!match)
30896 switch (icode)
30897 {
30898 case CODE_FOR_avx2_inserti128:
30899 case CODE_FOR_avx2_extracti128:
30900 		      error ("the last argument must be a 1-bit immediate");
30901 return const0_rtx;
30902
30903 case CODE_FOR_sse4_1_roundsd:
30904 case CODE_FOR_sse4_1_roundss:
30905
30906 case CODE_FOR_sse4_1_roundpd:
30907 case CODE_FOR_sse4_1_roundps:
30908 case CODE_FOR_avx_roundpd256:
30909 case CODE_FOR_avx_roundps256:
30910
30911 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
30912 case CODE_FOR_sse4_1_roundps_sfix:
30913 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
30914 case CODE_FOR_avx_roundps_sfix256:
30915
30916 case CODE_FOR_sse4_1_blendps:
30917 case CODE_FOR_avx_blendpd256:
30918 case CODE_FOR_avx_vpermilv4df:
30919 error ("the last argument must be a 4-bit immediate");
30920 return const0_rtx;
30921
30922 case CODE_FOR_sse4_1_blendpd:
30923 case CODE_FOR_avx_vpermilv2df:
30924 case CODE_FOR_xop_vpermil2v2df3:
30925 case CODE_FOR_xop_vpermil2v4sf3:
30926 case CODE_FOR_xop_vpermil2v4df3:
30927 case CODE_FOR_xop_vpermil2v8sf3:
30928 error ("the last argument must be a 2-bit immediate");
30929 return const0_rtx;
30930
30931 case CODE_FOR_avx_vextractf128v4df:
30932 case CODE_FOR_avx_vextractf128v8sf:
30933 case CODE_FOR_avx_vextractf128v8si:
30934 case CODE_FOR_avx_vinsertf128v4df:
30935 case CODE_FOR_avx_vinsertf128v8sf:
30936 case CODE_FOR_avx_vinsertf128v8si:
30937 error ("the last argument must be a 1-bit immediate");
30938 return const0_rtx;
30939
30940 case CODE_FOR_avx_vmcmpv2df3:
30941 case CODE_FOR_avx_vmcmpv4sf3:
30942 case CODE_FOR_avx_cmpv2df3:
30943 case CODE_FOR_avx_cmpv4sf3:
30944 case CODE_FOR_avx_cmpv4df3:
30945 case CODE_FOR_avx_cmpv8sf3:
30946 error ("the last argument must be a 5-bit immediate");
30947 return const0_rtx;
30948
30949 default:
30950 switch (nargs_constant)
30951 {
30952 case 2:
30953 if ((nargs - i) == nargs_constant)
30954 {
30955 error ("the next to last argument must be an 8-bit immediate");
30956 break;
30957 }
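		  /* FALLTHRU */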
30958 case 1:
30959 error ("the last argument must be an 8-bit immediate");
30960 break;
30961 default:
30962 gcc_unreachable ();
30963 }
30964 return const0_rtx;
30965 }
30966 }
30967 else
30968 {
30969 if (VECTOR_MODE_P (mode))
30970 op = safe_vector_operand (op, mode);
30971
30972 /* If we aren't optimizing, only allow one memory operand to
30973 be generated. */
30974 if (memory_operand (op, mode))
30975 num_memory++;
30976
30977 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
30978 {
30979 if (optimize || !match || num_memory > 1)
30980 op = copy_to_mode_reg (mode, op);
30981 }
30982 else
30983 {
30984 op = copy_to_reg (op);
30985 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
30986 }
30987 }
30988
30989 args[i].op = op;
30990 args[i].mode = mode;
30991 }
30992
30993 switch (nargs)
30994 {
30995 case 1:
30996 pat = GEN_FCN (icode) (real_target, args[0].op);
30997 break;
30998 case 2:
30999 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31000 break;
31001 case 3:
31002 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31003 args[2].op);
31004 break;
31005 case 4:
31006 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31007 args[2].op, args[3].op);
31008 break;
31009 default:
31010 gcc_unreachable ();
31011 }
31012
31013 if (! pat)
31014 return 0;
31015
31016 emit_insn (pat);
31017 return target;
31018 }
31019
31020 /* Subroutine of ix86_expand_builtin to take care of special insns
31021    with a variable number of operands.  */
31022
31023 static rtx
31024 ix86_expand_special_args_builtin (const struct builtin_description *d,
31025 tree exp, rtx target)
31026 {
31027 tree arg;
31028 rtx pat, op;
31029 unsigned int i, nargs, arg_adjust, memory;
31030 struct
31031 {
31032 rtx op;
31033 enum machine_mode mode;
31034 } args[3];
31035 enum insn_code icode = d->icode;
31036 bool last_arg_constant = false;
31037 const struct insn_data_d *insn_p = &insn_data[icode];
31038 enum machine_mode tmode = insn_p->operand[0].mode;
31039 enum { load, store } klass;
31040
31041 switch ((enum ix86_builtin_func_type) d->flag)
31042 {
31043 case VOID_FTYPE_VOID:
31044 emit_insn (GEN_FCN (icode) (target));
31045 return 0;
31046 case VOID_FTYPE_UINT64:
31047 case VOID_FTYPE_UNSIGNED:
31048 nargs = 0;
31049 klass = store;
31050 memory = 0;
31051 break;
31052
31053 case INT_FTYPE_VOID:
31054 case UINT64_FTYPE_VOID:
31055 case UNSIGNED_FTYPE_VOID:
31056 nargs = 0;
31057 klass = load;
31058 memory = 0;
31059 break;
31060 case UINT64_FTYPE_PUNSIGNED:
31061 case V2DI_FTYPE_PV2DI:
31062 case V4DI_FTYPE_PV4DI:
31063 case V32QI_FTYPE_PCCHAR:
31064 case V16QI_FTYPE_PCCHAR:
31065 case V8SF_FTYPE_PCV4SF:
31066 case V8SF_FTYPE_PCFLOAT:
31067 case V4SF_FTYPE_PCFLOAT:
31068 case V4DF_FTYPE_PCV2DF:
31069 case V4DF_FTYPE_PCDOUBLE:
31070 case V2DF_FTYPE_PCDOUBLE:
31071 case VOID_FTYPE_PVOID:
31072 nargs = 1;
31073 klass = load;
31074 memory = 0;
31075 break;
31076 case VOID_FTYPE_PV2SF_V4SF:
31077 case VOID_FTYPE_PV4DI_V4DI:
31078 case VOID_FTYPE_PV2DI_V2DI:
31079 case VOID_FTYPE_PCHAR_V32QI:
31080 case VOID_FTYPE_PCHAR_V16QI:
31081 case VOID_FTYPE_PFLOAT_V8SF:
31082 case VOID_FTYPE_PFLOAT_V4SF:
31083 case VOID_FTYPE_PDOUBLE_V4DF:
31084 case VOID_FTYPE_PDOUBLE_V2DF:
31085 case VOID_FTYPE_PLONGLONG_LONGLONG:
31086 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31087 case VOID_FTYPE_PINT_INT:
31088 nargs = 1;
31089 klass = store;
31090 /* Reserve memory operand for target. */
31091 memory = ARRAY_SIZE (args);
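      /* Note: because MEMORY is set past the last argument index, none of
	 the expanded ARGS below is treated as the memory operand; the
	 store target itself provides the memory operand.  */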
31092 break;
31093 case V4SF_FTYPE_V4SF_PCV2SF:
31094 case V2DF_FTYPE_V2DF_PCDOUBLE:
31095 nargs = 2;
31096 klass = load;
31097 memory = 1;
31098 break;
31099 case V8SF_FTYPE_PCV8SF_V8SI:
31100 case V4DF_FTYPE_PCV4DF_V4DI:
31101 case V4SF_FTYPE_PCV4SF_V4SI:
31102 case V2DF_FTYPE_PCV2DF_V2DI:
31103 case V8SI_FTYPE_PCV8SI_V8SI:
31104 case V4DI_FTYPE_PCV4DI_V4DI:
31105 case V4SI_FTYPE_PCV4SI_V4SI:
31106 case V2DI_FTYPE_PCV2DI_V2DI:
31107 nargs = 2;
31108 klass = load;
31109 memory = 0;
31110 break;
31111 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31112 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31113 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31114 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31115 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31116 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31117 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31118 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31119 nargs = 2;
31120 klass = store;
31121 /* Reserve memory operand for target. */
31122 memory = ARRAY_SIZE (args);
31123 break;
31124 case VOID_FTYPE_UINT_UINT_UINT:
31125 case VOID_FTYPE_UINT64_UINT_UINT:
31126 case UCHAR_FTYPE_UINT_UINT_UINT:
31127 case UCHAR_FTYPE_UINT64_UINT_UINT:
31128 nargs = 3;
31129 klass = load;
31130 memory = ARRAY_SIZE (args);
31131 last_arg_constant = true;
31132 break;
31133 default:
31134 gcc_unreachable ();
31135 }
31136
31137 gcc_assert (nargs <= ARRAY_SIZE (args));
31138
31139 if (klass == store)
31140 {
31141 arg = CALL_EXPR_ARG (exp, 0);
31142 op = expand_normal (arg);
31143 gcc_assert (target == 0);
31144 if (memory)
31145 {
31146 if (GET_MODE (op) != Pmode)
31147 op = convert_to_mode (Pmode, op, 1);
31148 target = gen_rtx_MEM (tmode, force_reg (Pmode, op));
31149 }
31150 else
31151 target = force_reg (tmode, op);
31152 arg_adjust = 1;
31153 }
31154 else
31155 {
31156 arg_adjust = 0;
31157 if (optimize
31158 || target == 0
31159 || !register_operand (target, tmode)
31160 || GET_MODE (target) != tmode)
31161 target = gen_reg_rtx (tmode);
31162 }
31163
31164 for (i = 0; i < nargs; i++)
31165 {
31166 enum machine_mode mode = insn_p->operand[i + 1].mode;
31167 bool match;
31168
31169 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31170 op = expand_normal (arg);
31171 match = insn_p->operand[i + 1].predicate (op, mode);
31172
31173 if (last_arg_constant && (i + 1) == nargs)
31174 {
31175 if (!match)
31176 {
31177 if (icode == CODE_FOR_lwp_lwpvalsi3
31178 || icode == CODE_FOR_lwp_lwpinssi3
31179 || icode == CODE_FOR_lwp_lwpvaldi3
31180 || icode == CODE_FOR_lwp_lwpinsdi3)
31181 error ("the last argument must be a 32-bit immediate");
31182 else
31183 error ("the last argument must be an 8-bit immediate");
31184 return const0_rtx;
31185 }
31186 }
31187 else
31188 {
31189 if (i == memory)
31190 {
31191 /* This must be the memory operand. */
31192 if (GET_MODE (op) != Pmode)
31193 op = convert_to_mode (Pmode, op, 1);
31194 op = gen_rtx_MEM (mode, force_reg (Pmode, op));
31195 gcc_assert (GET_MODE (op) == mode
31196 || GET_MODE (op) == VOIDmode);
31197 }
31198 else
31199 {
31200 	  /* This must be a register.  */
31201 if (VECTOR_MODE_P (mode))
31202 op = safe_vector_operand (op, mode);
31203
31204 gcc_assert (GET_MODE (op) == mode
31205 || GET_MODE (op) == VOIDmode);
31206 op = copy_to_mode_reg (mode, op);
31207 }
31208 }
31209
31210 args[i].op = op;
31211 args[i].mode = mode;
31212 }
31213
31214 switch (nargs)
31215 {
31216 case 0:
31217 pat = GEN_FCN (icode) (target);
31218 break;
31219 case 1:
31220 pat = GEN_FCN (icode) (target, args[0].op);
31221 break;
31222 case 2:
31223 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31224 break;
31225 case 3:
31226 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31227 break;
31228 default:
31229 gcc_unreachable ();
31230 }
31231
31232 if (! pat)
31233 return 0;
31234 emit_insn (pat);
31235 return klass == store ? 0 : target;
31236 }
31237
31238 /* Return the integer constant in ARG. Constrain it to be in the range
31239 of the subparts of VEC_TYPE; issue an error if not. */
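/* For example, for a vector type with four subparts (such as V4HImode)
   the selector must be an integer constant in the range 0..3
   (illustrative note derived from the check below).  */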
31240
31241 static int
31242 get_element_number (tree vec_type, tree arg)
31243 {
31244 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31245
31246 if (!host_integerp (arg, 1)
31247 || (elt = tree_low_cst (arg, 1), elt > max))
31248 {
31249 error ("selector must be an integer constant in the range 0..%wi", max);
31250 return 0;
31251 }
31252
31253 return elt;
31254 }
31255
31256 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31257 ix86_expand_vector_init. We DO have language-level syntax for this, in
31258 the form of (type){ init-list }. Except that since we can't place emms
31259 instructions from inside the compiler, we can't allow the use of MMX
31260 registers unless the user explicitly asks for it. So we do *not* define
31261 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31262    we have builtins invoked by mmintrin.h that give us license to emit
31263 these sorts of instructions. */
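/* For instance, mmintrin.h implements _mm_set_pi32 in terms of
   __builtin_ia32_vec_init_v2si (illustrative; the exact header mapping
   may vary between GCC versions).  */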
31264
31265 static rtx
31266 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31267 {
31268 enum machine_mode tmode = TYPE_MODE (type);
31269 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31270 int i, n_elt = GET_MODE_NUNITS (tmode);
31271 rtvec v = rtvec_alloc (n_elt);
31272
31273 gcc_assert (VECTOR_MODE_P (tmode));
31274 gcc_assert (call_expr_nargs (exp) == n_elt);
31275
31276 for (i = 0; i < n_elt; ++i)
31277 {
31278 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31279 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31280 }
31281
31282 if (!target || !register_operand (target, tmode))
31283 target = gen_reg_rtx (tmode);
31284
31285 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31286 return target;
31287 }
31288
31289 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31290 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31291 had a language-level syntax for referencing vector elements. */
31292
31293 static rtx
31294 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31295 {
31296 enum machine_mode tmode, mode0;
31297 tree arg0, arg1;
31298 int elt;
31299 rtx op0;
31300
31301 arg0 = CALL_EXPR_ARG (exp, 0);
31302 arg1 = CALL_EXPR_ARG (exp, 1);
31303
31304 op0 = expand_normal (arg0);
31305 elt = get_element_number (TREE_TYPE (arg0), arg1);
31306
31307 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31308 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31309 gcc_assert (VECTOR_MODE_P (mode0));
31310
31311 op0 = force_reg (mode0, op0);
31312
31313 if (optimize || !target || !register_operand (target, tmode))
31314 target = gen_reg_rtx (tmode);
31315
31316 ix86_expand_vector_extract (true, target, op0, elt);
31317
31318 return target;
31319 }
31320
31321 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31322 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31323 a language-level syntax for referencing vector elements. */
31324
31325 static rtx
31326 ix86_expand_vec_set_builtin (tree exp)
31327 {
31328 enum machine_mode tmode, mode1;
31329 tree arg0, arg1, arg2;
31330 int elt;
31331 rtx op0, op1, target;
31332
31333 arg0 = CALL_EXPR_ARG (exp, 0);
31334 arg1 = CALL_EXPR_ARG (exp, 1);
31335 arg2 = CALL_EXPR_ARG (exp, 2);
31336
31337 tmode = TYPE_MODE (TREE_TYPE (arg0));
31338 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31339 gcc_assert (VECTOR_MODE_P (tmode));
31340
31341 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31342 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31343 elt = get_element_number (TREE_TYPE (arg0), arg2);
31344
31345 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31346 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31347
31348 op0 = force_reg (tmode, op0);
31349 op1 = force_reg (mode1, op1);
31350
31351 /* OP0 is the source of these builtin functions and shouldn't be
31352      modified.  Create a copy, use it and return it as the target.  */
31353 target = gen_reg_rtx (tmode);
31354 emit_move_insn (target, op0);
31355 ix86_expand_vector_set (true, target, op1, elt);
31356
31357 return target;
31358 }
31359
31360 /* Expand an expression EXP that calls a built-in function,
31361 with result going to TARGET if that's convenient
31362 (and in mode MODE if that's convenient).
31363 SUBTARGET may be used as the target for computing one of EXP's operands.
31364 IGNORE is nonzero if the value is to be ignored. */
31365
31366 static rtx
31367 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31368 enum machine_mode mode ATTRIBUTE_UNUSED,
31369 int ignore ATTRIBUTE_UNUSED)
31370 {
31371 const struct builtin_description *d;
31372 size_t i;
31373 enum insn_code icode;
31374 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31375 tree arg0, arg1, arg2, arg3, arg4;
31376 rtx op0, op1, op2, op3, op4, pat, insn;
31377 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31378 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31379
31380 /* For CPU builtins that can be folded, fold first and expand the fold. */
31381 switch (fcode)
31382 {
31383 case IX86_BUILTIN_CPU_INIT:
31384 {
31385 /* Make it call __cpu_indicator_init in libgcc. */
31386 tree call_expr, fndecl, type;
31387 type = build_function_type_list (integer_type_node, NULL_TREE);
31388 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31389 call_expr = build_call_expr (fndecl, 0);
31390 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31391 }
31392 case IX86_BUILTIN_CPU_IS:
31393 case IX86_BUILTIN_CPU_SUPPORTS:
31394 {
31395 tree arg0 = CALL_EXPR_ARG (exp, 0);
31396 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31397 gcc_assert (fold_expr != NULL_TREE);
31398 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31399 }
31400 }
31401
31402 /* Determine whether the builtin function is available under the current ISA.
31403 Originally the builtin was not created if it wasn't applicable to the
31404 current ISA based on the command line switches. With function specific
31405 options, we need to check in the context of the function making the call
31406 whether it is supported. */
31407 if (ix86_builtins_isa[fcode].isa
31408 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31409 {
31410 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31411 NULL, (enum fpmath_unit) 0, false);
31412
31413 if (!opts)
31414 error ("%qE needs unknown isa option", fndecl);
31415 else
31416 {
31417 gcc_assert (opts != NULL);
31418 error ("%qE needs isa option %s", fndecl, opts);
31419 free (opts);
31420 }
31421 return const0_rtx;
31422 }
31423
31424 switch (fcode)
31425 {
31426 case IX86_BUILTIN_MASKMOVQ:
31427 case IX86_BUILTIN_MASKMOVDQU:
31428 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31429 ? CODE_FOR_mmx_maskmovq
31430 : CODE_FOR_sse2_maskmovdqu);
31431 /* Note the arg order is different from the operand order. */
31432 arg1 = CALL_EXPR_ARG (exp, 0);
31433 arg2 = CALL_EXPR_ARG (exp, 1);
31434 arg0 = CALL_EXPR_ARG (exp, 2);
31435 op0 = expand_normal (arg0);
31436 op1 = expand_normal (arg1);
31437 op2 = expand_normal (arg2);
31438 mode0 = insn_data[icode].operand[0].mode;
31439 mode1 = insn_data[icode].operand[1].mode;
31440 mode2 = insn_data[icode].operand[2].mode;
31441
31442 if (GET_MODE (op0) != Pmode)
31443 op0 = convert_to_mode (Pmode, op0, 1);
31444 op0 = gen_rtx_MEM (mode1, force_reg (Pmode, op0));
31445
31446 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31447 op0 = copy_to_mode_reg (mode0, op0);
31448 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31449 op1 = copy_to_mode_reg (mode1, op1);
31450 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31451 op2 = copy_to_mode_reg (mode2, op2);
31452 pat = GEN_FCN (icode) (op0, op1, op2);
31453 if (! pat)
31454 return 0;
31455 emit_insn (pat);
31456 return 0;
31457
31458 case IX86_BUILTIN_LDMXCSR:
31459 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31460 target = assign_386_stack_local (SImode, SLOT_TEMP);
31461 emit_move_insn (target, op0);
31462 emit_insn (gen_sse_ldmxcsr (target));
31463 return 0;
31464
31465 case IX86_BUILTIN_STMXCSR:
31466 target = assign_386_stack_local (SImode, SLOT_TEMP);
31467 emit_insn (gen_sse_stmxcsr (target));
31468 return copy_to_mode_reg (SImode, target);
31469
31470 case IX86_BUILTIN_CLFLUSH:
31471 arg0 = CALL_EXPR_ARG (exp, 0);
31472 op0 = expand_normal (arg0);
31473 icode = CODE_FOR_sse2_clflush;
31474 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31475 {
31476 if (GET_MODE (op0) != Pmode)
31477 op0 = convert_to_mode (Pmode, op0, 1);
31478 op0 = force_reg (Pmode, op0);
31479 }
31480
31481 emit_insn (gen_sse2_clflush (op0));
31482 return 0;
31483
31484 case IX86_BUILTIN_MONITOR:
31485 arg0 = CALL_EXPR_ARG (exp, 0);
31486 arg1 = CALL_EXPR_ARG (exp, 1);
31487 arg2 = CALL_EXPR_ARG (exp, 2);
31488 op0 = expand_normal (arg0);
31489 op1 = expand_normal (arg1);
31490 op2 = expand_normal (arg2);
31491 if (!REG_P (op0))
31492 {
31493 if (GET_MODE (op0) != Pmode)
31494 op0 = convert_to_mode (Pmode, op0, 1);
31495 op0 = force_reg (Pmode, op0);
31496 }
31497 if (!REG_P (op1))
31498 op1 = copy_to_mode_reg (SImode, op1);
31499 if (!REG_P (op2))
31500 op2 = copy_to_mode_reg (SImode, op2);
31501 emit_insn (ix86_gen_monitor (op0, op1, op2));
31502 return 0;
31503
31504 case IX86_BUILTIN_MWAIT:
31505 arg0 = CALL_EXPR_ARG (exp, 0);
31506 arg1 = CALL_EXPR_ARG (exp, 1);
31507 op0 = expand_normal (arg0);
31508 op1 = expand_normal (arg1);
31509 if (!REG_P (op0))
31510 op0 = copy_to_mode_reg (SImode, op0);
31511 if (!REG_P (op1))
31512 op1 = copy_to_mode_reg (SImode, op1);
31513 emit_insn (gen_sse3_mwait (op0, op1));
31514 return 0;
31515
31516 case IX86_BUILTIN_VEC_INIT_V2SI:
31517 case IX86_BUILTIN_VEC_INIT_V4HI:
31518 case IX86_BUILTIN_VEC_INIT_V8QI:
31519 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31520
31521 case IX86_BUILTIN_VEC_EXT_V2DF:
31522 case IX86_BUILTIN_VEC_EXT_V2DI:
31523 case IX86_BUILTIN_VEC_EXT_V4SF:
31524 case IX86_BUILTIN_VEC_EXT_V4SI:
31525 case IX86_BUILTIN_VEC_EXT_V8HI:
31526 case IX86_BUILTIN_VEC_EXT_V2SI:
31527 case IX86_BUILTIN_VEC_EXT_V4HI:
31528 case IX86_BUILTIN_VEC_EXT_V16QI:
31529 return ix86_expand_vec_ext_builtin (exp, target);
31530
31531 case IX86_BUILTIN_VEC_SET_V2DI:
31532 case IX86_BUILTIN_VEC_SET_V4SF:
31533 case IX86_BUILTIN_VEC_SET_V4SI:
31534 case IX86_BUILTIN_VEC_SET_V8HI:
31535 case IX86_BUILTIN_VEC_SET_V4HI:
31536 case IX86_BUILTIN_VEC_SET_V16QI:
31537 return ix86_expand_vec_set_builtin (exp);
31538
31539 case IX86_BUILTIN_INFQ:
31540 case IX86_BUILTIN_HUGE_VALQ:
31541 {
31542 REAL_VALUE_TYPE inf;
31543 rtx tmp;
31544
31545 real_inf (&inf);
31546 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31547
31548 tmp = validize_mem (force_const_mem (mode, tmp));
31549
31550 if (target == 0)
31551 target = gen_reg_rtx (mode);
31552
31553 emit_move_insn (target, tmp);
31554 return target;
31555 }
31556
31557 case IX86_BUILTIN_RDPMC:
31558 case IX86_BUILTIN_RDTSC:
31559 case IX86_BUILTIN_RDTSCP:
31560
31561 op0 = gen_reg_rtx (DImode);
31562 op1 = gen_reg_rtx (DImode);
31563
31564 if (fcode == IX86_BUILTIN_RDPMC)
31565 {
31566 arg0 = CALL_EXPR_ARG (exp, 0);
31567 op2 = expand_normal (arg0);
31568 if (!register_operand (op2, SImode))
31569 op2 = copy_to_mode_reg (SImode, op2);
31570
31571 insn = (TARGET_64BIT
31572 ? gen_rdpmc_rex64 (op0, op1, op2)
31573 : gen_rdpmc (op0, op2));
31574 emit_insn (insn);
31575 }
31576 else if (fcode == IX86_BUILTIN_RDTSC)
31577 {
31578 insn = (TARGET_64BIT
31579 ? gen_rdtsc_rex64 (op0, op1)
31580 : gen_rdtsc (op0));
31581 emit_insn (insn);
31582 }
31583 else
31584 {
31585 op2 = gen_reg_rtx (SImode);
31586
31587 insn = (TARGET_64BIT
31588 ? gen_rdtscp_rex64 (op0, op1, op2)
31589 : gen_rdtscp (op0, op2));
31590 emit_insn (insn);
31591
31592 arg0 = CALL_EXPR_ARG (exp, 0);
31593 op4 = expand_normal (arg0);
31594 if (!address_operand (op4, VOIDmode))
31595 {
31596 op4 = convert_memory_address (Pmode, op4);
31597 op4 = copy_addr_to_reg (op4);
31598 }
31599 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31600 }
31601
31602 if (target == 0)
31603 target = gen_reg_rtx (mode);
31604
31605 if (TARGET_64BIT)
31606 {
31607 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31608 op1, 1, OPTAB_DIRECT);
31609 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31610 op0, 1, OPTAB_DIRECT);
31611 }
31612
31613 emit_move_insn (target, op0);
31614 return target;
31615
31616 case IX86_BUILTIN_FXSAVE:
31617 case IX86_BUILTIN_FXRSTOR:
31618 case IX86_BUILTIN_FXSAVE64:
31619 case IX86_BUILTIN_FXRSTOR64:
31620 switch (fcode)
31621 {
31622 case IX86_BUILTIN_FXSAVE:
31623 icode = CODE_FOR_fxsave;
31624 break;
31625 case IX86_BUILTIN_FXRSTOR:
31626 icode = CODE_FOR_fxrstor;
31627 break;
31628 case IX86_BUILTIN_FXSAVE64:
31629 icode = CODE_FOR_fxsave64;
31630 break;
31631 case IX86_BUILTIN_FXRSTOR64:
31632 icode = CODE_FOR_fxrstor64;
31633 break;
31634 default:
31635 gcc_unreachable ();
31636 }
31637
31638 arg0 = CALL_EXPR_ARG (exp, 0);
31639 op0 = expand_normal (arg0);
31640
31641 if (!address_operand (op0, VOIDmode))
31642 {
31643 op0 = convert_memory_address (Pmode, op0);
31644 op0 = copy_addr_to_reg (op0);
31645 }
31646 op0 = gen_rtx_MEM (BLKmode, op0);
31647
31648 pat = GEN_FCN (icode) (op0);
31649 if (pat)
31650 emit_insn (pat);
31651 return 0;
31652
31653 case IX86_BUILTIN_XSAVE:
31654 case IX86_BUILTIN_XRSTOR:
31655 case IX86_BUILTIN_XSAVE64:
31656 case IX86_BUILTIN_XRSTOR64:
31657 case IX86_BUILTIN_XSAVEOPT:
31658 case IX86_BUILTIN_XSAVEOPT64:
31659 arg0 = CALL_EXPR_ARG (exp, 0);
31660 arg1 = CALL_EXPR_ARG (exp, 1);
31661 op0 = expand_normal (arg0);
31662 op1 = expand_normal (arg1);
31663
31664 if (!address_operand (op0, VOIDmode))
31665 {
31666 op0 = convert_memory_address (Pmode, op0);
31667 op0 = copy_addr_to_reg (op0);
31668 }
31669 op0 = gen_rtx_MEM (BLKmode, op0);
31670
31671 op1 = force_reg (DImode, op1);
31672
31673 if (TARGET_64BIT)
31674 {
31675 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31676 NULL, 1, OPTAB_DIRECT);
31677 switch (fcode)
31678 {
31679 case IX86_BUILTIN_XSAVE:
31680 icode = CODE_FOR_xsave_rex64;
31681 break;
31682 case IX86_BUILTIN_XRSTOR:
31683 icode = CODE_FOR_xrstor_rex64;
31684 break;
31685 case IX86_BUILTIN_XSAVE64:
31686 icode = CODE_FOR_xsave64;
31687 break;
31688 case IX86_BUILTIN_XRSTOR64:
31689 icode = CODE_FOR_xrstor64;
31690 break;
31691 case IX86_BUILTIN_XSAVEOPT:
31692 icode = CODE_FOR_xsaveopt_rex64;
31693 break;
31694 case IX86_BUILTIN_XSAVEOPT64:
31695 icode = CODE_FOR_xsaveopt64;
31696 break;
31697 default:
31698 gcc_unreachable ();
31699 }
31700
31701 op2 = gen_lowpart (SImode, op2);
31702 op1 = gen_lowpart (SImode, op1);
31703 pat = GEN_FCN (icode) (op0, op1, op2);
31704 }
31705 else
31706 {
31707 switch (fcode)
31708 {
31709 case IX86_BUILTIN_XSAVE:
31710 icode = CODE_FOR_xsave;
31711 break;
31712 case IX86_BUILTIN_XRSTOR:
31713 icode = CODE_FOR_xrstor;
31714 break;
31715 case IX86_BUILTIN_XSAVEOPT:
31716 icode = CODE_FOR_xsaveopt;
31717 break;
31718 default:
31719 gcc_unreachable ();
31720 }
31721 pat = GEN_FCN (icode) (op0, op1);
31722 }
31723
31724 if (pat)
31725 emit_insn (pat);
31726 return 0;
31727
31728 case IX86_BUILTIN_LLWPCB:
31729 arg0 = CALL_EXPR_ARG (exp, 0);
31730 op0 = expand_normal (arg0);
31731 icode = CODE_FOR_lwp_llwpcb;
31732 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31733 {
31734 if (GET_MODE (op0) != Pmode)
31735 op0 = convert_to_mode (Pmode, op0, 1);
31736 op0 = force_reg (Pmode, op0);
31737 }
31738 emit_insn (gen_lwp_llwpcb (op0));
31739 return 0;
31740
31741 case IX86_BUILTIN_SLWPCB:
31742 icode = CODE_FOR_lwp_slwpcb;
31743 if (!target
31744 || !insn_data[icode].operand[0].predicate (target, Pmode))
31745 target = gen_reg_rtx (Pmode);
31746 emit_insn (gen_lwp_slwpcb (target));
31747 return target;
31748
31749 case IX86_BUILTIN_BEXTRI32:
31750 case IX86_BUILTIN_BEXTRI64:
31751 arg0 = CALL_EXPR_ARG (exp, 0);
31752 arg1 = CALL_EXPR_ARG (exp, 1);
31753 op0 = expand_normal (arg0);
31754 op1 = expand_normal (arg1);
31755 icode = (fcode == IX86_BUILTIN_BEXTRI32
31756 ? CODE_FOR_tbm_bextri_si
31757 : CODE_FOR_tbm_bextri_di);
31758 if (!CONST_INT_P (op1))
31759 {
31760 	  error ("the last argument must be an immediate");
31761 return const0_rtx;
31762 }
31763 else
31764 {
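	  /* The control word packs the bit-field length in bits 15:8 and
	     the starting (LSB) bit index in bits 7:0; e.g. a value of
	     0x0410 selects a 4-bit field starting at bit 16 (illustrative
	     note derived from the decoding below).  */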
31765 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
31766 unsigned char lsb_index = INTVAL (op1) & 0xFF;
31767 op1 = GEN_INT (length);
31768 op2 = GEN_INT (lsb_index);
31769 pat = GEN_FCN (icode) (target, op0, op1, op2);
31770 if (pat)
31771 emit_insn (pat);
31772 return target;
31773 }
31774
31775 case IX86_BUILTIN_RDRAND16_STEP:
31776 icode = CODE_FOR_rdrandhi_1;
31777 mode0 = HImode;
31778 goto rdrand_step;
31779
31780 case IX86_BUILTIN_RDRAND32_STEP:
31781 icode = CODE_FOR_rdrandsi_1;
31782 mode0 = SImode;
31783 goto rdrand_step;
31784
31785 case IX86_BUILTIN_RDRAND64_STEP:
31786 icode = CODE_FOR_rdranddi_1;
31787 mode0 = DImode;
31788
31789 rdrand_step:
31790 op0 = gen_reg_rtx (mode0);
31791 emit_insn (GEN_FCN (icode) (op0));
31792
31793 arg0 = CALL_EXPR_ARG (exp, 0);
31794 op1 = expand_normal (arg0);
31795 if (!address_operand (op1, VOIDmode))
31796 {
31797 op1 = convert_memory_address (Pmode, op1);
31798 op1 = copy_addr_to_reg (op1);
31799 }
31800 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31801
31802 op1 = gen_reg_rtx (SImode);
31803 emit_move_insn (op1, CONST1_RTX (SImode));
31804
31805 /* Emit SImode conditional move. */
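    /* If the carry flag is clear, no random value was available and the
       zero-extended result in OP2 is selected; otherwise the constant 1 in
       OP1 is returned.  (Hedged note: this relies on the documented RDRAND
       behaviour of clearing the destination register on failure.)  */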
31806 if (mode0 == HImode)
31807 {
31808 op2 = gen_reg_rtx (SImode);
31809 emit_insn (gen_zero_extendhisi2 (op2, op0));
31810 }
31811 else if (mode0 == SImode)
31812 op2 = op0;
31813 else
31814 op2 = gen_rtx_SUBREG (SImode, op0, 0);
31815
31816 if (target == 0)
31817 target = gen_reg_rtx (SImode);
31818
31819 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
31820 const0_rtx);
31821 emit_insn (gen_rtx_SET (VOIDmode, target,
31822 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
31823 return target;
31824
31825 case IX86_BUILTIN_RDSEED16_STEP:
31826 icode = CODE_FOR_rdseedhi_1;
31827 mode0 = HImode;
31828 goto rdseed_step;
31829
31830 case IX86_BUILTIN_RDSEED32_STEP:
31831 icode = CODE_FOR_rdseedsi_1;
31832 mode0 = SImode;
31833 goto rdseed_step;
31834
31835 case IX86_BUILTIN_RDSEED64_STEP:
31836 icode = CODE_FOR_rdseeddi_1;
31837 mode0 = DImode;
31838
31839 rdseed_step:
31840 op0 = gen_reg_rtx (mode0);
31841 emit_insn (GEN_FCN (icode) (op0));
31842
31843 arg0 = CALL_EXPR_ARG (exp, 0);
31844 op1 = expand_normal (arg0);
31845 if (!address_operand (op1, VOIDmode))
31846 {
31847 op1 = convert_memory_address (Pmode, op1);
31848 op1 = copy_addr_to_reg (op1);
31849 }
31850 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
31851
31852 op2 = gen_reg_rtx (QImode);
31853
31854 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
31855 const0_rtx);
31856 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
31857
31858 if (target == 0)
31859 target = gen_reg_rtx (SImode);
31860
31861 emit_insn (gen_zero_extendqisi2 (target, op2));
31862 return target;
31863
31864 case IX86_BUILTIN_ADDCARRYX32:
31865 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
31866 mode0 = SImode;
31867 goto addcarryx;
31868
31869 case IX86_BUILTIN_ADDCARRYX64:
31870 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
31871 mode0 = DImode;
31872
31873 addcarryx:
31874 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
31875 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
31876 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
31877 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
31878
31879 op0 = gen_reg_rtx (QImode);
31880
31881 /* Generate CF from input operand. */
31882 op1 = expand_normal (arg0);
31883 if (GET_MODE (op1) != QImode)
31884 op1 = convert_to_mode (QImode, op1, 1);
31885 op1 = copy_to_mode_reg (QImode, op1);
31886 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
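      /* Note: OP1 + 0xff carries out of the low byte exactly when the
	 incoming carry byte is nonzero, so this recreates CF from C_IN.  */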
31887
31888       /* Generate an ADCX (or ADC) instruction to compute X + Y + CF.  */
31889 op2 = expand_normal (arg1);
31890 op3 = expand_normal (arg2);
31891
31892 if (!REG_P (op2))
31893 op2 = copy_to_mode_reg (mode0, op2);
31894 if (!REG_P (op3))
31895 op3 = copy_to_mode_reg (mode0, op3);
31896
31897 op0 = gen_reg_rtx (mode0);
31898
31899 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
31900 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
31901 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
31902
31903 /* Store the result. */
31904 op4 = expand_normal (arg3);
31905 if (!address_operand (op4, VOIDmode))
31906 {
31907 op4 = convert_memory_address (Pmode, op4);
31908 op4 = copy_addr_to_reg (op4);
31909 }
31910 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
31911
31912 /* Return current CF value. */
31913 if (target == 0)
31914 target = gen_reg_rtx (QImode);
31915
31916 PUT_MODE (pat, QImode);
31917 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
31918 return target;
31919
31920 case IX86_BUILTIN_GATHERSIV2DF:
31921 icode = CODE_FOR_avx2_gathersiv2df;
31922 goto gather_gen;
31923 case IX86_BUILTIN_GATHERSIV4DF:
31924 icode = CODE_FOR_avx2_gathersiv4df;
31925 goto gather_gen;
31926 case IX86_BUILTIN_GATHERDIV2DF:
31927 icode = CODE_FOR_avx2_gatherdiv2df;
31928 goto gather_gen;
31929 case IX86_BUILTIN_GATHERDIV4DF:
31930 icode = CODE_FOR_avx2_gatherdiv4df;
31931 goto gather_gen;
31932 case IX86_BUILTIN_GATHERSIV4SF:
31933 icode = CODE_FOR_avx2_gathersiv4sf;
31934 goto gather_gen;
31935 case IX86_BUILTIN_GATHERSIV8SF:
31936 icode = CODE_FOR_avx2_gathersiv8sf;
31937 goto gather_gen;
31938 case IX86_BUILTIN_GATHERDIV4SF:
31939 icode = CODE_FOR_avx2_gatherdiv4sf;
31940 goto gather_gen;
31941 case IX86_BUILTIN_GATHERDIV8SF:
31942 icode = CODE_FOR_avx2_gatherdiv8sf;
31943 goto gather_gen;
31944 case IX86_BUILTIN_GATHERSIV2DI:
31945 icode = CODE_FOR_avx2_gathersiv2di;
31946 goto gather_gen;
31947 case IX86_BUILTIN_GATHERSIV4DI:
31948 icode = CODE_FOR_avx2_gathersiv4di;
31949 goto gather_gen;
31950 case IX86_BUILTIN_GATHERDIV2DI:
31951 icode = CODE_FOR_avx2_gatherdiv2di;
31952 goto gather_gen;
31953 case IX86_BUILTIN_GATHERDIV4DI:
31954 icode = CODE_FOR_avx2_gatherdiv4di;
31955 goto gather_gen;
31956 case IX86_BUILTIN_GATHERSIV4SI:
31957 icode = CODE_FOR_avx2_gathersiv4si;
31958 goto gather_gen;
31959 case IX86_BUILTIN_GATHERSIV8SI:
31960 icode = CODE_FOR_avx2_gathersiv8si;
31961 goto gather_gen;
31962 case IX86_BUILTIN_GATHERDIV4SI:
31963 icode = CODE_FOR_avx2_gatherdiv4si;
31964 goto gather_gen;
31965 case IX86_BUILTIN_GATHERDIV8SI:
31966 icode = CODE_FOR_avx2_gatherdiv8si;
31967 goto gather_gen;
31968 case IX86_BUILTIN_GATHERALTSIV4DF:
31969 icode = CODE_FOR_avx2_gathersiv4df;
31970 goto gather_gen;
31971 case IX86_BUILTIN_GATHERALTDIV8SF:
31972 icode = CODE_FOR_avx2_gatherdiv8sf;
31973 goto gather_gen;
31974 case IX86_BUILTIN_GATHERALTSIV4DI:
31975 icode = CODE_FOR_avx2_gathersiv4di;
31976 goto gather_gen;
31977 case IX86_BUILTIN_GATHERALTDIV8SI:
31978 icode = CODE_FOR_avx2_gatherdiv8si;
31979 goto gather_gen;
31980
31981 gather_gen:
31982 arg0 = CALL_EXPR_ARG (exp, 0);
31983 arg1 = CALL_EXPR_ARG (exp, 1);
31984 arg2 = CALL_EXPR_ARG (exp, 2);
31985 arg3 = CALL_EXPR_ARG (exp, 3);
31986 arg4 = CALL_EXPR_ARG (exp, 4);
31987 op0 = expand_normal (arg0);
31988 op1 = expand_normal (arg1);
31989 op2 = expand_normal (arg2);
31990 op3 = expand_normal (arg3);
31991 op4 = expand_normal (arg4);
31992 /* Note the arg order is different from the operand order. */
31993 mode0 = insn_data[icode].operand[1].mode;
31994 mode2 = insn_data[icode].operand[3].mode;
31995 mode3 = insn_data[icode].operand[4].mode;
31996 mode4 = insn_data[icode].operand[5].mode;
31997
31998 if (target == NULL_RTX
31999 || GET_MODE (target) != insn_data[icode].operand[0].mode)
32000 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32001 else
32002 subtarget = target;
32003
32004 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32005 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32006 {
32007 rtx half = gen_reg_rtx (V4SImode);
32008 if (!nonimmediate_operand (op2, V8SImode))
32009 op2 = copy_to_mode_reg (V8SImode, op2);
32010 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32011 op2 = half;
32012 }
32013 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32014 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32015 {
32016 rtx (*gen) (rtx, rtx);
32017 rtx half = gen_reg_rtx (mode0);
32018 if (mode0 == V4SFmode)
32019 gen = gen_vec_extract_lo_v8sf;
32020 else
32021 gen = gen_vec_extract_lo_v8si;
32022 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32023 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32024 emit_insn (gen (half, op0));
32025 op0 = half;
32026 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32027 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32028 emit_insn (gen (half, op3));
32029 op3 = half;
32030 }
32031
32032       /* Force the memory operand to use only a base register here.  But
32033 	 we don't want to do this for the memory operands of other builtin
32034 	 functions.  */
32035 if (GET_MODE (op1) != Pmode)
32036 op1 = convert_to_mode (Pmode, op1, 1);
32037 op1 = force_reg (Pmode, op1);
32038
32039 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32040 op0 = copy_to_mode_reg (mode0, op0);
32041 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32042 op1 = copy_to_mode_reg (Pmode, op1);
32043 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32044 op2 = copy_to_mode_reg (mode2, op2);
32045 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32046 op3 = copy_to_mode_reg (mode3, op3);
32047 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32048 {
32049 	  error ("the last argument must be scale 1, 2, 4 or 8");
32050 return const0_rtx;
32051 }
32052
32053 /* Optimize. If mask is known to have all high bits set,
32054 replace op0 with pc_rtx to signal that the instruction
32055 overwrites the whole destination and doesn't use its
32056 previous contents. */
32057 if (optimize)
32058 {
32059 if (TREE_CODE (arg3) == VECTOR_CST)
32060 {
32061 unsigned int negative = 0;
32062 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32063 {
32064 tree cst = VECTOR_CST_ELT (arg3, i);
32065 if (TREE_CODE (cst) == INTEGER_CST
32066 && tree_int_cst_sign_bit (cst))
32067 negative++;
32068 else if (TREE_CODE (cst) == REAL_CST
32069 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32070 negative++;
32071 }
32072 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32073 op0 = pc_rtx;
32074 }
32075 else if (TREE_CODE (arg3) == SSA_NAME)
32076 {
32077 /* Recognize also when mask is like:
32078 __v2df src = _mm_setzero_pd ();
32079 __v2df mask = _mm_cmpeq_pd (src, src);
32080 or
32081 __v8sf src = _mm256_setzero_ps ();
32082 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32083 as that is a cheaper way to load all ones into
32084 a register than having to load a constant from
32085 memory. */
32086 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32087 if (is_gimple_call (def_stmt))
32088 {
32089 tree fndecl = gimple_call_fndecl (def_stmt);
32090 if (fndecl
32091 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32092 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32093 {
32094 case IX86_BUILTIN_CMPPD:
32095 case IX86_BUILTIN_CMPPS:
32096 case IX86_BUILTIN_CMPPD256:
32097 case IX86_BUILTIN_CMPPS256:
32098 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32099 break;
32100 /* FALLTHRU */
32101 case IX86_BUILTIN_CMPEQPD:
32102 case IX86_BUILTIN_CMPEQPS:
32103 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32104 && initializer_zerop (gimple_call_arg (def_stmt,
32105 1)))
32106 op0 = pc_rtx;
32107 break;
32108 default:
32109 break;
32110 }
32111 }
32112 }
32113 }
32114
32115 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32116 if (! pat)
32117 return const0_rtx;
32118 emit_insn (pat);
32119
32120 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32121 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32122 {
32123 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32124 ? V4SFmode : V4SImode;
32125 if (target == NULL_RTX)
32126 target = gen_reg_rtx (tmode);
32127 if (tmode == V4SFmode)
32128 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32129 else
32130 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32131 }
32132 else
32133 target = subtarget;
32134
32135 return target;
32136
32137 case IX86_BUILTIN_XABORT:
32138 icode = CODE_FOR_xabort;
32139 arg0 = CALL_EXPR_ARG (exp, 0);
32140 op0 = expand_normal (arg0);
32141 mode0 = insn_data[icode].operand[0].mode;
32142 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32143 {
32144 	  error ("the argument to xabort must be an 8-bit immediate");
32145 return const0_rtx;
32146 }
32147 emit_insn (gen_xabort (op0));
32148 return 0;
32149
32150 default:
32151 break;
32152 }
32153
32154 for (i = 0, d = bdesc_special_args;
32155 i < ARRAY_SIZE (bdesc_special_args);
32156 i++, d++)
32157 if (d->code == fcode)
32158 return ix86_expand_special_args_builtin (d, exp, target);
32159
32160 for (i = 0, d = bdesc_args;
32161 i < ARRAY_SIZE (bdesc_args);
32162 i++, d++)
32163 if (d->code == fcode)
32164 switch (fcode)
32165 {
32166 case IX86_BUILTIN_FABSQ:
32167 case IX86_BUILTIN_COPYSIGNQ:
32168 if (!TARGET_SSE)
32169 /* Emit a normal call if SSE isn't available. */
32170 return expand_call (exp, target, ignore);
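	  /* FALLTHRU */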
32171 default:
32172 return ix86_expand_args_builtin (d, exp, target);
32173 }
32174
32175 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32176 if (d->code == fcode)
32177 return ix86_expand_sse_comi (d, exp, target);
32178
32179 for (i = 0, d = bdesc_pcmpestr;
32180 i < ARRAY_SIZE (bdesc_pcmpestr);
32181 i++, d++)
32182 if (d->code == fcode)
32183 return ix86_expand_sse_pcmpestr (d, exp, target);
32184
32185 for (i = 0, d = bdesc_pcmpistr;
32186 i < ARRAY_SIZE (bdesc_pcmpistr);
32187 i++, d++)
32188 if (d->code == fcode)
32189 return ix86_expand_sse_pcmpistr (d, exp, target);
32190
32191 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32192 if (d->code == fcode)
32193 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32194 (enum ix86_builtin_func_type)
32195 d->flag, d->comparison);
32196
32197 gcc_unreachable ();
32198 }
32199
32200 /* Returns a function decl for a vectorized version of the builtin function
32201    FNDECL, with result vector type TYPE_OUT and input vector type TYPE_IN,
32202    or NULL_TREE if it is not available.  */
32203
32204 static tree
32205 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32206 tree type_in)
32207 {
32208 enum machine_mode in_mode, out_mode;
32209 int in_n, out_n;
32210 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32211
32212 if (TREE_CODE (type_out) != VECTOR_TYPE
32213 || TREE_CODE (type_in) != VECTOR_TYPE
32214 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32215 return NULL_TREE;
32216
32217 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32218 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32219 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32220 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32221
32222 switch (fn)
32223 {
32224 case BUILT_IN_SQRT:
32225 if (out_mode == DFmode && in_mode == DFmode)
32226 {
32227 if (out_n == 2 && in_n == 2)
32228 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32229 else if (out_n == 4 && in_n == 4)
32230 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32231 }
32232 break;
32233
32234 case BUILT_IN_SQRTF:
32235 if (out_mode == SFmode && in_mode == SFmode)
32236 {
32237 if (out_n == 4 && in_n == 4)
32238 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32239 else if (out_n == 8 && in_n == 8)
32240 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32241 }
32242 break;
32243
32244 case BUILT_IN_IFLOOR:
32245 case BUILT_IN_LFLOOR:
32246 case BUILT_IN_LLFLOOR:
32247 /* The round insn does not trap on denormals. */
32248 if (flag_trapping_math || !TARGET_ROUND)
32249 break;
32250
32251 if (out_mode == SImode && in_mode == DFmode)
32252 {
32253 if (out_n == 4 && in_n == 2)
32254 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32255 else if (out_n == 8 && in_n == 4)
32256 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32257 }
32258 break;
32259
32260 case BUILT_IN_IFLOORF:
32261 case BUILT_IN_LFLOORF:
32262 case BUILT_IN_LLFLOORF:
32263 /* The round insn does not trap on denormals. */
32264 if (flag_trapping_math || !TARGET_ROUND)
32265 break;
32266
32267 if (out_mode == SImode && in_mode == SFmode)
32268 {
32269 if (out_n == 4 && in_n == 4)
32270 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32271 else if (out_n == 8 && in_n == 8)
32272 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32273 }
32274 break;
32275
32276 case BUILT_IN_ICEIL:
32277 case BUILT_IN_LCEIL:
32278 case BUILT_IN_LLCEIL:
32279 /* The round insn does not trap on denormals. */
32280 if (flag_trapping_math || !TARGET_ROUND)
32281 break;
32282
32283 if (out_mode == SImode && in_mode == DFmode)
32284 {
32285 if (out_n == 4 && in_n == 2)
32286 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32287 else if (out_n == 8 && in_n == 4)
32288 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32289 }
32290 break;
32291
32292 case BUILT_IN_ICEILF:
32293 case BUILT_IN_LCEILF:
32294 case BUILT_IN_LLCEILF:
32295 /* The round insn does not trap on denormals. */
32296 if (flag_trapping_math || !TARGET_ROUND)
32297 break;
32298
32299 if (out_mode == SImode && in_mode == SFmode)
32300 {
32301 if (out_n == 4 && in_n == 4)
32302 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32303 else if (out_n == 8 && in_n == 8)
32304 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32305 }
32306 break;
32307
32308 case BUILT_IN_IRINT:
32309 case BUILT_IN_LRINT:
32310 case BUILT_IN_LLRINT:
32311 if (out_mode == SImode && in_mode == DFmode)
32312 {
32313 if (out_n == 4 && in_n == 2)
32314 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32315 else if (out_n == 8 && in_n == 4)
32316 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32317 }
32318 break;
32319
32320 case BUILT_IN_IRINTF:
32321 case BUILT_IN_LRINTF:
32322 case BUILT_IN_LLRINTF:
32323 if (out_mode == SImode && in_mode == SFmode)
32324 {
32325 if (out_n == 4 && in_n == 4)
32326 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32327 else if (out_n == 8 && in_n == 8)
32328 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32329 }
32330 break;
32331
32332 case BUILT_IN_IROUND:
32333 case BUILT_IN_LROUND:
32334 case BUILT_IN_LLROUND:
32335 /* The round insn does not trap on denormals. */
32336 if (flag_trapping_math || !TARGET_ROUND)
32337 break;
32338
32339 if (out_mode == SImode && in_mode == DFmode)
32340 {
32341 if (out_n == 4 && in_n == 2)
32342 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32343 else if (out_n == 8 && in_n == 4)
32344 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32345 }
32346 break;
32347
32348 case BUILT_IN_IROUNDF:
32349 case BUILT_IN_LROUNDF:
32350 case BUILT_IN_LLROUNDF:
32351 /* The round insn does not trap on denormals. */
32352 if (flag_trapping_math || !TARGET_ROUND)
32353 break;
32354
32355 if (out_mode == SImode && in_mode == SFmode)
32356 {
32357 if (out_n == 4 && in_n == 4)
32358 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32359 else if (out_n == 8 && in_n == 8)
32360 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32361 }
32362 break;
32363
32364 case BUILT_IN_COPYSIGN:
32365 if (out_mode == DFmode && in_mode == DFmode)
32366 {
32367 if (out_n == 2 && in_n == 2)
32368 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32369 else if (out_n == 4 && in_n == 4)
32370 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32371 }
32372 break;
32373
32374 case BUILT_IN_COPYSIGNF:
32375 if (out_mode == SFmode && in_mode == SFmode)
32376 {
32377 if (out_n == 4 && in_n == 4)
32378 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32379 else if (out_n == 8 && in_n == 8)
32380 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32381 }
32382 break;
32383
32384 case BUILT_IN_FLOOR:
32385 /* The round insn does not trap on denormals. */
32386 if (flag_trapping_math || !TARGET_ROUND)
32387 break;
32388
32389 if (out_mode == DFmode && in_mode == DFmode)
32390 {
32391 if (out_n == 2 && in_n == 2)
32392 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32393 else if (out_n == 4 && in_n == 4)
32394 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32395 }
32396 break;
32397
32398 case BUILT_IN_FLOORF:
32399 /* The round insn does not trap on denormals. */
32400 if (flag_trapping_math || !TARGET_ROUND)
32401 break;
32402
32403 if (out_mode == SFmode && in_mode == SFmode)
32404 {
32405 if (out_n == 4 && in_n == 4)
32406 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32407 else if (out_n == 8 && in_n == 8)
32408 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32409 }
32410 break;
32411
32412 case BUILT_IN_CEIL:
32413 /* The round insn does not trap on denormals. */
32414 if (flag_trapping_math || !TARGET_ROUND)
32415 break;
32416
32417 if (out_mode == DFmode && in_mode == DFmode)
32418 {
32419 if (out_n == 2 && in_n == 2)
32420 return ix86_builtins[IX86_BUILTIN_CEILPD];
32421 else if (out_n == 4 && in_n == 4)
32422 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32423 }
32424 break;
32425
32426 case BUILT_IN_CEILF:
32427 /* The round insn does not trap on denormals. */
32428 if (flag_trapping_math || !TARGET_ROUND)
32429 break;
32430
32431 if (out_mode == SFmode && in_mode == SFmode)
32432 {
32433 if (out_n == 4 && in_n == 4)
32434 return ix86_builtins[IX86_BUILTIN_CEILPS];
32435 else if (out_n == 8 && in_n == 8)
32436 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32437 }
32438 break;
32439
32440 case BUILT_IN_TRUNC:
32441 /* The round insn does not trap on denormals. */
32442 if (flag_trapping_math || !TARGET_ROUND)
32443 break;
32444
32445 if (out_mode == DFmode && in_mode == DFmode)
32446 {
32447 if (out_n == 2 && in_n == 2)
32448 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32449 else if (out_n == 4 && in_n == 4)
32450 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32451 }
32452 break;
32453
32454 case BUILT_IN_TRUNCF:
32455 /* The round insn does not trap on denormals. */
32456 if (flag_trapping_math || !TARGET_ROUND)
32457 break;
32458
32459 if (out_mode == SFmode && in_mode == SFmode)
32460 {
32461 if (out_n == 4 && in_n == 4)
32462 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32463 else if (out_n == 8 && in_n == 8)
32464 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32465 }
32466 break;
32467
32468 case BUILT_IN_RINT:
32469 /* The round insn does not trap on denormals. */
32470 if (flag_trapping_math || !TARGET_ROUND)
32471 break;
32472
32473 if (out_mode == DFmode && in_mode == DFmode)
32474 {
32475 if (out_n == 2 && in_n == 2)
32476 return ix86_builtins[IX86_BUILTIN_RINTPD];
32477 else if (out_n == 4 && in_n == 4)
32478 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32479 }
32480 break;
32481
32482 case BUILT_IN_RINTF:
32483 /* The round insn does not trap on denormals. */
32484 if (flag_trapping_math || !TARGET_ROUND)
32485 break;
32486
32487 if (out_mode == SFmode && in_mode == SFmode)
32488 {
32489 if (out_n == 4 && in_n == 4)
32490 return ix86_builtins[IX86_BUILTIN_RINTPS];
32491 else if (out_n == 8 && in_n == 8)
32492 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32493 }
32494 break;
32495
32496 case BUILT_IN_ROUND:
32497 /* The round insn does not trap on denormals. */
32498 if (flag_trapping_math || !TARGET_ROUND)
32499 break;
32500
32501 if (out_mode == DFmode && in_mode == DFmode)
32502 {
32503 if (out_n == 2 && in_n == 2)
32504 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32505 else if (out_n == 4 && in_n == 4)
32506 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32507 }
32508 break;
32509
32510 case BUILT_IN_ROUNDF:
32511 /* The round insn does not trap on denormals. */
32512 if (flag_trapping_math || !TARGET_ROUND)
32513 break;
32514
32515 if (out_mode == SFmode && in_mode == SFmode)
32516 {
32517 if (out_n == 4 && in_n == 4)
32518 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32519 else if (out_n == 8 && in_n == 8)
32520 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32521 }
32522 break;
32523
32524 case BUILT_IN_FMA:
32525 if (out_mode == DFmode && in_mode == DFmode)
32526 {
32527 if (out_n == 2 && in_n == 2)
32528 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32529 if (out_n == 4 && in_n == 4)
32530 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32531 }
32532 break;
32533
32534 case BUILT_IN_FMAF:
32535 if (out_mode == SFmode && in_mode == SFmode)
32536 {
32537 if (out_n == 4 && in_n == 4)
32538 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32539 if (out_n == 8 && in_n == 8)
32540 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32541 }
32542 break;
32543
32544 default:
32545 break;
32546 }
32547
32548 /* Dispatch to a handler for a vectorization library. */
32549 if (ix86_veclib_handler)
32550 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32551 type_in);
32552
32553 return NULL_TREE;
32554 }
32555
32556 /* Handler for an SVML-style interface to
32557 a library with vectorized intrinsics. */
32558
32559 static tree
32560 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32561 {
32562 char name[20];
32563 tree fntype, new_fndecl, args;
32564 unsigned arity;
32565 const char *bname;
32566 enum machine_mode el_mode, in_mode;
32567 int n, in_n;
32568
32569 /* The SVML is suitable for unsafe math only. */
32570 if (!flag_unsafe_math_optimizations)
32571 return NULL_TREE;
32572
32573 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32574 n = TYPE_VECTOR_SUBPARTS (type_out);
32575 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32576 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32577 if (el_mode != in_mode
32578 || n != in_n)
32579 return NULL_TREE;
32580
32581 switch (fn)
32582 {
32583 case BUILT_IN_EXP:
32584 case BUILT_IN_LOG:
32585 case BUILT_IN_LOG10:
32586 case BUILT_IN_POW:
32587 case BUILT_IN_TANH:
32588 case BUILT_IN_TAN:
32589 case BUILT_IN_ATAN:
32590 case BUILT_IN_ATAN2:
32591 case BUILT_IN_ATANH:
32592 case BUILT_IN_CBRT:
32593 case BUILT_IN_SINH:
32594 case BUILT_IN_SIN:
32595 case BUILT_IN_ASINH:
32596 case BUILT_IN_ASIN:
32597 case BUILT_IN_COSH:
32598 case BUILT_IN_COS:
32599 case BUILT_IN_ACOSH:
32600 case BUILT_IN_ACOS:
32601 if (el_mode != DFmode || n != 2)
32602 return NULL_TREE;
32603 break;
32604
32605 case BUILT_IN_EXPF:
32606 case BUILT_IN_LOGF:
32607 case BUILT_IN_LOG10F:
32608 case BUILT_IN_POWF:
32609 case BUILT_IN_TANHF:
32610 case BUILT_IN_TANF:
32611 case BUILT_IN_ATANF:
32612 case BUILT_IN_ATAN2F:
32613 case BUILT_IN_ATANHF:
32614 case BUILT_IN_CBRTF:
32615 case BUILT_IN_SINHF:
32616 case BUILT_IN_SINF:
32617 case BUILT_IN_ASINHF:
32618 case BUILT_IN_ASINF:
32619 case BUILT_IN_COSHF:
32620 case BUILT_IN_COSF:
32621 case BUILT_IN_ACOSHF:
32622 case BUILT_IN_ACOSF:
32623 if (el_mode != SFmode || n != 4)
32624 return NULL_TREE;
32625 break;
32626
32627 default:
32628 return NULL_TREE;
32629 }
32630
32631 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32632
32633 if (fn == BUILT_IN_LOGF)
32634 strcpy (name, "vmlsLn4");
32635 else if (fn == BUILT_IN_LOG)
32636 strcpy (name, "vmldLn2");
32637 else if (n == 4)
32638 {
32639 sprintf (name, "vmls%s", bname+10);
32640 name[strlen (name)-1] = '4';
32641 }
32642 else
32643 sprintf (name, "vmld%s2", bname+10);
32644
32645   /* Convert the first letter of the math function name to uppercase.  */
32646 name[4] &= ~0x20;
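  /* For example, BUILT_IN_SINF ("__builtin_sinf") yields "vmlsSin4" and
     BUILT_IN_SIN yields "vmldSin2" (worked example of the mangling above;
     illustrative only).  */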
32647
32648 arity = 0;
32649 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32650 args;
32651 args = TREE_CHAIN (args))
32652 arity++;
32653
32654 if (arity == 1)
32655 fntype = build_function_type_list (type_out, type_in, NULL);
32656 else
32657 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32658
32659 /* Build a function declaration for the vectorized function. */
32660 new_fndecl = build_decl (BUILTINS_LOCATION,
32661 FUNCTION_DECL, get_identifier (name), fntype);
32662 TREE_PUBLIC (new_fndecl) = 1;
32663 DECL_EXTERNAL (new_fndecl) = 1;
32664 DECL_IS_NOVOPS (new_fndecl) = 1;
32665 TREE_READONLY (new_fndecl) = 1;
32666
32667 return new_fndecl;
32668 }
32669
32670 /* Handler for an ACML-style interface to
32671 a library with vectorized intrinsics. */
32672
32673 static tree
32674 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32675 {
32676 char name[20] = "__vr.._";
32677 tree fntype, new_fndecl, args;
32678 unsigned arity;
32679 const char *bname;
32680 enum machine_mode el_mode, in_mode;
32681 int n, in_n;
32682
32683   /* The ACML is 64-bit only and suitable for unsafe math only, as
32684      it does not correctly support parts of IEEE arithmetic with the
32685      required precision, such as denormals.  */
32686 if (!TARGET_64BIT
32687 || !flag_unsafe_math_optimizations)
32688 return NULL_TREE;
32689
32690 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32691 n = TYPE_VECTOR_SUBPARTS (type_out);
32692 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32693 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32694 if (el_mode != in_mode
32695 || n != in_n)
32696 return NULL_TREE;
32697
32698 switch (fn)
32699 {
32700 case BUILT_IN_SIN:
32701 case BUILT_IN_COS:
32702 case BUILT_IN_EXP:
32703 case BUILT_IN_LOG:
32704 case BUILT_IN_LOG2:
32705 case BUILT_IN_LOG10:
32706 name[4] = 'd';
32707 name[5] = '2';
32708 if (el_mode != DFmode
32709 || n != 2)
32710 return NULL_TREE;
32711 break;
32712
32713 case BUILT_IN_SINF:
32714 case BUILT_IN_COSF:
32715 case BUILT_IN_EXPF:
32716 case BUILT_IN_POWF:
32717 case BUILT_IN_LOGF:
32718 case BUILT_IN_LOG2F:
32719 case BUILT_IN_LOG10F:
32720 name[4] = 's';
32721 name[5] = '4';
32722 if (el_mode != SFmode
32723 || n != 4)
32724 return NULL_TREE;
32725 break;
32726
32727 default:
32728 return NULL_TREE;
32729 }
32730
32731 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32732 sprintf (name + 7, "%s", bname+10);
32733
32734 arity = 0;
32735 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32736 args;
32737 args = TREE_CHAIN (args))
32738 arity++;
32739
32740 if (arity == 1)
32741 fntype = build_function_type_list (type_out, type_in, NULL);
32742 else
32743 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32744
32745 /* Build a function declaration for the vectorized function. */
32746 new_fndecl = build_decl (BUILTINS_LOCATION,
32747 FUNCTION_DECL, get_identifier (name), fntype);
32748 TREE_PUBLIC (new_fndecl) = 1;
32749 DECL_EXTERNAL (new_fndecl) = 1;
32750 DECL_IS_NOVOPS (new_fndecl) = 1;
32751 TREE_READONLY (new_fndecl) = 1;
32752
32753 return new_fndecl;
32754 }
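/* Editorial sketch (not part of the original file): a standalone
   illustration of the ACML mangling above.  The "__vr.._" template gets its
   element-type letter and vector width filled in, then the scalar builtin
   name minus its "__builtin_" prefix is appended, so "__builtin_sinf"
   becomes "__vrs4_sinf" and "__builtin_sin" becomes "__vrd2_sin".
   acml_mangle_example is a hypothetical helper.  */
#if 0
#include <stdio.h>

static void
acml_mangle_example (const char *bname, int is_float, char name[20])
{
  sprintf (name, "__vr.._");
  name[4] = is_float ? 's' : 'd';	/* element type */
  name[5] = is_float ? '4' : '2';	/* vector width */
  sprintf (name + 7, "%s", bname + 10);	/* drop "__builtin_" */
}

int
main (void)
{
  char name[20];
  acml_mangle_example ("__builtin_sinf", 1, name);
  printf ("%s\n", name);		/* prints "__vrs4_sinf" */
  return 0;
}
#endif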
32755
32756 /* Returns a decl of a function that implements gather load with
32757 memory type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
32758 Return NULL_TREE if it is not available. */
32759
32760 static tree
32761 ix86_vectorize_builtin_gather (const_tree mem_vectype,
32762 const_tree index_type, int scale)
32763 {
32764 bool si;
32765 enum ix86_builtins code;
32766
32767 if (! TARGET_AVX2)
32768 return NULL_TREE;
32769
32770 if ((TREE_CODE (index_type) != INTEGER_TYPE
32771 && !POINTER_TYPE_P (index_type))
32772 || (TYPE_MODE (index_type) != SImode
32773 && TYPE_MODE (index_type) != DImode))
32774 return NULL_TREE;
32775
32776 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
32777 return NULL_TREE;
32778
32779 /* v*gather* insn sign extends index to pointer mode. */
32780 if (TYPE_PRECISION (index_type) < POINTER_SIZE
32781 && TYPE_UNSIGNED (index_type))
32782 return NULL_TREE;
32783
32784 if (scale <= 0
32785 || scale > 8
32786 || (scale & (scale - 1)) != 0)
32787 return NULL_TREE;
32788
32789 si = TYPE_MODE (index_type) == SImode;
32790 switch (TYPE_MODE (mem_vectype))
32791 {
32792 case V2DFmode:
32793 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
32794 break;
32795 case V4DFmode:
32796 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
32797 break;
32798 case V2DImode:
32799 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
32800 break;
32801 case V4DImode:
32802 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
32803 break;
32804 case V4SFmode:
32805 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
32806 break;
32807 case V8SFmode:
32808 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
32809 break;
32810 case V4SImode:
32811 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
32812 break;
32813 case V8SImode:
32814 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
32815 break;
32816 default:
32817 return NULL_TREE;
32818 }
32819
32820 return ix86_builtins[code];
32821 }
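/* Editorial sketch (not part of the original file): the SCALE validation
   above accepts exactly 1, 2, 4 and 8, the scales encodable by v*gather*;
   the (scale & (scale - 1)) test is the usual power-of-two check.
   valid_gather_scale below is a hypothetical standalone helper.  */
#if 0
#include <assert.h>

static int
valid_gather_scale (int scale)
{
  return !(scale <= 0 || scale > 8 || (scale & (scale - 1)) != 0);
}

int
main (void)
{
  assert (valid_gather_scale (1) && valid_gather_scale (2)
	  && valid_gather_scale (4) && valid_gather_scale (8));
  assert (!valid_gather_scale (0) && !valid_gather_scale (3)
	  && !valid_gather_scale (6) && !valid_gather_scale (16));
  return 0;
}
#endif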
32822
32823 /* Returns a decl of a target-specific builtin that implements the
32824 reciprocal of the function, or NULL_TREE if it is not available. */
32825
32826 static tree
32827 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
32828 bool sqrt ATTRIBUTE_UNUSED)
32829 {
32830 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
32831 && flag_finite_math_only && !flag_trapping_math
32832 && flag_unsafe_math_optimizations))
32833 return NULL_TREE;
32834
32835 if (md_fn)
32836 /* Machine dependent builtins. */
32837 switch (fn)
32838 {
32839 /* Vectorized version of sqrt to rsqrt conversion. */
32840 case IX86_BUILTIN_SQRTPS_NR:
32841 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
32842
32843 case IX86_BUILTIN_SQRTPS_NR256:
32844 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
32845
32846 default:
32847 return NULL_TREE;
32848 }
32849 else
32850 /* Normal builtins. */
32851 switch (fn)
32852 {
32853 /* Sqrt to rsqrt conversion. */
32854 case BUILT_IN_SQRTF:
32855 return ix86_builtins[IX86_BUILTIN_RSQRTF];
32856
32857 default:
32858 return NULL_TREE;
32859 }
32860 }
32861 \f
32862 /* Helper for avx_vpermilps256_operand et al. This is also used by
32863 the expansion functions to turn the parallel back into a mask.
32864 The return value is 0 for no match and the imm8+1 for a match. */
32865
32866 int
32867 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
32868 {
32869 unsigned i, nelt = GET_MODE_NUNITS (mode);
32870 unsigned mask = 0;
32871 unsigned char ipar[8];
32872
32873 if (XVECLEN (par, 0) != (int) nelt)
32874 return 0;
32875
32876 /* Validate that all of the elements are constants, and not totally
32877 out of range. Copy the data into an integral array to make the
32878 subsequent checks easier. */
32879 for (i = 0; i < nelt; ++i)
32880 {
32881 rtx er = XVECEXP (par, 0, i);
32882 unsigned HOST_WIDE_INT ei;
32883
32884 if (!CONST_INT_P (er))
32885 return 0;
32886 ei = INTVAL (er);
32887 if (ei >= nelt)
32888 return 0;
32889 ipar[i] = ei;
32890 }
32891
32892 switch (mode)
32893 {
32894 case V4DFmode:
32895 /* In the 256-bit DFmode case, we can only move elements within
32896 a 128-bit lane. */
32897 for (i = 0; i < 2; ++i)
32898 {
32899 if (ipar[i] >= 2)
32900 return 0;
32901 mask |= ipar[i] << i;
32902 }
32903 for (i = 2; i < 4; ++i)
32904 {
32905 if (ipar[i] < 2)
32906 return 0;
32907 mask |= (ipar[i] - 2) << i;
32908 }
32909 break;
32910
32911 case V8SFmode:
32912 /* In the 256-bit SFmode case, we have full freedom of movement
32913 within the low 128-bit lane, but the high 128-bit lane must
32914 mirror the exact same pattern. */
32915 for (i = 0; i < 4; ++i)
32916 if (ipar[i] + 4 != ipar[i + 4])
32917 return 0;
32918 nelt = 4;
32919 /* FALLTHRU */
32920
32921 case V2DFmode:
32922 case V4SFmode:
32923 /* In the 128-bit case, we've full freedom in the placement of
32924 the elements from the source operand. */
32925 for (i = 0; i < nelt; ++i)
32926 mask |= ipar[i] << (i * (nelt / 2));
32927 break;
32928
32929 default:
32930 gcc_unreachable ();
32931 }
32932
32933 /* Make sure success has a non-zero value by adding one. */
32934 return mask + 1;
32935 }
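/* Editorial sketch (not part of the original file): a standalone version of
   the V4SFmode branch above.  Each selector element contributes two bits to
   the vpermilps immediate (nelt == 4, so the shift is i * 2), so the
   reversing permutation {3,2,1,0} maps to imm8 0x1b and the function above
   would return 0x1b + 1.  vpermilps_imm_example is hypothetical.  */
#if 0
#include <assert.h>

static unsigned
vpermilps_imm_example (const unsigned char sel[4])
{
  unsigned i, mask = 0;
  for (i = 0; i < 4; ++i)
    mask |= sel[i] << (i * 2);
  return mask + 1;			/* non-zero on success */
}

int
main (void)
{
  static const unsigned char rev[4] = { 3, 2, 1, 0 };
  assert (vpermilps_imm_example (rev) == 0x1b + 1);
  return 0;
}
#endif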
32936
32937 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
32938 the expansion functions to turn the parallel back into a mask.
32939 The return value is 0 for no match and the imm8+1 for a match. */
32940
32941 int
32942 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
32943 {
32944 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
32945 unsigned mask = 0;
32946 unsigned char ipar[8];
32947
32948 if (XVECLEN (par, 0) != (int) nelt)
32949 return 0;
32950
32951 /* Validate that all of the elements are constants, and not totally
32952 out of range. Copy the data into an integral array to make the
32953 subsequent checks easier. */
32954 for (i = 0; i < nelt; ++i)
32955 {
32956 rtx er = XVECEXP (par, 0, i);
32957 unsigned HOST_WIDE_INT ei;
32958
32959 if (!CONST_INT_P (er))
32960 return 0;
32961 ei = INTVAL (er);
32962 if (ei >= 2 * nelt)
32963 return 0;
32964 ipar[i] = ei;
32965 }
32966
32967 /* Validate that each half of the permute selects a whole, contiguous operand half. */
32968 for (i = 0; i < nelt2 - 1; ++i)
32969 if (ipar[i] + 1 != ipar[i + 1])
32970 return 0;
32971 for (i = nelt2; i < nelt - 1; ++i)
32972 if (ipar[i] + 1 != ipar[i + 1])
32973 return 0;
32974
32975 /* Reconstruct the mask. */
32976 for (i = 0; i < 2; ++i)
32977 {
32978 unsigned e = ipar[i * nelt2];
32979 if (e % nelt2)
32980 return 0;
32981 e /= nelt2;
32982 mask |= e << (i * 4);
32983 }
32984
32985 /* Make sure success has a non-zero value by adding one. */
32986 return mask + 1;
32987 }
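/* Editorial sketch (not part of the original file): the mask reconstruction
   above for V8SFmode (nelt == 8, nelt2 == 4), with the consecutive-element
   validation omitted for brevity.  The selector { 8,9,10,11, 4,5,6,7 }
   takes the low lane of operand 2 and the high lane of operand 1, i.e.
   imm8 0x12, so the function above would return 0x12 + 1.
   vperm2f128_imm_example is hypothetical.  */
#if 0
#include <assert.h>

static unsigned
vperm2f128_imm_example (const unsigned char sel[8])
{
  unsigned i, mask = 0;
  for (i = 0; i < 2; ++i)
    {
      unsigned e = sel[i * 4];
      if (e % 4)			/* must start on a lane boundary */
	return 0;
      mask |= (e / 4) << (i * 4);
    }
  return mask + 1;
}

int
main (void)
{
  static const unsigned char sel[8] = { 8, 9, 10, 11, 4, 5, 6, 7 };
  assert (vperm2f128_imm_example (sel) == 0x12 + 1);
  return 0;
}
#endif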
32988 \f
32989 /* Store OPERAND to the memory after reload is completed. This means
32990 that we can't easily use assign_stack_local. */
32991 rtx
32992 ix86_force_to_memory (enum machine_mode mode, rtx operand)
32993 {
32994 rtx result;
32995
32996 gcc_assert (reload_completed);
32997 if (ix86_using_red_zone ())
32998 {
32999 result = gen_rtx_MEM (mode,
33000 gen_rtx_PLUS (Pmode,
33001 stack_pointer_rtx,
33002 GEN_INT (-RED_ZONE_SIZE)));
33003 emit_move_insn (result, operand);
33004 }
33005 else if (TARGET_64BIT)
33006 {
33007 switch (mode)
33008 {
33009 case HImode:
33010 case SImode:
33011 operand = gen_lowpart (DImode, operand);
33012 /* FALLTHRU */
33013 case DImode:
33014 emit_insn (
33015 gen_rtx_SET (VOIDmode,
33016 gen_rtx_MEM (DImode,
33017 gen_rtx_PRE_DEC (DImode,
33018 stack_pointer_rtx)),
33019 operand));
33020 break;
33021 default:
33022 gcc_unreachable ();
33023 }
33024 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33025 }
33026 else
33027 {
33028 switch (mode)
33029 {
33030 case DImode:
33031 {
33032 rtx operands[2];
33033 split_double_mode (mode, &operand, 1, operands, operands + 1);
33034 emit_insn (
33035 gen_rtx_SET (VOIDmode,
33036 gen_rtx_MEM (SImode,
33037 gen_rtx_PRE_DEC (Pmode,
33038 stack_pointer_rtx)),
33039 operands[1]));
33040 emit_insn (
33041 gen_rtx_SET (VOIDmode,
33042 gen_rtx_MEM (SImode,
33043 gen_rtx_PRE_DEC (Pmode,
33044 stack_pointer_rtx)),
33045 operands[0]));
33046 }
33047 break;
33048 case HImode:
33049 /* Store HImodes as SImodes. */
33050 operand = gen_lowpart (SImode, operand);
33051 /* FALLTHRU */
33052 case SImode:
33053 emit_insn (
33054 gen_rtx_SET (VOIDmode,
33055 gen_rtx_MEM (GET_MODE (operand),
33056 gen_rtx_PRE_DEC (SImode,
33057 stack_pointer_rtx)),
33058 operand));
33059 break;
33060 default:
33061 gcc_unreachable ();
33062 }
33063 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33064 }
33065 return result;
33066 }
33067
33068 /* Free operand from the memory. */
33069 void
33070 ix86_free_from_memory (enum machine_mode mode)
33071 {
33072 if (!ix86_using_red_zone ())
33073 {
33074 int size;
33075
33076 if (mode == DImode || TARGET_64BIT)
33077 size = 8;
33078 else
33079 size = 4;
33080 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33081 to a pop or add instruction if registers are available. */
33082 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33083 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33084 GEN_INT (size))));
33085 }
33086 }
33087
33088 /* Return a register priority for hard reg REGNO. */
33089 static int
33090 ix86_register_priority (int hard_regno)
33091 {
33092 /* ebp and r13 as the base always want a displacement, and r12 as the
33093 base always wants an index. So discourage their use in an
33094 address. */
33095 if (hard_regno == R12_REG || hard_regno == R13_REG)
33096 return 0;
33097 if (hard_regno == BP_REG)
33098 return 1;
33099 /* New x86-64 int registers result in bigger code size. Discourage
33100 them. */
33101 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33102 return 2;
33103 /* New x86-64 SSE registers result in bigger code size. Discourage
33104 them. */
33105 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33106 return 2;
33107 /* Usage of AX register results in smaller code. Prefer it. */
33108 if (hard_regno == 0)
33109 return 4;
33110 return 3;
33111 }
33112
33113 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33114
33115 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33116 QImode must go into class Q_REGS.
33117 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33118 movdf to do mem-to-mem moves through integer regs. */
33119
33120 static reg_class_t
33121 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33122 {
33123 enum machine_mode mode = GET_MODE (x);
33124
33125 /* We're only allowed to return a subclass of CLASS. Many of the
33126 following checks fail for NO_REGS, so eliminate that early. */
33127 if (regclass == NO_REGS)
33128 return NO_REGS;
33129
33130 /* All classes can load zeros. */
33131 if (x == CONST0_RTX (mode))
33132 return regclass;
33133
33134 /* Force constants into memory if we are loading a (nonzero) constant into
33135 an MMX or SSE register. This is because there are no MMX/SSE instructions
33136 to load from a constant. */
33137 if (CONSTANT_P (x)
33138 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33139 return NO_REGS;
33140
33141 /* Prefer SSE regs only, if we can use them for math. */
33142 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33143 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33144
33145 /* Floating-point constants need more complex checks. */
33146 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33147 {
33148 /* General regs can load everything. */
33149 if (reg_class_subset_p (regclass, GENERAL_REGS))
33150 return regclass;
33151
33152 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33153 zero above. We only want to wind up preferring 80387 registers if
33154 we plan on doing computation with them. */
33155 if (TARGET_80387
33156 && standard_80387_constant_p (x) > 0)
33157 {
33158 /* Limit class to non-sse. */
33159 if (regclass == FLOAT_SSE_REGS)
33160 return FLOAT_REGS;
33161 if (regclass == FP_TOP_SSE_REGS)
33162 return FP_TOP_REG;
33163 if (regclass == FP_SECOND_SSE_REGS)
33164 return FP_SECOND_REG;
33165 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33166 return regclass;
33167 }
33168
33169 return NO_REGS;
33170 }
33171
33172 /* Generally when we see PLUS here, it's the function invariant
33173 (plus soft-fp const_int), which can only be computed into general
33174 regs. */
33175 if (GET_CODE (x) == PLUS)
33176 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33177
33178 /* QImode constants are easy to load, but non-constant QImode data
33179 must go into Q_REGS. */
33180 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33181 {
33182 if (reg_class_subset_p (regclass, Q_REGS))
33183 return regclass;
33184 if (reg_class_subset_p (Q_REGS, regclass))
33185 return Q_REGS;
33186 return NO_REGS;
33187 }
33188
33189 return regclass;
33190 }
33191
33192 /* Discourage putting floating-point values in SSE registers unless
33193 SSE math is being used, and likewise for the 387 registers. */
33194 static reg_class_t
33195 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33196 {
33197 enum machine_mode mode = GET_MODE (x);
33198
33199 /* Restrict the output reload class to the register bank that we are doing
33200 math on. If we would like not to return a subset of CLASS, reject this
33201 alternative: if reload cannot do this, it will still use its choice. */
33202 mode = GET_MODE (x);
33203 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33204 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33205
33206 if (X87_FLOAT_MODE_P (mode))
33207 {
33208 if (regclass == FP_TOP_SSE_REGS)
33209 return FP_TOP_REG;
33210 else if (regclass == FP_SECOND_SSE_REGS)
33211 return FP_SECOND_REG;
33212 else
33213 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33214 }
33215
33216 return regclass;
33217 }
33218
33219 static reg_class_t
33220 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33221 enum machine_mode mode, secondary_reload_info *sri)
33222 {
33223 /* Double-word spills from general registers to non-offsettable memory
33224 references (zero-extended addresses) require special handling. */
33225 if (TARGET_64BIT
33226 && MEM_P (x)
33227 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33228 && rclass == GENERAL_REGS
33229 && !offsettable_memref_p (x))
33230 {
33231 sri->icode = (in_p
33232 ? CODE_FOR_reload_noff_load
33233 : CODE_FOR_reload_noff_store);
33234 /* Add the cost of moving address to a temporary. */
33235 sri->extra_cost = 1;
33236
33237 return NO_REGS;
33238 }
33239
33240 /* QImode spills from non-QI registers require
33241 an intermediate register on 32-bit targets. */
33242 if (!TARGET_64BIT
33243 && !in_p && mode == QImode
33244 && (rclass == GENERAL_REGS
33245 || rclass == LEGACY_REGS
33246 || rclass == NON_Q_REGS
33247 || rclass == SIREG
33248 || rclass == DIREG
33249 || rclass == INDEX_REGS))
33250 {
33251 int regno;
33252
33253 if (REG_P (x))
33254 regno = REGNO (x);
33255 else
33256 regno = -1;
33257
33258 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33259 regno = true_regnum (x);
33260
33261 /* Return Q_REGS if the operand is in memory. */
33262 if (regno == -1)
33263 return Q_REGS;
33264 }
33265
33266 /* This condition handles the corner case where an expression involving
33267 pointers gets vectorized. We're trying to use the address of a
33268 stack slot as a vector initializer.
33269
33270 (set (reg:V2DI 74 [ vect_cst_.2 ])
33271 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33272
33273 Eventually frame gets turned into sp+offset like this:
33274
33275 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33276 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33277 (const_int 392 [0x188]))))
33278
33279 That later gets turned into:
33280
33281 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33282 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33283 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33284
33285 We'll have the following reload recorded:
33286
33287 Reload 0: reload_in (DI) =
33288 (plus:DI (reg/f:DI 7 sp)
33289 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33290 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33291 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33292 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33293 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33294 reload_reg_rtx: (reg:V2DI 22 xmm1)
33295
33296 Which isn't going to work since SSE instructions can't handle scalar
33297 additions. Returning GENERAL_REGS forces the addition into an integer
33298 register, and reload can handle subsequent reloads without problems. */
33299
33300 if (in_p && GET_CODE (x) == PLUS
33301 && SSE_CLASS_P (rclass)
33302 && SCALAR_INT_MODE_P (mode))
33303 return GENERAL_REGS;
33304
33305 return NO_REGS;
33306 }
33307
33308 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33309
33310 static bool
33311 ix86_class_likely_spilled_p (reg_class_t rclass)
33312 {
33313 switch (rclass)
33314 {
33315 case AREG:
33316 case DREG:
33317 case CREG:
33318 case BREG:
33319 case AD_REGS:
33320 case SIREG:
33321 case DIREG:
33322 case SSE_FIRST_REG:
33323 case FP_TOP_REG:
33324 case FP_SECOND_REG:
33325 return true;
33326
33327 default:
33328 break;
33329 }
33330
33331 return false;
33332 }
33333
33334 /* If we are copying between general and FP registers, we need a memory
33335 location. The same is true for SSE and MMX registers.
33336
33337 To optimize register_move_cost performance, allow inline variant.
33338
33339 The macro can't work reliably when one of the CLASSES is a class containing
33340 registers from multiple units (SSE, MMX, integer). We avoid this by never
33341 combining those units in a single alternative in the machine description.
33342 Ensure that this constraint holds to avoid unexpected surprises.
33343
33344 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33345 enforce these sanity checks. */
33346
33347 static inline bool
33348 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33349 enum machine_mode mode, int strict)
33350 {
33351 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33352 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33353 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33354 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33355 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33356 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33357 {
33358 gcc_assert (!strict || lra_in_progress);
33359 return true;
33360 }
33361
33362 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33363 return true;
33364
33365 /* ??? This is a lie. We do have moves between mmx/general, and for
33366 mmx/sse2. But by saying we need secondary memory we discourage the
33367 register allocator from using the mmx registers unless needed. */
33368 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33369 return true;
33370
33371 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33372 {
33373 /* SSE1 doesn't have any direct moves from other classes. */
33374 if (!TARGET_SSE2)
33375 return true;
33376
33377 /* If the target says that inter-unit moves are more expensive
33378 than moving through memory, then don't generate them. */
33379 if (!TARGET_INTER_UNIT_MOVES)
33380 return true;
33381
33382 /* Between SSE and general, we have moves no larger than word size. */
33383 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33384 return true;
33385 }
33386
33387 return false;
33388 }
33389
33390 bool
33391 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33392 enum machine_mode mode, int strict)
33393 {
33394 return inline_secondary_memory_needed (class1, class2, mode, strict);
33395 }
33396
33397 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33398
33399 On the 80386, this is the size of MODE in words,
33400 except in the FP regs, where a single reg is always enough. */
33401
33402 static unsigned char
33403 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33404 {
33405 if (MAYBE_INTEGER_CLASS_P (rclass))
33406 {
33407 if (mode == XFmode)
33408 return (TARGET_64BIT ? 2 : 3);
33409 else if (mode == XCmode)
33410 return (TARGET_64BIT ? 4 : 6);
33411 else
33412 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33413 }
33414 else
33415 {
33416 if (COMPLEX_MODE_P (mode))
33417 return 2;
33418 else
33419 return 1;
33420 }
33421 }
33422
33423 /* Return true if the registers in CLASS cannot represent the change from
33424 modes FROM to TO. */
33425
33426 bool
33427 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33428 enum reg_class regclass)
33429 {
33430 if (from == to)
33431 return false;
33432
33433 /* x87 registers can't do subreg at all, as all values are reformatted
33434 to extended precision. */
33435 if (MAYBE_FLOAT_CLASS_P (regclass))
33436 return true;
33437
33438 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33439 {
33440 /* Vector registers do not support QI or HImode loads. If we don't
33441 disallow a change to these modes, reload will assume it's ok to
33442 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33443 the vec_dupv4hi pattern. */
33444 if (GET_MODE_SIZE (from) < 4)
33445 return true;
33446
33447 /* Vector registers do not support subreg with nonzero offsets, which
33448 are otherwise valid for integer registers. Since we can't see
33449 whether we have a nonzero offset from here, prohibit all
33450 nonparadoxical subregs changing size. */
33451 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33452 return true;
33453 }
33454
33455 return false;
33456 }
33457
33458 /* Return the cost of moving data of mode M between a
33459 register and memory. A value of 2 is the default; this cost is
33460 relative to those in `REGISTER_MOVE_COST'.
33461
33462 This function is used extensively by register_move_cost, which is used to
33463 build tables at startup. Make it inline in this case.
33464 When IN is 2, return the maximum of the in and out move costs.
33465
33466 If moving between registers and memory is more expensive than
33467 between two registers, you should define this macro to express the
33468 relative cost.
33469
33470 Also model the increased cost of moving QImode registers in non-Q_REGS
33471 classes.
33472 */
33473 static inline int
33474 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33475 int in)
33476 {
33477 int cost;
33478 if (FLOAT_CLASS_P (regclass))
33479 {
33480 int index;
33481 switch (mode)
33482 {
33483 case SFmode:
33484 index = 0;
33485 break;
33486 case DFmode:
33487 index = 1;
33488 break;
33489 case XFmode:
33490 index = 2;
33491 break;
33492 default:
33493 return 100;
33494 }
33495 if (in == 2)
33496 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33497 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33498 }
33499 if (SSE_CLASS_P (regclass))
33500 {
33501 int index;
33502 switch (GET_MODE_SIZE (mode))
33503 {
33504 case 4:
33505 index = 0;
33506 break;
33507 case 8:
33508 index = 1;
33509 break;
33510 case 16:
33511 index = 2;
33512 break;
33513 default:
33514 return 100;
33515 }
33516 if (in == 2)
33517 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33518 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33519 }
33520 if (MMX_CLASS_P (regclass))
33521 {
33522 int index;
33523 switch (GET_MODE_SIZE (mode))
33524 {
33525 case 4:
33526 index = 0;
33527 break;
33528 case 8:
33529 index = 1;
33530 break;
33531 default:
33532 return 100;
33533 }
33534 if (in == 2)
33535 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33536 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33537 }
33538 switch (GET_MODE_SIZE (mode))
33539 {
33540 case 1:
33541 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33542 {
33543 if (!in)
33544 return ix86_cost->int_store[0];
33545 if (TARGET_PARTIAL_REG_DEPENDENCY
33546 && optimize_function_for_speed_p (cfun))
33547 cost = ix86_cost->movzbl_load;
33548 else
33549 cost = ix86_cost->int_load[0];
33550 if (in == 2)
33551 return MAX (cost, ix86_cost->int_store[0]);
33552 return cost;
33553 }
33554 else
33555 {
33556 if (in == 2)
33557 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33558 if (in)
33559 return ix86_cost->movzbl_load;
33560 else
33561 return ix86_cost->int_store[0] + 4;
33562 }
33563 break;
33564 case 2:
33565 if (in == 2)
33566 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33567 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33568 default:
33569 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33570 if (mode == TFmode)
33571 mode = XFmode;
33572 if (in == 2)
33573 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33574 else if (in)
33575 cost = ix86_cost->int_load[2];
33576 else
33577 cost = ix86_cost->int_store[2];
33578 return (cost * (((int) GET_MODE_SIZE (mode)
33579 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33580 }
33581 }
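/* Editorial sketch (not part of the original file): the wide-integer branch
   above scales the per-word cost by the number of word-sized moves needed.
   The cost value used here (4) is made up purely for the example; only the
   rounding arithmetic mirrors the code above.  */
#if 0
#include <assert.h>

#define EXAMPLE_UNITS_PER_WORD 4	/* a 32-bit target */

static int
wide_move_cost_example (int per_word_cost, int mode_size)
{
  int words = (mode_size + EXAMPLE_UNITS_PER_WORD - 1) / EXAMPLE_UNITS_PER_WORD;
  return per_word_cost * words;
}

int
main (void)
{
  assert (wide_move_cost_example (4, 8) == 8);		/* DImode on ia32 */
  assert (wide_move_cost_example (4, 12) == 12);	/* XFmode on ia32 */
  return 0;
}
#endif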
33582
33583 static int
33584 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33585 bool in)
33586 {
33587 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33588 }
33589
33590
33591 /* Return the cost of moving data from a register in class CLASS1 to
33592 one in class CLASS2.
33593
33594 It is not required that the cost always equal 2 when FROM is the same as TO;
33595 on some machines it is expensive to move between registers if they are not
33596 general registers. */
33597
33598 static int
33599 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33600 reg_class_t class2_i)
33601 {
33602 enum reg_class class1 = (enum reg_class) class1_i;
33603 enum reg_class class2 = (enum reg_class) class2_i;
33604
33605 /* In case we require secondary memory, compute cost of the store followed
33606 by load. In order to avoid bad register allocation choices, we need
33607 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33608
33609 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33610 {
33611 int cost = 1;
33612
33613 cost += inline_memory_move_cost (mode, class1, 2);
33614 cost += inline_memory_move_cost (mode, class2, 2);
33615
33616 /* When copying from a general purpose register we may emit multiple
33617 stores followed by a single load, causing a memory size mismatch stall.
33618 Count this as an arbitrarily high cost of 20. */
33619 if (targetm.class_max_nregs (class1, mode)
33620 > targetm.class_max_nregs (class2, mode))
33621 cost += 20;
33622
33623 /* In the case of FP/MMX moves, the registers actually overlap, and we
33624 have to switch modes in order to treat them differently. */
33625 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33626 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33627 cost += 20;
33628
33629 return cost;
33630 }
33631
33632 /* Moves between SSE/MMX and integer unit are expensive. */
33633 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33634 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33635
33636 /* ??? By keeping returned value relatively high, we limit the number
33637 of moves between integer and MMX/SSE registers for all targets.
33638 Additionally, high value prevents problem with x86_modes_tieable_p(),
33639 where integer modes in MMX/SSE registers are not tieable
33640 because of missing QImode and HImode moves to, from or between
33641 MMX/SSE registers. */
33642 return MAX (8, ix86_cost->mmxsse_to_integer);
33643
33644 if (MAYBE_FLOAT_CLASS_P (class1))
33645 return ix86_cost->fp_move;
33646 if (MAYBE_SSE_CLASS_P (class1))
33647 return ix86_cost->sse_move;
33648 if (MAYBE_MMX_CLASS_P (class1))
33649 return ix86_cost->mmx_move;
33650 return 2;
33651 }
33652
33653 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33654 MODE. */
33655
33656 bool
33657 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33658 {
33659 /* Only the flags register can hold CCmode values, and it can hold nothing else. */
33660 if (CC_REGNO_P (regno))
33661 return GET_MODE_CLASS (mode) == MODE_CC;
33662 if (GET_MODE_CLASS (mode) == MODE_CC
33663 || GET_MODE_CLASS (mode) == MODE_RANDOM
33664 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33665 return false;
33666 if (STACK_REGNO_P (regno))
33667 return VALID_FP_MODE_P (mode);
33668 if (SSE_REGNO_P (regno))
33669 {
33670 /* We implement the move patterns for all vector modes into and
33671 out of SSE registers, even when no operation instructions
33672 are available. OImode move is available only when AVX is
33673 enabled. */
33674 return ((TARGET_AVX && mode == OImode)
33675 || VALID_AVX256_REG_MODE (mode)
33676 || VALID_SSE_REG_MODE (mode)
33677 || VALID_SSE2_REG_MODE (mode)
33678 || VALID_MMX_REG_MODE (mode)
33679 || VALID_MMX_REG_MODE_3DNOW (mode));
33680 }
33681 if (MMX_REGNO_P (regno))
33682 {
33683 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33684 so if the register is available at all, then we can move data of
33685 the given mode into or out of it. */
33686 return (VALID_MMX_REG_MODE (mode)
33687 || VALID_MMX_REG_MODE_3DNOW (mode));
33688 }
33689
33690 if (mode == QImode)
33691 {
33692 /* Take care with QImode values - they can be in non-QI regs,
33693 but then they do cause partial register stalls. */
33694 if (TARGET_64BIT || QI_REGNO_P (regno))
33695 return true;
33696 if (!TARGET_PARTIAL_REG_STALL)
33697 return true;
33698 return !can_create_pseudo_p ();
33699 }
33700 /* We handle both integer and floats in the general purpose registers. */
33701 else if (VALID_INT_MODE_P (mode))
33702 return true;
33703 else if (VALID_FP_MODE_P (mode))
33704 return true;
33705 else if (VALID_DFP_MODE_P (mode))
33706 return true;
33707 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33708 on to use that value in smaller contexts, this can easily force a
33709 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33710 supporting DImode, allow it. */
33711 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33712 return true;
33713
33714 return false;
33715 }
33716
33717 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33718 tieable integer mode. */
33719
33720 static bool
33721 ix86_tieable_integer_mode_p (enum machine_mode mode)
33722 {
33723 switch (mode)
33724 {
33725 case HImode:
33726 case SImode:
33727 return true;
33728
33729 case QImode:
33730 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33731
33732 case DImode:
33733 return TARGET_64BIT;
33734
33735 default:
33736 return false;
33737 }
33738 }
33739
33740 /* Return true if MODE1 is accessible in a register that can hold MODE2
33741 without copying. That is, all register classes that can hold MODE2
33742 can also hold MODE1. */
33743
33744 bool
33745 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
33746 {
33747 if (mode1 == mode2)
33748 return true;
33749
33750 if (ix86_tieable_integer_mode_p (mode1)
33751 && ix86_tieable_integer_mode_p (mode2))
33752 return true;
33753
33754 /* MODE2 being XFmode implies fp stack or general regs, which means we
33755 can tie any smaller floating point modes to it. Note that we do not
33756 tie this with TFmode. */
33757 if (mode2 == XFmode)
33758 return mode1 == SFmode || mode1 == DFmode;
33759
33760 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
33761 that we can tie it with SFmode. */
33762 if (mode2 == DFmode)
33763 return mode1 == SFmode;
33764
33765 /* If MODE2 is only appropriate for an SSE register, then tie with
33766 any other mode acceptable to SSE registers. */
33767 if (GET_MODE_SIZE (mode2) == 32
33768 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33769 return (GET_MODE_SIZE (mode1) == 32
33770 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33771 if (GET_MODE_SIZE (mode2) == 16
33772 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
33773 return (GET_MODE_SIZE (mode1) == 16
33774 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
33775
33776 /* If MODE2 is appropriate for an MMX register, then tie
33777 with any other mode acceptable to MMX registers. */
33778 if (GET_MODE_SIZE (mode2) == 8
33779 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
33780 return (GET_MODE_SIZE (mode1) == 8
33781 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
33782
33783 return false;
33784 }
33785
33786 /* Return the cost of moving between two registers of mode MODE. */
33787
33788 static int
33789 ix86_set_reg_reg_cost (enum machine_mode mode)
33790 {
33791 unsigned int units = UNITS_PER_WORD;
33792
33793 switch (GET_MODE_CLASS (mode))
33794 {
33795 default:
33796 break;
33797
33798 case MODE_CC:
33799 units = GET_MODE_SIZE (CCmode);
33800 break;
33801
33802 case MODE_FLOAT:
33803 if ((TARGET_SSE && mode == TFmode)
33804 || (TARGET_80387 && mode == XFmode)
33805 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
33806 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
33807 units = GET_MODE_SIZE (mode);
33808 break;
33809
33810 case MODE_COMPLEX_FLOAT:
33811 if ((TARGET_SSE && mode == TCmode)
33812 || (TARGET_80387 && mode == XCmode)
33813 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
33814 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
33815 units = GET_MODE_SIZE (mode);
33816 break;
33817
33818 case MODE_VECTOR_INT:
33819 case MODE_VECTOR_FLOAT:
33820 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
33821 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
33822 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
33823 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
33824 units = GET_MODE_SIZE (mode);
33825 }
33826
33827 /* Return the cost of moving between two registers of mode MODE,
33828 assuming that the move will be in pieces of at most UNITS bytes. */
33829 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
33830 }
33831
33832 /* Compute a (partial) cost for rtx X. Return true if the complete
33833 cost has been computed, and false if subexpressions should be
33834 scanned. In either case, *TOTAL contains the cost result. */
33835
33836 static bool
33837 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
33838 bool speed)
33839 {
33840 enum rtx_code code = (enum rtx_code) code_i;
33841 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
33842 enum machine_mode mode = GET_MODE (x);
33843 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
33844
33845 switch (code)
33846 {
33847 case SET:
33848 if (register_operand (SET_DEST (x), VOIDmode)
33849 && reg_or_0_operand (SET_SRC (x), VOIDmode))
33850 {
33851 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
33852 return true;
33853 }
33854 return false;
33855
33856 case CONST_INT:
33857 case CONST:
33858 case LABEL_REF:
33859 case SYMBOL_REF:
33860 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
33861 *total = 3;
33862 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
33863 *total = 2;
33864 else if (flag_pic && SYMBOLIC_CONST (x)
33865 && (!TARGET_64BIT
33866 || (GET_CODE (x) != LABEL_REF
33867 && (GET_CODE (x) != SYMBOL_REF
33868 || !SYMBOL_REF_LOCAL_P (x)))))
33869 *total = 1;
33870 else
33871 *total = 0;
33872 return true;
33873
33874 case CONST_DOUBLE:
33875 if (mode == VOIDmode)
33876 {
33877 *total = 0;
33878 return true;
33879 }
33880 switch (standard_80387_constant_p (x))
33881 {
33882 case 1: /* 0.0 */
33883 *total = 1;
33884 return true;
33885 default: /* Other constants */
33886 *total = 2;
33887 return true;
33888 case 0:
33889 case -1:
33890 break;
33891 }
33892 if (SSE_FLOAT_MODE_P (mode))
33893 {
33894 case CONST_VECTOR:
33895 switch (standard_sse_constant_p (x))
33896 {
33897 case 0:
33898 break;
33899 case 1: /* 0: xor eliminates false dependency */
33900 *total = 0;
33901 return true;
33902 default: /* -1: cmp contains false dependency */
33903 *total = 1;
33904 return true;
33905 }
33906 }
33907 /* Fall back to (MEM (SYMBOL_REF)), since that's where
33908 it'll probably end up. Add a penalty for size. */
33909 *total = (COSTS_N_INSNS (1)
33910 + (flag_pic != 0 && !TARGET_64BIT)
33911 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
33912 return true;
33913
33914 case ZERO_EXTEND:
33915 /* The zero extension is often completely free on x86_64, so make
33916 it as cheap as possible. */
33917 if (TARGET_64BIT && mode == DImode
33918 && GET_MODE (XEXP (x, 0)) == SImode)
33919 *total = 1;
33920 else if (TARGET_ZERO_EXTEND_WITH_AND)
33921 *total = cost->add;
33922 else
33923 *total = cost->movzx;
33924 return false;
33925
33926 case SIGN_EXTEND:
33927 *total = cost->movsx;
33928 return false;
33929
33930 case ASHIFT:
33931 if (SCALAR_INT_MODE_P (mode)
33932 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
33933 && CONST_INT_P (XEXP (x, 1)))
33934 {
33935 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
33936 if (value == 1)
33937 {
33938 *total = cost->add;
33939 return false;
33940 }
33941 if ((value == 2 || value == 3)
33942 && cost->lea <= cost->shift_const)
33943 {
33944 *total = cost->lea;
33945 return false;
33946 }
33947 }
33948 /* FALLTHRU */
33949
33950 case ROTATE:
33951 case ASHIFTRT:
33952 case LSHIFTRT:
33953 case ROTATERT:
33954 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
33955 {
33956 /* ??? Should be SSE vector operation cost. */
33957 /* At least for published AMD latencies, this really is the same
33958 as the latency for a simple fpu operation like fabs. */
33959 /* V*QImode is emulated with 1-11 insns. */
33960 if (mode == V16QImode || mode == V32QImode)
33961 {
33962 int count = 11;
33963 if (TARGET_XOP && mode == V16QImode)
33964 {
33965 /* For XOP we use vpshab, which requires a broadcast of the
33966 value to the variable shift insn. For constants this
33967 means a V16Q const in mem; even when we can perform the
33968 shift with one insn set the cost to prefer paddb. */
33969 if (CONSTANT_P (XEXP (x, 1)))
33970 {
33971 *total = (cost->fabs
33972 + rtx_cost (XEXP (x, 0), code, 0, speed)
33973 + (speed ? 2 : COSTS_N_BYTES (16)));
33974 return true;
33975 }
33976 count = 3;
33977 }
33978 else if (TARGET_SSSE3)
33979 count = 7;
33980 *total = cost->fabs * count;
33981 }
33982 else
33983 *total = cost->fabs;
33984 }
33985 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33986 {
33987 if (CONST_INT_P (XEXP (x, 1)))
33988 {
33989 if (INTVAL (XEXP (x, 1)) > 32)
33990 *total = cost->shift_const + COSTS_N_INSNS (2);
33991 else
33992 *total = cost->shift_const * 2;
33993 }
33994 else
33995 {
33996 if (GET_CODE (XEXP (x, 1)) == AND)
33997 *total = cost->shift_var * 2;
33998 else
33999 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34000 }
34001 }
34002 else
34003 {
34004 if (CONST_INT_P (XEXP (x, 1)))
34005 *total = cost->shift_const;
34006 else
34007 *total = cost->shift_var;
34008 }
34009 return false;
34010
34011 case FMA:
34012 {
34013 rtx sub;
34014
34015 gcc_assert (FLOAT_MODE_P (mode));
34016 gcc_assert (TARGET_FMA || TARGET_FMA4);
34017
34018 /* ??? SSE scalar/vector cost should be used here. */
34019 /* ??? Bald assumption that fma has the same cost as fmul. */
34020 *total = cost->fmul;
34021 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34022
34023 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34024 sub = XEXP (x, 0);
34025 if (GET_CODE (sub) == NEG)
34026 sub = XEXP (sub, 0);
34027 *total += rtx_cost (sub, FMA, 0, speed);
34028
34029 sub = XEXP (x, 2);
34030 if (GET_CODE (sub) == NEG)
34031 sub = XEXP (sub, 0);
34032 *total += rtx_cost (sub, FMA, 2, speed);
34033 return true;
34034 }
34035
34036 case MULT:
34037 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34038 {
34039 /* ??? SSE scalar cost should be used here. */
34040 *total = cost->fmul;
34041 return false;
34042 }
34043 else if (X87_FLOAT_MODE_P (mode))
34044 {
34045 *total = cost->fmul;
34046 return false;
34047 }
34048 else if (FLOAT_MODE_P (mode))
34049 {
34050 /* ??? SSE vector cost should be used here. */
34051 *total = cost->fmul;
34052 return false;
34053 }
34054 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34055 {
34056 /* V*QImode is emulated with 7-13 insns. */
34057 if (mode == V16QImode || mode == V32QImode)
34058 {
34059 int extra = 11;
34060 if (TARGET_XOP && mode == V16QImode)
34061 extra = 5;
34062 else if (TARGET_SSSE3)
34063 extra = 6;
34064 *total = cost->fmul * 2 + cost->fabs * extra;
34065 }
34066 /* V*DImode is emulated with 5-8 insns. */
34067 else if (mode == V2DImode || mode == V4DImode)
34068 {
34069 if (TARGET_XOP && mode == V2DImode)
34070 *total = cost->fmul * 2 + cost->fabs * 3;
34071 else
34072 *total = cost->fmul * 3 + cost->fabs * 5;
34073 }
34074 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34075 insns, including two PMULUDQ. */
34076 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34077 *total = cost->fmul * 2 + cost->fabs * 5;
34078 else
34079 *total = cost->fmul;
34080 return false;
34081 }
34082 else
34083 {
34084 rtx op0 = XEXP (x, 0);
34085 rtx op1 = XEXP (x, 1);
34086 int nbits;
34087 if (CONST_INT_P (XEXP (x, 1)))
34088 {
34089 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34090 for (nbits = 0; value != 0; value &= value - 1)
34091 nbits++;
34092 }
34093 else
34094 /* This is arbitrary. */
34095 nbits = 7;
34096
34097 /* Compute costs correctly for widening multiplication. */
34098 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34099 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34100 == GET_MODE_SIZE (mode))
34101 {
34102 int is_mulwiden = 0;
34103 enum machine_mode inner_mode = GET_MODE (op0);
34104
34105 if (GET_CODE (op0) == GET_CODE (op1))
34106 is_mulwiden = 1, op1 = XEXP (op1, 0);
34107 else if (CONST_INT_P (op1))
34108 {
34109 if (GET_CODE (op0) == SIGN_EXTEND)
34110 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34111 == INTVAL (op1);
34112 else
34113 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34114 }
34115
34116 if (is_mulwiden)
34117 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34118 }
34119
34120 *total = (cost->mult_init[MODE_INDEX (mode)]
34121 + nbits * cost->mult_bit
34122 + rtx_cost (op0, outer_code, opno, speed)
34123 + rtx_cost (op1, outer_code, opno, speed));
34124
34125 return true;
34126 }
34127
34128 case DIV:
34129 case UDIV:
34130 case MOD:
34131 case UMOD:
34132 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34133 /* ??? SSE cost should be used here. */
34134 *total = cost->fdiv;
34135 else if (X87_FLOAT_MODE_P (mode))
34136 *total = cost->fdiv;
34137 else if (FLOAT_MODE_P (mode))
34138 /* ??? SSE vector cost should be used here. */
34139 *total = cost->fdiv;
34140 else
34141 *total = cost->divide[MODE_INDEX (mode)];
34142 return false;
34143
34144 case PLUS:
34145 if (GET_MODE_CLASS (mode) == MODE_INT
34146 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34147 {
34148 if (GET_CODE (XEXP (x, 0)) == PLUS
34149 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34150 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34151 && CONSTANT_P (XEXP (x, 1)))
34152 {
34153 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34154 if (val == 2 || val == 4 || val == 8)
34155 {
34156 *total = cost->lea;
34157 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34158 outer_code, opno, speed);
34159 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34160 outer_code, opno, speed);
34161 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34162 return true;
34163 }
34164 }
34165 else if (GET_CODE (XEXP (x, 0)) == MULT
34166 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34167 {
34168 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34169 if (val == 2 || val == 4 || val == 8)
34170 {
34171 *total = cost->lea;
34172 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34173 outer_code, opno, speed);
34174 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34175 return true;
34176 }
34177 }
34178 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34179 {
34180 *total = cost->lea;
34181 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34182 outer_code, opno, speed);
34183 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34184 outer_code, opno, speed);
34185 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34186 return true;
34187 }
34188 }
34189 /* FALLTHRU */
34190
34191 case MINUS:
34192 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34193 {
34194 /* ??? SSE cost should be used here. */
34195 *total = cost->fadd;
34196 return false;
34197 }
34198 else if (X87_FLOAT_MODE_P (mode))
34199 {
34200 *total = cost->fadd;
34201 return false;
34202 }
34203 else if (FLOAT_MODE_P (mode))
34204 {
34205 /* ??? SSE vector cost should be used here. */
34206 *total = cost->fadd;
34207 return false;
34208 }
34209 /* FALLTHRU */
34210
34211 case AND:
34212 case IOR:
34213 case XOR:
34214 if (GET_MODE_CLASS (mode) == MODE_INT
34215 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34216 {
34217 *total = (cost->add * 2
34218 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34219 << (GET_MODE (XEXP (x, 0)) != DImode))
34220 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34221 << (GET_MODE (XEXP (x, 1)) != DImode)));
34222 return true;
34223 }
34224 /* FALLTHRU */
34225
34226 case NEG:
34227 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34228 {
34229 /* ??? SSE cost should be used here. */
34230 *total = cost->fchs;
34231 return false;
34232 }
34233 else if (X87_FLOAT_MODE_P (mode))
34234 {
34235 *total = cost->fchs;
34236 return false;
34237 }
34238 else if (FLOAT_MODE_P (mode))
34239 {
34240 /* ??? SSE vector cost should be used here. */
34241 *total = cost->fchs;
34242 return false;
34243 }
34244 /* FALLTHRU */
34245
34246 case NOT:
34247 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34248 {
34249 /* ??? Should be SSE vector operation cost. */
34250 /* At least for published AMD latencies, this really is the same
34251 as the latency for a simple fpu operation like fabs. */
34252 *total = cost->fabs;
34253 }
34254 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34255 *total = cost->add * 2;
34256 else
34257 *total = cost->add;
34258 return false;
34259
34260 case COMPARE:
34261 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34262 && XEXP (XEXP (x, 0), 1) == const1_rtx
34263 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34264 && XEXP (x, 1) == const0_rtx)
34265 {
34266 /* This kind of construct is implemented using test[bwl].
34267 Treat it as if we had an AND. */
34268 *total = (cost->add
34269 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34270 + rtx_cost (const1_rtx, outer_code, opno, speed));
34271 return true;
34272 }
34273 return false;
34274
34275 case FLOAT_EXTEND:
34276 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34277 *total = 0;
34278 return false;
34279
34280 case ABS:
34281 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34282 /* ??? SSE cost should be used here. */
34283 *total = cost->fabs;
34284 else if (X87_FLOAT_MODE_P (mode))
34285 *total = cost->fabs;
34286 else if (FLOAT_MODE_P (mode))
34287 /* ??? SSE vector cost should be used here. */
34288 *total = cost->fabs;
34289 return false;
34290
34291 case SQRT:
34292 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34293 /* ??? SSE cost should be used here. */
34294 *total = cost->fsqrt;
34295 else if (X87_FLOAT_MODE_P (mode))
34296 *total = cost->fsqrt;
34297 else if (FLOAT_MODE_P (mode))
34298 /* ??? SSE vector cost should be used here. */
34299 *total = cost->fsqrt;
34300 return false;
34301
34302 case UNSPEC:
34303 if (XINT (x, 1) == UNSPEC_TP)
34304 *total = 0;
34305 return false;
34306
34307 case VEC_SELECT:
34308 case VEC_CONCAT:
34309 case VEC_MERGE:
34310 case VEC_DUPLICATE:
34311 /* ??? Assume all of these vector manipulation patterns are
34312 recognizable. In which case they all pretty much have the
34313 same cost. */
34314 *total = cost->fabs;
34315 return true;
34316
34317 default:
34318 return false;
34319 }
34320 }
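/* Editorial sketch (not part of the original file): the MULT case above
   estimates the cost of multiplying by a constant from the number of set
   bits in the multiplier, counted by clearing the lowest set bit on each
   iteration (value &= value - 1).  popcount_example is a hypothetical
   standalone copy of that loop.  */
#if 0
#include <assert.h>

static int
popcount_example (unsigned long long value)
{
  int nbits;
  for (nbits = 0; value != 0; value &= value - 1)
    nbits++;
  return nbits;
}

int
main (void)
{
  assert (popcount_example (0) == 0);
  assert (popcount_example (10) == 2);	/* 0b1010 */
  assert (popcount_example (255) == 8);
  return 0;
}
#endif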
34321
34322 #if TARGET_MACHO
34323
34324 static int current_machopic_label_num;
34325
34326 /* Given a symbol name and its associated stub, write out the
34327 definition of the stub. */
34328
34329 void
34330 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34331 {
34332 unsigned int length;
34333 char *binder_name, *symbol_name, lazy_ptr_name[32];
34334 int label = ++current_machopic_label_num;
34335
34336 /* For 64-bit we shouldn't get here. */
34337 gcc_assert (!TARGET_64BIT);
34338
34339 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34340 symb = targetm.strip_name_encoding (symb);
34341
34342 length = strlen (stub);
34343 binder_name = XALLOCAVEC (char, length + 32);
34344 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34345
34346 length = strlen (symb);
34347 symbol_name = XALLOCAVEC (char, length + 32);
34348 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34349
34350 sprintf (lazy_ptr_name, "L%d$lz", label);
34351
34352 if (MACHOPIC_ATT_STUB)
34353 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34354 else if (MACHOPIC_PURE)
34355 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34356 else
34357 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34358
34359 fprintf (file, "%s:\n", stub);
34360 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34361
34362 if (MACHOPIC_ATT_STUB)
34363 {
34364 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34365 }
34366 else if (MACHOPIC_PURE)
34367 {
34368 /* PIC stub. */
34369 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34370 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34371 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34372 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34373 label, lazy_ptr_name, label);
34374 fprintf (file, "\tjmp\t*%%ecx\n");
34375 }
34376 else
34377 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34378
34379 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34380 it needs no stub-binding-helper. */
34381 if (MACHOPIC_ATT_STUB)
34382 return;
34383
34384 fprintf (file, "%s:\n", binder_name);
34385
34386 if (MACHOPIC_PURE)
34387 {
34388 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34389 fprintf (file, "\tpushl\t%%ecx\n");
34390 }
34391 else
34392 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34393
34394 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34395
34396 /* N.B. Keep the correspondence of these
34397 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34398 old-pic/new-pic/non-pic stubs; altering this will break
34399 compatibility with existing dylibs. */
34400 if (MACHOPIC_PURE)
34401 {
34402 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34403 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34404 }
34405 else
34406 /* 16-byte -mdynamic-no-pic stub. */
34407 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34408
34409 fprintf (file, "%s:\n", lazy_ptr_name);
34410 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34411 fprintf (file, ASM_LONG "%s\n", binder_name);
34412 }
34413 #endif /* TARGET_MACHO */
34414
34415 /* Order the registers for register allocator. */
34416
34417 void
34418 x86_order_regs_for_local_alloc (void)
34419 {
34420 int pos = 0;
34421 int i;
34422
34423 /* First allocate the local general purpose registers. */
34424 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34425 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34426 reg_alloc_order [pos++] = i;
34427
34428 /* Global general purpose registers. */
34429 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34430 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34431 reg_alloc_order [pos++] = i;
34432
34433 /* x87 registers come first in case we are doing FP math
34434 using them. */
34435 if (!TARGET_SSE_MATH)
34436 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34437 reg_alloc_order [pos++] = i;
34438
34439 /* SSE registers. */
34440 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34441 reg_alloc_order [pos++] = i;
34442 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34443 reg_alloc_order [pos++] = i;
34444
34445 /* x87 registers. */
34446 if (TARGET_SSE_MATH)
34447 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34448 reg_alloc_order [pos++] = i;
34449
34450 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34451 reg_alloc_order [pos++] = i;
34452
34453 /* Initialize the rest of the array, as we do not allocate some registers
34454 at all. */
34455 while (pos < FIRST_PSEUDO_REGISTER)
34456 reg_alloc_order [pos++] = 0;
34457 }
34458
34459 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34460 in struct attribute_spec.handler. */
34461 static tree
34462 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34463 tree args,
34464 int flags ATTRIBUTE_UNUSED,
34465 bool *no_add_attrs)
34466 {
34467 if (TREE_CODE (*node) != FUNCTION_TYPE
34468 && TREE_CODE (*node) != METHOD_TYPE
34469 && TREE_CODE (*node) != FIELD_DECL
34470 && TREE_CODE (*node) != TYPE_DECL)
34471 {
34472 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34473 name);
34474 *no_add_attrs = true;
34475 return NULL_TREE;
34476 }
34477 if (TARGET_64BIT)
34478 {
34479 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34480 name);
34481 *no_add_attrs = true;
34482 return NULL_TREE;
34483 }
34484 if (is_attribute_p ("callee_pop_aggregate_return", name))
34485 {
34486 tree cst;
34487
34488 cst = TREE_VALUE (args);
34489 if (TREE_CODE (cst) != INTEGER_CST)
34490 {
34491 warning (OPT_Wattributes,
34492 "%qE attribute requires an integer constant argument",
34493 name);
34494 *no_add_attrs = true;
34495 }
34496 else if (compare_tree_int (cst, 0) != 0
34497 && compare_tree_int (cst, 1) != 0)
34498 {
34499 warning (OPT_Wattributes,
34500 "argument to %qE attribute is neither zero, nor one",
34501 name);
34502 *no_add_attrs = true;
34503 }
34504
34505 return NULL_TREE;
34506 }
34507
34508 return NULL_TREE;
34509 }
34510
34511 /* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
34512 struct attribute_spec.handler. */
34513 static tree
34514 ix86_handle_abi_attribute (tree *node, tree name,
34515 tree args ATTRIBUTE_UNUSED,
34516 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34517 {
34518 if (TREE_CODE (*node) != FUNCTION_TYPE
34519 && TREE_CODE (*node) != METHOD_TYPE
34520 && TREE_CODE (*node) != FIELD_DECL
34521 && TREE_CODE (*node) != TYPE_DECL)
34522 {
34523 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34524 name);
34525 *no_add_attrs = true;
34526 return NULL_TREE;
34527 }
34528
34529 /* ms_abi and sysv_abi are mutually exclusive; reject conflicting uses. */
34530 if (is_attribute_p ("ms_abi", name))
34531 {
34532 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34533 {
34534 error ("ms_abi and sysv_abi attributes are not compatible");
34535 }
34536
34537 return NULL_TREE;
34538 }
34539 else if (is_attribute_p ("sysv_abi", name))
34540 {
34541 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34542 {
34543 error ("ms_abi and sysv_abi attributes are not compatible");
34544 }
34545
34546 return NULL_TREE;
34547 }
34548
34549 return NULL_TREE;
34550 }
34551
34552 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34553 struct attribute_spec.handler. */
34554 static tree
34555 ix86_handle_struct_attribute (tree *node, tree name,
34556 tree args ATTRIBUTE_UNUSED,
34557 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34558 {
34559 tree *type = NULL;
34560 if (DECL_P (*node))
34561 {
34562 if (TREE_CODE (*node) == TYPE_DECL)
34563 type = &TREE_TYPE (*node);
34564 }
34565 else
34566 type = node;
34567
34568 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34569 {
34570 warning (OPT_Wattributes, "%qE attribute ignored",
34571 name);
34572 *no_add_attrs = true;
34573 }
34574
34575 else if ((is_attribute_p ("ms_struct", name)
34576 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34577 || ((is_attribute_p ("gcc_struct", name)
34578 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34579 {
34580 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34581 name);
34582 *no_add_attrs = true;
34583 }
34584
34585 return NULL_TREE;
34586 }
34587
34588 static tree
34589 ix86_handle_fndecl_attribute (tree *node, tree name,
34590 tree args ATTRIBUTE_UNUSED,
34591 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34592 {
34593 if (TREE_CODE (*node) != FUNCTION_DECL)
34594 {
34595 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34596 name);
34597 *no_add_attrs = true;
34598 }
34599 return NULL_TREE;
34600 }
34601
34602 static bool
34603 ix86_ms_bitfield_layout_p (const_tree record_type)
34604 {
34605 return ((TARGET_MS_BITFIELD_LAYOUT
34606 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34607 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34608 }
34609
34610 /* Returns an expression indicating where the this parameter is
34611 located on entry to the FUNCTION. */
34612
34613 static rtx
34614 x86_this_parameter (tree function)
34615 {
34616 tree type = TREE_TYPE (function);
34617 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34618 int nregs;
34619
34620 if (TARGET_64BIT)
34621 {
34622 const int *parm_regs;
34623
34624 if (ix86_function_type_abi (type) == MS_ABI)
34625 parm_regs = x86_64_ms_abi_int_parameter_registers;
34626 else
34627 parm_regs = x86_64_int_parameter_registers;
34628 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34629 }
34630
34631 nregs = ix86_function_regparm (type, function);
34632
34633 if (nregs > 0 && !stdarg_p (type))
34634 {
34635 int regno;
34636 unsigned int ccvt = ix86_get_callcvt (type);
34637
34638 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34639 regno = aggr ? DX_REG : CX_REG;
34640 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34641 {
34642 regno = CX_REG;
34643 if (aggr)
34644 return gen_rtx_MEM (SImode,
34645 plus_constant (Pmode, stack_pointer_rtx, 4));
34646 }
34647 else
34648 {
34649 regno = AX_REG;
34650 if (aggr)
34651 {
34652 regno = DX_REG;
34653 if (nregs == 1)
34654 return gen_rtx_MEM (SImode,
34655 plus_constant (Pmode,
34656 stack_pointer_rtx, 4));
34657 }
34658 }
34659 return gen_rtx_REG (SImode, regno);
34660 }
34661
34662 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34663 aggr ? 8 : 4));
34664 }
34665
34666 /* Determine whether x86_output_mi_thunk can succeed. */
34667
34668 static bool
34669 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34670 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34671 HOST_WIDE_INT vcall_offset, const_tree function)
34672 {
34673 /* 64-bit can handle anything. */
34674 if (TARGET_64BIT)
34675 return true;
34676
34677 /* For 32-bit, everything's fine if we have one free register. */
34678 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34679 return true;
34680
34681 /* Need a free register for vcall_offset. */
34682 if (vcall_offset)
34683 return false;
34684
34685 /* Need a free register for GOT references. */
34686 if (flag_pic && !targetm.binds_local_p (function))
34687 return false;
34688
34689 /* Otherwise ok. */
34690 return true;
34691 }
34692
34693 /* Output the assembler code for a thunk function. THUNK_DECL is the
34694 declaration for the thunk function itself, FUNCTION is the decl for
34695 the target function. DELTA is an immediate constant offset to be
34696 added to THIS. If VCALL_OFFSET is nonzero, the word at
34697 *(*this + vcall_offset) should be added to THIS. */
34698
34699 static void
34700 x86_output_mi_thunk (FILE *file,
34701 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34702 HOST_WIDE_INT vcall_offset, tree function)
34703 {
34704 rtx this_param = x86_this_parameter (function);
34705 rtx this_reg, tmp, fnaddr;
34706 unsigned int tmp_regno;
34707
34708 if (TARGET_64BIT)
34709 tmp_regno = R10_REG;
34710 else
34711 {
34712 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34713 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
34714 tmp_regno = AX_REG;
34715 else
34716 tmp_regno = CX_REG;
34717 }
34718
34719 emit_note (NOTE_INSN_PROLOGUE_END);
34720
34721 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34722 pull it in now and let DELTA benefit. */
34723 if (REG_P (this_param))
34724 this_reg = this_param;
34725 else if (vcall_offset)
34726 {
34727 /* Put the this parameter into %eax. */
34728 this_reg = gen_rtx_REG (Pmode, AX_REG);
34729 emit_move_insn (this_reg, this_param);
34730 }
34731 else
34732 this_reg = NULL_RTX;
34733
34734 /* Adjust the this parameter by a fixed constant. */
34735 if (delta)
34736 {
34737 rtx delta_rtx = GEN_INT (delta);
34738 rtx delta_dst = this_reg ? this_reg : this_param;
34739
34740 if (TARGET_64BIT)
34741 {
34742 if (!x86_64_general_operand (delta_rtx, Pmode))
34743 {
34744 tmp = gen_rtx_REG (Pmode, tmp_regno);
34745 emit_move_insn (tmp, delta_rtx);
34746 delta_rtx = tmp;
34747 }
34748 }
34749
34750 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
34751 }
34752
34753 /* Adjust the this parameter by a value stored in the vtable. */
34754 if (vcall_offset)
34755 {
34756 rtx vcall_addr, vcall_mem, this_mem;
34757
34758 tmp = gen_rtx_REG (Pmode, tmp_regno);
34759
34760 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
34761 if (Pmode != ptr_mode)
34762 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
34763 emit_move_insn (tmp, this_mem);
34764
34765 /* Adjust the this parameter. */
34766 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
34767 if (TARGET_64BIT
34768 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
34769 {
34770 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
34771 emit_move_insn (tmp2, GEN_INT (vcall_offset));
34772 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
34773 }
34774
34775 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
34776 if (Pmode != ptr_mode)
34777 emit_insn (gen_addsi_1_zext (this_reg,
34778 gen_rtx_REG (ptr_mode,
34779 REGNO (this_reg)),
34780 vcall_mem));
34781 else
34782 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
34783 }
34784
34785 /* If necessary, drop THIS back to its stack slot. */
34786 if (this_reg && this_reg != this_param)
34787 emit_move_insn (this_param, this_reg);
34788
34789 fnaddr = XEXP (DECL_RTL (function), 0);
34790 if (TARGET_64BIT)
34791 {
34792 if (!flag_pic || targetm.binds_local_p (function)
34793 || cfun->machine->call_abi == MS_ABI)
34794 ;
34795 else
34796 {
34797 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
34798 tmp = gen_rtx_CONST (Pmode, tmp);
34799 fnaddr = gen_rtx_MEM (Pmode, tmp);
34800 }
34801 }
34802 else
34803 {
34804 if (!flag_pic || targetm.binds_local_p (function))
34805 ;
34806 #if TARGET_MACHO
34807 else if (TARGET_MACHO)
34808 {
34809 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
34810 fnaddr = XEXP (fnaddr, 0);
34811 }
34812 #endif /* TARGET_MACHO */
34813 else
34814 {
34815 tmp = gen_rtx_REG (Pmode, CX_REG);
34816 output_set_got (tmp, NULL_RTX);
34817
34818 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
34819 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
34820 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
34821 }
34822 }
34823
34824 /* Our sibling call patterns do not allow memories, because we have no
34825 predicate that can distinguish between frame and non-frame memory.
34826 For our purposes here, we can get away with (ab)using a jump pattern,
34827 because we're going to do no optimization. */
34828 if (MEM_P (fnaddr))
34829 emit_jump_insn (gen_indirect_jump (fnaddr));
34830 else
34831 {
34832 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
34833 fnaddr = legitimize_pic_address (fnaddr,
34834 gen_rtx_REG (Pmode, tmp_regno));
34835
34836 if (!sibcall_insn_operand (fnaddr, word_mode))
34837 {
34838 tmp = gen_rtx_REG (word_mode, tmp_regno);
34839 if (GET_MODE (fnaddr) != word_mode)
34840 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
34841 emit_move_insn (tmp, fnaddr);
34842 fnaddr = tmp;
34843 }
34844
34845 tmp = gen_rtx_MEM (QImode, fnaddr);
34846 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
34847 tmp = emit_call_insn (tmp);
34848 SIBLING_CALL_P (tmp) = 1;
34849 }
34850 emit_barrier ();
34851
34852 /* Emit just enough of rest_of_compilation to get the insns emitted.
34853 Note that use_thunk calls assemble_start_function et al. */
34854 tmp = get_insns ();
34855 shorten_branches (tmp);
34856 final_start_function (tmp, file, 1);
34857 final (tmp, file, 1);
34858 final_end_function ();
34859 }
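/* Illustrative note (not part of the original sources): the thunk emitted
   above is, in effect,

     this += DELTA;
     if (VCALL_OFFSET)
       this += *(vptr + VCALL_OFFSET);   -- vptr being *this, the vtable pointer
     goto FUNCTION;                      -- sibling call

   i.e. it adjusts the incoming `this' pointer and tail-calls FUNCTION.  */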
34860
34861 static void
34862 x86_file_start (void)
34863 {
34864 default_file_start ();
34865 #if TARGET_MACHO
34866 darwin_file_start ();
34867 #endif
34868 if (X86_FILE_START_VERSION_DIRECTIVE)
34869 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
34870 if (X86_FILE_START_FLTUSED)
34871 fputs ("\t.global\t__fltused\n", asm_out_file);
34872 if (ix86_asm_dialect == ASM_INTEL)
34873 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
34874 }
34875
34876 int
34877 x86_field_alignment (tree field, int computed)
34878 {
34879 enum machine_mode mode;
34880 tree type = TREE_TYPE (field);
34881
34882 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
34883 return computed;
34884 mode = TYPE_MODE (strip_array_types (type));
34885 if (mode == DFmode || mode == DCmode
34886 || GET_MODE_CLASS (mode) == MODE_INT
34887 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
34888 return MIN (32, computed);
34889 return computed;
34890 }
34891
34892 /* Output assembler code to FILE to increment profiler label # LABELNO
34893 for profiling a function entry. */
34894 void
34895 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
34896 {
34897 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
34898 : MCOUNT_NAME);
34899
34900 if (TARGET_64BIT)
34901 {
34902 #ifndef NO_PROFILE_COUNTERS
34903 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
34904 #endif
34905
34906 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
34907 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
34908 else
34909 fprintf (file, "\tcall\t%s\n", mcount_name);
34910 }
34911 else if (flag_pic)
34912 {
34913 #ifndef NO_PROFILE_COUNTERS
34914 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
34915 LPREFIX, labelno);
34916 #endif
34917 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
34918 }
34919 else
34920 {
34921 #ifndef NO_PROFILE_COUNTERS
34922 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
34923 LPREFIX, labelno);
34924 #endif
34925 fprintf (file, "\tcall\t%s\n", mcount_name);
34926 }
34927 }
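/* Illustrative note (not part of the original sources): with -pg on a 64-bit
   non-PIC target the code above emits, in essence,

     leaq .LP0(%rip), %r11     -- only when profile counters are enabled
     call mcount_name

   where mcount_name is MCOUNT_NAME, or MCOUNT_NAME_BEFORE_PROLOGUE when
   -mfentry is in use.  */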
34928
34929 /* We don't have exact information about the insn sizes, but we may assume
34930 quite safely that we are informed about all 1-byte insns and memory
34931 address sizes. This is enough to eliminate unnecessary padding in
34932 99% of cases. */
34933
34934 static int
34935 min_insn_size (rtx insn)
34936 {
34937 int l = 0, len;
34938
34939 if (!INSN_P (insn) || !active_insn_p (insn))
34940 return 0;
34941
34942 /* Discard the alignments we've emitted and jump table data. */
34943 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
34944 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
34945 return 0;
34946 if (JUMP_TABLE_DATA_P (insn))
34947 return 0;
34948
34949 /* Important case - calls are always 5 bytes.
34950 It is common to have many calls in a row. */
34951 if (CALL_P (insn)
34952 && symbolic_reference_mentioned_p (PATTERN (insn))
34953 && !SIBLING_CALL_P (insn))
34954 return 5;
34955 len = get_attr_length (insn);
34956 if (len <= 1)
34957 return 1;
34958
34959 /* For normal instructions we rely on get_attr_length being exact,
34960 with a few exceptions. */
34961 if (!JUMP_P (insn))
34962 {
34963 enum attr_type type = get_attr_type (insn);
34964
34965 switch (type)
34966 {
34967 case TYPE_MULTI:
34968 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
34969 || asm_noperands (PATTERN (insn)) >= 0)
34970 return 0;
34971 break;
34972 case TYPE_OTHER:
34973 case TYPE_FCMP:
34974 break;
34975 default:
34976 /* Otherwise trust get_attr_length. */
34977 return len;
34978 }
34979
34980 l = get_attr_length_address (insn);
34981 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
34982 l = 4;
34983 }
34984 if (l)
34985 return 1+l;
34986 else
34987 return 2;
34988 }
34989
34990 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
34991
34992 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
34993 window. */
34994
34995 static void
34996 ix86_avoid_jump_mispredicts (void)
34997 {
34998 rtx insn, start = get_insns ();
34999 int nbytes = 0, njumps = 0;
35000 int isjump = 0;
35001
35002 /* Look for all minimal intervals of instructions containing 4 jumps.
35003 Each interval is bounded by START and INSN.  NBYTES is the total
35004 size of the instructions in the interval, including INSN but not
35005 including START.  When NBYTES is smaller than 16, the end of START
35006 and INSN may end up in the same 16-byte window.
35007
35008 The smallest offset at which INSN can start in that window is when START
35009 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
35010 We therefore emit a p2align to a 16-byte boundary with a max skip of
35011 15 - NBYTES + sizeof (INSN).  */
35012 for (insn = start; insn; insn = NEXT_INSN (insn))
35013 {
35014 int min_size;
35015
35016 if (LABEL_P (insn))
35017 {
35018 int align = label_to_alignment (insn);
35019 int max_skip = label_to_max_skip (insn);
35020
35021 if (max_skip > 15)
35022 max_skip = 15;
35023 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35024 already in the current 16 byte page, because otherwise
35025 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35026 bytes to reach 16 byte boundary. */
35027 if (align <= 0
35028 || (align <= 3 && max_skip != (1 << align) - 1))
35029 max_skip = 0;
35030 if (dump_file)
35031 fprintf (dump_file, "Label %i with max_skip %i\n",
35032 INSN_UID (insn), max_skip);
35033 if (max_skip)
35034 {
35035 while (nbytes + max_skip >= 16)
35036 {
35037 start = NEXT_INSN (start);
35038 if ((JUMP_P (start)
35039 && GET_CODE (PATTERN (start)) != ADDR_VEC
35040 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35041 || CALL_P (start))
35042 njumps--, isjump = 1;
35043 else
35044 isjump = 0;
35045 nbytes -= min_insn_size (start);
35046 }
35047 }
35048 continue;
35049 }
35050
35051 min_size = min_insn_size (insn);
35052 nbytes += min_size;
35053 if (dump_file)
35054 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35055 INSN_UID (insn), min_size);
35056 if ((JUMP_P (insn)
35057 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35058 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35059 || CALL_P (insn))
35060 njumps++;
35061 else
35062 continue;
35063
35064 while (njumps > 3)
35065 {
35066 start = NEXT_INSN (start);
35067 if ((JUMP_P (start)
35068 && GET_CODE (PATTERN (start)) != ADDR_VEC
35069 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35070 || CALL_P (start))
35071 njumps--, isjump = 1;
35072 else
35073 isjump = 0;
35074 nbytes -= min_insn_size (start);
35075 }
35076 gcc_assert (njumps >= 0);
35077 if (dump_file)
35078 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35079 INSN_UID (start), INSN_UID (insn), nbytes);
35080
35081 if (njumps == 3 && isjump && nbytes < 16)
35082 {
35083 int padsize = 15 - nbytes + min_insn_size (insn);
35084
35085 if (dump_file)
35086 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35087 INSN_UID (insn), padsize);
35088 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35089 }
35090 }
35091 }
35092 #endif
35093
35094 /* AMD Athlon runs faster
35095 when RET is not the destination of a conditional jump and is not directly
35096 preceded by another jump instruction.  We avoid the penalty by inserting a
35097 NOP just before the RET instruction in such cases. */
35098 static void
35099 ix86_pad_returns (void)
35100 {
35101 edge e;
35102 edge_iterator ei;
35103
35104 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35105 {
35106 basic_block bb = e->src;
35107 rtx ret = BB_END (bb);
35108 rtx prev;
35109 bool replace = false;
35110
35111 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35112 || optimize_bb_for_size_p (bb))
35113 continue;
35114 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35115 if (active_insn_p (prev) || LABEL_P (prev))
35116 break;
35117 if (prev && LABEL_P (prev))
35118 {
35119 edge e;
35120 edge_iterator ei;
35121
35122 FOR_EACH_EDGE (e, ei, bb->preds)
35123 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35124 && !(e->flags & EDGE_FALLTHRU))
35125 replace = true;
35126 }
35127 if (!replace)
35128 {
35129 prev = prev_active_insn (ret);
35130 if (prev
35131 && ((JUMP_P (prev) && any_condjump_p (prev))
35132 || CALL_P (prev)))
35133 replace = true;
35134 /* Empty functions get a branch mispredict even when
35135 the jump destination is not visible to us. */
35136 if (!prev && !optimize_function_for_size_p (cfun))
35137 replace = true;
35138 }
35139 if (replace)
35140 {
35141 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35142 delete_insn (ret);
35143 }
35144 }
35145 }
35146
35147 /* Count the minimum number of instructions in BB. Return 4 if the
35148 number of instructions >= 4. */
35149
35150 static int
35151 ix86_count_insn_bb (basic_block bb)
35152 {
35153 rtx insn;
35154 int insn_count = 0;
35155
35156 /* Count number of instructions in this block. Return 4 if the number
35157 of instructions >= 4. */
35158 FOR_BB_INSNS (bb, insn)
35159 {
35160 /* This only happens in exit blocks. */
35161 if (JUMP_P (insn)
35162 && ANY_RETURN_P (PATTERN (insn)))
35163 break;
35164
35165 if (NONDEBUG_INSN_P (insn)
35166 && GET_CODE (PATTERN (insn)) != USE
35167 && GET_CODE (PATTERN (insn)) != CLOBBER)
35168 {
35169 insn_count++;
35170 if (insn_count >= 4)
35171 return insn_count;
35172 }
35173 }
35174
35175 return insn_count;
35176 }
35177
35178
35179 /* Count the minimum number of instructions in a code path through BB.
35180 Return 4 if the number of instructions >= 4. */
35181
35182 static int
35183 ix86_count_insn (basic_block bb)
35184 {
35185 edge e;
35186 edge_iterator ei;
35187 int min_prev_count;
35188
35189 /* Only bother counting instructions along paths with no
35190 more than 2 basic blocks between entry and exit. Given
35191 that BB has an edge to exit, determine if a predecessor
35192 of BB has an edge from entry. If so, compute the number
35193 of instructions in the predecessor block. If there
35194 happen to be multiple such blocks, compute the minimum. */
35195 min_prev_count = 4;
35196 FOR_EACH_EDGE (e, ei, bb->preds)
35197 {
35198 edge prev_e;
35199 edge_iterator prev_ei;
35200
35201 if (e->src == ENTRY_BLOCK_PTR)
35202 {
35203 min_prev_count = 0;
35204 break;
35205 }
35206 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35207 {
35208 if (prev_e->src == ENTRY_BLOCK_PTR)
35209 {
35210 int count = ix86_count_insn_bb (e->src);
35211 if (count < min_prev_count)
35212 min_prev_count = count;
35213 break;
35214 }
35215 }
35216 }
35217
35218 if (min_prev_count < 4)
35219 min_prev_count += ix86_count_insn_bb (bb);
35220
35221 return min_prev_count;
35222 }
35223
35224 /* Pad short function to 4 instructions. */
35225
35226 static void
35227 ix86_pad_short_function (void)
35228 {
35229 edge e;
35230 edge_iterator ei;
35231
35232 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35233 {
35234 rtx ret = BB_END (e->src);
35235 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35236 {
35237 int insn_count = ix86_count_insn (e->src);
35238
35239 /* Pad short function. */
35240 if (insn_count < 4)
35241 {
35242 rtx insn = ret;
35243
35244 /* Find epilogue. */
35245 while (insn
35246 && (!NOTE_P (insn)
35247 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35248 insn = PREV_INSN (insn);
35249
35250 if (!insn)
35251 insn = ret;
35252
35253 /* Two NOPs count as one instruction. */
35254 insn_count = 2 * (4 - insn_count);
35255 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35256 }
35257 }
35258 }
35259 }
35260
35261 /* Implement machine specific optimizations. We implement padding of returns
35262 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
35263 static void
35264 ix86_reorg (void)
35265 {
35266 /* We are freeing block_for_insn in the toplev to keep compatibility
35267 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35268 compute_bb_for_insn ();
35269
35270 if (optimize && optimize_function_for_speed_p (cfun))
35271 {
35272 if (TARGET_PAD_SHORT_FUNCTION)
35273 ix86_pad_short_function ();
35274 else if (TARGET_PAD_RETURNS)
35275 ix86_pad_returns ();
35276 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35277 if (TARGET_FOUR_JUMP_LIMIT)
35278 ix86_avoid_jump_mispredicts ();
35279 #endif
35280 }
35281 }
35282
35283 /* Return nonzero when a QImode register that must be represented via a REX
35284 prefix is used. */
35285 bool
35286 x86_extended_QIreg_mentioned_p (rtx insn)
35287 {
35288 int i;
35289 extract_insn_cached (insn);
35290 for (i = 0; i < recog_data.n_operands; i++)
35291 if (GENERAL_REG_P (recog_data.operand[i])
35292 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35293 return true;
35294 return false;
35295 }
35296
35297 /* Return nonzero when P points to a register encoded via a REX prefix.
35298 Called via for_each_rtx. */
35299 static int
35300 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35301 {
35302 unsigned int regno;
35303 if (!REG_P (*p))
35304 return 0;
35305 regno = REGNO (*p);
35306 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35307 }
35308
35309 /* Return true when INSN mentions a register that must be encoded using a REX
35310 prefix. */
35311 bool
35312 x86_extended_reg_mentioned_p (rtx insn)
35313 {
35314 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35315 extended_reg_mentioned_1, NULL);
35316 }
35317
35318 /* If profitable, negate (without causing overflow) integer constant
35319 of mode MODE at location LOC. Return true in this case. */
35320 bool
35321 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35322 {
35323 HOST_WIDE_INT val;
35324
35325 if (!CONST_INT_P (*loc))
35326 return false;
35327
35328 switch (mode)
35329 {
35330 case DImode:
35331 /* DImode x86_64 constants must fit in 32 bits. */
35332 gcc_assert (x86_64_immediate_operand (*loc, mode));
35333
35334 mode = SImode;
35335 break;
35336
35337 case SImode:
35338 case HImode:
35339 case QImode:
35340 break;
35341
35342 default:
35343 gcc_unreachable ();
35344 }
35345
35346 /* Avoid overflows. */
35347 if (mode_signbit_p (mode, *loc))
35348 return false;
35349
35350 val = INTVAL (*loc);
35351
35352 /* Prefer `subl $4,%eax' over `addl $-4,%eax'.  Exception: -128 fits in a
35353 sign-extended 8-bit immediate while 128 does not, so keep -128 and negate 128. */
35354 if ((val < 0 && val != -128)
35355 || val == 128)
35356 {
35357 *loc = GEN_INT (-val);
35358 return true;
35359 }
35360
35361 return false;
35362 }
35363
35364 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35365 optabs would emit if we didn't have TFmode patterns. */
35366
35367 void
35368 x86_emit_floatuns (rtx operands[2])
35369 {
35370 rtx neglab, donelab, i0, i1, f0, in, out;
35371 enum machine_mode mode, inmode;
35372
35373 inmode = GET_MODE (operands[1]);
35374 gcc_assert (inmode == SImode || inmode == DImode);
35375
35376 out = operands[0];
35377 in = force_reg (inmode, operands[1]);
35378 mode = GET_MODE (out);
35379 neglab = gen_label_rtx ();
35380 donelab = gen_label_rtx ();
35381 f0 = gen_reg_rtx (mode);
35382
35383 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35384
35385 expand_float (out, in, 0);
35386
35387 emit_jump_insn (gen_jump (donelab));
35388 emit_barrier ();
35389
35390 emit_label (neglab);
35391
35392 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35393 1, OPTAB_DIRECT);
35394 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35395 1, OPTAB_DIRECT);
35396 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35397
35398 expand_float (f0, i0, 0);
35399
35400 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35401
35402 emit_label (donelab);
35403 }
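/* Illustrative note (not part of the original sources): the expansion above
   corresponds roughly to the usual unsigned-to-float idiom

     if ((signed) in >= 0)
       out = (float_type) in;
     else
       out = (float_type) ((in >> 1) | (in & 1)) * 2;

   where the discarded low bit is OR-ed back into the halved value so that
   the final doubling still rounds correctly.  */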
35404 \f
35405 /* AVX2 does support 32-byte integer vector operations,
35406 thus the longest vector we are faced with is V32QImode. */
35407 #define MAX_VECT_LEN 32
35408
35409 struct expand_vec_perm_d
35410 {
35411 rtx target, op0, op1;
35412 unsigned char perm[MAX_VECT_LEN];
35413 enum machine_mode vmode;
35414 unsigned char nelt;
35415 bool one_operand_p;
35416 bool testing_p;
35417 };
35418
35419 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35420 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35421 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35422
35423 /* Get a vector mode of the same size as the original but with elements
35424 twice as wide. This is only guaranteed to apply to integral vectors. */
35425
35426 static inline enum machine_mode
35427 get_mode_wider_vector (enum machine_mode o)
35428 {
35429 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35430 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35431 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35432 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35433 return n;
35434 }
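/* For example (illustrative, not from the original sources), V16QImode widens
   to V8HImode and V4SImode widens to V2DImode: the same total size with half
   as many elements, each twice as wide.  */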
35435
35436 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35437 with all elements equal to VAR. Return true if successful. */
35438
35439 static bool
35440 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35441 rtx target, rtx val)
35442 {
35443 bool ok;
35444
35445 switch (mode)
35446 {
35447 case V2SImode:
35448 case V2SFmode:
35449 if (!mmx_ok)
35450 return false;
35451 /* FALLTHRU */
35452
35453 case V4DFmode:
35454 case V4DImode:
35455 case V8SFmode:
35456 case V8SImode:
35457 case V2DFmode:
35458 case V2DImode:
35459 case V4SFmode:
35460 case V4SImode:
35461 {
35462 rtx insn, dup;
35463
35464 /* First attempt to recognize VAL as-is. */
35465 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35466 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35467 if (recog_memoized (insn) < 0)
35468 {
35469 rtx seq;
35470 /* If that fails, force VAL into a register. */
35471
35472 start_sequence ();
35473 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35474 seq = get_insns ();
35475 end_sequence ();
35476 if (seq)
35477 emit_insn_before (seq, insn);
35478
35479 ok = recog_memoized (insn) >= 0;
35480 gcc_assert (ok);
35481 }
35482 }
35483 return true;
35484
35485 case V4HImode:
35486 if (!mmx_ok)
35487 return false;
35488 if (TARGET_SSE || TARGET_3DNOW_A)
35489 {
35490 rtx x;
35491
35492 val = gen_lowpart (SImode, val);
35493 x = gen_rtx_TRUNCATE (HImode, val);
35494 x = gen_rtx_VEC_DUPLICATE (mode, x);
35495 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35496 return true;
35497 }
35498 goto widen;
35499
35500 case V8QImode:
35501 if (!mmx_ok)
35502 return false;
35503 goto widen;
35504
35505 case V8HImode:
35506 if (TARGET_SSE2)
35507 {
35508 struct expand_vec_perm_d dperm;
35509 rtx tmp1, tmp2;
35510
35511 permute:
35512 memset (&dperm, 0, sizeof (dperm));
35513 dperm.target = target;
35514 dperm.vmode = mode;
35515 dperm.nelt = GET_MODE_NUNITS (mode);
35516 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35517 dperm.one_operand_p = true;
35518
35519 /* Extend to SImode using a paradoxical SUBREG. */
35520 tmp1 = gen_reg_rtx (SImode);
35521 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35522
35523 /* Insert the SImode value as low element of a V4SImode vector. */
35524 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35525 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35526
35527 ok = (expand_vec_perm_1 (&dperm)
35528 || expand_vec_perm_broadcast_1 (&dperm));
35529 gcc_assert (ok);
35530 return ok;
35531 }
35532 goto widen;
35533
35534 case V16QImode:
35535 if (TARGET_SSE2)
35536 goto permute;
35537 goto widen;
35538
35539 widen:
35540 /* Replicate the value once into the next wider mode and recurse. */
35541 {
35542 enum machine_mode smode, wsmode, wvmode;
35543 rtx x;
35544
35545 smode = GET_MODE_INNER (mode);
35546 wvmode = get_mode_wider_vector (mode);
35547 wsmode = GET_MODE_INNER (wvmode);
35548
35549 val = convert_modes (wsmode, smode, val, true);
35550 x = expand_simple_binop (wsmode, ASHIFT, val,
35551 GEN_INT (GET_MODE_BITSIZE (smode)),
35552 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35553 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
35554
35555 x = gen_lowpart (wvmode, target);
35556 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35557 gcc_assert (ok);
35558 return ok;
35559 }
35560
35561 case V16HImode:
35562 case V32QImode:
35563 {
35564 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35565 rtx x = gen_reg_rtx (hvmode);
35566
35567 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35568 gcc_assert (ok);
35569
35570 x = gen_rtx_VEC_CONCAT (mode, x, x);
35571 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35572 }
35573 return true;
35574
35575 default:
35576 return false;
35577 }
35578 }
35579
35580 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35581 whose ONE_VAR element is VAR, and other elements are zero. Return true
35582 if successful. */
35583
35584 static bool
35585 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35586 rtx target, rtx var, int one_var)
35587 {
35588 enum machine_mode vsimode;
35589 rtx new_target;
35590 rtx x, tmp;
35591 bool use_vector_set = false;
35592
35593 switch (mode)
35594 {
35595 case V2DImode:
35596 /* For SSE4.1, we normally use vector set. But if the second
35597 element is zero and inter-unit moves are OK, we use movq
35598 instead. */
35599 use_vector_set = (TARGET_64BIT
35600 && TARGET_SSE4_1
35601 && !(TARGET_INTER_UNIT_MOVES
35602 && one_var == 0));
35603 break;
35604 case V16QImode:
35605 case V4SImode:
35606 case V4SFmode:
35607 use_vector_set = TARGET_SSE4_1;
35608 break;
35609 case V8HImode:
35610 use_vector_set = TARGET_SSE2;
35611 break;
35612 case V4HImode:
35613 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35614 break;
35615 case V32QImode:
35616 case V16HImode:
35617 case V8SImode:
35618 case V8SFmode:
35619 case V4DFmode:
35620 use_vector_set = TARGET_AVX;
35621 break;
35622 case V4DImode:
35623 /* Use ix86_expand_vector_set in 64bit mode only. */
35624 use_vector_set = TARGET_AVX && TARGET_64BIT;
35625 break;
35626 default:
35627 break;
35628 }
35629
35630 if (use_vector_set)
35631 {
35632 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35633 var = force_reg (GET_MODE_INNER (mode), var);
35634 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35635 return true;
35636 }
35637
35638 switch (mode)
35639 {
35640 case V2SFmode:
35641 case V2SImode:
35642 if (!mmx_ok)
35643 return false;
35644 /* FALLTHRU */
35645
35646 case V2DFmode:
35647 case V2DImode:
35648 if (one_var != 0)
35649 return false;
35650 var = force_reg (GET_MODE_INNER (mode), var);
35651 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35652 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35653 return true;
35654
35655 case V4SFmode:
35656 case V4SImode:
35657 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35658 new_target = gen_reg_rtx (mode);
35659 else
35660 new_target = target;
35661 var = force_reg (GET_MODE_INNER (mode), var);
35662 x = gen_rtx_VEC_DUPLICATE (mode, var);
35663 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35664 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35665 if (one_var != 0)
35666 {
35667 /* We need to shuffle the value to the correct position, so
35668 create a new pseudo to store the intermediate result. */
35669
35670 /* With SSE2, we can use the integer shuffle insns. */
35671 if (mode != V4SFmode && TARGET_SSE2)
35672 {
35673 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35674 const1_rtx,
35675 GEN_INT (one_var == 1 ? 0 : 1),
35676 GEN_INT (one_var == 2 ? 0 : 1),
35677 GEN_INT (one_var == 3 ? 0 : 1)));
35678 if (target != new_target)
35679 emit_move_insn (target, new_target);
35680 return true;
35681 }
35682
35683 /* Otherwise convert the intermediate result to V4SFmode and
35684 use the SSE1 shuffle instructions. */
35685 if (mode != V4SFmode)
35686 {
35687 tmp = gen_reg_rtx (V4SFmode);
35688 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
35689 }
35690 else
35691 tmp = new_target;
35692
35693 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
35694 const1_rtx,
35695 GEN_INT (one_var == 1 ? 0 : 1),
35696 GEN_INT (one_var == 2 ? 0+4 : 1+4),
35697 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
35698
35699 if (mode != V4SFmode)
35700 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
35701 else if (tmp != target)
35702 emit_move_insn (target, tmp);
35703 }
35704 else if (target != new_target)
35705 emit_move_insn (target, new_target);
35706 return true;
35707
35708 case V8HImode:
35709 case V16QImode:
35710 vsimode = V4SImode;
35711 goto widen;
35712 case V4HImode:
35713 case V8QImode:
35714 if (!mmx_ok)
35715 return false;
35716 vsimode = V2SImode;
35717 goto widen;
35718 widen:
35719 if (one_var != 0)
35720 return false;
35721
35722 /* Zero extend the variable element to SImode and recurse. */
35723 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
35724
35725 x = gen_reg_rtx (vsimode);
35726 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
35727 var, one_var))
35728 gcc_unreachable ();
35729
35730 emit_move_insn (target, gen_lowpart (mode, x));
35731 return true;
35732
35733 default:
35734 return false;
35735 }
35736 }
35737
35738 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35739 consisting of the values in VALS. It is known that all elements
35740 except ONE_VAR are constants. Return true if successful. */
35741
35742 static bool
35743 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
35744 rtx target, rtx vals, int one_var)
35745 {
35746 rtx var = XVECEXP (vals, 0, one_var);
35747 enum machine_mode wmode;
35748 rtx const_vec, x;
35749
35750 const_vec = copy_rtx (vals);
35751 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
35752 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
35753
35754 switch (mode)
35755 {
35756 case V2DFmode:
35757 case V2DImode:
35758 case V2SFmode:
35759 case V2SImode:
35760 /* For the two element vectors, it's just as easy to use
35761 the general case. */
35762 return false;
35763
35764 case V4DImode:
35765 /* Use ix86_expand_vector_set in 64bit mode only. */
35766 if (!TARGET_64BIT)
35767 return false;
35768 case V4DFmode:
35769 case V8SFmode:
35770 case V8SImode:
35771 case V16HImode:
35772 case V32QImode:
35773 case V4SFmode:
35774 case V4SImode:
35775 case V8HImode:
35776 case V4HImode:
35777 break;
35778
35779 case V16QImode:
35780 if (TARGET_SSE4_1)
35781 break;
35782 wmode = V8HImode;
35783 goto widen;
35784 case V8QImode:
35785 wmode = V4HImode;
35786 goto widen;
35787 widen:
35788 /* There's no way to set one QImode entry easily. Combine
35789 the variable value with its adjacent constant value, and
35790 promote to an HImode set. */
35791 x = XVECEXP (vals, 0, one_var ^ 1);
35792 if (one_var & 1)
35793 {
35794 var = convert_modes (HImode, QImode, var, true);
35795 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
35796 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35797 x = GEN_INT (INTVAL (x) & 0xff);
35798 }
35799 else
35800 {
35801 var = convert_modes (HImode, QImode, var, true);
35802 x = gen_int_mode (INTVAL (x) << 8, HImode);
35803 }
35804 if (x != const0_rtx)
35805 var = expand_simple_binop (HImode, IOR, var, x, var,
35806 1, OPTAB_LIB_WIDEN);
35807
35808 x = gen_reg_rtx (wmode);
35809 emit_move_insn (x, gen_lowpart (wmode, const_vec));
35810 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
35811
35812 emit_move_insn (target, gen_lowpart (mode, x));
35813 return true;
35814
35815 default:
35816 return false;
35817 }
35818
35819 emit_move_insn (target, const_vec);
35820 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35821 return true;
35822 }
35823
35824 /* A subroutine of ix86_expand_vector_init_general. Use vector
35825 concatenate to handle the most general case: all values variable,
35826 and none identical. */
35827
35828 static void
35829 ix86_expand_vector_init_concat (enum machine_mode mode,
35830 rtx target, rtx *ops, int n)
35831 {
35832 enum machine_mode cmode, hmode = VOIDmode;
35833 rtx first[8], second[4];
35834 rtvec v;
35835 int i, j;
35836
35837 switch (n)
35838 {
35839 case 2:
35840 switch (mode)
35841 {
35842 case V8SImode:
35843 cmode = V4SImode;
35844 break;
35845 case V8SFmode:
35846 cmode = V4SFmode;
35847 break;
35848 case V4DImode:
35849 cmode = V2DImode;
35850 break;
35851 case V4DFmode:
35852 cmode = V2DFmode;
35853 break;
35854 case V4SImode:
35855 cmode = V2SImode;
35856 break;
35857 case V4SFmode:
35858 cmode = V2SFmode;
35859 break;
35860 case V2DImode:
35861 cmode = DImode;
35862 break;
35863 case V2SImode:
35864 cmode = SImode;
35865 break;
35866 case V2DFmode:
35867 cmode = DFmode;
35868 break;
35869 case V2SFmode:
35870 cmode = SFmode;
35871 break;
35872 default:
35873 gcc_unreachable ();
35874 }
35875
35876 if (!register_operand (ops[1], cmode))
35877 ops[1] = force_reg (cmode, ops[1]);
35878 if (!register_operand (ops[0], cmode))
35879 ops[0] = force_reg (cmode, ops[0]);
35880 emit_insn (gen_rtx_SET (VOIDmode, target,
35881 gen_rtx_VEC_CONCAT (mode, ops[0],
35882 ops[1])));
35883 break;
35884
35885 case 4:
35886 switch (mode)
35887 {
35888 case V4DImode:
35889 cmode = V2DImode;
35890 break;
35891 case V4DFmode:
35892 cmode = V2DFmode;
35893 break;
35894 case V4SImode:
35895 cmode = V2SImode;
35896 break;
35897 case V4SFmode:
35898 cmode = V2SFmode;
35899 break;
35900 default:
35901 gcc_unreachable ();
35902 }
35903 goto half;
35904
35905 case 8:
35906 switch (mode)
35907 {
35908 case V8SImode:
35909 cmode = V2SImode;
35910 hmode = V4SImode;
35911 break;
35912 case V8SFmode:
35913 cmode = V2SFmode;
35914 hmode = V4SFmode;
35915 break;
35916 default:
35917 gcc_unreachable ();
35918 }
35919 goto half;
35920
35921 half:
35922 /* FIXME: We process inputs backward to help RA. PR 36222. */
35923 i = n - 1;
35924 j = (n >> 1) - 1;
35925 for (; i > 0; i -= 2, j--)
35926 {
35927 first[j] = gen_reg_rtx (cmode);
35928 v = gen_rtvec (2, ops[i - 1], ops[i]);
35929 ix86_expand_vector_init (false, first[j],
35930 gen_rtx_PARALLEL (cmode, v));
35931 }
35932
35933 n >>= 1;
35934 if (n > 2)
35935 {
35936 gcc_assert (hmode != VOIDmode);
35937 for (i = j = 0; i < n; i += 2, j++)
35938 {
35939 second[j] = gen_reg_rtx (hmode);
35940 ix86_expand_vector_init_concat (hmode, second [j],
35941 &first [i], 2);
35942 }
35943 n >>= 1;
35944 ix86_expand_vector_init_concat (mode, target, second, n);
35945 }
35946 else
35947 ix86_expand_vector_init_concat (mode, target, first, n);
35948 break;
35949
35950 default:
35951 gcc_unreachable ();
35952 }
35953 }
35954
35955 /* A subroutine of ix86_expand_vector_init_general. Use vector
35956 interleave to handle the most general case: all values variable,
35957 and none identical. */
35958
35959 static void
35960 ix86_expand_vector_init_interleave (enum machine_mode mode,
35961 rtx target, rtx *ops, int n)
35962 {
35963 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
35964 int i, j;
35965 rtx op0, op1;
35966 rtx (*gen_load_even) (rtx, rtx, rtx);
35967 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
35968 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
35969
35970 switch (mode)
35971 {
35972 case V8HImode:
35973 gen_load_even = gen_vec_setv8hi;
35974 gen_interleave_first_low = gen_vec_interleave_lowv4si;
35975 gen_interleave_second_low = gen_vec_interleave_lowv2di;
35976 inner_mode = HImode;
35977 first_imode = V4SImode;
35978 second_imode = V2DImode;
35979 third_imode = VOIDmode;
35980 break;
35981 case V16QImode:
35982 gen_load_even = gen_vec_setv16qi;
35983 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
35984 gen_interleave_second_low = gen_vec_interleave_lowv4si;
35985 inner_mode = QImode;
35986 first_imode = V8HImode;
35987 second_imode = V4SImode;
35988 third_imode = V2DImode;
35989 break;
35990 default:
35991 gcc_unreachable ();
35992 }
35993
35994 for (i = 0; i < n; i++)
35995 {
35996 /* Extend the odd element to SImode using a paradoxical SUBREG. */
35997 op0 = gen_reg_rtx (SImode);
35998 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
35999
36000 /* Insert the SImode value as low element of V4SImode vector. */
36001 op1 = gen_reg_rtx (V4SImode);
36002 op0 = gen_rtx_VEC_MERGE (V4SImode,
36003 gen_rtx_VEC_DUPLICATE (V4SImode,
36004 op0),
36005 CONST0_RTX (V4SImode),
36006 const1_rtx);
36007 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36008
36009 /* Cast the V4SImode vector back to a vector in the original mode. */
36010 op0 = gen_reg_rtx (mode);
36011 emit_move_insn (op0, gen_lowpart (mode, op1));
36012
36013 /* Load even elements into the second position. */
36014 emit_insn (gen_load_even (op0,
36015 force_reg (inner_mode,
36016 ops [i + i + 1]),
36017 const1_rtx));
36018
36019 /* Cast vector to FIRST_IMODE vector. */
36020 ops[i] = gen_reg_rtx (first_imode);
36021 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36022 }
36023
36024 /* Interleave low FIRST_IMODE vectors. */
36025 for (i = j = 0; i < n; i += 2, j++)
36026 {
36027 op0 = gen_reg_rtx (first_imode);
36028 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36029
36030 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36031 ops[j] = gen_reg_rtx (second_imode);
36032 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36033 }
36034
36035 /* Interleave low SECOND_IMODE vectors. */
36036 switch (second_imode)
36037 {
36038 case V4SImode:
36039 for (i = j = 0; i < n / 2; i += 2, j++)
36040 {
36041 op0 = gen_reg_rtx (second_imode);
36042 emit_insn (gen_interleave_second_low (op0, ops[i],
36043 ops[i + 1]));
36044
36045 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36046 vector. */
36047 ops[j] = gen_reg_rtx (third_imode);
36048 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36049 }
36050 second_imode = V2DImode;
36051 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36052 /* FALLTHRU */
36053
36054 case V2DImode:
36055 op0 = gen_reg_rtx (second_imode);
36056 emit_insn (gen_interleave_second_low (op0, ops[0],
36057 ops[1]));
36058
36059 /* Cast the SECOND_IMODE vector back to a vector in the original
36060 mode. */
36061 emit_insn (gen_rtx_SET (VOIDmode, target,
36062 gen_lowpart (mode, op0)));
36063 break;
36064
36065 default:
36066 gcc_unreachable ();
36067 }
36068 }
36069
36070 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36071 all values variable, and none identical. */
36072
36073 static void
36074 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36075 rtx target, rtx vals)
36076 {
36077 rtx ops[32], op0, op1;
36078 enum machine_mode half_mode = VOIDmode;
36079 int n, i;
36080
36081 switch (mode)
36082 {
36083 case V2SFmode:
36084 case V2SImode:
36085 if (!mmx_ok && !TARGET_SSE)
36086 break;
36087 /* FALLTHRU */
36088
36089 case V8SFmode:
36090 case V8SImode:
36091 case V4DFmode:
36092 case V4DImode:
36093 case V4SFmode:
36094 case V4SImode:
36095 case V2DFmode:
36096 case V2DImode:
36097 n = GET_MODE_NUNITS (mode);
36098 for (i = 0; i < n; i++)
36099 ops[i] = XVECEXP (vals, 0, i);
36100 ix86_expand_vector_init_concat (mode, target, ops, n);
36101 return;
36102
36103 case V32QImode:
36104 half_mode = V16QImode;
36105 goto half;
36106
36107 case V16HImode:
36108 half_mode = V8HImode;
36109 goto half;
36110
36111 half:
36112 n = GET_MODE_NUNITS (mode);
36113 for (i = 0; i < n; i++)
36114 ops[i] = XVECEXP (vals, 0, i);
36115 op0 = gen_reg_rtx (half_mode);
36116 op1 = gen_reg_rtx (half_mode);
36117 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36118 n >> 2);
36119 ix86_expand_vector_init_interleave (half_mode, op1,
36120 &ops [n >> 1], n >> 2);
36121 emit_insn (gen_rtx_SET (VOIDmode, target,
36122 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36123 return;
36124
36125 case V16QImode:
36126 if (!TARGET_SSE4_1)
36127 break;
36128 /* FALLTHRU */
36129
36130 case V8HImode:
36131 if (!TARGET_SSE2)
36132 break;
36133
36134 /* Don't use ix86_expand_vector_init_interleave if we can't
36135 move from GPR to SSE register directly. */
36136 if (!TARGET_INTER_UNIT_MOVES)
36137 break;
36138
36139 n = GET_MODE_NUNITS (mode);
36140 for (i = 0; i < n; i++)
36141 ops[i] = XVECEXP (vals, 0, i);
36142 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36143 return;
36144
36145 case V4HImode:
36146 case V8QImode:
36147 break;
36148
36149 default:
36150 gcc_unreachable ();
36151 }
36152
36153 {
36154 int i, j, n_elts, n_words, n_elt_per_word;
36155 enum machine_mode inner_mode;
36156 rtx words[4], shift;
36157
36158 inner_mode = GET_MODE_INNER (mode);
36159 n_elts = GET_MODE_NUNITS (mode);
36160 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36161 n_elt_per_word = n_elts / n_words;
36162 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36163
36164 for (i = 0; i < n_words; ++i)
36165 {
36166 rtx word = NULL_RTX;
36167
36168 for (j = 0; j < n_elt_per_word; ++j)
36169 {
36170 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36171 elt = convert_modes (word_mode, inner_mode, elt, true);
36172
36173 if (j == 0)
36174 word = elt;
36175 else
36176 {
36177 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36178 word, 1, OPTAB_LIB_WIDEN);
36179 word = expand_simple_binop (word_mode, IOR, word, elt,
36180 word, 1, OPTAB_LIB_WIDEN);
36181 }
36182 }
36183
36184 words[i] = word;
36185 }
36186
36187 if (n_words == 1)
36188 emit_move_insn (target, gen_lowpart (mode, words[0]));
36189 else if (n_words == 2)
36190 {
36191 rtx tmp = gen_reg_rtx (mode);
36192 emit_clobber (tmp);
36193 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36194 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36195 emit_move_insn (target, tmp);
36196 }
36197 else if (n_words == 4)
36198 {
36199 rtx tmp = gen_reg_rtx (V4SImode);
36200 gcc_assert (word_mode == SImode);
36201 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36202 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36203 emit_move_insn (target, gen_lowpart (mode, tmp));
36204 }
36205 else
36206 gcc_unreachable ();
36207 }
36208 }
36209
36210 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36211 instructions unless MMX_OK is true. */
36212
36213 void
36214 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36215 {
36216 enum machine_mode mode = GET_MODE (target);
36217 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36218 int n_elts = GET_MODE_NUNITS (mode);
36219 int n_var = 0, one_var = -1;
36220 bool all_same = true, all_const_zero = true;
36221 int i;
36222 rtx x;
36223
36224 for (i = 0; i < n_elts; ++i)
36225 {
36226 x = XVECEXP (vals, 0, i);
36227 if (!(CONST_INT_P (x)
36228 || GET_CODE (x) == CONST_DOUBLE
36229 || GET_CODE (x) == CONST_FIXED))
36230 n_var++, one_var = i;
36231 else if (x != CONST0_RTX (inner_mode))
36232 all_const_zero = false;
36233 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36234 all_same = false;
36235 }
36236
36237 /* Constants are best loaded from the constant pool. */
36238 if (n_var == 0)
36239 {
36240 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36241 return;
36242 }
36243
36244 /* If all values are identical, broadcast the value. */
36245 if (all_same
36246 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36247 XVECEXP (vals, 0, 0)))
36248 return;
36249
36250 /* Values where only one field is non-constant are best loaded from
36251 the pool and overwritten via move later. */
36252 if (n_var == 1)
36253 {
36254 if (all_const_zero
36255 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36256 XVECEXP (vals, 0, one_var),
36257 one_var))
36258 return;
36259
36260 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36261 return;
36262 }
36263
36264 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36265 }
36266
36267 void
36268 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36269 {
36270 enum machine_mode mode = GET_MODE (target);
36271 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36272 enum machine_mode half_mode;
36273 bool use_vec_merge = false;
36274 rtx tmp;
36275 static rtx (*gen_extract[6][2]) (rtx, rtx)
36276 = {
36277 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36278 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36279 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36280 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36281 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36282 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36283 };
36284 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36285 = {
36286 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36287 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36288 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36289 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36290 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36291 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36292 };
36293 int i, j, n;
36294
36295 switch (mode)
36296 {
36297 case V2SFmode:
36298 case V2SImode:
36299 if (mmx_ok)
36300 {
36301 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36302 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36303 if (elt == 0)
36304 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36305 else
36306 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36307 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36308 return;
36309 }
36310 break;
36311
36312 case V2DImode:
36313 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36314 if (use_vec_merge)
36315 break;
36316
36317 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36318 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36319 if (elt == 0)
36320 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36321 else
36322 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36323 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36324 return;
36325
36326 case V2DFmode:
36327 {
36328 rtx op0, op1;
36329
36330 /* For the two element vectors, we implement a VEC_CONCAT with
36331 the extraction of the other element. */
36332
36333 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36334 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36335
36336 if (elt == 0)
36337 op0 = val, op1 = tmp;
36338 else
36339 op0 = tmp, op1 = val;
36340
36341 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36342 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36343 }
36344 return;
36345
36346 case V4SFmode:
36347 use_vec_merge = TARGET_SSE4_1;
36348 if (use_vec_merge)
36349 break;
36350
36351 switch (elt)
36352 {
36353 case 0:
36354 use_vec_merge = true;
36355 break;
36356
36357 case 1:
36358 /* tmp = target = A B C D */
36359 tmp = copy_to_reg (target);
36360 /* target = A A B B */
36361 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36362 /* target = X A B B */
36363 ix86_expand_vector_set (false, target, val, 0);
36364 /* target = A X C D */
36365 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36366 const1_rtx, const0_rtx,
36367 GEN_INT (2+4), GEN_INT (3+4)));
36368 return;
36369
36370 case 2:
36371 /* tmp = target = A B C D */
36372 tmp = copy_to_reg (target);
36373 /* tmp = X B C D */
36374 ix86_expand_vector_set (false, tmp, val, 0);
36375 /* target = A B X D */
36376 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36377 const0_rtx, const1_rtx,
36378 GEN_INT (0+4), GEN_INT (3+4)));
36379 return;
36380
36381 case 3:
36382 /* tmp = target = A B C D */
36383 tmp = copy_to_reg (target);
36384 /* tmp = X B C D */
36385 ix86_expand_vector_set (false, tmp, val, 0);
36386 /* target = A B X D */
36387 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36388 const0_rtx, const1_rtx,
36389 GEN_INT (2+4), GEN_INT (0+4)));
36390 return;
36391
36392 default:
36393 gcc_unreachable ();
36394 }
36395 break;
36396
36397 case V4SImode:
36398 use_vec_merge = TARGET_SSE4_1;
36399 if (use_vec_merge)
36400 break;
36401
36402 /* Element 0 handled by vec_merge below. */
36403 if (elt == 0)
36404 {
36405 use_vec_merge = true;
36406 break;
36407 }
36408
36409 if (TARGET_SSE2)
36410 {
36411 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36412 store into element 0, then shuffle them back. */
36413
36414 rtx order[4];
36415
36416 order[0] = GEN_INT (elt);
36417 order[1] = const1_rtx;
36418 order[2] = const2_rtx;
36419 order[3] = GEN_INT (3);
36420 order[elt] = const0_rtx;
36421
36422 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36423 order[1], order[2], order[3]));
36424
36425 ix86_expand_vector_set (false, target, val, 0);
36426
36427 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36428 order[1], order[2], order[3]));
36429 }
36430 else
36431 {
36432 /* For SSE1, we have to reuse the V4SF code. */
36433 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36434 gen_lowpart (SFmode, val), elt);
36435 }
36436 return;
36437
36438 case V8HImode:
36439 use_vec_merge = TARGET_SSE2;
36440 break;
36441 case V4HImode:
36442 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36443 break;
36444
36445 case V16QImode:
36446 use_vec_merge = TARGET_SSE4_1;
36447 break;
36448
36449 case V8QImode:
36450 break;
36451
36452 case V32QImode:
36453 half_mode = V16QImode;
36454 j = 0;
36455 n = 16;
36456 goto half;
36457
36458 case V16HImode:
36459 half_mode = V8HImode;
36460 j = 1;
36461 n = 8;
36462 goto half;
36463
36464 case V8SImode:
36465 half_mode = V4SImode;
36466 j = 2;
36467 n = 4;
36468 goto half;
36469
36470 case V4DImode:
36471 half_mode = V2DImode;
36472 j = 3;
36473 n = 2;
36474 goto half;
36475
36476 case V8SFmode:
36477 half_mode = V4SFmode;
36478 j = 4;
36479 n = 4;
36480 goto half;
36481
36482 case V4DFmode:
36483 half_mode = V2DFmode;
36484 j = 5;
36485 n = 2;
36486 goto half;
36487
36488 half:
36489 /* Compute offset. */
36490 i = elt / n;
36491 elt %= n;
36492
36493 gcc_assert (i <= 1);
36494
36495 /* Extract the half. */
36496 tmp = gen_reg_rtx (half_mode);
36497 emit_insn (gen_extract[j][i] (tmp, target));
36498
36499 /* Put val in tmp at elt. */
36500 ix86_expand_vector_set (false, tmp, val, elt);
36501
36502 /* Put it back. */
36503 emit_insn (gen_insert[j][i] (target, target, tmp));
36504 return;
36505
36506 default:
36507 break;
36508 }
36509
36510 if (use_vec_merge)
36511 {
36512 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36513 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36514 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36515 }
36516 else
36517 {
36518 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36519
36520 emit_move_insn (mem, target);
36521
36522 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36523 emit_move_insn (tmp, val);
36524
36525 emit_move_insn (target, mem);
36526 }
36527 }
36528
36529 void
36530 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36531 {
36532 enum machine_mode mode = GET_MODE (vec);
36533 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36534 bool use_vec_extr = false;
36535 rtx tmp;
36536
36537 switch (mode)
36538 {
36539 case V2SImode:
36540 case V2SFmode:
36541 if (!mmx_ok)
36542 break;
36543 /* FALLTHRU */
36544
36545 case V2DFmode:
36546 case V2DImode:
36547 use_vec_extr = true;
36548 break;
36549
36550 case V4SFmode:
36551 use_vec_extr = TARGET_SSE4_1;
36552 if (use_vec_extr)
36553 break;
36554
36555 switch (elt)
36556 {
36557 case 0:
36558 tmp = vec;
36559 break;
36560
36561 case 1:
36562 case 3:
36563 tmp = gen_reg_rtx (mode);
36564 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36565 GEN_INT (elt), GEN_INT (elt),
36566 GEN_INT (elt+4), GEN_INT (elt+4)));
36567 break;
36568
36569 case 2:
36570 tmp = gen_reg_rtx (mode);
36571 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36572 break;
36573
36574 default:
36575 gcc_unreachable ();
36576 }
36577 vec = tmp;
36578 use_vec_extr = true;
36579 elt = 0;
36580 break;
36581
36582 case V4SImode:
36583 use_vec_extr = TARGET_SSE4_1;
36584 if (use_vec_extr)
36585 break;
36586
36587 if (TARGET_SSE2)
36588 {
36589 switch (elt)
36590 {
36591 case 0:
36592 tmp = vec;
36593 break;
36594
36595 case 1:
36596 case 3:
36597 tmp = gen_reg_rtx (mode);
36598 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36599 GEN_INT (elt), GEN_INT (elt),
36600 GEN_INT (elt), GEN_INT (elt)));
36601 break;
36602
36603 case 2:
36604 tmp = gen_reg_rtx (mode);
36605 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36606 break;
36607
36608 default:
36609 gcc_unreachable ();
36610 }
36611 vec = tmp;
36612 use_vec_extr = true;
36613 elt = 0;
36614 }
36615 else
36616 {
36617 /* For SSE1, we have to reuse the V4SF code. */
36618 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36619 gen_lowpart (V4SFmode, vec), elt);
36620 return;
36621 }
36622 break;
36623
36624 case V8HImode:
36625 use_vec_extr = TARGET_SSE2;
36626 break;
36627 case V4HImode:
36628 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36629 break;
36630
36631 case V16QImode:
36632 use_vec_extr = TARGET_SSE4_1;
36633 break;
36634
36635 case V8SFmode:
36636 if (TARGET_AVX)
36637 {
36638 tmp = gen_reg_rtx (V4SFmode);
36639 if (elt < 4)
36640 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36641 else
36642 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36643 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36644 return;
36645 }
36646 break;
36647
36648 case V4DFmode:
36649 if (TARGET_AVX)
36650 {
36651 tmp = gen_reg_rtx (V2DFmode);
36652 if (elt < 2)
36653 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36654 else
36655 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36656 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36657 return;
36658 }
36659 break;
36660
36661 case V32QImode:
36662 if (TARGET_AVX)
36663 {
36664 tmp = gen_reg_rtx (V16QImode);
36665 if (elt < 16)
36666 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36667 else
36668 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36669 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36670 return;
36671 }
36672 break;
36673
36674 case V16HImode:
36675 if (TARGET_AVX)
36676 {
36677 tmp = gen_reg_rtx (V8HImode);
36678 if (elt < 8)
36679 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36680 else
36681 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36682 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36683 return;
36684 }
36685 break;
36686
36687 case V8SImode:
36688 if (TARGET_AVX)
36689 {
36690 tmp = gen_reg_rtx (V4SImode);
36691 if (elt < 4)
36692 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
36693 else
36694 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
36695 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36696 return;
36697 }
36698 break;
36699
36700 case V4DImode:
36701 if (TARGET_AVX)
36702 {
36703 tmp = gen_reg_rtx (V2DImode);
36704 if (elt < 2)
36705 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
36706 else
36707 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
36708 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36709 return;
36710 }
36711 break;
36712
36713 case V8QImode:
36714 /* ??? Could extract the appropriate HImode element and shift. */
36715 default:
36716 break;
36717 }
36718
36719 if (use_vec_extr)
36720 {
36721 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
36722 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
36723
36724 /* Let the rtl optimizers know about the zero extension performed. */
36725 if (inner_mode == QImode || inner_mode == HImode)
36726 {
36727 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
36728 target = gen_lowpart (SImode, target);
36729 }
36730
36731 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36732 }
36733 else
36734 {
36735 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36736
36737 emit_move_insn (mem, vec);
36738
36739 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36740 emit_move_insn (target, tmp);
36741 }
36742 }
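/* Illustrative scalar model of the fallback path above when no suitable
   vec_select pattern is available (types and names here are hypothetical):

	union { vector_t v; element_t e[NELTS]; } u;
	u.v = vec;
	result = u.e[elt];

   i.e. spill the whole vector to a stack temporary and reload just the
   requested element, exactly as the assign_stack_temp branch does.  */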
36743
36744 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
36745 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
36746 The upper bits of DEST are undefined, though they shouldn't cause
36747 exceptions (some bits from src or all zeros are ok). */
36748
36749 static void
36750 emit_reduc_half (rtx dest, rtx src, int i)
36751 {
36752 rtx tem;
36753 switch (GET_MODE (src))
36754 {
36755 case V4SFmode:
36756 if (i == 128)
36757 tem = gen_sse_movhlps (dest, src, src);
36758 else
36759 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
36760 GEN_INT (1 + 4), GEN_INT (1 + 4));
36761 break;
36762 case V2DFmode:
36763 tem = gen_vec_interleave_highv2df (dest, src, src);
36764 break;
36765 case V16QImode:
36766 case V8HImode:
36767 case V4SImode:
36768 case V2DImode:
36769 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
36770 gen_lowpart (V1TImode, src),
36771 GEN_INT (i / 2));
36772 break;
36773 case V8SFmode:
36774 if (i == 256)
36775 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
36776 else
36777 tem = gen_avx_shufps256 (dest, src, src,
36778 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
36779 break;
36780 case V4DFmode:
36781 if (i == 256)
36782 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
36783 else
36784 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
36785 break;
36786 case V32QImode:
36787 case V16HImode:
36788 case V8SImode:
36789 case V4DImode:
36790 if (i == 256)
36791 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
36792 gen_lowpart (V4DImode, src),
36793 gen_lowpart (V4DImode, src),
36794 const1_rtx);
36795 else
36796 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
36797 gen_lowpart (V2TImode, src),
36798 GEN_INT (i / 2));
36799 break;
36800 default:
36801 gcc_unreachable ();
36802 }
36803 emit_insn (tem);
36804 }
36805
36806 /* Expand a vector reduction. FN is the binary pattern to reduce;
36807 DEST is the destination; IN is the input vector. */
36808
36809 void
36810 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
36811 {
36812 rtx half, dst, vec = in;
36813 enum machine_mode mode = GET_MODE (in);
36814 int i;
36815
36816 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
36817 if (TARGET_SSE4_1
36818 && mode == V8HImode
36819 && fn == gen_uminv8hi3)
36820 {
36821 emit_insn (gen_sse4_1_phminposuw (dest, in));
36822 return;
36823 }
36824
36825 for (i = GET_MODE_BITSIZE (mode);
36826 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
36827 i >>= 1)
36828 {
36829 half = gen_reg_rtx (mode);
36830 emit_reduc_half (half, vec, i);
36831 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
36832 dst = dest;
36833 else
36834 dst = gen_reg_rtx (mode);
36835 emit_insn (fn (dst, half, vec));
36836 vec = dst;
36837 }
36838 }
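/* A scalar model of the reduction loop above (illustrative sketch only;
   FN is the binary operation whose pattern generator is passed in, and
   high_half_moved_down is a hypothetical stand-in for emit_reduc_half):

	for (bits = GET_MODE_BITSIZE (mode);
	     bits > GET_MODE_BITSIZE (inner_mode); bits >>= 1)
	  vec = FN (vec, high_half_moved_down (vec, bits / 2));

   Each emit_reduc_half call brings the upper half of the still-live
   elements down into the low positions, so every application of FN halves
   the number of lanes that matter; after the last iteration the caller
   reads the scalar result from element 0 of DEST.  */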
36839 \f
36840 /* Target hook for scalar_mode_supported_p. */
36841 static bool
36842 ix86_scalar_mode_supported_p (enum machine_mode mode)
36843 {
36844 if (DECIMAL_FLOAT_MODE_P (mode))
36845 return default_decimal_float_supported_p ();
36846 else if (mode == TFmode)
36847 return true;
36848 else
36849 return default_scalar_mode_supported_p (mode);
36850 }
36851
36852 /* Implements target hook vector_mode_supported_p. */
36853 static bool
36854 ix86_vector_mode_supported_p (enum machine_mode mode)
36855 {
36856 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
36857 return true;
36858 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
36859 return true;
36860 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
36861 return true;
36862 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
36863 return true;
36864 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
36865 return true;
36866 return false;
36867 }
36868
36869 /* Target hook for c_mode_for_suffix. */
36870 static enum machine_mode
36871 ix86_c_mode_for_suffix (char suffix)
36872 {
36873 if (suffix == 'q')
36874 return TFmode;
36875 if (suffix == 'w')
36876 return XFmode;
36877
36878 return VOIDmode;
36879 }
36880
36881 /* Worker function for TARGET_MD_ASM_CLOBBERS.
36882
36883 We do this in the new i386 backend to maintain source compatibility
36884 with the old cc0-based compiler. */
36885
36886 static tree
36887 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
36888 tree inputs ATTRIBUTE_UNUSED,
36889 tree clobbers)
36890 {
36891 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
36892 clobbers);
36893 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
36894 clobbers);
36895 return clobbers;
36896 }
36897
36898 /* Implements the target hook targetm.asm.encode_section_info. */
36899
36900 static void ATTRIBUTE_UNUSED
36901 ix86_encode_section_info (tree decl, rtx rtl, int first)
36902 {
36903 default_encode_section_info (decl, rtl, first);
36904
36905 if (TREE_CODE (decl) == VAR_DECL
36906 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
36907 && ix86_in_large_data_p (decl))
36908 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
36909 }
36910
36911 /* Worker function for REVERSE_CONDITION. */
36912
36913 enum rtx_code
36914 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
36915 {
36916 return (mode != CCFPmode && mode != CCFPUmode
36917 ? reverse_condition (code)
36918 : reverse_condition_maybe_unordered (code));
36919 }
36920
36921 /* Output code to perform an x87 FP register move, from OPERANDS[1]
36922 to OPERANDS[0]. */
36923
36924 const char *
36925 output_387_reg_move (rtx insn, rtx *operands)
36926 {
36927 if (REG_P (operands[0]))
36928 {
36929 if (REG_P (operands[1])
36930 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36931 {
36932 if (REGNO (operands[0]) == FIRST_STACK_REG)
36933 return output_387_ffreep (operands, 0);
36934 return "fstp\t%y0";
36935 }
36936 if (STACK_TOP_P (operands[0]))
36937 return "fld%Z1\t%y1";
36938 return "fst\t%y0";
36939 }
36940 else if (MEM_P (operands[0]))
36941 {
36942 gcc_assert (REG_P (operands[1]));
36943 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
36944 return "fstp%Z0\t%y0";
36945 else
36946 {
36947 /* There is no non-popping store to memory for XFmode.
36948 So if we need one, follow the store with a load. */
36949 if (GET_MODE (operands[0]) == XFmode)
36950 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
36951 else
36952 return "fst%Z0\t%y0";
36953 }
36954 }
36955 else
36956 gcc_unreachable ();
36957 }
36958
36959 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
36960 the FP status register is set. */
36961
36962 void
36963 ix86_emit_fp_unordered_jump (rtx label)
36964 {
36965 rtx reg = gen_reg_rtx (HImode);
36966 rtx temp;
36967
36968 emit_insn (gen_x86_fnstsw_1 (reg));
36969
36970 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
36971 {
36972 emit_insn (gen_x86_sahf_1 (reg));
36973
36974 temp = gen_rtx_REG (CCmode, FLAGS_REG);
36975 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
36976 }
36977 else
36978 {
36979 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
36980
36981 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
36982 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
36983 }
36984
36985 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
36986 gen_rtx_LABEL_REF (VOIDmode, label),
36987 pc_rtx);
36988 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
36989
36990 emit_jump_insn (temp);
36991 predict_jump (REG_BR_PROB_BASE * 10 / 100);
36992 }
36993
36994 /* Output code to perform a log1p XFmode calculation. */
36995
36996 void ix86_emit_i387_log1p (rtx op0, rtx op1)
36997 {
36998 rtx label1 = gen_label_rtx ();
36999 rtx label2 = gen_label_rtx ();
37000
37001 rtx tmp = gen_reg_rtx (XFmode);
37002 rtx tmp2 = gen_reg_rtx (XFmode);
37003 rtx test;
37004
37005 emit_insn (gen_absxf2 (tmp, op1));
37006 test = gen_rtx_GE (VOIDmode, tmp,
37007 CONST_DOUBLE_FROM_REAL_VALUE (
37008 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37009 XFmode));
37010 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37011
37012 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37013 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37014 emit_jump (label2);
37015
37016 emit_label (label1);
37017 emit_move_insn (tmp, CONST1_RTX (XFmode));
37018 emit_insn (gen_addxf3 (tmp, op1, tmp));
37019 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37020 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37021
37022 emit_label (label2);
37023 }
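/* Scalar model of the expansion above (sketch only; the threshold constant
   is 1 - sqrt(2)/2, the documented argument bound of fyl2xp1):

	if (fabs (x) >= 0.29289321881345247...)
	  result = log (1.0 + x);		/* fldln2; fyl2x  */
	else
	  result = log1p (x);			/* fldln2; fyl2xp1  */

   The fyl2xp1 path avoids the cancellation in computing 1.0 + x for small
   x; larger arguments fall back to the explicit addition.  */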
37024
37025 /* Emit i387 code to round OP1 to the nearest integer, halfway cases away from zero, storing the result in OP0. */
37026 void ix86_emit_i387_round (rtx op0, rtx op1)
37027 {
37028 enum machine_mode inmode = GET_MODE (op1);
37029 enum machine_mode outmode = GET_MODE (op0);
37030 rtx e1, e2, res, tmp, tmp1, half;
37031 rtx scratch = gen_reg_rtx (HImode);
37032 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37033 rtx jump_label = gen_label_rtx ();
37034 rtx insn;
37035 rtx (*gen_abs) (rtx, rtx);
37036 rtx (*gen_neg) (rtx, rtx);
37037
37038 switch (inmode)
37039 {
37040 case SFmode:
37041 gen_abs = gen_abssf2;
37042 break;
37043 case DFmode:
37044 gen_abs = gen_absdf2;
37045 break;
37046 case XFmode:
37047 gen_abs = gen_absxf2;
37048 break;
37049 default:
37050 gcc_unreachable ();
37051 }
37052
37053 switch (outmode)
37054 {
37055 case SFmode:
37056 gen_neg = gen_negsf2;
37057 break;
37058 case DFmode:
37059 gen_neg = gen_negdf2;
37060 break;
37061 case XFmode:
37062 gen_neg = gen_negxf2;
37063 break;
37064 case HImode:
37065 gen_neg = gen_neghi2;
37066 break;
37067 case SImode:
37068 gen_neg = gen_negsi2;
37069 break;
37070 case DImode:
37071 gen_neg = gen_negdi2;
37072 break;
37073 default:
37074 gcc_unreachable ();
37075 }
37076
37077 e1 = gen_reg_rtx (inmode);
37078 e2 = gen_reg_rtx (inmode);
37079 res = gen_reg_rtx (outmode);
37080
37081 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37082
37083 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
37084
37085 /* scratch = fxam(op1) */
37086 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37087 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37088 UNSPEC_FXAM)));
37089 /* e1 = fabs(op1) */
37090 emit_insn (gen_abs (e1, op1));
37091
37092 /* e2 = e1 + 0.5 */
37093 half = force_reg (inmode, half);
37094 emit_insn (gen_rtx_SET (VOIDmode, e2,
37095 gen_rtx_PLUS (inmode, e1, half)));
37096
37097 /* res = floor(e2) */
37098 if (inmode != XFmode)
37099 {
37100 tmp1 = gen_reg_rtx (XFmode);
37101
37102 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37103 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37104 }
37105 else
37106 tmp1 = e2;
37107
37108 switch (outmode)
37109 {
37110 case SFmode:
37111 case DFmode:
37112 {
37113 rtx tmp0 = gen_reg_rtx (XFmode);
37114
37115 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37116
37117 emit_insn (gen_rtx_SET (VOIDmode, res,
37118 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37119 UNSPEC_TRUNC_NOOP)));
37120 }
37121 break;
37122 case XFmode:
37123 emit_insn (gen_frndintxf2_floor (res, tmp1));
37124 break;
37125 case HImode:
37126 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37127 break;
37128 case SImode:
37129 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37130 break;
37131 case DImode:
37132 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37133 break;
37134 default:
37135 gcc_unreachable ();
37136 }
37137
37138 /* flags = signbit(a) */
37139 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37140
37141 /* if (flags) then res = -res */
37142 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37143 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37144 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37145 pc_rtx);
37146 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37147 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37148 JUMP_LABEL (insn) = jump_label;
37149
37150 emit_insn (gen_neg (res, res));
37151
37152 emit_label (jump_label);
37153 LABEL_NUSES (jump_label) = 1;
37154
37155 emit_move_insn (op0, res);
37156 }
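/* Scalar model of the expansion above (sketch only):

	sign = signbit (x);			/* fxam  */
	r = floor (fabs (x) + 0.5);		/* frndint, rounding forced down  */
	if (sign)
	  r = -r;

   i.e. round to nearest with halfway cases away from zero, made independent
   of the current x87 rounding mode by the explicit floor step.  */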
37157
37158 /* Output code to perform a Newton-Raphson approximation of a single precision
37159 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37160
37161 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37162 {
37163 rtx x0, x1, e0, e1;
37164
37165 x0 = gen_reg_rtx (mode);
37166 e0 = gen_reg_rtx (mode);
37167 e1 = gen_reg_rtx (mode);
37168 x1 = gen_reg_rtx (mode);
37169
37170 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
37171
37172 b = force_reg (mode, b);
37173
37174 /* x0 = rcp(b) estimate */
37175 emit_insn (gen_rtx_SET (VOIDmode, x0,
37176 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37177 UNSPEC_RCP)));
37178 /* e0 = x0 * b */
37179 emit_insn (gen_rtx_SET (VOIDmode, e0,
37180 gen_rtx_MULT (mode, x0, b)));
37181
37182 /* e0 = x0 * e0 */
37183 emit_insn (gen_rtx_SET (VOIDmode, e0,
37184 gen_rtx_MULT (mode, x0, e0)));
37185
37186 /* e1 = x0 + x0 */
37187 emit_insn (gen_rtx_SET (VOIDmode, e1,
37188 gen_rtx_PLUS (mode, x0, x0)));
37189
37190 /* x1 = e1 - e0 */
37191 emit_insn (gen_rtx_SET (VOIDmode, x1,
37192 gen_rtx_MINUS (mode, e1, e0)));
37193
37194 /* res = a * x1 */
37195 emit_insn (gen_rtx_SET (VOIDmode, res,
37196 gen_rtx_MULT (mode, a, x1)));
37197 }
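/* The sequence above performs one Newton-Raphson step for the reciprocal,
   starting from the rcpss/rcpps hardware estimate x0 ~= 1/b:

	x1 = x0 * (2 - b * x0) = (x0 + x0) - (b * x0) * x0

   which roughly doubles the number of correct bits of the estimate.
   Scalar sketch (rcp_estimate is a hypothetical stand-in for the insn):

	float x0 = rcp_estimate (b);
	float x1 = (x0 + x0) - (b * x0) * x0;
	return a * x1;
*/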
37198
37199 /* Output code to perform a Newton-Raphson approximation of a
37200 single precision floating point [reciprocal] square root. */
37201
37202 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37203 bool recip)
37204 {
37205 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37206 REAL_VALUE_TYPE r;
37207
37208 x0 = gen_reg_rtx (mode);
37209 e0 = gen_reg_rtx (mode);
37210 e1 = gen_reg_rtx (mode);
37211 e2 = gen_reg_rtx (mode);
37212 e3 = gen_reg_rtx (mode);
37213
37214 real_from_integer (&r, VOIDmode, -3, -1, 0);
37215 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37216
37217 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37218 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37219
37220 if (VECTOR_MODE_P (mode))
37221 {
37222 mthree = ix86_build_const_vector (mode, true, mthree);
37223 mhalf = ix86_build_const_vector (mode, true, mhalf);
37224 }
37225
37226 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37227 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
37228
37229 a = force_reg (mode, a);
37230
37231 /* x0 = rsqrt(a) estimate */
37232 emit_insn (gen_rtx_SET (VOIDmode, x0,
37233 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37234 UNSPEC_RSQRT)));
37235
37236 /* If a == 0.0, mask out the infinite rsqrt estimate to prevent NaN for sqrt(0.0). */
37237 if (!recip)
37238 {
37239 rtx zero, mask;
37240
37241 zero = gen_reg_rtx (mode);
37242 mask = gen_reg_rtx (mode);
37243
37244 zero = force_reg (mode, CONST0_RTX(mode));
37245 emit_insn (gen_rtx_SET (VOIDmode, mask,
37246 gen_rtx_NE (mode, zero, a)));
37247
37248 emit_insn (gen_rtx_SET (VOIDmode, x0,
37249 gen_rtx_AND (mode, x0, mask)));
37250 }
37251
37252 /* e0 = x0 * a */
37253 emit_insn (gen_rtx_SET (VOIDmode, e0,
37254 gen_rtx_MULT (mode, x0, a)));
37255 /* e1 = e0 * x0 */
37256 emit_insn (gen_rtx_SET (VOIDmode, e1,
37257 gen_rtx_MULT (mode, e0, x0)));
37258
37259 /* e2 = e1 - 3. */
37260 mthree = force_reg (mode, mthree);
37261 emit_insn (gen_rtx_SET (VOIDmode, e2,
37262 gen_rtx_PLUS (mode, e1, mthree)));
37263
37264 mhalf = force_reg (mode, mhalf);
37265 if (recip)
37266 /* e3 = -.5 * x0 */
37267 emit_insn (gen_rtx_SET (VOIDmode, e3,
37268 gen_rtx_MULT (mode, x0, mhalf)));
37269 else
37270 /* e3 = -.5 * e0 */
37271 emit_insn (gen_rtx_SET (VOIDmode, e3,
37272 gen_rtx_MULT (mode, e0, mhalf)));
37273 /* ret = e2 * e3 */
37274 emit_insn (gen_rtx_SET (VOIDmode, res,
37275 gen_rtx_MULT (mode, e2, e3)));
37276 }
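/* The sequence above performs one Newton-Raphson step for 1/sqrt(a),
   starting from the rsqrtss/rsqrtps hardware estimate x0 ~= 1/sqrt(a):

	x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3)

   For the non-reciprocal case the final product uses a * x0 in place of
   x0, since sqrt(a) = a * (1/sqrt(a)).  Scalar sketch (rsqrt_estimate is a
   hypothetical stand-in for the insn):

	float x0 = rsqrt_estimate (a);
	float t = a * x0 * x0 - 3.0f;
	return (recip ? x0 : a * x0) * -0.5f * t;
*/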
37277
37278 #ifdef TARGET_SOLARIS
37279 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37280
37281 static void
37282 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37283 tree decl)
37284 {
37285 /* With Binutils 2.15, the "@unwind" marker must be specified on
37286 every occurrence of the ".eh_frame" section, not just the first
37287 one. */
37288 if (TARGET_64BIT
37289 && strcmp (name, ".eh_frame") == 0)
37290 {
37291 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37292 flags & SECTION_WRITE ? "aw" : "a");
37293 return;
37294 }
37295
37296 #ifndef USE_GAS
37297 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37298 {
37299 solaris_elf_asm_comdat_section (name, flags, decl);
37300 return;
37301 }
37302 #endif
37303
37304 default_elf_asm_named_section (name, flags, decl);
37305 }
37306 #endif /* TARGET_SOLARIS */
37307
37308 /* Return the mangling of TYPE if it is an extended fundamental type. */
37309
37310 static const char *
37311 ix86_mangle_type (const_tree type)
37312 {
37313 type = TYPE_MAIN_VARIANT (type);
37314
37315 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37316 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37317 return NULL;
37318
37319 switch (TYPE_MODE (type))
37320 {
37321 case TFmode:
37322 /* __float128 is "g". */
37323 return "g";
37324 case XFmode:
37325 /* "long double" or __float80 is "e". */
37326 return "e";
37327 default:
37328 return NULL;
37329 }
37330 }
37331
37332 /* For 32-bit code we can save PIC register setup by using the
37333 __stack_chk_fail_local hidden function instead of calling
37334 __stack_chk_fail directly.  64-bit code doesn't need to set up any PIC
37335 register, so it is better to call __stack_chk_fail directly. */
37336
37337 static tree ATTRIBUTE_UNUSED
37338 ix86_stack_protect_fail (void)
37339 {
37340 return TARGET_64BIT
37341 ? default_external_stack_protect_fail ()
37342 : default_hidden_stack_protect_fail ();
37343 }
37344
37345 /* Select a format to encode pointers in exception handling data. CODE
37346 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37347 true if the symbol may be affected by dynamic relocations.
37348
37349 ??? All x86 object file formats are capable of representing this.
37350 After all, the relocation needed is the same as for the call insn.
37351 Whether or not a particular assembler allows us to enter such, I
37352 guess we'll have to see. */
37353 int
37354 asm_preferred_eh_data_format (int code, int global)
37355 {
37356 if (flag_pic)
37357 {
37358 int type = DW_EH_PE_sdata8;
37359 if (!TARGET_64BIT
37360 || ix86_cmodel == CM_SMALL_PIC
37361 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37362 type = DW_EH_PE_sdata4;
37363 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37364 }
37365 if (ix86_cmodel == CM_SMALL
37366 || (ix86_cmodel == CM_MEDIUM && code))
37367 return DW_EH_PE_udata4;
37368 return DW_EH_PE_absptr;
37369 }
37370 \f
37371 /* Expand copysign from SIGN to the positive value ABS_VALUE,
37372 storing the result in RESULT.  If MASK is non-null, it is a mask that
37373 masks out the sign bit. */
37374 static void
37375 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37376 {
37377 enum machine_mode mode = GET_MODE (sign);
37378 rtx sgn = gen_reg_rtx (mode);
37379 if (mask == NULL_RTX)
37380 {
37381 enum machine_mode vmode;
37382
37383 if (mode == SFmode)
37384 vmode = V4SFmode;
37385 else if (mode == DFmode)
37386 vmode = V2DFmode;
37387 else
37388 vmode = mode;
37389
37390 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37391 if (!VECTOR_MODE_P (mode))
37392 {
37393 /* We need to generate a scalar mode mask in this case. */
37394 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37395 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37396 mask = gen_reg_rtx (mode);
37397 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37398 }
37399 }
37400 else
37401 mask = gen_rtx_NOT (mode, mask);
37402 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37403 gen_rtx_AND (mode, mask, sign)));
37404 emit_insn (gen_rtx_SET (VOIDmode, result,
37405 gen_rtx_IOR (mode, abs_value, sgn)));
37406 }
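/* Bit-level model of the expansion above, viewing the operands as their
   IEEE bit patterns (sketch only; ABS_VALUE is assumed to already have a
   zero sign bit):

	sgn    = sign_bits & SIGN_MASK;		/* isolate the sign of SIGN  */
	result = abs_bits | sgn;		/* graft it onto ABS_VALUE  */

   When MASK is supplied it is the complement of SIGN_MASK, so it is
   inverted before the AND.  */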
37407
37408 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37409 mask for masking out the sign-bit is stored in *SMASK, if that is
37410 non-null. */
37411 static rtx
37412 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37413 {
37414 enum machine_mode vmode, mode = GET_MODE (op0);
37415 rtx xa, mask;
37416
37417 xa = gen_reg_rtx (mode);
37418 if (mode == SFmode)
37419 vmode = V4SFmode;
37420 else if (mode == DFmode)
37421 vmode = V2DFmode;
37422 else
37423 vmode = mode;
37424 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37425 if (!VECTOR_MODE_P (mode))
37426 {
37427 /* We need to generate a scalar mode mask in this case. */
37428 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37429 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37430 mask = gen_reg_rtx (mode);
37431 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37432 }
37433 emit_insn (gen_rtx_SET (VOIDmode, xa,
37434 gen_rtx_AND (mode, op0, mask)));
37435
37436 if (smask)
37437 *smask = mask;
37438
37439 return xa;
37440 }
37441
37442 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37443 swapping the operands if SWAP_OPERANDS is true. The expanded
37444 code is a forward jump to a newly created label in case the
37445 comparison is true. The generated label rtx is returned. */
37446 static rtx
37447 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37448 bool swap_operands)
37449 {
37450 rtx label, tmp;
37451
37452 if (swap_operands)
37453 {
37454 tmp = op0;
37455 op0 = op1;
37456 op1 = tmp;
37457 }
37458
37459 label = gen_label_rtx ();
37460 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37461 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37462 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37463 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37464 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37465 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37466 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37467 JUMP_LABEL (tmp) = label;
37468
37469 return label;
37470 }
37471
37472 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37473 using comparison code CODE. Operands are swapped for the comparison if
37474 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37475 static rtx
37476 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37477 bool swap_operands)
37478 {
37479 rtx (*insn)(rtx, rtx, rtx, rtx);
37480 enum machine_mode mode = GET_MODE (op0);
37481 rtx mask = gen_reg_rtx (mode);
37482
37483 if (swap_operands)
37484 {
37485 rtx tmp = op0;
37486 op0 = op1;
37487 op1 = tmp;
37488 }
37489
37490 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37491
37492 emit_insn (insn (mask, op0, op1,
37493 gen_rtx_fmt_ee (code, mode, op0, op1)));
37494 return mask;
37495 }
37496
37497 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37498 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
37499 static rtx
37500 ix86_gen_TWO52 (enum machine_mode mode)
37501 {
37502 REAL_VALUE_TYPE TWO52r;
37503 rtx TWO52;
37504
37505 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37506 TWO52 = const_double_from_real_value (TWO52r, mode);
37507 TWO52 = force_reg (mode, TWO52);
37508
37509 return TWO52;
37510 }
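/* This constant drives the "add and subtract 2^52" trick used by the
   expansions below: for |x| < 2^52 (2^23 for SFmode), the sum x + TWO52
   has no fraction bits left, so

	y = (x + TWO52) - TWO52

   is x rounded to an integer in the current rounding mode (normally
   round-to-nearest).  Inputs with |x| >= TWO52 are already integral and
   are passed through unchanged by the isless (xa, TWO52) guard in each
   expansion.  */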
37511
37512 /* Expand SSE sequence for computing lround from OP1 storing
37513 into OP0. */
37514 void
37515 ix86_expand_lround (rtx op0, rtx op1)
37516 {
37517 /* C code for the stuff we're doing below:
37518 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37519 return (long)tmp;
37520 */
37521 enum machine_mode mode = GET_MODE (op1);
37522 const struct real_format *fmt;
37523 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37524 rtx adj;
37525
37526 /* load nextafter (0.5, 0.0) */
37527 fmt = REAL_MODE_FORMAT (mode);
37528 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37529 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37530
37531 /* adj = copysign (0.5, op1) */
37532 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37533 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37534
37535 /* adj = op1 + adj */
37536 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37537
37538 /* op0 = (imode)adj */
37539 expand_fix (op0, adj, 0);
37540 }
37541
37542 /* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
37543 into OP0. */
37544 void
37545 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37546 {
37547 /* C code for the stuff we're doing below (for do_floor):
37548 xi = (long)op1;
37549 xi -= (double)xi > op1 ? 1 : 0;
37550 return xi;
37551 */
37552 enum machine_mode fmode = GET_MODE (op1);
37553 enum machine_mode imode = GET_MODE (op0);
37554 rtx ireg, freg, label, tmp;
37555
37556 /* reg = (long)op1 */
37557 ireg = gen_reg_rtx (imode);
37558 expand_fix (ireg, op1, 0);
37559
37560 /* freg = (double)reg */
37561 freg = gen_reg_rtx (fmode);
37562 expand_float (freg, ireg, 0);
37563
37564 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37565 label = ix86_expand_sse_compare_and_jump (UNLE,
37566 freg, op1, !do_floor);
37567 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37568 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37569 emit_move_insn (ireg, tmp);
37570
37571 emit_label (label);
37572 LABEL_NUSES (label) = 1;
37573
37574 emit_move_insn (op0, ireg);
37575 }
37576
37577 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37578 result in OPERAND0. */
37579 void
37580 ix86_expand_rint (rtx operand0, rtx operand1)
37581 {
37582 /* C code for the stuff we're doing below:
37583 xa = fabs (operand1);
37584 if (!isless (xa, 2**52))
37585 return operand1;
37586 xa = xa + 2**52 - 2**52;
37587 return copysign (xa, operand1);
37588 */
37589 enum machine_mode mode = GET_MODE (operand0);
37590 rtx res, xa, label, TWO52, mask;
37591
37592 res = gen_reg_rtx (mode);
37593 emit_move_insn (res, operand1);
37594
37595 /* xa = abs (operand1) */
37596 xa = ix86_expand_sse_fabs (res, &mask);
37597
37598 /* if (!isless (xa, TWO52)) goto label; */
37599 TWO52 = ix86_gen_TWO52 (mode);
37600 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37601
37602 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37603 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37604
37605 ix86_sse_copysign_to_positive (res, xa, res, mask);
37606
37607 emit_label (label);
37608 LABEL_NUSES (label) = 1;
37609
37610 emit_move_insn (operand0, res);
37611 }
37612
37613 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37614 into OPERAND0. */
37615 void
37616 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37617 {
37618 /* C code for the stuff we expand below.
37619 double xa = fabs (x), x2;
37620 if (!isless (xa, TWO52))
37621 return x;
37622 xa = xa + TWO52 - TWO52;
37623 x2 = copysign (xa, x);
37624 Compensate. Floor:
37625 if (x2 > x)
37626 x2 -= 1;
37627 Compensate. Ceil:
37628 if (x2 < x)
37629 x2 -= -1;
37630 return x2;
37631 */
37632 enum machine_mode mode = GET_MODE (operand0);
37633 rtx xa, TWO52, tmp, label, one, res, mask;
37634
37635 TWO52 = ix86_gen_TWO52 (mode);
37636
37637 /* Temporary for holding the result, initialized to the input
37638 operand to ease control flow. */
37639 res = gen_reg_rtx (mode);
37640 emit_move_insn (res, operand1);
37641
37642 /* xa = abs (operand1) */
37643 xa = ix86_expand_sse_fabs (res, &mask);
37644
37645 /* if (!isless (xa, TWO52)) goto label; */
37646 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37647
37648 /* xa = xa + TWO52 - TWO52; */
37649 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37650 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37651
37652 /* xa = copysign (xa, operand1) */
37653 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37654
37655 /* generate 1.0 or -1.0 */
37656 one = force_reg (mode,
37657 const_double_from_real_value (do_floor
37658 ? dconst1 : dconstm1, mode));
37659
37660 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37661 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37662 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37663 gen_rtx_AND (mode, one, tmp)));
37664 /* We always need to subtract here to preserve signed zero. */
37665 tmp = expand_simple_binop (mode, MINUS,
37666 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37667 emit_move_insn (res, tmp);
37668
37669 emit_label (label);
37670 LABEL_NUSES (label) = 1;
37671
37672 emit_move_insn (operand0, res);
37673 }
37674
37675 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37676 into OPERAND0. */
37677 void
37678 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37679 {
37680 /* C code for the stuff we expand below.
37681 double xa = fabs (x), x2;
37682 if (!isless (xa, TWO52))
37683 return x;
37684 x2 = (double)(long)x;
37685 Compensate. Floor:
37686 if (x2 > x)
37687 x2 -= 1;
37688 Compensate. Ceil:
37689 if (x2 < x)
37690 x2 += 1;
37691 if (HONOR_SIGNED_ZEROS (mode))
37692 return copysign (x2, x);
37693 return x2;
37694 */
37695 enum machine_mode mode = GET_MODE (operand0);
37696 rtx xa, xi, TWO52, tmp, label, one, res, mask;
37697
37698 TWO52 = ix86_gen_TWO52 (mode);
37699
37700 /* Temporary for holding the result, initialized to the input
37701 operand to ease control flow. */
37702 res = gen_reg_rtx (mode);
37703 emit_move_insn (res, operand1);
37704
37705 /* xa = abs (operand1) */
37706 xa = ix86_expand_sse_fabs (res, &mask);
37707
37708 /* if (!isless (xa, TWO52)) goto label; */
37709 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37710
37711 /* xa = (double)(long)x */
37712 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37713 expand_fix (xi, res, 0);
37714 expand_float (xa, xi, 0);
37715
37716 /* generate 1.0 */
37717 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37718
37719 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37720 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37721 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37722 gen_rtx_AND (mode, one, tmp)));
37723 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
37724 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37725 emit_move_insn (res, tmp);
37726
37727 if (HONOR_SIGNED_ZEROS (mode))
37728 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37729
37730 emit_label (label);
37731 LABEL_NUSES (label) = 1;
37732
37733 emit_move_insn (operand0, res);
37734 }
37735
37736 /* Expand SSE sequence for computing round from OPERAND1 storing
37737 into OPERAND0. Sequence that works without relying on DImode truncation
37738 via cvttsd2siq, which is only available on 64-bit targets. */
37739 void
37740 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
37741 {
37742 /* C code for the stuff we expand below.
37743 double xa = fabs (x), xa2, x2;
37744 if (!isless (xa, TWO52))
37745 return x;
37746 Using the absolute value and copying back sign makes
37747 -0.0 -> -0.0 correct.
37748 xa2 = xa + TWO52 - TWO52;
37749 Compensate.
37750 dxa = xa2 - xa;
37751 if (dxa <= -0.5)
37752 xa2 += 1;
37753 else if (dxa > 0.5)
37754 xa2 -= 1;
37755 x2 = copysign (xa2, x);
37756 return x2;
37757 */
37758 enum machine_mode mode = GET_MODE (operand0);
37759 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
37760
37761 TWO52 = ix86_gen_TWO52 (mode);
37762
37763 /* Temporary for holding the result, initialized to the input
37764 operand to ease control flow. */
37765 res = gen_reg_rtx (mode);
37766 emit_move_insn (res, operand1);
37767
37768 /* xa = abs (operand1) */
37769 xa = ix86_expand_sse_fabs (res, &mask);
37770
37771 /* if (!isless (xa, TWO52)) goto label; */
37772 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37773
37774 /* xa2 = xa + TWO52 - TWO52; */
37775 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37776 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
37777
37778 /* dxa = xa2 - xa; */
37779 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
37780
37781 /* generate 0.5, 1.0 and -0.5 */
37782 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
37783 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
37784 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
37785 0, OPTAB_DIRECT);
37786
37787 /* Compensate. */
37788 tmp = gen_reg_rtx (mode);
37789 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
37790 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
37791 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37792 gen_rtx_AND (mode, one, tmp)));
37793 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37794 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
37795 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
37796 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37797 gen_rtx_AND (mode, one, tmp)));
37798 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37799
37800 /* res = copysign (xa2, operand1) */
37801 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
37802
37803 emit_label (label);
37804 LABEL_NUSES (label) = 1;
37805
37806 emit_move_insn (operand0, res);
37807 }
37808
37809 /* Expand SSE sequence for computing trunc from OPERAND1 storing
37810 into OPERAND0. */
37811 void
37812 ix86_expand_trunc (rtx operand0, rtx operand1)
37813 {
37814 /* C code for SSE variant we expand below.
37815 double xa = fabs (x), x2;
37816 if (!isless (xa, TWO52))
37817 return x;
37818 x2 = (double)(long)x;
37819 if (HONOR_SIGNED_ZEROS (mode))
37820 return copysign (x2, x);
37821 return x2;
37822 */
37823 enum machine_mode mode = GET_MODE (operand0);
37824 rtx xa, xi, TWO52, label, res, mask;
37825
37826 TWO52 = ix86_gen_TWO52 (mode);
37827
37828 /* Temporary for holding the result, initialized to the input
37829 operand to ease control flow. */
37830 res = gen_reg_rtx (mode);
37831 emit_move_insn (res, operand1);
37832
37833 /* xa = abs (operand1) */
37834 xa = ix86_expand_sse_fabs (res, &mask);
37835
37836 /* if (!isless (xa, TWO52)) goto label; */
37837 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37838
37839 /* x = (double)(long)x */
37840 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37841 expand_fix (xi, res, 0);
37842 expand_float (res, xi, 0);
37843
37844 if (HONOR_SIGNED_ZEROS (mode))
37845 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
37846
37847 emit_label (label);
37848 LABEL_NUSES (label) = 1;
37849
37850 emit_move_insn (operand0, res);
37851 }
37852
37853 /* Expand SSE sequence for computing trunc from OPERAND1 storing into
37854 OPERAND0; works without relying on DImode truncation via cvttsd2siq, which is only available on 64-bit targets. */
37855 void
37856 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
37857 {
37858 enum machine_mode mode = GET_MODE (operand0);
37859 rtx xa, mask, TWO52, label, one, res, smask, tmp;
37860
37861 /* C code for SSE variant we expand below.
37862 double xa = fabs (x), x2;
37863 if (!isless (xa, TWO52))
37864 return x;
37865 xa2 = xa + TWO52 - TWO52;
37866 Compensate:
37867 if (xa2 > xa)
37868 xa2 -= 1.0;
37869 x2 = copysign (xa2, x);
37870 return x2;
37871 */
37872
37873 TWO52 = ix86_gen_TWO52 (mode);
37874
37875 /* Temporary for holding the result, initialized to the input
37876 operand to ease control flow. */
37877 res = gen_reg_rtx (mode);
37878 emit_move_insn (res, operand1);
37879
37880 /* xa = abs (operand1) */
37881 xa = ix86_expand_sse_fabs (res, &smask);
37882
37883 /* if (!isless (xa, TWO52)) goto label; */
37884 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37885
37886 /* res = xa + TWO52 - TWO52; */
37887 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37888 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
37889 emit_move_insn (res, tmp);
37890
37891 /* generate 1.0 */
37892 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
37893
37894 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
37895 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
37896 emit_insn (gen_rtx_SET (VOIDmode, mask,
37897 gen_rtx_AND (mode, mask, one)));
37898 tmp = expand_simple_binop (mode, MINUS,
37899 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
37900 emit_move_insn (res, tmp);
37901
37902 /* res = copysign (res, operand1) */
37903 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
37904
37905 emit_label (label);
37906 LABEL_NUSES (label) = 1;
37907
37908 emit_move_insn (operand0, res);
37909 }
37910
37911 /* Expand SSE sequence for computing round from OPERAND1 storing
37912 into OPERAND0. */
37913 void
37914 ix86_expand_round (rtx operand0, rtx operand1)
37915 {
37916 /* C code for the stuff we're doing below:
37917 double xa = fabs (x);
37918 if (!isless (xa, TWO52))
37919 return x;
37920 xa = (double)(long)(xa + nextafter (0.5, 0.0));
37921 return copysign (xa, x);
37922 */
37923 enum machine_mode mode = GET_MODE (operand0);
37924 rtx res, TWO52, xa, label, xi, half, mask;
37925 const struct real_format *fmt;
37926 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37927
37928 /* Temporary for holding the result, initialized to the input
37929 operand to ease control flow. */
37930 res = gen_reg_rtx (mode);
37931 emit_move_insn (res, operand1);
37932
37933 TWO52 = ix86_gen_TWO52 (mode);
37934 xa = ix86_expand_sse_fabs (res, &mask);
37935 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37936
37937 /* load nextafter (0.5, 0.0) */
37938 fmt = REAL_MODE_FORMAT (mode);
37939 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37940 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37941
37942 /* xa = xa + 0.5 */
37943 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
37944 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
37945
37946 /* xa = (double)(int64_t)xa */
37947 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
37948 expand_fix (xi, xa, 0);
37949 expand_float (xa, xi, 0);
37950
37951 /* res = copysign (xa, operand1) */
37952 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
37953
37954 emit_label (label);
37955 LABEL_NUSES (label) = 1;
37956
37957 emit_move_insn (operand0, res);
37958 }
37959
37960 /* Expand SSE sequence for computing round
37961 from OP1 storing into OP0 using sse4 round insn. */
37962 void
37963 ix86_expand_round_sse4 (rtx op0, rtx op1)
37964 {
37965 enum machine_mode mode = GET_MODE (op0);
37966 rtx e1, e2, res, half;
37967 const struct real_format *fmt;
37968 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37969 rtx (*gen_copysign) (rtx, rtx, rtx);
37970 rtx (*gen_round) (rtx, rtx, rtx);
37971
37972 switch (mode)
37973 {
37974 case SFmode:
37975 gen_copysign = gen_copysignsf3;
37976 gen_round = gen_sse4_1_roundsf2;
37977 break;
37978 case DFmode:
37979 gen_copysign = gen_copysigndf3;
37980 gen_round = gen_sse4_1_rounddf2;
37981 break;
37982 default:
37983 gcc_unreachable ();
37984 }
37985
37986 /* round (a) = trunc (a + copysign (0.5, a)) */
37987
37988 /* load nextafter (0.5, 0.0) */
37989 fmt = REAL_MODE_FORMAT (mode);
37990 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37991 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
37992 half = const_double_from_real_value (pred_half, mode);
37993
37994 /* e1 = copysign (0.5, op1) */
37995 e1 = gen_reg_rtx (mode);
37996 emit_insn (gen_copysign (e1, half, op1));
37997
37998 /* e2 = op1 + e1 */
37999 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38000
38001 /* res = trunc (e2) */
38002 res = gen_reg_rtx (mode);
38003 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38004
38005 emit_move_insn (op0, res);
38006 }
38007 \f
38008
38009 /* Table of valid machine attributes. */
38010 static const struct attribute_spec ix86_attribute_table[] =
38011 {
38012 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38013 affects_type_identity } */
38014 /* Stdcall attribute says callee is responsible for popping arguments
38015 if they are not variable. */
38016 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38017 true },
38018 /* Fastcall attribute says callee is responsible for popping arguments
38019 if they are not variable. */
38020 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38021 true },
38022 /* Thiscall attribute says callee is responsible for popping arguments
38023 if they are not variable. */
38024 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38025 true },
38026 /* Cdecl attribute says the callee is a normal C declaration */
38027 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38028 true },
38029 /* Regparm attribute specifies how many integer arguments are to be
38030 passed in registers. */
38031 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38032 true },
38033 /* Sseregparm attribute says we are using x86_64 calling conventions
38034 for FP arguments. */
38035 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38036 true },
38037 /* The transactional memory builtins are implicitly regparm or fastcall
38038 depending on the ABI. Override the generic do-nothing attribute that
38039 these builtins were declared with. */
38040 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38041 true },
38042 /* force_align_arg_pointer says this function realigns the stack at entry. */
38043 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38044 false, true, true, ix86_handle_cconv_attribute, false },
38045 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38046 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38047 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38048 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38049 false },
38050 #endif
38051 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38052 false },
38053 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38054 false },
38055 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38056 SUBTARGET_ATTRIBUTE_TABLE,
38057 #endif
38058 /* ms_abi and sysv_abi calling convention function attributes. */
38059 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38060 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38061 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38062 false },
38063 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38064 ix86_handle_callee_pop_aggregate_return, true },
38065 /* End element. */
38066 { NULL, 0, 0, false, false, false, NULL, false }
38067 };
38068
38069 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38070 static int
38071 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38072 tree vectype,
38073 int misalign ATTRIBUTE_UNUSED)
38074 {
38075 unsigned elements;
38076
38077 switch (type_of_cost)
38078 {
38079 case scalar_stmt:
38080 return ix86_cost->scalar_stmt_cost;
38081
38082 case scalar_load:
38083 return ix86_cost->scalar_load_cost;
38084
38085 case scalar_store:
38086 return ix86_cost->scalar_store_cost;
38087
38088 case vector_stmt:
38089 return ix86_cost->vec_stmt_cost;
38090
38091 case vector_load:
38092 return ix86_cost->vec_align_load_cost;
38093
38094 case vector_store:
38095 return ix86_cost->vec_store_cost;
38096
38097 case vec_to_scalar:
38098 return ix86_cost->vec_to_scalar_cost;
38099
38100 case scalar_to_vec:
38101 return ix86_cost->scalar_to_vec_cost;
38102
38103 case unaligned_load:
38104 case unaligned_store:
38105 return ix86_cost->vec_unalign_load_cost;
38106
38107 case cond_branch_taken:
38108 return ix86_cost->cond_taken_branch_cost;
38109
38110 case cond_branch_not_taken:
38111 return ix86_cost->cond_not_taken_branch_cost;
38112
38113 case vec_perm:
38114 case vec_promote_demote:
38115 return ix86_cost->vec_stmt_cost;
38116
38117 case vec_construct:
38118 elements = TYPE_VECTOR_SUBPARTS (vectype);
38119 return elements / 2 + 1;
38120
38121 default:
38122 gcc_unreachable ();
38123 }
38124 }
38125
38126 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38127 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38128 insn every time. */
38129
38130 static GTY(()) rtx vselect_insn;
38131
38132 /* Initialize vselect_insn. */
38133
38134 static void
38135 init_vselect_insn (void)
38136 {
38137 unsigned i;
38138 rtx x;
38139
38140 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38141 for (i = 0; i < MAX_VECT_LEN; ++i)
38142 XVECEXP (x, 0, i) = const0_rtx;
38143 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38144 const0_rtx), x);
38145 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38146 start_sequence ();
38147 vselect_insn = emit_insn (x);
38148 end_sequence ();
38149 }
38150
38151 /* Construct (set target (vec_select op0 (parallel perm))) and
38152 return true if that's a valid instruction in the active ISA. */
38153
38154 static bool
38155 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38156 unsigned nelt, bool testing_p)
38157 {
38158 unsigned int i;
38159 rtx x, save_vconcat;
38160 int icode;
38161
38162 if (vselect_insn == NULL_RTX)
38163 init_vselect_insn ();
38164
38165 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38166 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38167 for (i = 0; i < nelt; ++i)
38168 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38169 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38170 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38171 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38172 SET_DEST (PATTERN (vselect_insn)) = target;
38173 icode = recog_memoized (vselect_insn);
38174
38175 if (icode >= 0 && !testing_p)
38176 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38177
38178 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38179 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38180 INSN_CODE (vselect_insn) = -1;
38181
38182 return icode >= 0;
38183 }
38184
38185 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38186
38187 static bool
38188 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38189 const unsigned char *perm, unsigned nelt,
38190 bool testing_p)
38191 {
38192 enum machine_mode v2mode;
38193 rtx x;
38194 bool ok;
38195
38196 if (vselect_insn == NULL_RTX)
38197 init_vselect_insn ();
38198
38199 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38200 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38201 PUT_MODE (x, v2mode);
38202 XEXP (x, 0) = op0;
38203 XEXP (x, 1) = op1;
38204 ok = expand_vselect (target, x, perm, nelt, testing_p);
38205 XEXP (x, 0) = const0_rtx;
38206 XEXP (x, 1) = const0_rtx;
38207 return ok;
38208 }
38209
38210 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38211 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38212
38213 static bool
38214 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38215 {
38216 enum machine_mode vmode = d->vmode;
38217 unsigned i, mask, nelt = d->nelt;
38218 rtx target, op0, op1, x;
38219 rtx rperm[32], vperm;
38220
38221 if (d->one_operand_p)
38222 return false;
38223 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38224 ;
38225 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38226 ;
38227 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38228 ;
38229 else
38230 return false;
38231
38232 /* This is a blend, not a permute. Elements must stay in their
38233 respective lanes. */
38234 for (i = 0; i < nelt; ++i)
38235 {
38236 unsigned e = d->perm[i];
38237 if (!(e == i || e == i + nelt))
38238 return false;
38239 }
38240
38241 if (d->testing_p)
38242 return true;
38243
38244 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38245 decision should be extracted elsewhere, so that we only try that
38246 sequence once all budget==3 options have been tried. */
38247 target = d->target;
38248 op0 = d->op0;
38249 op1 = d->op1;
38250 mask = 0;
38251
38252 switch (vmode)
38253 {
38254 case V4DFmode:
38255 case V8SFmode:
38256 case V2DFmode:
38257 case V4SFmode:
38258 case V8HImode:
38259 case V8SImode:
38260 for (i = 0; i < nelt; ++i)
38261 mask |= (d->perm[i] >= nelt) << i;
38262 break;
38263
38264 case V2DImode:
38265 for (i = 0; i < 2; ++i)
38266 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38267 vmode = V8HImode;
38268 goto do_subreg;
38269
38270 case V4SImode:
38271 for (i = 0; i < 4; ++i)
38272 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38273 vmode = V8HImode;
38274 goto do_subreg;
38275
38276 case V16QImode:
38277 /* See if bytes move in pairs so we can use pblendw with
38278 an immediate argument, rather than pblendvb with a vector
38279 argument. */
38280 for (i = 0; i < 16; i += 2)
38281 if (d->perm[i] + 1 != d->perm[i + 1])
38282 {
38283 use_pblendvb:
38284 for (i = 0; i < nelt; ++i)
38285 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38286
38287 finish_pblendvb:
38288 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38289 vperm = force_reg (vmode, vperm);
38290
38291 if (GET_MODE_SIZE (vmode) == 16)
38292 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38293 else
38294 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38295 return true;
38296 }
38297
38298 for (i = 0; i < 8; ++i)
38299 mask |= (d->perm[i * 2] >= 16) << i;
38300 vmode = V8HImode;
38301 /* FALLTHRU */
38302
38303 do_subreg:
38304 target = gen_lowpart (vmode, target);
38305 op0 = gen_lowpart (vmode, op0);
38306 op1 = gen_lowpart (vmode, op1);
38307 break;
38308
38309 case V32QImode:
38310 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38311 for (i = 0; i < 32; i += 2)
38312 if (d->perm[i] + 1 != d->perm[i + 1])
38313 goto use_pblendvb;
38314 /* See if bytes move in quadruplets. If yes, vpblendd
38315 with immediate can be used. */
38316 for (i = 0; i < 32; i += 4)
38317 if (d->perm[i] + 2 != d->perm[i + 2])
38318 break;
38319 if (i < 32)
38320 {
38321 /* See if bytes move the same in both lanes. If yes,
38322 vpblendw with immediate can be used. */
38323 for (i = 0; i < 16; i += 2)
38324 if (d->perm[i] + 16 != d->perm[i + 16])
38325 goto use_pblendvb;
38326
38327 /* Use vpblendw. */
38328 for (i = 0; i < 16; ++i)
38329 mask |= (d->perm[i * 2] >= 32) << i;
38330 vmode = V16HImode;
38331 goto do_subreg;
38332 }
38333
38334 /* Use vpblendd. */
38335 for (i = 0; i < 8; ++i)
38336 mask |= (d->perm[i * 4] >= 32) << i;
38337 vmode = V8SImode;
38338 goto do_subreg;
38339
38340 case V16HImode:
38341 /* See if words move in pairs. If yes, vpblendd can be used. */
38342 for (i = 0; i < 16; i += 2)
38343 if (d->perm[i] + 1 != d->perm[i + 1])
38344 break;
38345 if (i < 16)
38346 {
38347 /* See if words move the same in both lanes. If not,
38348 vpblendvb must be used. */
38349 for (i = 0; i < 8; i++)
38350 if (d->perm[i] + 8 != d->perm[i + 8])
38351 {
38352 /* Use vpblendvb. */
38353 for (i = 0; i < 32; ++i)
38354 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38355
38356 vmode = V32QImode;
38357 nelt = 32;
38358 target = gen_lowpart (vmode, target);
38359 op0 = gen_lowpart (vmode, op0);
38360 op1 = gen_lowpart (vmode, op1);
38361 goto finish_pblendvb;
38362 }
38363
38364 /* Use vpblendw. */
38365 for (i = 0; i < 16; ++i)
38366 mask |= (d->perm[i] >= 16) << i;
38367 break;
38368 }
38369
38370 /* Use vpblendd. */
38371 for (i = 0; i < 8; ++i)
38372 mask |= (d->perm[i * 2] >= 16) << i;
38373 vmode = V8SImode;
38374 goto do_subreg;
38375
38376 case V4DImode:
38377 /* Use vpblendd. */
38378 for (i = 0; i < 4; ++i)
38379 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38380 vmode = V8SImode;
38381 goto do_subreg;
38382
38383 default:
38384 gcc_unreachable ();
38385 }
38386
38387 /* This matches five different patterns with the different modes. */
38388 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38389 x = gen_rtx_SET (VOIDmode, target, x);
38390 emit_insn (x);
38391
38392 return true;
38393 }
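/* Worked example for the immediate-mask construction above (illustrative):
   for a V8SImode blend with perm = {0, 9, 2, 11, 4, 13, 6, 15}, element i
   is taken from op1 exactly when perm[i] >= nelt, giving

	mask = 0b10101010 = 0xaa

   which a single vpblendd (under AVX2) applies directly.  */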
38394
38395 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38396 in terms of the variable form of vpermilps.
38397
38398 Note that we will have already failed the immediate input vpermilps,
38399 which requires that the high and low part shuffle be identical; the
38400 variable form doesn't require that. */
38401
38402 static bool
38403 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38404 {
38405 rtx rperm[8], vperm;
38406 unsigned i;
38407
38408 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38409 return false;
38410
38411 /* We can only permute within the 128-bit lane. */
38412 for (i = 0; i < 8; ++i)
38413 {
38414 unsigned e = d->perm[i];
38415 if (i < 4 ? e >= 4 : e < 4)
38416 return false;
38417 }
38418
38419 if (d->testing_p)
38420 return true;
38421
38422 for (i = 0; i < 8; ++i)
38423 {
38424 unsigned e = d->perm[i];
38425
38426 /* Within each 128-bit lane, the elements of op0 are numbered
38427 from 0 and the elements of op1 are numbered from 4. */
38428 if (e >= 8 + 4)
38429 e -= 8;
38430 else if (e >= 4)
38431 e -= 4;
38432
38433 rperm[i] = GEN_INT (e);
38434 }
38435
38436 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38437 vperm = force_reg (V8SImode, vperm);
38438 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38439
38440 return true;
38441 }
38442
38443 /* Return true if permutation D can be performed as VMODE permutation
38444 instead. */
38445
38446 static bool
38447 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38448 {
38449 unsigned int i, j, chunk;
38450
38451 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38452 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38453 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38454 return false;
38455
38456 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38457 return true;
38458
38459 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38460 for (i = 0; i < d->nelt; i += chunk)
38461 if (d->perm[i] & (chunk - 1))
38462 return false;
38463 else
38464 for (j = 1; j < chunk; ++j)
38465 if (d->perm[i] + j != d->perm[i + j])
38466 return false;
38467
38468 return true;
38469 }
38470
38471 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38472 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38473
38474 static bool
38475 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38476 {
38477 unsigned i, nelt, eltsz, mask;
38478 unsigned char perm[32];
38479 enum machine_mode vmode = V16QImode;
38480 rtx rperm[32], vperm, target, op0, op1;
38481
38482 nelt = d->nelt;
38483
38484 if (!d->one_operand_p)
38485 {
38486 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38487 {
38488 if (TARGET_AVX2
38489 && valid_perm_using_mode_p (V2TImode, d))
38490 {
38491 if (d->testing_p)
38492 return true;
38493
38494 /* Use vperm2i128 insn. The pattern uses
38495 V4DImode instead of V2TImode. */
38496 target = gen_lowpart (V4DImode, d->target);
38497 op0 = gen_lowpart (V4DImode, d->op0);
38498 op1 = gen_lowpart (V4DImode, d->op1);
38499 rperm[0]
38500 = GEN_INT ((d->perm[0] / (nelt / 2))
38501 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38502 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38503 return true;
38504 }
38505 return false;
38506 }
38507 }
38508 else
38509 {
38510 if (GET_MODE_SIZE (d->vmode) == 16)
38511 {
38512 if (!TARGET_SSSE3)
38513 return false;
38514 }
38515 else if (GET_MODE_SIZE (d->vmode) == 32)
38516 {
38517 if (!TARGET_AVX2)
38518 return false;
38519
38520 /* V4DImode should be already handled through
38521 expand_vselect by vpermq instruction. */
38522 gcc_assert (d->vmode != V4DImode);
38523
38524 vmode = V32QImode;
38525 if (d->vmode == V8SImode
38526 || d->vmode == V16HImode
38527 || d->vmode == V32QImode)
38528 {
38529 /* First see if vpermq can be used for
38530 V8SImode/V16HImode/V32QImode. */
38531 if (valid_perm_using_mode_p (V4DImode, d))
38532 {
38533 for (i = 0; i < 4; i++)
38534 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38535 if (d->testing_p)
38536 return true;
38537 return expand_vselect (gen_lowpart (V4DImode, d->target),
38538 gen_lowpart (V4DImode, d->op0),
38539 perm, 4, false);
38540 }
38541
38542 /* Next see if vpermd can be used. */
38543 if (valid_perm_using_mode_p (V8SImode, d))
38544 vmode = V8SImode;
38545 }
38546 /* Or if vpermps can be used. */
38547 else if (d->vmode == V8SFmode)
38548 vmode = V8SImode;
38549
38550 if (vmode == V32QImode)
38551 {
38552 /* vpshufb only works within 128-bit lanes; it is not
38553 possible to shuffle bytes between the lanes. */
38554 for (i = 0; i < nelt; ++i)
38555 if ((d->perm[i] ^ i) & (nelt / 2))
38556 return false;
38557 }
38558 }
38559 else
38560 return false;
38561 }
38562
38563 if (d->testing_p)
38564 return true;
38565
38566 if (vmode == V8SImode)
38567 for (i = 0; i < 8; ++i)
38568 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38569 else
38570 {
38571 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38572 if (!d->one_operand_p)
38573 mask = 2 * nelt - 1;
38574 else if (vmode == V16QImode)
38575 mask = nelt - 1;
38576 else
38577 mask = nelt / 2 - 1;
38578
38579 for (i = 0; i < nelt; ++i)
38580 {
38581 unsigned j, e = d->perm[i] & mask;
38582 for (j = 0; j < eltsz; ++j)
38583 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38584 }
38585 }
38586
38587 vperm = gen_rtx_CONST_VECTOR (vmode,
38588 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38589 vperm = force_reg (vmode, vperm);
38590
38591 target = gen_lowpart (vmode, d->target);
38592 op0 = gen_lowpart (vmode, d->op0);
38593 if (d->one_operand_p)
38594 {
38595 if (vmode == V16QImode)
38596 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38597 else if (vmode == V32QImode)
38598 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38599 else if (vmode == V8SFmode)
38600 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38601 else
38602 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38603 }
38604 else
38605 {
38606 op1 = gen_lowpart (vmode, d->op1);
38607 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38608 }
38609
38610 return true;
38611 }
38612
38613 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38614 in a single instruction. */
38615
38616 static bool
38617 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38618 {
38619 unsigned i, nelt = d->nelt;
38620 unsigned char perm2[MAX_VECT_LEN];
38621
38622 /* Check plain VEC_SELECT first, because AVX has instructions that could
38623 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38624 input where SEL+CONCAT may not. */
38625 if (d->one_operand_p)
38626 {
38627 int mask = nelt - 1;
38628 bool identity_perm = true;
38629 bool broadcast_perm = true;
38630
38631 for (i = 0; i < nelt; i++)
38632 {
38633 perm2[i] = d->perm[i] & mask;
38634 if (perm2[i] != i)
38635 identity_perm = false;
38636 if (perm2[i])
38637 broadcast_perm = false;
38638 }
38639
38640 if (identity_perm)
38641 {
38642 if (!d->testing_p)
38643 emit_move_insn (d->target, d->op0);
38644 return true;
38645 }
38646 else if (broadcast_perm && TARGET_AVX2)
38647 {
38648 /* Use vpbroadcast{b,w,d}. */
38649 rtx (*gen) (rtx, rtx) = NULL;
38650 switch (d->vmode)
38651 {
38652 case V32QImode:
38653 gen = gen_avx2_pbroadcastv32qi_1;
38654 break;
38655 case V16HImode:
38656 gen = gen_avx2_pbroadcastv16hi_1;
38657 break;
38658 case V8SImode:
38659 gen = gen_avx2_pbroadcastv8si_1;
38660 break;
38661 case V16QImode:
38662 gen = gen_avx2_pbroadcastv16qi;
38663 break;
38664 case V8HImode:
38665 gen = gen_avx2_pbroadcastv8hi;
38666 break;
38667 case V8SFmode:
38668 gen = gen_avx2_vec_dupv8sf_1;
38669 break;
38670 /* For other modes prefer other shuffles this function creates. */
38671 default: break;
38672 }
38673 if (gen != NULL)
38674 {
38675 if (!d->testing_p)
38676 emit_insn (gen (d->target, d->op0));
38677 return true;
38678 }
38679 }
38680
38681 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38682 return true;
38683
38684 /* There are plenty of patterns in sse.md that are written for
38685 SEL+CONCAT and are not replicated for a single op. Perhaps
38686 that should be changed, to avoid the nastiness here. */
38687
38688 /* Recognize interleave style patterns, which means incrementing
38689 every other permutation operand. */
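/* E.g. the one-operand V4SFmode selector { 0, 0, 1, 1 } becomes
perm2 == { 0, 4, 1, 5 }, which matches unpcklps of op0 with itself. */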
38690 for (i = 0; i < nelt; i += 2)
38691 {
38692 perm2[i] = d->perm[i] & mask;
38693 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
38694 }
38695 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38696 d->testing_p))
38697 return true;
38698
38699 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
38700 if (nelt >= 4)
38701 {
38702 for (i = 0; i < nelt; i += 4)
38703 {
38704 perm2[i + 0] = d->perm[i + 0] & mask;
38705 perm2[i + 1] = d->perm[i + 1] & mask;
38706 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
38707 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
38708 }
38709
38710 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
38711 d->testing_p))
38712 return true;
38713 }
38714 }
38715
38716 /* Finally, try the fully general two operand permute. */
38717 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
38718 d->testing_p))
38719 return true;
38720
38721 /* Recognize interleave style patterns with reversed operands. */
38722 if (!d->one_operand_p)
38723 {
38724 for (i = 0; i < nelt; ++i)
38725 {
38726 unsigned e = d->perm[i];
38727 if (e >= nelt)
38728 e -= nelt;
38729 else
38730 e += nelt;
38731 perm2[i] = e;
38732 }
38733
38734 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
38735 d->testing_p))
38736 return true;
38737 }
38738
38739 /* Try the SSE4.1 blend variable merge instructions. */
38740 if (expand_vec_perm_blend (d))
38741 return true;
38742
38743 /* Try one of the AVX vpermil variable permutations. */
38744 if (expand_vec_perm_vpermil (d))
38745 return true;
38746
38747 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
38748 vpshufb, vpermd, vpermps or vpermq variable permutation. */
38749 if (expand_vec_perm_pshufb (d))
38750 return true;
38751
38752 return false;
38753 }
38754
38755 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38756 in terms of a pair of pshuflw + pshufhw instructions. */
38757
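/* E.g. the one-operand V8HImode selector { 2,1,3,0, 6,4,5,7 } satisfies the
lane restriction below and is expanded as pshuflw with { 2,1,3,0 } followed
by pshufhw with { 6,4,5,7 }. */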
38758 static bool
38759 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
38760 {
38761 unsigned char perm2[MAX_VECT_LEN];
38762 unsigned i;
38763 bool ok;
38764
38765 if (d->vmode != V8HImode || !d->one_operand_p)
38766 return false;
38767
38768 /* The two permutations only operate in 64-bit lanes. */
38769 for (i = 0; i < 4; ++i)
38770 if (d->perm[i] >= 4)
38771 return false;
38772 for (i = 4; i < 8; ++i)
38773 if (d->perm[i] < 4)
38774 return false;
38775
38776 if (d->testing_p)
38777 return true;
38778
38779 /* Emit the pshuflw. */
38780 memcpy (perm2, d->perm, 4);
38781 for (i = 4; i < 8; ++i)
38782 perm2[i] = i;
38783 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
38784 gcc_assert (ok);
38785
38786 /* Emit the pshufhw. */
38787 memcpy (perm2 + 4, d->perm + 4, 4);
38788 for (i = 0; i < 4; ++i)
38789 perm2[i] = i;
38790 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
38791 gcc_assert (ok);
38792
38793 return true;
38794 }
38795
38796 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38797 the permutation using the SSSE3 palignr instruction. This succeeds
38798 when all of the elements in PERM fit within one vector and we merely
38799 need to shift them down so that a single vector permutation has a
38800 chance to succeed. */
38801
38802 static bool
38803 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
38804 {
38805 unsigned i, nelt = d->nelt;
38806 unsigned min, max;
38807 bool in_order, ok;
38808 rtx shift;
38809
38810 /* Even with AVX, palignr only operates on 128-bit vectors. */
38811 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
38812 return false;
38813
38814 min = nelt, max = 0;
38815 for (i = 0; i < nelt; ++i)
38816 {
38817 unsigned e = d->perm[i];
38818 if (e < min)
38819 min = e;
38820 if (e > max)
38821 max = e;
38822 }
38823 if (min == 0 || max - min >= nelt)
38824 return false;
38825
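/* E.g. for V16QImode with d->perm == { 3, 4, ..., 18 }, min == 3 and
max == 18, so a single palignr by three bytes already produces the
desired result (the in_order case below). */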
38826 /* Given that we have SSSE3, we know we'll be able to implement the
38827 single operand permutation after the palignr with pshufb. */
38828 if (d->testing_p)
38829 return true;
38830
38831 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
38832 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
38833 gen_lowpart (TImode, d->op1),
38834 gen_lowpart (TImode, d->op0), shift));
38835
38836 d->op0 = d->op1 = d->target;
38837 d->one_operand_p = true;
38838
38839 in_order = true;
38840 for (i = 0; i < nelt; ++i)
38841 {
38842 unsigned e = d->perm[i] - min;
38843 if (e != i)
38844 in_order = false;
38845 d->perm[i] = e;
38846 }
38847
38848 /* Test for the degenerate case where the alignment by itself
38849 produces the desired permutation. */
38850 if (in_order)
38851 return true;
38852
38853 ok = expand_vec_perm_1 (d);
38854 gcc_assert (ok);
38855
38856 return ok;
38857 }
38858
38859 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
38860
38861 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
38862 a two vector permutation into a single vector permutation by using
38863 an interleave operation to merge the vectors. */
38864
38865 static bool
38866 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
38867 {
38868 struct expand_vec_perm_d dremap, dfinal;
38869 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
38870 unsigned HOST_WIDE_INT contents;
38871 unsigned char remap[2 * MAX_VECT_LEN];
38872 rtx seq;
38873 bool ok, same_halves = false;
38874
38875 if (GET_MODE_SIZE (d->vmode) == 16)
38876 {
38877 if (d->one_operand_p)
38878 return false;
38879 }
38880 else if (GET_MODE_SIZE (d->vmode) == 32)
38881 {
38882 if (!TARGET_AVX)
38883 return false;
38884 /* For 32-byte modes allow even d->one_operand_p.
38885 The lack of cross-lane shuffling in some instructions
38886 might prevent a single insn shuffle. */
38887 dfinal = *d;
38888 dfinal.testing_p = true;
38889 /* If expand_vec_perm_interleave3 can expand this into
38890 a 3 insn sequence, give up and let it be expanded as
38891 a 3 insn sequence. While that is one insn longer,
38892 it doesn't need a memory operand, and in the common
38893 case where the interleave low and interleave high
38894 permutations with the same operands are adjacent, the
38895 pair needs only 4 insns in total after CSE. */
38896 if (expand_vec_perm_interleave3 (&dfinal))
38897 return false;
38898 }
38899 else
38900 return false;
38901
38902 /* Examine from whence the elements come. */
38903 contents = 0;
38904 for (i = 0; i < nelt; ++i)
38905 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
38906
38907 memset (remap, 0xff, sizeof (remap));
38908 dremap = *d;
38909
38910 if (GET_MODE_SIZE (d->vmode) == 16)
38911 {
38912 unsigned HOST_WIDE_INT h1, h2, h3, h4;
38913
38914 /* Split the two input vectors into 4 halves. */
38915 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
38916 h2 = h1 << nelt2;
38917 h3 = h2 << nelt2;
38918 h4 = h3 << nelt2;
38919
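/* E.g. for V8HImode (nelt == 8): h1 covers source elements 0-3 (low half
of op0), h2 elements 4-7, h3 elements 8-11 (low half of op1) and h4
elements 12-15. */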
38920 /* If the elements come only from the low halves, use interleave low;
38921 similarly for interleave high. If the elements are from mis-matched
38922 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
38923 if ((contents & (h1 | h3)) == contents)
38924 {
38925 /* punpckl* */
38926 for (i = 0; i < nelt2; ++i)
38927 {
38928 remap[i] = i * 2;
38929 remap[i + nelt] = i * 2 + 1;
38930 dremap.perm[i * 2] = i;
38931 dremap.perm[i * 2 + 1] = i + nelt;
38932 }
38933 if (!TARGET_SSE2 && d->vmode == V4SImode)
38934 dremap.vmode = V4SFmode;
38935 }
38936 else if ((contents & (h2 | h4)) == contents)
38937 {
38938 /* punpckh* */
38939 for (i = 0; i < nelt2; ++i)
38940 {
38941 remap[i + nelt2] = i * 2;
38942 remap[i + nelt + nelt2] = i * 2 + 1;
38943 dremap.perm[i * 2] = i + nelt2;
38944 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
38945 }
38946 if (!TARGET_SSE2 && d->vmode == V4SImode)
38947 dremap.vmode = V4SFmode;
38948 }
38949 else if ((contents & (h1 | h4)) == contents)
38950 {
38951 /* shufps */
38952 for (i = 0; i < nelt2; ++i)
38953 {
38954 remap[i] = i;
38955 remap[i + nelt + nelt2] = i + nelt2;
38956 dremap.perm[i] = i;
38957 dremap.perm[i + nelt2] = i + nelt + nelt2;
38958 }
38959 if (nelt != 4)
38960 {
38961 /* shufpd */
38962 dremap.vmode = V2DImode;
38963 dremap.nelt = 2;
38964 dremap.perm[0] = 0;
38965 dremap.perm[1] = 3;
38966 }
38967 }
38968 else if ((contents & (h2 | h3)) == contents)
38969 {
38970 /* shufps */
38971 for (i = 0; i < nelt2; ++i)
38972 {
38973 remap[i + nelt2] = i;
38974 remap[i + nelt] = i + nelt2;
38975 dremap.perm[i] = i + nelt2;
38976 dremap.perm[i + nelt2] = i + nelt;
38977 }
38978 if (nelt != 4)
38979 {
38980 /* shufpd */
38981 dremap.vmode = V2DImode;
38982 dremap.nelt = 2;
38983 dremap.perm[0] = 1;
38984 dremap.perm[1] = 2;
38985 }
38986 }
38987 else
38988 return false;
38989 }
38990 else
38991 {
38992 unsigned int nelt4 = nelt / 4, nzcnt = 0;
38993 unsigned HOST_WIDE_INT q[8];
38994 unsigned int nonzero_halves[4];
38995
38996 /* Split the two input vectors into 8 quarters. */
38997 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
38998 for (i = 1; i < 8; ++i)
38999 q[i] = q[0] << (nelt4 * i);
39000 for (i = 0; i < 4; ++i)
39001 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39002 {
39003 nonzero_halves[nzcnt] = i;
39004 ++nzcnt;
39005 }
39006
39007 if (nzcnt == 1)
39008 {
39009 gcc_assert (d->one_operand_p);
39010 nonzero_halves[1] = nonzero_halves[0];
39011 same_halves = true;
39012 }
39013 else if (d->one_operand_p)
39014 {
39015 gcc_assert (nonzero_halves[0] == 0);
39016 gcc_assert (nonzero_halves[1] == 1);
39017 }
39018
39019 if (nzcnt <= 2)
39020 {
39021 if (d->perm[0] / nelt2 == nonzero_halves[1])
39022 {
39023 /* Attempt to increase the likelihood that dfinal
39024 shuffle will be intra-lane. */
39025 char tmph = nonzero_halves[0];
39026 nonzero_halves[0] = nonzero_halves[1];
39027 nonzero_halves[1] = tmph;
39028 }
39029
39030 /* vperm2f128 or vperm2i128. */
39031 for (i = 0; i < nelt2; ++i)
39032 {
39033 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39034 remap[i + nonzero_halves[0] * nelt2] = i;
39035 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39036 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39037 }
39038
39039 if (d->vmode != V8SFmode
39040 && d->vmode != V4DFmode
39041 && d->vmode != V8SImode)
39042 {
39043 dremap.vmode = V8SImode;
39044 dremap.nelt = 8;
39045 for (i = 0; i < 4; ++i)
39046 {
39047 dremap.perm[i] = i + nonzero_halves[0] * 4;
39048 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39049 }
39050 }
39051 }
39052 else if (d->one_operand_p)
39053 return false;
39054 else if (TARGET_AVX2
39055 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39056 {
39057 /* vpunpckl* */
39058 for (i = 0; i < nelt4; ++i)
39059 {
39060 remap[i] = i * 2;
39061 remap[i + nelt] = i * 2 + 1;
39062 remap[i + nelt2] = i * 2 + nelt2;
39063 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39064 dremap.perm[i * 2] = i;
39065 dremap.perm[i * 2 + 1] = i + nelt;
39066 dremap.perm[i * 2 + nelt2] = i + nelt2;
39067 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39068 }
39069 }
39070 else if (TARGET_AVX2
39071 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39072 {
39073 /* vpunpckh* */
39074 for (i = 0; i < nelt4; ++i)
39075 {
39076 remap[i + nelt4] = i * 2;
39077 remap[i + nelt + nelt4] = i * 2 + 1;
39078 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39079 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39080 dremap.perm[i * 2] = i + nelt4;
39081 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39082 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39083 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39084 }
39085 }
39086 else
39087 return false;
39088 }
39089
39090 /* Use the remapping array set up above to move the elements from their
39091 swizzled locations into their final destinations. */
39092 dfinal = *d;
39093 for (i = 0; i < nelt; ++i)
39094 {
39095 unsigned e = remap[d->perm[i]];
39096 gcc_assert (e < nelt);
39097 /* If same_halves is true, both halves of the remapped vector are the
39098 same. Avoid cross-lane accesses if possible. */
39099 if (same_halves && i >= nelt2)
39100 {
39101 gcc_assert (e < nelt2);
39102 dfinal.perm[i] = e + nelt2;
39103 }
39104 else
39105 dfinal.perm[i] = e;
39106 }
39107 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39108 dfinal.op1 = dfinal.op0;
39109 dfinal.one_operand_p = true;
39110 dremap.target = dfinal.op0;
39111
39112 /* Test if the final remap can be done with a single insn. For V4SFmode or
39113 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39114 start_sequence ();
39115 ok = expand_vec_perm_1 (&dfinal);
39116 seq = get_insns ();
39117 end_sequence ();
39118
39119 if (!ok)
39120 return false;
39121
39122 if (d->testing_p)
39123 return true;
39124
39125 if (dremap.vmode != dfinal.vmode)
39126 {
39127 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39128 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39129 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39130 }
39131
39132 ok = expand_vec_perm_1 (&dremap);
39133 gcc_assert (ok);
39134
39135 emit_insn (seq);
39136 return true;
39137 }
39138
39139 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39140 a single vector cross-lane permutation into vpermq followed
39141 by any of the single insn permutations. */
39142
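/* E.g. the one-operand V16HImode selector { 0,8,1,9,...,7,15 } uses quarters
{ 0, 2 } of the input in its low half and quarters { 1, 3 } in its high half,
so a vpermq with { 0, 2, 1, 3 } followed by an intra-lane shuffle
implements it. */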
39143 static bool
39144 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39145 {
39146 struct expand_vec_perm_d dremap, dfinal;
39147 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39148 unsigned contents[2];
39149 bool ok;
39150
39151 if (!(TARGET_AVX2
39152 && (d->vmode == V32QImode || d->vmode == V16HImode)
39153 && d->one_operand_p))
39154 return false;
39155
39156 contents[0] = 0;
39157 contents[1] = 0;
39158 for (i = 0; i < nelt2; ++i)
39159 {
39160 contents[0] |= 1u << (d->perm[i] / nelt4);
39161 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39162 }
39163
39164 for (i = 0; i < 2; ++i)
39165 {
39166 unsigned int cnt = 0;
39167 for (j = 0; j < 4; ++j)
39168 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39169 return false;
39170 }
39171
39172 if (d->testing_p)
39173 return true;
39174
39175 dremap = *d;
39176 dremap.vmode = V4DImode;
39177 dremap.nelt = 4;
39178 dremap.target = gen_reg_rtx (V4DImode);
39179 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39180 dremap.op1 = dremap.op0;
39181 dremap.one_operand_p = true;
39182 for (i = 0; i < 2; ++i)
39183 {
39184 unsigned int cnt = 0;
39185 for (j = 0; j < 4; ++j)
39186 if ((contents[i] & (1u << j)) != 0)
39187 dremap.perm[2 * i + cnt++] = j;
39188 for (; cnt < 2; ++cnt)
39189 dremap.perm[2 * i + cnt] = 0;
39190 }
39191
39192 dfinal = *d;
39193 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39194 dfinal.op1 = dfinal.op0;
39195 dfinal.one_operand_p = true;
39196 for (i = 0, j = 0; i < nelt; ++i)
39197 {
39198 if (i == nelt2)
39199 j = 2;
39200 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39201 if ((d->perm[i] / nelt4) == dremap.perm[j])
39202 ;
39203 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39204 dfinal.perm[i] |= nelt4;
39205 else
39206 gcc_unreachable ();
39207 }
39208
39209 ok = expand_vec_perm_1 (&dremap);
39210 gcc_assert (ok);
39211
39212 ok = expand_vec_perm_1 (&dfinal);
39213 gcc_assert (ok);
39214
39215 return true;
39216 }
39217
39218 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39219 a vector permutation using two instructions, vperm2f128 resp.
39220 vperm2i128 followed by any single in-lane permutation. */
39221
39222 static bool
39223 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39224 {
39225 struct expand_vec_perm_d dfirst, dsecond;
39226 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39227 bool ok;
39228
39229 if (!TARGET_AVX
39230 || GET_MODE_SIZE (d->vmode) != 32
39231 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39232 return false;
39233
39234 dsecond = *d;
39235 dsecond.one_operand_p = false;
39236 dsecond.testing_p = true;
39237
39238 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39239 immediate. For perm < 16 the second permutation uses
39240 d->op0 as first operand, for perm >= 16 it uses d->op1
39241 as first operand. The second operand is the result of
39242 vperm2[fi]128. */
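/* E.g. perm == 7 selects lane 3 (the high lane of d->op1) for the low half
and lane 1 (the high lane of d->op0) for the high half; the corresponding
vperm2[fi]128 immediate is ((7 << 2) | 7) & 0x33 == 0x13. */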
39243 for (perm = 0; perm < 32; perm++)
39244 {
39245 /* Ignore permutations which do not move anything cross-lane. */
39246 if (perm < 16)
39247 {
39248 /* The second shuffle for e.g. V4DFmode has
39249 0123 and ABCD operands.
39250 Ignore AB23, as 23 is already in the second lane
39251 of the first operand. */
39252 if ((perm & 0xc) == (1 << 2)) continue;
39253 /* And 01CD, as 01 is in the first lane of the first
39254 operand. */
39255 if ((perm & 3) == 0) continue;
39256 /* And 4567, as then the vperm2[fi]128 doesn't change
39257 anything on the original 4567 second operand. */
39258 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39259 }
39260 else
39261 {
39262 /* The second shuffle for e.g. V4DFmode has
39263 4567 and ABCD operands.
39264 Ignore AB67, as 67 is already in the second lane
39265 of the first operand. */
39266 if ((perm & 0xc) == (3 << 2)) continue;
39267 /* And 45CD, as 45 is in the first lane of the first
39268 operand. */
39269 if ((perm & 3) == 2) continue;
39270 /* And 0123, as then the vperm2[fi]128 doesn't change
39271 anything on the original 0123 first operand. */
39272 if ((perm & 0xf) == (1 << 2)) continue;
39273 }
39274
39275 for (i = 0; i < nelt; i++)
39276 {
39277 j = d->perm[i] / nelt2;
39278 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39279 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39280 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39281 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39282 else
39283 break;
39284 }
39285
39286 if (i == nelt)
39287 {
39288 start_sequence ();
39289 ok = expand_vec_perm_1 (&dsecond);
39290 end_sequence ();
39291 }
39292 else
39293 ok = false;
39294
39295 if (ok)
39296 {
39297 if (d->testing_p)
39298 return true;
39299
39300 /* Found a usable second shuffle. dfirst will be
39301 vperm2f128 on d->op0 and d->op1. */
39302 dsecond.testing_p = false;
39303 dfirst = *d;
39304 dfirst.target = gen_reg_rtx (d->vmode);
39305 for (i = 0; i < nelt; i++)
39306 dfirst.perm[i] = (i & (nelt2 - 1))
39307 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39308
39309 ok = expand_vec_perm_1 (&dfirst);
39310 gcc_assert (ok);
39311
39312 /* And dsecond is some single insn shuffle, taking
39313 d->op0 and result of vperm2f128 (if perm < 16) or
39314 d->op1 and result of vperm2f128 (otherwise). */
39315 dsecond.op1 = dfirst.target;
39316 if (perm >= 16)
39317 dsecond.op0 = dfirst.op1;
39318
39319 ok = expand_vec_perm_1 (&dsecond);
39320 gcc_assert (ok);
39321
39322 return true;
39323 }
39324
39325 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39326 if (d->one_operand_p)
39327 return false;
39328 }
39329
39330 return false;
39331 }
39332
39333 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39334 a two vector permutation using 2 intra-lane interleave insns
39335 and cross-lane shuffle for 32-byte vectors. */
39336
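/* E.g. the V8SImode selector { 0,8,1,9,2,10,3,11 } (interleave of the low
halves of the two operands) matches the test below with d->perm[0] == 0 and
is expanded via the vec_interleave_lowv8si expander. */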
39337 static bool
39338 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39339 {
39340 unsigned i, nelt;
39341 rtx (*gen) (rtx, rtx, rtx);
39342
39343 if (d->one_operand_p)
39344 return false;
39345 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39346 ;
39347 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39348 ;
39349 else
39350 return false;
39351
39352 nelt = d->nelt;
39353 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39354 return false;
39355 for (i = 0; i < nelt; i += 2)
39356 if (d->perm[i] != d->perm[0] + i / 2
39357 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39358 return false;
39359
39360 if (d->testing_p)
39361 return true;
39362
39363 switch (d->vmode)
39364 {
39365 case V32QImode:
39366 if (d->perm[0])
39367 gen = gen_vec_interleave_highv32qi;
39368 else
39369 gen = gen_vec_interleave_lowv32qi;
39370 break;
39371 case V16HImode:
39372 if (d->perm[0])
39373 gen = gen_vec_interleave_highv16hi;
39374 else
39375 gen = gen_vec_interleave_lowv16hi;
39376 break;
39377 case V8SImode:
39378 if (d->perm[0])
39379 gen = gen_vec_interleave_highv8si;
39380 else
39381 gen = gen_vec_interleave_lowv8si;
39382 break;
39383 case V4DImode:
39384 if (d->perm[0])
39385 gen = gen_vec_interleave_highv4di;
39386 else
39387 gen = gen_vec_interleave_lowv4di;
39388 break;
39389 case V8SFmode:
39390 if (d->perm[0])
39391 gen = gen_vec_interleave_highv8sf;
39392 else
39393 gen = gen_vec_interleave_lowv8sf;
39394 break;
39395 case V4DFmode:
39396 if (d->perm[0])
39397 gen = gen_vec_interleave_highv4df;
39398 else
39399 gen = gen_vec_interleave_lowv4df;
39400 break;
39401 default:
39402 gcc_unreachable ();
39403 }
39404
39405 emit_insn (gen (d->target, d->op0, d->op1));
39406 return true;
39407 }
39408
39409 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39410 a single vector permutation using a single intra-lane vector
39411 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39412 the non-swapped and swapped vectors together. */
39413
39414 static bool
39415 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39416 {
39417 struct expand_vec_perm_d dfirst, dsecond;
39418 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39419 rtx seq;
39420 bool ok;
39421 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39422
39423 if (!TARGET_AVX
39424 || TARGET_AVX2
39425 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39426 || !d->one_operand_p)
39427 return false;
39428
39429 dfirst = *d;
39430 for (i = 0; i < nelt; i++)
39431 dfirst.perm[i] = 0xff;
39432 for (i = 0, msk = 0; i < nelt; i++)
39433 {
39434 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39435 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39436 return false;
39437 dfirst.perm[j] = d->perm[i];
39438 if (j != i)
39439 msk |= (1 << i);
39440 }
39441 for (i = 0; i < nelt; i++)
39442 if (dfirst.perm[i] == 0xff)
39443 dfirst.perm[i] = i;
39444
39445 if (!d->testing_p)
39446 dfirst.target = gen_reg_rtx (dfirst.vmode);
39447
39448 start_sequence ();
39449 ok = expand_vec_perm_1 (&dfirst);
39450 seq = get_insns ();
39451 end_sequence ();
39452
39453 if (!ok)
39454 return false;
39455
39456 if (d->testing_p)
39457 return true;
39458
39459 emit_insn (seq);
39460
39461 dsecond = *d;
39462 dsecond.op0 = dfirst.target;
39463 dsecond.op1 = dfirst.target;
39464 dsecond.one_operand_p = true;
39465 dsecond.target = gen_reg_rtx (dsecond.vmode);
39466 for (i = 0; i < nelt; i++)
39467 dsecond.perm[i] = i ^ nelt2;
39468
39469 ok = expand_vec_perm_1 (&dsecond);
39470 gcc_assert (ok);
39471
39472 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39473 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39474 return true;
39475 }
39476
39477 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39478 permutation using two vperm2f128, followed by a vshufpd insn blending
39479 the two vectors together. */
39480
39481 static bool
39482 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39483 {
39484 struct expand_vec_perm_d dfirst, dsecond, dthird;
39485 bool ok;
39486
39487 if (!TARGET_AVX || (d->vmode != V4DFmode))
39488 return false;
39489
39490 if (d->testing_p)
39491 return true;
39492
39493 dfirst = *d;
39494 dsecond = *d;
39495 dthird = *d;
39496
39497 dfirst.perm[0] = (d->perm[0] & ~1);
39498 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39499 dfirst.perm[2] = (d->perm[2] & ~1);
39500 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39501 dsecond.perm[0] = (d->perm[1] & ~1);
39502 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39503 dsecond.perm[2] = (d->perm[3] & ~1);
39504 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39505 dthird.perm[0] = (d->perm[0] % 2);
39506 dthird.perm[1] = (d->perm[1] % 2) + 4;
39507 dthird.perm[2] = (d->perm[2] % 2) + 2;
39508 dthird.perm[3] = (d->perm[3] % 2) + 6;
39509
39510 dfirst.target = gen_reg_rtx (dfirst.vmode);
39511 dsecond.target = gen_reg_rtx (dsecond.vmode);
39512 dthird.op0 = dfirst.target;
39513 dthird.op1 = dsecond.target;
39514 dthird.one_operand_p = false;
39515
39516 canonicalize_perm (&dfirst);
39517 canonicalize_perm (&dsecond);
39518
39519 ok = expand_vec_perm_1 (&dfirst)
39520 && expand_vec_perm_1 (&dsecond)
39521 && expand_vec_perm_1 (&dthird);
39522
39523 gcc_assert (ok);
39524
39525 return true;
39526 }
39527
39528 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39529 permutation with two pshufb insns and an ior. We should have already
39530 failed all two instruction sequences. */
39531
39532 static bool
39533 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39534 {
39535 rtx rperm[2][16], vperm, l, h, op, m128;
39536 unsigned int i, nelt, eltsz;
39537
39538 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39539 return false;
39540 gcc_assert (!d->one_operand_p);
39541
39542 nelt = d->nelt;
39543 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39544
39545 /* Generate two permutation masks. If the required element is within
39546 the given vector it is shuffled into the proper lane. If the required
39547 element is in the other vector, force a zero into the lane by setting
39548 bit 7 in the permutation mask. */
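/* E.g. for the V16QImode extract-even selector { 0,2,...,14, 16,18,...,30 }
the mask used with op0 becomes { 0,2,...,14, -128 x 8 } and the mask used
with op1 becomes { -128 x 8, 0,2,...,14 }; the final ior merges the two
results. */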
39549 m128 = GEN_INT (-128);
39550 for (i = 0; i < nelt; ++i)
39551 {
39552 unsigned j, e = d->perm[i];
39553 unsigned which = (e >= nelt);
39554 if (e >= nelt)
39555 e -= nelt;
39556
39557 for (j = 0; j < eltsz; ++j)
39558 {
39559 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39560 rperm[1-which][i*eltsz + j] = m128;
39561 }
39562 }
39563
39564 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39565 vperm = force_reg (V16QImode, vperm);
39566
39567 l = gen_reg_rtx (V16QImode);
39568 op = gen_lowpart (V16QImode, d->op0);
39569 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39570
39571 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39572 vperm = force_reg (V16QImode, vperm);
39573
39574 h = gen_reg_rtx (V16QImode);
39575 op = gen_lowpart (V16QImode, d->op1);
39576 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39577
39578 op = gen_lowpart (V16QImode, d->target);
39579 emit_insn (gen_iorv16qi3 (op, l, h));
39580
39581 return true;
39582 }
39583
39584 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39585 with two vpshufb insns, vpermq and vpor. We should have already failed
39586 all two or three instruction sequences. */
39587
39588 static bool
39589 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39590 {
39591 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39592 unsigned int i, nelt, eltsz;
39593
39594 if (!TARGET_AVX2
39595 || !d->one_operand_p
39596 || (d->vmode != V32QImode && d->vmode != V16HImode))
39597 return false;
39598
39599 if (d->testing_p)
39600 return true;
39601
39602 nelt = d->nelt;
39603 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39604
39605 /* Generate two permutation masks. If the required element is within
39606 the same lane, it is shuffled in. If the required element is from the
39607 other lane, force a zero by setting bit 7 in the permutation mask.
39608 The other mask has non-negative elements where an element is requested
39609 from the other lane; those elements are also moved to the other lane,
39610 so that the result of vpshufb can have the two V2TImode halves
39611 swapped. */
39612 m128 = GEN_INT (-128);
39613 for (i = 0; i < nelt; ++i)
39614 {
39615 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39616 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39617
39618 for (j = 0; j < eltsz; ++j)
39619 {
39620 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39621 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39622 }
39623 }
39624
39625 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39626 vperm = force_reg (V32QImode, vperm);
39627
39628 h = gen_reg_rtx (V32QImode);
39629 op = gen_lowpart (V32QImode, d->op0);
39630 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39631
39632 /* Swap the 128-bit lanes of h into hp. */
39633 hp = gen_reg_rtx (V4DImode);
39634 op = gen_lowpart (V4DImode, h);
39635 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39636 const1_rtx));
39637
39638 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39639 vperm = force_reg (V32QImode, vperm);
39640
39641 l = gen_reg_rtx (V32QImode);
39642 op = gen_lowpart (V32QImode, d->op0);
39643 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39644
39645 op = gen_lowpart (V32QImode, d->target);
39646 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39647
39648 return true;
39649 }
39650
39651 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39652 and extract-odd permutations of two V32QImode or V16HImode operands
39653 with two vpshufb insns, vpor and vpermq. We should have already
39654 failed all two or three instruction sequences. */
39655
39656 static bool
39657 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39658 {
39659 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39660 unsigned int i, nelt, eltsz;
39661
39662 if (!TARGET_AVX2
39663 || d->one_operand_p
39664 || (d->vmode != V32QImode && d->vmode != V16HImode))
39665 return false;
39666
39667 for (i = 0; i < d->nelt; ++i)
39668 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39669 return false;
39670
39671 if (d->testing_p)
39672 return true;
39673
39674 nelt = d->nelt;
39675 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39676
39677 /* Generate two permutation masks. In the first permutation mask
39678 the first quarter will contain indexes for the first half
39679 of the op0, the second quarter will contain bit 7 set, third quarter
39680 will contain indexes for the second half of the op0 and the
39681 last quarter bit 7 set. In the second permutation mask
39682 the first quarter will contain bit 7 set, the second quarter
39683 indexes for the first half of the op1, the third quarter bit 7 set
39684 and last quarter indexes for the second half of the op1.
39685 I.e. the first mask e.g. for V32QImode extract even will be:
39686 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
39687 (all values masked with 0xf except for -128) and second mask
39688 for extract even will be
39689 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
39690 m128 = GEN_INT (-128);
39691 for (i = 0; i < nelt; ++i)
39692 {
39693 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39694 unsigned which = d->perm[i] >= nelt;
39695 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
39696
39697 for (j = 0; j < eltsz; ++j)
39698 {
39699 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
39700 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
39701 }
39702 }
39703
39704 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39705 vperm = force_reg (V32QImode, vperm);
39706
39707 l = gen_reg_rtx (V32QImode);
39708 op = gen_lowpart (V32QImode, d->op0);
39709 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39710
39711 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39712 vperm = force_reg (V32QImode, vperm);
39713
39714 h = gen_reg_rtx (V32QImode);
39715 op = gen_lowpart (V32QImode, d->op1);
39716 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39717
39718 ior = gen_reg_rtx (V32QImode);
39719 emit_insn (gen_iorv32qi3 (ior, l, h));
39720
39721 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
39722 op = gen_lowpart (V4DImode, d->target);
39723 ior = gen_lowpart (V4DImode, ior);
39724 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
39725 const1_rtx, GEN_INT (3)));
39726
39727 return true;
39728 }
39729
39730 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
39731 and extract-odd permutations. */
39732
39733 static bool
39734 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
39735 {
39736 rtx t1, t2, t3;
39737
39738 switch (d->vmode)
39739 {
39740 case V4DFmode:
39741 t1 = gen_reg_rtx (V4DFmode);
39742 t2 = gen_reg_rtx (V4DFmode);
39743
39744 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39745 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
39746 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
39747
39748 /* Now an unpck[lh]pd will produce the result required. */
39749 if (odd)
39750 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
39751 else
39752 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
39753 emit_insn (t3);
39754 break;
39755
39756 case V8SFmode:
39757 {
39758 int mask = odd ? 0xdd : 0x88;
39759
39760 t1 = gen_reg_rtx (V8SFmode);
39761 t2 = gen_reg_rtx (V8SFmode);
39762 t3 = gen_reg_rtx (V8SFmode);
39763
39764 /* Shuffle within the 128-bit lanes to produce:
39765 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
39766 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
39767 GEN_INT (mask)));
39768
39769 /* Shuffle the lanes around to produce:
39770 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
39771 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
39772 GEN_INT (0x3)));
39773
39774 /* Shuffle within the 128-bit lanes to produce:
39775 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
39776 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
39777
39778 /* Shuffle within the 128-bit lanes to produce:
39779 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
39780 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
39781
39782 /* Shuffle the lanes around to produce:
39783 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
39784 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
39785 GEN_INT (0x20)));
39786 }
39787 break;
39788
39789 case V2DFmode:
39790 case V4SFmode:
39791 case V2DImode:
39792 case V4SImode:
39793 /* These are always directly implementable by expand_vec_perm_1. */
39794 gcc_unreachable ();
39795
39796 case V8HImode:
39797 if (TARGET_SSSE3)
39798 return expand_vec_perm_pshufb2 (d);
39799 else
39800 {
39801 /* We need 2*log2(N)-1 operations to achieve odd/even
39802 with interleave. */
39803 t1 = gen_reg_rtx (V8HImode);
39804 t2 = gen_reg_rtx (V8HImode);
39805 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
39806 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
39807 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
39808 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
39809 if (odd)
39810 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
39811 else
39812 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
39813 emit_insn (t3);
39814 }
39815 break;
39816
39817 case V16QImode:
39818 if (TARGET_SSSE3)
39819 return expand_vec_perm_pshufb2 (d);
39820 else
39821 {
39822 t1 = gen_reg_rtx (V16QImode);
39823 t2 = gen_reg_rtx (V16QImode);
39824 t3 = gen_reg_rtx (V16QImode);
39825 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
39826 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
39827 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
39828 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
39829 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
39830 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
39831 if (odd)
39832 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
39833 else
39834 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
39835 emit_insn (t3);
39836 }
39837 break;
39838
39839 case V16HImode:
39840 case V32QImode:
39841 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
39842
39843 case V4DImode:
39844 if (!TARGET_AVX2)
39845 {
39846 struct expand_vec_perm_d d_copy = *d;
39847 d_copy.vmode = V4DFmode;
39848 d_copy.target = gen_lowpart (V4DFmode, d->target);
39849 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
39850 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
39851 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39852 }
39853
39854 t1 = gen_reg_rtx (V4DImode);
39855 t2 = gen_reg_rtx (V4DImode);
39856
39857 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
39858 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
39859 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
39860
39861 /* Now an vpunpck[lh]qdq will produce the result required. */
39862 if (odd)
39863 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
39864 else
39865 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
39866 emit_insn (t3);
39867 break;
39868
39869 case V8SImode:
39870 if (!TARGET_AVX2)
39871 {
39872 struct expand_vec_perm_d d_copy = *d;
39873 d_copy.vmode = V8SFmode;
39874 d_copy.target = gen_lowpart (V8SFmode, d->target);
39875 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
39876 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
39877 return expand_vec_perm_even_odd_1 (&d_copy, odd);
39878 }
39879
39880 t1 = gen_reg_rtx (V8SImode);
39881 t2 = gen_reg_rtx (V8SImode);
39882
39883 /* Shuffle the lanes around into
39884 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
39885 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
39886 gen_lowpart (V4DImode, d->op0),
39887 gen_lowpart (V4DImode, d->op1),
39888 GEN_INT (0x20)));
39889 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
39890 gen_lowpart (V4DImode, d->op0),
39891 gen_lowpart (V4DImode, d->op1),
39892 GEN_INT (0x31)));
39893
39894 /* Swap the 2nd and 3rd position in each lane into
39895 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
39896 emit_insn (gen_avx2_pshufdv3 (t1, t1,
39897 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39898 emit_insn (gen_avx2_pshufdv3 (t2, t2,
39899 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
39900
39901 /* Now an vpunpck[lh]qdq will produce
39902 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
39903 if (odd)
39904 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
39905 gen_lowpart (V4DImode, t1),
39906 gen_lowpart (V4DImode, t2));
39907 else
39908 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
39909 gen_lowpart (V4DImode, t1),
39910 gen_lowpart (V4DImode, t2));
39911 emit_insn (t3);
39912 break;
39913
39914 default:
39915 gcc_unreachable ();
39916 }
39917
39918 return true;
39919 }
39920
39921 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
39922 extract-even and extract-odd permutations. */
39923
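/* E.g. for V8SImode the selector { 1, 3, 5, 7, 9, 11, 13, 15 } is the odd
extraction (odd == 1) from the two input vectors. */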
39924 static bool
39925 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
39926 {
39927 unsigned i, odd, nelt = d->nelt;
39928
39929 odd = d->perm[0];
39930 if (odd != 0 && odd != 1)
39931 return false;
39932
39933 for (i = 1; i < nelt; ++i)
39934 if (d->perm[i] != 2 * i + odd)
39935 return false;
39936
39937 return expand_vec_perm_even_odd_1 (d, odd);
39938 }
39939
39940 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
39941 permutations. We assume that expand_vec_perm_1 has already failed. */
39942
39943 static bool
39944 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
39945 {
39946 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
39947 enum machine_mode vmode = d->vmode;
39948 unsigned char perm2[4];
39949 rtx op0 = d->op0;
39950 bool ok;
39951
39952 switch (vmode)
39953 {
39954 case V4DFmode:
39955 case V8SFmode:
39956 /* These are special-cased in sse.md so that we can optionally
39957 use the vbroadcast instruction. They expand to two insns
39958 if the input happens to be in a register. */
39959 gcc_unreachable ();
39960
39961 case V2DFmode:
39962 case V2DImode:
39963 case V4SFmode:
39964 case V4SImode:
39965 /* These are always implementable using standard shuffle patterns. */
39966 gcc_unreachable ();
39967
39968 case V8HImode:
39969 case V16QImode:
39970 /* These can be implemented via interleave. We save one insn by
39971 stopping once we have promoted to V4SImode and then use pshufd. */
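/* E.g. broadcasting element 5 of a V8HImode vector: one
vec_interleave_highv8hi turns op0 into { 4,4,5,5,6,6,7,7 }; the value,
now viewed as V4SImode, is then broadcast with a pshufd selecting
element 1 four times. */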
39972 do
39973 {
39974 rtx dest;
39975 rtx (*gen) (rtx, rtx, rtx)
39976 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
39977 : gen_vec_interleave_lowv8hi;
39978
39979 if (elt >= nelt2)
39980 {
39981 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
39982 : gen_vec_interleave_highv8hi;
39983 elt -= nelt2;
39984 }
39985 nelt2 /= 2;
39986
39987 dest = gen_reg_rtx (vmode);
39988 emit_insn (gen (dest, op0, op0));
39989 vmode = get_mode_wider_vector (vmode);
39990 op0 = gen_lowpart (vmode, dest);
39991 }
39992 while (vmode != V4SImode);
39993
39994 memset (perm2, elt, 4);
39995 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
39996 d->testing_p);
39997 gcc_assert (ok);
39998 return true;
39999
40000 case V32QImode:
40001 case V16HImode:
40002 case V8SImode:
40003 case V4DImode:
40004 /* For AVX2 broadcasts of the first element vpbroadcast* or
40005 vpermq should be used by expand_vec_perm_1. */
40006 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40007 return false;
40008
40009 default:
40010 gcc_unreachable ();
40011 }
40012 }
40013
40014 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40015 broadcast permutations. */
40016
40017 static bool
40018 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40019 {
40020 unsigned i, elt, nelt = d->nelt;
40021
40022 if (!d->one_operand_p)
40023 return false;
40024
40025 elt = d->perm[0];
40026 for (i = 1; i < nelt; ++i)
40027 if (d->perm[i] != elt)
40028 return false;
40029
40030 return expand_vec_perm_broadcast_1 (d);
40031 }
40032
40033 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40034 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40035 all the shorter instruction sequences. */
40036
40037 static bool
40038 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40039 {
40040 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40041 unsigned int i, nelt, eltsz;
40042 bool used[4];
40043
40044 if (!TARGET_AVX2
40045 || d->one_operand_p
40046 || (d->vmode != V32QImode && d->vmode != V16HImode))
40047 return false;
40048
40049 if (d->testing_p)
40050 return true;
40051
40052 nelt = d->nelt;
40053 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40054
40055 /* Generate 4 permutation masks. If the required element is within
40056 the same lane, it is shuffled in. If the required element is from the
40057 other lane, force a zero by setting bit 7 in the permutation mask.
40058 The other masks have non-negative elements where an element is requested
40059 from the other lane; those elements are also moved to the other lane,
40060 so that the result of vpshufb can have the two V2TImode halves
40061 swapped. */
40062 m128 = GEN_INT (-128);
40063 for (i = 0; i < 32; ++i)
40064 {
40065 rperm[0][i] = m128;
40066 rperm[1][i] = m128;
40067 rperm[2][i] = m128;
40068 rperm[3][i] = m128;
40069 }
40070 used[0] = false;
40071 used[1] = false;
40072 used[2] = false;
40073 used[3] = false;
40074 for (i = 0; i < nelt; ++i)
40075 {
40076 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40077 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40078 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40079
40080 for (j = 0; j < eltsz; ++j)
40081 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40082 used[which] = true;
40083 }
40084
40085 for (i = 0; i < 2; ++i)
40086 {
40087 if (!used[2 * i + 1])
40088 {
40089 h[i] = NULL_RTX;
40090 continue;
40091 }
40092 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40093 gen_rtvec_v (32, rperm[2 * i + 1]));
40094 vperm = force_reg (V32QImode, vperm);
40095 h[i] = gen_reg_rtx (V32QImode);
40096 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40097 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40098 }
40099
40100 /* Swap the 128-bit lanes of h[X]. */
40101 for (i = 0; i < 2; ++i)
40102 {
40103 if (h[i] == NULL_RTX)
40104 continue;
40105 op = gen_reg_rtx (V4DImode);
40106 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40107 const2_rtx, GEN_INT (3), const0_rtx,
40108 const1_rtx));
40109 h[i] = gen_lowpart (V32QImode, op);
40110 }
40111
40112 for (i = 0; i < 2; ++i)
40113 {
40114 if (!used[2 * i])
40115 {
40116 l[i] = NULL_RTX;
40117 continue;
40118 }
40119 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40120 vperm = force_reg (V32QImode, vperm);
40121 l[i] = gen_reg_rtx (V32QImode);
40122 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40123 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40124 }
40125
40126 for (i = 0; i < 2; ++i)
40127 {
40128 if (h[i] && l[i])
40129 {
40130 op = gen_reg_rtx (V32QImode);
40131 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40132 l[i] = op;
40133 }
40134 else if (h[i])
40135 l[i] = h[i];
40136 }
40137
40138 gcc_assert (l[0] && l[1]);
40139 op = gen_lowpart (V32QImode, d->target);
40140 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40141 return true;
40142 }
40143
40144 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40145 With all of the interface bits taken care of, perform the expansion
40146 in D and return true on success. */
40147
40148 static bool
40149 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40150 {
40151 /* Try a single instruction expansion. */
40152 if (expand_vec_perm_1 (d))
40153 return true;
40154
40155 /* Try sequences of two instructions. */
40156
40157 if (expand_vec_perm_pshuflw_pshufhw (d))
40158 return true;
40159
40160 if (expand_vec_perm_palignr (d))
40161 return true;
40162
40163 if (expand_vec_perm_interleave2 (d))
40164 return true;
40165
40166 if (expand_vec_perm_broadcast (d))
40167 return true;
40168
40169 if (expand_vec_perm_vpermq_perm_1 (d))
40170 return true;
40171
40172 if (expand_vec_perm_vperm2f128 (d))
40173 return true;
40174
40175 /* Try sequences of three instructions. */
40176
40177 if (expand_vec_perm_2vperm2f128_vshuf (d))
40178 return true;
40179
40180 if (expand_vec_perm_pshufb2 (d))
40181 return true;
40182
40183 if (expand_vec_perm_interleave3 (d))
40184 return true;
40185
40186 if (expand_vec_perm_vperm2f128_vblend (d))
40187 return true;
40188
40189 /* Try sequences of four instructions. */
40190
40191 if (expand_vec_perm_vpshufb2_vpermq (d))
40192 return true;
40193
40194 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40195 return true;
40196
40197 /* ??? Look for narrow permutations whose element orderings would
40198 allow the promotion to a wider mode. */
40199
40200 /* ??? Look for sequences of interleave or a wider permute that place
40201 the data into the correct lanes for a half-vector shuffle like
40202 pshuf[lh]w or vpermilps. */
40203
40204 /* ??? Look for sequences of interleave that produce the desired results.
40205 The combinatorics of punpck[lh] get pretty ugly... */
40206
40207 if (expand_vec_perm_even_odd (d))
40208 return true;
40209
40210 /* Even longer sequences. */
40211 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40212 return true;
40213
40214 return false;
40215 }
40216
40217 /* If a permutation only uses one operand, make it clear. Returns true
40218 if the permutation references both operands. */
40219
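/* E.g. for nelt == 4 the selector { 4, 5, 6, 7 } references only the second
operand (which == 2): it is rewritten as { 0, 1, 2, 3 }, op0 is replaced by
op1, one_operand_p is set, and false is returned. */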
40220 static bool
40221 canonicalize_perm (struct expand_vec_perm_d *d)
40222 {
40223 int i, which, nelt = d->nelt;
40224
40225 for (i = which = 0; i < nelt; ++i)
40226 which |= (d->perm[i] < nelt ? 1 : 2);
40227
40228 d->one_operand_p = true;
40229 switch (which)
40230 {
40231 default:
40232 gcc_unreachable();
40233
40234 case 3:
40235 if (!rtx_equal_p (d->op0, d->op1))
40236 {
40237 d->one_operand_p = false;
40238 break;
40239 }
40240 /* The elements of PERM do not suggest that only the first operand
40241 is used, but both operands are identical. Allow easier matching
40242 of the permutation by folding the permutation into the single
40243 input vector. */
40244 /* FALLTHRU */
40245
40246 case 2:
40247 for (i = 0; i < nelt; ++i)
40248 d->perm[i] &= nelt - 1;
40249 d->op0 = d->op1;
40250 break;
40251
40252 case 1:
40253 d->op1 = d->op0;
40254 break;
40255 }
40256
40257 return (which == 3);
40258 }
40259
40260 bool
40261 ix86_expand_vec_perm_const (rtx operands[4])
40262 {
40263 struct expand_vec_perm_d d;
40264 unsigned char perm[MAX_VECT_LEN];
40265 int i, nelt;
40266 bool two_args;
40267 rtx sel;
40268
40269 d.target = operands[0];
40270 d.op0 = operands[1];
40271 d.op1 = operands[2];
40272 sel = operands[3];
40273
40274 d.vmode = GET_MODE (d.target);
40275 gcc_assert (VECTOR_MODE_P (d.vmode));
40276 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40277 d.testing_p = false;
40278
40279 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40280 gcc_assert (XVECLEN (sel, 0) == nelt);
40281 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40282
40283 for (i = 0; i < nelt; ++i)
40284 {
40285 rtx e = XVECEXP (sel, 0, i);
40286 int ei = INTVAL (e) & (2 * nelt - 1);
40287 d.perm[i] = ei;
40288 perm[i] = ei;
40289 }
40290
40291 two_args = canonicalize_perm (&d);
40292
40293 if (ix86_expand_vec_perm_const_1 (&d))
40294 return true;
40295
40296 /* If the selector says both arguments are needed, but the operands are the
40297 same, the above tried to expand with one_operand_p and flattened selector.
40298 If that didn't work, retry without one_operand_p; we succeeded with that
40299 during testing. */
40300 if (two_args && d.one_operand_p)
40301 {
40302 d.one_operand_p = false;
40303 memcpy (d.perm, perm, sizeof (perm));
40304 return ix86_expand_vec_perm_const_1 (&d);
40305 }
40306
40307 return false;
40308 }
40309
40310 /* Implement targetm.vectorize.vec_perm_const_ok. */
40311
40312 static bool
40313 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40314 const unsigned char *sel)
40315 {
40316 struct expand_vec_perm_d d;
40317 unsigned int i, nelt, which;
40318 bool ret;
40319
40320 d.vmode = vmode;
40321 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40322 d.testing_p = true;
40323
40324 /* Given sufficient ISA support we can just return true here
40325 for selected vector modes. */
40326 if (GET_MODE_SIZE (d.vmode) == 16)
40327 {
40328 /* All implementable with a single vpperm insn. */
40329 if (TARGET_XOP)
40330 return true;
40331 /* All implementable with 2 pshufb + 1 ior. */
40332 if (TARGET_SSSE3)
40333 return true;
40334 /* All implementable with shufpd or unpck[lh]pd. */
40335 if (d.nelt == 2)
40336 return true;
40337 }
40338
40339 /* Extract the values from the vector CST into the permutation
40340 array in D. */
40341 memcpy (d.perm, sel, nelt);
40342 for (i = which = 0; i < nelt; ++i)
40343 {
40344 unsigned char e = d.perm[i];
40345 gcc_assert (e < 2 * nelt);
40346 which |= (e < nelt ? 1 : 2);
40347 }
40348
40349 /* If all elements are from the second vector, fold them into the first. */
40350 if (which == 2)
40351 for (i = 0; i < nelt; ++i)
40352 d.perm[i] -= nelt;
40353
40354 /* Check whether the mask can be applied to the vector type. */
40355 d.one_operand_p = (which != 3);
40356
40357 /* Implementable with shufps or pshufd. */
40358 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40359 return true;
40360
40361 /* Otherwise we have to go through the motions and see if we can
40362 figure out how to generate the requested permutation. */
40363 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40364 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40365 if (!d.one_operand_p)
40366 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40367
40368 start_sequence ();
40369 ret = ix86_expand_vec_perm_const_1 (&d);
40370 end_sequence ();
40371
40372 return ret;
40373 }
40374
40375 void
40376 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40377 {
40378 struct expand_vec_perm_d d;
40379 unsigned i, nelt;
40380
40381 d.target = targ;
40382 d.op0 = op0;
40383 d.op1 = op1;
40384 d.vmode = GET_MODE (targ);
40385 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40386 d.one_operand_p = false;
40387 d.testing_p = false;
40388
40389 for (i = 0; i < nelt; ++i)
40390 d.perm[i] = i * 2 + odd;
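/* For example, for V4SImode with ODD set this builds the selector
   { 1, 3, 5, 7 }, i.e. the odd elements of the OP0/OP1 concatenation.  */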
40391
40392 /* We'll either be able to implement the permutation directly... */
40393 if (expand_vec_perm_1 (&d))
40394 return;
40395
40396 /* ... or we use the special-case patterns. */
40397 expand_vec_perm_even_odd_1 (&d, odd);
40398 }
40399
40400 static void
40401 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40402 {
40403 struct expand_vec_perm_d d;
40404 unsigned i, nelt, base;
40405 bool ok;
40406
40407 d.target = targ;
40408 d.op0 = op0;
40409 d.op1 = op1;
40410 d.vmode = GET_MODE (targ);
40411 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40412 d.one_operand_p = false;
40413 d.testing_p = false;
40414
40415 base = high_p ? nelt / 2 : 0;
40416 for (i = 0; i < nelt / 2; ++i)
40417 {
40418 d.perm[i * 2] = i + base;
40419 d.perm[i * 2 + 1] = i + base + nelt;
40420 }
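/* For example, for V4SImode with HIGH_P set this builds the selector
   { 2, 6, 3, 7 }, interleaving the high halves of OP0 and OP1.  */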
40421
40422 /* Note that for AVX this isn't one instruction. */
40423 ok = ix86_expand_vec_perm_const_1 (&d);
40424 gcc_assert (ok);
40425 }
40426
40427
40428 /* Expand a vector operation CODE for a V*QImode in terms of the
40429 same operation on V*HImode. */
40430
40431 void
40432 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40433 {
40434 enum machine_mode qimode = GET_MODE (dest);
40435 enum machine_mode himode;
40436 rtx (*gen_il) (rtx, rtx, rtx);
40437 rtx (*gen_ih) (rtx, rtx, rtx);
40438 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40439 struct expand_vec_perm_d d;
40440 bool ok, full_interleave;
40441 bool uns_p = false;
40442 int i;
40443
40444 switch (qimode)
40445 {
40446 case V16QImode:
40447 himode = V8HImode;
40448 gen_il = gen_vec_interleave_lowv16qi;
40449 gen_ih = gen_vec_interleave_highv16qi;
40450 break;
40451 case V32QImode:
40452 himode = V16HImode;
40453 gen_il = gen_avx2_interleave_lowv32qi;
40454 gen_ih = gen_avx2_interleave_highv32qi;
40455 break;
40456 default:
40457 gcc_unreachable ();
40458 }
40459
40460 op2_l = op2_h = op2;
40461 switch (code)
40462 {
40463 case MULT:
40464 /* Unpack data such that we've got a source byte in each low byte of
40465 each word. We don't care what goes into the high byte of each word.
40466 Rather than trying to get zero in there, most convenient is to let
40467 it be a copy of the low byte. */
40468 op2_l = gen_reg_rtx (qimode);
40469 op2_h = gen_reg_rtx (qimode);
40470 emit_insn (gen_il (op2_l, op2, op2));
40471 emit_insn (gen_ih (op2_h, op2, op2));
40472 /* FALLTHRU */
40473
40474 op1_l = gen_reg_rtx (qimode);
40475 op1_h = gen_reg_rtx (qimode);
40476 emit_insn (gen_il (op1_l, op1, op1));
40477 emit_insn (gen_ih (op1_h, op1, op1));
40478 full_interleave = qimode == V16QImode;
40479 break;
40480
40481 case ASHIFT:
40482 case LSHIFTRT:
40483 uns_p = true;
40484 /* FALLTHRU */
40485 case ASHIFTRT:
40486 op1_l = gen_reg_rtx (himode);
40487 op1_h = gen_reg_rtx (himode);
40488 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40489 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40490 full_interleave = true;
40491 break;
40492 default:
40493 gcc_unreachable ();
40494 }
40495
40496 /* Perform the operation. */
40497 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40498 1, OPTAB_DIRECT);
40499 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40500 1, OPTAB_DIRECT);
40501 gcc_assert (res_l && res_h);
40502
40503 /* Merge the data back into the right place. */
40504 d.target = dest;
40505 d.op0 = gen_lowpart (qimode, res_l);
40506 d.op1 = gen_lowpart (qimode, res_h);
40507 d.vmode = qimode;
40508 d.nelt = GET_MODE_NUNITS (qimode);
40509 d.one_operand_p = false;
40510 d.testing_p = false;
40511
40512 if (full_interleave)
40513 {
40514 /* For SSE2, we used a full interleave, so the desired
40515 results are in the even elements. */
40516 for (i = 0; i < 32; ++i)
40517 d.perm[i] = i * 2;
40518 }
40519 else
40520 {
40521 /* For AVX, the interleave used above was not cross-lane. So the
40522 extraction is evens but with the second and third quarter swapped.
40523 Happily, that is even one insn shorter than even extraction. */
40524 for (i = 0; i < 32; ++i)
40525 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40526 }
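/* E.g. when FULL_INTERLEAVE is false (the V32QImode multiply case) the
   selector just built is
   { 0, 2, ..., 14, 32, 34, ..., 46, 16, 18, ..., 30, 48, 50, ..., 62 },
   i.e. the even bytes of the low lanes of RES_L and RES_H followed by
   the even bytes of their high lanes.  */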
40527
40528 ok = ix86_expand_vec_perm_const_1 (&d);
40529 gcc_assert (ok);
40530
40531 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40532 gen_rtx_fmt_ee (code, qimode, op1, op2));
40533 }
40534
40535 void
40536 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40537 bool uns_p, bool odd_p)
40538 {
40539 enum machine_mode mode = GET_MODE (op1);
40540 enum machine_mode wmode = GET_MODE (dest);
40541 rtx x;
40542
40543 /* We only play even/odd games with vectors of SImode. */
40544 gcc_assert (mode == V4SImode || mode == V8SImode);
40545
40546 /* If we're looking for the odd results, shift those members down to
40547 the even slots. For some cpus this is faster than a PSHUFD. */
40548 if (odd_p)
40549 {
40550 if (TARGET_XOP && mode == V4SImode)
40551 {
40552 x = force_reg (wmode, CONST0_RTX (wmode));
40553 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40554 return;
40555 }
40556
40557 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40558 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40559 x, NULL, 1, OPTAB_DIRECT);
40560 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40561 x, NULL, 1, OPTAB_DIRECT);
40562 op1 = gen_lowpart (mode, op1);
40563 op2 = gen_lowpart (mode, op2);
40564 }
40565
40566 if (mode == V8SImode)
40567 {
40568 if (uns_p)
40569 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40570 else
40571 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40572 }
40573 else if (uns_p)
40574 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40575 else if (TARGET_SSE4_1)
40576 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40577 else
40578 {
40579 rtx s1, s2, t0, t1, t2;
40580
40581 /* The easiest way to implement this without PMULDQ is to go through
40582 the motions as if we are performing a full 64-bit multiply, except
40583 that we need to do less shuffling of the elements. */
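/* Writing s(x) for the sign bit of x and au, bu for the operands
   reinterpreted as unsigned, the identity used below is
     a * b = au * bu - 2^32 * (s(a) * bu + s(b) * au)   (mod 2^64),
   where the products on the right are unsigned 32x32->64 bit multiplies
   and the all-ones compare masks S1/S2 stand in for -s(a) and -s(b).  */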
40584
40585 /* Compute the sign-extension, aka highparts, of the two operands. */
40586 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40587 op1, pc_rtx, pc_rtx);
40588 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40589 op2, pc_rtx, pc_rtx);
40590
40591 /* Multiply LO(A) * HI(B), and vice-versa. */
40592 t1 = gen_reg_rtx (wmode);
40593 t2 = gen_reg_rtx (wmode);
40594 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40595 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40596
40597 /* Multiply LO(A) * LO(B). */
40598 t0 = gen_reg_rtx (wmode);
40599 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40600
40601 /* Combine and shift the highparts into place. */
40602 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40603 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40604 1, OPTAB_DIRECT);
40605
40606 /* Combine high and low parts. */
40607 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40608 return;
40609 }
40610 emit_insn (x);
40611 }
40612
40613 void
40614 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40615 bool uns_p, bool high_p)
40616 {
40617 enum machine_mode wmode = GET_MODE (dest);
40618 enum machine_mode mode = GET_MODE (op1);
40619 rtx t1, t2, t3, t4, mask;
40620
40621 switch (mode)
40622 {
40623 case V4SImode:
40624 t1 = gen_reg_rtx (mode);
40625 t2 = gen_reg_rtx (mode);
40626 if (TARGET_XOP && !uns_p)
40627 {
40628 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40629 shuffle the elements once so that all elements are in the right
40630 place for immediate use: { A C B D }. */
40631 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40632 const1_rtx, GEN_INT (3)));
40633 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40634 const1_rtx, GEN_INT (3)));
40635 }
40636 else
40637 {
40638 /* Put the elements into place for the multiply. */
40639 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40640 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40641 high_p = false;
40642 }
40643 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40644 break;
40645
40646 case V8SImode:
40647 /* Shuffle the elements between the lanes. After this we
40648 have { A B E F | C D G H } for each operand. */
40649 t1 = gen_reg_rtx (V4DImode);
40650 t2 = gen_reg_rtx (V4DImode);
40651 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40652 const0_rtx, const2_rtx,
40653 const1_rtx, GEN_INT (3)));
40654 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40655 const0_rtx, const2_rtx,
40656 const1_rtx, GEN_INT (3)));
40657
40658 /* Shuffle the elements within the lanes. After this we
40659 have { A A B B | C C D D } or { E E F F | G G H H }. */
40660 t3 = gen_reg_rtx (V8SImode);
40661 t4 = gen_reg_rtx (V8SImode);
40662 mask = GEN_INT (high_p
40663 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
40664 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
40665 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
40666 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
40667
40668 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
40669 break;
40670
40671 case V8HImode:
40672 case V16HImode:
40673 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
40674 uns_p, OPTAB_DIRECT);
40675 t2 = expand_binop (mode,
40676 uns_p ? umul_highpart_optab : smul_highpart_optab,
40677 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
40678 gcc_assert (t1 && t2);
40679
40680 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
40681 break;
40682
40683 case V16QImode:
40684 case V32QImode:
40685 t1 = gen_reg_rtx (wmode);
40686 t2 = gen_reg_rtx (wmode);
40687 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
40688 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
40689
40690 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
40691 break;
40692
40693 default:
40694 gcc_unreachable ();
40695 }
40696 }
40697
40698 void
40699 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
40700 {
40701 rtx res_1, res_2;
40702
40703 res_1 = gen_reg_rtx (V4SImode);
40704 res_2 = gen_reg_rtx (V4SImode);
40705 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
40706 op1, op2, true, false);
40707 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
40708 op1, op2, true, true);
40709
40710 /* Move the results in element 2 down to element 1; we don't care
40711 what goes in elements 2 and 3. Then we can merge the parts
40712 back together with an interleave.
40713
40714 Note that two other sequences were tried:
40715 (1) Use interleaves at the start instead of psrldq, which allows
40716 us to use a single shufps to merge things back at the end.
40717 (2) Use shufps here to combine the two vectors, then pshufd to
40718 put the elements in the correct order.
40719 In both cases the cost of the reformatting stall was too high
40720 and the overall sequence slower. */
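/* Concretely, with op1 = { a0, a1, a2, a3 } and op2 = { b0, b1, b2, b3 },
   res_1 viewed as V4SImode holds { lo(a0*b0), hi(a0*b0), lo(a2*b2), hi(a2*b2) }
   and res_2 the same for the odd elements; the two pshufd insns move the
   low halves into elements 0 and 1, and the final interleave produces
   { lo(a0*b0), lo(a1*b1), lo(a2*b2), lo(a3*b3) }.  */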
40721
40722 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
40723 const0_rtx, const0_rtx));
40724 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
40725 const0_rtx, const0_rtx));
40726 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
40727
40728 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
40729 }
40730
40731 void
40732 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
40733 {
40734 enum machine_mode mode = GET_MODE (op0);
40735 rtx t1, t2, t3, t4, t5, t6;
40736
40737 if (TARGET_XOP && mode == V2DImode)
40738 {
40739 /* op1: A,B,C,D, op2: E,F,G,H */
40740 op1 = gen_lowpart (V4SImode, op1);
40741 op2 = gen_lowpart (V4SImode, op2);
40742
40743 t1 = gen_reg_rtx (V4SImode);
40744 t2 = gen_reg_rtx (V4SImode);
40745 t3 = gen_reg_rtx (V2DImode);
40746 t4 = gen_reg_rtx (V2DImode);
40747
40748 /* t1: B,A,D,C */
40749 emit_insn (gen_sse2_pshufd_1 (t1, op1,
40750 GEN_INT (1),
40751 GEN_INT (0),
40752 GEN_INT (3),
40753 GEN_INT (2)));
40754
40755 /* t2: (B*E),(A*F),(D*G),(C*H) */
40756 emit_insn (gen_mulv4si3 (t2, t1, op2));
40757
40758 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
40759 emit_insn (gen_xop_phadddq (t3, t2));
40760
40761 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
40762 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
40763
40764 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
40765 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
40766 }
40767 else
40768 {
40769 enum machine_mode nmode;
40770 rtx (*umul) (rtx, rtx, rtx);
40771
40772 if (mode == V2DImode)
40773 {
40774 umul = gen_vec_widen_umult_even_v4si;
40775 nmode = V4SImode;
40776 }
40777 else if (mode == V4DImode)
40778 {
40779 umul = gen_vec_widen_umult_even_v8si;
40780 nmode = V8SImode;
40781 }
40782 else
40783 gcc_unreachable ();
40784
40785
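/* The scalar identity being vectorized here: with A and B split into
   32-bit halves,
     A * B = lo(A)*lo(B) + ((hi(A)*lo(B) + lo(A)*hi(B)) << 32)   (mod 2^64),
   where every product is an unsigned 32x32->64 bit multiply.  */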
40786 /* Multiply low parts. */
40787 t1 = gen_reg_rtx (mode);
40788 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
40789
40790 /* Shift input vectors right 32 bits so we can multiply high parts. */
40791 t6 = GEN_INT (32);
40792 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
40793 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
40794
40795 /* Multiply high parts by low parts. */
40796 t4 = gen_reg_rtx (mode);
40797 t5 = gen_reg_rtx (mode);
40798 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
40799 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
40800
40801 /* Combine and shift the highparts back. */
40802 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
40803 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
40804
40805 /* Combine high and low parts. */
40806 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
40807 }
40808
40809 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40810 gen_rtx_MULT (mode, op1, op2));
40811 }
40812
40813 /* Expand an insert into a vector register through pinsr insn.
40814 Return true if successful. */
40815
40816 bool
40817 ix86_expand_pinsr (rtx *operands)
40818 {
40819 rtx dst = operands[0];
40820 rtx src = operands[3];
40821
40822 unsigned int size = INTVAL (operands[1]);
40823 unsigned int pos = INTVAL (operands[2]);
40824
40825 if (GET_CODE (dst) == SUBREG)
40826 {
40827 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
40828 dst = SUBREG_REG (dst);
40829 }
40830
40831 if (GET_CODE (src) == SUBREG)
40832 src = SUBREG_REG (src);
40833
40834 switch (GET_MODE (dst))
40835 {
40836 case V16QImode:
40837 case V8HImode:
40838 case V4SImode:
40839 case V2DImode:
40840 {
40841 enum machine_mode srcmode, dstmode;
40842 rtx (*pinsr)(rtx, rtx, rtx, rtx);
40843
40844 srcmode = mode_for_size (size, MODE_INT, 0);
40845
40846 switch (srcmode)
40847 {
40848 case QImode:
40849 if (!TARGET_SSE4_1)
40850 return false;
40851 dstmode = V16QImode;
40852 pinsr = gen_sse4_1_pinsrb;
40853 break;
40854
40855 case HImode:
40856 if (!TARGET_SSE2)
40857 return false;
40858 dstmode = V8HImode;
40859 pinsr = gen_sse2_pinsrw;
40860 break;
40861
40862 case SImode:
40863 if (!TARGET_SSE4_1)
40864 return false;
40865 dstmode = V4SImode;
40866 pinsr = gen_sse4_1_pinsrd;
40867 break;
40868
40869 case DImode:
40870 gcc_assert (TARGET_64BIT);
40871 if (!TARGET_SSE4_1)
40872 return false;
40873 dstmode = V2DImode;
40874 pinsr = gen_sse4_1_pinsrq;
40875 break;
40876
40877 default:
40878 return false;
40879 }
40880
40881 dst = gen_lowpart (dstmode, dst);
40882 src = gen_lowpart (srcmode, src);
40883
40884 pos /= size;
40885
40886 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
40887 return true;
40888 }
40889
40890 default:
40891 return false;
40892 }
40893 }
40894 \f
40895 /* This function returns the calling-ABI-specific va_list type node
40896 for FNDECL. */
40897
40898 static tree
40899 ix86_fn_abi_va_list (tree fndecl)
40900 {
40901 if (!TARGET_64BIT)
40902 return va_list_type_node;
40903 gcc_assert (fndecl != NULL_TREE);
40904
40905 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
40906 return ms_va_list_type_node;
40907 else
40908 return sysv_va_list_type_node;
40909 }
40910
40911 /* Returns the canonical va_list type specified by TYPE. If there
40912 is no valid TYPE provided, it returns NULL_TREE. */
40913
40914 static tree
40915 ix86_canonical_va_list_type (tree type)
40916 {
40917 tree wtype, htype;
40918
40919 /* Resolve references and pointers to va_list type. */
40920 if (TREE_CODE (type) == MEM_REF)
40921 type = TREE_TYPE (type);
40922 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE (type)))
40923 type = TREE_TYPE (type);
40924 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
40925 type = TREE_TYPE (type);
40926
40927 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
40928 {
40929 wtype = va_list_type_node;
40930 gcc_assert (wtype != NULL_TREE);
40931 htype = type;
40932 if (TREE_CODE (wtype) == ARRAY_TYPE)
40933 {
40934 /* If va_list is an array type, the argument may have decayed
40935 to a pointer type, e.g. by being passed to another function.
40936 In that case, unwrap both types so that we can compare the
40937 underlying records. */
40938 if (TREE_CODE (htype) == ARRAY_TYPE
40939 || POINTER_TYPE_P (htype))
40940 {
40941 wtype = TREE_TYPE (wtype);
40942 htype = TREE_TYPE (htype);
40943 }
40944 }
40945 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40946 return va_list_type_node;
40947 wtype = sysv_va_list_type_node;
40948 gcc_assert (wtype != NULL_TREE);
40949 htype = type;
40950 if (TREE_CODE (wtype) == ARRAY_TYPE)
40951 {
40952 /* If va_list is an array type, the argument may have decayed
40953 to a pointer type, e.g. by being passed to another function.
40954 In that case, unwrap both types so that we can compare the
40955 underlying records. */
40956 if (TREE_CODE (htype) == ARRAY_TYPE
40957 || POINTER_TYPE_P (htype))
40958 {
40959 wtype = TREE_TYPE (wtype);
40960 htype = TREE_TYPE (htype);
40961 }
40962 }
40963 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40964 return sysv_va_list_type_node;
40965 wtype = ms_va_list_type_node;
40966 gcc_assert (wtype != NULL_TREE);
40967 htype = type;
40968 if (TREE_CODE (wtype) == ARRAY_TYPE)
40969 {
40970 /* If va_list is an array type, the argument may have decayed
40971 to a pointer type, e.g. by being passed to another function.
40972 In that case, unwrap both types so that we can compare the
40973 underlying records. */
40974 if (TREE_CODE (htype) == ARRAY_TYPE
40975 || POINTER_TYPE_P (htype))
40976 {
40977 wtype = TREE_TYPE (wtype);
40978 htype = TREE_TYPE (htype);
40979 }
40980 }
40981 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
40982 return ms_va_list_type_node;
40983 return NULL_TREE;
40984 }
40985 return std_canonical_va_list_type (type);
40986 }
40987
40988 /* Iterate through the target-specific builtin types for va_list.
40989 IDX denotes the iterator, *PTREE is set to the result type of
40990 the va_list builtin, and *PNAME to its internal type.
40991 Returns zero if there is no element for this index, otherwise
40992 IDX should be increased upon the next call.
40993 Note, do not iterate a base builtin's name like __builtin_va_list.
40994 Used from c_common_nodes_and_builtins. */
40995
40996 static int
40997 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
40998 {
40999 if (TARGET_64BIT)
41000 {
41001 switch (idx)
41002 {
41003 default:
41004 break;
41005
41006 case 0:
41007 *ptree = ms_va_list_type_node;
41008 *pname = "__builtin_ms_va_list";
41009 return 1;
41010
41011 case 1:
41012 *ptree = sysv_va_list_type_node;
41013 *pname = "__builtin_sysv_va_list";
41014 return 1;
41015 }
41016 }
41017
41018 return 0;
41019 }
41020
41021 #undef TARGET_SCHED_DISPATCH
41022 #define TARGET_SCHED_DISPATCH has_dispatch
41023 #undef TARGET_SCHED_DISPATCH_DO
41024 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41025 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41026 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41027 #undef TARGET_SCHED_REORDER
41028 #define TARGET_SCHED_REORDER ix86_sched_reorder
41029 #undef TARGET_SCHED_ADJUST_PRIORITY
41030 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41031 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41032 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ix86_dependencies_evaluation_hook
41033
41034 /* The size of the dispatch window is the total number of bytes of
41035 object code allowed in a window. */
41036 #define DISPATCH_WINDOW_SIZE 16
41037
41038 /* Number of dispatch windows considered for scheduling. */
41039 #define MAX_DISPATCH_WINDOWS 3
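/* I.e. the scheduler below tracks up to MAX_DISPATCH_WINDOWS
   * DISPATCH_WINDOW_SIZE = 48 bytes of code at a time (see the 48-byte
   checks in process_end_window and fits_dispatch_window), spread over
   the two dispatch_windows_s buffers set up in init_dispatch_sched.  */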
41040
41041 /* Maximum number of instructions in a window. */
41042 #define MAX_INSN 4
41043
41044 /* Maximum number of immediate operands in a window. */
41045 #define MAX_IMM 4
41046
41047 /* Maximum number of immediate bits allowed in a window. */
41048 #define MAX_IMM_SIZE 128
41049
41050 /* Maximum number of 32 bit immediates allowed in a window. */
41051 #define MAX_IMM_32 4
41052
41053 /* Maximum number of 64 bit immediates allowed in a window. */
41054 #define MAX_IMM_64 2
41055
41056 /* Maximum total of loads or prefetches allowed in a window. */
41057 #define MAX_LOAD 2
41058
41059 /* Maximum total of stores allowed in a window. */
41060 #define MAX_STORE 1
41061
41062 #undef BIG
41063 #define BIG 100
41064
41065
41066 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41067 enum dispatch_group {
41068 disp_no_group = 0,
41069 disp_load,
41070 disp_store,
41071 disp_load_store,
41072 disp_prefetch,
41073 disp_imm,
41074 disp_imm_32,
41075 disp_imm_64,
41076 disp_branch,
41077 disp_cmp,
41078 disp_jcc,
41079 disp_last
41080 };
41081
41082 /* Number of allowable groups in a dispatch window. It is an array
41083 indexed by dispatch_group enum. 100 is used as a big number,
41084 because the number of these kinds of operations does not have any
41085 effect on the dispatch window, but we need entries for them in
41086 the table. */
41087 static unsigned int num_allowable_groups[disp_last] = {
41088 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41089 };
41090
41091 char group_name[disp_last + 1][16] = {
41092 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41093 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41094 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41095 };
41096
41097 /* Instruction path. */
41098 enum insn_path {
41099 no_path = 0,
41100 path_single, /* Single micro op. */
41101 path_double, /* Double micro op. */
41102 path_multi, /* Instructions with more than 2 micro ops. */
41103 last_path
41104 };
41105
41106 /* sched_insn_info defines a window to the instructions scheduled in
41107 the basic block. It contains a pointer to the insn_info table and
41108 the instruction scheduled.
41109
41110 Windows are allocated for each basic block and are linked
41111 together. */
41112 typedef struct sched_insn_info_s {
41113 rtx insn;
41114 enum dispatch_group group;
41115 enum insn_path path;
41116 int byte_len;
41117 int imm_bytes;
41118 } sched_insn_info;
41119
41120 /* Linked list of dispatch windows. This is a two way list of
41121 dispatch windows of a basic block. It contains information about
41122 the number of uops in the window and the total number of
41123 instructions and of bytes in the object code for this dispatch
41124 window. */
41125 typedef struct dispatch_windows_s {
41126 int num_insn; /* Number of insns in the window. */
41127 int num_uops; /* Number of uops in the window. */
41128 int window_size; /* Number of bytes in the window. */
41129 int window_num; /* Window number, either 0 or 1. */
41130 int num_imm; /* Number of immediates in the window. */
41131 int num_imm_32; /* Number of 32 bit immediates in the window. */
41132 int num_imm_64; /* Number of 64 bit immediates in the window. */
41133 int imm_size; /* Total immediates in the window. */
41134 int num_loads; /* Total memory loads in the window. */
41135 int num_stores; /* Total memory stores in the window. */
41136 int violation; /* Violation exists in window. */
41137 sched_insn_info *window; /* Pointer to the window. */
41138 struct dispatch_windows_s *next;
41139 struct dispatch_windows_s *prev;
41140 } dispatch_windows;
41141
41142 /* Immediate values used in an insn. */
41143 typedef struct imm_info_s
41144 {
41145 int imm;
41146 int imm32;
41147 int imm64;
41148 } imm_info;
41149
41150 static dispatch_windows *dispatch_window_list;
41151 static dispatch_windows *dispatch_window_list1;
41152
41153 /* Get dispatch group of insn. */
41154
41155 static enum dispatch_group
41156 get_mem_group (rtx insn)
41157 {
41158 enum attr_memory memory;
41159
41160 if (INSN_CODE (insn) < 0)
41161 return disp_no_group;
41162 memory = get_attr_memory (insn);
41163 if (memory == MEMORY_STORE)
41164 return disp_store;
41165
41166 if (memory == MEMORY_LOAD)
41167 return disp_load;
41168
41169 if (memory == MEMORY_BOTH)
41170 return disp_load_store;
41171
41172 return disp_no_group;
41173 }
41174
41175 /* Return true if insn is a compare instruction. */
41176
41177 static bool
41178 is_cmp (rtx insn)
41179 {
41180 enum attr_type type;
41181
41182 type = get_attr_type (insn);
41183 return (type == TYPE_TEST
41184 || type == TYPE_ICMP
41185 || type == TYPE_FCMP
41186 || GET_CODE (PATTERN (insn)) == COMPARE);
41187 }
41188
41189 /* Return true if a dispatch violation was encountered. */
41190
41191 static bool
41192 dispatch_violation (void)
41193 {
41194 if (dispatch_window_list->next)
41195 return dispatch_window_list->next->violation;
41196 return dispatch_window_list->violation;
41197 }
41198
41199 /* Return true if insn is a branch instruction. */
41200
41201 static bool
41202 is_branch (rtx insn)
41203 {
41204 return (CALL_P (insn) || JUMP_P (insn));
41205 }
41206
41207 /* Return true if insn is a prefetch instruction. */
41208
41209 static bool
41210 is_prefetch (rtx insn)
41211 {
41212 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41213 }
41214
41215 /* This function initializes a dispatch window and the list container holding a
41216 pointer to the window. */
41217
41218 static void
41219 init_window (int window_num)
41220 {
41221 int i;
41222 dispatch_windows *new_list;
41223
41224 if (window_num == 0)
41225 new_list = dispatch_window_list;
41226 else
41227 new_list = dispatch_window_list1;
41228
41229 new_list->num_insn = 0;
41230 new_list->num_uops = 0;
41231 new_list->window_size = 0;
41232 new_list->next = NULL;
41233 new_list->prev = NULL;
41234 new_list->window_num = window_num;
41235 new_list->num_imm = 0;
41236 new_list->num_imm_32 = 0;
41237 new_list->num_imm_64 = 0;
41238 new_list->imm_size = 0;
41239 new_list->num_loads = 0;
41240 new_list->num_stores = 0;
41241 new_list->violation = false;
41242
41243 for (i = 0; i < MAX_INSN; i++)
41244 {
41245 new_list->window[i].insn = NULL;
41246 new_list->window[i].group = disp_no_group;
41247 new_list->window[i].path = no_path;
41248 new_list->window[i].byte_len = 0;
41249 new_list->window[i].imm_bytes = 0;
41250 }
41251 return;
41252 }
41253
41254 /* This function allocates and initializes a dispatch window and the
41255 list container holding a pointer to the window. */
41256
41257 static dispatch_windows *
41258 allocate_window (void)
41259 {
41260 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41261 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41262
41263 return new_list;
41264 }
41265
41266 /* This routine initializes the dispatch scheduling information. It
41267 initiates building dispatch scheduler tables and constructs the
41268 first dispatch window. */
41269
41270 static void
41271 init_dispatch_sched (void)
41272 {
41273 /* Allocate a dispatch list and a window. */
41274 dispatch_window_list = allocate_window ();
41275 dispatch_window_list1 = allocate_window ();
41276 init_window (0);
41277 init_window (1);
41278 }
41279
41280 /* This function returns true if GROUP marks the end of a basic block.
41281 The end of a basic block does not have to be a branch, but here we
41282 assume only branches end a window. */
41283
41284 static bool
41285 is_end_basic_block (enum dispatch_group group)
41286 {
41287 return group == disp_branch;
41288 }
41289
41290 /* This function is called when the processing of a window has finished. */
41291
41292 static void
41293 process_end_window (void)
41294 {
41295 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41296 if (dispatch_window_list->next)
41297 {
41298 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41299 gcc_assert (dispatch_window_list->window_size
41300 + dispatch_window_list1->window_size <= 48);
41301 init_window (1);
41302 }
41303 init_window (0);
41304 }
41305
41306 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41307 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41308 for 48 bytes of instructions. Note that these windows are not dispatch
41309 windows whose size is DISPATCH_WINDOW_SIZE. */
41310
41311 static dispatch_windows *
41312 allocate_next_window (int window_num)
41313 {
41314 if (window_num == 0)
41315 {
41316 if (dispatch_window_list->next)
41317 init_window (1);
41318 init_window (0);
41319 return dispatch_window_list;
41320 }
41321
41322 dispatch_window_list->next = dispatch_window_list1;
41323 dispatch_window_list1->prev = dispatch_window_list;
41324
41325 return dispatch_window_list1;
41326 }
41327
41328 /* Update the immediate-operand counts in IMM_VALUES for the rtx at *IN_RTX; callback for for_each_rtx. */
41329
41330 static int
41331 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41332 {
41333 if (*in_rtx == 0)
41334 return 0;
41335
41336 switch (GET_CODE (*in_rtx))
41337 {
41338 case CONST:
41339 case SYMBOL_REF:
41340 case CONST_INT:
41341 (imm_values->imm)++;
41342 if (x86_64_immediate_operand (*in_rtx, SImode))
41343 (imm_values->imm32)++;
41344 else
41345 (imm_values->imm64)++;
41346 break;
41347
41348 case CONST_DOUBLE:
41349 (imm_values->imm)++;
41350 (imm_values->imm64)++;
41351 break;
41352
41353 case CODE_LABEL:
41354 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41355 {
41356 (imm_values->imm)++;
41357 (imm_values->imm32)++;
41358 }
41359 break;
41360
41361 default:
41362 break;
41363 }
41364
41365 return 0;
41366 }
41367
41368 /* Compute number of immediate operands of an instruction. */
41369
41370 static void
41371 find_constant (rtx in_rtx, imm_info *imm_values)
41372 {
41373 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41374 (rtx_function) find_constant_1, (void *) imm_values);
41375 }
41376
41377 /* Return the total size of the immediate operands of an instruction along
41378 with the number of corresponding immediate operands. It initializes its
41379 parameters to zero before calling FIND_CONSTANT.
41380 INSN is the input instruction. IMM is the total of immediates.
41381 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41382 bit immediates. */
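/* For example, an insn with one 32-bit and one 64-bit immediate operand
   yields *IMM = 2, *IMM32 = 1, *IMM64 = 1 and a return value of
   4 + 8 = 12 bytes.  */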
41383
41384 static int
41385 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41386 {
41387 imm_info imm_values = {0, 0, 0};
41388
41389 find_constant (insn, &imm_values);
41390 *imm = imm_values.imm;
41391 *imm32 = imm_values.imm32;
41392 *imm64 = imm_values.imm64;
41393 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
41394 }
41395
41396 /* This function indicates whether an instruction has any immediate
41397 operands. */
41398
41399 static bool
41400 has_immediate (rtx insn)
41401 {
41402 int num_imm_operand;
41403 int num_imm32_operand;
41404 int num_imm64_operand;
41405
41406 if (insn)
41407 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41408 &num_imm64_operand);
41409 return false;
41410 }
41411
41412 /* Return single or double path for instructions. */
41413
41414 static enum insn_path
41415 get_insn_path (rtx insn)
41416 {
41417 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41418
41419 if ((int)path == 0)
41420 return path_single;
41421
41422 if ((int)path == 1)
41423 return path_double;
41424
41425 return path_multi;
41426 }
41427
41428 /* Return insn dispatch group. */
41429
41430 static enum dispatch_group
41431 get_insn_group (rtx insn)
41432 {
41433 enum dispatch_group group = get_mem_group (insn);
41434 if (group)
41435 return group;
41436
41437 if (is_branch (insn))
41438 return disp_branch;
41439
41440 if (is_cmp (insn))
41441 return disp_cmp;
41442
41443 if (has_immediate (insn))
41444 return disp_imm;
41445
41446 if (is_prefetch (insn))
41447 return disp_prefetch;
41448
41449 return disp_no_group;
41450 }
41451
41452 /* Count number of GROUP restricted instructions in a dispatch
41453 window WINDOW_LIST. */
41454
41455 static int
41456 count_num_restricted (rtx insn, dispatch_windows *window_list)
41457 {
41458 enum dispatch_group group = get_insn_group (insn);
41459 int imm_size;
41460 int num_imm_operand;
41461 int num_imm32_operand;
41462 int num_imm64_operand;
41463
41464 if (group == disp_no_group)
41465 return 0;
41466
41467 if (group == disp_imm)
41468 {
41469 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41470 &num_imm64_operand);
41471 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41472 || num_imm_operand + window_list->num_imm > MAX_IMM
41473 || (num_imm32_operand > 0
41474 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41475 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41476 || (num_imm64_operand > 0
41477 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41478 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41479 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41480 && num_imm64_operand > 0
41481 && ((window_list->num_imm_64 > 0
41482 && window_list->num_insn >= 2)
41483 || window_list->num_insn >= 3)))
41484 return BIG;
41485
41486 return 1;
41487 }
41488
41489 if ((group == disp_load_store
41490 && (window_list->num_loads >= MAX_LOAD
41491 || window_list->num_stores >= MAX_STORE))
41492 || ((group == disp_load
41493 || group == disp_prefetch)
41494 && window_list->num_loads >= MAX_LOAD)
41495 || (group == disp_store
41496 && window_list->num_stores >= MAX_STORE))
41497 return BIG;
41498
41499 return 1;
41500 }
41501
41502 /* This function returns true if insn satisfies dispatch rules on the
41503 last window scheduled. */
41504
41505 static bool
41506 fits_dispatch_window (rtx insn)
41507 {
41508 dispatch_windows *window_list = dispatch_window_list;
41509 dispatch_windows *window_list_next = dispatch_window_list->next;
41510 unsigned int num_restrict;
41511 enum dispatch_group group = get_insn_group (insn);
41512 enum insn_path path = get_insn_path (insn);
41513 int sum;
41514
41515 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41516 instructions should be given the lowest priority in the
41517 scheduling process in the Haifa scheduler to make sure they will be
41518 scheduled in the same dispatch window as the reference to them. */
41519 if (group == disp_jcc || group == disp_cmp)
41520 return false;
41521
41522 /* Check nonrestricted. */
41523 if (group == disp_no_group || group == disp_branch)
41524 return true;
41525
41526 /* Get last dispatch window. */
41527 if (window_list_next)
41528 window_list = window_list_next;
41529
41530 if (window_list->window_num == 1)
41531 {
41532 sum = window_list->prev->window_size + window_list->window_size;
41533
41534 if (sum == 32
41535 || (min_insn_size (insn) + sum) >= 48)
41536 /* Window 1 is full. Go for next window. */
41537 return true;
41538 }
41539
41540 num_restrict = count_num_restricted (insn, window_list);
41541
41542 if (num_restrict > num_allowable_groups[group])
41543 return false;
41544
41545 /* See if it fits in the first window. */
41546 if (window_list->window_num == 0)
41547 {
41548 /* The first window should have only single and double path
41549 uops. */
41550 if (path == path_double
41551 && (window_list->num_uops + 2) > MAX_INSN)
41552 return false;
41553 else if (path != path_single)
41554 return false;
41555 }
41556 return true;
41557 }
41558
41559 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41560 dispatch window WINDOW_LIST. */
41561
41562 static void
41563 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41564 {
41565 int byte_len = min_insn_size (insn);
41566 int num_insn = window_list->num_insn;
41567 int imm_size;
41568 sched_insn_info *window = window_list->window;
41569 enum dispatch_group group = get_insn_group (insn);
41570 enum insn_path path = get_insn_path (insn);
41571 int num_imm_operand;
41572 int num_imm32_operand;
41573 int num_imm64_operand;
41574
41575 if (!window_list->violation && group != disp_cmp
41576 && !fits_dispatch_window (insn))
41577 window_list->violation = true;
41578
41579 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41580 &num_imm64_operand);
41581
41582 /* Initialize window with new instruction. */
41583 window[num_insn].insn = insn;
41584 window[num_insn].byte_len = byte_len;
41585 window[num_insn].group = group;
41586 window[num_insn].path = path;
41587 window[num_insn].imm_bytes = imm_size;
41588
41589 window_list->window_size += byte_len;
41590 window_list->num_insn = num_insn + 1;
41591 window_list->num_uops = window_list->num_uops + num_uops;
41592 window_list->imm_size += imm_size;
41593 window_list->num_imm += num_imm_operand;
41594 window_list->num_imm_32 += num_imm32_operand;
41595 window_list->num_imm_64 += num_imm64_operand;
41596
41597 if (group == disp_store)
41598 window_list->num_stores += 1;
41599 else if (group == disp_load
41600 || group == disp_prefetch)
41601 window_list->num_loads += 1;
41602 else if (group == disp_load_store)
41603 {
41604 window_list->num_stores += 1;
41605 window_list->num_loads += 1;
41606 }
41607 }
41608
41609 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41610 If the total bytes of instructions or the number of instructions in
41611 the window exceed the allowable limits, it allocates a new window. */
41612
41613 static void
41614 add_to_dispatch_window (rtx insn)
41615 {
41616 int byte_len;
41617 dispatch_windows *window_list;
41618 dispatch_windows *next_list;
41619 dispatch_windows *window0_list;
41620 enum insn_path path;
41621 enum dispatch_group insn_group;
41622 bool insn_fits;
41623 int num_insn;
41624 int num_uops;
41625 int window_num;
41626 int insn_num_uops;
41627 int sum;
41628
41629 if (INSN_CODE (insn) < 0)
41630 return;
41631
41632 byte_len = min_insn_size (insn);
41633 window_list = dispatch_window_list;
41634 next_list = window_list->next;
41635 path = get_insn_path (insn);
41636 insn_group = get_insn_group (insn);
41637
41638 /* Get the last dispatch window. */
41639 if (next_list)
41640 window_list = dispatch_window_list->next;
41641
41642 if (path == path_single)
41643 insn_num_uops = 1;
41644 else if (path == path_double)
41645 insn_num_uops = 2;
41646 else
41647 insn_num_uops = (int) path;
41648
41649 /* If the current window is full, get a new window.
41650 Window number zero is full if MAX_INSN uops are scheduled in it.
41651 Window number one is full if window zero's bytes plus window
41652 one's bytes equal 32, or if adding the bytes of the new instruction
41653 to the total makes it greater than or equal to 48, or if it already has
41654 MAX_INSN instructions in it. */
41655 num_insn = window_list->num_insn;
41656 num_uops = window_list->num_uops;
41657 window_num = window_list->window_num;
41658 insn_fits = fits_dispatch_window (insn);
41659
41660 if (num_insn >= MAX_INSN
41661 || num_uops + insn_num_uops > MAX_INSN
41662 || !(insn_fits))
41663 {
41664 window_num = ~window_num & 1;
41665 window_list = allocate_next_window (window_num);
41666 }
41667
41668 if (window_num == 0)
41669 {
41670 add_insn_window (insn, window_list, insn_num_uops);
41671 if (window_list->num_insn >= MAX_INSN
41672 && insn_group == disp_branch)
41673 {
41674 process_end_window ();
41675 return;
41676 }
41677 }
41678 else if (window_num == 1)
41679 {
41680 window0_list = window_list->prev;
41681 sum = window0_list->window_size + window_list->window_size;
41682 if (sum == 32
41683 || (byte_len + sum) >= 48)
41684 {
41685 process_end_window ();
41686 window_list = dispatch_window_list;
41687 }
41688
41689 add_insn_window (insn, window_list, insn_num_uops);
41690 }
41691 else
41692 gcc_unreachable ();
41693
41694 if (is_end_basic_block (insn_group))
41695 {
41696 /* End of basic block is reached; do end-of-basic-block processing. */
41697 process_end_window ();
41698 return;
41699 }
41700 }
41701
41702 /* Print the dispatch window, WINDOW_NUM, to FILE. */
41703
41704 DEBUG_FUNCTION static void
41705 debug_dispatch_window_file (FILE *file, int window_num)
41706 {
41707 dispatch_windows *list;
41708 int i;
41709
41710 if (window_num == 0)
41711 list = dispatch_window_list;
41712 else
41713 list = dispatch_window_list1;
41714
41715 fprintf (file, "Window #%d:\n", list->window_num);
41716 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
41717 list->num_insn, list->num_uops, list->window_size);
41718 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41719 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
41720
41721 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
41722 list->num_stores);
41723 fprintf (file, " insn info:\n");
41724
41725 for (i = 0; i < MAX_INSN; i++)
41726 {
41727 if (!list->window[i].insn)
41728 break;
41729 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
41730 i, group_name[list->window[i].group],
41731 i, (void *)list->window[i].insn,
41732 i, list->window[i].path,
41733 i, list->window[i].byte_len,
41734 i, list->window[i].imm_bytes);
41735 }
41736 }
41737
41738 /* Print to stdout a dispatch window. */
41739
41740 DEBUG_FUNCTION void
41741 debug_dispatch_window (int window_num)
41742 {
41743 debug_dispatch_window_file (stdout, window_num);
41744 }
41745
41746 /* Print INSN dispatch information to FILE. */
41747
41748 DEBUG_FUNCTION static void
41749 debug_insn_dispatch_info_file (FILE *file, rtx insn)
41750 {
41751 int byte_len;
41752 enum insn_path path;
41753 enum dispatch_group group;
41754 int imm_size;
41755 int num_imm_operand;
41756 int num_imm32_operand;
41757 int num_imm64_operand;
41758
41759 if (INSN_CODE (insn) < 0)
41760 return;
41761
41762 byte_len = min_insn_size (insn);
41763 path = get_insn_path (insn);
41764 group = get_insn_group (insn);
41765 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41766 &num_imm64_operand);
41767
41768 fprintf (file, " insn info:\n");
41769 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
41770 group_name[group], path, byte_len);
41771 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
41772 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
41773 }
41774
41775 /* Print to stdout the status of the ready list with respect to
41776 dispatch windows. */
41777
41778 DEBUG_FUNCTION void
41779 debug_ready_dispatch (void)
41780 {
41781 int i;
41782 int no_ready = number_in_ready ();
41783
41784 fprintf (stdout, "Number of ready: %d\n", no_ready);
41785
41786 for (i = 0; i < no_ready; i++)
41787 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
41788 }
41789
41790 /* This routine is the driver of the dispatch scheduler. */
41791
41792 static void
41793 do_dispatch (rtx insn, int mode)
41794 {
41795 if (mode == DISPATCH_INIT)
41796 init_dispatch_sched ();
41797 else if (mode == ADD_TO_DISPATCH_WINDOW)
41798 add_to_dispatch_window (insn);
41799 }
41800
41801 /* Return TRUE if Dispatch Scheduling is supported. */
41802
41803 static bool
41804 has_dispatch (rtx insn, int action)
41805 {
41806 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
41807 && flag_dispatch_scheduler)
41808 switch (action)
41809 {
41810 default:
41811 return false;
41812
41813 case IS_DISPATCH_ON:
41814 return true;
41816
41817 case IS_CMP:
41818 return is_cmp (insn);
41819
41820 case DISPATCH_VIOLATION:
41821 return dispatch_violation ();
41822
41823 case FITS_DISPATCH_WINDOW:
41824 return fits_dispatch_window (insn);
41825 }
41826
41827 return false;
41828 }
41829
41830 /* Implementation of reassociation_width target hook used by
41831 reassoc phase to identify parallelism level in reassociated
41832 tree. The statement's tree_code is passed in OPC. The arguments'
41833 type is passed in MODE.
41834
41835 Currently parallel reassociation is enabled for Atom
41836 processors only and we set reassociation width to be 2
41837 because Atom may issue up to 2 instructions per cycle.
41838
41839 Return value should be fixed if parallel reassociation is
41840 enabled for other processors. */
41841
41842 static int
41843 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
41844 enum machine_mode mode)
41845 {
41846 int res = 1;
41847
41848 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
41849 res = 2;
41850 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
41851 res = 2;
41852
41853 return res;
41854 }
41855
41856 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
41857 place emms and femms instructions. */
41858
41859 static enum machine_mode
41860 ix86_preferred_simd_mode (enum machine_mode mode)
41861 {
41862 if (!TARGET_SSE)
41863 return word_mode;
41864
41865 switch (mode)
41866 {
41867 case QImode:
41868 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
41869 case HImode:
41870 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
41871 case SImode:
41872 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
41873 case DImode:
41874 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
41875
41876 case SFmode:
41877 if (TARGET_AVX && !TARGET_PREFER_AVX128)
41878 return V8SFmode;
41879 else
41880 return V4SFmode;
41881
41882 case DFmode:
41883 if (!TARGET_VECTORIZE_DOUBLE)
41884 return word_mode;
41885 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
41886 return V4DFmode;
41887 else if (TARGET_SSE2)
41888 return V2DFmode;
41889 /* FALLTHRU */
41890
41891 default:
41892 return word_mode;
41893 }
41894 }
41895
41896 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
41897 vectors. */
41898
41899 static unsigned int
41900 ix86_autovectorize_vector_sizes (void)
41901 {
41902 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
41903 }
41904
41905 \f
41906
41907 /* Return class of registers which could be used for pseudo of MODE
41908 and of class RCLASS for spilling instead of memory. Return NO_REGS
41909 if it is not possible or not profitable. */
41910 static reg_class_t
41911 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
41912 {
41913 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
41914 && hard_reg_set_subset_p (reg_class_contents[rclass],
41915 reg_class_contents[GENERAL_REGS])
41916 && (mode == SImode || (TARGET_64BIT && mode == DImode)))
41917 return SSE_REGS;
41918 return NO_REGS;
41919 }
41920
41921 /* Implement targetm.vectorize.init_cost. */
41922
41923 static void *
41924 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
41925 {
41926 unsigned *cost = XNEWVEC (unsigned, 3);
41927 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
41928 return cost;
41929 }
41930
41931 /* Implement targetm.vectorize.add_stmt_cost. */
41932
41933 static unsigned
41934 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
41935 struct _stmt_vec_info *stmt_info, int misalign,
41936 enum vect_cost_model_location where)
41937 {
41938 unsigned *cost = (unsigned *) data;
41939 unsigned retval = 0;
41940
41941 if (flag_vect_cost_model)
41942 {
41943 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
41944 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
41945
41946 /* Statements in an inner loop relative to the loop being
41947 vectorized are weighted more heavily. The value here is
41948 arbitrary and could potentially be improved with analysis. */
41949 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
41950 count *= 50; /* FIXME. */
41951
41952 retval = (unsigned) (count * stmt_cost);
41953 cost[where] += retval;
41954 }
41955
41956 return retval;
41957 }
41958
41959 /* Implement targetm.vectorize.finish_cost. */
41960
41961 static void
41962 ix86_finish_cost (void *data, unsigned *prologue_cost,
41963 unsigned *body_cost, unsigned *epilogue_cost)
41964 {
41965 unsigned *cost = (unsigned *) data;
41966 *prologue_cost = cost[vect_prologue];
41967 *body_cost = cost[vect_body];
41968 *epilogue_cost = cost[vect_epilogue];
41969 }
41970
41971 /* Implement targetm.vectorize.destroy_cost_data. */
41972
41973 static void
41974 ix86_destroy_cost_data (void *data)
41975 {
41976 free (data);
41977 }
41978
41979 /* Validate target specific memory model bits in VAL. */
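/* For instance, MEMMODEL_ACQUIRE | IX86_HLE_ACQUIRE passes through
   unchanged, while IX86_HLE_RELEASE combined with a plain MEMMODEL_ACQUIRE
   triggers the warning below and is demoted to
   MEMMODEL_SEQ_CST | IX86_HLE_RELEASE.  */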
41980
41981 static unsigned HOST_WIDE_INT
41982 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
41983 {
41984 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
41985 unsigned HOST_WIDE_INT strong;
41986
41987 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
41988 |MEMMODEL_MASK)
41989 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
41990 {
41991 warning (OPT_Winvalid_memory_model,
41992 "Unknown architecture specific memory model");
41993 return MEMMODEL_SEQ_CST;
41994 }
41995 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
41996 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
41997 {
41998 warning (OPT_Winvalid_memory_model,
41999 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42000 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42001 }
42002 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42003 {
42004 warning (OPT_Winvalid_memory_model,
42005 "HLE_RELEASE not used with RELEASE or stronger memory model");
42006 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42007 }
42008 return val;
42009 }
42010
42011 /* Initialize the GCC target structure. */
42012 #undef TARGET_RETURN_IN_MEMORY
42013 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42014
42015 #undef TARGET_LEGITIMIZE_ADDRESS
42016 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42017
42018 #undef TARGET_ATTRIBUTE_TABLE
42019 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42020 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42021 # undef TARGET_MERGE_DECL_ATTRIBUTES
42022 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42023 #endif
42024
42025 #undef TARGET_COMP_TYPE_ATTRIBUTES
42026 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42027
42028 #undef TARGET_INIT_BUILTINS
42029 #define TARGET_INIT_BUILTINS ix86_init_builtins
42030 #undef TARGET_BUILTIN_DECL
42031 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42032 #undef TARGET_EXPAND_BUILTIN
42033 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42034
42035 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42036 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42037 ix86_builtin_vectorized_function
42038
42039 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42040 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42041
42042 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42043 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42044
42045 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42046 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42047
42048 #undef TARGET_BUILTIN_RECIPROCAL
42049 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42050
42051 #undef TARGET_ASM_FUNCTION_EPILOGUE
42052 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42053
42054 #undef TARGET_ENCODE_SECTION_INFO
42055 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42056 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42057 #else
42058 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42059 #endif
42060
42061 #undef TARGET_ASM_OPEN_PAREN
42062 #define TARGET_ASM_OPEN_PAREN ""
42063 #undef TARGET_ASM_CLOSE_PAREN
42064 #define TARGET_ASM_CLOSE_PAREN ""
42065
42066 #undef TARGET_ASM_BYTE_OP
42067 #define TARGET_ASM_BYTE_OP ASM_BYTE
42068
42069 #undef TARGET_ASM_ALIGNED_HI_OP
42070 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42071 #undef TARGET_ASM_ALIGNED_SI_OP
42072 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42073 #ifdef ASM_QUAD
42074 #undef TARGET_ASM_ALIGNED_DI_OP
42075 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42076 #endif
42077
42078 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42079 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42080
42081 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42082 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42083
42084 #undef TARGET_ASM_UNALIGNED_HI_OP
42085 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42086 #undef TARGET_ASM_UNALIGNED_SI_OP
42087 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42088 #undef TARGET_ASM_UNALIGNED_DI_OP
42089 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42090
42091 #undef TARGET_PRINT_OPERAND
42092 #define TARGET_PRINT_OPERAND ix86_print_operand
42093 #undef TARGET_PRINT_OPERAND_ADDRESS
42094 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42095 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42096 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42097 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42098 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42099
42100 #undef TARGET_SCHED_INIT_GLOBAL
42101 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42102 #undef TARGET_SCHED_ADJUST_COST
42103 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42104 #undef TARGET_SCHED_ISSUE_RATE
42105 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42106 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42107 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42108 ia32_multipass_dfa_lookahead
42109
42110 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42111 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42112
42113 #undef TARGET_MEMMODEL_CHECK
42114 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42115
42116 #ifdef HAVE_AS_TLS
42117 #undef TARGET_HAVE_TLS
42118 #define TARGET_HAVE_TLS true
42119 #endif
42120 #undef TARGET_CANNOT_FORCE_CONST_MEM
42121 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42122 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42123 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42124
42125 #undef TARGET_DELEGITIMIZE_ADDRESS
42126 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42127
42128 #undef TARGET_MS_BITFIELD_LAYOUT_P
42129 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42130
42131 #if TARGET_MACHO
42132 #undef TARGET_BINDS_LOCAL_P
42133 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42134 #endif
42135 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42136 #undef TARGET_BINDS_LOCAL_P
42137 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42138 #endif
42139
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

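/* Cost estimates used by the register allocator and RTL optimizers.  */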
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

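/* __builtin_va_list layout, stdarg expansion, builtin folding, and the
   dispatcher hooks used by function multi-versioning.  */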
#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers

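/* Calling conventions: argument passing, incoming varargs, stack alignment,
   the static chain, trampolines and callee-popped arguments.  */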
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

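/* Scalar and vector machine modes supported by the back end, and the modes
   used for extended literal suffixes.  */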
#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

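/* How function return values are represented and promoted.  */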
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

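/* Register classes, reload and spilling.  */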
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

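/* Vectorizer cost model and SIMD capabilities.  */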
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

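/* Per-function target options (attribute ((target ("...")))): validating,
   saving, restoring and printing them, function versioning and the
   cross-option inlining check.  */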
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

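/* Use the local register allocator (LRA) instead of reload.  */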
#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

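/* Initialize the target hook vector from the TARGET_* macros defined above;
   TARGET_INITIALIZER comes from target-def.h.  */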
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"