1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "context.h"
65 #include "pass_manager.h"
66
67 static rtx legitimize_dllimport_symbol (rtx, bool);
68 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
69 static rtx legitimize_pe_coff_symbol (rtx, bool);
70
71 #ifndef CHECK_STACK_LIMIT
72 #define CHECK_STACK_LIMIT (-1)
73 #endif
74
75 /* Return index of given mode in mult and division cost tables. */
76 #define MODE_INDEX(mode) \
77 ((mode) == QImode ? 0 \
78 : (mode) == HImode ? 1 \
79 : (mode) == SImode ? 2 \
80 : (mode) == DImode ? 3 \
81 : 4)
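
/* Editor's note (illustrative only, not part of the original table
   definitions): the cost tables below keep per-mode multiply and divide
   costs in five-element arrays, and MODE_INDEX selects the slot.  Assuming
   the mult_init/divide array fields of struct processor_costs from i386.h
   and the ix86_cost pointer this file later aims at the active table, a
   lookup would go roughly like this:

       int mul_cost = ix86_cost->mult_init[MODE_INDEX (SImode)];
       int div_cost = ix86_cost->divide[MODE_INDEX (SImode)];

   QImode..DImode map to indices 0..3; any other mode falls into the
   "other" slot at index 4.  */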
82
83 /* Processor costs (relative to an add) */
84 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
85 #define COSTS_N_BYTES(N) ((N) * 2)
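
/* Editor's note on the arithmetic above: with COSTS_N_INSNS (N) defined as
   (N) * 4, one "insn" costs 4 units, and COSTS_N_BYTES (2) for a 2-byte add
   also equals 4 units.  The size-tuned table below therefore stays on the
   same scale as the speed-tuned tables, so the same cost-comparison code
   can consume either.  */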
86
87 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
88
89 static stringop_algs ix86_size_memcpy[2] = {
90 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
91 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
92 static stringop_algs ix86_size_memset[2] = {
93 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
94 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
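
/* Editor's note on how these string-operation tables are read (a sketch
   based on the stringop_algs declaration in i386.h, not new functionality):
   each *_memcpy/*_memset pair has two rows, [0] for 32-bit and [1] for
   64-bit code.  A row names a fallback algorithm for sizes unknown at
   compile time, followed by up to four {max, alg, noalign} entries meaning
   "use ALG for blocks of at most MAX bytes", with -1 as the "no upper
   bound" sentinel.  CPUs that never run 64-bit code use
   DUMMY_STRINGOP_ALGS (a libcall-only row) for the second slot.  */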
95
96 const
97 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
98 COSTS_N_BYTES (2), /* cost of an add instruction */
99 COSTS_N_BYTES (3), /* cost of a lea instruction */
100 COSTS_N_BYTES (2), /* variable shift costs */
101 COSTS_N_BYTES (3), /* constant shift costs */
102 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
103 COSTS_N_BYTES (3), /* HI */
104 COSTS_N_BYTES (3), /* SI */
105 COSTS_N_BYTES (3), /* DI */
106 COSTS_N_BYTES (5)}, /* other */
107 0, /* cost of multiply per each bit set */
108 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
109 COSTS_N_BYTES (3), /* HI */
110 COSTS_N_BYTES (3), /* SI */
111 COSTS_N_BYTES (3), /* DI */
112 COSTS_N_BYTES (5)}, /* other */
113 COSTS_N_BYTES (3), /* cost of movsx */
114 COSTS_N_BYTES (3), /* cost of movzx */
115 0, /* "large" insn */
116 2, /* MOVE_RATIO */
117 2, /* cost for loading QImode using movzbl */
118 {2, 2, 2}, /* cost of loading integer registers
119 in QImode, HImode and SImode.
120 Relative to reg-reg move (2). */
121 {2, 2, 2}, /* cost of storing integer registers */
122 2, /* cost of reg,reg fld/fst */
123 {2, 2, 2}, /* cost of loading fp registers
124 in SFmode, DFmode and XFmode */
125 {2, 2, 2}, /* cost of storing fp registers
126 in SFmode, DFmode and XFmode */
127 3, /* cost of moving MMX register */
128 {3, 3}, /* cost of loading MMX registers
129 in SImode and DImode */
130 {3, 3}, /* cost of storing MMX registers
131 in SImode and DImode */
132 3, /* cost of moving SSE register */
133 {3, 3, 3}, /* cost of loading SSE registers
134 in SImode, DImode and TImode */
135 {3, 3, 3}, /* cost of storing SSE registers
136 in SImode, DImode and TImode */
137 3, /* MMX or SSE register to integer */
138 0, /* size of l1 cache */
139 0, /* size of l2 cache */
140 0, /* size of prefetch block */
141 0, /* number of parallel prefetches */
142 2, /* Branch cost */
143 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
144 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
145 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
146 COSTS_N_BYTES (2), /* cost of FABS instruction. */
147 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
148 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
149 ix86_size_memcpy,
150 ix86_size_memset,
151 1, /* scalar_stmt_cost. */
152 1, /* scalar load_cost. */
153 1, /* scalar_store_cost. */
154 1, /* vec_stmt_cost. */
155 1, /* vec_to_scalar_cost. */
156 1, /* scalar_to_vec_cost. */
157 1, /* vec_align_load_cost. */
158 1, /* vec_unalign_load_cost. */
159 1, /* vec_store_cost. */
160 1, /* cond_taken_branch_cost. */
161 1, /* cond_not_taken_branch_cost. */
162 };
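
/* Editor's note: a minimal sketch of how a table like the one above is
   consumed, assuming (as set up elsewhere in this file) that ix86_cost
   points at the table selected for the current tuning or -Os setting; the
   local variable names are only for illustration:

       const struct processor_costs *cost = ix86_cost;
       int add_cost    = cost->add;
       int branch_cost = cost->branch_cost;

   The cost hooks later in this file compare such values to choose between
   alternative instruction sequences; they are relative weights, not cycle
   counts.  */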
163
164 /* Processor costs (relative to an add) */
165 static stringop_algs i386_memcpy[2] = {
166 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
167 DUMMY_STRINGOP_ALGS};
168 static stringop_algs i386_memset[2] = {
169 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
170 DUMMY_STRINGOP_ALGS};
171
172 static const
173 struct processor_costs i386_cost = { /* 386 specific costs */
174 COSTS_N_INSNS (1), /* cost of an add instruction */
175 COSTS_N_INSNS (1), /* cost of a lea instruction */
176 COSTS_N_INSNS (3), /* variable shift costs */
177 COSTS_N_INSNS (2), /* constant shift costs */
178 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
179 COSTS_N_INSNS (6), /* HI */
180 COSTS_N_INSNS (6), /* SI */
181 COSTS_N_INSNS (6), /* DI */
182 COSTS_N_INSNS (6)}, /* other */
183 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
184 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
185 COSTS_N_INSNS (23), /* HI */
186 COSTS_N_INSNS (23), /* SI */
187 COSTS_N_INSNS (23), /* DI */
188 COSTS_N_INSNS (23)}, /* other */
189 COSTS_N_INSNS (3), /* cost of movsx */
190 COSTS_N_INSNS (2), /* cost of movzx */
191 15, /* "large" insn */
192 3, /* MOVE_RATIO */
193 4, /* cost for loading QImode using movzbl */
194 {2, 4, 2}, /* cost of loading integer registers
195 in QImode, HImode and SImode.
196 Relative to reg-reg move (2). */
197 {2, 4, 2}, /* cost of storing integer registers */
198 2, /* cost of reg,reg fld/fst */
199 {8, 8, 8}, /* cost of loading fp registers
200 in SFmode, DFmode and XFmode */
201 {8, 8, 8}, /* cost of storing fp registers
202 in SFmode, DFmode and XFmode */
203 2, /* cost of moving MMX register */
204 {4, 8}, /* cost of loading MMX registers
205 in SImode and DImode */
206 {4, 8}, /* cost of storing MMX registers
207 in SImode and DImode */
208 2, /* cost of moving SSE register */
209 {4, 8, 16}, /* cost of loading SSE registers
210 in SImode, DImode and TImode */
211 {4, 8, 16}, /* cost of storing SSE registers
212 in SImode, DImode and TImode */
213 3, /* MMX or SSE register to integer */
214 0, /* size of l1 cache */
215 0, /* size of l2 cache */
216 0, /* size of prefetch block */
217 0, /* number of parallel prefetches */
218 1, /* Branch cost */
219 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
220 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
221 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
222 COSTS_N_INSNS (22), /* cost of FABS instruction. */
223 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
224 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
225 i386_memcpy,
226 i386_memset,
227 1, /* scalar_stmt_cost. */
228 1, /* scalar load_cost. */
229 1, /* scalar_store_cost. */
230 1, /* vec_stmt_cost. */
231 1, /* vec_to_scalar_cost. */
232 1, /* scalar_to_vec_cost. */
233 1, /* vec_align_load_cost. */
234 2, /* vec_unalign_load_cost. */
235 1, /* vec_store_cost. */
236 3, /* cond_taken_branch_cost. */
237 1, /* cond_not_taken_branch_cost. */
238 };
239
240 static stringop_algs i486_memcpy[2] = {
241 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
242 DUMMY_STRINGOP_ALGS};
243 static stringop_algs i486_memset[2] = {
244 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
245 DUMMY_STRINGOP_ALGS};
246
247 static const
248 struct processor_costs i486_cost = { /* 486 specific costs */
249 COSTS_N_INSNS (1), /* cost of an add instruction */
250 COSTS_N_INSNS (1), /* cost of a lea instruction */
251 COSTS_N_INSNS (3), /* variable shift costs */
252 COSTS_N_INSNS (2), /* constant shift costs */
253 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
254 COSTS_N_INSNS (12), /* HI */
255 COSTS_N_INSNS (12), /* SI */
256 COSTS_N_INSNS (12), /* DI */
257 COSTS_N_INSNS (12)}, /* other */
258 1, /* cost of multiply per each bit set */
259 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
260 COSTS_N_INSNS (40), /* HI */
261 COSTS_N_INSNS (40), /* SI */
262 COSTS_N_INSNS (40), /* DI */
263 COSTS_N_INSNS (40)}, /* other */
264 COSTS_N_INSNS (3), /* cost of movsx */
265 COSTS_N_INSNS (2), /* cost of movzx */
266 15, /* "large" insn */
267 3, /* MOVE_RATIO */
268 4, /* cost for loading QImode using movzbl */
269 {2, 4, 2}, /* cost of loading integer registers
270 in QImode, HImode and SImode.
271 Relative to reg-reg move (2). */
272 {2, 4, 2}, /* cost of storing integer registers */
273 2, /* cost of reg,reg fld/fst */
274 {8, 8, 8}, /* cost of loading fp registers
275 in SFmode, DFmode and XFmode */
276 {8, 8, 8}, /* cost of storing fp registers
277 in SFmode, DFmode and XFmode */
278 2, /* cost of moving MMX register */
279 {4, 8}, /* cost of loading MMX registers
280 in SImode and DImode */
281 {4, 8}, /* cost of storing MMX registers
282 in SImode and DImode */
283 2, /* cost of moving SSE register */
284 {4, 8, 16}, /* cost of loading SSE registers
285 in SImode, DImode and TImode */
286 {4, 8, 16}, /* cost of storing SSE registers
287 in SImode, DImode and TImode */
288 3, /* MMX or SSE register to integer */
289 4, /* size of l1 cache. 486 has 8kB cache
290 shared for code and data, so 4kB is
291 not really precise. */
292 4, /* size of l2 cache */
293 0, /* size of prefetch block */
294 0, /* number of parallel prefetches */
295 1, /* Branch cost */
296 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
297 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
298 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
299 COSTS_N_INSNS (3), /* cost of FABS instruction. */
300 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
301 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
302 i486_memcpy,
303 i486_memset,
304 1, /* scalar_stmt_cost. */
305 1, /* scalar load_cost. */
306 1, /* scalar_store_cost. */
307 1, /* vec_stmt_cost. */
308 1, /* vec_to_scalar_cost. */
309 1, /* scalar_to_vec_cost. */
310 1, /* vec_align_load_cost. */
311 2, /* vec_unalign_load_cost. */
312 1, /* vec_store_cost. */
313 3, /* cond_taken_branch_cost. */
314 1, /* cond_not_taken_branch_cost. */
315 };
316
317 static stringop_algs pentium_memcpy[2] = {
318 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
319 DUMMY_STRINGOP_ALGS};
320 static stringop_algs pentium_memset[2] = {
321 {libcall, {{-1, rep_prefix_4_byte, false}}},
322 DUMMY_STRINGOP_ALGS};
323
324 static const
325 struct processor_costs pentium_cost = {
326 COSTS_N_INSNS (1), /* cost of an add instruction */
327 COSTS_N_INSNS (1), /* cost of a lea instruction */
328 COSTS_N_INSNS (4), /* variable shift costs */
329 COSTS_N_INSNS (1), /* constant shift costs */
330 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
331 COSTS_N_INSNS (11), /* HI */
332 COSTS_N_INSNS (11), /* SI */
333 COSTS_N_INSNS (11), /* DI */
334 COSTS_N_INSNS (11)}, /* other */
335 0, /* cost of multiply per each bit set */
336 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
337 COSTS_N_INSNS (25), /* HI */
338 COSTS_N_INSNS (25), /* SI */
339 COSTS_N_INSNS (25), /* DI */
340 COSTS_N_INSNS (25)}, /* other */
341 COSTS_N_INSNS (3), /* cost of movsx */
342 COSTS_N_INSNS (2), /* cost of movzx */
343 8, /* "large" insn */
344 6, /* MOVE_RATIO */
345 6, /* cost for loading QImode using movzbl */
346 {2, 4, 2}, /* cost of loading integer registers
347 in QImode, HImode and SImode.
348 Relative to reg-reg move (2). */
349 {2, 4, 2}, /* cost of storing integer registers */
350 2, /* cost of reg,reg fld/fst */
351 {2, 2, 6}, /* cost of loading fp registers
352 in SFmode, DFmode and XFmode */
353 {4, 4, 6}, /* cost of storing fp registers
354 in SFmode, DFmode and XFmode */
355 8, /* cost of moving MMX register */
356 {8, 8}, /* cost of loading MMX registers
357 in SImode and DImode */
358 {8, 8}, /* cost of storing MMX registers
359 in SImode and DImode */
360 2, /* cost of moving SSE register */
361 {4, 8, 16}, /* cost of loading SSE registers
362 in SImode, DImode and TImode */
363 {4, 8, 16}, /* cost of storing SSE registers
364 in SImode, DImode and TImode */
365 3, /* MMX or SSE register to integer */
366 8, /* size of l1 cache. */
367 8, /* size of l2 cache */
368 0, /* size of prefetch block */
369 0, /* number of parallel prefetches */
370 2, /* Branch cost */
371 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
372 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
373 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
374 COSTS_N_INSNS (1), /* cost of FABS instruction. */
375 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
376 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
377 pentium_memcpy,
378 pentium_memset,
379 1, /* scalar_stmt_cost. */
380 1, /* scalar load_cost. */
381 1, /* scalar_store_cost. */
382 1, /* vec_stmt_cost. */
383 1, /* vec_to_scalar_cost. */
384 1, /* scalar_to_vec_cost. */
385 1, /* vec_align_load_cost. */
386 2, /* vec_unalign_load_cost. */
387 1, /* vec_store_cost. */
388 3, /* cond_taken_branch_cost. */
389 1, /* cond_not_taken_branch_cost. */
390 };
391
392 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
393    (we ensure the alignment).  For small blocks an inline loop is still a
394    noticeable win; for bigger blocks either rep movsl or rep movsb is the
395    way to go.  Rep movsb apparently has a more expensive startup time in the
396    CPU, but after 4K the difference is down in the noise. */
397 static stringop_algs pentiumpro_memcpy[2] = {
398 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
399 {8192, rep_prefix_4_byte, false},
400 {-1, rep_prefix_1_byte, false}}},
401 DUMMY_STRINGOP_ALGS};
402 static stringop_algs pentiumpro_memset[2] = {
403 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
404 {8192, rep_prefix_4_byte, false},
405 {-1, libcall, false}}},
406 DUMMY_STRINGOP_ALGS};
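
/* Editor's note, reading the PentiumPro memcpy row above as a concrete
   example: a compile-time-unknown size falls back to rep movsl
   (rep_prefix_4_byte); known sizes use an inline loop up to 128 bytes, an
   unrolled loop up to 1024 bytes, rep movsl up to 8192 bytes, and rep movsb
   beyond that.  The memset row is similar but has no small-loop tier and
   falls back to a libcall for the largest blocks.  */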
407 static const
408 struct processor_costs pentiumpro_cost = {
409 COSTS_N_INSNS (1), /* cost of an add instruction */
410 COSTS_N_INSNS (1), /* cost of a lea instruction */
411 COSTS_N_INSNS (1), /* variable shift costs */
412 COSTS_N_INSNS (1), /* constant shift costs */
413 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
414 COSTS_N_INSNS (4), /* HI */
415 COSTS_N_INSNS (4), /* SI */
416 COSTS_N_INSNS (4), /* DI */
417 COSTS_N_INSNS (4)}, /* other */
418 0, /* cost of multiply per each bit set */
419 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
420 COSTS_N_INSNS (17), /* HI */
421 COSTS_N_INSNS (17), /* SI */
422 COSTS_N_INSNS (17), /* DI */
423 COSTS_N_INSNS (17)}, /* other */
424 COSTS_N_INSNS (1), /* cost of movsx */
425 COSTS_N_INSNS (1), /* cost of movzx */
426 8, /* "large" insn */
427 6, /* MOVE_RATIO */
428 2, /* cost for loading QImode using movzbl */
429 {4, 4, 4}, /* cost of loading integer registers
430 in QImode, HImode and SImode.
431 Relative to reg-reg move (2). */
432 {2, 2, 2}, /* cost of storing integer registers */
433 2, /* cost of reg,reg fld/fst */
434 {2, 2, 6}, /* cost of loading fp registers
435 in SFmode, DFmode and XFmode */
436 {4, 4, 6}, /* cost of storing fp registers
437 in SFmode, DFmode and XFmode */
438 2, /* cost of moving MMX register */
439 {2, 2}, /* cost of loading MMX registers
440 in SImode and DImode */
441 {2, 2}, /* cost of storing MMX registers
442 in SImode and DImode */
443 2, /* cost of moving SSE register */
444 {2, 2, 8}, /* cost of loading SSE registers
445 in SImode, DImode and TImode */
446 {2, 2, 8}, /* cost of storing SSE registers
447 in SImode, DImode and TImode */
448 3, /* MMX or SSE register to integer */
449 8, /* size of l1 cache. */
450 256, /* size of l2 cache */
451 32, /* size of prefetch block */
452 6, /* number of parallel prefetches */
453 2, /* Branch cost */
454 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
455 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
456 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
457 COSTS_N_INSNS (2), /* cost of FABS instruction. */
458 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
459 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
460 pentiumpro_memcpy,
461 pentiumpro_memset,
462 1, /* scalar_stmt_cost. */
463 1, /* scalar load_cost. */
464 1, /* scalar_store_cost. */
465 1, /* vec_stmt_cost. */
466 1, /* vec_to_scalar_cost. */
467 1, /* scalar_to_vec_cost. */
468 1, /* vec_align_load_cost. */
469 2, /* vec_unalign_load_cost. */
470 1, /* vec_store_cost. */
471 3, /* cond_taken_branch_cost. */
472 1, /* cond_not_taken_branch_cost. */
473 };
474
475 static stringop_algs geode_memcpy[2] = {
476 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
477 DUMMY_STRINGOP_ALGS};
478 static stringop_algs geode_memset[2] = {
479 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
480 DUMMY_STRINGOP_ALGS};
481 static const
482 struct processor_costs geode_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (1), /* cost of a lea instruction */
485 COSTS_N_INSNS (2), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (4), /* HI */
489 COSTS_N_INSNS (7), /* SI */
490 COSTS_N_INSNS (7), /* DI */
491 COSTS_N_INSNS (7)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (23), /* HI */
495 COSTS_N_INSNS (39), /* SI */
496 COSTS_N_INSNS (39), /* DI */
497 COSTS_N_INSNS (39)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
501 4, /* MOVE_RATIO */
502 1, /* cost for loading QImode using movzbl */
503 {1, 1, 1}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {1, 1, 1}, /* cost of storing integer registers */
507 1, /* cost of reg,reg fld/fst */
508 {1, 1, 1}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {4, 6, 6}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
512
513 1, /* cost of moving MMX register */
514 {1, 1}, /* cost of loading MMX registers
515 in SImode and DImode */
516 {1, 1}, /* cost of storing MMX registers
517 in SImode and DImode */
518 1, /* cost of moving SSE register */
519 {1, 1, 1}, /* cost of loading SSE registers
520 in SImode, DImode and TImode */
521 {1, 1, 1}, /* cost of storing SSE registers
522 in SImode, DImode and TImode */
523 1, /* MMX or SSE register to integer */
524 64, /* size of l1 cache. */
525 128, /* size of l2 cache. */
526 32, /* size of prefetch block */
527 1, /* number of parallel prefetches */
528 1, /* Branch cost */
529 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
530 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
531 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
532 COSTS_N_INSNS (1), /* cost of FABS instruction. */
533 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
534 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
535 geode_memcpy,
536 geode_memset,
537 1, /* scalar_stmt_cost. */
538 1, /* scalar load_cost. */
539 1, /* scalar_store_cost. */
540 1, /* vec_stmt_cost. */
541 1, /* vec_to_scalar_cost. */
542 1, /* scalar_to_vec_cost. */
543 1, /* vec_align_load_cost. */
544 2, /* vec_unalign_load_cost. */
545 1, /* vec_store_cost. */
546 3, /* cond_taken_branch_cost. */
547 1, /* cond_not_taken_branch_cost. */
548 };
549
550 static stringop_algs k6_memcpy[2] = {
551 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
552 DUMMY_STRINGOP_ALGS};
553 static stringop_algs k6_memset[2] = {
554 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
555 DUMMY_STRINGOP_ALGS};
556 static const
557 struct processor_costs k6_cost = {
558 COSTS_N_INSNS (1), /* cost of an add instruction */
559 COSTS_N_INSNS (2), /* cost of a lea instruction */
560 COSTS_N_INSNS (1), /* variable shift costs */
561 COSTS_N_INSNS (1), /* constant shift costs */
562 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
563 COSTS_N_INSNS (3), /* HI */
564 COSTS_N_INSNS (3), /* SI */
565 COSTS_N_INSNS (3), /* DI */
566 COSTS_N_INSNS (3)}, /* other */
567 0, /* cost of multiply per each bit set */
568 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
569 COSTS_N_INSNS (18), /* HI */
570 COSTS_N_INSNS (18), /* SI */
571 COSTS_N_INSNS (18), /* DI */
572 COSTS_N_INSNS (18)}, /* other */
573 COSTS_N_INSNS (2), /* cost of movsx */
574 COSTS_N_INSNS (2), /* cost of movzx */
575 8, /* "large" insn */
576 4, /* MOVE_RATIO */
577 3, /* cost for loading QImode using movzbl */
578 {4, 5, 4}, /* cost of loading integer registers
579 in QImode, HImode and SImode.
580 Relative to reg-reg move (2). */
581 {2, 3, 2}, /* cost of storing integer registers */
582 4, /* cost of reg,reg fld/fst */
583 {6, 6, 6}, /* cost of loading fp registers
584 in SFmode, DFmode and XFmode */
585 {4, 4, 4}, /* cost of storing fp registers
586 in SFmode, DFmode and XFmode */
587 2, /* cost of moving MMX register */
588 {2, 2}, /* cost of loading MMX registers
589 in SImode and DImode */
590 {2, 2}, /* cost of storing MMX registers
591 in SImode and DImode */
592 2, /* cost of moving SSE register */
593 {2, 2, 8}, /* cost of loading SSE registers
594 in SImode, DImode and TImode */
595 {2, 2, 8}, /* cost of storing SSE registers
596 in SImode, DImode and TImode */
597 6, /* MMX or SSE register to integer */
598 32, /* size of l1 cache. */
599 32, /* size of l2 cache. Some models
600 have integrated l2 cache, but
601 optimizing for k6 is not important
602 enough to worry about that. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (2), /* cost of FABS instruction. */
610 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
612 k6_memcpy,
613 k6_memset,
614 1, /* scalar_stmt_cost. */
615 1, /* scalar load_cost. */
616 1, /* scalar_store_cost. */
617 1, /* vec_stmt_cost. */
618 1, /* vec_to_scalar_cost. */
619 1, /* scalar_to_vec_cost. */
620 1, /* vec_align_load_cost. */
621 2, /* vec_unalign_load_cost. */
622 1, /* vec_store_cost. */
623 3, /* cond_taken_branch_cost. */
624 1, /* cond_not_taken_branch_cost. */
625 };
626
627 /* For some reason, Athlon deals better with the REP prefix (relative to
628    loops) than K8 does.  Alignment becomes important after 8 bytes for memcpy
629    and 128 bytes for memset. */
630 static stringop_algs athlon_memcpy[2] = {
631 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633 static stringop_algs athlon_memset[2] = {
634 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
636 static const
637 struct processor_costs athlon_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (5), /* HI */
644 COSTS_N_INSNS (5), /* SI */
645 COSTS_N_INSNS (5), /* DI */
646 COSTS_N_INSNS (5)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (26), /* HI */
650 COSTS_N_INSNS (42), /* SI */
651 COSTS_N_INSNS (74), /* DI */
652 COSTS_N_INSNS (74)}, /* other */
653 COSTS_N_INSNS (1), /* cost of movsx */
654 COSTS_N_INSNS (1), /* cost of movzx */
655 8, /* "large" insn */
656 9, /* MOVE_RATIO */
657 4, /* cost for loading QImode using movzbl */
658 {3, 4, 3}, /* cost of loading integer registers
659 in QImode, HImode and SImode.
660 Relative to reg-reg move (2). */
661 {3, 4, 3}, /* cost of storing integer registers */
662 4, /* cost of reg,reg fld/fst */
663 {4, 4, 12}, /* cost of loading fp registers
664 in SFmode, DFmode and XFmode */
665 {6, 6, 8}, /* cost of storing fp registers
666 in SFmode, DFmode and XFmode */
667 2, /* cost of moving MMX register */
668 {4, 4}, /* cost of loading MMX registers
669 in SImode and DImode */
670 {4, 4}, /* cost of storing MMX registers
671 in SImode and DImode */
672 2, /* cost of moving SSE register */
673 {4, 4, 6}, /* cost of loading SSE registers
674 in SImode, DImode and TImode */
675 {4, 4, 5}, /* cost of storing SSE registers
676 in SImode, DImode and TImode */
677 5, /* MMX or SSE register to integer */
678 64, /* size of l1 cache. */
679 256, /* size of l2 cache. */
680 64, /* size of prefetch block */
681 6, /* number of parallel prefetches */
682 5, /* Branch cost */
683 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
684 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
685 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
686 COSTS_N_INSNS (2), /* cost of FABS instruction. */
687 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
688 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
689 athlon_memcpy,
690 athlon_memset,
691 1, /* scalar_stmt_cost. */
692 1, /* scalar load_cost. */
693 1, /* scalar_store_cost. */
694 1, /* vec_stmt_cost. */
695 1, /* vec_to_scalar_cost. */
696 1, /* scalar_to_vec_cost. */
697 1, /* vec_align_load_cost. */
698 2, /* vec_unalign_load_cost. */
699 1, /* vec_store_cost. */
700 3, /* cond_taken_branch_cost. */
701 1, /* cond_not_taken_branch_cost. */
702 };
703
704 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
705    small blocks it is better to use a loop.  For large blocks, a libcall can
706    do non-temporal accesses and beat inline code considerably. */
707 static stringop_algs k8_memcpy[2] = {
708 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
709 {-1, rep_prefix_4_byte, false}}},
710 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
711 {-1, libcall, false}}}};
712 static stringop_algs k8_memset[2] = {
713 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
714 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
715 {libcall, {{48, unrolled_loop, false},
716 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
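
/* Editor's note: from K8 on, both rows are populated.  The second (64-bit)
   row above uses a short loop for the smallest blocks, rep movsq
   (rep_prefix_8_byte) up to 8 kB, and a libcall beyond that, matching the
   comment above about non-temporal libcall copies winning for large
   blocks.  */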
717 static const
718 struct processor_costs k8_cost = {
719 COSTS_N_INSNS (1), /* cost of an add instruction */
720 COSTS_N_INSNS (2), /* cost of a lea instruction */
721 COSTS_N_INSNS (1), /* variable shift costs */
722 COSTS_N_INSNS (1), /* constant shift costs */
723 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
724 COSTS_N_INSNS (4), /* HI */
725 COSTS_N_INSNS (3), /* SI */
726 COSTS_N_INSNS (4), /* DI */
727 COSTS_N_INSNS (5)}, /* other */
728 0, /* cost of multiply per each bit set */
729 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
730 COSTS_N_INSNS (26), /* HI */
731 COSTS_N_INSNS (42), /* SI */
732 COSTS_N_INSNS (74), /* DI */
733 COSTS_N_INSNS (74)}, /* other */
734 COSTS_N_INSNS (1), /* cost of movsx */
735 COSTS_N_INSNS (1), /* cost of movzx */
736 8, /* "large" insn */
737 9, /* MOVE_RATIO */
738 4, /* cost for loading QImode using movzbl */
739 {3, 4, 3}, /* cost of loading integer registers
740 in QImode, HImode and SImode.
741 Relative to reg-reg move (2). */
742 {3, 4, 3}, /* cost of storing integer registers */
743 4, /* cost of reg,reg fld/fst */
744 {4, 4, 12}, /* cost of loading fp registers
745 in SFmode, DFmode and XFmode */
746 {6, 6, 8}, /* cost of storing fp registers
747 in SFmode, DFmode and XFmode */
748 2, /* cost of moving MMX register */
749 {3, 3}, /* cost of loading MMX registers
750 in SImode and DImode */
751 {4, 4}, /* cost of storing MMX registers
752 in SImode and DImode */
753 2, /* cost of moving SSE register */
754 {4, 3, 6}, /* cost of loading SSE registers
755 in SImode, DImode and TImode */
756 {4, 4, 5}, /* cost of storing SSE registers
757 in SImode, DImode and TImode */
758 5, /* MMX or SSE register to integer */
759 64, /* size of l1 cache. */
760 512, /* size of l2 cache. */
761 64, /* size of prefetch block */
762   /* New AMD processors never drop prefetches; if they cannot be performed
763      immediately, they are queued.  We set the number of simultaneous prefetches
764      to a large constant to reflect this (it is probably not a good idea to
765      leave the number of prefetches unlimited, as their execution also takes
766      some time). */
767 100, /* number of parallel prefetches */
768 3, /* Branch cost */
769 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
770 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
771 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
772 COSTS_N_INSNS (2), /* cost of FABS instruction. */
773 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
774 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
775
776 k8_memcpy,
777 k8_memset,
778 4, /* scalar_stmt_cost. */
779 2, /* scalar load_cost. */
780 2, /* scalar_store_cost. */
781 5, /* vec_stmt_cost. */
782 0, /* vec_to_scalar_cost. */
783 2, /* scalar_to_vec_cost. */
784 2, /* vec_align_load_cost. */
785 3, /* vec_unalign_load_cost. */
786 3, /* vec_store_cost. */
787 3, /* cond_taken_branch_cost. */
788 2, /* cond_not_taken_branch_cost. */
789 };
790
791 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
792    very small blocks it is better to use a loop.  For large blocks, a libcall
793    can do non-temporal accesses and beat inline code considerably.  */
794 static stringop_algs amdfam10_memcpy[2] = {
795 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
796 {-1, rep_prefix_4_byte, false}}},
797 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
798 {-1, libcall, false}}}};
799 static stringop_algs amdfam10_memset[2] = {
800 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
801 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
802 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
803 {-1, libcall, false}}}};
804 struct processor_costs amdfam10_cost = {
805 COSTS_N_INSNS (1), /* cost of an add instruction */
806 COSTS_N_INSNS (2), /* cost of a lea instruction */
807 COSTS_N_INSNS (1), /* variable shift costs */
808 COSTS_N_INSNS (1), /* constant shift costs */
809 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
810 COSTS_N_INSNS (4), /* HI */
811 COSTS_N_INSNS (3), /* SI */
812 COSTS_N_INSNS (4), /* DI */
813 COSTS_N_INSNS (5)}, /* other */
814 0, /* cost of multiply per each bit set */
815 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
816 COSTS_N_INSNS (35), /* HI */
817 COSTS_N_INSNS (51), /* SI */
818 COSTS_N_INSNS (83), /* DI */
819 COSTS_N_INSNS (83)}, /* other */
820 COSTS_N_INSNS (1), /* cost of movsx */
821 COSTS_N_INSNS (1), /* cost of movzx */
822 8, /* "large" insn */
823 9, /* MOVE_RATIO */
824 4, /* cost for loading QImode using movzbl */
825 {3, 4, 3}, /* cost of loading integer registers
826 in QImode, HImode and SImode.
827 Relative to reg-reg move (2). */
828 {3, 4, 3}, /* cost of storing integer registers */
829 4, /* cost of reg,reg fld/fst */
830 {4, 4, 12}, /* cost of loading fp registers
831 in SFmode, DFmode and XFmode */
832 {6, 6, 8}, /* cost of storing fp registers
833 in SFmode, DFmode and XFmode */
834 2, /* cost of moving MMX register */
835 {3, 3}, /* cost of loading MMX registers
836 in SImode and DImode */
837 {4, 4}, /* cost of storing MMX registers
838 in SImode and DImode */
839 2, /* cost of moving SSE register */
840 {4, 4, 3}, /* cost of loading SSE registers
841 in SImode, DImode and TImode */
842 {4, 4, 5}, /* cost of storing SSE registers
843 in SImode, DImode and TImode */
844 3, /* MMX or SSE register to integer */
845 /* On K8:
846 MOVD reg64, xmmreg Double FSTORE 4
847 MOVD reg32, xmmreg Double FSTORE 4
848 On AMDFAM10:
849 MOVD reg64, xmmreg Double FADD 3
850 1/1 1/1
851 MOVD reg32, xmmreg Double FADD 3
852 1/1 1/1 */
853 64, /* size of l1 cache. */
854 512, /* size of l2 cache. */
855 64, /* size of prefetch block */
856   /* New AMD processors never drop prefetches; if they cannot be performed
857      immediately, they are queued.  We set the number of simultaneous prefetches
858      to a large constant to reflect this (it is probably not a good idea to
859      leave the number of prefetches unlimited, as their execution also takes
860      some time). */
861 100, /* number of parallel prefetches */
862 2, /* Branch cost */
863 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
864 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
865 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
866 COSTS_N_INSNS (2), /* cost of FABS instruction. */
867 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
868 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
869
870 amdfam10_memcpy,
871 amdfam10_memset,
872 4, /* scalar_stmt_cost. */
873 2, /* scalar load_cost. */
874 2, /* scalar_store_cost. */
875 6, /* vec_stmt_cost. */
876 0, /* vec_to_scalar_cost. */
877 2, /* scalar_to_vec_cost. */
878 2, /* vec_align_load_cost. */
879 2, /* vec_unalign_load_cost. */
880 2, /* vec_store_cost. */
881 2, /* cond_taken_branch_cost. */
882 1, /* cond_not_taken_branch_cost. */
883 };
884
885 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
886    very small blocks it is better to use a loop.  For large blocks, a libcall
887    can do non-temporal accesses and beat inline code considerably. */
888 static stringop_algs bdver1_memcpy[2] = {
889 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
890 {-1, rep_prefix_4_byte, false}}},
891 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
892 {-1, libcall, false}}}};
893 static stringop_algs bdver1_memset[2] = {
894 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
895 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
896 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
897 {-1, libcall, false}}}};
898
899 const struct processor_costs bdver1_cost = {
900 COSTS_N_INSNS (1), /* cost of an add instruction */
901 COSTS_N_INSNS (1), /* cost of a lea instruction */
902 COSTS_N_INSNS (1), /* variable shift costs */
903 COSTS_N_INSNS (1), /* constant shift costs */
904 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
905 COSTS_N_INSNS (4), /* HI */
906 COSTS_N_INSNS (4), /* SI */
907 COSTS_N_INSNS (6), /* DI */
908 COSTS_N_INSNS (6)}, /* other */
909 0, /* cost of multiply per each bit set */
910 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
911 COSTS_N_INSNS (35), /* HI */
912 COSTS_N_INSNS (51), /* SI */
913 COSTS_N_INSNS (83), /* DI */
914 COSTS_N_INSNS (83)}, /* other */
915 COSTS_N_INSNS (1), /* cost of movsx */
916 COSTS_N_INSNS (1), /* cost of movzx */
917 8, /* "large" insn */
918 9, /* MOVE_RATIO */
919 4, /* cost for loading QImode using movzbl */
920 {5, 5, 4}, /* cost of loading integer registers
921 in QImode, HImode and SImode.
922 Relative to reg-reg move (2). */
923 {4, 4, 4}, /* cost of storing integer registers */
924 2, /* cost of reg,reg fld/fst */
925 {5, 5, 12}, /* cost of loading fp registers
926 in SFmode, DFmode and XFmode */
927 {4, 4, 8}, /* cost of storing fp registers
928 in SFmode, DFmode and XFmode */
929 2, /* cost of moving MMX register */
930 {4, 4}, /* cost of loading MMX registers
931 in SImode and DImode */
932 {4, 4}, /* cost of storing MMX registers
933 in SImode and DImode */
934 2, /* cost of moving SSE register */
935 {4, 4, 4}, /* cost of loading SSE registers
936 in SImode, DImode and TImode */
937 {4, 4, 4}, /* cost of storing SSE registers
938 in SImode, DImode and TImode */
939 2, /* MMX or SSE register to integer */
940 /* On K8:
941 MOVD reg64, xmmreg Double FSTORE 4
942 MOVD reg32, xmmreg Double FSTORE 4
943 On AMDFAM10:
944 MOVD reg64, xmmreg Double FADD 3
945 1/1 1/1
946 MOVD reg32, xmmreg Double FADD 3
947 1/1 1/1 */
948 16, /* size of l1 cache. */
949 2048, /* size of l2 cache. */
950 64, /* size of prefetch block */
951   /* New AMD processors never drop prefetches; if they cannot be performed
952      immediately, they are queued.  We set the number of simultaneous prefetches
953      to a large constant to reflect this (it is probably not a good idea to
954      leave the number of prefetches unlimited, as their execution also takes
955      some time). */
956 100, /* number of parallel prefetches */
957 2, /* Branch cost */
958 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
959 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
960 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
961 COSTS_N_INSNS (2), /* cost of FABS instruction. */
962 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
963 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
964
965 bdver1_memcpy,
966 bdver1_memset,
967 6, /* scalar_stmt_cost. */
968 4, /* scalar load_cost. */
969 4, /* scalar_store_cost. */
970 6, /* vec_stmt_cost. */
971 0, /* vec_to_scalar_cost. */
972 2, /* scalar_to_vec_cost. */
973 4, /* vec_align_load_cost. */
974 4, /* vec_unalign_load_cost. */
975 4, /* vec_store_cost. */
976 2, /* cond_taken_branch_cost. */
977 1, /* cond_not_taken_branch_cost. */
978 };
979
980 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
981    very small blocks it is better to use a loop.  For large blocks, a libcall
982    can do non-temporal accesses and beat inline code considerably. */
983
984 static stringop_algs bdver2_memcpy[2] = {
985 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
986 {-1, rep_prefix_4_byte, false}}},
987 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
988 {-1, libcall, false}}}};
989 static stringop_algs bdver2_memset[2] = {
990 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
991 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
992 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
993 {-1, libcall, false}}}};
994
995 const struct processor_costs bdver2_cost = {
996 COSTS_N_INSNS (1), /* cost of an add instruction */
997 COSTS_N_INSNS (1), /* cost of a lea instruction */
998 COSTS_N_INSNS (1), /* variable shift costs */
999 COSTS_N_INSNS (1), /* constant shift costs */
1000 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1001 COSTS_N_INSNS (4), /* HI */
1002 COSTS_N_INSNS (4), /* SI */
1003 COSTS_N_INSNS (6), /* DI */
1004 COSTS_N_INSNS (6)}, /* other */
1005 0, /* cost of multiply per each bit set */
1006 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1007 COSTS_N_INSNS (35), /* HI */
1008 COSTS_N_INSNS (51), /* SI */
1009 COSTS_N_INSNS (83), /* DI */
1010 COSTS_N_INSNS (83)}, /* other */
1011 COSTS_N_INSNS (1), /* cost of movsx */
1012 COSTS_N_INSNS (1), /* cost of movzx */
1013 8, /* "large" insn */
1014 9, /* MOVE_RATIO */
1015 4, /* cost for loading QImode using movzbl */
1016 {5, 5, 4}, /* cost of loading integer registers
1017 in QImode, HImode and SImode.
1018 Relative to reg-reg move (2). */
1019 {4, 4, 4}, /* cost of storing integer registers */
1020 2, /* cost of reg,reg fld/fst */
1021 {5, 5, 12}, /* cost of loading fp registers
1022 in SFmode, DFmode and XFmode */
1023 {4, 4, 8}, /* cost of storing fp registers
1024 in SFmode, DFmode and XFmode */
1025 2, /* cost of moving MMX register */
1026 {4, 4}, /* cost of loading MMX registers
1027 in SImode and DImode */
1028 {4, 4}, /* cost of storing MMX registers
1029 in SImode and DImode */
1030 2, /* cost of moving SSE register */
1031 {4, 4, 4}, /* cost of loading SSE registers
1032 in SImode, DImode and TImode */
1033 {4, 4, 4}, /* cost of storing SSE registers
1034 in SImode, DImode and TImode */
1035 2, /* MMX or SSE register to integer */
1036 /* On K8:
1037 MOVD reg64, xmmreg Double FSTORE 4
1038 MOVD reg32, xmmreg Double FSTORE 4
1039 On AMDFAM10:
1040 MOVD reg64, xmmreg Double FADD 3
1041 1/1 1/1
1042 MOVD reg32, xmmreg Double FADD 3
1043 1/1 1/1 */
1044 16, /* size of l1 cache. */
1045 2048, /* size of l2 cache. */
1046 64, /* size of prefetch block */
1047   /* New AMD processors never drop prefetches; if they cannot be performed
1048      immediately, they are queued.  We set the number of simultaneous prefetches
1049      to a large constant to reflect this (it is probably not a good idea to
1050      leave the number of prefetches unlimited, as their execution also takes
1051      some time). */
1052 100, /* number of parallel prefetches */
1053 2, /* Branch cost */
1054 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1055 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1056 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1057 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1058 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1059 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1060
1061 bdver2_memcpy,
1062 bdver2_memset,
1063 6, /* scalar_stmt_cost. */
1064 4, /* scalar load_cost. */
1065 4, /* scalar_store_cost. */
1066 6, /* vec_stmt_cost. */
1067 0, /* vec_to_scalar_cost. */
1068 2, /* scalar_to_vec_cost. */
1069 4, /* vec_align_load_cost. */
1070 4, /* vec_unalign_load_cost. */
1071 4, /* vec_store_cost. */
1072 2, /* cond_taken_branch_cost. */
1073 1, /* cond_not_taken_branch_cost. */
1074 };
1075
1076
1077 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1078    very small blocks it is better to use a loop.  For large blocks, a libcall
1079    can do non-temporal accesses and beat inline code considerably. */
1080 static stringop_algs bdver3_memcpy[2] = {
1081 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1082 {-1, rep_prefix_4_byte, false}}},
1083 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1084 {-1, libcall, false}}}};
1085 static stringop_algs bdver3_memset[2] = {
1086 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1087 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1088 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1089 {-1, libcall, false}}}};
1090 struct processor_costs bdver3_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (1), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (4), /* SI */
1098 COSTS_N_INSNS (6), /* DI */
1099 COSTS_N_INSNS (6)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (35), /* HI */
1103 COSTS_N_INSNS (51), /* SI */
1104 COSTS_N_INSNS (83), /* DI */
1105 COSTS_N_INSNS (83)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1109 9, /* MOVE_RATIO */
1110 4, /* cost for loading QImode using movzbl */
1111 {5, 5, 4}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {4, 4, 4}, /* cost of storing integer registers */
1115 2, /* cost of reg,reg fld/fst */
1116 {5, 5, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {4, 4, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {4, 4}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 4, 4}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 4}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 2, /* MMX or SSE register to integer */
1131 16, /* size of l1 cache. */
1132 2048, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134   /* New AMD processors never drop prefetches; if they cannot be performed
1135      immediately, they are queued.  We set the number of simultaneous prefetches
1136      to a large constant to reflect this (it is probably not a good idea to
1137      leave the number of prefetches unlimited, as their execution also takes
1138      some time). */
1139 100, /* number of parallel prefetches */
1140 2, /* Branch cost */
1141 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1147
1148 bdver3_memcpy,
1149 bdver3_memset,
1150 6, /* scalar_stmt_cost. */
1151 4, /* scalar load_cost. */
1152 4, /* scalar_store_cost. */
1153 6, /* vec_stmt_cost. */
1154 0, /* vec_to_scalar_cost. */
1155 2, /* scalar_to_vec_cost. */
1156 4, /* vec_align_load_cost. */
1157 4, /* vec_unalign_load_cost. */
1158 4, /* vec_store_cost. */
1159 2, /* cond_taken_branch_cost. */
1160 1, /* cond_not_taken_branch_cost. */
1161 };
1162
1163 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1164    very small blocks it is better to use a loop.  For large blocks, a libcall
1165    can do non-temporal accesses and beat inline code considerably. */
1166 static stringop_algs btver1_memcpy[2] = {
1167 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1168 {-1, rep_prefix_4_byte, false}}},
1169 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}};
1171 static stringop_algs btver1_memset[2] = {
1172 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1173 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1174 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1175 {-1, libcall, false}}}};
1176 const struct processor_costs btver1_cost = {
1177 COSTS_N_INSNS (1), /* cost of an add instruction */
1178 COSTS_N_INSNS (2), /* cost of a lea instruction */
1179 COSTS_N_INSNS (1), /* variable shift costs */
1180 COSTS_N_INSNS (1), /* constant shift costs */
1181 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1182 COSTS_N_INSNS (4), /* HI */
1183 COSTS_N_INSNS (3), /* SI */
1184 COSTS_N_INSNS (4), /* DI */
1185 COSTS_N_INSNS (5)}, /* other */
1186 0, /* cost of multiply per each bit set */
1187 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1188 COSTS_N_INSNS (35), /* HI */
1189 COSTS_N_INSNS (51), /* SI */
1190 COSTS_N_INSNS (83), /* DI */
1191 COSTS_N_INSNS (83)}, /* other */
1192 COSTS_N_INSNS (1), /* cost of movsx */
1193 COSTS_N_INSNS (1), /* cost of movzx */
1194 8, /* "large" insn */
1195 9, /* MOVE_RATIO */
1196 4, /* cost for loading QImode using movzbl */
1197 {3, 4, 3}, /* cost of loading integer registers
1198 in QImode, HImode and SImode.
1199 Relative to reg-reg move (2). */
1200 {3, 4, 3}, /* cost of storing integer registers */
1201 4, /* cost of reg,reg fld/fst */
1202 {4, 4, 12}, /* cost of loading fp registers
1203 in SFmode, DFmode and XFmode */
1204 {6, 6, 8}, /* cost of storing fp registers
1205 in SFmode, DFmode and XFmode */
1206 2, /* cost of moving MMX register */
1207 {3, 3}, /* cost of loading MMX registers
1208 in SImode and DImode */
1209 {4, 4}, /* cost of storing MMX registers
1210 in SImode and DImode */
1211 2, /* cost of moving SSE register */
1212 {4, 4, 3}, /* cost of loading SSE registers
1213 in SImode, DImode and TImode */
1214 {4, 4, 5}, /* cost of storing SSE registers
1215 in SImode, DImode and TImode */
1216 3, /* MMX or SSE register to integer */
1217 /* On K8:
1218 MOVD reg64, xmmreg Double FSTORE 4
1219 MOVD reg32, xmmreg Double FSTORE 4
1220 On AMDFAM10:
1221 MOVD reg64, xmmreg Double FADD 3
1222 1/1 1/1
1223 MOVD reg32, xmmreg Double FADD 3
1224 1/1 1/1 */
1225 32, /* size of l1 cache. */
1226 512, /* size of l2 cache. */
1227 64, /* size of prefetch block */
1228 100, /* number of parallel prefetches */
1229 2, /* Branch cost */
1230 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1231 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1232 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1233 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1234 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1235 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1236
1237 btver1_memcpy,
1238 btver1_memset,
1239 4, /* scalar_stmt_cost. */
1240 2, /* scalar load_cost. */
1241 2, /* scalar_store_cost. */
1242 6, /* vec_stmt_cost. */
1243 0, /* vec_to_scalar_cost. */
1244 2, /* scalar_to_vec_cost. */
1245 2, /* vec_align_load_cost. */
1246 2, /* vec_unalign_load_cost. */
1247 2, /* vec_store_cost. */
1248 2, /* cond_taken_branch_cost. */
1249 1, /* cond_not_taken_branch_cost. */
1250 };
1251
1252 static stringop_algs btver2_memcpy[2] = {
1253 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1254 {-1, rep_prefix_4_byte, false}}},
1255 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1256 {-1, libcall, false}}}};
1257 static stringop_algs btver2_memset[2] = {
1258 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1259 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1260 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1261 {-1, libcall, false}}}};
1262 const struct processor_costs btver2_cost = {
1263 COSTS_N_INSNS (1), /* cost of an add instruction */
1264 COSTS_N_INSNS (2), /* cost of a lea instruction */
1265 COSTS_N_INSNS (1), /* variable shift costs */
1266 COSTS_N_INSNS (1), /* constant shift costs */
1267 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1268 COSTS_N_INSNS (4), /* HI */
1269 COSTS_N_INSNS (3), /* SI */
1270 COSTS_N_INSNS (4), /* DI */
1271 COSTS_N_INSNS (5)}, /* other */
1272 0, /* cost of multiply per each bit set */
1273 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1274 COSTS_N_INSNS (35), /* HI */
1275 COSTS_N_INSNS (51), /* SI */
1276 COSTS_N_INSNS (83), /* DI */
1277 COSTS_N_INSNS (83)}, /* other */
1278 COSTS_N_INSNS (1), /* cost of movsx */
1279 COSTS_N_INSNS (1), /* cost of movzx */
1280 8, /* "large" insn */
1281 9, /* MOVE_RATIO */
1282 4, /* cost for loading QImode using movzbl */
1283 {3, 4, 3}, /* cost of loading integer registers
1284 in QImode, HImode and SImode.
1285 Relative to reg-reg move (2). */
1286 {3, 4, 3}, /* cost of storing integer registers */
1287 4, /* cost of reg,reg fld/fst */
1288 {4, 4, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode */
1290 {6, 6, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode */
1292 2, /* cost of moving MMX register */
1293 {3, 3}, /* cost of loading MMX registers
1294 in SImode and DImode */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode */
1297 2, /* cost of moving SSE register */
1298 {4, 4, 3}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode */
1300 {4, 4, 5}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode */
1302 3, /* MMX or SSE register to integer */
1303 /* On K8:
1304 MOVD reg64, xmmreg Double FSTORE 4
1305 MOVD reg32, xmmreg Double FSTORE 4
1306 On AMDFAM10:
1307 MOVD reg64, xmmreg Double FADD 3
1308 1/1 1/1
1309 MOVD reg32, xmmreg Double FADD 3
1310 1/1 1/1 */
1311 32, /* size of l1 cache. */
1312 2048, /* size of l2 cache. */
1313 64, /* size of prefetch block */
1314 100, /* number of parallel prefetches */
1315 2, /* Branch cost */
1316 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1317 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1318 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1319 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1320 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1321 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1322 btver2_memcpy,
1323 btver2_memset,
1324 4, /* scalar_stmt_cost. */
1325 2, /* scalar load_cost. */
1326 2, /* scalar_store_cost. */
1327 6, /* vec_stmt_cost. */
1328 0, /* vec_to_scalar_cost. */
1329 2, /* scalar_to_vec_cost. */
1330 2, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 2, /* vec_store_cost. */
1333 2, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1336
1337 static stringop_algs pentium4_memcpy[2] = {
1338 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1339 DUMMY_STRINGOP_ALGS};
1340 static stringop_algs pentium4_memset[2] = {
1341 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1342 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1343 DUMMY_STRINGOP_ALGS};
1344
1345 static const
1346 struct processor_costs pentium4_cost = {
1347 COSTS_N_INSNS (1), /* cost of an add instruction */
1348 COSTS_N_INSNS (3), /* cost of a lea instruction */
1349 COSTS_N_INSNS (4), /* variable shift costs */
1350 COSTS_N_INSNS (4), /* constant shift costs */
1351 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1352 COSTS_N_INSNS (15), /* HI */
1353 COSTS_N_INSNS (15), /* SI */
1354 COSTS_N_INSNS (15), /* DI */
1355 COSTS_N_INSNS (15)}, /* other */
1356 0, /* cost of multiply per each bit set */
1357 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1358 COSTS_N_INSNS (56), /* HI */
1359 COSTS_N_INSNS (56), /* SI */
1360 COSTS_N_INSNS (56), /* DI */
1361 COSTS_N_INSNS (56)}, /* other */
1362 COSTS_N_INSNS (1), /* cost of movsx */
1363 COSTS_N_INSNS (1), /* cost of movzx */
1364 16, /* "large" insn */
1365 6, /* MOVE_RATIO */
1366 2, /* cost for loading QImode using movzbl */
1367 {4, 5, 4}, /* cost of loading integer registers
1368 in QImode, HImode and SImode.
1369 Relative to reg-reg move (2). */
1370 {2, 3, 2}, /* cost of storing integer registers */
1371 2, /* cost of reg,reg fld/fst */
1372 {2, 2, 6}, /* cost of loading fp registers
1373 in SFmode, DFmode and XFmode */
1374 {4, 4, 6}, /* cost of storing fp registers
1375 in SFmode, DFmode and XFmode */
1376 2, /* cost of moving MMX register */
1377 {2, 2}, /* cost of loading MMX registers
1378 in SImode and DImode */
1379 {2, 2}, /* cost of storing MMX registers
1380 in SImode and DImode */
1381 12, /* cost of moving SSE register */
1382 {12, 12, 12}, /* cost of loading SSE registers
1383 in SImode, DImode and TImode */
1384 {2, 2, 8}, /* cost of storing SSE registers
1385 in SImode, DImode and TImode */
1386 10, /* MMX or SSE register to integer */
1387 8, /* size of l1 cache. */
1388 256, /* size of l2 cache. */
1389 64, /* size of prefetch block */
1390 6, /* number of parallel prefetches */
1391 2, /* Branch cost */
1392 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1393 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1394 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1395 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1396 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1397 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1398 pentium4_memcpy,
1399 pentium4_memset,
1400 1, /* scalar_stmt_cost. */
1401 1, /* scalar load_cost. */
1402 1, /* scalar_store_cost. */
1403 1, /* vec_stmt_cost. */
1404 1, /* vec_to_scalar_cost. */
1405 1, /* scalar_to_vec_cost. */
1406 1, /* vec_align_load_cost. */
1407 2, /* vec_unalign_load_cost. */
1408 1, /* vec_store_cost. */
1409 3, /* cond_taken_branch_cost. */
1410 1, /* cond_not_taken_branch_cost. */
1411 };
1412
1413 static stringop_algs nocona_memcpy[2] = {
1414 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1415 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1416 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1417
1418 static stringop_algs nocona_memset[2] = {
1419 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1420 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1421 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1422 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1423
1424 static const
1425 struct processor_costs nocona_cost = {
1426 COSTS_N_INSNS (1), /* cost of an add instruction */
1427 COSTS_N_INSNS (1), /* cost of a lea instruction */
1428 COSTS_N_INSNS (1), /* variable shift costs */
1429 COSTS_N_INSNS (1), /* constant shift costs */
1430 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1431 COSTS_N_INSNS (10), /* HI */
1432 COSTS_N_INSNS (10), /* SI */
1433 COSTS_N_INSNS (10), /* DI */
1434 COSTS_N_INSNS (10)}, /* other */
1435 0, /* cost of multiply per each bit set */
1436 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1437 COSTS_N_INSNS (66), /* HI */
1438 COSTS_N_INSNS (66), /* SI */
1439 COSTS_N_INSNS (66), /* DI */
1440 COSTS_N_INSNS (66)}, /* other */
1441 COSTS_N_INSNS (1), /* cost of movsx */
1442 COSTS_N_INSNS (1), /* cost of movzx */
1443 16, /* "large" insn */
1444 17, /* MOVE_RATIO */
1445 4, /* cost for loading QImode using movzbl */
1446 {4, 4, 4}, /* cost of loading integer registers
1447 in QImode, HImode and SImode.
1448 Relative to reg-reg move (2). */
1449 {4, 4, 4}, /* cost of storing integer registers */
1450 3, /* cost of reg,reg fld/fst */
1451 {12, 12, 12}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode */
1453 {4, 4, 4}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode */
1455 6, /* cost of moving MMX register */
1456 {12, 12}, /* cost of loading MMX registers
1457 in SImode and DImode */
1458 {12, 12}, /* cost of storing MMX registers
1459 in SImode and DImode */
1460 6, /* cost of moving SSE register */
1461 {12, 12, 12}, /* cost of loading SSE registers
1462 in SImode, DImode and TImode */
1463 {12, 12, 12}, /* cost of storing SSE registers
1464 in SImode, DImode and TImode */
1465 8, /* MMX or SSE register to integer */
1466 8, /* size of l1 cache. */
1467 1024, /* size of l2 cache. */
1468 128, /* size of prefetch block */
1469 8, /* number of parallel prefetches */
1470 1, /* Branch cost */
1471 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1472 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1473 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1474 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1475 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1476 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1477 nocona_memcpy,
1478 nocona_memset,
1479 1, /* scalar_stmt_cost. */
1480 1, /* scalar load_cost. */
1481 1, /* scalar_store_cost. */
1482 1, /* vec_stmt_cost. */
1483 1, /* vec_to_scalar_cost. */
1484 1, /* scalar_to_vec_cost. */
1485 1, /* vec_align_load_cost. */
1486 2, /* vec_unalign_load_cost. */
1487 1, /* vec_store_cost. */
1488 3, /* cond_taken_branch_cost. */
1489 1, /* cond_not_taken_branch_cost. */
1490 };
1491
1492 static stringop_algs atom_memcpy[2] = {
1493 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1494 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1495 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1496 static stringop_algs atom_memset[2] = {
1497 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1498 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1499 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1500 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1501 static const
1502 struct processor_costs atom_cost = {
1503 COSTS_N_INSNS (1), /* cost of an add instruction */
1504 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1505 COSTS_N_INSNS (1), /* variable shift costs */
1506 COSTS_N_INSNS (1), /* constant shift costs */
1507 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1508 COSTS_N_INSNS (4), /* HI */
1509 COSTS_N_INSNS (3), /* SI */
1510 COSTS_N_INSNS (4), /* DI */
1511 COSTS_N_INSNS (2)}, /* other */
1512 0, /* cost of multiply per each bit set */
1513 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1514 COSTS_N_INSNS (26), /* HI */
1515 COSTS_N_INSNS (42), /* SI */
1516 COSTS_N_INSNS (74), /* DI */
1517 COSTS_N_INSNS (74)}, /* other */
1518 COSTS_N_INSNS (1), /* cost of movsx */
1519 COSTS_N_INSNS (1), /* cost of movzx */
1520 8, /* "large" insn */
1521 17, /* MOVE_RATIO */
1522 4, /* cost for loading QImode using movzbl */
1523 {4, 4, 4}, /* cost of loading integer registers
1524 in QImode, HImode and SImode.
1525 Relative to reg-reg move (2). */
1526 {4, 4, 4}, /* cost of storing integer registers */
1527 4, /* cost of reg,reg fld/fst */
1528 {12, 12, 12}, /* cost of loading fp registers
1529 in SFmode, DFmode and XFmode */
1530 {6, 6, 8}, /* cost of storing fp registers
1531 in SFmode, DFmode and XFmode */
1532 2, /* cost of moving MMX register */
1533 {8, 8}, /* cost of loading MMX registers
1534 in SImode and DImode */
1535 {8, 8}, /* cost of storing MMX registers
1536 in SImode and DImode */
1537 2, /* cost of moving SSE register */
1538 {8, 8, 8}, /* cost of loading SSE registers
1539 in SImode, DImode and TImode */
1540 {8, 8, 8}, /* cost of storing SSE registers
1541 in SImode, DImode and TImode */
1542 5, /* MMX or SSE register to integer */
1543 32, /* size of l1 cache. */
1544 256, /* size of l2 cache. */
1545 64, /* size of prefetch block */
1546 6, /* number of parallel prefetches */
1547 3, /* Branch cost */
1548 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1549 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1550 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1551 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1552 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1553 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1554 atom_memcpy,
1555 atom_memset,
1556 1, /* scalar_stmt_cost. */
1557 1, /* scalar load_cost. */
1558 1, /* scalar_store_cost. */
1559 1, /* vec_stmt_cost. */
1560 1, /* vec_to_scalar_cost. */
1561 1, /* scalar_to_vec_cost. */
1562 1, /* vec_align_load_cost. */
1563 2, /* vec_unalign_load_cost. */
1564 1, /* vec_store_cost. */
1565 3, /* cond_taken_branch_cost. */
1566 1, /* cond_not_taken_branch_cost. */
1567 };
1568
1569 static stringop_algs slm_memcpy[2] = {
1570 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1571 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1572 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1573 static stringop_algs slm_memset[2] = {
1574 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1575 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1576 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1577 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1578 static const
1579 struct processor_costs slm_cost = {
1580 COSTS_N_INSNS (1), /* cost of an add instruction */
1581 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1582 COSTS_N_INSNS (1), /* variable shift costs */
1583 COSTS_N_INSNS (1), /* constant shift costs */
1584 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1585 COSTS_N_INSNS (4), /* HI */
1586 COSTS_N_INSNS (3), /* SI */
1587 COSTS_N_INSNS (4), /* DI */
1588 COSTS_N_INSNS (2)}, /* other */
1589 0, /* cost of multiply per each bit set */
1590 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1591 COSTS_N_INSNS (26), /* HI */
1592 COSTS_N_INSNS (42), /* SI */
1593 COSTS_N_INSNS (74), /* DI */
1594 COSTS_N_INSNS (74)}, /* other */
1595 COSTS_N_INSNS (1), /* cost of movsx */
1596 COSTS_N_INSNS (1), /* cost of movzx */
1597 8, /* "large" insn */
1598 17, /* MOVE_RATIO */
1599 4, /* cost for loading QImode using movzbl */
1600 {4, 4, 4}, /* cost of loading integer registers
1601 in QImode, HImode and SImode.
1602 Relative to reg-reg move (2). */
1603 {4, 4, 4}, /* cost of storing integer registers */
1604 4, /* cost of reg,reg fld/fst */
1605 {12, 12, 12}, /* cost of loading fp registers
1606 in SFmode, DFmode and XFmode */
1607 {6, 6, 8}, /* cost of storing fp registers
1608 in SFmode, DFmode and XFmode */
1609 2, /* cost of moving MMX register */
1610 {8, 8}, /* cost of loading MMX registers
1611 in SImode and DImode */
1612 {8, 8}, /* cost of storing MMX registers
1613 in SImode and DImode */
1614 2, /* cost of moving SSE register */
1615 {8, 8, 8}, /* cost of loading SSE registers
1616 in SImode, DImode and TImode */
1617 {8, 8, 8}, /* cost of storing SSE registers
1618 in SImode, DImode and TImode */
1619 5, /* MMX or SSE register to integer */
1620 32, /* size of l1 cache. */
1621 256, /* size of l2 cache. */
1622 64, /* size of prefetch block */
1623 6, /* number of parallel prefetches */
1624 3, /* Branch cost */
1625 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1626 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1627 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1628 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1629 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1630 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1631 slm_memcpy,
1632 slm_memset,
1633 1, /* scalar_stmt_cost. */
1634 1, /* scalar load_cost. */
1635 1, /* scalar_store_cost. */
1636 1, /* vec_stmt_cost. */
1637 1, /* vec_to_scalar_cost. */
1638 1, /* scalar_to_vec_cost. */
1639 1, /* vec_align_load_cost. */
1640 2, /* vec_unalign_load_cost. */
1641 1, /* vec_store_cost. */
1642 3, /* cond_taken_branch_cost. */
1643 1, /* cond_not_taken_branch_cost. */
1644 };
1645
1646 /* Generic should produce code tuned for Core-i7 (and newer chips)
1647 and btver1 (and newer chips). */
1648
1649 static stringop_algs generic_memcpy[2] = {
1650 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1651 {-1, libcall, false}}},
1652 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1653 {-1, libcall, false}}}};
1654 static stringop_algs generic_memset[2] = {
1655 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1656 {-1, libcall, false}}},
1657 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1658 {-1, libcall, false}}}};
1659 static const
1660 struct processor_costs generic_cost = {
1661 COSTS_N_INSNS (1), /* cost of an add instruction */
1662 /* On all chips taken into consideration, lea is 2 cycles or more. With
1663 this cost, however, our current implementation of synth_mult results in
1664 the use of unnecessary temporary registers, causing regressions on several
1665 SPECfp benchmarks. */
1666 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1667 COSTS_N_INSNS (1), /* variable shift costs */
1668 COSTS_N_INSNS (1), /* constant shift costs */
1669 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1670 COSTS_N_INSNS (4), /* HI */
1671 COSTS_N_INSNS (3), /* SI */
1672 COSTS_N_INSNS (4), /* DI */
1673 COSTS_N_INSNS (2)}, /* other */
1674 0, /* cost of multiply per each bit set */
1675 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1676 COSTS_N_INSNS (26), /* HI */
1677 COSTS_N_INSNS (42), /* SI */
1678 COSTS_N_INSNS (74), /* DI */
1679 COSTS_N_INSNS (74)}, /* other */
1680 COSTS_N_INSNS (1), /* cost of movsx */
1681 COSTS_N_INSNS (1), /* cost of movzx */
1682 8, /* "large" insn */
1683 17, /* MOVE_RATIO */
1684 4, /* cost for loading QImode using movzbl */
1685 {4, 4, 4}, /* cost of loading integer registers
1686 in QImode, HImode and SImode.
1687 Relative to reg-reg move (2). */
1688 {4, 4, 4}, /* cost of storing integer registers */
1689 4, /* cost of reg,reg fld/fst */
1690 {12, 12, 12}, /* cost of loading fp registers
1691 in SFmode, DFmode and XFmode */
1692 {6, 6, 8}, /* cost of storing fp registers
1693 in SFmode, DFmode and XFmode */
1694 2, /* cost of moving MMX register */
1695 {8, 8}, /* cost of loading MMX registers
1696 in SImode and DImode */
1697 {8, 8}, /* cost of storing MMX registers
1698 in SImode and DImode */
1699 2, /* cost of moving SSE register */
1700 {8, 8, 8}, /* cost of loading SSE registers
1701 in SImode, DImode and TImode */
1702 {8, 8, 8}, /* cost of storing SSE registers
1703 in SImode, DImode and TImode */
1704 5, /* MMX or SSE register to integer */
1705 32, /* size of l1 cache. */
1706 512, /* size of l2 cache. */
1707 64, /* size of prefetch block */
1708 6, /* number of parallel prefetches */
1709 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1710 value is increased to the perhaps more appropriate value of 5. */
1711 3, /* Branch cost */
1712 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1713 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1714 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1715 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1716 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1717 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1718 generic_memcpy,
1719 generic_memset,
1720 1, /* scalar_stmt_cost. */
1721 1, /* scalar load_cost. */
1722 1, /* scalar_store_cost. */
1723 1, /* vec_stmt_cost. */
1724 1, /* vec_to_scalar_cost. */
1725 1, /* scalar_to_vec_cost. */
1726 1, /* vec_align_load_cost. */
1727 2, /* vec_unalign_load_cost. */
1728 1, /* vec_store_cost. */
1729 3, /* cond_taken_branch_cost. */
1730 1, /* cond_not_taken_branch_cost. */
1731 };
1732
1733 /* core_cost should produce code tuned for the Core family of CPUs. */
1734 static stringop_algs core_memcpy[2] = {
1735 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1736 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1737 {-1, libcall, false}}}};
1738 static stringop_algs core_memset[2] = {
1739 {libcall, {{6, loop_1_byte, true},
1740 {24, loop, true},
1741 {8192, rep_prefix_4_byte, true},
1742 {-1, libcall, false}}},
1743 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1744 {-1, libcall, false}}}};
1745
1746 static const
1747 struct processor_costs core_cost = {
1748 COSTS_N_INSNS (1), /* cost of an add instruction */
1749 /* On all chips taken into consideration, lea is 2 cycles or more. With
1750 this cost, however, our current implementation of synth_mult results in
1751 the use of unnecessary temporary registers, causing regressions on several
1752 SPECfp benchmarks. */
1753 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1754 COSTS_N_INSNS (1), /* variable shift costs */
1755 COSTS_N_INSNS (1), /* constant shift costs */
1756 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1757 COSTS_N_INSNS (4), /* HI */
1758 COSTS_N_INSNS (3), /* SI */
1759 COSTS_N_INSNS (4), /* DI */
1760 COSTS_N_INSNS (2)}, /* other */
1761 0, /* cost of multiply per each bit set */
1762 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1763 COSTS_N_INSNS (26), /* HI */
1764 COSTS_N_INSNS (42), /* SI */
1765 COSTS_N_INSNS (74), /* DI */
1766 COSTS_N_INSNS (74)}, /* other */
1767 COSTS_N_INSNS (1), /* cost of movsx */
1768 COSTS_N_INSNS (1), /* cost of movzx */
1769 8, /* "large" insn */
1770 17, /* MOVE_RATIO */
1771 4, /* cost for loading QImode using movzbl */
1772 {4, 4, 4}, /* cost of loading integer registers
1773 in QImode, HImode and SImode.
1774 Relative to reg-reg move (2). */
1775 {4, 4, 4}, /* cost of storing integer registers */
1776 4, /* cost of reg,reg fld/fst */
1777 {12, 12, 12}, /* cost of loading fp registers
1778 in SFmode, DFmode and XFmode */
1779 {6, 6, 8}, /* cost of storing fp registers
1780 in SFmode, DFmode and XFmode */
1781 2, /* cost of moving MMX register */
1782 {8, 8}, /* cost of loading MMX registers
1783 in SImode and DImode */
1784 {8, 8}, /* cost of storing MMX registers
1785 in SImode and DImode */
1786 2, /* cost of moving SSE register */
1787 {8, 8, 8}, /* cost of loading SSE registers
1788 in SImode, DImode and TImode */
1789 {8, 8, 8}, /* cost of storing SSE registers
1790 in SImode, DImode and TImode */
1791 5, /* MMX or SSE register to integer */
1792 64, /* size of l1 cache. */
1793 512, /* size of l2 cache. */
1794 64, /* size of prefetch block */
1795 6, /* number of parallel prefetches */
1796 /* FIXME perhaps more appropriate value is 5. */
1797 3, /* Branch cost */
1798 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1799 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1800 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1801 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1802 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1803 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1804 core_memcpy,
1805 core_memset,
1806 1, /* scalar_stmt_cost. */
1807 1, /* scalar load_cost. */
1808 1, /* scalar_store_cost. */
1809 1, /* vec_stmt_cost. */
1810 1, /* vec_to_scalar_cost. */
1811 1, /* scalar_to_vec_cost. */
1812 1, /* vec_align_load_cost. */
1813 2, /* vec_unalign_load_cost. */
1814 1, /* vec_store_cost. */
1815 3, /* cond_taken_branch_cost. */
1816 1, /* cond_not_taken_branch_cost. */
1817 };
1818
1819
1820 /* Set by -mtune. */
1821 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1822
1823 /* Set by -mtune or -Os. */
1824 const struct processor_costs *ix86_cost = &pentium_cost;
1825
1826 /* Processor feature/optimization bitmasks. */
1827 #define m_386 (1<<PROCESSOR_I386)
1828 #define m_486 (1<<PROCESSOR_I486)
1829 #define m_PENT (1<<PROCESSOR_PENTIUM)
1830 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1831 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1832 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1833 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1834 #define m_CORE2 (1<<PROCESSOR_CORE2)
1835 #define m_COREI7 (1<<PROCESSOR_COREI7)
1836 #define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
1837 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1838 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
1839 #define m_ATOM (1<<PROCESSOR_ATOM)
1840 #define m_SLM (1<<PROCESSOR_SLM)
1841
1842 #define m_GEODE (1<<PROCESSOR_GEODE)
1843 #define m_K6 (1<<PROCESSOR_K6)
1844 #define m_K6_GEODE (m_K6 | m_GEODE)
1845 #define m_K8 (1<<PROCESSOR_K8)
1846 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1847 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1848 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1849 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1850 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1851 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1852 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1853 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1854 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1855 #define m_BTVER (m_BTVER1 | m_BTVER2)
1856 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1857
1858 #define m_GENERIC (1<<PROCESSOR_GENERIC)
1859
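/* Names of the x86 tuning knobs, indexed by X86_TUNE_*.  Each DEF_TUNE
(tune, name, selector) entry in x86-tune.def expands to its NAME string
here and to its SELECTOR processor mask in initial_ix86_tune_features
below.  */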
1860 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
1861 #undef DEF_TUNE
1862 #define DEF_TUNE(tune, name, selector) name,
1863 #include "x86-tune.def"
1864 #undef DEF_TUNE
1865 };
1866
1867 /* Feature tests against the various tunings. */
1868 unsigned char ix86_tune_features[X86_TUNE_LAST];
1869
1870 /* Feature tests against the various tunings used to create ix86_tune_features
1871 based on the processor mask. */
1872 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1873 #undef DEF_TUNE
1874 #define DEF_TUNE(tune, name, selector) selector,
1875 #include "x86-tune.def"
1876 #undef DEF_TUNE
1877 };
1878
1879 /* Feature tests against the various architecture variations. */
1880 unsigned char ix86_arch_features[X86_ARCH_LAST];
1881
1882 /* Feature tests against the various architecture variations, used to create
1883 ix86_arch_features based on the processor mask. */
1884 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
1885 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
1886 ~(m_386 | m_486 | m_PENT | m_K6),
1887
1888 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1889 ~m_386,
1890
1891 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1892 ~(m_386 | m_486),
1893
1894 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1895 ~m_386,
1896
1897 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1898 ~m_386,
1899 };
1900
1901 /* In case the average insn count for a single function invocation is
1902 lower than this constant, emit fast (but longer) prologue and
1903 epilogue code. */
1904 #define FAST_PROLOGUE_INSN_COUNT 20
1905
1906 /* Names for the 8-bit (low), 8-bit (high), and 16-bit registers, respectively. */
1907 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1908 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1909 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1910
1911 /* Array of the smallest class containing reg number REGNO, indexed by
1912 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1913
1914 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1915 {
1916 /* ax, dx, cx, bx */
1917 AREG, DREG, CREG, BREG,
1918 /* si, di, bp, sp */
1919 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1920 /* FP registers */
1921 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1922 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1923 /* arg pointer */
1924 NON_Q_REGS,
1925 /* flags, fpsr, fpcr, frame */
1926 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1927 /* SSE registers */
1928 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1929 SSE_REGS, SSE_REGS,
1930 /* MMX registers */
1931 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1932 MMX_REGS, MMX_REGS,
1933 /* REX registers */
1934 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1935 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1936 /* SSE REX registers */
1937 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1938 SSE_REGS, SSE_REGS,
1939 /* AVX-512 SSE registers */
1940 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1941 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1942 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1943 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
1944 /* Mask registers. */
1945 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1946 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
1947 /* MPX bound registers */
1948 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
1949 };
1950
1951 /* The "default" register map used in 32bit mode. */
1952
1953 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1954 {
1955 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1956 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1957 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1958 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1959 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1960 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1961 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1962 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
1963 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
1964 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
1965 101, 102, 103, 104, /* bound registers */
1966 };
1967
1968 /* The "default" register map used in 64bit mode. */
1969
1970 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1971 {
1972 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1973 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1974 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1975 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1976 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1977 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
1978 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1979 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
1980 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
1981 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
1982 126, 127, 128, 129, /* bound registers */
1983 };
1984
1985 /* Define the register numbers to be used in Dwarf debugging information.
1986 The SVR4 reference port C compiler uses the following register numbers
1987 in its Dwarf output code:
1988 0 for %eax (gcc regno = 0)
1989 1 for %ecx (gcc regno = 2)
1990 2 for %edx (gcc regno = 1)
1991 3 for %ebx (gcc regno = 3)
1992 4 for %esp (gcc regno = 7)
1993 5 for %ebp (gcc regno = 6)
1994 6 for %esi (gcc regno = 4)
1995 7 for %edi (gcc regno = 5)
1996 The following three DWARF register numbers are never generated by
1997 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1998 believes these numbers have these meanings.
1999 8 for %eip (no gcc equivalent)
2000 9 for %eflags (gcc regno = 17)
2001 10 for %trapno (no gcc equivalent)
2002 It is not at all clear how we should number the FP stack registers
2003 for the x86 architecture. If the version of SDB on x86/svr4 were
2004 a bit less brain dead with respect to floating-point then we would
2005 have a precedent to follow with respect to DWARF register numbers
2006 for x86 FP registers, but the SDB on x86/svr4 is so completely
2007 broken with respect to FP registers that it is hardly worth thinking
2008 of it as something to strive for compatibility with.
2009 The version of x86/svr4 SDB I have at the moment does (partially)
2010 seem to believe that DWARF register number 11 is associated with
2011 the x86 register %st(0), but that's about all. Higher DWARF
2012 register numbers don't seem to be associated with anything in
2013 particular, and even for DWARF regno 11, SDB only seems to under-
2014 stand that it should say that a variable lives in %st(0) (when
2015 asked via an `=' command) if we said it was in DWARF regno 11,
2016 but SDB still prints garbage when asked for the value of the
2017 variable in question (via a `/' command).
2018 (Also note that the labels SDB prints for various FP stack regs
2019 when doing an `x' command are all wrong.)
2020 Note that these problems generally don't affect the native SVR4
2021 C compiler because it doesn't allow the use of -O with -g and
2022 because when it is *not* optimizing, it allocates a memory
2023 location for each floating-point variable, and the memory
2024 location is what gets described in the DWARF AT_location
2025 attribute for the variable in question.
2026 Regardless of the severe mental illness of the x86/svr4 SDB, we
2027 do something sensible here and we use the following DWARF
2028 register numbers. Note that these are all stack-top-relative
2029 numbers.
2030 11 for %st(0) (gcc regno = 8)
2031 12 for %st(1) (gcc regno = 9)
2032 13 for %st(2) (gcc regno = 10)
2033 14 for %st(3) (gcc regno = 11)
2034 15 for %st(4) (gcc regno = 12)
2035 16 for %st(5) (gcc regno = 13)
2036 17 for %st(6) (gcc regno = 14)
2037 18 for %st(7) (gcc regno = 15)
2038 */
2039 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2040 {
2041 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2042 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2043 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2044 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2045 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2046 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2047 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2048 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23 */
2049 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31 */
2050 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
2051 -1, -1, -1, -1, /* bound registers */
2052 };
2053
2054 /* Define parameter passing and return registers. */
2055
2056 static int const x86_64_int_parameter_registers[6] =
2057 {
2058 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2059 };
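/* I.e. under the SysV AMD64 ABI the first six integer arguments are passed
in %rdi, %rsi, %rdx, %rcx, %r8 and %r9, in that order.  */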
2060
2061 static int const x86_64_ms_abi_int_parameter_registers[4] =
2062 {
2063 CX_REG, DX_REG, R8_REG, R9_REG
2064 };
2065
2066 static int const x86_64_int_return_registers[4] =
2067 {
2068 AX_REG, DX_REG, DI_REG, SI_REG
2069 };
2070
2071 /* Additional registers that are clobbered by SYSV calls. */
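/* These registers are call-saved under the MS ABI but call-clobbered under
the SysV ABI, so a 64-bit MS-ABI function must treat them as clobbered
across a call to a SysV-ABI function.  */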
2072
2073 int const x86_64_ms_sysv_extra_clobbered_registers[12] =
2074 {
2075 SI_REG, DI_REG,
2076 XMM6_REG, XMM7_REG,
2077 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
2078 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
2079 };
2080
2081 /* Define the structure for the machine field in struct function. */
2082
2083 struct GTY(()) stack_local_entry {
2084 unsigned short mode;
2085 unsigned short n;
2086 rtx rtl;
2087 struct stack_local_entry *next;
2088 };
2089
2090 /* Structure describing stack frame layout.
2091 Stack grows downward:
2092
2093 [arguments]
2094 <- ARG_POINTER
2095 saved pc
2096
2097 saved static chain if ix86_static_chain_on_stack
2098
2099 saved frame pointer if frame_pointer_needed
2100 <- HARD_FRAME_POINTER
2101 [saved regs]
2102 <- regs_save_offset
2103 [padding0]
2104
2105 [saved SSE regs]
2106 <- sse_regs_save_offset
2107 [padding1] |
2108 | <- FRAME_POINTER
2109 [va_arg registers] |
2110 |
2111 [frame] |
2112 |
2113 [padding2] | = to_allocate
2114 <- STACK_POINTER
2115 */
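/* The HOST_WIDE_INT offset fields below correspond to the markers in the
diagram above; all of them are measured relative to ARG_POINTER.  */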
2116 struct ix86_frame
2117 {
2118 int nsseregs;
2119 int nregs;
2120 int va_arg_size;
2121 int red_zone_size;
2122 int outgoing_arguments_size;
2123
2124 /* The offsets relative to ARG_POINTER. */
2125 HOST_WIDE_INT frame_pointer_offset;
2126 HOST_WIDE_INT hard_frame_pointer_offset;
2127 HOST_WIDE_INT stack_pointer_offset;
2128 HOST_WIDE_INT hfp_save_offset;
2129 HOST_WIDE_INT reg_save_offset;
2130 HOST_WIDE_INT sse_reg_save_offset;
2131
2132 /* When save_regs_using_mov is set, emit prologue using
2133 move instead of push instructions. */
2134 bool save_regs_using_mov;
2135 };
2136
2137 /* Which cpu are we scheduling for. */
2138 enum attr_cpu ix86_schedule;
2139
2140 /* Which cpu are we optimizing for. */
2141 enum processor_type ix86_tune;
2142
2143 /* Which instruction set architecture to use. */
2144 enum processor_type ix86_arch;
2145
2146 /* True if processor has SSE prefetch instruction. */
2147 unsigned char x86_prefetch_sse;
2148
2149 /* -mstackrealign option */
2150 static const char ix86_force_align_arg_pointer_string[]
2151 = "force_align_arg_pointer";
2152
2153 static rtx (*ix86_gen_leave) (void);
2154 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2155 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2156 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2157 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2158 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2159 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2160 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2161 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2162 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2163 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2164 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2165
2166 /* Preferred alignment for stack boundary in bits. */
2167 unsigned int ix86_preferred_stack_boundary;
2168
2169 /* Alignment for incoming stack boundary in bits specified at
2170 command line. */
2171 static unsigned int ix86_user_incoming_stack_boundary;
2172
2173 /* Default alignment for incoming stack boundary in bits. */
2174 static unsigned int ix86_default_incoming_stack_boundary;
2175
2176 /* Alignment for incoming stack boundary in bits. */
2177 unsigned int ix86_incoming_stack_boundary;
2178
2179 /* Calling abi specific va_list type nodes. */
2180 static GTY(()) tree sysv_va_list_type_node;
2181 static GTY(()) tree ms_va_list_type_node;
2182
2183 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2184 char internal_label_prefix[16];
2185 int internal_label_prefix_len;
2186
2187 /* Fence to use after loop using movnt. */
2188 tree x86_mfence;
2189
2190 /* Register class used for passing a given 64bit part of the argument.
2191 These represent classes as documented by the psABI, with the exception
2192 of the SSESF and SSEDF classes, which are basically the SSE class; gcc
2193 just uses SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
2194
2195 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2196 whenever possible (when the upper half is only padding). */
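/* As an illustration (not used by the code): a struct { double d; int i; }
argument occupies two eightbytes; the first would be classified
X86_64_SSEDF_CLASS (so a DFmode move is used) and the second
X86_64_INTEGERSI_CLASS, since only its low half carries data and the rest
is padding.  */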
2197 enum x86_64_reg_class
2198 {
2199 X86_64_NO_CLASS,
2200 X86_64_INTEGER_CLASS,
2201 X86_64_INTEGERSI_CLASS,
2202 X86_64_SSE_CLASS,
2203 X86_64_SSESF_CLASS,
2204 X86_64_SSEDF_CLASS,
2205 X86_64_SSEUP_CLASS,
2206 X86_64_X87_CLASS,
2207 X86_64_X87UP_CLASS,
2208 X86_64_COMPLEX_X87_CLASS,
2209 X86_64_MEMORY_CLASS
2210 };
2211
2212 #define MAX_CLASSES 4
2213
2214 /* Table of constants used by fldpi, fldln2, etc.... */
2215 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2216 static bool ext_80387_constants_init = 0;
2217
2218 \f
2219 static struct machine_function * ix86_init_machine_status (void);
2220 static rtx ix86_function_value (const_tree, const_tree, bool);
2221 static bool ix86_function_value_regno_p (const unsigned int);
2222 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2223 const_tree);
2224 static rtx ix86_static_chain (const_tree, bool);
2225 static int ix86_function_regparm (const_tree, const_tree);
2226 static void ix86_compute_frame_layout (struct ix86_frame *);
2227 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2228 rtx, rtx, int);
2229 static void ix86_add_new_builtins (HOST_WIDE_INT);
2230 static tree ix86_canonical_va_list_type (tree);
2231 static void predict_jump (int);
2232 static unsigned int split_stack_prologue_scratch_regno (void);
2233 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2234
2235 enum ix86_function_specific_strings
2236 {
2237 IX86_FUNCTION_SPECIFIC_ARCH,
2238 IX86_FUNCTION_SPECIFIC_TUNE,
2239 IX86_FUNCTION_SPECIFIC_MAX
2240 };
2241
2242 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2243 const char *, enum fpmath_unit, bool);
2244 static void ix86_function_specific_save (struct cl_target_option *,
2245 struct gcc_options *opts);
2246 static void ix86_function_specific_restore (struct gcc_options *opts,
2247 struct cl_target_option *);
2248 static void ix86_function_specific_print (FILE *, int,
2249 struct cl_target_option *);
2250 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2251 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2252 struct gcc_options *,
2253 struct gcc_options *,
2254 struct gcc_options *);
2255 static bool ix86_can_inline_p (tree, tree);
2256 static void ix86_set_current_function (tree);
2257 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2258
2259 static enum calling_abi ix86_function_abi (const_tree);
2260
2261 \f
2262 #ifndef SUBTARGET32_DEFAULT_CPU
2263 #define SUBTARGET32_DEFAULT_CPU "i386"
2264 #endif
2265
2266 /* Whether -mtune= or -march= were specified */
2267 static int ix86_tune_defaulted;
2268 static int ix86_arch_specified;
2269
2270 /* Vectorization library interface and handlers. */
2271 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2272
2273 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2274 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2275
2276 /* Processor target table, indexed by processor number */
2277 struct ptt
2278 {
2279 const struct processor_costs *cost; /* Processor costs */
2280 const int align_loop; /* Default alignments. */
2281 const int align_loop_max_skip;
2282 const int align_jump;
2283 const int align_jump_max_skip;
2284 const int align_func;
2285 };
2286
2287 static const struct ptt processor_target_table[PROCESSOR_max] =
2288 {
2289 {&i386_cost, 4, 3, 4, 3, 4},
2290 {&i486_cost, 16, 15, 16, 15, 16},
2291 {&pentium_cost, 16, 7, 16, 7, 16},
2292 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2293 {&geode_cost, 0, 0, 0, 0, 0},
2294 {&k6_cost, 32, 7, 32, 7, 32},
2295 {&athlon_cost, 16, 7, 16, 7, 16},
2296 {&pentium4_cost, 0, 0, 0, 0, 0},
2297 {&k8_cost, 16, 7, 16, 7, 16},
2298 {&nocona_cost, 0, 0, 0, 0, 0},
2299 /* Core 2 */
2300 {&core_cost, 16, 10, 16, 10, 16},
2301 /* Core i7 */
2302 {&core_cost, 16, 10, 16, 10, 16},
2303 /* Core i7 avx */
2304 {&core_cost, 16, 10, 16, 10, 16},
2305 /* Core avx2 */
2306 {&core_cost, 16, 10, 16, 10, 16},
2307 {&generic_cost, 16, 10, 16, 10, 16},
2308 {&amdfam10_cost, 32, 24, 32, 7, 32},
2309 {&bdver1_cost, 16, 10, 16, 7, 11},
2310 {&bdver2_cost, 16, 10, 16, 7, 11},
2311 {&bdver3_cost, 16, 10, 16, 7, 11},
2312 {&btver1_cost, 16, 10, 16, 7, 11},
2313 {&btver2_cost, 16, 10, 16, 7, 11},
2314 {&atom_cost, 16, 15, 16, 7, 16},
2315 {&slm_cost, 16, 15, 16, 7, 16}
2316 };
2317
2318 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2319 {
2320 "generic",
2321 "i386",
2322 "i486",
2323 "pentium",
2324 "pentium-mmx",
2325 "pentiumpro",
2326 "pentium2",
2327 "pentium3",
2328 "pentium4",
2329 "pentium-m",
2330 "prescott",
2331 "nocona",
2332 "core2",
2333 "corei7",
2334 "corei7-avx",
2335 "core-avx2",
2336 "atom",
2337 "slm",
2338 "geode",
2339 "k6",
2340 "k6-2",
2341 "k6-3",
2342 "athlon",
2343 "athlon-4",
2344 "k8",
2345 "amdfam10",
2346 "bdver1",
2347 "bdver2",
2348 "bdver3",
2349 "btver1",
2350 "btver2"
2351 };
2352 \f
2353 static bool
2354 gate_insert_vzeroupper (void)
2355 {
2356 return TARGET_AVX && !TARGET_AVX512F && TARGET_VZEROUPPER;
2357 }
2358
2359 static unsigned int
2360 rest_of_handle_insert_vzeroupper (void)
2361 {
2362 int i;
2363
2364 /* vzeroupper instructions are inserted immediately after reload to
2365 account for possible spills from 256bit registers. The pass
2366 reuses the mode switching infrastructure by re-running the mode
2367 insertion pass, so disable entities that have already been processed. */
2368 for (i = 0; i < MAX_386_ENTITIES; i++)
2369 ix86_optimize_mode_switching[i] = 0;
2370
2371 ix86_optimize_mode_switching[AVX_U128] = 1;
2372
2373 /* Call optimize_mode_switching. */
2374 g->get_passes ()->execute_pass_mode_switching ();
2375 return 0;
2376 }
2377
2378 namespace {
2379
2380 const pass_data pass_data_insert_vzeroupper =
2381 {
2382 RTL_PASS, /* type */
2383 "vzeroupper", /* name */
2384 OPTGROUP_NONE, /* optinfo_flags */
2385 true, /* has_gate */
2386 true, /* has_execute */
2387 TV_NONE, /* tv_id */
2388 0, /* properties_required */
2389 0, /* properties_provided */
2390 0, /* properties_destroyed */
2391 0, /* todo_flags_start */
2392 ( TODO_df_finish | TODO_verify_rtl_sharing | 0 ), /* todo_flags_finish */
2393 };
2394
2395 class pass_insert_vzeroupper : public rtl_opt_pass
2396 {
2397 public:
2398 pass_insert_vzeroupper(gcc::context *ctxt)
2399 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2400 {}
2401
2402 /* opt_pass methods: */
2403 bool gate () { return gate_insert_vzeroupper (); }
2404 unsigned int execute () { return rest_of_handle_insert_vzeroupper (); }
2405
2406 }; // class pass_insert_vzeroupper
2407
2408 } // anon namespace
2409
2410 rtl_opt_pass *
2411 make_pass_insert_vzeroupper (gcc::context *ctxt)
2412 {
2413 return new pass_insert_vzeroupper (ctxt);
2414 }
2415
2416 /* Return true if a red-zone is in use. */
2417
2418 static inline bool
2419 ix86_using_red_zone (void)
2420 {
2421 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2422 }
2423 \f
2424 /* Return a string that documents the current -m options. The caller is
2425 responsible for freeing the string. */
2426
2427 static char *
2428 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2429 const char *tune, enum fpmath_unit fpmath,
2430 bool add_nl_p)
2431 {
2432 struct ix86_target_opts
2433 {
2434 const char *option; /* option string */
2435 HOST_WIDE_INT mask; /* isa mask options */
2436 };
2437
2438 /* This table is ordered so that options like -msse4.2 that imply
2439 preceding options are matched first. */
2440 static struct ix86_target_opts isa_opts[] =
2441 {
2442 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2443 { "-mfma", OPTION_MASK_ISA_FMA },
2444 { "-mxop", OPTION_MASK_ISA_XOP },
2445 { "-mlwp", OPTION_MASK_ISA_LWP },
2446 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2447 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2448 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2449 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2450 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2451 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2452 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2453 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2454 { "-msse3", OPTION_MASK_ISA_SSE3 },
2455 { "-msse2", OPTION_MASK_ISA_SSE2 },
2456 { "-msse", OPTION_MASK_ISA_SSE },
2457 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2458 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2459 { "-mmmx", OPTION_MASK_ISA_MMX },
2460 { "-mabm", OPTION_MASK_ISA_ABM },
2461 { "-mbmi", OPTION_MASK_ISA_BMI },
2462 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2463 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2464 { "-mhle", OPTION_MASK_ISA_HLE },
2465 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2466 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2467 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2468 { "-madx", OPTION_MASK_ISA_ADX },
2469 { "-mtbm", OPTION_MASK_ISA_TBM },
2470 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2471 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2472 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2473 { "-maes", OPTION_MASK_ISA_AES },
2474 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2475 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2476 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2477 { "-mf16c", OPTION_MASK_ISA_F16C },
2478 { "-mrtm", OPTION_MASK_ISA_RTM },
2479 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2480 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2481 { "-mmpx", OPTION_MASK_ISA_MPX },
2482 };
2483
2484 /* Flag options. */
2485 static struct ix86_target_opts flag_opts[] =
2486 {
2487 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2488 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2489 { "-m80387", MASK_80387 },
2490 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2491 { "-malign-double", MASK_ALIGN_DOUBLE },
2492 { "-mcld", MASK_CLD },
2493 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2494 { "-mieee-fp", MASK_IEEE_FP },
2495 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2496 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2497 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2498 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2499 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2500 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2501 { "-mno-red-zone", MASK_NO_RED_ZONE },
2502 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2503 { "-mrecip", MASK_RECIP },
2504 { "-mrtd", MASK_RTD },
2505 { "-msseregparm", MASK_SSEREGPARM },
2506 { "-mstack-arg-probe", MASK_STACK_PROBE },
2507 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2508 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2509 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2510 { "-mvzeroupper", MASK_VZEROUPPER },
2511 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2512 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2513 { "-mprefer-avx128", MASK_PREFER_AVX128},
2514 };
2515
2516 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2517
2518 char isa_other[40];
2519 char target_other[40];
2520 unsigned num = 0;
2521 unsigned i, j;
2522 char *ret;
2523 char *ptr;
2524 size_t len;
2525 size_t line_len;
2526 size_t sep_len;
2527 const char *abi;
2528
2529 memset (opts, '\0', sizeof (opts));
2530
2531 /* Add -march= option. */
2532 if (arch)
2533 {
2534 opts[num][0] = "-march=";
2535 opts[num++][1] = arch;
2536 }
2537
2538 /* Add -mtune= option. */
2539 if (tune)
2540 {
2541 opts[num][0] = "-mtune=";
2542 opts[num++][1] = tune;
2543 }
2544
2545 /* Add -m32/-m64/-mx32. */
2546 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2547 {
2548 if ((isa & OPTION_MASK_ABI_64) != 0)
2549 abi = "-m64";
2550 else
2551 abi = "-mx32";
2552 isa &= ~ (OPTION_MASK_ISA_64BIT
2553 | OPTION_MASK_ABI_64
2554 | OPTION_MASK_ABI_X32);
2555 }
2556 else
2557 abi = "-m32";
2558 opts[num++][0] = abi;
2559
2560 /* Pick out the enabled ISA options. */
2561 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2562 {
2563 if ((isa & isa_opts[i].mask) != 0)
2564 {
2565 opts[num++][0] = isa_opts[i].option;
2566 isa &= ~ isa_opts[i].mask;
2567 }
2568 }
2569
2570 if (isa && add_nl_p)
2571 {
2572 opts[num++][0] = isa_other;
2573 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2574 isa);
2575 }
2576
2577 /* Add flag options. */
2578 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2579 {
2580 if ((flags & flag_opts[i].mask) != 0)
2581 {
2582 opts[num++][0] = flag_opts[i].option;
2583 flags &= ~ flag_opts[i].mask;
2584 }
2585 }
2586
2587 if (flags && add_nl_p)
2588 {
2589 opts[num++][0] = target_other;
2590 sprintf (target_other, "(other flags: %#x)", flags);
2591 }
2592
2593 /* Add -mfpmath= option. */
2594 if (fpmath)
2595 {
2596 opts[num][0] = "-mfpmath=";
2597 switch ((int) fpmath)
2598 {
2599 case FPMATH_387:
2600 opts[num++][1] = "387";
2601 break;
2602
2603 case FPMATH_SSE:
2604 opts[num++][1] = "sse";
2605 break;
2606
2607 case FPMATH_387 | FPMATH_SSE:
2608 opts[num++][1] = "sse+387";
2609 break;
2610
2611 default:
2612 gcc_unreachable ();
2613 }
2614 }
2615
2616 /* Any options? */
2617 if (num == 0)
2618 return NULL;
2619
2620 gcc_assert (num < ARRAY_SIZE (opts));
2621
2622 /* Size the string. */
2623 len = 0;
2624 sep_len = (add_nl_p) ? 3 : 1;
2625 for (i = 0; i < num; i++)
2626 {
2627 len += sep_len;
2628 for (j = 0; j < 2; j++)
2629 if (opts[i][j])
2630 len += strlen (opts[i][j]);
2631 }
2632
2633 /* Build the string. */
2634 ret = ptr = (char *) xmalloc (len);
2635 line_len = 0;
2636
2637 for (i = 0; i < num; i++)
2638 {
2639 size_t len2[2];
2640
2641 for (j = 0; j < 2; j++)
2642 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2643
2644 if (i != 0)
2645 {
2646 *ptr++ = ' ';
2647 line_len++;
2648
2649 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2650 {
2651 *ptr++ = '\\';
2652 *ptr++ = '\n';
2653 line_len = 0;
2654 }
2655 }
2656
2657 for (j = 0; j < 2; j++)
2658 if (opts[i][j])
2659 {
2660 memcpy (ptr, opts[i][j], len2[j]);
2661 ptr += len2[j];
2662 line_len += len2[j];
2663 }
2664 }
2665
2666 *ptr = '\0';
2667 gcc_assert (ret + len >= ptr);
2668
2669 return ret;
2670 }
2671
2672 /* Return true if profiling code should be emitted before the
2673 prologue, otherwise false. On x86 this is the case when -mfentry
2674 is in effect, which is what "hotfix" style patching relies on. */
2675 static bool
2676 ix86_profile_before_prologue (void)
2677 {
2678 return flag_fentry != 0;
2679 }
2680
2681 /* Function that is callable from the debugger to print the current
2682 options. */
2683 void ATTRIBUTE_UNUSED
2684 ix86_debug_options (void)
2685 {
2686 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2687 ix86_arch_string, ix86_tune_string,
2688 ix86_fpmath, true);
2689
2690 if (opts)
2691 {
2692 fprintf (stderr, "%s\n\n", opts);
2693 free (opts);
2694 }
2695 else
2696 fputs ("<no options>\n\n", stderr);
2697
2698 return;
2699 }
2700
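/* Human-readable names of the stringop algorithms, generated from
stringop.def; the array is indexed in the same order as the stringop_alg
enum, which ix86_parse_stringop_strategy_string below relies on.  */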
2701 static const char *stringop_alg_names[] = {
2702 #define DEF_ENUM
2703 #define DEF_ALG(alg, name) #name,
2704 #include "stringop.def"
2705 #undef DEF_ENUM
2706 #undef DEF_ALG
2707 };
2708
2709 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
2710 The string is of the following form (or a comma-separated list of such forms):
2711
2712 strategy_alg:max_size:[align|noalign]
2713
2714 where the full size range for the strategy is either [0, max_size] or
2715 [min_size, max_size], in which min_size is the max_size + 1 of the
2716 preceding range. The last size range must have max_size == -1.
2717
2718 Examples:
2719
2720 1.
2721 -mmemcpy-strategy=libcall:-1:noalign
2722
2723 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
2724
2725
2726 2.
2727 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
2728
2729 This is to tell the compiler to use the following strategy for memset
2730 1) when the expected size is between [1, 16], use rep_8byte strategy;
2731 2) when the size is between [17, 2048], use vector_loop;
2732 3) when the size is > 2048, use libcall. */
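/* As a sketch of what the parser below produces: example 2 above is parsed
into three stringop_size_range entries, roughly
{ max = 16,   alg for "rep_8byte",   noalign = true  }
{ max = 2048, alg for "vector_loop", noalign = false }
{ max = -1,   alg for "libcall",     noalign = true  }
which then override the size ranges of the default cost-table algs.  */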
2733
2734 struct stringop_size_range
2735 {
2736 int max;
2737 stringop_alg alg;
2738 bool noalign;
2739 };
2740
2741 static void
2742 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
2743 {
2744 const struct stringop_algs *default_algs;
2745 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
2746 char *curr_range_str, *next_range_str;
2747 int i = 0, n = 0;
2748
2749 if (is_memset)
2750 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
2751 else
2752 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
2753
2754 curr_range_str = strategy_str;
2755
2756 do
2757 {
2758 int maxs;
2759 stringop_alg alg;
2760 char alg_name[128];
2761 char align[16];
2762 next_range_str = strchr (curr_range_str, ',');
2763 if (next_range_str)
2764 *next_range_str++ = '\0';
2765
2766 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
2767 alg_name, &maxs, align))
2768 {
2769 error ("wrong arg %s to option %s", curr_range_str,
2770 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2771 return;
2772 }
2773
2774 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
2775 {
2776 error ("size ranges of option %s should be increasing",
2777 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2778 return;
2779 }
2780
2781 for (i = 0; i < last_alg; i++)
2782 {
2783 if (!strcmp (alg_name, stringop_alg_names[i]))
2784 {
2785 alg = (stringop_alg) i;
2786 break;
2787 }
2788 }
2789
2790 if (i == last_alg)
2791 {
2792 error ("wrong stringop strategy name %s specified for option %s",
2793 alg_name,
2794 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2795 return;
2796 }
2797
2798 input_ranges[n].max = maxs;
2799 input_ranges[n].alg = alg;
2800 if (!strcmp (align, "align"))
2801 input_ranges[n].noalign = false;
2802 else if (!strcmp (align, "noalign"))
2803 input_ranges[n].noalign = true;
2804 else
2805 {
2806 error ("unknown alignment %s specified for option %s",
2807 align, is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2808 return;
2809 }
2810 n++;
2811 curr_range_str = next_range_str;
2812 }
2813 while (curr_range_str);
2814
2815 if (input_ranges[n - 1].max != -1)
2816 {
2817 error ("the max value for the last size range should be -1"
2818 " for option %s",
2819 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2820 return;
2821 }
2822
2823 if (n > MAX_STRINGOP_ALGS)
2824 {
2825 error ("too many size ranges specified in option %s",
2826 is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=");
2827 return;
2828 }
2829
2830 /* Now override the default algs array. */
2831 for (i = 0; i < n; i++)
2832 {
2833 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
2834 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
2835 = input_ranges[i].alg;
2836 *const_cast<int *>(&default_algs->size[i].noalign)
2837 = input_ranges[i].noalign;
2838 }
2839 }
2840
2841 \f
2842 /* parse -mtune-ctrl= option. When DUMP is true,
2843 print the features that are explicitly set. */
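/* For example, -mtune-ctrl=foo,^bar (where foo and bar stand for feature
names listed in x86-tune.def) sets feature foo and clears feature bar; a
leading '^' requests clearing instead of setting.  */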
2844
2845 static void
2846 parse_mtune_ctrl_str (bool dump)
2847 {
2848 if (!ix86_tune_ctrl_string)
2849 return;
2850
2851 char *next_feature_string = NULL;
2852 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
2853 char *orig = curr_feature_string;
2854 int i;
2855 do
2856 {
2857 bool clear = false;
2858
2859 next_feature_string = strchr (curr_feature_string, ',');
2860 if (next_feature_string)
2861 *next_feature_string++ = '\0';
2862 if (*curr_feature_string == '^')
2863 {
2864 curr_feature_string++;
2865 clear = true;
2866 }
2867 for (i = 0; i < X86_TUNE_LAST; i++)
2868 {
2869 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
2870 {
2871 ix86_tune_features[i] = !clear;
2872 if (dump)
2873 fprintf (stderr, "Explicitly %s feature %s\n",
2874 clear ? "clear" : "set", ix86_tune_feature_names[i]);
2875 break;
2876 }
2877 }
2878 if (i == X86_TUNE_LAST)
2879 error ("Unknown parameter to option -mtune-ctrl: %s",
2880 clear ? curr_feature_string - 1 : curr_feature_string);
2881 curr_feature_string = next_feature_string;
2882 }
2883 while (curr_feature_string);
2884 free (orig);
2885 }
2886
2887 /* Helper function to set ix86_tune_features. IX86_TUNE is the
2888 processor type. */
2889
2890 static void
2891 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
2892 {
2893 unsigned int ix86_tune_mask = 1u << ix86_tune;
2894 int i;
2895
2896 for (i = 0; i < X86_TUNE_LAST; ++i)
2897 {
2898 if (ix86_tune_no_default)
2899 ix86_tune_features[i] = 0;
2900 else
2901 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
2902 }
2903
2904 if (dump)
2905 {
2906 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
2907 for (i = 0; i < X86_TUNE_LAST; i++)
2908 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
2909 ix86_tune_features[i] ? "on" : "off");
2910 }
2911
2912 parse_mtune_ctrl_str (dump);
2913 }
2914
2915
2916 /* Override various settings based on options. If MAIN_ARGS_P, the
2917 options are from the command line, otherwise they are from
2918 attributes. */
2919
2920 static void
2921 ix86_option_override_internal (bool main_args_p,
2922 struct gcc_options *opts,
2923 struct gcc_options *opts_set)
2924 {
2925 int i;
2926 unsigned int ix86_arch_mask;
2927 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
2928 const char *prefix;
2929 const char *suffix;
2930 const char *sw;
2931
2932 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2933 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2934 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2935 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2936 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2937 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2938 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2939 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2940 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2941 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2942 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2943 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2944 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2945 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2946 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2947 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2948 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2949 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2950 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2951 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2952 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2953 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2954 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2955 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2956 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2957 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2958 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2959 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2960 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2961 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2962 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2963 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2964 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2965 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2966 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2967 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2968 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2969 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2970 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2971 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2972 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
2973 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
2974 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
2975 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
2976 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
2977
2978 /* If this reaches 64, we need to widen the struct pta flags field below. */
2979
2980 static struct pta
2981 {
2982 const char *const name; /* processor name or nickname. */
2983 const enum processor_type processor;
2984 const enum attr_cpu schedule;
2985 const unsigned HOST_WIDE_INT flags;
2986 }
2987 const processor_alias_table[] =
2988 {
2989 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2990 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2991 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2992 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2993 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2994 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2995 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2996 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2997 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2998 PTA_MMX | PTA_SSE | PTA_FXSR},
2999 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3000 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3001 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3002 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3003 PTA_MMX | PTA_SSE | PTA_FXSR},
3004 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3005 PTA_MMX | PTA_SSE | PTA_FXSR},
3006 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3007 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3008 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3009 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3010 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3011 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3012 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3013 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3014 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3015 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3016 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3017 {"core2", PROCESSOR_CORE2, CPU_CORE2,
3018 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3019 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
3020 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
3021 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
3022 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
3023 {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
3024 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3025 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3026 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
3027 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3028 {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
3029 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3030 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
3031 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3032 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3033 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
3034 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3035 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
3036 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
3037 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
3038 | PTA_FMA | PTA_MOVBE | PTA_RTM | PTA_HLE | PTA_FXSR | PTA_XSAVE
3039 | PTA_XSAVEOPT},
3040 {"atom", PROCESSOR_ATOM, CPU_ATOM,
3041 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3042 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
3043 {"slm", PROCESSOR_SLM, CPU_SLM,
3044 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3045 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_MOVBE
3046 | PTA_FXSR},
3047 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3048 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3049 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3050 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3051 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
3052 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3053 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3054 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3055 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
3056 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3057 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3058 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3059 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3060 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3061 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
3062 {"x86-64", PROCESSOR_K8, CPU_K8,
3063 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3064 {"k8", PROCESSOR_K8, CPU_K8,
3065 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3066 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3067 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3068 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3069 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3070 {"opteron", PROCESSOR_K8, CPU_K8,
3071 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3072 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3073 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3074 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3075 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3076 {"athlon64", PROCESSOR_K8, CPU_K8,
3077 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3078 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3079 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3080 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3081 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3082 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3083 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3084 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
3085 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3086 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3087 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3088 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3089 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3090 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3091 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3092 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3093 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3094 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3095 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3096 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3097 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3098 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3099 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3100 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3101 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3102 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3103 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3104 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3105 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3106 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3107 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3108 | PTA_XSAVEOPT | PTA_FSGSBASE},
3109 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3110 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3111 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
3112 | PTA_FXSR | PTA_XSAVE},
3113 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3114 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3115 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
3116 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3117 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3118 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3119
3120 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3121 PTA_64BIT
3122 | PTA_HLE /* flags are only used for -march switch. */ },
3123 };
3124
3125 /* -mrecip options. */
3126 static struct
3127 {
3128 const char *string; /* option name */
3129 unsigned int mask; /* mask bits to set */
3130 }
3131 const recip_options[] =
3132 {
3133 { "all", RECIP_MASK_ALL },
3134 { "none", RECIP_MASK_NONE },
3135 { "div", RECIP_MASK_DIV },
3136 { "sqrt", RECIP_MASK_SQRT },
3137 { "vec-div", RECIP_MASK_VEC_DIV },
3138 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3139 };
3140
3141 int const pta_size = ARRAY_SIZE (processor_alias_table);
3142
3143 /* Set up prefix/suffix so the error messages refer to either the command
3144 line argument, or the attribute(target). */
3145 if (main_args_p)
3146 {
3147 prefix = "-m";
3148 suffix = "";
3149 sw = "switch";
3150 }
3151 else
3152 {
3153 prefix = "option(\"";
3154 suffix = "\")";
3155 sw = "attribute";
3156 }
3157
3158 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3159 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3160 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3161 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3162 #ifdef TARGET_BI_ARCH
3163 else
3164 {
3165 #if TARGET_BI_ARCH == 1
3166 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3167 is on and OPTION_MASK_ABI_X32 is off. We turn off
3168 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3169 -mx32. */
3170 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3171 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3172 #else
3173 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3174 on and OPTION_MASK_ABI_64 is off. We turn off
3175 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3176 -m64. */
3177 if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3178 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3179 #endif
3180 }
3181 #endif
3182
3183 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3184 {
3185 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3186 OPTION_MASK_ABI_64 for TARGET_X32. */
3187 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3188 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3189 }
3190 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3191 {
3192 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3193 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3194 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3195 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3196 }
3197
3198 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3199 SUBTARGET_OVERRIDE_OPTIONS;
3200 #endif
3201
3202 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3203 SUBSUBTARGET_OVERRIDE_OPTIONS;
3204 #endif
3205
3206 /* -fPIC is the default for 64-bit Darwin (Mach-O) targets. */
3207 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3208 opts->x_flag_pic = 2;
3209
3210 /* Need to check -mtune=generic first. */
3211 if (opts->x_ix86_tune_string)
3212 {
3213 if (!strcmp (opts->x_ix86_tune_string, "generic")
3214 || !strcmp (opts->x_ix86_tune_string, "i686")
3215 /* As special support for cross compilers we read -mtune=native
3216 as -mtune=generic. With native compilers we won't see
3217 -mtune=native here, since the driver has already replaced it. */
3218 || !strcmp (opts->x_ix86_tune_string, "native"))
3219 {
3220 opts->x_ix86_tune_string = "generic";
3221 }
3222 /* If this call is for setting the option attribute, allow the
3223 generic that was previously set. */
3224 else if (!main_args_p
3225 && !strcmp (opts->x_ix86_tune_string, "generic"))
3226 ;
3227 else if (!strncmp (opts->x_ix86_tune_string, "generic", 7))
3228 error ("bad value (%s) for %stune=%s %s",
3229 opts->x_ix86_tune_string, prefix, suffix, sw);
3230 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3231 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3232 "%stune=k8%s or %stune=generic%s instead as appropriate",
3233 prefix, suffix, prefix, suffix, prefix, suffix);
3234 }
3235 else
3236 {
3237 if (opts->x_ix86_arch_string)
3238 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3239 if (!opts->x_ix86_tune_string)
3240 {
3241 opts->x_ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3242 ix86_tune_defaulted = 1;
3243 }
3244
3245 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3246 or defaulted. We need to use a sensible tune option. */
3247 if (!strcmp (opts->x_ix86_tune_string, "generic")
3248 || !strcmp (opts->x_ix86_tune_string, "x86-64")
3249 || !strcmp (opts->x_ix86_tune_string, "i686"))
3250 {
3251 opts->x_ix86_tune_string = "generic";
3252 }
3253 }
3254
3255 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3256 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3257 {
3258 /* rep; movq isn't available in 32-bit code. */
3259 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3260 opts->x_ix86_stringop_alg = no_stringop;
3261 }
3262
3263 if (!opts->x_ix86_arch_string)
3264 opts->x_ix86_arch_string
3265 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3266 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3267 else
3268 ix86_arch_specified = 1;
3269
3270 if (opts_set->x_ix86_pmode)
3271 {
3272 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3273 && opts->x_ix86_pmode == PMODE_SI)
3274 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3275 && opts->x_ix86_pmode == PMODE_DI))
3276 error ("address mode %qs not supported in the %s bit mode",
3277 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3278 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3279 }
3280 else
3281 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3282 ? PMODE_DI : PMODE_SI;
3283
3284 if (!opts_set->x_ix86_abi)
3285 opts->x_ix86_abi = DEFAULT_ABI;
3286
3287 /* For targets using the MS ABI, enable ms-extensions if not
3288 explicitly turned off. For non-MS-ABI targets we turn this
3289 option off. */
3290 if (!opts_set->x_flag_ms_extensions)
3291 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3292
3293 if (opts_set->x_ix86_cmodel)
3294 {
3295 switch (opts->x_ix86_cmodel)
3296 {
3297 case CM_SMALL:
3298 case CM_SMALL_PIC:
3299 if (opts->x_flag_pic)
3300 opts->x_ix86_cmodel = CM_SMALL_PIC;
3301 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3302 error ("code model %qs not supported in the %s bit mode",
3303 "small", "32");
3304 break;
3305
3306 case CM_MEDIUM:
3307 case CM_MEDIUM_PIC:
3308 if (opts->x_flag_pic)
3309 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3310 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3311 error ("code model %qs not supported in the %s bit mode",
3312 "medium", "32");
3313 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3314 error ("code model %qs not supported in x32 mode",
3315 "medium");
3316 break;
3317
3318 case CM_LARGE:
3319 case CM_LARGE_PIC:
3320 if (opts->x_flag_pic)
3321 opts->x_ix86_cmodel = CM_LARGE_PIC;
3322 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3323 error ("code model %qs not supported in the %s bit mode",
3324 "large", "32");
3325 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3326 error ("code model %qs not supported in x32 mode",
3327 "large");
3328 break;
3329
3330 case CM_32:
3331 if (opts->x_flag_pic)
3332 error ("code model %s does not support PIC mode", "32");
3333 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3334 error ("code model %qs not supported in the %s bit mode",
3335 "32", "64");
3336 break;
3337
3338 case CM_KERNEL:
3339 if (opts->x_flag_pic)
3340 {
3341 error ("code model %s does not support PIC mode", "kernel");
3342 opts->x_ix86_cmodel = CM_32;
3343 }
3344 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3345 error ("code model %qs not supported in the %s bit mode",
3346 "kernel", "32");
3347 break;
3348
3349 default:
3350 gcc_unreachable ();
3351 }
3352 }
3353 else
3354 {
3355 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3356 use of rip-relative addressing. This eliminates fixups that
3357 would otherwise be needed if this object is to be placed in a
3358 DLL, and is essentially just as efficient as direct addressing. */
3359 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3360 && (TARGET_RDOS || TARGET_PECOFF))
3361 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3362 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3363 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3364 else
3365 opts->x_ix86_cmodel = CM_32;
3366 }
3367 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3368 {
3369 error ("-masm=intel not supported in this configuration");
3370 opts->x_ix86_asm_dialect = ASM_ATT;
3371 }
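/* Complain via sorry () if the requested 64-bit or 32-bit mode was not
   compiled into this gcc.  */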
3372 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3373 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3374 sorry ("%i-bit mode not compiled in",
3375 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3376
3377 for (i = 0; i < pta_size; i++)
3378 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3379 {
3380 ix86_schedule = processor_alias_table[i].schedule;
3381 ix86_arch = processor_alias_table[i].processor;
3382 /* Default cpu tuning to the architecture. */
3383 ix86_tune = ix86_arch;
3384
3385 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3386 && !(processor_alias_table[i].flags & PTA_64BIT))
3387 error ("CPU you selected does not support x86-64 "
3388 "instruction set");
3389
3390 if (processor_alias_table[i].flags & PTA_MMX
3391 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3392 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3393 if (processor_alias_table[i].flags & PTA_3DNOW
3394 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3395 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3396 if (processor_alias_table[i].flags & PTA_3DNOW_A
3397 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3398 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3399 if (processor_alias_table[i].flags & PTA_SSE
3400 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3401 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3402 if (processor_alias_table[i].flags & PTA_SSE2
3403 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3404 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3405 if (processor_alias_table[i].flags & PTA_SSE3
3406 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3407 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3408 if (processor_alias_table[i].flags & PTA_SSSE3
3409 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3410 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3411 if (processor_alias_table[i].flags & PTA_SSE4_1
3412 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3413 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3414 if (processor_alias_table[i].flags & PTA_SSE4_2
3415 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3416 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3417 if (processor_alias_table[i].flags & PTA_AVX
3418 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3419 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3420 if (processor_alias_table[i].flags & PTA_AVX2
3421 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3422 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3423 if (processor_alias_table[i].flags & PTA_FMA
3424 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3425 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3426 if (processor_alias_table[i].flags & PTA_SSE4A
3427 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3428 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3429 if (processor_alias_table[i].flags & PTA_FMA4
3430 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3431 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3432 if (processor_alias_table[i].flags & PTA_XOP
3433 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3434 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3435 if (processor_alias_table[i].flags & PTA_LWP
3436 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3437 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3438 if (processor_alias_table[i].flags & PTA_ABM
3439 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3440 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3441 if (processor_alias_table[i].flags & PTA_BMI
3442 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3443 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3444 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3445 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3446 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3447 if (processor_alias_table[i].flags & PTA_TBM
3448 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3449 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3450 if (processor_alias_table[i].flags & PTA_BMI2
3451 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3452 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3453 if (processor_alias_table[i].flags & PTA_CX16
3454 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3455 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3456 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3457 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3458 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3459 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3460 && (processor_alias_table[i].flags & PTA_NO_SAHF))
3461 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3462 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3463 if (processor_alias_table[i].flags & PTA_MOVBE
3464 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3465 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3466 if (processor_alias_table[i].flags & PTA_AES
3467 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3468 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
3469 if (processor_alias_table[i].flags & PTA_PCLMUL
3470 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3471 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3472 if (processor_alias_table[i].flags & PTA_FSGSBASE
3473 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3474 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3475 if (processor_alias_table[i].flags & PTA_RDRND
3476 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3477 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3478 if (processor_alias_table[i].flags & PTA_F16C
3479 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3480 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3481 if (processor_alias_table[i].flags & PTA_RTM
3482 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3483 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3484 if (processor_alias_table[i].flags & PTA_HLE
3485 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3486 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3487 if (processor_alias_table[i].flags & PTA_PRFCHW
3488 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3489 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3490 if (processor_alias_table[i].flags & PTA_RDSEED
3491 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3492 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3493 if (processor_alias_table[i].flags & PTA_ADX
3494 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3495 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3496 if (processor_alias_table[i].flags & PTA_FXSR
3497 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3498 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3499 if (processor_alias_table[i].flags & PTA_XSAVE
3500 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3501 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3502 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3503 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3504 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3505 if (processor_alias_table[i].flags & PTA_AVX512F
3506 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
3507 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
3508 if (processor_alias_table[i].flags & PTA_AVX512ER
3509 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
3510 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
3511 if (processor_alias_table[i].flags & PTA_AVX512PF
3512 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
3513 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
3514 if (processor_alias_table[i].flags & PTA_AVX512CD
3515 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
3516 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
3517 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3518 x86_prefetch_sse = true;
3519
3520 break;
3521 }
3522
3523 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3524 error ("generic CPU can be used only for %stune=%s %s",
3525 prefix, suffix, sw);
3526 else if (!strncmp (opts->x_ix86_arch_string, "generic", 7) || i == pta_size)
3527 error ("bad value (%s) for %sarch=%s %s",
3528 opts->x_ix86_arch_string, prefix, suffix, sw);
3529
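/* Derive the per-architecture feature tests from the selected ix86_arch.  */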
3530 ix86_arch_mask = 1u << ix86_arch;
3531 for (i = 0; i < X86_ARCH_LAST; ++i)
3532 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3533
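/* Look up -mtune= in the alias table.  A defaulted tune that lacks x86-64
   support is silently replaced by x86-64 when compiling 64-bit code.  */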
3534 for (i = 0; i < pta_size; i++)
3535 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
3536 {
3537 ix86_schedule = processor_alias_table[i].schedule;
3538 ix86_tune = processor_alias_table[i].processor;
3539 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3540 {
3541 if (!(processor_alias_table[i].flags & PTA_64BIT))
3542 {
3543 if (ix86_tune_defaulted)
3544 {
3545 opts->x_ix86_tune_string = "x86-64";
3546 for (i = 0; i < pta_size; i++)
3547 if (! strcmp (opts->x_ix86_tune_string,
3548 processor_alias_table[i].name))
3549 break;
3550 ix86_schedule = processor_alias_table[i].schedule;
3551 ix86_tune = processor_alias_table[i].processor;
3552 }
3553 else
3554 error ("CPU you selected does not support x86-64 "
3555 "instruction set");
3556 }
3557 }
3558 /* Intel CPUs have always interpreted SSE prefetch instructions as
3559 NOPs; so, we can enable SSE prefetch instructions even when
3560 -mtune (rather than -march) points us to a processor that has them.
3561 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3562 higher processors. */
3563 if (TARGET_CMOV
3564 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3565 x86_prefetch_sse = true;
3566 break;
3567 }
3568
3569 if (ix86_tune_specified && i == pta_size)
3570 error ("bad value (%s) for %stune=%s %s",
3571 opts->x_ix86_tune_string, prefix, suffix, sw);
3572
3573 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
3574
3575 #ifndef USE_IX86_FRAME_POINTER
3576 #define USE_IX86_FRAME_POINTER 0
3577 #endif
3578
3579 #ifndef USE_X86_64_FRAME_POINTER
3580 #define USE_X86_64_FRAME_POINTER 0
3581 #endif
3582
3583 /* Set the default values for switches whose default depends on TARGET_64BIT
3584 in case they weren't overwritten by command line options. */
3585 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3586 {
3587 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3588 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3589 if (opts->x_flag_asynchronous_unwind_tables == 2)
3590 opts->x_flag_unwind_tables
3591 = opts->x_flag_asynchronous_unwind_tables = 1;
3592 if (opts->x_flag_pcc_struct_return == 2)
3593 opts->x_flag_pcc_struct_return = 0;
3594 }
3595 else
3596 {
3597 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
3598 opts->x_flag_omit_frame_pointer
3599 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
3600 if (opts->x_flag_asynchronous_unwind_tables == 2)
3601 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3602 if (opts->x_flag_pcc_struct_return == 2)
3603 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3604 }
3605
3606 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3607 if (opts->x_optimize_size)
3608 ix86_cost = &ix86_size_cost;
3609 else
3610 ix86_cost = ix86_tune_cost;
3611
3612 /* Arrange to set up i386_stack_locals for all functions. */
3613 init_machine_status = ix86_init_machine_status;
3614
3615 /* Validate -mregparm= value. */
3616 if (opts_set->x_ix86_regparm)
3617 {
3618 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3619 warning (0, "-mregparm is ignored in 64-bit mode");
3620 if (opts->x_ix86_regparm > REGPARM_MAX)
3621 {
3622 error ("-mregparm=%d is not between 0 and %d",
3623 opts->x_ix86_regparm, REGPARM_MAX);
3624 opts->x_ix86_regparm = 0;
3625 }
3626 }
3627 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3628 opts->x_ix86_regparm = REGPARM_MAX;
3629
3630 /* Default align_* from the processor table. */
3631 if (opts->x_align_loops == 0)
3632 {
3633 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3634 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3635 }
3636 if (opts->x_align_jumps == 0)
3637 {
3638 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3639 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3640 }
3641 if (opts->x_align_functions == 0)
3642 {
3643 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3644 }
3645
3646 /* Provide default for -mbranch-cost= value. */
3647 if (!opts_set->x_ix86_branch_cost)
3648 opts->x_ix86_branch_cost = ix86_cost->branch_cost;
3649
3650 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3651 {
3652 opts->x_target_flags
3653 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
3654
3655 /* Enable by default the SSE and MMX builtins. Do allow the user to
3656 explicitly disable any of these. In particular, disabling SSE and
3657 MMX for kernel code is extremely useful. */
3658 if (!ix86_arch_specified)
3659 opts->x_ix86_isa_flags
3660 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3661 | TARGET_SUBTARGET64_ISA_DEFAULT)
3662 & ~opts->x_ix86_isa_flags_explicit);
3663
3664 if (TARGET_RTD_P (opts->x_target_flags))
3665 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3666 }
3667 else
3668 {
3669 opts->x_target_flags
3670 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
3671
3672 if (!ix86_arch_specified)
3673 opts->x_ix86_isa_flags
3674 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
3675
3676 /* The i386 ABI does not specify a red zone. It still makes sense to use
3677 one when the programmer takes care to keep the stack from being destroyed. */
3678 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
3679 opts->x_target_flags |= MASK_NO_RED_ZONE;
3680 }
3681
3682 /* Keep nonleaf frame pointers. */
3683 if (opts->x_flag_omit_frame_pointer)
3684 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3685 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
3686 opts->x_flag_omit_frame_pointer = 1;
3687
3688 /* If we're doing fast math, we don't care about comparison order
3689 wrt NaNs. This lets us use a shorter comparison sequence. */
3690 if (opts->x_flag_finite_math_only)
3691 opts->x_target_flags &= ~MASK_IEEE_FP;
3692
3693 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3694 since the insns won't need emulation. */
3695 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
3696 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
3697
3698 /* Likewise, if the target doesn't have a 387, or we've specified
3699 software floating point, don't use 387 inline intrinsics. */
3700 if (!TARGET_80387_P (opts->x_target_flags))
3701 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
3702
3703 /* Turn on MMX builtins for -msse. */
3704 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
3705 opts->x_ix86_isa_flags
3706 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
3707
3708 /* Enable SSE prefetch. */
3709 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
3710 || (TARGET_PRFCHW && !TARGET_3DNOW_P (opts->x_ix86_isa_flags)))
3711 x86_prefetch_sse = true;
3712
3713 /* Enable prefetch{,w} instructions for -m3dnow. */
3714 if (TARGET_3DNOW_P (opts->x_ix86_isa_flags))
3715 opts->x_ix86_isa_flags
3716 |= OPTION_MASK_ISA_PRFCHW & ~opts->x_ix86_isa_flags_explicit;
3717
3718 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3719 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
3720 || TARGET_ABM_P (opts->x_ix86_isa_flags))
3721 opts->x_ix86_isa_flags
3722 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
3723
3724 /* Enable lzcnt instruction for -mabm. */
3725 if (TARGET_ABM_P(opts->x_ix86_isa_flags))
3726 opts->x_ix86_isa_flags
3727 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
3728
3729 /* Validate -mpreferred-stack-boundary= value or default it to
3730 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3731 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3732 if (opts_set->x_ix86_preferred_stack_boundary_arg)
3733 {
3734 int min = (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3735 ? (TARGET_SSE_P (opts->x_ix86_isa_flags) ? 4 : 3) : 2);
3736 int max = (TARGET_SEH ? 4 : 12);
3737
3738 if (opts->x_ix86_preferred_stack_boundary_arg < min
3739 || opts->x_ix86_preferred_stack_boundary_arg > max)
3740 {
3741 if (min == max)
3742 error ("-mpreferred-stack-boundary is not supported "
3743 "for this target");
3744 else
3745 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3746 opts->x_ix86_preferred_stack_boundary_arg, min, max);
3747 }
3748 else
3749 ix86_preferred_stack_boundary
3750 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
3751 }
3752
3753 /* Set the default value for -mstackrealign. */
3754 if (opts->x_ix86_force_align_arg_pointer == -1)
3755 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3756
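/* The incoming stack boundary defaults to the preferred stack boundary;
   -mincoming-stack-boundary= below may set a different value.  */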
3757 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3758
3759 /* Validate -mincoming-stack-boundary= value or default it to
3760 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3761 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3762 if (opts_set->x_ix86_incoming_stack_boundary_arg)
3763 {
3764 if (ix86_incoming_stack_boundary_arg
3765 < (TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2)
3766 || ix86_incoming_stack_boundary_arg > 12)
3767 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3768 ix86_incoming_stack_boundary_arg,
3769 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 4 : 2);
3770 else
3771 {
3772 ix86_user_incoming_stack_boundary
3773 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3774 ix86_incoming_stack_boundary
3775 = ix86_user_incoming_stack_boundary;
3776 }
3777 }
3778
3779 /* Accept -msseregparm only if at least SSE support is enabled. */
3780 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
3781 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
3782 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3783
3784 if (opts_set->x_ix86_fpmath)
3785 {
3786 if (opts->x_ix86_fpmath & FPMATH_SSE)
3787 {
3788 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
3789 {
3790 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3791 opts->x_ix86_fpmath = FPMATH_387;
3792 }
3793 else if ((opts->x_ix86_fpmath & FPMATH_387)
3794 && !TARGET_80387_P (opts->x_target_flags))
3795 {
3796 warning (0, "387 instruction set disabled, using SSE arithmetics");
3797 opts->x_ix86_fpmath = FPMATH_SSE;
3798 }
3799 }
3800 }
3801 /* For all chips supporting SSE2, -mfpmath=sse performs better than
3802 -mfpmath=387. The latter is, however, the default on many targets since
3803 the extra 80-bit precision of temporaries is considered part of the ABI.
3804 Overwrite the default at least for -ffast-math.
3805 TODO: -mfpmath=both seems to produce similarly performing code with
3806 slightly smaller binaries. It is however not clear whether register
3807 allocation is ready for this setting.
3808 Also -mfpmath=387 codegen is overall considerably more compact (about
3809 4-5%) than SSE codegen. We may switch to 387 with -ffast-math for
3810 size-optimized functions. */
3811 else if (fast_math_flags_set_p (&global_options)
3812 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
3813 opts->x_ix86_fpmath = FPMATH_SSE;
3814 else
3815 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
3816
3817 /* If the i387 is disabled, then do not return values in it. */
3818 if (!TARGET_80387_P (opts->x_target_flags))
3819 opts->x_target_flags &= ~MASK_FLOAT_RETURNS;
3820
3821 /* Use external vectorized library in vectorizing intrinsics. */
3822 if (opts_set->x_ix86_veclibabi_type)
3823 switch (opts->x_ix86_veclibabi_type)
3824 {
3825 case ix86_veclibabi_type_svml:
3826 ix86_veclib_handler = ix86_veclibabi_svml;
3827 break;
3828
3829 case ix86_veclibabi_type_acml:
3830 ix86_veclib_handler = ix86_veclibabi_acml;
3831 break;
3832
3833 default:
3834 gcc_unreachable ();
3835 }
3836
3837 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
3838 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3839 && !opts->x_optimize_size)
3840 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3841
3842 /* If stack probes are required, the space used for large function
3843 arguments on the stack must also be probed, so enable
3844 -maccumulate-outgoing-args so this happens in the prologue. */
3845 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
3846 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3847 {
3848 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
3849 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3850 "for correctness", prefix, suffix);
3851 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3852 }
3853
3854 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3855 {
3856 char *p;
3857 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3858 p = strchr (internal_label_prefix, 'X');
3859 internal_label_prefix_len = p - internal_label_prefix;
3860 *p = '\0';
3861 }
3862
3863 /* When a scheduling description is not available, disable the scheduler
3864 pass so it won't slow down the compilation and make x87 code slower. */
3865 if (!TARGET_SCHEDULE)
3866 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
3867
3868 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3869 ix86_tune_cost->simultaneous_prefetches,
3870 opts->x_param_values,
3871 opts_set->x_param_values);
3872 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3873 ix86_tune_cost->prefetch_block,
3874 opts->x_param_values,
3875 opts_set->x_param_values);
3876 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3877 ix86_tune_cost->l1_cache_size,
3878 opts->x_param_values,
3879 opts_set->x_param_values);
3880 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3881 ix86_tune_cost->l2_cache_size,
3882 opts->x_param_values,
3883 opts_set->x_param_values);
3884
3885 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
3886 if (opts->x_flag_prefetch_loop_arrays < 0
3887 && HAVE_prefetch
3888 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
3889 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3890 opts->x_flag_prefetch_loop_arrays = 1;
3891
3892 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3893 can be optimized to ap = __builtin_next_arg (0). */
3894 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
3895 targetm.expand_builtin_va_start = NULL;
3896
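/* Select the 64-bit or 32-bit variants of the leave and TLS base
   generator functions.  */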
3897 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3898 {
3899 ix86_gen_leave = gen_leave_rex64;
3900 if (Pmode == DImode)
3901 {
3902 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3903 ix86_gen_tls_local_dynamic_base_64
3904 = gen_tls_local_dynamic_base_64_di;
3905 }
3906 else
3907 {
3908 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3909 ix86_gen_tls_local_dynamic_base_64
3910 = gen_tls_local_dynamic_base_64_si;
3911 }
3912 }
3913 else
3914 ix86_gen_leave = gen_leave;
3915
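/* Select the remaining RTL generator helpers according to the pointer mode.  */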
3916 if (Pmode == DImode)
3917 {
3918 ix86_gen_add3 = gen_adddi3;
3919 ix86_gen_sub3 = gen_subdi3;
3920 ix86_gen_sub3_carry = gen_subdi3_carry;
3921 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3922 ix86_gen_andsp = gen_anddi3;
3923 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3924 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3925 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3926 ix86_gen_monitor = gen_sse3_monitor_di;
3927 }
3928 else
3929 {
3930 ix86_gen_add3 = gen_addsi3;
3931 ix86_gen_sub3 = gen_subsi3;
3932 ix86_gen_sub3_carry = gen_subsi3_carry;
3933 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3934 ix86_gen_andsp = gen_andsi3;
3935 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3936 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3937 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3938 ix86_gen_monitor = gen_sse3_monitor_si;
3939 }
3940
3941 #ifdef USE_IX86_CLD
3942 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3943 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3944 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
3945 #endif
3946
3947 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic)
3948 {
3949 if (opts->x_flag_fentry > 0)
3950 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3951 "with -fpic");
3952 opts->x_flag_fentry = 0;
3953 }
3954 else if (TARGET_SEH)
3955 {
3956 if (opts->x_flag_fentry == 0)
3957 sorry ("-mno-fentry isn%'t compatible with SEH");
3958 opts->x_flag_fentry = 1;
3959 }
3960 else if (opts->x_flag_fentry < 0)
3961 {
3962 #if defined(PROFILE_BEFORE_PROLOGUE)
3963 opts->x_flag_fentry = 1;
3964 #else
3965 opts->x_flag_fentry = 0;
3966 #endif
3967 }
3968
3969 /* When not optimizing for size, enable vzeroupper optimization for
3970 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3971 AVX unaligned load/store. */
3972 if (!opts->x_optimize_size)
3973 {
3974 if (flag_expensive_optimizations
3975 && !(opts_set->x_target_flags & MASK_VZEROUPPER))
3976 opts->x_target_flags |= MASK_VZEROUPPER;
3977 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
3978 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3979 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3980 if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
3981 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3982 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3983 /* Enable 128-bit AVX instruction generation
3984 for the auto-vectorizer. */
3985 if (TARGET_AVX128_OPTIMAL
3986 && !(opts_set->x_target_flags & MASK_PREFER_AVX128))
3987 opts->x_target_flags |= MASK_PREFER_AVX128;
3988 }
3989
3990 if (opts->x_ix86_recip_name)
3991 {
3992 char *p = ASTRDUP (opts->x_ix86_recip_name);
3993 char *q;
3994 unsigned int mask, i;
3995 bool invert;
3996
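/* Walk the comma-separated list given to -mrecip=; entries prefixed with
   '!' clear the corresponding mask bits instead of setting them.  */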
3997 while ((q = strtok (p, ",")) != NULL)
3998 {
3999 p = NULL;
4000 if (*q == '!')
4001 {
4002 invert = true;
4003 q++;
4004 }
4005 else
4006 invert = false;
4007
4008 if (!strcmp (q, "default"))
4009 mask = RECIP_MASK_ALL;
4010 else
4011 {
4012 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4013 if (!strcmp (q, recip_options[i].string))
4014 {
4015 mask = recip_options[i].mask;
4016 break;
4017 }
4018
4019 if (i == ARRAY_SIZE (recip_options))
4020 {
4021 error ("unknown option for -mrecip=%s", q);
4022 invert = false;
4023 mask = RECIP_MASK_NONE;
4024 }
4025 }
4026
4027 opts->x_recip_mask_explicit |= mask;
4028 if (invert)
4029 opts->x_recip_mask &= ~mask;
4030 else
4031 opts->x_recip_mask |= mask;
4032 }
4033 }
4034
4035 if (TARGET_RECIP_P (opts->x_target_flags))
4036 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4037 else if (opts_set->x_target_flags & MASK_RECIP)
4038 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4039
4040 /* Default long double to 64-bit for Bionic. */
4041 if (TARGET_HAS_BIONIC
4042 && !(opts_set->x_target_flags & MASK_LONG_DOUBLE_64))
4043 opts->x_target_flags |= MASK_LONG_DOUBLE_64;
4044
4045 /* Save the initial options in case the user uses function-specific
4046 options later. */
4047 if (main_args_p)
4048 target_option_default_node = target_option_current_node
4049 = build_target_option_node (opts);
4050
4051 /* Handle stack protector */
4052 if (!opts_set->x_ix86_stack_protector_guard)
4053 opts->x_ix86_stack_protector_guard
4054 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4055
4056 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4057 if (opts->x_ix86_tune_memcpy_strategy)
4058 {
4059 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4060 ix86_parse_stringop_strategy_string (str, false);
4061 free (str);
4062 }
4063
4064 if (opts->x_ix86_tune_memset_strategy)
4065 {
4066 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4067 ix86_parse_stringop_strategy_string (str, true);
4068 free (str);
4069 }
4070 }
4071
4072 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4073
4074 static void
4075 ix86_option_override (void)
4076 {
4077 opt_pass *pass_insert_vzeroupper = make_pass_insert_vzeroupper (g);
4078 static struct register_pass_info insert_vzeroupper_info
4079 = { pass_insert_vzeroupper, "reload",
4080 1, PASS_POS_INSERT_AFTER
4081 };
4082
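/* main_args_p is true: we are handling the command-line options here,
   not an attribute((target)) override.  */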
4083 ix86_option_override_internal (true, &global_options, &global_options_set);
4084
4085
4086 /* This needs to be done at start up. It's convenient to do it here. */
4087 register_pass (&insert_vzeroupper_info);
4088 }
4089
4090 /* Update register usage after having seen the compiler flags. */
4091
4092 static void
4093 ix86_conditional_register_usage (void)
4094 {
4095 int i, c_mask;
4096 unsigned int j;
4097
4098 /* The PIC register, if it exists, is fixed. */
4099 j = PIC_OFFSET_TABLE_REGNUM;
4100 if (j != INVALID_REGNUM)
4101 fixed_regs[j] = call_used_regs[j] = 1;
4102
4103 /* For 32-bit targets, squash the REX registers. */
4104 if (! TARGET_64BIT)
4105 {
4106 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4107 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4108 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4109 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4110 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4111 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4112 }
4113
4114 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4115 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
4116 : TARGET_64BIT ? (1 << 2)
4117 : (1 << 1));
4118
4119 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4120
4121 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4122 {
4123 /* Set/reset conditionally defined registers from
4124 CALL_USED_REGISTERS initializer. */
4125 if (call_used_regs[i] > 1)
4126 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4127
4128 /* Compute the CLOBBERED_REGS register set as the call-used
4129 registers from the GENERAL_REGS register set. */
4130 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4131 && call_used_regs[i])
4132 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4133 }
4134
4135 /* If MMX is disabled, squash the registers. */
4136 if (! TARGET_MMX)
4137 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4138 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4139 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4140
4141 /* If SSE is disabled, squash the registers. */
4142 if (! TARGET_SSE)
4143 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4144 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4145 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4146
4147 /* If the FPU is disabled, squash the registers. */
4148 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4149 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4150 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4151 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4152
4153 /* If AVX512F is disabled, squash the registers. */
4154 if (! TARGET_AVX512F)
4155 {
4156 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4157 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4158
4159 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4160 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4161 }
4162
4163 /* If MPX is disabled, squash the registers. */
4164 if (! TARGET_MPX)
4165 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4166 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4167 }
4168
4169 \f
4170 /* Save the current options */
4171
4172 static void
4173 ix86_function_specific_save (struct cl_target_option *ptr,
4174 struct gcc_options *opts)
4175 {
4176 ptr->arch = ix86_arch;
4177 ptr->schedule = ix86_schedule;
4178 ptr->tune = ix86_tune;
4179 ptr->branch_cost = ix86_branch_cost;
4180 ptr->tune_defaulted = ix86_tune_defaulted;
4181 ptr->arch_specified = ix86_arch_specified;
4182 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
4183 ptr->x_ix86_target_flags_explicit = opts->x_ix86_target_flags_explicit;
4184 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
4185
4186 /* The fields are char but the variables are not; make sure the
4187 values fit in the fields. */
4188 gcc_assert (ptr->arch == ix86_arch);
4189 gcc_assert (ptr->schedule == ix86_schedule);
4190 gcc_assert (ptr->tune == ix86_tune);
4191 gcc_assert (ptr->branch_cost == ix86_branch_cost);
4192 }
4193
4194 /* Restore the current options */
4195
4196 static void
4197 ix86_function_specific_restore (struct gcc_options *opts,
4198 struct cl_target_option *ptr)
4199 {
4200 enum processor_type old_tune = ix86_tune;
4201 enum processor_type old_arch = ix86_arch;
4202 unsigned int ix86_arch_mask;
4203 int i;
4204
4205 ix86_arch = (enum processor_type) ptr->arch;
4206 ix86_schedule = (enum attr_cpu) ptr->schedule;
4207 ix86_tune = (enum processor_type) ptr->tune;
4208 opts->x_ix86_branch_cost = ptr->branch_cost;
4209 ix86_tune_defaulted = ptr->tune_defaulted;
4210 ix86_arch_specified = ptr->arch_specified;
4211 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4212 opts->x_ix86_target_flags_explicit = ptr->x_ix86_target_flags_explicit;
4213 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
4214
4215 /* Recreate the arch feature tests if the arch changed */
4216 if (old_arch != ix86_arch)
4217 {
4218 ix86_arch_mask = 1u << ix86_arch;
4219 for (i = 0; i < X86_ARCH_LAST; ++i)
4220 ix86_arch_features[i]
4221 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4222 }
4223
4224 /* Recreate the tune optimization tests */
4225 if (old_tune != ix86_tune)
4226 set_ix86_tune_features (ix86_tune, false);
4227 }
4228
4229 /* Print the current options */
4230
4231 static void
4232 ix86_function_specific_print (FILE *file, int indent,
4233 struct cl_target_option *ptr)
4234 {
4235 char *target_string
4236 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4237 NULL, NULL, ptr->x_ix86_fpmath, false);
4238
4239 fprintf (file, "%*sarch = %d (%s)\n",
4240 indent, "",
4241 ptr->arch,
4242 ((ptr->arch < TARGET_CPU_DEFAULT_max)
4243 ? cpu_names[ptr->arch]
4244 : "<unknown>"));
4245
4246 fprintf (file, "%*stune = %d (%s)\n",
4247 indent, "",
4248 ptr->tune,
4249 ((ptr->tune < TARGET_CPU_DEFAULT_max)
4250 ? cpu_names[ptr->tune]
4251 : "<unknown>"));
4252
4253 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4254
4255 if (target_string)
4256 {
4257 fprintf (file, "%*s%s\n", indent, "", target_string);
4258 free (target_string);
4259 }
4260 }
4261
4262 \f
4263 /* Inner function to process the attribute((target(...))): take an argument
4264 and set the current options from it. If we have a list, recursively
4265 process each entry of the list. */
4266
4267 static bool
4268 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4269 struct gcc_options *opts,
4270 struct gcc_options *opts_set,
4271 struct gcc_options *enum_opts_set)
4272 {
4273 char *next_optstr;
4274 bool ret = true;
4275
4276 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4277 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4278 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4279 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4280 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4281
4282 enum ix86_opt_type
4283 {
4284 ix86_opt_unknown,
4285 ix86_opt_yes,
4286 ix86_opt_no,
4287 ix86_opt_str,
4288 ix86_opt_enum,
4289 ix86_opt_isa
4290 };
4291
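/* Table of the attribute((target)) argument strings recognized here,
   together with the option, type and mask each one maps to.  */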
4292 static const struct
4293 {
4294 const char *string;
4295 size_t len;
4296 enum ix86_opt_type type;
4297 int opt;
4298 int mask;
4299 } attrs[] = {
4300 /* isa options */
4301 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4302 IX86_ATTR_ISA ("abm", OPT_mabm),
4303 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4304 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4305 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4306 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4307 IX86_ATTR_ISA ("aes", OPT_maes),
4308 IX86_ATTR_ISA ("avx", OPT_mavx),
4309 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4310 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
4311 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
4312 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
4313 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
4314 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4315 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4316 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4317 IX86_ATTR_ISA ("sse", OPT_msse),
4318 IX86_ATTR_ISA ("sse2", OPT_msse2),
4319 IX86_ATTR_ISA ("sse3", OPT_msse3),
4320 IX86_ATTR_ISA ("sse4", OPT_msse4),
4321 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4322 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4323 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4324 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4325 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4326 IX86_ATTR_ISA ("fma", OPT_mfma),
4327 IX86_ATTR_ISA ("xop", OPT_mxop),
4328 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4329 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4330 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4331 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4332 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4333 IX86_ATTR_ISA ("hle", OPT_mhle),
4334 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4335 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4336 IX86_ATTR_ISA ("adx", OPT_madx),
4337 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4338 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4339 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4340
4341 /* enum options */
4342 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4343
4344 /* string options */
4345 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4346 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4347
4348 /* flag options */
4349 IX86_ATTR_YES ("cld",
4350 OPT_mcld,
4351 MASK_CLD),
4352
4353 IX86_ATTR_NO ("fancy-math-387",
4354 OPT_mfancy_math_387,
4355 MASK_NO_FANCY_MATH_387),
4356
4357 IX86_ATTR_YES ("ieee-fp",
4358 OPT_mieee_fp,
4359 MASK_IEEE_FP),
4360
4361 IX86_ATTR_YES ("inline-all-stringops",
4362 OPT_minline_all_stringops,
4363 MASK_INLINE_ALL_STRINGOPS),
4364
4365 IX86_ATTR_YES ("inline-stringops-dynamically",
4366 OPT_minline_stringops_dynamically,
4367 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4368
4369 IX86_ATTR_NO ("align-stringops",
4370 OPT_mno_align_stringops,
4371 MASK_NO_ALIGN_STRINGOPS),
4372
4373 IX86_ATTR_YES ("recip",
4374 OPT_mrecip,
4375 MASK_RECIP),
4376
4377 };
4378
4379 /* If this is a list, recurse to get the options. */
4380 if (TREE_CODE (args) == TREE_LIST)
4381 {
4382 bool ret = true;
4383
4384 for (; args; args = TREE_CHAIN (args))
4385 if (TREE_VALUE (args)
4386 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4387 p_strings, opts, opts_set,
4388 enum_opts_set))
4389 ret = false;
4390
4391 return ret;
4392 }
4393
4394 else if (TREE_CODE (args) != STRING_CST)
4395 {
4396 error ("attribute %<target%> argument not a string");
4397 return false;
4398 }
4399
4400 /* Handle multiple arguments separated by commas. */
4401 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
4402
4403 while (next_optstr && *next_optstr != '\0')
4404 {
4405 char *p = next_optstr;
4406 char *orig_p = p;
4407 char *comma = strchr (next_optstr, ',');
4408 const char *opt_string;
4409 size_t len, opt_len;
4410 int opt;
4411 bool opt_set_p;
4412 char ch;
4413 unsigned i;
4414 enum ix86_opt_type type = ix86_opt_unknown;
4415 int mask = 0;
4416
4417 if (comma)
4418 {
4419 *comma = '\0';
4420 len = comma - next_optstr;
4421 next_optstr = comma + 1;
4422 }
4423 else
4424 {
4425 len = strlen (p);
4426 next_optstr = NULL;
4427 }
4428
4429 /* Recognize no-xxx. */
4430 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4431 {
4432 opt_set_p = false;
4433 p += 3;
4434 len -= 3;
4435 }
4436 else
4437 opt_set_p = true;
4438
4439 /* Find the option. */
4440 ch = *p;
4441 opt = N_OPTS;
4442 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4443 {
4444 type = attrs[i].type;
4445 opt_len = attrs[i].len;
4446 if (ch == attrs[i].string[0]
4447 && ((type != ix86_opt_str && type != ix86_opt_enum)
4448 ? len == opt_len
4449 : len > opt_len)
4450 && memcmp (p, attrs[i].string, opt_len) == 0)
4451 {
4452 opt = attrs[i].opt;
4453 mask = attrs[i].mask;
4454 opt_string = attrs[i].string;
4455 break;
4456 }
4457 }
4458
4459 /* Process the option. */
4460 if (opt == N_OPTS)
4461 {
4462 error ("attribute(target(\"%s\")) is unknown", orig_p);
4463 ret = false;
4464 }
4465
4466 else if (type == ix86_opt_isa)
4467 {
4468 struct cl_decoded_option decoded;
4469
4470 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4471 ix86_handle_option (opts, opts_set,
4472 &decoded, input_location);
4473 }
4474
4475 else if (type == ix86_opt_yes || type == ix86_opt_no)
4476 {
4477 if (type == ix86_opt_no)
4478 opt_set_p = !opt_set_p;
4479
4480 if (opt_set_p)
4481 opts->x_target_flags |= mask;
4482 else
4483 opts->x_target_flags &= ~mask;
4484 }
4485
4486 else if (type == ix86_opt_str)
4487 {
4488 if (p_strings[opt])
4489 {
4490 error ("option(\"%s\") was already specified", opt_string);
4491 ret = false;
4492 }
4493 else
4494 p_strings[opt] = xstrdup (p + opt_len);
4495 }
4496
4497 else if (type == ix86_opt_enum)
4498 {
4499 bool arg_ok;
4500 int value;
4501
4502 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4503 if (arg_ok)
4504 set_option (opts, enum_opts_set, opt, value,
4505 p + opt_len, DK_UNSPECIFIED, input_location,
4506 global_dc);
4507 else
4508 {
4509 error ("attribute(target(\"%s\")) is unknown", orig_p);
4510 ret = false;
4511 }
4512 }
4513
4514 else
4515 gcc_unreachable ();
4516 }
4517
4518 return ret;
4519 }
4520
4521 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4522
4523 tree
4524 ix86_valid_target_attribute_tree (tree args,
4525 struct gcc_options *opts,
4526 struct gcc_options *opts_set)
4527 {
4528 const char *orig_arch_string = ix86_arch_string;
4529 const char *orig_tune_string = ix86_tune_string;
4530 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
4531 int orig_tune_defaulted = ix86_tune_defaulted;
4532 int orig_arch_specified = ix86_arch_specified;
4533 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4534 tree t = NULL_TREE;
4535 int i;
4536 struct cl_target_option *def
4537 = TREE_TARGET_OPTION (target_option_default_node);
4538 struct gcc_options enum_opts_set;
4539
4540 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4541
4542 /* Process each of the options on the chain. */
4543 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
4544 opts_set, &enum_opts_set))
4545 return error_mark_node;
4546
4547 /* If the changed options are different from the default, rerun
4548 ix86_option_override_internal, and then save the options away.
4549 The string options are attribute options, and will be undone
4550 when we copy the save structure. */
4551 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
4552 || opts->x_target_flags != def->x_target_flags
4553 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4554 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4555 || enum_opts_set.x_ix86_fpmath)
4556 {
4557 /* If we are using the default tune= or arch=, undo the string assigned,
4558 and use the default. */
4559 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4560 opts->x_ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4561 else if (!orig_arch_specified)
4562 opts->x_ix86_arch_string = NULL;
4563
4564 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4565 opts->x_ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4566 else if (orig_tune_defaulted)
4567 opts->x_ix86_tune_string = NULL;
4568
4569 /* If fpmath= is not set, and we now have SSE on 32-bit, use it. */
4570 if (enum_opts_set.x_ix86_fpmath)
4571 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4572 else if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4573 && TARGET_SSE_P (opts->x_ix86_isa_flags))
4574 {
4575 opts->x_ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4576 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
4577 }
4578
4579 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4580 ix86_option_override_internal (false, opts, opts_set);
4581
4582 /* Add any builtin functions with the new isa if any. */
4583 ix86_add_new_builtins (opts->x_ix86_isa_flags);
4584
4585 /* Save the current options unless we are validating options for
4586 #pragma. */
4587 t = build_target_option_node (opts);
4588
4589 opts->x_ix86_arch_string = orig_arch_string;
4590 opts->x_ix86_tune_string = orig_tune_string;
4591 opts_set->x_ix86_fpmath = orig_fpmath_set;
4592
4593 /* Free up memory allocated to hold the strings. */
4594 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4595 free (option_strings[i]);
4596 }
4597
4598 return t;
4599 }
4600
4601 /* Hook to validate attribute((target("string"))). */
4602
4603 static bool
4604 ix86_valid_target_attribute_p (tree fndecl,
4605 tree ARG_UNUSED (name),
4606 tree args,
4607 int ARG_UNUSED (flags))
4608 {
4609 struct gcc_options func_options;
4610 tree new_target, new_optimize;
4611 bool ret = true;
4612
4613 /* attribute((target("default"))) does nothing, beyond
4614 affecting multi-versioning. */
4615 if (TREE_VALUE (args)
4616 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4617 && TREE_CHAIN (args) == NULL_TREE
4618 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4619 return true;
4620
4621 tree old_optimize = build_optimization_node (&global_options);
4622
4623 /* Get the optimization options of the current function. */
4624 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4625
4626 if (!func_optimize)
4627 func_optimize = old_optimize;
4628
4629 /* Init func_options. */
4630 memset (&func_options, 0, sizeof (func_options));
4631 init_options_struct (&func_options, NULL);
4632 lang_hooks.init_options_struct (&func_options);
4633
4634 cl_optimization_restore (&func_options,
4635 TREE_OPTIMIZATION (func_optimize));
4636
4637 /* Initialize func_options to the default before its target options can
4638 be set. */
4639 cl_target_option_restore (&func_options,
4640 TREE_TARGET_OPTION (target_option_default_node));
4641
4642 new_target = ix86_valid_target_attribute_tree (args, &func_options,
4643 &global_options_set);
4644
4645 new_optimize = build_optimization_node (&func_options);
4646
4647 if (new_target == error_mark_node)
4648 ret = false;
4649
4650 else if (fndecl && new_target)
4651 {
4652 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4653
4654 if (old_optimize != new_optimize)
4655 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4656 }
4657
4658 return ret;
4659 }
4660
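/* A hypothetical use of the hook above:

     int foo (void) __attribute__((target("sse4.1,arch=core2")));

   The string is parsed by ix86_valid_target_attribute_tree and, on
   success, the resulting TARGET_OPTION_NODE is recorded in
   DECL_FUNCTION_SPECIFIC_TARGET of FOO.  The bare string "default" is
   accepted without any of this; it only matters for multi-versioning.  */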
4661 \f
4662 /* Hook to determine if one function can safely inline another. */
4663
4664 static bool
4665 ix86_can_inline_p (tree caller, tree callee)
4666 {
4667 bool ret = false;
4668 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4669 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4670
4671 /* If callee has no option attributes, then it is ok to inline. */
4672 if (!callee_tree)
4673 ret = true;
4674
4675 /* If caller has no option attributes, but callee does, then it is not ok to
4676 inline. */
4677 else if (!caller_tree)
4678 ret = false;
4679
4680 else
4681 {
4682 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4683 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4684
4685 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4 function
4686 can inline an SSE2 function but an SSE2 function can't inline an SSE4
4687 function. */
4688 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4689 != callee_opts->x_ix86_isa_flags)
4690 ret = false;
4691
4692 /* See if we have the same non-isa options. */
4693 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4694 ret = false;
4695
4696 /* See if arch, tune, etc. are the same. */
4697 else if (caller_opts->arch != callee_opts->arch)
4698 ret = false;
4699
4700 else if (caller_opts->tune != callee_opts->tune)
4701 ret = false;
4702
4703 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4704 ret = false;
4705
4706 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4707 ret = false;
4708
4709 else
4710 ret = true;
4711 }
4712
4713 return ret;
4714 }
4715
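/* For example (an illustration, not something checked here): a caller
   marked target("avx") may inline a callee marked target("sse2"), since
   the callee's ISA bits are a subset of the caller's, while the reverse
   direction is rejected.  Any difference in arch=, tune=, fpmath= or
   branch cost likewise blocks inlining.  */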
4716 \f
4717 /* Remember the last target of ix86_set_current_function. */
4718 static GTY(()) tree ix86_previous_fndecl;
4719
4720 /* Invalidate ix86_previous_fndecl cache. */
4721 void
4722 ix86_reset_previous_fndecl (void)
4723 {
4724 ix86_previous_fndecl = NULL_TREE;
4725 }
4726
4727 /* Establish appropriate back-end context for processing the function
4728 FNDECL. The argument might be NULL to indicate processing at top
4729 level, outside of any function scope. */
4730 static void
4731 ix86_set_current_function (tree fndecl)
4732 {
4733 /* Only change the context if the function changes. This hook is called
4734 several times in the course of compiling a function, and we don't want to
4735 slow things down too much or call target_reinit when it isn't safe. */
4736 if (fndecl && fndecl != ix86_previous_fndecl)
4737 {
4738 tree old_tree = (ix86_previous_fndecl
4739 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4740 : NULL_TREE);
4741
4742 tree new_tree = (fndecl
4743 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4744 : NULL_TREE);
4745
4746 ix86_previous_fndecl = fndecl;
4747 if (old_tree == new_tree)
4748 ;
4749
4750 else if (new_tree)
4751 {
4752 cl_target_option_restore (&global_options,
4753 TREE_TARGET_OPTION (new_tree));
4754 target_reinit ();
4755 }
4756
4757 else if (old_tree)
4758 {
4759 struct cl_target_option *def
4760 = TREE_TARGET_OPTION (target_option_current_node);
4761
4762 cl_target_option_restore (&global_options, def);
4763 target_reinit ();
4764 }
4765 }
4766 }
4767
4768 \f
4769 /* Return true if this goes in large data/bss. */
4770
4771 static bool
4772 ix86_in_large_data_p (tree exp)
4773 {
4774 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4775 return false;
4776
4777 /* Functions are never large data. */
4778 if (TREE_CODE (exp) == FUNCTION_DECL)
4779 return false;
4780
4781 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4782 {
4783 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4784 if (strcmp (section, ".ldata") == 0
4785 || strcmp (section, ".lbss") == 0)
4786 return true;
4787 return false;
4788 }
4789 else
4790 {
4791 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4792
4793 /* If this is an incomplete type with size 0, then we can't put it
4794 in data because it might be too big when completed. */
4795 if (!size || size > ix86_section_threshold)
4796 return true;
4797 }
4798
4799 return false;
4800 }
4801
4802 /* Switch to the appropriate section for output of DECL.
4803 DECL is either a `VAR_DECL' node or a constant of some sort.
4804 RELOC indicates whether forming the initial value of DECL requires
4805 link-time relocations. */
4806
4807 ATTRIBUTE_UNUSED static section *
4808 x86_64_elf_select_section (tree decl, int reloc,
4809 unsigned HOST_WIDE_INT align)
4810 {
4811 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4812 && ix86_in_large_data_p (decl))
4813 {
4814 const char *sname = NULL;
4815 unsigned int flags = SECTION_WRITE;
4816 switch (categorize_decl_for_section (decl, reloc))
4817 {
4818 case SECCAT_DATA:
4819 sname = ".ldata";
4820 break;
4821 case SECCAT_DATA_REL:
4822 sname = ".ldata.rel";
4823 break;
4824 case SECCAT_DATA_REL_LOCAL:
4825 sname = ".ldata.rel.local";
4826 break;
4827 case SECCAT_DATA_REL_RO:
4828 sname = ".ldata.rel.ro";
4829 break;
4830 case SECCAT_DATA_REL_RO_LOCAL:
4831 sname = ".ldata.rel.ro.local";
4832 break;
4833 case SECCAT_BSS:
4834 sname = ".lbss";
4835 flags |= SECTION_BSS;
4836 break;
4837 case SECCAT_RODATA:
4838 case SECCAT_RODATA_MERGE_STR:
4839 case SECCAT_RODATA_MERGE_STR_INIT:
4840 case SECCAT_RODATA_MERGE_CONST:
4841 sname = ".lrodata";
4842 flags = 0;
4843 break;
4844 case SECCAT_SRODATA:
4845 case SECCAT_SDATA:
4846 case SECCAT_SBSS:
4847 gcc_unreachable ();
4848 case SECCAT_TEXT:
4849 case SECCAT_TDATA:
4850 case SECCAT_TBSS:
4851 /* We don't split these for the medium model.  Place them into
4852 default sections and hope for the best. */
4853 break;
4854 }
4855 if (sname)
4856 {
4857 /* We might get called with string constants, but get_named_section
4858 doesn't like them as they are not DECLs. Also, we need to set
4859 flags in that case. */
4860 if (!DECL_P (decl))
4861 return get_section (sname, flags, NULL);
4862 return get_named_section (decl, sname, reloc);
4863 }
4864 }
4865 return default_elf_select_section (decl, reloc, align);
4866 }
4867
4868 /* Select a set of attributes for section NAME based on the properties
4869 of DECL and whether or not RELOC indicates that DECL's initializer
4870 might contain runtime relocations. */
4871
4872 static unsigned int ATTRIBUTE_UNUSED
4873 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4874 {
4875 unsigned int flags = default_section_type_flags (decl, name, reloc);
4876
4877 if (decl == NULL_TREE
4878 && (strcmp (name, ".ldata.rel.ro") == 0
4879 || strcmp (name, ".ldata.rel.ro.local") == 0))
4880 flags |= SECTION_RELRO;
4881
4882 if (strcmp (name, ".lbss") == 0
4883 || strncmp (name, ".lbss.", 5) == 0
4884 || strncmp (name, ".gnu.linkonce.lb.", 16) == 0)
4885 flags |= SECTION_BSS;
4886
4887 return flags;
4888 }
4889
4890 /* Build up a unique section name, expressed as a
4891 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4892 RELOC indicates whether the initial value of EXP requires
4893 link-time relocations. */
4894
4895 static void ATTRIBUTE_UNUSED
4896 x86_64_elf_unique_section (tree decl, int reloc)
4897 {
4898 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4899 && ix86_in_large_data_p (decl))
4900 {
4901 const char *prefix = NULL;
4902 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4903 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4904
4905 switch (categorize_decl_for_section (decl, reloc))
4906 {
4907 case SECCAT_DATA:
4908 case SECCAT_DATA_REL:
4909 case SECCAT_DATA_REL_LOCAL:
4910 case SECCAT_DATA_REL_RO:
4911 case SECCAT_DATA_REL_RO_LOCAL:
4912 prefix = one_only ? ".ld" : ".ldata";
4913 break;
4914 case SECCAT_BSS:
4915 prefix = one_only ? ".lb" : ".lbss";
4916 break;
4917 case SECCAT_RODATA:
4918 case SECCAT_RODATA_MERGE_STR:
4919 case SECCAT_RODATA_MERGE_STR_INIT:
4920 case SECCAT_RODATA_MERGE_CONST:
4921 prefix = one_only ? ".lr" : ".lrodata";
4922 break;
4923 case SECCAT_SRODATA:
4924 case SECCAT_SDATA:
4925 case SECCAT_SBSS:
4926 gcc_unreachable ();
4927 case SECCAT_TEXT:
4928 case SECCAT_TDATA:
4929 case SECCAT_TBSS:
4930 /* We don't split these for the medium model.  Place them into
4931 default sections and hope for the best. */
4932 break;
4933 }
4934 if (prefix)
4935 {
4936 const char *name, *linkonce;
4937 char *string;
4938
4939 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4940 name = targetm.strip_name_encoding (name);
4941
4942 /* If we're using one_only, then there needs to be a .gnu.linkonce
4943 prefix to the section name. */
4944 linkonce = one_only ? ".gnu.linkonce" : "";
4945
4946 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4947
4948 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4949 return;
4950 }
4951 }
4952 default_unique_section (decl, reloc);
4953 }
4954
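/* A sketch of the resulting names, assuming a large variable "big_array"
   in the medium code model: a writable definition is placed in
   ".ldata.big_array", or in ".gnu.linkonce.ld.big_array" when one_only
   sections are needed, while a read-only definition uses the ".lr" /
   ".lrodata" prefixes instead.  */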
4955 #ifdef COMMON_ASM_OP
4956 /* This says how to output assembler code to declare an
4957 uninitialized external linkage data object.
4958
4959 For medium model x86-64 we need to use the .largecomm directive for
4960 large objects. */
4961 void
4962 x86_elf_aligned_common (FILE *file,
4963 const char *name, unsigned HOST_WIDE_INT size,
4964 int align)
4965 {
4966 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4967 && size > (unsigned int)ix86_section_threshold)
4968 fputs (".largecomm\t", file);
4969 else
4970 fputs (COMMON_ASM_OP, file);
4971 assemble_name (file, name);
4972 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4973 size, align / BITS_PER_UNIT);
4974 }
4975 #endif
4976
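/* Assuming the usual ELF definition of COMMON_ASM_OP ("\t.comm\t"), the
   function above emits something like

       .largecomm  huge_buf,1048576,32

   for a common object above the -mlarge-data-threshold limit in the
   medium model, and a plain ".comm small_buf,16,8" otherwise.  Exact
   spacing depends on the subtarget macros.  */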
4977 /* Utility function for targets to use in implementing
4978 ASM_OUTPUT_ALIGNED_BSS. */
4979
4980 void
4981 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4982 const char *name, unsigned HOST_WIDE_INT size,
4983 int align)
4984 {
4985 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4986 && size > (unsigned int)ix86_section_threshold)
4987 switch_to_section (get_named_section (decl, ".lbss", 0));
4988 else
4989 switch_to_section (bss_section);
4990 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4991 #ifdef ASM_DECLARE_OBJECT_NAME
4992 last_assemble_variable_decl = decl;
4993 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4994 #else
4995 /* Standard thing is just output label for the object. */
4996 ASM_OUTPUT_LABEL (file, name);
4997 #endif /* ASM_DECLARE_OBJECT_NAME */
4998 ASM_OUTPUT_SKIP (file, size ? size : 1);
4999 }
5000 \f
5001 /* Decide whether we must probe the stack before any space allocation
5002 on this target. It's essentially TARGET_STACK_PROBE except when
5003 -fstack-check causes the stack to be already probed differently. */
5004
5005 bool
5006 ix86_target_stack_probe (void)
5007 {
5008 /* Do not probe the stack twice if static stack checking is enabled. */
5009 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5010 return false;
5011
5012 return TARGET_STACK_PROBE;
5013 }
5014 \f
5015 /* Decide whether we can make a sibling call to a function. DECL is the
5016 declaration of the function being targeted by the call and EXP is the
5017 CALL_EXPR representing the call. */
5018
5019 static bool
5020 ix86_function_ok_for_sibcall (tree decl, tree exp)
5021 {
5022 tree type, decl_or_type;
5023 rtx a, b;
5024
5025 /* If we are generating position-independent code, we cannot sibcall
5026 optimize any indirect call, or a direct call to a global function,
5027 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
5028 if (!TARGET_MACHO
5029 && !TARGET_64BIT
5030 && flag_pic
5031 && (!decl || !targetm.binds_local_p (decl)))
5032 return false;
5033
5034 /* If we need to align the outgoing stack, then sibcalling would
5035 unalign the stack, which may break the called function. */
5036 if (ix86_minimum_incoming_stack_boundary (true)
5037 < PREFERRED_STACK_BOUNDARY)
5038 return false;
5039
5040 if (decl)
5041 {
5042 decl_or_type = decl;
5043 type = TREE_TYPE (decl);
5044 }
5045 else
5046 {
5047 /* We're looking at the CALL_EXPR, we need the type of the function. */
5048 type = CALL_EXPR_FN (exp); /* pointer expression */
5049 type = TREE_TYPE (type); /* pointer type */
5050 type = TREE_TYPE (type); /* function type */
5051 decl_or_type = type;
5052 }
5053
5054 /* Check that the return value locations are the same. For example,
5055 if we are returning floats on the 80387 register stack, we cannot
5056 make a sibcall from a function that doesn't return a float to a
5057 function that does or, conversely, from a function that does return
5058 a float to a function that doesn't; the necessary stack adjustment
5059 would not be executed. This is also the place we notice
5060 differences in the return value ABI. Note that it is ok for one
5061 of the functions to have void return type as long as the return
5062 value of the other is passed in a register. */
5063 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
5064 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
5065 cfun->decl, false);
5066 if (STACK_REG_P (a) || STACK_REG_P (b))
5067 {
5068 if (!rtx_equal_p (a, b))
5069 return false;
5070 }
5071 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
5072 ;
5073 else if (!rtx_equal_p (a, b))
5074 return false;
5075
5076 if (TARGET_64BIT)
5077 {
5078 /* The SYSV ABI has more call-clobbered registers;
5079 disallow sibcalls from MS to SYSV. */
5080 if (cfun->machine->call_abi == MS_ABI
5081 && ix86_function_type_abi (type) == SYSV_ABI)
5082 return false;
5083 }
5084 else
5085 {
5086 /* If this call is indirect, we'll need to be able to use a
5087 call-clobbered register for the address of the target function.
5088 Make sure that all such registers are not used for passing
5089 parameters. Note that DLLIMPORT functions are indirect. */
5090 if (!decl
5091 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
5092 {
5093 if (ix86_function_regparm (type, NULL) >= 3)
5094 {
5095 /* ??? Need to count the actual number of registers to be used,
5096 not the possible number of registers. Fix later. */
5097 return false;
5098 }
5099 }
5100 }
5101
5102 /* Otherwise okay. That also includes certain types of indirect calls. */
5103 return true;
5104 }
5105
5106 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
5107 and "sseregparm" calling convention attributes;
5108 arguments as in struct attribute_spec.handler. */
5109
5110 static tree
5111 ix86_handle_cconv_attribute (tree *node, tree name,
5112 tree args,
5113 int flags ATTRIBUTE_UNUSED,
5114 bool *no_add_attrs)
5115 {
5116 if (TREE_CODE (*node) != FUNCTION_TYPE
5117 && TREE_CODE (*node) != METHOD_TYPE
5118 && TREE_CODE (*node) != FIELD_DECL
5119 && TREE_CODE (*node) != TYPE_DECL)
5120 {
5121 warning (OPT_Wattributes, "%qE attribute only applies to functions",
5122 name);
5123 *no_add_attrs = true;
5124 return NULL_TREE;
5125 }
5126
5127 /* Can combine regparm with all attributes but fastcall and thiscall. */
5128 if (is_attribute_p ("regparm", name))
5129 {
5130 tree cst;
5131
5132 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5133 {
5134 error ("fastcall and regparm attributes are not compatible");
5135 }
5136
5137 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5138 {
5139 error ("regparam and thiscall attributes are not compatible");
5140 }
5141
5142 cst = TREE_VALUE (args);
5143 if (TREE_CODE (cst) != INTEGER_CST)
5144 {
5145 warning (OPT_Wattributes,
5146 "%qE attribute requires an integer constant argument",
5147 name);
5148 *no_add_attrs = true;
5149 }
5150 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
5151 {
5152 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
5153 name, REGPARM_MAX);
5154 *no_add_attrs = true;
5155 }
5156
5157 return NULL_TREE;
5158 }
5159
5160 if (TARGET_64BIT)
5161 {
5162 /* Do not warn when emulating the MS ABI. */
5163 if ((TREE_CODE (*node) != FUNCTION_TYPE
5164 && TREE_CODE (*node) != METHOD_TYPE)
5165 || ix86_function_type_abi (*node) != MS_ABI)
5166 warning (OPT_Wattributes, "%qE attribute ignored",
5167 name);
5168 *no_add_attrs = true;
5169 return NULL_TREE;
5170 }
5171
5172 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
5173 if (is_attribute_p ("fastcall", name))
5174 {
5175 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5176 {
5177 error ("fastcall and cdecl attributes are not compatible");
5178 }
5179 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5180 {
5181 error ("fastcall and stdcall attributes are not compatible");
5182 }
5183 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
5184 {
5185 error ("fastcall and regparm attributes are not compatible");
5186 }
5187 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5188 {
5189 error ("fastcall and thiscall attributes are not compatible");
5190 }
5191 }
5192
5193 /* Can combine stdcall with fastcall (redundant), regparm and
5194 sseregparm. */
5195 else if (is_attribute_p ("stdcall", name))
5196 {
5197 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5198 {
5199 error ("stdcall and cdecl attributes are not compatible");
5200 }
5201 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5202 {
5203 error ("stdcall and fastcall attributes are not compatible");
5204 }
5205 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5206 {
5207 error ("stdcall and thiscall attributes are not compatible");
5208 }
5209 }
5210
5211 /* Can combine cdecl with regparm and sseregparm. */
5212 else if (is_attribute_p ("cdecl", name))
5213 {
5214 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5215 {
5216 error ("stdcall and cdecl attributes are not compatible");
5217 }
5218 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5219 {
5220 error ("fastcall and cdecl attributes are not compatible");
5221 }
5222 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5223 {
5224 error ("cdecl and thiscall attributes are not compatible");
5225 }
5226 }
5227 else if (is_attribute_p ("thiscall", name))
5228 {
5229 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5230 warning (OPT_Wattributes, "%qE attribute is used for none class-method",
5231 name);
5232 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5233 {
5234 error ("stdcall and thiscall attributes are not compatible");
5235 }
5236 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5237 {
5238 error ("fastcall and thiscall attributes are not compatible");
5239 }
5240 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5241 {
5242 error ("cdecl and thiscall attributes are not compatible");
5243 }
5244 }
5245
5246 /* Can combine sseregparm with all attributes. */
5247
5248 return NULL_TREE;
5249 }
5250
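/* Hypothetical examples of what the handler above accepts and rejects
   for 32-bit code:

     void f (int, int) __attribute__((regparm(3), sseregparm));   OK
     void g (int) __attribute__((fastcall, regparm(2)));          error
     void h (int) __attribute__((stdcall, cdecl));                error

   For 64-bit code these attributes are ignored with a warning, except
   that the warning is suppressed when the MS ABI is being emulated.  */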
5251 /* The transactional memory builtins are implicitly regparm or fastcall
5252 depending on the ABI. Override the generic do-nothing attribute that
5253 these builtins were declared with, and replace it with one of the two
5254 attributes that we expect elsewhere. */
5255
5256 static tree
5257 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5258 tree args ATTRIBUTE_UNUSED,
5259 int flags, bool *no_add_attrs)
5260 {
5261 tree alt;
5262
5263 /* In no case do we want to add the placeholder attribute. */
5264 *no_add_attrs = true;
5265
5266 /* The 64-bit ABI is unchanged for transactional memory. */
5267 if (TARGET_64BIT)
5268 return NULL_TREE;
5269
5270 /* ??? Is there a better way to validate 32-bit Windows? We have
5271 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5272 if (CHECK_STACK_LIMIT > 0)
5273 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5274 else
5275 {
5276 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5277 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5278 }
5279 decl_attributes (node, alt, flags);
5280
5281 return NULL_TREE;
5282 }
5283
5284 /* This function determines from TYPE the calling-convention. */
5285
5286 unsigned int
5287 ix86_get_callcvt (const_tree type)
5288 {
5289 unsigned int ret = 0;
5290 bool is_stdarg;
5291 tree attrs;
5292
5293 if (TARGET_64BIT)
5294 return IX86_CALLCVT_CDECL;
5295
5296 attrs = TYPE_ATTRIBUTES (type);
5297 if (attrs != NULL_TREE)
5298 {
5299 if (lookup_attribute ("cdecl", attrs))
5300 ret |= IX86_CALLCVT_CDECL;
5301 else if (lookup_attribute ("stdcall", attrs))
5302 ret |= IX86_CALLCVT_STDCALL;
5303 else if (lookup_attribute ("fastcall", attrs))
5304 ret |= IX86_CALLCVT_FASTCALL;
5305 else if (lookup_attribute ("thiscall", attrs))
5306 ret |= IX86_CALLCVT_THISCALL;
5307
5308 /* Regparm isn't allowed for thiscall and fastcall. */
5309 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5310 {
5311 if (lookup_attribute ("regparm", attrs))
5312 ret |= IX86_CALLCVT_REGPARM;
5313 if (lookup_attribute ("sseregparm", attrs))
5314 ret |= IX86_CALLCVT_SSEREGPARM;
5315 }
5316
5317 if (IX86_BASE_CALLCVT(ret) != 0)
5318 return ret;
5319 }
5320
5321 is_stdarg = stdarg_p (type);
5322 if (TARGET_RTD && !is_stdarg)
5323 return IX86_CALLCVT_STDCALL | ret;
5324
5325 if (ret != 0
5326 || is_stdarg
5327 || TREE_CODE (type) != METHOD_TYPE
5328 || ix86_function_type_abi (type) != MS_ABI)
5329 return IX86_CALLCVT_CDECL | ret;
5330
5331 return IX86_CALLCVT_THISCALL;
5332 }
5333
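/* For instance (a sketch, not used by the compiler itself): a 32-bit
   function type carrying both "stdcall" and "regparm" attributes yields
   IX86_CALLCVT_STDCALL | IX86_CALLCVT_REGPARM, while a prototyped
   METHOD_TYPE with no attributes under the MS ABI, without -mrtd, falls
   through to the final return and gets IX86_CALLCVT_THISCALL.  */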
5334 /* Return 0 if the attributes for two types are incompatible, 1 if they
5335 are compatible, and 2 if they are nearly compatible (which causes a
5336 warning to be generated). */
5337
5338 static int
5339 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5340 {
5341 unsigned int ccvt1, ccvt2;
5342
5343 if (TREE_CODE (type1) != FUNCTION_TYPE
5344 && TREE_CODE (type1) != METHOD_TYPE)
5345 return 1;
5346
5347 ccvt1 = ix86_get_callcvt (type1);
5348 ccvt2 = ix86_get_callcvt (type2);
5349 if (ccvt1 != ccvt2)
5350 return 0;
5351 if (ix86_function_regparm (type1, NULL)
5352 != ix86_function_regparm (type2, NULL))
5353 return 0;
5354
5355 return 1;
5356 }
5357 \f
5358 /* Return the regparm value for a function with the indicated TYPE and DECL.
5359 DECL may be NULL when calling function indirectly
5360 or considering a libcall. */
5361
5362 static int
5363 ix86_function_regparm (const_tree type, const_tree decl)
5364 {
5365 tree attr;
5366 int regparm;
5367 unsigned int ccvt;
5368
5369 if (TARGET_64BIT)
5370 return (ix86_function_type_abi (type) == SYSV_ABI
5371 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5372 ccvt = ix86_get_callcvt (type);
5373 regparm = ix86_regparm;
5374
5375 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5376 {
5377 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5378 if (attr)
5379 {
5380 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5381 return regparm;
5382 }
5383 }
5384 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5385 return 2;
5386 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5387 return 1;
5388
5389 /* Use register calling convention for local functions when possible. */
5390 if (decl
5391 && TREE_CODE (decl) == FUNCTION_DECL
5392 && optimize
5393 && !(profile_flag && !flag_fentry))
5394 {
5395 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5396 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5397 if (i && i->local && i->can_change_signature)
5398 {
5399 int local_regparm, globals = 0, regno;
5400
5401 /* Make sure no regparm register is taken by a
5402 fixed register variable. */
5403 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5404 if (fixed_regs[local_regparm])
5405 break;
5406
5407 /* We don't want to use regparm(3) for nested functions as
5408 these use a static chain pointer in the third argument. */
5409 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5410 local_regparm = 2;
5411
5412 /* In 32-bit mode save a register for the split stack. */
5413 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5414 local_regparm = 2;
5415
5416 /* Each fixed register usage increases register pressure,
5417 so fewer registers should be used for argument passing.
5418 This functionality can be overridden by an explicit
5419 regparm value. */
5420 for (regno = AX_REG; regno <= DI_REG; regno++)
5421 if (fixed_regs[regno])
5422 globals++;
5423
5424 local_regparm
5425 = globals < local_regparm ? local_regparm - globals : 0;
5426
5427 if (local_regparm > regparm)
5428 regparm = local_regparm;
5429 }
5430 }
5431
5432 return regparm;
5433 }
5434
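/* As an example (illustrative only), in 32-bit code

     int sum3 (int, int, int) __attribute__((regparm(3)));

   makes the function above return 3, so the three arguments travel in
   %eax, %edx and %ecx instead of on the stack; "fastcall" acts like
   regparm(2) using %ecx/%edx, and "thiscall" like regparm(1) using
   %ecx.  */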
5435 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5436 DFmode (2) arguments in SSE registers for a function with the
5437 indicated TYPE and DECL. DECL may be NULL when calling function
5438 indirectly or considering a libcall. Otherwise return 0. */
5439
5440 static int
5441 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5442 {
5443 gcc_assert (!TARGET_64BIT);
5444
5445 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5446 by the sseregparm attribute. */
5447 if (TARGET_SSEREGPARM
5448 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5449 {
5450 if (!TARGET_SSE)
5451 {
5452 if (warn)
5453 {
5454 if (decl)
5455 error ("calling %qD with attribute sseregparm without "
5456 "SSE/SSE2 enabled", decl);
5457 else
5458 error ("calling %qT with attribute sseregparm without "
5459 "SSE/SSE2 enabled", type);
5460 }
5461 return 0;
5462 }
5463
5464 return 2;
5465 }
5466
5467 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5468 (and DFmode for SSE2) arguments in SSE registers. */
5469 if (decl && TARGET_SSE_MATH && optimize
5470 && !(profile_flag && !flag_fentry))
5471 {
5472 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5473 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5474 if (i && i->local && i->can_change_signature)
5475 return TARGET_SSE2 ? 2 : 1;
5476 }
5477
5478 return 0;
5479 }
5480
5481 /* Return true if EAX is live at the start of the function. Used by
5482 ix86_expand_prologue to determine if we need special help before
5483 calling allocate_stack_worker. */
5484
5485 static bool
5486 ix86_eax_live_at_start_p (void)
5487 {
5488 /* Cheat. Don't bother working forward from ix86_function_regparm
5489 to the function type to whether an actual argument is located in
5490 eax. Instead just look at cfg info, which is still close enough
5491 to correct at this point. This gives false positives for broken
5492 functions that might use uninitialized data that happens to be
5493 allocated in eax, but who cares? */
5494 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5495 }
5496
5497 static bool
5498 ix86_keep_aggregate_return_pointer (tree fntype)
5499 {
5500 tree attr;
5501
5502 if (!TARGET_64BIT)
5503 {
5504 attr = lookup_attribute ("callee_pop_aggregate_return",
5505 TYPE_ATTRIBUTES (fntype));
5506 if (attr)
5507 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5508
5509 /* For 32-bit MS-ABI the default is to keep aggregate
5510 return pointer. */
5511 if (ix86_function_type_abi (fntype) == MS_ABI)
5512 return true;
5513 }
5514 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5515 }
5516
5517 /* Value is the number of bytes of arguments automatically
5518 popped when returning from a subroutine call.
5519 FUNDECL is the declaration node of the function (as a tree),
5520 FUNTYPE is the data type of the function (as a tree),
5521 or for a library call it is an identifier node for the subroutine name.
5522 SIZE is the number of bytes of arguments passed on the stack.
5523
5524 On the 80386, the RTD insn may be used to pop them if the number
5525 of args is fixed, but if the number is variable then the caller
5526 must pop them all. RTD can't be used for library calls now
5527 because the library is compiled with the Unix compiler.
5528 Use of RTD is a selectable option, since it is incompatible with
5529 standard Unix calling sequences. If the option is not selected,
5530 the caller must always pop the args.
5531
5532 The attribute stdcall is equivalent to RTD on a per module basis. */
5533
5534 static int
5535 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5536 {
5537 unsigned int ccvt;
5538
5539 /* None of the 64-bit ABIs pop arguments. */
5540 if (TARGET_64BIT)
5541 return 0;
5542
5543 ccvt = ix86_get_callcvt (funtype);
5544
5545 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5546 | IX86_CALLCVT_THISCALL)) != 0
5547 && ! stdarg_p (funtype))
5548 return size;
5549
5550 /* Lose any fake structure return argument if it is passed on the stack. */
5551 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5552 && !ix86_keep_aggregate_return_pointer (funtype))
5553 {
5554 int nregs = ix86_function_regparm (funtype, fundecl);
5555 if (nregs == 0)
5556 return GET_MODE_SIZE (Pmode);
5557 }
5558
5559 return 0;
5560 }
5561
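/* For illustration: a non-variadic 32-bit function declared

     void cb (int, int, int) __attribute__((stdcall));

   has SIZE == 12 here, so the hook returns 12 and the callee pops its
   arguments with "ret $12"; a cdecl function returns 0 and leaves the
   12 bytes for its callers to pop.  */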
5562 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5563
5564 static bool
5565 ix86_legitimate_combined_insn (rtx insn)
5566 {
5567 /* Check operand constraints in case hard registers were propagated
5568 into insn pattern. This check prevents combine pass from
5569 generating insn patterns with invalid hard register operands.
5570 These invalid insns can eventually confuse reload to error out
5571 with a spill failure. See also PRs 46829 and 46843. */
5572 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5573 {
5574 int i;
5575
5576 extract_insn (insn);
5577 preprocess_constraints ();
5578
5579 for (i = 0; i < recog_data.n_operands; i++)
5580 {
5581 rtx op = recog_data.operand[i];
5582 enum machine_mode mode = GET_MODE (op);
5583 struct operand_alternative *op_alt;
5584 int offset = 0;
5585 bool win;
5586 int j;
5587
5588 /* A unary operator may be accepted by the predicate, but it
5589 is irrelevant for matching constraints. */
5590 if (UNARY_P (op))
5591 op = XEXP (op, 0);
5592
5593 if (GET_CODE (op) == SUBREG)
5594 {
5595 if (REG_P (SUBREG_REG (op))
5596 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5597 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5598 GET_MODE (SUBREG_REG (op)),
5599 SUBREG_BYTE (op),
5600 GET_MODE (op));
5601 op = SUBREG_REG (op);
5602 }
5603
5604 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5605 continue;
5606
5607 op_alt = recog_op_alt[i];
5608
5609 /* Operand has no constraints, anything is OK. */
5610 win = !recog_data.n_alternatives;
5611
5612 for (j = 0; j < recog_data.n_alternatives; j++)
5613 {
5614 if (op_alt[j].anything_ok
5615 || (op_alt[j].matches != -1
5616 && operands_match_p
5617 (recog_data.operand[i],
5618 recog_data.operand[op_alt[j].matches]))
5619 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5620 {
5621 win = true;
5622 break;
5623 }
5624 }
5625
5626 if (!win)
5627 return false;
5628 }
5629 }
5630
5631 return true;
5632 }
5633 \f
5634 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5635
5636 static unsigned HOST_WIDE_INT
5637 ix86_asan_shadow_offset (void)
5638 {
5639 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5640 : HOST_WIDE_INT_C (0x7fff8000))
5641 : (HOST_WIDE_INT_1 << 29);
5642 }
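/* The value above is consumed by the sanitizer as
   shadow = (addr >> 3) + offset, so e.g. LP64 Linux instrumentation
   ends up adding 0x7fff8000.  This is only a reminder of how the hook's
   result is used; the actual computation lives in asan.c.  */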
5643 \f
5644 /* Argument support functions. */
5645
5646 /* Return true when register may be used to pass function parameters. */
5647 bool
5648 ix86_function_arg_regno_p (int regno)
5649 {
5650 int i;
5651 const int *parm_regs;
5652
5653 if (!TARGET_64BIT)
5654 {
5655 if (TARGET_MACHO)
5656 return (regno < REGPARM_MAX
5657 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5658 else
5659 return (regno < REGPARM_MAX
5660 || (TARGET_MMX && MMX_REGNO_P (regno)
5661 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5662 || (TARGET_SSE && SSE_REGNO_P (regno)
5663 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5664 }
5665
5666 if (TARGET_SSE && SSE_REGNO_P (regno)
5667 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5668 return true;
5669
5670 /* TODO: The function should depend on current function ABI but
5671 builtins.c would need updating then. Therefore we use the
5672 default ABI. */
5673
5674 /* RAX is used as hidden argument to va_arg functions. */
5675 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5676 return true;
5677
5678 if (ix86_abi == MS_ABI)
5679 parm_regs = x86_64_ms_abi_int_parameter_registers;
5680 else
5681 parm_regs = x86_64_int_parameter_registers;
5682 for (i = 0; i < (ix86_abi == MS_ABI
5683 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5684 if (regno == parm_regs[i])
5685 return true;
5686 return false;
5687 }
5688
5689 /* Return true if we do not know how to pass TYPE solely in registers. */
5690
5691 static bool
5692 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5693 {
5694 if (must_pass_in_stack_var_size_or_pad (mode, type))
5695 return true;
5696
5697 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5698 The layout_type routine is crafty and tries to trick us into passing
5699 currently unsupported vector types on the stack by using TImode. */
5700 return (!TARGET_64BIT && mode == TImode
5701 && type && TREE_CODE (type) != VECTOR_TYPE);
5702 }
5703
5704 /* Return the size, in bytes, of the area reserved for arguments passed
5705 in registers for the function represented by FNDECL, depending on the
5706 ABI format used. */
5707 int
5708 ix86_reg_parm_stack_space (const_tree fndecl)
5709 {
5710 enum calling_abi call_abi = SYSV_ABI;
5711 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5712 call_abi = ix86_function_abi (fndecl);
5713 else
5714 call_abi = ix86_function_type_abi (fndecl);
5715 if (TARGET_64BIT && call_abi == MS_ABI)
5716 return 32;
5717 return 0;
5718 }
5719
5720 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
5721 call ABI used. */
5722 enum calling_abi
5723 ix86_function_type_abi (const_tree fntype)
5724 {
5725 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5726 {
5727 enum calling_abi abi = ix86_abi;
5728 if (abi == SYSV_ABI)
5729 {
5730 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5731 abi = MS_ABI;
5732 }
5733 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5734 abi = SYSV_ABI;
5735 return abi;
5736 }
5737 return ix86_abi;
5738 }
5739
5740 /* We add this as a workaround in order to use libc_has_function
5741 hook in i386.md. */
5742 bool
5743 ix86_libc_has_function (enum function_class fn_class)
5744 {
5745 return targetm.libc_has_function (fn_class);
5746 }
5747
5748 static bool
5749 ix86_function_ms_hook_prologue (const_tree fn)
5750 {
5751 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5752 {
5753 if (decl_function_context (fn) != NULL_TREE)
5754 error_at (DECL_SOURCE_LOCATION (fn),
5755 "ms_hook_prologue is not compatible with nested function");
5756 else
5757 return true;
5758 }
5759 return false;
5760 }
5761
5762 static enum calling_abi
5763 ix86_function_abi (const_tree fndecl)
5764 {
5765 if (! fndecl)
5766 return ix86_abi;
5767 return ix86_function_type_abi (TREE_TYPE (fndecl));
5768 }
5769
5770 /* Return SYSV_ABI or MS_ABI, depending on cfun, specifying the
5771 call ABI used. */
5772 enum calling_abi
5773 ix86_cfun_abi (void)
5774 {
5775 if (! cfun)
5776 return ix86_abi;
5777 return cfun->machine->call_abi;
5778 }
5779
5780 /* Write the extra assembler code needed to declare a function properly. */
5781
5782 void
5783 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5784 tree decl)
5785 {
5786 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5787
5788 if (is_ms_hook)
5789 {
5790 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5791 unsigned int filler_cc = 0xcccccccc;
5792
5793 for (i = 0; i < filler_count; i += 4)
5794 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5795 }
5796
5797 #ifdef SUBTARGET_ASM_UNWIND_INIT
5798 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5799 #endif
5800
5801 ASM_OUTPUT_LABEL (asm_out_file, fname);
5802
5803 /* Output magic byte marker, if hot-patch attribute is set. */
5804 if (is_ms_hook)
5805 {
5806 if (TARGET_64BIT)
5807 {
5808 /* leaq [%rsp + 0], %rsp */
5809 asm_fprintf (asm_out_file, ASM_BYTE
5810 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5811 }
5812 else
5813 {
5814 /* movl.s %edi, %edi
5815 push %ebp
5816 movl.s %esp, %ebp */
5817 asm_fprintf (asm_out_file, ASM_BYTE
5818 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5819 }
5820 }
5821 }
5822
5823 /* regclass.c */
5824 extern void init_regs (void);
5825
5826 /* Implementation of call abi switching target hook. Specific to FNDECL
5827 the specific call register sets are set. See also
5828 ix86_conditional_register_usage for more details. */
5829 void
5830 ix86_call_abi_override (const_tree fndecl)
5831 {
5832 if (fndecl == NULL_TREE)
5833 cfun->machine->call_abi = ix86_abi;
5834 else
5835 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5836 }
5837
5838 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers. Avoid
5839 the expensive re-initialization done by init_regs each time we switch function
5840 context, since this is needed only during RTL expansion. */
5841 static void
5842 ix86_maybe_switch_abi (void)
5843 {
5844 if (TARGET_64BIT &&
5845 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5846 reinit_regs ();
5847 }
5848
5849 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5850 for a call to a function whose data type is FNTYPE.
5851 For a library call, FNTYPE is 0. */
5852
5853 void
5854 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5855 tree fntype, /* tree ptr for function decl */
5856 rtx libname, /* SYMBOL_REF of library name or 0 */
5857 tree fndecl,
5858 int caller)
5859 {
5860 struct cgraph_local_info *i;
5861
5862 memset (cum, 0, sizeof (*cum));
5863
5864 if (fndecl)
5865 {
5866 i = cgraph_local_info (fndecl);
5867 cum->call_abi = ix86_function_abi (fndecl);
5868 }
5869 else
5870 {
5871 i = NULL;
5872 cum->call_abi = ix86_function_type_abi (fntype);
5873 }
5874
5875 cum->caller = caller;
5876
5877 /* Set up the number of registers to use for passing arguments. */
5878
5879 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5880 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5881 "or subtarget optimization implying it");
5882 cum->nregs = ix86_regparm;
5883 if (TARGET_64BIT)
5884 {
5885 cum->nregs = (cum->call_abi == SYSV_ABI
5886 ? X86_64_REGPARM_MAX
5887 : X86_64_MS_REGPARM_MAX);
5888 }
5889 if (TARGET_SSE)
5890 {
5891 cum->sse_nregs = SSE_REGPARM_MAX;
5892 if (TARGET_64BIT)
5893 {
5894 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5895 ? X86_64_SSE_REGPARM_MAX
5896 : X86_64_MS_SSE_REGPARM_MAX);
5897 }
5898 }
5899 if (TARGET_MMX)
5900 cum->mmx_nregs = MMX_REGPARM_MAX;
5901 cum->warn_avx = true;
5902 cum->warn_sse = true;
5903 cum->warn_mmx = true;
5904
5905 /* Because the type might mismatch between caller and callee, we need to
5906 use the actual type of the function for local calls.
5907 FIXME: cgraph_analyze can be told to actually record if the function uses
5908 va_start, so for local functions maybe_vaarg can be made more aggressive,
5909 helping K&R code.
5910 FIXME: once the type system is fixed, we won't need this code anymore. */
5911 if (i && i->local && i->can_change_signature)
5912 fntype = TREE_TYPE (fndecl);
5913 cum->maybe_vaarg = (fntype
5914 ? (!prototype_p (fntype) || stdarg_p (fntype))
5915 : !libname);
5916
5917 if (!TARGET_64BIT)
5918 {
5919 /* If there are variable arguments, then we won't pass anything
5920 in registers in 32-bit mode. */
5921 if (stdarg_p (fntype))
5922 {
5923 cum->nregs = 0;
5924 cum->sse_nregs = 0;
5925 cum->mmx_nregs = 0;
5926 cum->warn_avx = 0;
5927 cum->warn_sse = 0;
5928 cum->warn_mmx = 0;
5929 return;
5930 }
5931
5932 /* Use ecx and edx registers if function has fastcall attribute,
5933 else look for regparm information. */
5934 if (fntype)
5935 {
5936 unsigned int ccvt = ix86_get_callcvt (fntype);
5937 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5938 {
5939 cum->nregs = 1;
5940 cum->fastcall = 1; /* Same first register as in fastcall. */
5941 }
5942 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5943 {
5944 cum->nregs = 2;
5945 cum->fastcall = 1;
5946 }
5947 else
5948 cum->nregs = ix86_function_regparm (fntype, fndecl);
5949 }
5950
5951 /* Set up the number of SSE registers used for passing SFmode
5952 and DFmode arguments. Warn for mismatching ABI. */
5953 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5954 }
5955 }
5956
5957 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5958 But in the case of vector types, it is some vector mode.
5959
5960 When we have only some of our vector isa extensions enabled, then there
5961 are some modes for which vector_mode_supported_p is false. For these
5962 modes, the generic vector support in gcc will choose some non-vector mode
5963 in order to implement the type. By computing the natural mode, we'll
5964 select the proper ABI location for the operand and not depend on whatever
5965 the middle-end decides to do with these vector types.
5966
5967 The middle-end can't deal with vector types > 16 bytes. In this
5968 case, we return the original mode and warn about the ABI change if CUM isn't
5969 NULL. */
5970
5971 static enum machine_mode
5972 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
5973 {
5974 enum machine_mode mode = TYPE_MODE (type);
5975
5976 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5977 {
5978 HOST_WIDE_INT size = int_size_in_bytes (type);
5979 if ((size == 8 || size == 16 || size == 32)
5980 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5981 && TYPE_VECTOR_SUBPARTS (type) > 1)
5982 {
5983 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5984
5985 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5986 mode = MIN_MODE_VECTOR_FLOAT;
5987 else
5988 mode = MIN_MODE_VECTOR_INT;
5989
5990 /* Get the mode which has this inner mode and number of units. */
5991 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5992 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5993 && GET_MODE_INNER (mode) == innermode)
5994 {
5995 if (size == 32 && !TARGET_AVX)
5996 {
5997 static bool warnedavx;
5998
5999 if (cum
6000 && !warnedavx
6001 && cum->warn_avx)
6002 {
6003 warnedavx = true;
6004 warning (0, "AVX vector argument without AVX "
6005 "enabled changes the ABI");
6006 }
6007 return TYPE_MODE (type);
6008 }
6009 else if ((size == 8 || size == 16) && !TARGET_SSE)
6010 {
6011 static bool warnedsse;
6012
6013 if (cum
6014 && !warnedsse
6015 && cum->warn_sse)
6016 {
6017 warnedsse = true;
6018 warning (0, "SSE vector argument without SSE "
6019 "enabled changes the ABI");
6020 }
6021 return mode;
6022 }
6023 else
6024 return mode;
6025 }
6026
6027 gcc_unreachable ();
6028 }
6029 }
6030
6031 return mode;
6032 }
6033
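/* A small illustration (hypothetical type name): given

     typedef int v8si __attribute__((vector_size (32)));

   the middle-end may have chosen BLKmode for v8si when AVX is disabled;
   the loop above still finds V8SImode as the natural mode, but because
   the size is 32 and !TARGET_AVX it warns once about the ABI change and
   returns TYPE_MODE (type) instead.  */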
6034 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
6035 this may not agree with the mode that the type system has chosen for the
6036 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
6037 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
6038
6039 static rtx
6040 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
6041 unsigned int regno)
6042 {
6043 rtx tmp;
6044
6045 if (orig_mode != BLKmode)
6046 tmp = gen_rtx_REG (orig_mode, regno);
6047 else
6048 {
6049 tmp = gen_rtx_REG (mode, regno);
6050 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
6051 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
6052 }
6053
6054 return tmp;
6055 }
6056
6057 /* x86-64 register passing implementation. See the x86-64 ABI for details. The goal
6058 of this code is to classify each eightbyte of the incoming argument by register
6059 class and assign registers accordingly. */
6060
6061 /* Return the union class of CLASS1 and CLASS2.
6062 See the x86-64 PS ABI for details. */
6063
6064 static enum x86_64_reg_class
6065 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
6066 {
6067 /* Rule #1: If both classes are equal, this is the resulting class. */
6068 if (class1 == class2)
6069 return class1;
6070
6071 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
6072 the other class. */
6073 if (class1 == X86_64_NO_CLASS)
6074 return class2;
6075 if (class2 == X86_64_NO_CLASS)
6076 return class1;
6077
6078 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
6079 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
6080 return X86_64_MEMORY_CLASS;
6081
6082 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
6083 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
6084 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
6085 return X86_64_INTEGERSI_CLASS;
6086 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
6087 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
6088 return X86_64_INTEGER_CLASS;
6089
6090 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
6091 MEMORY is used. */
6092 if (class1 == X86_64_X87_CLASS
6093 || class1 == X86_64_X87UP_CLASS
6094 || class1 == X86_64_COMPLEX_X87_CLASS
6095 || class2 == X86_64_X87_CLASS
6096 || class2 == X86_64_X87UP_CLASS
6097 || class2 == X86_64_COMPLEX_X87_CLASS)
6098 return X86_64_MEMORY_CLASS;
6099
6100 /* Rule #6: Otherwise class SSE is used. */
6101 return X86_64_SSE_CLASS;
6102 }
6103
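/* Two worked examples of the rules above (illustrative only):
   merge_classes (X86_64_INTEGERSI_CLASS, X86_64_SSESF_CLASS) returns
   X86_64_INTEGERSI_CLASS by rule #4, while merging X86_64_X87_CLASS
   with X86_64_SSE_CLASS yields X86_64_MEMORY_CLASS by rule #5.  */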
6104 /* Classify the argument of type TYPE and mode MODE.
6105 CLASSES will be filled by the register class used to pass each word
6106 of the operand. The number of words is returned. In case the parameter
6107 should be passed in memory, 0 is returned. As a special case for zero
6108 sized containers, classes[0] will be NO_CLASS and 1 is returned.
6109
6110 BIT_OFFSET is used internally for handling records and specifies the
6111 offset in bits, modulo 256, to avoid overflow cases.
6112
6113 See the x86-64 PS ABI for details.
6114 */
6115
6116 static int
6117 classify_argument (enum machine_mode mode, const_tree type,
6118 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
6119 {
6120 HOST_WIDE_INT bytes =
6121 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6122 int words
6123 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6124
6125 /* Variable sized entities are always passed/returned in memory. */
6126 if (bytes < 0)
6127 return 0;
6128
6129 if (mode != VOIDmode
6130 && targetm.calls.must_pass_in_stack (mode, type))
6131 return 0;
6132
6133 if (type && AGGREGATE_TYPE_P (type))
6134 {
6135 int i;
6136 tree field;
6137 enum x86_64_reg_class subclasses[MAX_CLASSES];
6138
6139 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
6140 if (bytes > 32)
6141 return 0;
6142
6143 for (i = 0; i < words; i++)
6144 classes[i] = X86_64_NO_CLASS;
6145
6146 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
6147 signal the memory class, so handle them as a special case. */
6148 if (!words)
6149 {
6150 classes[0] = X86_64_NO_CLASS;
6151 return 1;
6152 }
6153
6154 /* Classify each field of record and merge classes. */
6155 switch (TREE_CODE (type))
6156 {
6157 case RECORD_TYPE:
6158 /* And now merge the fields of structure. */
6159 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6160 {
6161 if (TREE_CODE (field) == FIELD_DECL)
6162 {
6163 int num;
6164
6165 if (TREE_TYPE (field) == error_mark_node)
6166 continue;
6167
6168 /* Bitfields are always classified as integer. Handle them
6169 early, since later code would consider them to be
6170 misaligned integers. */
6171 if (DECL_BIT_FIELD (field))
6172 {
6173 for (i = (int_bit_position (field)
6174 + (bit_offset % 64)) / 8 / 8;
6175 i < ((int_bit_position (field) + (bit_offset % 64))
6176 + tree_low_cst (DECL_SIZE (field), 0)
6177 + 63) / 8 / 8; i++)
6178 classes[i] =
6179 merge_classes (X86_64_INTEGER_CLASS,
6180 classes[i]);
6181 }
6182 else
6183 {
6184 int pos;
6185
6186 type = TREE_TYPE (field);
6187
6188 /* Flexible array member is ignored. */
6189 if (TYPE_MODE (type) == BLKmode
6190 && TREE_CODE (type) == ARRAY_TYPE
6191 && TYPE_SIZE (type) == NULL_TREE
6192 && TYPE_DOMAIN (type) != NULL_TREE
6193 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6194 == NULL_TREE))
6195 {
6196 static bool warned;
6197
6198 if (!warned && warn_psabi)
6199 {
6200 warned = true;
6201 inform (input_location,
6202 "the ABI of passing struct with"
6203 " a flexible array member has"
6204 " changed in GCC 4.4");
6205 }
6206 continue;
6207 }
6208 num = classify_argument (TYPE_MODE (type), type,
6209 subclasses,
6210 (int_bit_position (field)
6211 + bit_offset) % 256);
6212 if (!num)
6213 return 0;
6214 pos = (int_bit_position (field)
6215 + (bit_offset % 64)) / 8 / 8;
6216 for (i = 0; i < num && (i + pos) < words; i++)
6217 classes[i + pos] =
6218 merge_classes (subclasses[i], classes[i + pos]);
6219 }
6220 }
6221 }
6222 break;
6223
6224 case ARRAY_TYPE:
6225 /* Arrays are handled as small records. */
6226 {
6227 int num;
6228 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6229 TREE_TYPE (type), subclasses, bit_offset);
6230 if (!num)
6231 return 0;
6232
6233 /* The partial classes are now full classes. */
6234 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6235 subclasses[0] = X86_64_SSE_CLASS;
6236 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6237 && !((bit_offset % 64) == 0 && bytes == 4))
6238 subclasses[0] = X86_64_INTEGER_CLASS;
6239
6240 for (i = 0; i < words; i++)
6241 classes[i] = subclasses[i % num];
6242
6243 break;
6244 }
6245 case UNION_TYPE:
6246 case QUAL_UNION_TYPE:
6247 /* Unions are similar to RECORD_TYPE but offset is always 0.
6248 */
6249 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6250 {
6251 if (TREE_CODE (field) == FIELD_DECL)
6252 {
6253 int num;
6254
6255 if (TREE_TYPE (field) == error_mark_node)
6256 continue;
6257
6258 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6259 TREE_TYPE (field), subclasses,
6260 bit_offset);
6261 if (!num)
6262 return 0;
6263 for (i = 0; i < num; i++)
6264 classes[i] = merge_classes (subclasses[i], classes[i]);
6265 }
6266 }
6267 break;
6268
6269 default:
6270 gcc_unreachable ();
6271 }
6272
6273 if (words > 2)
6274 {
6275 /* When the size exceeds 16 bytes, everything is passed in
6276 memory unless the first eightbyte is X86_64_SSE_CLASS
6277 and all remaining eightbytes are X86_64_SSEUP_CLASS. */
6279 if (classes[0] != X86_64_SSE_CLASS)
6280 return 0;
6281
6282 for (i = 1; i < words; i++)
6283 if (classes[i] != X86_64_SSEUP_CLASS)
6284 return 0;
6285 }
6286
6287 /* Final merger cleanup. */
6288 for (i = 0; i < words; i++)
6289 {
6290 /* If one class is MEMORY, everything should be passed in
6291 memory. */
6292 if (classes[i] == X86_64_MEMORY_CLASS)
6293 return 0;
6294
6295 /* The X86_64_SSEUP_CLASS should always be preceded by
6296 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6297 if (classes[i] == X86_64_SSEUP_CLASS
6298 && classes[i - 1] != X86_64_SSE_CLASS
6299 && classes[i - 1] != X86_64_SSEUP_CLASS)
6300 {
6301 /* The first one should never be X86_64_SSEUP_CLASS. */
6302 gcc_assert (i != 0);
6303 classes[i] = X86_64_SSE_CLASS;
6304 }
6305
6306 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6307 everything should be passed in memory. */
6308 if (classes[i] == X86_64_X87UP_CLASS
6309 && (classes[i - 1] != X86_64_X87_CLASS))
6310 {
6311 static bool warned;
6312
6313 /* The first one should never be X86_64_X87UP_CLASS. */
6314 gcc_assert (i != 0);
6315 if (!warned && warn_psabi)
6316 {
6317 warned = true;
6318 inform (input_location,
6319 "the ABI of passing union with long double"
6320 " has changed in GCC 4.4");
6321 }
6322 return 0;
6323 }
6324 }
6325 return words;
6326 }
6327
6328 /* Compute alignment needed. We align all types to natural boundaries with
6329 exception of XFmode that is aligned to 64bits. */
6330 if (mode != VOIDmode && mode != BLKmode)
6331 {
6332 int mode_alignment = GET_MODE_BITSIZE (mode);
6333
6334 if (mode == XFmode)
6335 mode_alignment = 128;
6336 else if (mode == XCmode)
6337 mode_alignment = 256;
6338 if (COMPLEX_MODE_P (mode))
6339 mode_alignment /= 2;
6340 /* Misaligned fields are always returned in memory. */
6341 if (bit_offset % mode_alignment)
6342 return 0;
6343 }
6344
6345 /* For V1xx modes, just use the base mode. */
6346 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6347 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6348 mode = GET_MODE_INNER (mode);
6349
6350 /* Classification of atomic types. */
6351 switch (mode)
6352 {
6353 case SDmode:
6354 case DDmode:
6355 classes[0] = X86_64_SSE_CLASS;
6356 return 1;
6357 case TDmode:
6358 classes[0] = X86_64_SSE_CLASS;
6359 classes[1] = X86_64_SSEUP_CLASS;
6360 return 2;
6361 case DImode:
6362 case SImode:
6363 case HImode:
6364 case QImode:
6365 case CSImode:
6366 case CHImode:
6367 case CQImode:
6368 {
6369 int size = (bit_offset % 64)+ (int) GET_MODE_BITSIZE (mode);
6370
6371 if (size <= 32)
6372 {
6373 classes[0] = X86_64_INTEGERSI_CLASS;
6374 return 1;
6375 }
6376 else if (size <= 64)
6377 {
6378 classes[0] = X86_64_INTEGER_CLASS;
6379 return 1;
6380 }
6381 else if (size <= 64+32)
6382 {
6383 classes[0] = X86_64_INTEGER_CLASS;
6384 classes[1] = X86_64_INTEGERSI_CLASS;
6385 return 2;
6386 }
6387 else if (size <= 64+64)
6388 {
6389 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6390 return 2;
6391 }
6392 else
6393 gcc_unreachable ();
6394 }
6395 case CDImode:
6396 case TImode:
6397 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6398 return 2;
6399 case COImode:
6400 case OImode:
6401 /* OImode shouldn't be used directly. */
6402 gcc_unreachable ();
6403 case CTImode:
6404 return 0;
6405 case SFmode:
6406 if (!(bit_offset % 64))
6407 classes[0] = X86_64_SSESF_CLASS;
6408 else
6409 classes[0] = X86_64_SSE_CLASS;
6410 return 1;
6411 case DFmode:
6412 classes[0] = X86_64_SSEDF_CLASS;
6413 return 1;
6414 case XFmode:
6415 classes[0] = X86_64_X87_CLASS;
6416 classes[1] = X86_64_X87UP_CLASS;
6417 return 2;
6418 case TFmode:
6419 classes[0] = X86_64_SSE_CLASS;
6420 classes[1] = X86_64_SSEUP_CLASS;
6421 return 2;
6422 case SCmode:
6423 classes[0] = X86_64_SSE_CLASS;
6424 if (!(bit_offset % 64))
6425 return 1;
6426 else
6427 {
6428 static bool warned;
6429
6430 if (!warned && warn_psabi)
6431 {
6432 warned = true;
6433 inform (input_location,
6434 "the ABI of passing structure with complex float"
6435 " member has changed in GCC 4.4");
6436 }
6437 classes[1] = X86_64_SSESF_CLASS;
6438 return 2;
6439 }
6440 case DCmode:
6441 classes[0] = X86_64_SSEDF_CLASS;
6442 classes[1] = X86_64_SSEDF_CLASS;
6443 return 2;
6444 case XCmode:
6445 classes[0] = X86_64_COMPLEX_X87_CLASS;
6446 return 1;
6447 case TCmode:
6448 /* This mode is larger than 16 bytes. */
6449 return 0;
6450 case V8SFmode:
6451 case V8SImode:
6452 case V32QImode:
6453 case V16HImode:
6454 case V4DFmode:
6455 case V4DImode:
6456 classes[0] = X86_64_SSE_CLASS;
6457 classes[1] = X86_64_SSEUP_CLASS;
6458 classes[2] = X86_64_SSEUP_CLASS;
6459 classes[3] = X86_64_SSEUP_CLASS;
6460 return 4;
6461 case V4SFmode:
6462 case V4SImode:
6463 case V16QImode:
6464 case V8HImode:
6465 case V2DFmode:
6466 case V2DImode:
6467 classes[0] = X86_64_SSE_CLASS;
6468 classes[1] = X86_64_SSEUP_CLASS;
6469 return 2;
6470 case V1TImode:
6471 case V1DImode:
6472 case V2SFmode:
6473 case V2SImode:
6474 case V4HImode:
6475 case V8QImode:
6476 classes[0] = X86_64_SSE_CLASS;
6477 return 1;
6478 case BLKmode:
6479 case VOIDmode:
6480 return 0;
6481 default:
6482 gcc_assert (VECTOR_MODE_P (mode));
6483
6484 if (bytes > 16)
6485 return 0;
6486
6487 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6488
6489 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6490 classes[0] = X86_64_INTEGERSI_CLASS;
6491 else
6492 classes[0] = X86_64_INTEGER_CLASS;
6493 classes[1] = X86_64_INTEGER_CLASS;
6494 return 1 + (bytes > 8);
6495 }
6496 }
6497
6498 /* Examine the argument and set the number of registers required in each
6499 class. Return 0 iff the parameter should be passed in memory. */
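/* Illustrative example: for struct { double x; long y; } the two eightbytes
   classify as X86_64_SSEDF_CLASS and X86_64_INTEGER_CLASS, so this returns
   nonzero with *sse_nregs == 1 and *int_nregs == 1, i.e. one XMM register
   and one integer register. */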
6500 static int
6501 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6502 int *int_nregs, int *sse_nregs)
6503 {
6504 enum x86_64_reg_class regclass[MAX_CLASSES];
6505 int n = classify_argument (mode, type, regclass, 0);
6506
6507 *int_nregs = 0;
6508 *sse_nregs = 0;
6509 if (!n)
6510 return 0;
6511 for (n--; n >= 0; n--)
6512 switch (regclass[n])
6513 {
6514 case X86_64_INTEGER_CLASS:
6515 case X86_64_INTEGERSI_CLASS:
6516 (*int_nregs)++;
6517 break;
6518 case X86_64_SSE_CLASS:
6519 case X86_64_SSESF_CLASS:
6520 case X86_64_SSEDF_CLASS:
6521 (*sse_nregs)++;
6522 break;
6523 case X86_64_NO_CLASS:
6524 case X86_64_SSEUP_CLASS:
6525 break;
6526 case X86_64_X87_CLASS:
6527 case X86_64_X87UP_CLASS:
6528 if (!in_return)
6529 return 0;
6530 break;
6531 case X86_64_COMPLEX_X87_CLASS:
6532 return in_return ? 2 : 0;
6533 case X86_64_MEMORY_CLASS:
6534 gcc_unreachable ();
6535 }
6536 return 1;
6537 }
6538
6539 /* Construct container for the argument used by GCC interface. See
6540 FUNCTION_ARG for the detailed description. */
6541
6542 static rtx
6543 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6544 const_tree type, int in_return, int nintregs, int nsseregs,
6545 const int *intreg, int sse_regno)
6546 {
6547 /* Static state recording whether each of these errors has already been issued. */
6548 static bool issued_sse_arg_error;
6549 static bool issued_sse_ret_error;
6550 static bool issued_x87_ret_error;
6551
6552 enum machine_mode tmpmode;
6553 int bytes =
6554 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6555 enum x86_64_reg_class regclass[MAX_CLASSES];
6556 int n;
6557 int i;
6558 int nexps = 0;
6559 int needed_sseregs, needed_intregs;
6560 rtx exp[MAX_CLASSES];
6561 rtx ret;
6562
6563 n = classify_argument (mode, type, regclass, 0);
6564 if (!n)
6565 return NULL;
6566 if (!examine_argument (mode, type, in_return, &needed_intregs,
6567 &needed_sseregs))
6568 return NULL;
6569 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6570 return NULL;
6571
6572 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6573 some less clueful developer tries to use floating-point anyway. */
6574 if (needed_sseregs && !TARGET_SSE)
6575 {
6576 if (in_return)
6577 {
6578 if (!issued_sse_ret_error)
6579 {
6580 error ("SSE register return with SSE disabled");
6581 issued_sse_ret_error = true;
6582 }
6583 }
6584 else if (!issued_sse_arg_error)
6585 {
6586 error ("SSE register argument with SSE disabled");
6587 issued_sse_arg_error = true;
6588 }
6589 return NULL;
6590 }
6591
6592 /* Likewise, error if the ABI requires us to return values in the
6593 x87 registers and the user specified -mno-80387. */
6594 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
6595 for (i = 0; i < n; i++)
6596 if (regclass[i] == X86_64_X87_CLASS
6597 || regclass[i] == X86_64_X87UP_CLASS
6598 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6599 {
6600 if (!issued_x87_ret_error)
6601 {
6602 error ("x87 register return with x87 disabled");
6603 issued_x87_ret_error = true;
6604 }
6605 return NULL;
6606 }
6607
6608 /* First construct simple cases. Avoid SCmode, since we want to use
6609 a single register to pass this type. */
6610 if (n == 1 && mode != SCmode)
6611 switch (regclass[0])
6612 {
6613 case X86_64_INTEGER_CLASS:
6614 case X86_64_INTEGERSI_CLASS:
6615 return gen_rtx_REG (mode, intreg[0]);
6616 case X86_64_SSE_CLASS:
6617 case X86_64_SSESF_CLASS:
6618 case X86_64_SSEDF_CLASS:
6619 if (mode != BLKmode)
6620 return gen_reg_or_parallel (mode, orig_mode,
6621 SSE_REGNO (sse_regno));
6622 break;
6623 case X86_64_X87_CLASS:
6624 case X86_64_COMPLEX_X87_CLASS:
6625 return gen_rtx_REG (mode, FIRST_STACK_REG);
6626 case X86_64_NO_CLASS:
6627 /* Zero sized array, struct or class. */
6628 return NULL;
6629 default:
6630 gcc_unreachable ();
6631 }
6632 if (n == 2
6633 && regclass[0] == X86_64_SSE_CLASS
6634 && regclass[1] == X86_64_SSEUP_CLASS
6635 && mode != BLKmode)
6636 return gen_reg_or_parallel (mode, orig_mode,
6637 SSE_REGNO (sse_regno));
6638 if (n == 4
6639 && regclass[0] == X86_64_SSE_CLASS
6640 && regclass[1] == X86_64_SSEUP_CLASS
6641 && regclass[2] == X86_64_SSEUP_CLASS
6642 && regclass[3] == X86_64_SSEUP_CLASS
6643 && mode != BLKmode)
6644 return gen_reg_or_parallel (mode, orig_mode,
6645 SSE_REGNO (sse_regno));
6646 if (n == 2
6647 && regclass[0] == X86_64_X87_CLASS
6648 && regclass[1] == X86_64_X87UP_CLASS)
6649 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6650
6651 if (n == 2
6652 && regclass[0] == X86_64_INTEGER_CLASS
6653 && regclass[1] == X86_64_INTEGER_CLASS
6654 && (mode == CDImode || mode == TImode || mode == TFmode)
6655 && intreg[0] + 1 == intreg[1])
6656 return gen_rtx_REG (mode, intreg[0]);
6657
6658 /* Otherwise figure out the entries of the PARALLEL. */
6659 for (i = 0; i < n; i++)
6660 {
6661 int pos;
6662
6663 switch (regclass[i])
6664 {
6665 case X86_64_NO_CLASS:
6666 break;
6667 case X86_64_INTEGER_CLASS:
6668 case X86_64_INTEGERSI_CLASS:
6669 /* Merge TImodes on aligned occasions here too. */
6670 if (i * 8 + 8 > bytes)
6671 tmpmode
6672 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6673 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6674 tmpmode = SImode;
6675 else
6676 tmpmode = DImode;
6677 /* We've requested 24 bytes we
6678 don't have a mode for. Use DImode. */
6679 if (tmpmode == BLKmode)
6680 tmpmode = DImode;
6681 exp [nexps++]
6682 = gen_rtx_EXPR_LIST (VOIDmode,
6683 gen_rtx_REG (tmpmode, *intreg),
6684 GEN_INT (i*8));
6685 intreg++;
6686 break;
6687 case X86_64_SSESF_CLASS:
6688 exp [nexps++]
6689 = gen_rtx_EXPR_LIST (VOIDmode,
6690 gen_rtx_REG (SFmode,
6691 SSE_REGNO (sse_regno)),
6692 GEN_INT (i*8));
6693 sse_regno++;
6694 break;
6695 case X86_64_SSEDF_CLASS:
6696 exp [nexps++]
6697 = gen_rtx_EXPR_LIST (VOIDmode,
6698 gen_rtx_REG (DFmode,
6699 SSE_REGNO (sse_regno)),
6700 GEN_INT (i*8));
6701 sse_regno++;
6702 break;
6703 case X86_64_SSE_CLASS:
6704 pos = i;
6705 switch (n)
6706 {
6707 case 1:
6708 tmpmode = DImode;
6709 break;
6710 case 2:
6711 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6712 {
6713 tmpmode = TImode;
6714 i++;
6715 }
6716 else
6717 tmpmode = DImode;
6718 break;
6719 case 4:
6720 gcc_assert (i == 0
6721 && regclass[1] == X86_64_SSEUP_CLASS
6722 && regclass[2] == X86_64_SSEUP_CLASS
6723 && regclass[3] == X86_64_SSEUP_CLASS);
6724 tmpmode = OImode;
6725 i += 3;
6726 break;
6727 default:
6728 gcc_unreachable ();
6729 }
6730 exp [nexps++]
6731 = gen_rtx_EXPR_LIST (VOIDmode,
6732 gen_rtx_REG (tmpmode,
6733 SSE_REGNO (sse_regno)),
6734 GEN_INT (pos*8));
6735 sse_regno++;
6736 break;
6737 default:
6738 gcc_unreachable ();
6739 }
6740 }
6741
6742 /* Empty aligned struct, union or class. */
6743 if (nexps == 0)
6744 return NULL;
6745
6746 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6747 for (i = 0; i < nexps; i++)
6748 XVECEXP (ret, 0, i) = exp [i];
6749 return ret;
6750 }
6751
6752 /* Update the data in CUM to advance over an argument of mode MODE
6753 and data type TYPE. (TYPE is null for libcalls where that information
6754 may not be available.) */
6755
6756 static void
6757 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6758 const_tree type, HOST_WIDE_INT bytes,
6759 HOST_WIDE_INT words)
6760 {
6761 switch (mode)
6762 {
6763 default:
6764 break;
6765
6766 case BLKmode:
6767 if (bytes < 0)
6768 break;
6769 /* FALLTHRU */
6770
6771 case DImode:
6772 case SImode:
6773 case HImode:
6774 case QImode:
6775 cum->words += words;
6776 cum->nregs -= words;
6777 cum->regno += words;
6778
6779 if (cum->nregs <= 0)
6780 {
6781 cum->nregs = 0;
6782 cum->regno = 0;
6783 }
6784 break;
6785
6786 case OImode:
6787 /* OImode shouldn't be used directly. */
6788 gcc_unreachable ();
6789
6790 case DFmode:
6791 if (cum->float_in_sse < 2)
6792 break;
6793 case SFmode:
6794 if (cum->float_in_sse < 1)
6795 break;
6796 /* FALLTHRU */
6797
6798 case V8SFmode:
6799 case V8SImode:
6800 case V32QImode:
6801 case V16HImode:
6802 case V4DFmode:
6803 case V4DImode:
6804 case TImode:
6805 case V16QImode:
6806 case V8HImode:
6807 case V4SImode:
6808 case V2DImode:
6809 case V4SFmode:
6810 case V2DFmode:
6811 if (!type || !AGGREGATE_TYPE_P (type))
6812 {
6813 cum->sse_words += words;
6814 cum->sse_nregs -= 1;
6815 cum->sse_regno += 1;
6816 if (cum->sse_nregs <= 0)
6817 {
6818 cum->sse_nregs = 0;
6819 cum->sse_regno = 0;
6820 }
6821 }
6822 break;
6823
6824 case V8QImode:
6825 case V4HImode:
6826 case V2SImode:
6827 case V2SFmode:
6828 case V1TImode:
6829 case V1DImode:
6830 if (!type || !AGGREGATE_TYPE_P (type))
6831 {
6832 cum->mmx_words += words;
6833 cum->mmx_nregs -= 1;
6834 cum->mmx_regno += 1;
6835 if (cum->mmx_nregs <= 0)
6836 {
6837 cum->mmx_nregs = 0;
6838 cum->mmx_regno = 0;
6839 }
6840 }
6841 break;
6842 }
6843 }
6844
6845 static void
6846 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6847 const_tree type, HOST_WIDE_INT words, bool named)
6848 {
6849 int int_nregs, sse_nregs;
6850
6851 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
6852 if (!named && VALID_AVX256_REG_MODE (mode))
6853 return;
6854
6855 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6856 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6857 {
6858 cum->nregs -= int_nregs;
6859 cum->sse_nregs -= sse_nregs;
6860 cum->regno += int_nregs;
6861 cum->sse_regno += sse_nregs;
6862 }
6863 else
6864 {
6865 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6866 cum->words = (cum->words + align - 1) & ~(align - 1);
6867 cum->words += words;
6868 }
6869 }
6870
6871 static void
6872 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6873 HOST_WIDE_INT words)
6874 {
6875 /* Otherwise, this should be passed indirectly. */
6876 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6877
6878 cum->words += words;
6879 if (cum->nregs > 0)
6880 {
6881 cum->nregs -= 1;
6882 cum->regno += 1;
6883 }
6884 }
6885
6886 /* Update the data in CUM to advance over an argument of mode MODE and
6887 data type TYPE. (TYPE is null for libcalls where that information
6888 may not be available.) */
6889
6890 static void
6891 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6892 const_tree type, bool named)
6893 {
6894 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6895 HOST_WIDE_INT bytes, words;
6896
6897 if (mode == BLKmode)
6898 bytes = int_size_in_bytes (type);
6899 else
6900 bytes = GET_MODE_SIZE (mode);
6901 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6902
6903 if (type)
6904 mode = type_natural_mode (type, NULL);
6905
6906 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6907 function_arg_advance_ms_64 (cum, bytes, words);
6908 else if (TARGET_64BIT)
6909 function_arg_advance_64 (cum, mode, type, words, named);
6910 else
6911 function_arg_advance_32 (cum, mode, type, bytes, words);
6912 }
6913
6914 /* Define where to put the arguments to a function.
6915 Value is zero to push the argument on the stack,
6916 or a hard register in which to store the argument.
6917
6918 MODE is the argument's machine mode.
6919 TYPE is the data type of the argument (as a tree).
6920 This is null for libcalls where that information may
6921 not be available.
6922 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6923 the preceding args and about the function being called.
6924 NAMED is nonzero if this argument is a named parameter
6925 (otherwise it is an extra parameter matching an ellipsis). */
6926
6927 static rtx
6928 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6929 enum machine_mode orig_mode, const_tree type,
6930 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6931 {
6932 static bool warnedsse, warnedmmx;
6933
6934 /* Avoid the AL settings for the Unix64 ABI. */
6935 if (mode == VOIDmode)
6936 return constm1_rtx;
6937
6938 switch (mode)
6939 {
6940 default:
6941 break;
6942
6943 case BLKmode:
6944 if (bytes < 0)
6945 break;
6946 /* FALLTHRU */
6947 case DImode:
6948 case SImode:
6949 case HImode:
6950 case QImode:
6951 if (words <= cum->nregs)
6952 {
6953 int regno = cum->regno;
6954
6955 /* Fastcall allocates the first two DWORD (SImode) or
6956 smaller arguments to ECX and EDX if they aren't
6957 aggregate types. */
6958 if (cum->fastcall)
6959 {
6960 if (mode == BLKmode
6961 || mode == DImode
6962 || (type && AGGREGATE_TYPE_P (type)))
6963 break;
6964
6965 /* ECX, not EAX, is the first allocated register. */
6966 if (regno == AX_REG)
6967 regno = CX_REG;
6968 }
6969 return gen_rtx_REG (mode, regno);
6970 }
6971 break;
6972
6973 case DFmode:
6974 if (cum->float_in_sse < 2)
6975 break;
6976 case SFmode:
6977 if (cum->float_in_sse < 1)
6978 break;
6979 /* FALLTHRU */
6980 case TImode:
6981 /* In 32-bit mode, we pass TImode in XMM registers. */
6982 case V16QImode:
6983 case V8HImode:
6984 case V4SImode:
6985 case V2DImode:
6986 case V4SFmode:
6987 case V2DFmode:
6988 if (!type || !AGGREGATE_TYPE_P (type))
6989 {
6990 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6991 {
6992 warnedsse = true;
6993 warning (0, "SSE vector argument without SSE enabled "
6994 "changes the ABI");
6995 }
6996 if (cum->sse_nregs)
6997 return gen_reg_or_parallel (mode, orig_mode,
6998 cum->sse_regno + FIRST_SSE_REG);
6999 }
7000 break;
7001
7002 case OImode:
7003 /* OImode shouldn't be used directly. */
7004 gcc_unreachable ();
7005
7006 case V8SFmode:
7007 case V8SImode:
7008 case V32QImode:
7009 case V16HImode:
7010 case V4DFmode:
7011 case V4DImode:
7012 if (!type || !AGGREGATE_TYPE_P (type))
7013 {
7014 if (cum->sse_nregs)
7015 return gen_reg_or_parallel (mode, orig_mode,
7016 cum->sse_regno + FIRST_SSE_REG);
7017 }
7018 break;
7019
7020 case V8QImode:
7021 case V4HImode:
7022 case V2SImode:
7023 case V2SFmode:
7024 case V1TImode:
7025 case V1DImode:
7026 if (!type || !AGGREGATE_TYPE_P (type))
7027 {
7028 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
7029 {
7030 warnedmmx = true;
7031 warning (0, "MMX vector argument without MMX enabled "
7032 "changes the ABI");
7033 }
7034 if (cum->mmx_nregs)
7035 return gen_reg_or_parallel (mode, orig_mode,
7036 cum->mmx_regno + FIRST_MMX_REG);
7037 }
7038 break;
7039 }
7040
7041 return NULL_RTX;
7042 }
7043
7044 static rtx
7045 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7046 enum machine_mode orig_mode, const_tree type, bool named)
7047 {
7048 /* Handle a hidden AL argument containing number of registers
7049 for varargs x86-64 functions. */
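/* Per the SysV x86-64 psABI the caller loads %al with an upper bound on the
   number of vector registers actually used, so the callee prologue (see
   setup_incoming_varargs_64) can skip spilling the XMM registers when it is
   zero. */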
7050 if (mode == VOIDmode)
7051 return GEN_INT (cum->maybe_vaarg
7052 ? (cum->sse_nregs < 0
7053 ? X86_64_SSE_REGPARM_MAX
7054 : cum->sse_regno)
7055 : -1);
7056
7057 switch (mode)
7058 {
7059 default:
7060 break;
7061
7062 case V8SFmode:
7063 case V8SImode:
7064 case V32QImode:
7065 case V16HImode:
7066 case V4DFmode:
7067 case V4DImode:
7068 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
7069 if (!named)
7070 return NULL;
7071 break;
7072 }
7073
7074 return construct_container (mode, orig_mode, type, 0, cum->nregs,
7075 cum->sse_nregs,
7076 &x86_64_int_parameter_registers [cum->regno],
7077 cum->sse_regno);
7078 }
7079
7080 static rtx
7081 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
7082 enum machine_mode orig_mode, bool named,
7083 HOST_WIDE_INT bytes)
7084 {
7085 unsigned int regno;
7086
7087 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
7088 We use a value of -2 to specify that the current function call uses the MS ABI. */
7089 if (mode == VOIDmode)
7090 return GEN_INT (-2);
7091
7092 /* If we've run out of registers, it goes on the stack. */
7093 if (cum->nregs == 0)
7094 return NULL_RTX;
7095
7096 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
7097
7098 /* Only floating point modes are passed in anything but integer regs. */
7099 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
7100 {
7101 if (named)
7102 regno = cum->regno + FIRST_SSE_REG;
7103 else
7104 {
7105 rtx t1, t2;
7106
7107 /* Unnamed floating parameters are passed in both the
7108 SSE and integer registers. */
7109 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
7110 t2 = gen_rtx_REG (mode, regno);
7111 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
7112 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
7113 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
7114 }
7115 }
7116 /* Handle aggregate types passed in registers. */
7117 if (orig_mode == BLKmode)
7118 {
7119 if (bytes > 0 && bytes <= 8)
7120 mode = (bytes > 4 ? DImode : SImode);
7121 if (mode == BLKmode)
7122 mode = DImode;
7123 }
7124
7125 return gen_reg_or_parallel (mode, orig_mode, regno);
7126 }
7127
7128 /* Return where to put the arguments to a function.
7129 Return zero to push the argument on the stack, or a hard register in which to store the argument.
7130
7131 MODE is the argument's machine mode. TYPE is the data type of the
7132 argument. It is null for libcalls where that information may not be
7133 available. CUM gives information about the preceding args and about
7134 the function being called. NAMED is nonzero if this argument is a
7135 named parameter (otherwise it is an extra parameter matching an
7136 ellipsis). */
7137
7138 static rtx
7139 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
7140 const_tree type, bool named)
7141 {
7142 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7143 enum machine_mode mode = omode;
7144 HOST_WIDE_INT bytes, words;
7145 rtx arg;
7146
7147 if (mode == BLKmode)
7148 bytes = int_size_in_bytes (type);
7149 else
7150 bytes = GET_MODE_SIZE (mode);
7151 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7152
7153 /* To simplify the code below, represent vector types with a vector mode
7154 even if MMX/SSE are not active. */
7155 if (type && TREE_CODE (type) == VECTOR_TYPE)
7156 mode = type_natural_mode (type, cum);
7157
7158 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7159 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
7160 else if (TARGET_64BIT)
7161 arg = function_arg_64 (cum, mode, omode, type, named);
7162 else
7163 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
7164
7165 return arg;
7166 }
7167
7168 /* A C expression that indicates when an argument must be passed by
7169 reference. If nonzero for an argument, a copy of that argument is
7170 made in memory and a pointer to the argument is passed instead of
7171 the argument itself. The pointer is passed in whatever way is
7172 appropriate for passing a pointer to that type. */
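/* Illustrative examples for the MS x64 rules below: __m128 (16 bytes) and a
   3-byte struct are passed by reference (a pointer in the argument slot),
   while aggregates of exactly 1, 2, 4 or 8 bytes are passed by value. */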
7173
7174 static bool
7175 ix86_pass_by_reference (cumulative_args_t cum_v, enum machine_mode mode,
7176 const_tree type, bool named ATTRIBUTE_UNUSED)
7177 {
7178 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7179
7180 /* See Windows x64 Software Convention. */
7181 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7182 {
7183 int msize = (int) GET_MODE_SIZE (mode);
7184 if (type)
7185 {
7186 /* Arrays are passed by reference. */
7187 if (TREE_CODE (type) == ARRAY_TYPE)
7188 return true;
7189
7190 if (AGGREGATE_TYPE_P (type))
7191 {
7192 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7193 are passed by reference. */
7194 msize = int_size_in_bytes (type);
7195 }
7196 }
7197
7198 /* __m128 is passed by reference. */
7199 switch (msize) {
7200 case 1: case 2: case 4: case 8:
7201 break;
7202 default:
7203 return true;
7204 }
7205 }
7206 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7207 return true;
7208
7209 return false;
7210 }
7211
7212 /* Return true when TYPE should be 128bit aligned for 32bit argument
7213 passing ABI. XXX: This function is obsolete and is only used for
7214 checking psABI compatibility with previous versions of GCC. */
7215
7216 static bool
7217 ix86_compat_aligned_value_p (const_tree type)
7218 {
7219 enum machine_mode mode = TYPE_MODE (type);
7220 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7221 || mode == TDmode
7222 || mode == TFmode
7223 || mode == TCmode)
7224 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7225 return true;
7226 if (TYPE_ALIGN (type) < 128)
7227 return false;
7228
7229 if (AGGREGATE_TYPE_P (type))
7230 {
7231 /* Walk the aggregates recursively. */
7232 switch (TREE_CODE (type))
7233 {
7234 case RECORD_TYPE:
7235 case UNION_TYPE:
7236 case QUAL_UNION_TYPE:
7237 {
7238 tree field;
7239
7240 /* Walk all the structure fields. */
7241 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7242 {
7243 if (TREE_CODE (field) == FIELD_DECL
7244 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7245 return true;
7246 }
7247 break;
7248 }
7249
7250 case ARRAY_TYPE:
7251 /* Just for use if some language passes arrays by value. */
7252 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7253 return true;
7254 break;
7255
7256 default:
7257 gcc_unreachable ();
7258 }
7259 }
7260 return false;
7261 }
7262
7263 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7264 XXX: This function is obsolete and is only used for checking psABI
7265 compatibility with previous versions of GCC. */
7266
7267 static unsigned int
7268 ix86_compat_function_arg_boundary (enum machine_mode mode,
7269 const_tree type, unsigned int align)
7270 {
7271 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7272 natural boundaries. */
7273 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7274 {
7275 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7276 make an exception for SSE modes since these require 128bit
7277 alignment.
7278
7279 The handling here differs from field_alignment. ICC aligns MMX
7280 arguments to 4 byte boundaries, while structure fields are aligned
7281 to 8 byte boundaries. */
7282 if (!type)
7283 {
7284 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7285 align = PARM_BOUNDARY;
7286 }
7287 else
7288 {
7289 if (!ix86_compat_aligned_value_p (type))
7290 align = PARM_BOUNDARY;
7291 }
7292 }
7293 if (align > BIGGEST_ALIGNMENT)
7294 align = BIGGEST_ALIGNMENT;
7295 return align;
7296 }
7297
7298 /* Return true when TYPE should be 128bit aligned for 32bit argument
7299 passing ABI. */
7300
7301 static bool
7302 ix86_contains_aligned_value_p (const_tree type)
7303 {
7304 enum machine_mode mode = TYPE_MODE (type);
7305
7306 if (mode == XFmode || mode == XCmode)
7307 return false;
7308
7309 if (TYPE_ALIGN (type) < 128)
7310 return false;
7311
7312 if (AGGREGATE_TYPE_P (type))
7313 {
7314 /* Walk the aggregates recursively. */
7315 switch (TREE_CODE (type))
7316 {
7317 case RECORD_TYPE:
7318 case UNION_TYPE:
7319 case QUAL_UNION_TYPE:
7320 {
7321 tree field;
7322
7323 /* Walk all the structure fields. */
7324 for (field = TYPE_FIELDS (type);
7325 field;
7326 field = DECL_CHAIN (field))
7327 {
7328 if (TREE_CODE (field) == FIELD_DECL
7329 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7330 return true;
7331 }
7332 break;
7333 }
7334
7335 case ARRAY_TYPE:
7336 /* Just for use if some language passes arrays by value. */
7337 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7338 return true;
7339 break;
7340
7341 default:
7342 gcc_unreachable ();
7343 }
7344 }
7345 else
7346 return TYPE_ALIGN (type) >= 128;
7347
7348 return false;
7349 }
7350
7351 /* Gives the alignment boundary, in bits, of an argument with the
7352 specified mode and type. */
7353
7354 static unsigned int
7355 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7356 {
7357 unsigned int align;
7358 if (type)
7359 {
7360 /* Since the main variant type is used for the call, convert the
7361 type to its main variant. */
7362 type = TYPE_MAIN_VARIANT (type);
7363 align = TYPE_ALIGN (type);
7364 }
7365 else
7366 align = GET_MODE_ALIGNMENT (mode);
7367 if (align < PARM_BOUNDARY)
7368 align = PARM_BOUNDARY;
7369 else
7370 {
7371 static bool warned;
7372 unsigned int saved_align = align;
7373
7374 if (!TARGET_64BIT)
7375 {
7376 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7377 if (!type)
7378 {
7379 if (mode == XFmode || mode == XCmode)
7380 align = PARM_BOUNDARY;
7381 }
7382 else if (!ix86_contains_aligned_value_p (type))
7383 align = PARM_BOUNDARY;
7384
7385 if (align < 128)
7386 align = PARM_BOUNDARY;
7387 }
7388
7389 if (warn_psabi
7390 && !warned
7391 && align != ix86_compat_function_arg_boundary (mode, type,
7392 saved_align))
7393 {
7394 warned = true;
7395 inform (input_location,
7396 "The ABI for passing parameters with %d-byte"
7397 " alignment has changed in GCC 4.6",
7398 align / BITS_PER_UNIT);
7399 }
7400 }
7401
7402 return align;
7403 }
7404
7405 /* Return true if N is a possible register number of function value. */
7406
7407 static bool
7408 ix86_function_value_regno_p (const unsigned int regno)
7409 {
7410 switch (regno)
7411 {
7412 case AX_REG:
7413 case DX_REG:
7414 return true;
7415 case DI_REG:
7416 case SI_REG:
7417 return TARGET_64BIT && ix86_abi != MS_ABI;
7418
7419 /* Complex values are returned in %st(0)/%st(1) pair. */
7420 case ST0_REG:
7421 case ST1_REG:
7422 /* TODO: The function should depend on the current function's ABI, but
7423 builtins.c would need updating then. Therefore we use the
7424 default ABI. */
7425 if (TARGET_64BIT && ix86_abi == MS_ABI)
7426 return false;
7427 return TARGET_FLOAT_RETURNS_IN_80387;
7428
7429 /* Complex values are returned in %xmm0/%xmm1 pair. */
7430 case XMM0_REG:
7431 case XMM1_REG:
7432 return TARGET_SSE;
7433
7434 case MM0_REG:
7435 if (TARGET_MACHO || TARGET_64BIT)
7436 return false;
7437 return TARGET_MMX;
7438 }
7439
7440 return false;
7441 }
7442
7443 /* Define how to find the value returned by a function.
7444 VALTYPE is the data type of the value (as a tree).
7445 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7446 otherwise, FUNC is 0. */
7447
7448 static rtx
7449 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7450 const_tree fntype, const_tree fn)
7451 {
7452 unsigned int regno;
7453
7454 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7455 we normally prevent this case when mmx is not available. However
7456 some ABIs may require the result to be returned like DImode. */
7457 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7458 regno = FIRST_MMX_REG;
7459
7460 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7461 we prevent this case when sse is not available. However some ABIs
7462 may require the result to be returned like integer TImode. */
7463 else if (mode == TImode
7464 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7465 regno = FIRST_SSE_REG;
7466
7467 /* 32-byte vector modes in %ymm0. */
7468 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7469 regno = FIRST_SSE_REG;
7470
7471 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7472 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7473 regno = FIRST_FLOAT_REG;
7474 else
7475 /* Most things go in %eax. */
7476 regno = AX_REG;
7477
7478 /* Override FP return register with %xmm0 for local functions when
7479 SSE math is enabled or for functions with sseregparm attribute. */
7480 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7481 {
7482 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7483 if ((sse_level >= 1 && mode == SFmode)
7484 || (sse_level == 2 && mode == DFmode))
7485 regno = FIRST_SSE_REG;
7486 }
7487
7488 /* OImode shouldn't be used directly. */
7489 gcc_assert (mode != OImode);
7490
7491 return gen_rtx_REG (orig_mode, regno);
7492 }
7493
7494 static rtx
7495 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7496 const_tree valtype)
7497 {
7498 rtx ret;
7499
7500 /* Handle libcalls, which don't provide a type node. */
7501 if (valtype == NULL)
7502 {
7503 unsigned int regno;
7504
7505 switch (mode)
7506 {
7507 case SFmode:
7508 case SCmode:
7509 case DFmode:
7510 case DCmode:
7511 case TFmode:
7512 case SDmode:
7513 case DDmode:
7514 case TDmode:
7515 regno = FIRST_SSE_REG;
7516 break;
7517 case XFmode:
7518 case XCmode:
7519 regno = FIRST_FLOAT_REG;
7520 break;
7521 case TCmode:
7522 return NULL;
7523 default:
7524 regno = AX_REG;
7525 }
7526
7527 return gen_rtx_REG (mode, regno);
7528 }
7529 else if (POINTER_TYPE_P (valtype))
7530 {
7531 /* Pointers are always returned in word_mode. */
7532 mode = word_mode;
7533 }
7534
7535 ret = construct_container (mode, orig_mode, valtype, 1,
7536 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7537 x86_64_int_return_registers, 0);
7538
7539 /* For zero-sized structures, construct_container returns NULL, but we
7540 need to keep the rest of the compiler happy by returning a meaningful value. */
7541 if (!ret)
7542 ret = gen_rtx_REG (orig_mode, AX_REG);
7543
7544 return ret;
7545 }
7546
7547 static rtx
7548 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7549 const_tree valtype)
7550 {
7551 unsigned int regno = AX_REG;
7552
7553 if (TARGET_SSE)
7554 {
7555 switch (GET_MODE_SIZE (mode))
7556 {
7557 case 16:
7558 if (valtype != NULL_TREE
7559 && !VECTOR_INTEGER_TYPE_P (valtype)
7561 && !INTEGRAL_TYPE_P (valtype)
7562 && !VECTOR_FLOAT_TYPE_P (valtype))
7563 break;
7564 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7565 && !COMPLEX_MODE_P (mode))
7566 regno = FIRST_SSE_REG;
7567 break;
7568 case 8:
7569 case 4:
7570 if (mode == SFmode || mode == DFmode)
7571 regno = FIRST_SSE_REG;
7572 break;
7573 default:
7574 break;
7575 }
7576 }
7577 return gen_rtx_REG (orig_mode, regno);
7578 }
7579
7580 static rtx
7581 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7582 enum machine_mode orig_mode, enum machine_mode mode)
7583 {
7584 const_tree fn, fntype;
7585
7586 fn = NULL_TREE;
7587 if (fntype_or_decl && DECL_P (fntype_or_decl))
7588 fn = fntype_or_decl;
7589 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7590
7591 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7592 return function_value_ms_64 (orig_mode, mode, valtype);
7593 else if (TARGET_64BIT)
7594 return function_value_64 (orig_mode, mode, valtype);
7595 else
7596 return function_value_32 (orig_mode, mode, fntype, fn);
7597 }
7598
7599 static rtx
7600 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7601 bool outgoing ATTRIBUTE_UNUSED)
7602 {
7603 enum machine_mode mode, orig_mode;
7604
7605 orig_mode = TYPE_MODE (valtype);
7606 mode = type_natural_mode (valtype, NULL);
7607 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7608 }
7609
7610 /* Pointer function arguments and return values are promoted to
7611 word_mode. */
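/* E.g. with -mx32 a 32-bit pointer argument or return value is widened to
   DImode (word_mode); POINTERS_EXTEND_UNSIGNED makes it a zero extension. */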
7612
7613 static enum machine_mode
7614 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7615 int *punsignedp, const_tree fntype,
7616 int for_return)
7617 {
7618 if (type != NULL_TREE && POINTER_TYPE_P (type))
7619 {
7620 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7621 return word_mode;
7622 }
7623 return default_promote_function_mode (type, mode, punsignedp, fntype,
7624 for_return);
7625 }
7626
7627 /* Return true if a structure, union or array with MODE containing FIELD
7628 should be accessed using BLKmode. */
7629
7630 static bool
7631 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7632 {
7633 /* Union with XFmode must be in BLKmode. */
7634 return (mode == XFmode
7635 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7636 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7637 }
7638
7639 rtx
7640 ix86_libcall_value (enum machine_mode mode)
7641 {
7642 return ix86_function_value_1 (NULL, NULL, mode, mode);
7643 }
7644
7645 /* Return true iff type is returned in memory. */
7646
7647 static bool ATTRIBUTE_UNUSED
7648 return_in_memory_32 (const_tree type, enum machine_mode mode)
7649 {
7650 HOST_WIDE_INT size;
7651
7652 if (mode == BLKmode)
7653 return true;
7654
7655 size = int_size_in_bytes (type);
7656
7657 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7658 return false;
7659
7660 if (VECTOR_MODE_P (mode) || mode == TImode)
7661 {
7662 /* User-created vectors small enough to fit in EAX. */
7663 if (size < 8)
7664 return false;
7665
7666 /* MMX/3dNow values are returned in MM0,
7667 except when it doesn't exist or the ABI prescribes otherwise. */
7668 if (size == 8)
7669 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7670
7671 /* SSE values are returned in XMM0, except when it doesn't exist. */
7672 if (size == 16)
7673 return !TARGET_SSE;
7674
7675 /* AVX values are returned in YMM0, except when it doesn't exist. */
7676 if (size == 32)
7677 return !TARGET_AVX;
7678 }
7679
7680 if (mode == XFmode)
7681 return false;
7682
7683 if (size > 12)
7684 return true;
7685
7686 /* OImode shouldn't be used directly. */
7687 gcc_assert (mode != OImode);
7688
7689 return false;
7690 }
7691
7692 static bool ATTRIBUTE_UNUSED
7693 return_in_memory_64 (const_tree type, enum machine_mode mode)
7694 {
7695 int needed_intregs, needed_sseregs;
7696 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7697 }
7698
7699 static bool ATTRIBUTE_UNUSED
7700 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7701 {
7702 HOST_WIDE_INT size = int_size_in_bytes (type);
7703
7704 /* __m128 is returned in xmm0. */
7705 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7706 || VECTOR_FLOAT_TYPE_P (type))
7707 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7708 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7709 return false;
7710
7711 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7712 return size != 1 && size != 2 && size != 4 && size != 8;
7713 }
7714
7715 static bool
7716 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7717 {
7718 #ifdef SUBTARGET_RETURN_IN_MEMORY
7719 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7720 #else
7721 const enum machine_mode mode = type_natural_mode (type, NULL);
7722
7723 if (TARGET_64BIT)
7724 {
7725 if (ix86_function_type_abi (fntype) == MS_ABI)
7726 return return_in_memory_ms_64 (type, mode);
7727 else
7728 return return_in_memory_64 (type, mode);
7729 }
7730 else
7731 return return_in_memory_32 (type, mode);
7732 #endif
7733 }
7734
7735 /* When returning SSE vector types, we have a choice of either
7736 (1) being ABI incompatible with a -march switch, or
7737 (2) generating an error.
7738 Given no good solution, I think the safest thing is one warning.
7739 The user won't be able to use -Werror, but....
7740
7741 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7742 called in response to actually generating a caller or callee that
7743 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7744 via aggregate_value_p for general type probing from tree-ssa. */
7745
7746 static rtx
7747 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7748 {
7749 static bool warnedsse, warnedmmx;
7750
7751 if (!TARGET_64BIT && type)
7752 {
7753 /* Look at the return type of the function, not the function type. */
7754 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7755
7756 if (!TARGET_SSE && !warnedsse)
7757 {
7758 if (mode == TImode
7759 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7760 {
7761 warnedsse = true;
7762 warning (0, "SSE vector return without SSE enabled "
7763 "changes the ABI");
7764 }
7765 }
7766
7767 if (!TARGET_MMX && !warnedmmx)
7768 {
7769 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7770 {
7771 warnedmmx = true;
7772 warning (0, "MMX vector return without MMX enabled "
7773 "changes the ABI");
7774 }
7775 }
7776 }
7777
7778 return NULL;
7779 }
7780
7781 \f
7782 /* Create the va_list data type. */
7783
7784 /* Returns the calling-convention-specific va_list data type.
7785 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
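/* Roughly, the record built below matches the psABI declaration:

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } va_list[1];  */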
7786
7787 static tree
7788 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7789 {
7790 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7791
7792 /* For i386 we use plain pointer to argument area. */
7793 if (!TARGET_64BIT || abi == MS_ABI)
7794 return build_pointer_type (char_type_node);
7795
7796 record = lang_hooks.types.make_type (RECORD_TYPE);
7797 type_decl = build_decl (BUILTINS_LOCATION,
7798 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7799
7800 f_gpr = build_decl (BUILTINS_LOCATION,
7801 FIELD_DECL, get_identifier ("gp_offset"),
7802 unsigned_type_node);
7803 f_fpr = build_decl (BUILTINS_LOCATION,
7804 FIELD_DECL, get_identifier ("fp_offset"),
7805 unsigned_type_node);
7806 f_ovf = build_decl (BUILTINS_LOCATION,
7807 FIELD_DECL, get_identifier ("overflow_arg_area"),
7808 ptr_type_node);
7809 f_sav = build_decl (BUILTINS_LOCATION,
7810 FIELD_DECL, get_identifier ("reg_save_area"),
7811 ptr_type_node);
7812
7813 va_list_gpr_counter_field = f_gpr;
7814 va_list_fpr_counter_field = f_fpr;
7815
7816 DECL_FIELD_CONTEXT (f_gpr) = record;
7817 DECL_FIELD_CONTEXT (f_fpr) = record;
7818 DECL_FIELD_CONTEXT (f_ovf) = record;
7819 DECL_FIELD_CONTEXT (f_sav) = record;
7820
7821 TYPE_STUB_DECL (record) = type_decl;
7822 TYPE_NAME (record) = type_decl;
7823 TYPE_FIELDS (record) = f_gpr;
7824 DECL_CHAIN (f_gpr) = f_fpr;
7825 DECL_CHAIN (f_fpr) = f_ovf;
7826 DECL_CHAIN (f_ovf) = f_sav;
7827
7828 layout_type (record);
7829
7830 /* The correct type is an array type of one element. */
7831 return build_array_type (record, build_index_type (size_zero_node));
7832 }
7833
7834 /* Setup the builtin va_list data type and for 64-bit the additional
7835 calling convention specific va_list data types. */
7836
7837 static tree
7838 ix86_build_builtin_va_list (void)
7839 {
7840 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7841
7842 /* Initialize abi specific va_list builtin types. */
7843 if (TARGET_64BIT)
7844 {
7845 tree t;
7846 if (ix86_abi == MS_ABI)
7847 {
7848 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7849 if (TREE_CODE (t) != RECORD_TYPE)
7850 t = build_variant_type_copy (t);
7851 sysv_va_list_type_node = t;
7852 }
7853 else
7854 {
7855 t = ret;
7856 if (TREE_CODE (t) != RECORD_TYPE)
7857 t = build_variant_type_copy (t);
7858 sysv_va_list_type_node = t;
7859 }
7860 if (ix86_abi != MS_ABI)
7861 {
7862 t = ix86_build_builtin_va_list_abi (MS_ABI);
7863 if (TREE_CODE (t) != RECORD_TYPE)
7864 t = build_variant_type_copy (t);
7865 ms_va_list_type_node = t;
7866 }
7867 else
7868 {
7869 t = ret;
7870 if (TREE_CODE (t) != RECORD_TYPE)
7871 t = build_variant_type_copy (t);
7872 ms_va_list_type_node = t;
7873 }
7874 }
7875
7876 return ret;
7877 }
7878
7879 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
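/* A sketch of the register save area laid out below: X86_64_REGPARM_MAX
   general registers of UNITS_PER_WORD bytes each, followed by
   X86_64_SSE_REGPARM_MAX vector registers of 16 bytes each. The gp_offset
   and fp_offset fields set up in ix86_va_start index into this block. */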
7880
7881 static void
7882 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7883 {
7884 rtx save_area, mem;
7885 alias_set_type set;
7886 int i, max;
7887
7888 /* GPR size of varargs save area. */
7889 if (cfun->va_list_gpr_size)
7890 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7891 else
7892 ix86_varargs_gpr_size = 0;
7893
7894 /* FPR size of varargs save area. We don't need it if we don't pass
7895 anything in SSE registers. */
7896 if (TARGET_SSE && cfun->va_list_fpr_size)
7897 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7898 else
7899 ix86_varargs_fpr_size = 0;
7900
7901 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7902 return;
7903
7904 save_area = frame_pointer_rtx;
7905 set = get_varargs_alias_set ();
7906
7907 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7908 if (max > X86_64_REGPARM_MAX)
7909 max = X86_64_REGPARM_MAX;
7910
7911 for (i = cum->regno; i < max; i++)
7912 {
7913 mem = gen_rtx_MEM (word_mode,
7914 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7915 MEM_NOTRAP_P (mem) = 1;
7916 set_mem_alias_set (mem, set);
7917 emit_move_insn (mem,
7918 gen_rtx_REG (word_mode,
7919 x86_64_int_parameter_registers[i]));
7920 }
7921
7922 if (ix86_varargs_fpr_size)
7923 {
7924 enum machine_mode smode;
7925 rtx label, test;
7926
7927 /* Now emit code to save SSE registers. The AX parameter contains the number
7928 of SSE parameter registers used to call this function, though all we
7929 actually check here is the zero/non-zero status. */
7930
7931 label = gen_label_rtx ();
7932 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7933 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7934 label));
7935
7936 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7937 we used movdqa (i.e. TImode) instead? Perhaps even better would
7938 be if we could determine the real mode of the data, via a hook
7939 into pass_stdarg. Ignore all that for now. */
7940 smode = V4SFmode;
7941 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7942 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7943
7944 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7945 if (max > X86_64_SSE_REGPARM_MAX)
7946 max = X86_64_SSE_REGPARM_MAX;
7947
7948 for (i = cum->sse_regno; i < max; ++i)
7949 {
7950 mem = plus_constant (Pmode, save_area,
7951 i * 16 + ix86_varargs_gpr_size);
7952 mem = gen_rtx_MEM (smode, mem);
7953 MEM_NOTRAP_P (mem) = 1;
7954 set_mem_alias_set (mem, set);
7955 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7956
7957 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7958 }
7959
7960 emit_label (label);
7961 }
7962 }
7963
7964 static void
7965 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7966 {
7967 alias_set_type set = get_varargs_alias_set ();
7968 int i;
7969
7970 /* Reset to zero, as there might be a sysv vaarg used
7971 before. */
7972 ix86_varargs_gpr_size = 0;
7973 ix86_varargs_fpr_size = 0;
7974
7975 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7976 {
7977 rtx reg, mem;
7978
7979 mem = gen_rtx_MEM (Pmode,
7980 plus_constant (Pmode, virtual_incoming_args_rtx,
7981 i * UNITS_PER_WORD));
7982 MEM_NOTRAP_P (mem) = 1;
7983 set_mem_alias_set (mem, set);
7984
7985 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7986 emit_move_insn (mem, reg);
7987 }
7988 }
7989
7990 static void
7991 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7992 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7993 int no_rtl)
7994 {
7995 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7996 CUMULATIVE_ARGS next_cum;
7997 tree fntype;
7998
7999 /* This argument doesn't appear to be used anymore, which is good,
8000 because the old code here didn't suppress rtl generation. */
8001 gcc_assert (!no_rtl);
8002
8003 if (!TARGET_64BIT)
8004 return;
8005
8006 fntype = TREE_TYPE (current_function_decl);
8007
8008 /* For varargs, we do not want to skip the dummy va_dcl argument.
8009 For stdargs, we do want to skip the last named argument. */
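/* E.g. for void f (int n, ...) the named N has already been accounted for
   after the advance below, so the save-area bookkeeping starts with the
   first anonymous argument. */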
8010 next_cum = *cum;
8011 if (stdarg_p (fntype))
8012 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
8013 true);
8014
8015 if (cum->call_abi == MS_ABI)
8016 setup_incoming_varargs_ms_64 (&next_cum);
8017 else
8018 setup_incoming_varargs_64 (&next_cum);
8019 }
8020
8021 /* Checks if TYPE is of kind va_list char *. */
8022
8023 static bool
8024 is_va_list_char_pointer (tree type)
8025 {
8026 tree canonic;
8027
8028 /* For 32-bit it is always true. */
8029 if (!TARGET_64BIT)
8030 return true;
8031 canonic = ix86_canonical_va_list_type (type);
8032 return (canonic == ms_va_list_type_node
8033 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
8034 }
8035
8036 /* Implement va_start. */
8037
8038 static void
8039 ix86_va_start (tree valist, rtx nextarg)
8040 {
8041 HOST_WIDE_INT words, n_gpr, n_fpr;
8042 tree f_gpr, f_fpr, f_ovf, f_sav;
8043 tree gpr, fpr, ovf, sav, t;
8044 tree type;
8045 rtx ovf_rtx;
8046
8047 if (flag_split_stack
8048 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8049 {
8050 unsigned int scratch_regno;
8051
8052 /* When we are splitting the stack, we can't refer to the stack
8053 arguments using internal_arg_pointer, because they may be on
8054 the old stack. The split stack prologue will arrange to
8055 leave a pointer to the old stack arguments in a scratch
8056 register, which we here copy to a pseudo-register. The split
8057 stack prologue can't set the pseudo-register directly because
8058 it (the prologue) runs before any registers have been saved. */
8059
8060 scratch_regno = split_stack_prologue_scratch_regno ();
8061 if (scratch_regno != INVALID_REGNUM)
8062 {
8063 rtx reg, seq;
8064
8065 reg = gen_reg_rtx (Pmode);
8066 cfun->machine->split_stack_varargs_pointer = reg;
8067
8068 start_sequence ();
8069 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
8070 seq = get_insns ();
8071 end_sequence ();
8072
8073 push_topmost_sequence ();
8074 emit_insn_after (seq, entry_of_function ());
8075 pop_topmost_sequence ();
8076 }
8077 }
8078
8079 /* Only the 64-bit target needs something special. */
8080 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8081 {
8082 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8083 std_expand_builtin_va_start (valist, nextarg);
8084 else
8085 {
8086 rtx va_r, next;
8087
8088 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
8089 next = expand_binop (ptr_mode, add_optab,
8090 cfun->machine->split_stack_varargs_pointer,
8091 crtl->args.arg_offset_rtx,
8092 NULL_RTX, 0, OPTAB_LIB_WIDEN);
8093 convert_move (va_r, next, 0);
8094 }
8095 return;
8096 }
8097
8098 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8099 f_fpr = DECL_CHAIN (f_gpr);
8100 f_ovf = DECL_CHAIN (f_fpr);
8101 f_sav = DECL_CHAIN (f_ovf);
8102
8103 valist = build_simple_mem_ref (valist);
8104 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
8105 /* The following should be folded into the MEM_REF offset. */
8106 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
8107 f_gpr, NULL_TREE);
8108 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
8109 f_fpr, NULL_TREE);
8110 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
8111 f_ovf, NULL_TREE);
8112 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
8113 f_sav, NULL_TREE);
8114
8115 /* Count number of gp and fp argument registers used. */
8116 words = crtl->args.info.words;
8117 n_gpr = crtl->args.info.regno;
8118 n_fpr = crtl->args.info.sse_regno;
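/* Illustrative example: for printf (const char *fmt, ...) one GPR is
   consumed by FMT and no SSE registers are used, so the stores below set
   gp_offset to 8 and fp_offset to 8 * X86_64_REGPARM_MAX. */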
8119
8120 if (cfun->va_list_gpr_size)
8121 {
8122 type = TREE_TYPE (gpr);
8123 t = build2 (MODIFY_EXPR, type,
8124 gpr, build_int_cst (type, n_gpr * 8));
8125 TREE_SIDE_EFFECTS (t) = 1;
8126 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8127 }
8128
8129 if (TARGET_SSE && cfun->va_list_fpr_size)
8130 {
8131 type = TREE_TYPE (fpr);
8132 t = build2 (MODIFY_EXPR, type, fpr,
8133 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
8134 TREE_SIDE_EFFECTS (t) = 1;
8135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8136 }
8137
8138 /* Find the overflow area. */
8139 type = TREE_TYPE (ovf);
8140 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
8141 ovf_rtx = crtl->args.internal_arg_pointer;
8142 else
8143 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
8144 t = make_tree (type, ovf_rtx);
8145 if (words != 0)
8146 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
8147 t = build2 (MODIFY_EXPR, type, ovf, t);
8148 TREE_SIDE_EFFECTS (t) = 1;
8149 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8150
8151 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
8152 {
8153 /* Find the register save area.
8154 The function prologue saves it right above the stack frame. */
8155 type = TREE_TYPE (sav);
8156 t = make_tree (type, frame_pointer_rtx);
8157 if (!ix86_varargs_gpr_size)
8158 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
8159 t = build2 (MODIFY_EXPR, type, sav, t);
8160 TREE_SIDE_EFFECTS (t) = 1;
8161 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8162 }
8163 }
8164
8165 /* Implement va_arg. */
8166
8167 static tree
8168 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
8169 gimple_seq *post_p)
8170 {
8171 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
8172 tree f_gpr, f_fpr, f_ovf, f_sav;
8173 tree gpr, fpr, ovf, sav, t;
8174 int size, rsize;
8175 tree lab_false, lab_over = NULL_TREE;
8176 tree addr, t2;
8177 rtx container;
8178 int indirect_p = 0;
8179 tree ptrtype;
8180 enum machine_mode nat_mode;
8181 unsigned int arg_boundary;
8182
8183 /* Only the 64-bit target needs something special. */
8184 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
8185 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
8186
8187 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
8188 f_fpr = DECL_CHAIN (f_gpr);
8189 f_ovf = DECL_CHAIN (f_fpr);
8190 f_sav = DECL_CHAIN (f_ovf);
8191
8192 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
8193 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
8194 valist = build_va_arg_indirect_ref (valist);
8195 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
8196 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
8197 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
8198
8199 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8200 if (indirect_p)
8201 type = build_pointer_type (type);
8202 size = int_size_in_bytes (type);
8203 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
8204
8205 nat_mode = type_natural_mode (type, NULL);
8206 switch (nat_mode)
8207 {
8208 case V8SFmode:
8209 case V8SImode:
8210 case V32QImode:
8211 case V16HImode:
8212 case V4DFmode:
8213 case V4DImode:
8214 /* Unnamed 256-bit vector mode parameters are passed on the stack. */
8215 if (!TARGET_64BIT_MS_ABI)
8216 {
8217 container = NULL;
8218 break;
8219 }
8220
8221 default:
8222 container = construct_container (nat_mode, TYPE_MODE (type),
8223 type, 0, X86_64_REGPARM_MAX,
8224 X86_64_SSE_REGPARM_MAX, intreg,
8225 0);
8226 break;
8227 }
8228
8229 /* Pull the value out of the saved registers. */
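/* Rough sketch of the gimple emitted below for va_arg of an integer-sized
   value (needed_intregs == 1, needed_sseregs == 0):

     if (gp_offset >= 48) goto lab_false;
     addr = reg_save_area + gp_offset;
     gp_offset += 8;
     goto lab_over;
   lab_false:
     addr = overflow_arg_area;     (aligned first if required)
     overflow_arg_area += 8;
   lab_over:
     result = *(int *) addr;  */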
8230
8231 addr = create_tmp_var (ptr_type_node, "addr");
8232
8233 if (container)
8234 {
8235 int needed_intregs, needed_sseregs;
8236 bool need_temp;
8237 tree int_addr, sse_addr;
8238
8239 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8240 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8241
8242 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8243
8244 need_temp = (!REG_P (container)
8245 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8246 || TYPE_ALIGN (type) > 128));
8247
8248 /* In case we are passing a structure, verify that it is a consecutive block
8249 in the register save area. If not, we need to do moves. */
8250 if (!need_temp && !REG_P (container))
8251 {
8252 /* Verify that all registers are strictly consecutive. */
8253 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8254 {
8255 int i;
8256
8257 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8258 {
8259 rtx slot = XVECEXP (container, 0, i);
8260 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8261 || INTVAL (XEXP (slot, 1)) != i * 16)
8262 need_temp = 1;
8263 }
8264 }
8265 else
8266 {
8267 int i;
8268
8269 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8270 {
8271 rtx slot = XVECEXP (container, 0, i);
8272 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8273 || INTVAL (XEXP (slot, 1)) != i * 8)
8274 need_temp = 1;
8275 }
8276 }
8277 }
8278 if (!need_temp)
8279 {
8280 int_addr = addr;
8281 sse_addr = addr;
8282 }
8283 else
8284 {
8285 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8286 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8287 }
8288
8289 /* First ensure that we fit completely in registers. */
8290 if (needed_intregs)
8291 {
8292 t = build_int_cst (TREE_TYPE (gpr),
8293 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8294 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8295 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8296 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8297 gimplify_and_add (t, pre_p);
8298 }
8299 if (needed_sseregs)
8300 {
8301 t = build_int_cst (TREE_TYPE (fpr),
8302 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8303 + X86_64_REGPARM_MAX * 8);
8304 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8305 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8306 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8307 gimplify_and_add (t, pre_p);
8308 }
8309
8310 /* Compute index to start of area used for integer regs. */
8311 if (needed_intregs)
8312 {
8313 /* int_addr = gpr + sav; */
8314 t = fold_build_pointer_plus (sav, gpr);
8315 gimplify_assign (int_addr, t, pre_p);
8316 }
8317 if (needed_sseregs)
8318 {
8319 /* sse_addr = fpr + sav; */
8320 t = fold_build_pointer_plus (sav, fpr);
8321 gimplify_assign (sse_addr, t, pre_p);
8322 }
8323 if (need_temp)
8324 {
8325 int i, prev_size = 0;
8326 tree temp = create_tmp_var (type, "va_arg_tmp");
8327
8328 /* addr = &temp; */
8329 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8330 gimplify_assign (addr, t, pre_p);
8331
8332 for (i = 0; i < XVECLEN (container, 0); i++)
8333 {
8334 rtx slot = XVECEXP (container, 0, i);
8335 rtx reg = XEXP (slot, 0);
8336 enum machine_mode mode = GET_MODE (reg);
8337 tree piece_type;
8338 tree addr_type;
8339 tree daddr_type;
8340 tree src_addr, src;
8341 int src_offset;
8342 tree dest_addr, dest;
8343 int cur_size = GET_MODE_SIZE (mode);
8344
8345 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8346 prev_size = INTVAL (XEXP (slot, 1));
8347 if (prev_size + cur_size > size)
8348 {
8349 cur_size = size - prev_size;
8350 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8351 if (mode == BLKmode)
8352 mode = QImode;
8353 }
8354 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8355 if (mode == GET_MODE (reg))
8356 addr_type = build_pointer_type (piece_type);
8357 else
8358 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8359 true);
8360 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8361 true);
8362
8363 if (SSE_REGNO_P (REGNO (reg)))
8364 {
8365 src_addr = sse_addr;
8366 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8367 }
8368 else
8369 {
8370 src_addr = int_addr;
8371 src_offset = REGNO (reg) * 8;
8372 }
8373 src_addr = fold_convert (addr_type, src_addr);
8374 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8375
8376 dest_addr = fold_convert (daddr_type, addr);
8377 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8378 if (cur_size == GET_MODE_SIZE (mode))
8379 {
8380 src = build_va_arg_indirect_ref (src_addr);
8381 dest = build_va_arg_indirect_ref (dest_addr);
8382
8383 gimplify_assign (dest, src, pre_p);
8384 }
8385 else
8386 {
8387 tree copy
8388 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8389 3, dest_addr, src_addr,
8390 size_int (cur_size));
8391 gimplify_and_add (copy, pre_p);
8392 }
8393 prev_size += cur_size;
8394 }
8395 }
8396
8397 if (needed_intregs)
8398 {
8399 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8400 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8401 gimplify_assign (gpr, t, pre_p);
8402 }
8403
8404 if (needed_sseregs)
8405 {
8406 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8407 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8408 gimplify_assign (fpr, t, pre_p);
8409 }
8410
8411 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8412
8413 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8414 }
8415
8416 /* ... otherwise out of the overflow area. */
8417
8418 /* When we align a parameter on the stack for the caller, if the parameter
8419 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
8420 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee
8421 here with the caller. */
8422 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8423 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8424 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8425
8426 /* Care for on-stack alignment if needed. */
8427 if (arg_boundary <= 64 || size == 0)
8428 t = ovf;
8429 else
8430 {
8431 HOST_WIDE_INT align = arg_boundary / 8;
8432 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8433 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8434 build_int_cst (TREE_TYPE (t), -align));
8435 }
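/* For illustration: with a 128-bit arg_boundary, align == 16, so the
   overflow pointer is rounded up via (ovf + 15) & -16; e.g. an ovf
   ending in 0x38 becomes one ending in 0x40.  This is only a worked
   example of the computation above, not an additional case.  */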
8436
8437 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8438 gimplify_assign (addr, t, pre_p);
8439
8440 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8441 gimplify_assign (unshare_expr (ovf), t, pre_p);
8442
8443 if (container)
8444 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8445
8446 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8447 addr = fold_convert (ptrtype, addr);
8448
8449 if (indirect_p)
8450 addr = build_va_arg_indirect_ref (addr);
8451 return build_va_arg_indirect_ref (addr);
8452 }
8453 \f
8454 /* Return true if OPNUM's MEM should be matched
8455 in movabs* patterns. */
8456
8457 bool
8458 ix86_check_movabs (rtx insn, int opnum)
8459 {
8460 rtx set, mem;
8461
8462 set = PATTERN (insn);
8463 if (GET_CODE (set) == PARALLEL)
8464 set = XVECEXP (set, 0, 0);
8465 gcc_assert (GET_CODE (set) == SET);
8466 mem = XEXP (set, opnum);
8467 while (GET_CODE (mem) == SUBREG)
8468 mem = SUBREG_REG (mem);
8469 gcc_assert (MEM_P (mem));
8470 return volatile_ok || !MEM_VOLATILE_P (mem);
8471 }
8472 \f
8473 /* Initialize the table of extra 80387 mathematical constants. */
8474
8475 static void
8476 init_ext_80387_constants (void)
8477 {
8478 static const char * cst[5] =
8479 {
8480 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8481 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8482 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8483 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8484 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8485 };
8486 int i;
8487
8488 for (i = 0; i < 5; i++)
8489 {
8490 real_from_string (&ext_80387_constants_table[i], cst[i]);
8491 /* Ensure each constant is rounded to XFmode precision. */
8492 real_convert (&ext_80387_constants_table[i],
8493 XFmode, &ext_80387_constants_table[i]);
8494 }
8495
8496 ext_80387_constants_init = 1;
8497 }
8498
8499 /* Return non-zero if the constant is something that
8500 can be loaded with a special instruction. */
8501
8502 int
8503 standard_80387_constant_p (rtx x)
8504 {
8505 enum machine_mode mode = GET_MODE (x);
8506
8507 REAL_VALUE_TYPE r;
8508
8509 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8510 return -1;
8511
8512 if (x == CONST0_RTX (mode))
8513 return 1;
8514 if (x == CONST1_RTX (mode))
8515 return 2;
8516
8517 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8518
8519 /* For XFmode constants, try to find a special 80387 instruction when
8520 optimizing for size or on those CPUs that benefit from them. */
8521 if (mode == XFmode
8522 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8523 {
8524 int i;
8525
8526 if (! ext_80387_constants_init)
8527 init_ext_80387_constants ();
8528
8529 for (i = 0; i < 5; i++)
8530 if (real_identical (&r, &ext_80387_constants_table[i]))
8531 return i + 3;
8532 }
8533
8534 /* A load of the constant -0.0 or -1.0 will be split into an
8535 fldz;fchs or fld1;fchs sequence. */
8536 if (real_isnegzero (&r))
8537 return 8;
8538 if (real_identical (&r, &dconstm1))
8539 return 9;
8540
8541 return 0;
8542 }
8543
8544 /* Return the opcode of the special instruction to be used to load
8545 the constant X. */
8546
8547 const char *
8548 standard_80387_constant_opcode (rtx x)
8549 {
8550 switch (standard_80387_constant_p (x))
8551 {
8552 case 1:
8553 return "fldz";
8554 case 2:
8555 return "fld1";
8556 case 3:
8557 return "fldlg2";
8558 case 4:
8559 return "fldln2";
8560 case 5:
8561 return "fldl2e";
8562 case 6:
8563 return "fldl2t";
8564 case 7:
8565 return "fldpi";
8566 case 8:
8567 case 9:
8568 return "#";
8569 default:
8570 gcc_unreachable ();
8571 }
8572 }
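/* An illustrative example derived from the two functions above: for an
   XFmode CONST_DOUBLE equal to pi, standard_80387_constant_p returns 7
   when the constant table is consulted (optimizing for size or
   TARGET_EXT_80387_CONSTANTS), so the load is emitted as "fldpi";
   CONST1_RTX (XFmode) yields 2 and is emitted as "fld1".  */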
8573
8574 /* Return the CONST_DOUBLE representing the 80387 constant that is
8575 loaded by the specified special instruction. The argument IDX
8576 matches the return value from standard_80387_constant_p. */
8577
8578 rtx
8579 standard_80387_constant_rtx (int idx)
8580 {
8581 int i;
8582
8583 if (! ext_80387_constants_init)
8584 init_ext_80387_constants ();
8585
8586 switch (idx)
8587 {
8588 case 3:
8589 case 4:
8590 case 5:
8591 case 6:
8592 case 7:
8593 i = idx - 3;
8594 break;
8595
8596 default:
8597 gcc_unreachable ();
8598 }
8599
8600 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8601 XFmode);
8602 }
8603
8604 /* Return 1 if X is all 0s and 2 if X is all 1s
8605 in a supported SSE/AVX vector mode. */
8606
8607 int
8608 standard_sse_constant_p (rtx x)
8609 {
8610 enum machine_mode mode = GET_MODE (x);
8611
8612 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8613 return 1;
8614 if (vector_all_ones_operand (x, mode))
8615 switch (mode)
8616 {
8617 case V16QImode:
8618 case V8HImode:
8619 case V4SImode:
8620 case V2DImode:
8621 if (TARGET_SSE2)
8622 return 2;
8623 case V32QImode:
8624 case V16HImode:
8625 case V8SImode:
8626 case V4DImode:
8627 if (TARGET_AVX2)
8628 return 2;
8629 default:
8630 break;
8631 }
8632
8633 return 0;
8634 }
8635
8636 /* Return the opcode of the special instruction to be used to load
8637 the constant X. */
8638
8639 const char *
8640 standard_sse_constant_opcode (rtx insn, rtx x)
8641 {
8642 switch (standard_sse_constant_p (x))
8643 {
8644 case 1:
8645 switch (get_attr_mode (insn))
8646 {
8647 case MODE_TI:
8648 return "%vpxor\t%0, %d0";
8649 case MODE_V2DF:
8650 return "%vxorpd\t%0, %d0";
8651 case MODE_V4SF:
8652 return "%vxorps\t%0, %d0";
8653
8654 case MODE_OI:
8655 return "vpxor\t%x0, %x0, %x0";
8656 case MODE_V4DF:
8657 return "vxorpd\t%x0, %x0, %x0";
8658 case MODE_V8SF:
8659 return "vxorps\t%x0, %x0, %x0";
8660
8661 default:
8662 break;
8663 }
8664
8665 case 2:
8666 if (get_attr_mode (insn) == MODE_XI
8667 || get_attr_mode (insn) == MODE_V8DF
8668 || get_attr_mode (insn) == MODE_V16SF)
8669 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
8670 if (TARGET_AVX)
8671 return "vpcmpeqd\t%0, %0, %0";
8672 else
8673 return "pcmpeqd\t%0, %0";
8674
8675 default:
8676 break;
8677 }
8678 gcc_unreachable ();
8679 }
8680
8681 /* Returns true if OP contains a symbol reference. */
8682
8683 bool
8684 symbolic_reference_mentioned_p (rtx op)
8685 {
8686 const char *fmt;
8687 int i;
8688
8689 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8690 return true;
8691
8692 fmt = GET_RTX_FORMAT (GET_CODE (op));
8693 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8694 {
8695 if (fmt[i] == 'E')
8696 {
8697 int j;
8698
8699 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8700 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8701 return true;
8702 }
8703
8704 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8705 return true;
8706 }
8707
8708 return false;
8709 }
8710
8711 /* Return true if it is appropriate to emit `ret' instructions in the
8712 body of a function. Do this only if the epilogue is simple, needing a
8713 couple of insns. Prior to reloading, we can't tell how many registers
8714 must be saved, so return false then. Return false if there is no frame
8715 marker to de-allocate. */
8716
8717 bool
8718 ix86_can_use_return_insn_p (void)
8719 {
8720 struct ix86_frame frame;
8721
8722 if (! reload_completed || frame_pointer_needed)
8723 return 0;
8724
8725 /* Don't allow more than 32k pop, since that's all we can do
8726 with one instruction. */
8727 if (crtl->args.pops_args && crtl->args.size >= 32768)
8728 return 0;
8729
8730 ix86_compute_frame_layout (&frame);
8731 return (frame.stack_pointer_offset == UNITS_PER_WORD
8732 && (frame.nregs + frame.nsseregs) == 0);
8733 }
8734 \f
8735 /* Value should be nonzero if functions must have frame pointers.
8736 Zero means the frame pointer need not be set up (and parms may
8737 be accessed via the stack pointer) in functions that seem suitable. */
8738
8739 static bool
8740 ix86_frame_pointer_required (void)
8741 {
8742 /* If we accessed previous frames, then the generated code expects
8743 to be able to access the saved ebp value in our frame. */
8744 if (cfun->machine->accesses_prev_frame)
8745 return true;
8746
8747 /* Several x86 OSes need a frame pointer for other reasons,
8748 usually pertaining to setjmp. */
8749 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8750 return true;
8751
8752 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
8753 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8754 return true;
8755
8756 /* For Win64 SEH, very large frames need a frame pointer since the maximum
8757 stack allocation is 4GB. */
8758 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8759 return true;
8760
8761 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8762 turns off the frame pointer by default. Turn it back on now if
8763 we've not got a leaf function. */
8764 if (TARGET_OMIT_LEAF_FRAME_POINTER
8765 && (!crtl->is_leaf
8766 || ix86_current_function_calls_tls_descriptor))
8767 return true;
8768
8769 if (crtl->profile && !flag_fentry)
8770 return true;
8771
8772 return false;
8773 }
8774
8775 /* Record that the current function accesses previous call frames. */
8776
8777 void
8778 ix86_setup_frame_addresses (void)
8779 {
8780 cfun->machine->accesses_prev_frame = 1;
8781 }
8782 \f
8783 #ifndef USE_HIDDEN_LINKONCE
8784 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8785 # define USE_HIDDEN_LINKONCE 1
8786 # else
8787 # define USE_HIDDEN_LINKONCE 0
8788 # endif
8789 #endif
8790
8791 static int pic_labels_used;
8792
8793 /* Fills in the label name that should be used for a pc thunk for
8794 the given register. */
8795
8796 static void
8797 get_pc_thunk_name (char name[32], unsigned int regno)
8798 {
8799 gcc_assert (!TARGET_64BIT);
8800
8801 if (USE_HIDDEN_LINKONCE)
8802 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8803 else
8804 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8805 }
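/* For example, with regno == BX_REG this yields "__x86.get_pc_thunk.bx"
   when USE_HIDDEN_LINKONCE, and an internal label in the "LPR" namespace
   otherwise (its exact spelling depends on ASM_GENERATE_INTERNAL_LABEL).  */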
8806
8807
8808 /* This function generates the pc thunks used for -fpic; each thunk loads
8809 its register with the return address of the caller and then returns. */
8810
8811 static void
8812 ix86_code_end (void)
8813 {
8814 rtx xops[2];
8815 int regno;
8816
8817 for (regno = AX_REG; regno <= SP_REG; regno++)
8818 {
8819 char name[32];
8820 tree decl;
8821
8822 if (!(pic_labels_used & (1 << regno)))
8823 continue;
8824
8825 get_pc_thunk_name (name, regno);
8826
8827 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8828 get_identifier (name),
8829 build_function_type_list (void_type_node, NULL_TREE));
8830 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8831 NULL_TREE, void_type_node);
8832 TREE_PUBLIC (decl) = 1;
8833 TREE_STATIC (decl) = 1;
8834 DECL_IGNORED_P (decl) = 1;
8835
8836 #if TARGET_MACHO
8837 if (TARGET_MACHO)
8838 {
8839 switch_to_section (darwin_sections[text_coal_section]);
8840 fputs ("\t.weak_definition\t", asm_out_file);
8841 assemble_name (asm_out_file, name);
8842 fputs ("\n\t.private_extern\t", asm_out_file);
8843 assemble_name (asm_out_file, name);
8844 putc ('\n', asm_out_file);
8845 ASM_OUTPUT_LABEL (asm_out_file, name);
8846 DECL_WEAK (decl) = 1;
8847 }
8848 else
8849 #endif
8850 if (USE_HIDDEN_LINKONCE)
8851 {
8852 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8853
8854 targetm.asm_out.unique_section (decl, 0);
8855 switch_to_section (get_named_section (decl, NULL, 0));
8856
8857 targetm.asm_out.globalize_label (asm_out_file, name);
8858 fputs ("\t.hidden\t", asm_out_file);
8859 assemble_name (asm_out_file, name);
8860 putc ('\n', asm_out_file);
8861 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8862 }
8863 else
8864 {
8865 switch_to_section (text_section);
8866 ASM_OUTPUT_LABEL (asm_out_file, name);
8867 }
8868
8869 DECL_INITIAL (decl) = make_node (BLOCK);
8870 current_function_decl = decl;
8871 init_function_start (decl);
8872 first_function_block_is_cold = false;
8873 /* Make sure unwind info is emitted for the thunk if needed. */
8874 final_start_function (emit_barrier (), asm_out_file, 1);
8875
8876 /* Pad stack IP move with 4 instructions (two NOPs count
8877 as one instruction). */
8878 if (TARGET_PAD_SHORT_FUNCTION)
8879 {
8880 int i = 8;
8881
8882 while (i--)
8883 fputs ("\tnop\n", asm_out_file);
8884 }
8885
8886 xops[0] = gen_rtx_REG (Pmode, regno);
8887 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8888 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8889 output_asm_insn ("%!ret", NULL);
8890 final_end_function ();
8891 init_insn_lengths ();
8892 free_after_compilation (cfun);
8893 set_cfun (NULL);
8894 current_function_decl = NULL;
8895 }
8896
8897 if (flag_split_stack)
8898 file_end_indicate_split_stack ();
8899 }
8900
8901 /* Emit code for the SET_GOT patterns. */
8902
8903 const char *
8904 output_set_got (rtx dest, rtx label)
8905 {
8906 rtx xops[3];
8907
8908 xops[0] = dest;
8909
8910 if (TARGET_VXWORKS_RTP && flag_pic)
8911 {
8912 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8913 xops[2] = gen_rtx_MEM (Pmode,
8914 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8915 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8916
8917 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8918 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8919 an unadorned address. */
8920 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8921 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8922 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8923 return "";
8924 }
8925
8926 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8927
8928 if (!flag_pic)
8929 {
8930 if (TARGET_MACHO)
8931 /* We don't need a pic base; we're not producing pic. */
8932 gcc_unreachable ();
8933
8934 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8935 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8936 targetm.asm_out.internal_label (asm_out_file, "L",
8937 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8938 }
8939 else
8940 {
8941 char name[32];
8942 get_pc_thunk_name (name, REGNO (dest));
8943 pic_labels_used |= 1 << REGNO (dest);
8944
8945 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8946 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8947 output_asm_insn ("%!call\t%X2", xops);
8948
8949 #if TARGET_MACHO
8950 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8951 This is what will be referenced by the Mach-O PIC subsystem. */
8952 if (machopic_should_output_picbase_label () || !label)
8953 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8954
8955 /* When we are restoring the pic base at the site of a nonlocal label,
8956 and we decided to emit the pic base above, we will still output a
8957 local label used for calculating the correction offset (even though
8958 the offset will be 0 in that case). */
8959 if (label)
8960 targetm.asm_out.internal_label (asm_out_file, "L",
8961 CODE_LABEL_NUMBER (label));
8962 #endif
8963 }
8964
8965 if (!TARGET_MACHO)
8966 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8967
8968 return "";
8969 }
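/* As a rough illustration, the common 32-bit PIC case (PIC register
   %ebx, AT&T syntax) is expected to produce something like:

	call	__x86.get_pc_thunk.bx
	addl	$_GLOBAL_OFFSET_TABLE_, %ebx

   where the exact spelling of the GOT symbol comes from GOT_SYMBOL_NAME.  */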
8970
8971 /* Generate a "push" pattern for input ARG. */
8972
8973 static rtx
8974 gen_push (rtx arg)
8975 {
8976 struct machine_function *m = cfun->machine;
8977
8978 if (m->fs.cfa_reg == stack_pointer_rtx)
8979 m->fs.cfa_offset += UNITS_PER_WORD;
8980 m->fs.sp_offset += UNITS_PER_WORD;
8981
8982 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8983 arg = gen_rtx_REG (word_mode, REGNO (arg));
8984
8985 return gen_rtx_SET (VOIDmode,
8986 gen_rtx_MEM (word_mode,
8987 gen_rtx_PRE_DEC (Pmode,
8988 stack_pointer_rtx)),
8989 arg);
8990 }
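/* As a sketch, on 32-bit targets gen_push (eax) returns roughly
   (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg:SI ax)) and records the
   extra UNITS_PER_WORD in cfun->machine->fs.sp_offset (and in cfa_offset
   while the CFA is still the stack pointer).  */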
8991
8992 /* Generate a "pop" pattern for input ARG. */
8993
8994 static rtx
8995 gen_pop (rtx arg)
8996 {
8997 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8998 arg = gen_rtx_REG (word_mode, REGNO (arg));
8999
9000 return gen_rtx_SET (VOIDmode,
9001 arg,
9002 gen_rtx_MEM (word_mode,
9003 gen_rtx_POST_INC (Pmode,
9004 stack_pointer_rtx)));
9005 }
9006
9007 /* Return >= 0 if there is an unused call-clobbered register available
9008 for the entire function. */
9009
9010 static unsigned int
9011 ix86_select_alt_pic_regnum (void)
9012 {
9013 if (crtl->is_leaf
9014 && !crtl->profile
9015 && !ix86_current_function_calls_tls_descriptor)
9016 {
9017 int i, drap;
9018 /* Can't use the same register for both PIC and DRAP. */
9019 if (crtl->drap_reg)
9020 drap = REGNO (crtl->drap_reg);
9021 else
9022 drap = -1;
9023 for (i = 2; i >= 0; --i)
9024 if (i != drap && !df_regs_ever_live_p (i))
9025 return i;
9026 }
9027
9028 return INVALID_REGNUM;
9029 }
9030
9031 /* Return TRUE if we need to save REGNO. */
9032
9033 static bool
9034 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
9035 {
9036 if (pic_offset_table_rtx
9037 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
9038 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
9039 || crtl->profile
9040 || crtl->calls_eh_return
9041 || crtl->uses_const_pool
9042 || cfun->has_nonlocal_label))
9043 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
9044
9045 if (crtl->calls_eh_return && maybe_eh_return)
9046 {
9047 unsigned i;
9048 for (i = 0; ; i++)
9049 {
9050 unsigned test = EH_RETURN_DATA_REGNO (i);
9051 if (test == INVALID_REGNUM)
9052 break;
9053 if (test == regno)
9054 return true;
9055 }
9056 }
9057
9058 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
9059 return true;
9060
9061 return (df_regs_ever_live_p (regno)
9062 && !call_used_regs[regno]
9063 && !fixed_regs[regno]
9064 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
9065 }
9066
9067 /* Return number of saved general purpose registers. */
9068
9069 static int
9070 ix86_nsaved_regs (void)
9071 {
9072 int nregs = 0;
9073 int regno;
9074
9075 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9076 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9077 nregs ++;
9078 return nregs;
9079 }
9080
9081 /* Return number of saved SSE registers. */
9082
9083 static int
9084 ix86_nsaved_sseregs (void)
9085 {
9086 int nregs = 0;
9087 int regno;
9088
9089 if (!TARGET_64BIT_MS_ABI)
9090 return 0;
9091 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9092 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9093 nregs ++;
9094 return nregs;
9095 }
9096
9097 /* Given FROM and TO register numbers, say whether this elimination is
9098 allowed. If stack alignment is needed, we can only replace argument
9099 pointer with hard frame pointer, or replace frame pointer with stack
9100 pointer. Otherwise, frame pointer elimination is automatically
9101 handled and all other eliminations are valid. */
9102
9103 static bool
9104 ix86_can_eliminate (const int from, const int to)
9105 {
9106 if (stack_realign_fp)
9107 return ((from == ARG_POINTER_REGNUM
9108 && to == HARD_FRAME_POINTER_REGNUM)
9109 || (from == FRAME_POINTER_REGNUM
9110 && to == STACK_POINTER_REGNUM));
9111 else
9112 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
9113 }
9114
9115 /* Return the offset between two registers, one to be eliminated, and the other
9116 its replacement, at the start of a routine. */
9117
9118 HOST_WIDE_INT
9119 ix86_initial_elimination_offset (int from, int to)
9120 {
9121 struct ix86_frame frame;
9122 ix86_compute_frame_layout (&frame);
9123
9124 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
9125 return frame.hard_frame_pointer_offset;
9126 else if (from == FRAME_POINTER_REGNUM
9127 && to == HARD_FRAME_POINTER_REGNUM)
9128 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
9129 else
9130 {
9131 gcc_assert (to == STACK_POINTER_REGNUM);
9132
9133 if (from == ARG_POINTER_REGNUM)
9134 return frame.stack_pointer_offset;
9135
9136 gcc_assert (from == FRAME_POINTER_REGNUM);
9137 return frame.stack_pointer_offset - frame.frame_pointer_offset;
9138 }
9139 }
9140
9141 /* In a dynamically-aligned function, we can't know the offset from
9142 stack pointer to frame pointer, so we must ensure that setjmp
9143 eliminates fp against the hard fp (%ebp) rather than trying to
9144 index from %esp up to the top of the frame across a gap that is
9145 of unknown (at compile-time) size. */
9146 static rtx
9147 ix86_builtin_setjmp_frame_value (void)
9148 {
9149 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
9150 }
9151
9152 /* When using -fsplit-stack, the allocation routines set a field in
9153 the TCB to the bottom of the stack plus this much space, measured
9154 in bytes. */
9155
9156 #define SPLIT_STACK_AVAILABLE 256
9157
9158 /* Fill structure ix86_frame about frame of currently computed function. */
9159
9160 static void
9161 ix86_compute_frame_layout (struct ix86_frame *frame)
9162 {
9163 unsigned HOST_WIDE_INT stack_alignment_needed;
9164 HOST_WIDE_INT offset;
9165 unsigned HOST_WIDE_INT preferred_alignment;
9166 HOST_WIDE_INT size = get_frame_size ();
9167 HOST_WIDE_INT to_allocate;
9168
9169 frame->nregs = ix86_nsaved_regs ();
9170 frame->nsseregs = ix86_nsaved_sseregs ();
9171
9172 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
9173 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
9174
9175 /* The 64-bit MS ABI seems to require stack alignment to always be 16, except
9176 for function prologues and leaf functions. */
9177 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
9178 && (!crtl->is_leaf || cfun->calls_alloca != 0
9179 || ix86_current_function_calls_tls_descriptor))
9180 {
9181 preferred_alignment = 16;
9182 stack_alignment_needed = 16;
9183 crtl->preferred_stack_boundary = 128;
9184 crtl->stack_alignment_needed = 128;
9185 }
9186
9187 gcc_assert (!size || stack_alignment_needed);
9188 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
9189 gcc_assert (preferred_alignment <= stack_alignment_needed);
9190
9191 /* For SEH we have to limit the amount of code movement into the prologue.
9192 At present we do this via a BLOCKAGE, at which point there's very little
9193 scheduling that can be done, which means that there's very little point
9194 in doing anything except PUSHs. */
9195 if (TARGET_SEH)
9196 cfun->machine->use_fast_prologue_epilogue = false;
9197
9198 /* During reload iteration the number of registers saved can change.
9199 Recompute the value as needed. Do not recompute when the number of registers
9200 didn't change, as reload does multiple calls to the function and does not
9201 expect the decision to change within a single iteration. */
9202 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR)
9203 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
9204 {
9205 int count = frame->nregs;
9206 struct cgraph_node *node = cgraph_get_node (current_function_decl);
9207
9208 cfun->machine->use_fast_prologue_epilogue_nregs = count;
9209
9210 /* The fast prologue uses move instead of push to save registers. This
9211 is significantly longer, but also executes faster as modern hardware
9212 can execute the moves in parallel, but can't do that for push/pop.
9213
9214 Be careful about choosing what prologue to emit: When function takes
9215 many instructions to execute we may use slow version as well as in
9216 case function is known to be outside hot spot (this is known with
9217 feedback only). Weight the size of function by number of registers
9218 to save as it is cheap to use one or two push instructions but very
9219 slow to use many of them. */
9220 if (count)
9221 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
9222 if (node->frequency < NODE_FREQUENCY_NORMAL
9223 || (flag_branch_probabilities
9224 && node->frequency < NODE_FREQUENCY_HOT))
9225 cfun->machine->use_fast_prologue_epilogue = false;
9226 else
9227 cfun->machine->use_fast_prologue_epilogue
9228 = !expensive_function_p (count);
9229 }
9230
9231 frame->save_regs_using_mov
9232 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9233 /* If static stack checking is enabled and done with probes,
9234 the registers need to be saved before allocating the frame. */
9235 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9236
9237 /* Skip return address. */
9238 offset = UNITS_PER_WORD;
9239
9240 /* Skip pushed static chain. */
9241 if (ix86_static_chain_on_stack)
9242 offset += UNITS_PER_WORD;
9243
9244 /* Skip saved base pointer. */
9245 if (frame_pointer_needed)
9246 offset += UNITS_PER_WORD;
9247 frame->hfp_save_offset = offset;
9248
9249 /* The traditional frame pointer location is at the top of the frame. */
9250 frame->hard_frame_pointer_offset = offset;
9251
9252 /* Register save area */
9253 offset += frame->nregs * UNITS_PER_WORD;
9254 frame->reg_save_offset = offset;
9255
9256 /* On SEH target, registers are pushed just before the frame pointer
9257 location. */
9258 if (TARGET_SEH)
9259 frame->hard_frame_pointer_offset = offset;
9260
9261 /* Align and set SSE register save area. */
9262 if (frame->nsseregs)
9263 {
9264 /* The only ABI that has saved SSE registers (Win64) also has a
9265 16-byte aligned default stack, and thus we don't need to be
9266 within the re-aligned local stack frame to save them. */
9267 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
9268 offset = (offset + 16 - 1) & -16;
9269 offset += frame->nsseregs * 16;
9270 }
9271 frame->sse_reg_save_offset = offset;
9272
9273 /* The re-aligned stack starts here. Values before this point are not
9274 directly comparable with values below this point. In order to make
9275 sure that no value happens to be the same before and after, force
9276 the alignment computation below to add a non-zero value. */
9277 if (stack_realign_fp)
9278 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9279
9280 /* Va-arg area */
9281 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9282 offset += frame->va_arg_size;
9283
9284 /* Align start of frame for local function. */
9285 if (stack_realign_fp
9286 || offset != frame->sse_reg_save_offset
9287 || size != 0
9288 || !crtl->is_leaf
9289 || cfun->calls_alloca
9290 || ix86_current_function_calls_tls_descriptor)
9291 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9292
9293 /* Frame pointer points here. */
9294 frame->frame_pointer_offset = offset;
9295
9296 offset += size;
9297
9298 /* Add outgoing arguments area. Can be skipped if we eliminated
9299 all the function calls as dead code.
9300 Skipping is however impossible when function calls alloca. Alloca
9301 expander assumes that last crtl->outgoing_args_size
9302 of stack frame are unused. */
9303 if (ACCUMULATE_OUTGOING_ARGS
9304 && (!crtl->is_leaf || cfun->calls_alloca
9305 || ix86_current_function_calls_tls_descriptor))
9306 {
9307 offset += crtl->outgoing_args_size;
9308 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9309 }
9310 else
9311 frame->outgoing_arguments_size = 0;
9312
9313 /* Align stack boundary. Only needed if we're calling another function
9314 or using alloca. */
9315 if (!crtl->is_leaf || cfun->calls_alloca
9316 || ix86_current_function_calls_tls_descriptor)
9317 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9318
9319 /* We've reached end of stack frame. */
9320 frame->stack_pointer_offset = offset;
9321
9322 /* Size prologue needs to allocate. */
9323 to_allocate = offset - frame->sse_reg_save_offset;
9324
9325 if ((!to_allocate && frame->nregs <= 1)
9326 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9327 frame->save_regs_using_mov = false;
9328
9329 if (ix86_using_red_zone ()
9330 && crtl->sp_is_unchanging
9331 && crtl->is_leaf
9332 && !ix86_current_function_calls_tls_descriptor)
9333 {
9334 frame->red_zone_size = to_allocate;
9335 if (frame->save_regs_using_mov)
9336 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9337 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9338 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9339 }
9340 else
9341 frame->red_zone_size = 0;
9342 frame->stack_pointer_offset -= frame->red_zone_size;
9343
9344 /* The SEH frame pointer location is near the bottom of the frame.
9345 This is enforced by the fact that the difference between the
9346 stack pointer and the frame pointer is limited to 240 bytes in
9347 the unwind data structure. */
9348 if (TARGET_SEH)
9349 {
9350 HOST_WIDE_INT diff;
9351
9352 /* If we can leave the frame pointer where it is, do so. This also returns
9353 the establisher frame for __builtin_frame_address (0). */
9354 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9355 if (diff <= SEH_MAX_FRAME_SIZE
9356 && (diff > 240 || (diff & 15) != 0)
9357 && !crtl->accesses_prior_frames)
9358 {
9359 /* Ideally we'd determine what portion of the local stack frame
9360 (within the constraint of the lowest 240) is most heavily used.
9361 But without that complication, simply bias the frame pointer
9362 by 128 bytes so as to maximize the amount of the local stack
9363 frame that is addressable with 8-bit offsets. */
9364 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9365 }
9366 }
9367 }
9368
9369 /* This is semi-inlined memory_address_length, but simplified
9370 since we know that we're always dealing with reg+offset, and
9371 to avoid having to create and discard all that rtl. */
9372
9373 static inline int
9374 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9375 {
9376 int len = 4;
9377
9378 if (offset == 0)
9379 {
9380 /* EBP and R13 cannot be encoded without an offset. */
9381 len = (regno == BP_REG || regno == R13_REG);
9382 }
9383 else if (IN_RANGE (offset, -128, 127))
9384 len = 1;
9385
9386 /* ESP and R12 must be encoded with a SIB byte. */
9387 if (regno == SP_REG || regno == R12_REG)
9388 len++;
9389
9390 return len;
9391 }
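/* Worked examples of the length computed above: (%eax) with a zero
   offset needs 0 extra bytes, (%ebp) and (%r13) need a disp8 even for
   offset 0, any offset in [-128, 127] costs 1 byte, larger offsets cost
   4, and %esp/%r12 always add one more byte for the SIB: e.g.
   16(%esp) -> 2, -200(%ebp) -> 4.  */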
9392
9393 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9394 The valid base registers are taken from CFUN->MACHINE->FS. */
9395
9396 static rtx
9397 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9398 {
9399 const struct machine_function *m = cfun->machine;
9400 rtx base_reg = NULL;
9401 HOST_WIDE_INT base_offset = 0;
9402
9403 if (m->use_fast_prologue_epilogue)
9404 {
9405 /* Choose the base register most likely to allow the most scheduling
9406 opportunities. Generally FP is valid throughout the function,
9407 while DRAP must be reloaded within the epilogue. But choose either
9408 over the SP due to increased encoding size. */
9409
9410 if (m->fs.fp_valid)
9411 {
9412 base_reg = hard_frame_pointer_rtx;
9413 base_offset = m->fs.fp_offset - cfa_offset;
9414 }
9415 else if (m->fs.drap_valid)
9416 {
9417 base_reg = crtl->drap_reg;
9418 base_offset = 0 - cfa_offset;
9419 }
9420 else if (m->fs.sp_valid)
9421 {
9422 base_reg = stack_pointer_rtx;
9423 base_offset = m->fs.sp_offset - cfa_offset;
9424 }
9425 }
9426 else
9427 {
9428 HOST_WIDE_INT toffset;
9429 int len = 16, tlen;
9430
9431 /* Choose the base register with the smallest address encoding.
9432 With a tie, choose FP > DRAP > SP. */
9433 if (m->fs.sp_valid)
9434 {
9435 base_reg = stack_pointer_rtx;
9436 base_offset = m->fs.sp_offset - cfa_offset;
9437 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9438 }
9439 if (m->fs.drap_valid)
9440 {
9441 toffset = 0 - cfa_offset;
9442 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9443 if (tlen <= len)
9444 {
9445 base_reg = crtl->drap_reg;
9446 base_offset = toffset;
9447 len = tlen;
9448 }
9449 }
9450 if (m->fs.fp_valid)
9451 {
9452 toffset = m->fs.fp_offset - cfa_offset;
9453 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9454 if (tlen <= len)
9455 {
9456 base_reg = hard_frame_pointer_rtx;
9457 base_offset = toffset;
9458 len = tlen;
9459 }
9460 }
9461 }
9462 gcc_assert (base_reg != NULL);
9463
9464 return plus_constant (Pmode, base_reg, base_offset);
9465 }
9466
9467 /* Emit code to save registers in the prologue. */
9468
9469 static void
9470 ix86_emit_save_regs (void)
9471 {
9472 unsigned int regno;
9473 rtx insn;
9474
9475 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9476 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9477 {
9478 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9479 RTX_FRAME_RELATED_P (insn) = 1;
9480 }
9481 }
9482
9483 /* Emit a single register save at CFA - CFA_OFFSET. */
9484
9485 static void
9486 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9487 HOST_WIDE_INT cfa_offset)
9488 {
9489 struct machine_function *m = cfun->machine;
9490 rtx reg = gen_rtx_REG (mode, regno);
9491 rtx mem, addr, base, insn;
9492
9493 addr = choose_baseaddr (cfa_offset);
9494 mem = gen_frame_mem (mode, addr);
9495
9496 /* For SSE saves, we need to indicate the 128-bit alignment. */
9497 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9498
9499 insn = emit_move_insn (mem, reg);
9500 RTX_FRAME_RELATED_P (insn) = 1;
9501
9502 base = addr;
9503 if (GET_CODE (base) == PLUS)
9504 base = XEXP (base, 0);
9505 gcc_checking_assert (REG_P (base));
9506
9507 /* When saving registers into a re-aligned local stack frame, avoid
9508 any tricky guessing by dwarf2out. */
9509 if (m->fs.realigned)
9510 {
9511 gcc_checking_assert (stack_realign_drap);
9512
9513 if (regno == REGNO (crtl->drap_reg))
9514 {
9515 /* A bit of a hack. We force the DRAP register to be saved in
9516 the re-aligned stack frame, which provides us with a copy
9517 of the CFA that will last past the prologue. Install it. */
9518 gcc_checking_assert (cfun->machine->fs.fp_valid);
9519 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9520 cfun->machine->fs.fp_offset - cfa_offset);
9521 mem = gen_rtx_MEM (mode, addr);
9522 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9523 }
9524 else
9525 {
9526 /* The frame pointer is a stable reference within the
9527 aligned frame. Use it. */
9528 gcc_checking_assert (cfun->machine->fs.fp_valid);
9529 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9530 cfun->machine->fs.fp_offset - cfa_offset);
9531 mem = gen_rtx_MEM (mode, addr);
9532 add_reg_note (insn, REG_CFA_EXPRESSION,
9533 gen_rtx_SET (VOIDmode, mem, reg));
9534 }
9535 }
9536
9537 /* The memory may not be relative to the current CFA register,
9538 which means that we may need to generate a new pattern for
9539 use by the unwind info. */
9540 else if (base != m->fs.cfa_reg)
9541 {
9542 addr = plus_constant (Pmode, m->fs.cfa_reg,
9543 m->fs.cfa_offset - cfa_offset);
9544 mem = gen_rtx_MEM (mode, addr);
9545 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9546 }
9547 }
9548
9549 /* Emit code to save registers using MOV insns.
9550 First register is stored at CFA - CFA_OFFSET. */
9551 static void
9552 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9553 {
9554 unsigned int regno;
9555
9556 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9557 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9558 {
9559 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9560 cfa_offset -= UNITS_PER_WORD;
9561 }
9562 }
9563
9564 /* Emit code to save SSE registers using MOV insns.
9565 First register is stored at CFA - CFA_OFFSET. */
9566 static void
9567 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9568 {
9569 unsigned int regno;
9570
9571 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9572 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9573 {
9574 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9575 cfa_offset -= 16;
9576 }
9577 }
9578
9579 static GTY(()) rtx queued_cfa_restores;
9580
9581 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next stack
9582 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9583 Don't add the note if the previously saved value will be left untouched
9584 within stack red-zone till return, as unwinders can find the same value
9585 in the register and on the stack. */
9586
9587 static void
9588 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9589 {
9590 if (!crtl->shrink_wrapped
9591 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9592 return;
9593
9594 if (insn)
9595 {
9596 add_reg_note (insn, REG_CFA_RESTORE, reg);
9597 RTX_FRAME_RELATED_P (insn) = 1;
9598 }
9599 else
9600 queued_cfa_restores
9601 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9602 }
9603
9604 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9605
9606 static void
9607 ix86_add_queued_cfa_restore_notes (rtx insn)
9608 {
9609 rtx last;
9610 if (!queued_cfa_restores)
9611 return;
9612 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9613 ;
9614 XEXP (last, 1) = REG_NOTES (insn);
9615 REG_NOTES (insn) = queued_cfa_restores;
9616 queued_cfa_restores = NULL_RTX;
9617 RTX_FRAME_RELATED_P (insn) = 1;
9618 }
9619
9620 /* Expand prologue or epilogue stack adjustment.
9621 The pattern exists to put a dependency on all ebp-based memory accesses.
9622 STYLE should be negative if instructions should be marked as frame related,
9623 zero if %r11 register is live and cannot be freely used and positive
9624 otherwise. */
9625
9626 static void
9627 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9628 int style, bool set_cfa)
9629 {
9630 struct machine_function *m = cfun->machine;
9631 rtx insn;
9632 bool add_frame_related_expr = false;
9633
9634 if (Pmode == SImode)
9635 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9636 else if (x86_64_immediate_operand (offset, DImode))
9637 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9638 else
9639 {
9640 rtx tmp;
9641 /* r11 is used by indirect sibcall return as well, set before the
9642 epilogue and used after the epilogue. */
9643 if (style)
9644 tmp = gen_rtx_REG (DImode, R11_REG);
9645 else
9646 {
9647 gcc_assert (src != hard_frame_pointer_rtx
9648 && dest != hard_frame_pointer_rtx);
9649 tmp = hard_frame_pointer_rtx;
9650 }
9651 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9652 if (style < 0)
9653 add_frame_related_expr = true;
9654
9655 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9656 }
9657
9658 insn = emit_insn (insn);
9659 if (style >= 0)
9660 ix86_add_queued_cfa_restore_notes (insn);
9661
9662 if (set_cfa)
9663 {
9664 rtx r;
9665
9666 gcc_assert (m->fs.cfa_reg == src);
9667 m->fs.cfa_offset += INTVAL (offset);
9668 m->fs.cfa_reg = dest;
9669
9670 r = gen_rtx_PLUS (Pmode, src, offset);
9671 r = gen_rtx_SET (VOIDmode, dest, r);
9672 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9673 RTX_FRAME_RELATED_P (insn) = 1;
9674 }
9675 else if (style < 0)
9676 {
9677 RTX_FRAME_RELATED_P (insn) = 1;
9678 if (add_frame_related_expr)
9679 {
9680 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9681 r = gen_rtx_SET (VOIDmode, dest, r);
9682 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9683 }
9684 }
9685
9686 if (dest == stack_pointer_rtx)
9687 {
9688 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9689 bool valid = m->fs.sp_valid;
9690
9691 if (src == hard_frame_pointer_rtx)
9692 {
9693 valid = m->fs.fp_valid;
9694 ooffset = m->fs.fp_offset;
9695 }
9696 else if (src == crtl->drap_reg)
9697 {
9698 valid = m->fs.drap_valid;
9699 ooffset = 0;
9700 }
9701 else
9702 {
9703 /* Else there are two possibilities: SP itself, which we set
9704 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9705 taken care of by hand along the eh_return path. */
9706 gcc_checking_assert (src == stack_pointer_rtx
9707 || offset == const0_rtx);
9708 }
9709
9710 m->fs.sp_offset = ooffset - INTVAL (offset);
9711 m->fs.sp_valid = valid;
9712 }
9713 }
9714
9715 /* Find an available register to be used as the dynamic realign argument
9716 pointer register. Such a register will be written in the prologue and
9717 used at the beginning of the body, so it must not be
9718 1. parameter passing register.
9719 2. GOT pointer.
9720 We reuse the static-chain register if it is available. Otherwise, we
9721 use DI for i386 and R10 for x86-64, or R13 when a static chain or tail
9722 call means a caller-saved register cannot be used.
9723
9724 Return: the regno of chosen register. */
9725
9726 static unsigned int
9727 find_drap_reg (void)
9728 {
9729 tree decl = cfun->decl;
9730
9731 if (TARGET_64BIT)
9732 {
9733 /* Use R13 for a nested function or a function that needs a static chain.
9734 Since a function with a tail call may use any caller-saved
9735 register in the epilogue, DRAP must not use a caller-saved
9736 register in such a case. */
9737 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9738 return R13_REG;
9739
9740 return R10_REG;
9741 }
9742 else
9743 {
9744 /* Use DI for a nested function or a function that needs a static chain.
9745 Since a function with a tail call may use any caller-saved
9746 register in the epilogue, DRAP must not use a caller-saved
9747 register in such a case. */
9748 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9749 return DI_REG;
9750
9751 /* Reuse static chain register if it isn't used for parameter
9752 passing. */
9753 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9754 {
9755 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9756 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9757 return CX_REG;
9758 }
9759 return DI_REG;
9760 }
9761 }
9762
9763 /* Return minimum incoming stack alignment. */
9764
9765 static unsigned int
9766 ix86_minimum_incoming_stack_boundary (bool sibcall)
9767 {
9768 unsigned int incoming_stack_boundary;
9769
9770 /* Prefer the one specified at command line. */
9771 if (ix86_user_incoming_stack_boundary)
9772 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9773 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9774 when -mstackrealign is used, this isn't a sibcall check, and the
9775 estimated stack alignment is 128 bits. */
9776 else if (!sibcall
9777 && !TARGET_64BIT
9778 && ix86_force_align_arg_pointer
9779 && crtl->stack_alignment_estimated == 128)
9780 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9781 else
9782 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9783
9784 /* Incoming stack alignment can be changed on individual functions
9785 via force_align_arg_pointer attribute. We use the smallest
9786 incoming stack boundary. */
9787 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9788 && lookup_attribute (ix86_force_align_arg_pointer_string,
9789 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9790 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9791
9792 /* The incoming stack frame has to be aligned at least at
9793 parm_stack_boundary. */
9794 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9795 incoming_stack_boundary = crtl->parm_stack_boundary;
9796
9797 /* Stack at entrance of main is aligned by runtime. We use the
9798 smallest incoming stack boundary. */
9799 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9800 && DECL_NAME (current_function_decl)
9801 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9802 && DECL_FILE_SCOPE_P (current_function_decl))
9803 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9804
9805 return incoming_stack_boundary;
9806 }
9807
9808 /* Update incoming stack boundary and estimated stack alignment. */
9809
9810 static void
9811 ix86_update_stack_boundary (void)
9812 {
9813 ix86_incoming_stack_boundary
9814 = ix86_minimum_incoming_stack_boundary (false);
9815
9816 /* x86_64 varargs need 16-byte stack alignment for the register save
9817 area. */
9818 if (TARGET_64BIT
9819 && cfun->stdarg
9820 && crtl->stack_alignment_estimated < 128)
9821 crtl->stack_alignment_estimated = 128;
9822 }
9823
9824 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9825 needed or an rtx for DRAP otherwise. */
9826
9827 static rtx
9828 ix86_get_drap_rtx (void)
9829 {
9830 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9831 crtl->need_drap = true;
9832
9833 if (stack_realign_drap)
9834 {
9835 /* Assign DRAP to vDRAP and return vDRAP. */
9836 unsigned int regno = find_drap_reg ();
9837 rtx drap_vreg;
9838 rtx arg_ptr;
9839 rtx seq, insn;
9840
9841 arg_ptr = gen_rtx_REG (Pmode, regno);
9842 crtl->drap_reg = arg_ptr;
9843
9844 start_sequence ();
9845 drap_vreg = copy_to_reg (arg_ptr);
9846 seq = get_insns ();
9847 end_sequence ();
9848
9849 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9850 if (!optimize)
9851 {
9852 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9853 RTX_FRAME_RELATED_P (insn) = 1;
9854 }
9855 return drap_vreg;
9856 }
9857 else
9858 return NULL;
9859 }
9860
9861 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9862
9863 static rtx
9864 ix86_internal_arg_pointer (void)
9865 {
9866 return virtual_incoming_args_rtx;
9867 }
9868
9869 struct scratch_reg {
9870 rtx reg;
9871 bool saved;
9872 };
9873
9874 /* Return a short-lived scratch register for use on function entry.
9875 In 32-bit mode, it is valid only after the registers are saved
9876 in the prologue. This register must be released by means of
9877 release_scratch_register_on_entry once it is dead. */
9878
9879 static void
9880 get_scratch_register_on_entry (struct scratch_reg *sr)
9881 {
9882 int regno;
9883
9884 sr->saved = false;
9885
9886 if (TARGET_64BIT)
9887 {
9888 /* We always use R11 in 64-bit mode. */
9889 regno = R11_REG;
9890 }
9891 else
9892 {
9893 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9894 bool fastcall_p
9895 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9896 bool thiscall_p
9897 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9898 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9899 int regparm = ix86_function_regparm (fntype, decl);
9900 int drap_regno
9901 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9902
9903 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9904 for the static chain register. */
9905 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9906 && drap_regno != AX_REG)
9907 regno = AX_REG;
9908 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9909 for the static chain register. */
9910 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9911 regno = AX_REG;
9912 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9913 regno = DX_REG;
9914 /* ecx is the static chain register. */
9915 else if (regparm < 3 && !fastcall_p && !thiscall_p
9916 && !static_chain_p
9917 && drap_regno != CX_REG)
9918 regno = CX_REG;
9919 else if (ix86_save_reg (BX_REG, true))
9920 regno = BX_REG;
9921 /* esi is the static chain register. */
9922 else if (!(regparm == 3 && static_chain_p)
9923 && ix86_save_reg (SI_REG, true))
9924 regno = SI_REG;
9925 else if (ix86_save_reg (DI_REG, true))
9926 regno = DI_REG;
9927 else
9928 {
9929 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9930 sr->saved = true;
9931 }
9932 }
9933
9934 sr->reg = gen_rtx_REG (Pmode, regno);
9935 if (sr->saved)
9936 {
9937 rtx insn = emit_insn (gen_push (sr->reg));
9938 RTX_FRAME_RELATED_P (insn) = 1;
9939 }
9940 }
9941
9942 /* Release a scratch register obtained from the preceding function. */
9943
9944 static void
9945 release_scratch_register_on_entry (struct scratch_reg *sr)
9946 {
9947 if (sr->saved)
9948 {
9949 struct machine_function *m = cfun->machine;
9950 rtx x, insn = emit_insn (gen_pop (sr->reg));
9951
9952 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9953 RTX_FRAME_RELATED_P (insn) = 1;
9954 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9955 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9956 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9957 m->fs.sp_offset -= UNITS_PER_WORD;
9958 }
9959 }
9960
9961 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9962
9963 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9964
9965 static void
9966 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9967 {
9968 /* We skip the probe for the first interval + a small dope of 4 words and
9969 probe that many bytes past the specified size to maintain a protection
9970 area at the bottom of the stack. */
9971 const int dope = 4 * UNITS_PER_WORD;
9972 rtx size_rtx = GEN_INT (size), last;
9973
9974 /* See if we have a constant small number of probes to generate. If so,
9975 that's the easy case. The run-time loop is made up of 11 insns in the
9976 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9977 for n # of intervals. */
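/* A worked example (assuming the default 4096-byte PROBE_INTERVAL and a
   64-bit target, so dope == 32): for SIZE == 10000 the small-size path
   below adjusts the stack roughly as sub 8224, probe; sub 4096, probe;
   sub 1808, probe; then add 4128 to drop the extra first interval --
   a net adjustment of -10000 with a probe after every step.  */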
9978 if (size <= 5 * PROBE_INTERVAL)
9979 {
9980 HOST_WIDE_INT i, adjust;
9981 bool first_probe = true;
9982
9983 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9984 values of N from 1 until it exceeds SIZE. If only one probe is
9985 needed, this will not generate any code. Then adjust and probe
9986 to PROBE_INTERVAL + SIZE. */
9987 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9988 {
9989 if (first_probe)
9990 {
9991 adjust = 2 * PROBE_INTERVAL + dope;
9992 first_probe = false;
9993 }
9994 else
9995 adjust = PROBE_INTERVAL;
9996
9997 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9998 plus_constant (Pmode, stack_pointer_rtx,
9999 -adjust)));
10000 emit_stack_probe (stack_pointer_rtx);
10001 }
10002
10003 if (first_probe)
10004 adjust = size + PROBE_INTERVAL + dope;
10005 else
10006 adjust = size + PROBE_INTERVAL - i;
10007
10008 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10009 plus_constant (Pmode, stack_pointer_rtx,
10010 -adjust)));
10011 emit_stack_probe (stack_pointer_rtx);
10012
10013 /* Adjust back to account for the additional first interval. */
10014 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10015 plus_constant (Pmode, stack_pointer_rtx,
10016 PROBE_INTERVAL + dope)));
10017 }
10018
10019 /* Otherwise, do the same as above, but in a loop. Note that we must be
10020 extra careful with variables wrapping around because we might be at
10021 the very top (or the very bottom) of the address space and we have
10022 to be able to handle this case properly; in particular, we use an
10023 equality test for the loop condition. */
10024 else
10025 {
10026 HOST_WIDE_INT rounded_size;
10027 struct scratch_reg sr;
10028
10029 get_scratch_register_on_entry (&sr);
10030
10031
10032 /* Step 1: round SIZE to the previous multiple of the interval. */
10033
10034 rounded_size = size & -PROBE_INTERVAL;
10035
10036
10037 /* Step 2: compute initial and final value of the loop counter. */
10038
10039 /* SP = SP_0 + PROBE_INTERVAL. */
10040 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10041 plus_constant (Pmode, stack_pointer_rtx,
10042 - (PROBE_INTERVAL + dope))));
10043
10044 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
10045 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
10046 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
10047 gen_rtx_PLUS (Pmode, sr.reg,
10048 stack_pointer_rtx)));
10049
10050
10051 /* Step 3: the loop
10052
10053 while (SP != LAST_ADDR)
10054 {
10055 SP = SP + PROBE_INTERVAL
10056 probe at SP
10057 }
10058
10059 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
10060 values of N from 1 until it is equal to ROUNDED_SIZE. */
10061
10062 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
10063
10064
10065 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
10066 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
10067
10068 if (size != rounded_size)
10069 {
10070 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10071 plus_constant (Pmode, stack_pointer_rtx,
10072 rounded_size - size)));
10073 emit_stack_probe (stack_pointer_rtx);
10074 }
10075
10076 /* Adjust back to account for the additional first interval. */
10077 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10078 plus_constant (Pmode, stack_pointer_rtx,
10079 PROBE_INTERVAL + dope)));
10080
10081 release_scratch_register_on_entry (&sr);
10082 }
10083
10084 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
10085
10086 /* Even if the stack pointer isn't the CFA register, we need to correctly
10087 describe the adjustments made to it, in particular differentiate the
10088 frame-related ones from the frame-unrelated ones. */
10089 if (size > 0)
10090 {
10091 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
10092 XVECEXP (expr, 0, 0)
10093 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10094 plus_constant (Pmode, stack_pointer_rtx, -size));
10095 XVECEXP (expr, 0, 1)
10096 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10097 plus_constant (Pmode, stack_pointer_rtx,
10098 PROBE_INTERVAL + dope + size));
10099 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
10100 RTX_FRAME_RELATED_P (last) = 1;
10101
10102 cfun->machine->fs.sp_offset += size;
10103 }
10104
10105 /* Make sure nothing is scheduled before we are done. */
10106 emit_insn (gen_blockage ());
10107 }
10108
10109 /* Adjust the stack pointer up to REG while probing it. */
10110
10111 const char *
10112 output_adjust_stack_and_probe (rtx reg)
10113 {
10114 static int labelno = 0;
10115 char loop_lab[32], end_lab[32];
10116 rtx xops[2];
10117
10118 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10119 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10120
10121 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10122
10123 /* Jump to END_LAB if SP == LAST_ADDR. */
10124 xops[0] = stack_pointer_rtx;
10125 xops[1] = reg;
10126 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10127 fputs ("\tje\t", asm_out_file);
10128 assemble_name_raw (asm_out_file, end_lab);
10129 fputc ('\n', asm_out_file);
10130
10131 /* SP = SP + PROBE_INTERVAL. */
10132 xops[1] = GEN_INT (PROBE_INTERVAL);
10133 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10134
10135 /* Probe at SP. */
10136 xops[1] = const0_rtx;
10137 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
10138
10139 fprintf (asm_out_file, "\tjmp\t");
10140 assemble_name_raw (asm_out_file, loop_lab);
10141 fputc ('\n', asm_out_file);
10142
10143 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10144
10145 return "";
10146 }
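/* As an illustration, with the default 4096-byte PROBE_INTERVAL, a
   32-bit scratch register of %eax and AT&T syntax, the loop above is
   expected to read roughly:

	.LPSRL0:
		cmpl	%eax, %esp
		je	.LPSRE0
		subl	$4096, %esp
		orl	$0, (%esp)
		jmp	.LPSRL0
	.LPSRE0:

   (the internal label spellings depend on ASM_GENERATE_INTERNAL_LABEL).  */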
10147
10148 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
10149 inclusive. These are offsets from the current stack pointer. */
10150
10151 static void
10152 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
10153 {
10154 /* See if we have a constant small number of probes to generate. If so,
10155 that's the easy case. The run-time loop is made up of 7 insns in the
10156 generic case while the compile-time loop is made up of n insns for n #
10157 of intervals. */
10158 if (size <= 7 * PROBE_INTERVAL)
10159 {
10160 HOST_WIDE_INT i;
10161
10162 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
10163 it exceeds SIZE. If only one probe is needed, this will not
10164 generate any code. Then probe at FIRST + SIZE. */
10165 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
10166 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10167 -(first + i)));
10168
10169 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
10170 -(first + size)));
10171 }
10172
10173 /* Otherwise, do the same as above, but in a loop. Note that we must be
10174 extra careful with variables wrapping around because we might be at
10175 the very top (or the very bottom) of the address space and we have
10176 to be able to handle this case properly; in particular, we use an
10177 equality test for the loop condition. */
10178 else
10179 {
10180 HOST_WIDE_INT rounded_size, last;
10181 struct scratch_reg sr;
10182
10183 get_scratch_register_on_entry (&sr);
10184
10185
10186 /* Step 1: round SIZE to the previous multiple of the interval. */
10187
10188 rounded_size = size & -PROBE_INTERVAL;
10189
10190
10191 /* Step 2: compute initial and final value of the loop counter. */
10192
10193 /* TEST_OFFSET = FIRST. */
10194 emit_move_insn (sr.reg, GEN_INT (-first));
10195
10196 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
10197 last = first + rounded_size;
10198
10199
10200 /* Step 3: the loop
10201
10202 while (TEST_ADDR != LAST_ADDR)
10203 {
10204 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
10205 probe at TEST_ADDR
10206 }
10207
10208 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
10209 until it is equal to ROUNDED_SIZE. */
10210
10211 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
10212
10213
10214 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
10215 that SIZE is equal to ROUNDED_SIZE. */
10216
10217 if (size != rounded_size)
10218 emit_stack_probe (plus_constant (Pmode,
10219 gen_rtx_PLUS (Pmode,
10220 stack_pointer_rtx,
10221 sr.reg),
10222 rounded_size - size));
10223
10224 release_scratch_register_on_entry (&sr);
10225 }
10226
10227 /* Make sure nothing is scheduled before we are done. */
10228 emit_insn (gen_blockage ());
10229 }
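/* Worked example, assuming a PROBE_INTERVAL of 4096: with FIRST == 16384 and
   SIZE == 10000, rounded_size = 10000 & -4096 = 8192 and last = 24576.  The
   loop above probes at sp - 20480 and sp - 24576, and because SIZE is not
   equal to rounded_size a final probe is emitted at
   sp - 24576 + (8192 - 10000) = sp - 26384, i.e. exactly -(FIRST + SIZE).  */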
10230
10231 /* Probe a range of stack addresses from REG to END, inclusive. These are
10232 offsets from the current stack pointer. */
10233
10234 const char *
10235 output_probe_stack_range (rtx reg, rtx end)
10236 {
10237 static int labelno = 0;
10238 char loop_lab[32], end_lab[32];
10239 rtx xops[3];
10240
10241 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10242 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10243
10244 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10245
10246 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10247 xops[0] = reg;
10248 xops[1] = end;
10249 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10250 fputs ("\tje\t", asm_out_file);
10251 assemble_name_raw (asm_out_file, end_lab);
10252 fputc ('\n', asm_out_file);
10253
10254 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10255 xops[1] = GEN_INT (PROBE_INTERVAL);
10256 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10257
10258 /* Probe at TEST_ADDR. */
10259 xops[0] = stack_pointer_rtx;
10260 xops[1] = reg;
10261 xops[2] = const0_rtx;
10262 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10263
10264 fprintf (asm_out_file, "\tjmp\t");
10265 assemble_name_raw (asm_out_file, loop_lab);
10266 fputc ('\n', asm_out_file);
10267
10268 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10269
10270 return "";
10271 }
10272
10273 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10274 to be generated in correct form. */
10275 static void
10276 ix86_finalize_stack_realign_flags (void)
10277 {
10278   /* Check if stack realignment is really needed after reload, and
10279      store the result in cfun.  */
10280 unsigned int incoming_stack_boundary
10281 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10282 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10283 unsigned int stack_realign = (incoming_stack_boundary
10284 < (crtl->is_leaf
10285 ? crtl->max_used_stack_slot_alignment
10286 : crtl->stack_alignment_needed));
10287
10288 if (crtl->stack_realign_finalized)
10289 {
10290       /* After stack_realign_needed is finalized, we can no longer
10291 change it. */
10292 gcc_assert (crtl->stack_realign_needed == stack_realign);
10293 return;
10294 }
10295
10296 /* If the only reason for frame_pointer_needed is that we conservatively
10297 assumed stack realignment might be needed, but in the end nothing that
10298 needed the stack alignment had been spilled, clear frame_pointer_needed
10299 and say we don't need stack realignment. */
10300 if (stack_realign
10301 && !crtl->need_drap
10302 && frame_pointer_needed
10303 && crtl->is_leaf
10304 && flag_omit_frame_pointer
10305 && crtl->sp_is_unchanging
10306 && !ix86_current_function_calls_tls_descriptor
10307 && !crtl->accesses_prior_frames
10308 && !cfun->calls_alloca
10309 && !crtl->calls_eh_return
10310 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10311 && !ix86_frame_pointer_required ()
10312 && get_frame_size () == 0
10313 && ix86_nsaved_sseregs () == 0
10314 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10315 {
10316 HARD_REG_SET set_up_by_prologue, prologue_used;
10317 basic_block bb;
10318
10319 CLEAR_HARD_REG_SET (prologue_used);
10320 CLEAR_HARD_REG_SET (set_up_by_prologue);
10321 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10322 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10323 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10324 HARD_FRAME_POINTER_REGNUM);
10325 FOR_EACH_BB (bb)
10326 {
10327 rtx insn;
10328 FOR_BB_INSNS (bb, insn)
10329 if (NONDEBUG_INSN_P (insn)
10330 && requires_stack_frame_p (insn, prologue_used,
10331 set_up_by_prologue))
10332 {
10333 crtl->stack_realign_needed = stack_realign;
10334 crtl->stack_realign_finalized = true;
10335 return;
10336 }
10337 }
10338
10339 frame_pointer_needed = false;
10340 stack_realign = false;
10341 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10342 crtl->stack_alignment_needed = incoming_stack_boundary;
10343 crtl->stack_alignment_estimated = incoming_stack_boundary;
10344 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10345 crtl->preferred_stack_boundary = incoming_stack_boundary;
10346 df_finish_pass (true);
10347 df_scan_alloc (NULL);
10348 df_scan_blocks ();
10349 df_compute_regs_ever_live (true);
10350 df_analyze ();
10351 }
10352
10353 crtl->stack_realign_needed = stack_realign;
10354 crtl->stack_realign_finalized = true;
10355 }
10356
10357 /* Expand the prologue into a bunch of separate insns. */
10358
10359 void
10360 ix86_expand_prologue (void)
10361 {
10362 struct machine_function *m = cfun->machine;
10363 rtx insn, t;
10364 bool pic_reg_used;
10365 struct ix86_frame frame;
10366 HOST_WIDE_INT allocate;
10367 bool int_registers_saved;
10368 bool sse_registers_saved;
10369
10370 ix86_finalize_stack_realign_flags ();
10371
10372 /* DRAP should not coexist with stack_realign_fp */
10373 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10374
10375 memset (&m->fs, 0, sizeof (m->fs));
10376
10377 /* Initialize CFA state for before the prologue. */
10378 m->fs.cfa_reg = stack_pointer_rtx;
10379 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10380
10381 /* Track SP offset to the CFA. We continue tracking this after we've
10382 swapped the CFA register away from SP. In the case of re-alignment
10383      this is fudged; we're interested in offsets within the local frame.  */
10384 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10385 m->fs.sp_valid = true;
10386
10387 ix86_compute_frame_layout (&frame);
10388
10389 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10390 {
10391 /* We should have already generated an error for any use of
10392 ms_hook on a nested function. */
10393 gcc_checking_assert (!ix86_static_chain_on_stack);
10394
10395       /* Check if profiling is active and we shall use the profiling before
10396 	 prologue variant.  If so, sorry.  */
10397 if (crtl->profile && flag_fentry != 0)
10398 sorry ("ms_hook_prologue attribute isn%'t compatible "
10399 "with -mfentry for 32-bit");
10400
10401 /* In ix86_asm_output_function_label we emitted:
10402 8b ff movl.s %edi,%edi
10403 55 push %ebp
10404 8b ec movl.s %esp,%ebp
10405
10406 This matches the hookable function prologue in Win32 API
10407 functions in Microsoft Windows XP Service Pack 2 and newer.
10408 Wine uses this to enable Windows apps to hook the Win32 API
10409 functions provided by Wine.
10410
10411 What that means is that we've already set up the frame pointer. */
10412
10413 if (frame_pointer_needed
10414 && !(crtl->drap_reg && crtl->stack_realign_needed))
10415 {
10416 rtx push, mov;
10417
10418 /* We've decided to use the frame pointer already set up.
10419 Describe this to the unwinder by pretending that both
10420 push and mov insns happen right here.
10421
10422 Putting the unwind info here at the end of the ms_hook
10423 is done so that we can make absolutely certain we get
10424 the required byte sequence at the start of the function,
10425 rather than relying on an assembler that can produce
10426 the exact encoding required.
10427
10428 	     However, it does mean (in the unpatched case) that we have
10429 	     a 1 insn window where the asynchronous unwind info is
10430 	     incorrect.  On the other hand, if we placed the unwind info at
10431 	     its correct location we would have incorrect unwind info
10432 	     in the patched case.  This is probably all moot since
10433 	     I don't expect Wine to generate dwarf2 unwind info for the
10434 	     system libraries that use this feature.  */
10435
10436 insn = emit_insn (gen_blockage ());
10437
10438 push = gen_push (hard_frame_pointer_rtx);
10439 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10440 stack_pointer_rtx);
10441 RTX_FRAME_RELATED_P (push) = 1;
10442 RTX_FRAME_RELATED_P (mov) = 1;
10443
10444 RTX_FRAME_RELATED_P (insn) = 1;
10445 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10446 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10447
10448 /* Note that gen_push incremented m->fs.cfa_offset, even
10449 though we didn't emit the push insn here. */
10450 m->fs.cfa_reg = hard_frame_pointer_rtx;
10451 m->fs.fp_offset = m->fs.cfa_offset;
10452 m->fs.fp_valid = true;
10453 }
10454 else
10455 {
10456 /* The frame pointer is not needed so pop %ebp again.
10457 This leaves us with a pristine state. */
10458 emit_insn (gen_pop (hard_frame_pointer_rtx));
10459 }
10460 }
10461
10462 /* The first insn of a function that accepts its static chain on the
10463 stack is to push the register that would be filled in by a direct
10464 call. This insn will be skipped by the trampoline. */
10465 else if (ix86_static_chain_on_stack)
10466 {
10467 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10468 emit_insn (gen_blockage ());
10469
10470 /* We don't want to interpret this push insn as a register save,
10471 only as a stack adjustment. The real copy of the register as
10472 a save will be done later, if needed. */
10473 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10474 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10475 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10476 RTX_FRAME_RELATED_P (insn) = 1;
10477 }
10478
10479   /* Emit prologue code to adjust stack alignment and set up DRAP, in case
10480      DRAP is needed and stack realignment is really needed after reload.  */
10481 if (stack_realign_drap)
10482 {
10483 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10484
10485       /* Only need to push the parameter pointer reg if it is not call-clobbered.  */
10486 if (!call_used_regs[REGNO (crtl->drap_reg)])
10487 {
10488 /* Push arg pointer reg */
10489 insn = emit_insn (gen_push (crtl->drap_reg));
10490 RTX_FRAME_RELATED_P (insn) = 1;
10491 }
10492
10493 /* Grab the argument pointer. */
10494 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10495 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10496 RTX_FRAME_RELATED_P (insn) = 1;
10497 m->fs.cfa_reg = crtl->drap_reg;
10498 m->fs.cfa_offset = 0;
10499
10500 /* Align the stack. */
10501 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10502 stack_pointer_rtx,
10503 GEN_INT (-align_bytes)));
10504 RTX_FRAME_RELATED_P (insn) = 1;
10505
10506       /* Replicate the return address on the stack so that the return
10507 	 address can be reached via the (argp - 1) slot.  This is needed
10508 	 to implement the macro RETURN_ADDR_RTX and the intrinsic function
10509 	 expand_builtin_return_addr, etc.  */
10510 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10511 t = gen_frame_mem (word_mode, t);
10512 insn = emit_insn (gen_push (t));
10513 RTX_FRAME_RELATED_P (insn) = 1;
10514
10515 /* For the purposes of frame and register save area addressing,
10516 we've started over with a new frame. */
10517 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10518 m->fs.realigned = true;
10519 }
10520
10521 int_registers_saved = (frame.nregs == 0);
10522 sse_registers_saved = (frame.nsseregs == 0);
10523
10524 if (frame_pointer_needed && !m->fs.fp_valid)
10525 {
10526 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10527 slower on all targets. Also sdb doesn't like it. */
10528 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10529 RTX_FRAME_RELATED_P (insn) = 1;
10530
10531 /* Push registers now, before setting the frame pointer
10532 on SEH target. */
10533 if (!int_registers_saved
10534 && TARGET_SEH
10535 && !frame.save_regs_using_mov)
10536 {
10537 ix86_emit_save_regs ();
10538 int_registers_saved = true;
10539 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10540 }
10541
10542 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10543 {
10544 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10545 RTX_FRAME_RELATED_P (insn) = 1;
10546
10547 if (m->fs.cfa_reg == stack_pointer_rtx)
10548 m->fs.cfa_reg = hard_frame_pointer_rtx;
10549 m->fs.fp_offset = m->fs.sp_offset;
10550 m->fs.fp_valid = true;
10551 }
10552 }
10553
10554 if (!int_registers_saved)
10555 {
10556 /* If saving registers via PUSH, do so now. */
10557 if (!frame.save_regs_using_mov)
10558 {
10559 ix86_emit_save_regs ();
10560 int_registers_saved = true;
10561 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10562 }
10563
10564       /* When using the red zone we may start register saving before allocating
10565 	 the stack frame, saving one cycle of the prologue.  However, avoid
10566 doing this if we have to probe the stack; at least on x86_64 the
10567 stack probe can turn into a call that clobbers a red zone location. */
10568 else if (ix86_using_red_zone ()
10569 && (! TARGET_STACK_PROBE
10570 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10571 {
10572 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10573 int_registers_saved = true;
10574 }
10575 }
10576
10577 if (stack_realign_fp)
10578 {
10579 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10580 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10581
10582 /* The computation of the size of the re-aligned stack frame means
10583 that we must allocate the size of the register save area before
10584 performing the actual alignment. Otherwise we cannot guarantee
10585 that there's enough storage above the realignment point. */
10586 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10587 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10588 GEN_INT (m->fs.sp_offset
10589 - frame.sse_reg_save_offset),
10590 -1, false);
10591
10592 /* Align the stack. */
10593 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10594 stack_pointer_rtx,
10595 GEN_INT (-align_bytes)));
10596
10597 /* For the purposes of register save area addressing, the stack
10598 pointer is no longer valid. As for the value of sp_offset,
10599 see ix86_compute_frame_layout, which we need to match in order
10600 to pass verification of stack_pointer_offset at the end. */
10601 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10602 m->fs.sp_valid = false;
10603 }
10604
10605 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10606
10607 if (flag_stack_usage_info)
10608 {
10609 /* We start to count from ARG_POINTER. */
10610 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10611
10612 /* If it was realigned, take into account the fake frame. */
10613 if (stack_realign_drap)
10614 {
10615 if (ix86_static_chain_on_stack)
10616 stack_size += UNITS_PER_WORD;
10617
10618 if (!call_used_regs[REGNO (crtl->drap_reg)])
10619 stack_size += UNITS_PER_WORD;
10620
10621 /* This over-estimates by 1 minimal-stack-alignment-unit but
10622 mitigates that by counting in the new return address slot. */
10623 current_function_dynamic_stack_size
10624 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10625 }
10626
10627 current_function_static_stack_size = stack_size;
10628 }
10629
10630 /* On SEH target with very large frame size, allocate an area to save
10631 SSE registers (as the very large allocation won't be described). */
10632 if (TARGET_SEH
10633 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10634 && !sse_registers_saved)
10635 {
10636 HOST_WIDE_INT sse_size =
10637 frame.sse_reg_save_offset - frame.reg_save_offset;
10638
10639 gcc_assert (int_registers_saved);
10640
10641 /* No need to do stack checking as the area will be immediately
10642 written. */
10643 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10644 GEN_INT (-sse_size), -1,
10645 m->fs.cfa_reg == stack_pointer_rtx);
10646 allocate -= sse_size;
10647 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10648 sse_registers_saved = true;
10649 }
10650
10651 /* The stack has already been decremented by the instruction calling us
10652 so probe if the size is non-negative to preserve the protection area. */
10653 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10654 {
10655 /* We expect the registers to be saved when probes are used. */
10656 gcc_assert (int_registers_saved);
10657
10658 if (STACK_CHECK_MOVING_SP)
10659 {
10660 ix86_adjust_stack_and_probe (allocate);
10661 allocate = 0;
10662 }
10663 else
10664 {
10665 HOST_WIDE_INT size = allocate;
10666
10667 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10668 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10669
10670 if (TARGET_STACK_PROBE)
10671 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10672 else
10673 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10674 }
10675 }
10676
10677 if (allocate == 0)
10678 ;
10679 else if (!ix86_target_stack_probe ()
10680 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10681 {
10682 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10683 GEN_INT (-allocate), -1,
10684 m->fs.cfa_reg == stack_pointer_rtx);
10685 }
10686 else
10687 {
10688 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10689 rtx r10 = NULL;
10690 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10691 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10692 bool eax_live = false;
10693 bool r10_live = false;
10694
10695 if (TARGET_64BIT)
10696 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10697 if (!TARGET_64BIT_MS_ABI)
10698 eax_live = ix86_eax_live_at_start_p ();
10699
10700 /* Note that SEH directives need to continue tracking the stack
10701 pointer even after the frame pointer has been set up. */
10702 if (eax_live)
10703 {
10704 insn = emit_insn (gen_push (eax));
10705 allocate -= UNITS_PER_WORD;
10706 if (sp_is_cfa_reg || TARGET_SEH)
10707 {
10708 if (sp_is_cfa_reg)
10709 m->fs.cfa_offset += UNITS_PER_WORD;
10710 RTX_FRAME_RELATED_P (insn) = 1;
10711 }
10712 }
10713
10714 if (r10_live)
10715 {
10716 r10 = gen_rtx_REG (Pmode, R10_REG);
10717 insn = emit_insn (gen_push (r10));
10718 allocate -= UNITS_PER_WORD;
10719 if (sp_is_cfa_reg || TARGET_SEH)
10720 {
10721 if (sp_is_cfa_reg)
10722 m->fs.cfa_offset += UNITS_PER_WORD;
10723 RTX_FRAME_RELATED_P (insn) = 1;
10724 }
10725 }
10726
10727 emit_move_insn (eax, GEN_INT (allocate));
10728 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10729
10730 /* Use the fact that AX still contains ALLOCATE. */
10731 adjust_stack_insn = (Pmode == DImode
10732 ? gen_pro_epilogue_adjust_stack_di_sub
10733 : gen_pro_epilogue_adjust_stack_si_sub);
10734
10735 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10736 stack_pointer_rtx, eax));
10737
10738 if (sp_is_cfa_reg || TARGET_SEH)
10739 {
10740 if (sp_is_cfa_reg)
10741 m->fs.cfa_offset += allocate;
10742 RTX_FRAME_RELATED_P (insn) = 1;
10743 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10744 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10745 plus_constant (Pmode, stack_pointer_rtx,
10746 -allocate)));
10747 }
10748 m->fs.sp_offset += allocate;
10749
10750 if (r10_live && eax_live)
10751 {
10752 t = choose_baseaddr (m->fs.sp_offset - allocate);
10753 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10754 gen_frame_mem (word_mode, t));
10755 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10756 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10757 gen_frame_mem (word_mode, t));
10758 }
10759 else if (eax_live || r10_live)
10760 {
10761 t = choose_baseaddr (m->fs.sp_offset - allocate);
10762 emit_move_insn (gen_rtx_REG (word_mode,
10763 (eax_live ? AX_REG : R10_REG)),
10764 gen_frame_mem (word_mode, t));
10765 }
10766 }
10767 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10768
10769   /* If we haven't already set up the frame pointer, do so now.  */
10770 if (frame_pointer_needed && !m->fs.fp_valid)
10771 {
10772 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10773 GEN_INT (frame.stack_pointer_offset
10774 - frame.hard_frame_pointer_offset));
10775 insn = emit_insn (insn);
10776 RTX_FRAME_RELATED_P (insn) = 1;
10777 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10778
10779 if (m->fs.cfa_reg == stack_pointer_rtx)
10780 m->fs.cfa_reg = hard_frame_pointer_rtx;
10781 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10782 m->fs.fp_valid = true;
10783 }
10784
10785 if (!int_registers_saved)
10786 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10787 if (!sse_registers_saved)
10788 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10789
10790 pic_reg_used = false;
10791   /* We don't use the PIC register for the pe-coff target.  */
10792 if (pic_offset_table_rtx
10793 && !TARGET_PECOFF
10794 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10795 || crtl->profile))
10796 {
10797 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10798
10799 if (alt_pic_reg_used != INVALID_REGNUM)
10800 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10801
10802 pic_reg_used = true;
10803 }
10804
10805 if (pic_reg_used)
10806 {
10807 if (TARGET_64BIT)
10808 {
10809 if (ix86_cmodel == CM_LARGE_PIC)
10810 {
10811 rtx label, tmp_reg;
10812
10813 gcc_assert (Pmode == DImode);
10814 label = gen_label_rtx ();
10815 emit_label (label);
10816 LABEL_PRESERVE_P (label) = 1;
10817 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10818 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10819 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10820 label));
10821 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10822 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10823 pic_offset_table_rtx, tmp_reg));
10824 }
10825 else
10826 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10827 }
10828 else
10829 {
10830 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10831 RTX_FRAME_RELATED_P (insn) = 1;
10832 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10833 }
10834 }
10835
10836 /* In the pic_reg_used case, make sure that the got load isn't deleted
10837 when mcount needs it. Blockage to avoid call movement across mcount
10838 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10839 note. */
10840 if (crtl->profile && !flag_fentry && pic_reg_used)
10841 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10842
10843 if (crtl->drap_reg && !crtl->stack_realign_needed)
10844 {
10845       /* vDRAP is set up, but after reload it turns out stack realignment
10846          isn't necessary; here we emit the prologue to set up DRAP
10847          without the stack realignment adjustment.  */
10848 t = choose_baseaddr (0);
10849 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10850 }
10851
10852   /* Prevent instructions from being scheduled into the register save push
10853      sequence when access to the redzone area is done through the frame pointer.
10854      The offset between the frame pointer and the stack pointer is calculated
10855      relative to the value of the stack pointer at the end of the function
10856      prologue, and moving instructions that access the redzone area via the
10857      frame pointer inside the push sequence violates this assumption.  */
10858 if (frame_pointer_needed && frame.red_zone_size)
10859 emit_insn (gen_memory_blockage ());
10860
10861 /* Emit cld instruction if stringops are used in the function. */
10862 if (TARGET_CLD && ix86_current_function_needs_cld)
10863 emit_insn (gen_cld ());
10864
10865 /* SEH requires that the prologue end within 256 bytes of the start of
10866 the function. Prevent instruction schedules that would extend that.
10867 Further, prevent alloca modifications to the stack pointer from being
10868 combined with prologue modifications. */
10869 if (TARGET_SEH)
10870 emit_insn (gen_prologue_use (stack_pointer_rtx));
10871 }
10872
10873 /* Emit code to restore REG using a POP insn. */
10874
10875 static void
10876 ix86_emit_restore_reg_using_pop (rtx reg)
10877 {
10878 struct machine_function *m = cfun->machine;
10879 rtx insn = emit_insn (gen_pop (reg));
10880
10881 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10882 m->fs.sp_offset -= UNITS_PER_WORD;
10883
10884 if (m->fs.cfa_reg == crtl->drap_reg
10885 && REGNO (reg) == REGNO (crtl->drap_reg))
10886 {
10887 /* Previously we'd represented the CFA as an expression
10888 like *(%ebp - 8). We've just popped that value from
10889 the stack, which means we need to reset the CFA to
10890 the drap register. This will remain until we restore
10891 the stack pointer. */
10892 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10893 RTX_FRAME_RELATED_P (insn) = 1;
10894
10895 /* This means that the DRAP register is valid for addressing too. */
10896 m->fs.drap_valid = true;
10897 return;
10898 }
10899
10900 if (m->fs.cfa_reg == stack_pointer_rtx)
10901 {
10902 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10903 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10904 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10905 RTX_FRAME_RELATED_P (insn) = 1;
10906
10907 m->fs.cfa_offset -= UNITS_PER_WORD;
10908 }
10909
10910 /* When the frame pointer is the CFA, and we pop it, we are
10911 swapping back to the stack pointer as the CFA. This happens
10912 for stack frames that don't allocate other data, so we assume
10913 the stack pointer is now pointing at the return address, i.e.
10914 the function entry state, which makes the offset be 1 word. */
10915 if (reg == hard_frame_pointer_rtx)
10916 {
10917 m->fs.fp_valid = false;
10918 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10919 {
10920 m->fs.cfa_reg = stack_pointer_rtx;
10921 m->fs.cfa_offset -= UNITS_PER_WORD;
10922
10923 add_reg_note (insn, REG_CFA_DEF_CFA,
10924 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10925 GEN_INT (m->fs.cfa_offset)));
10926 RTX_FRAME_RELATED_P (insn) = 1;
10927 }
10928 }
10929 }
10930
10931 /* Emit code to restore saved registers using POP insns. */
10932
10933 static void
10934 ix86_emit_restore_regs_using_pop (void)
10935 {
10936 unsigned int regno;
10937
10938 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10939 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10940 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10941 }
10942
10943 /* Emit code and notes for the LEAVE instruction. */
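/* Recall that leave is equivalent to "mov %ebp, %esp" followed by
   "pop %ebp", which is why the stack pointer becomes valid again below
   at fp_offset - UNITS_PER_WORD.  */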
10944
10945 static void
10946 ix86_emit_leave (void)
10947 {
10948 struct machine_function *m = cfun->machine;
10949 rtx insn = emit_insn (ix86_gen_leave ());
10950
10951 ix86_add_queued_cfa_restore_notes (insn);
10952
10953 gcc_assert (m->fs.fp_valid);
10954 m->fs.sp_valid = true;
10955 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10956 m->fs.fp_valid = false;
10957
10958 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10959 {
10960 m->fs.cfa_reg = stack_pointer_rtx;
10961 m->fs.cfa_offset = m->fs.sp_offset;
10962
10963 add_reg_note (insn, REG_CFA_DEF_CFA,
10964 plus_constant (Pmode, stack_pointer_rtx,
10965 m->fs.sp_offset));
10966 RTX_FRAME_RELATED_P (insn) = 1;
10967 }
10968 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10969 m->fs.fp_offset);
10970 }
10971
10972 /* Emit code to restore saved registers using MOV insns.
10973 First register is restored from CFA - CFA_OFFSET. */
10974 static void
10975 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10976 bool maybe_eh_return)
10977 {
10978 struct machine_function *m = cfun->machine;
10979 unsigned int regno;
10980
10981 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10982 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10983 {
10984 rtx reg = gen_rtx_REG (word_mode, regno);
10985 rtx insn, mem;
10986
10987 mem = choose_baseaddr (cfa_offset);
10988 mem = gen_frame_mem (word_mode, mem);
10989 insn = emit_move_insn (reg, mem);
10990
10991 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10992 {
10993 /* Previously we'd represented the CFA as an expression
10994 	     like *(%ebp - 8).  We've just restored that value from
10995 the stack, which means we need to reset the CFA to
10996 the drap register. This will remain until we restore
10997 the stack pointer. */
10998 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10999 RTX_FRAME_RELATED_P (insn) = 1;
11000
11001 /* This means that the DRAP register is valid for addressing. */
11002 m->fs.drap_valid = true;
11003 }
11004 else
11005 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11006
11007 cfa_offset -= UNITS_PER_WORD;
11008 }
11009 }
11010
11011 /* Emit code to restore saved SSE registers using MOV insns.
11012    The first register is restored from CFA - CFA_OFFSET.  */
11013 static void
11014 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
11015 bool maybe_eh_return)
11016 {
11017 unsigned int regno;
11018
11019 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11020 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
11021 {
11022 rtx reg = gen_rtx_REG (V4SFmode, regno);
11023 rtx mem;
11024
11025 mem = choose_baseaddr (cfa_offset);
11026 mem = gen_rtx_MEM (V4SFmode, mem);
11027 set_mem_align (mem, 128);
11028 emit_move_insn (reg, mem);
11029
11030 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
11031
11032 cfa_offset -= 16;
11033 }
11034 }
11035
11036 /* Restore function stack, frame, and registers. */
11037
11038 void
11039 ix86_expand_epilogue (int style)
11040 {
11041 struct machine_function *m = cfun->machine;
11042 struct machine_frame_state frame_state_save = m->fs;
11043 struct ix86_frame frame;
11044 bool restore_regs_via_mov;
11045 bool using_drap;
11046
11047 ix86_finalize_stack_realign_flags ();
11048 ix86_compute_frame_layout (&frame);
11049
11050 m->fs.sp_valid = (!frame_pointer_needed
11051 || (crtl->sp_is_unchanging
11052 && !stack_realign_fp));
11053 gcc_assert (!m->fs.sp_valid
11054 || m->fs.sp_offset == frame.stack_pointer_offset);
11055
11056 /* The FP must be valid if the frame pointer is present. */
11057 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
11058 gcc_assert (!m->fs.fp_valid
11059 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
11060
11061 /* We must have *some* valid pointer to the stack frame. */
11062 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
11063
11064 /* The DRAP is never valid at this point. */
11065 gcc_assert (!m->fs.drap_valid);
11066
11067 /* See the comment about red zone and frame
11068 pointer usage in ix86_expand_prologue. */
11069 if (frame_pointer_needed && frame.red_zone_size)
11070 emit_insn (gen_memory_blockage ());
11071
11072 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
11073 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
11074
11075 /* Determine the CFA offset of the end of the red-zone. */
11076 m->fs.red_zone_offset = 0;
11077 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
11078 {
11079 /* The red-zone begins below the return address. */
11080 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
11081
11082 /* When the register save area is in the aligned portion of
11083 the stack, determine the maximum runtime displacement that
11084 matches up with the aligned frame. */
11085 if (stack_realign_drap)
11086 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
11087 + UNITS_PER_WORD);
11088 }
11089
11090 /* Special care must be taken for the normal return case of a function
11091 using eh_return: the eax and edx registers are marked as saved, but
11092 not restored along this path. Adjust the save location to match. */
11093 if (crtl->calls_eh_return && style != 2)
11094 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
11095
11096 /* EH_RETURN requires the use of moves to function properly. */
11097 if (crtl->calls_eh_return)
11098 restore_regs_via_mov = true;
11099 /* SEH requires the use of pops to identify the epilogue. */
11100 else if (TARGET_SEH)
11101 restore_regs_via_mov = false;
11102   /* If we're only restoring one register and sp is not valid, then
11103      use a move instruction to restore the register, since it's
11104      less work than reloading sp and popping the register.  */
11105 else if (!m->fs.sp_valid && frame.nregs <= 1)
11106 restore_regs_via_mov = true;
11107 else if (TARGET_EPILOGUE_USING_MOVE
11108 && cfun->machine->use_fast_prologue_epilogue
11109 && (frame.nregs > 1
11110 || m->fs.sp_offset != frame.reg_save_offset))
11111 restore_regs_via_mov = true;
11112 else if (frame_pointer_needed
11113 && !frame.nregs
11114 && m->fs.sp_offset != frame.reg_save_offset)
11115 restore_regs_via_mov = true;
11116 else if (frame_pointer_needed
11117 && TARGET_USE_LEAVE
11118 && cfun->machine->use_fast_prologue_epilogue
11119 && frame.nregs == 1)
11120 restore_regs_via_mov = true;
11121 else
11122 restore_regs_via_mov = false;
11123
11124 if (restore_regs_via_mov || frame.nsseregs)
11125 {
11126 /* Ensure that the entire register save area is addressable via
11127 the stack pointer, if we will restore via sp. */
11128 if (TARGET_64BIT
11129 && m->fs.sp_offset > 0x7fffffff
11130 && !(m->fs.fp_valid || m->fs.drap_valid)
11131 && (frame.nsseregs + frame.nregs) != 0)
11132 {
11133 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11134 GEN_INT (m->fs.sp_offset
11135 - frame.sse_reg_save_offset),
11136 style,
11137 m->fs.cfa_reg == stack_pointer_rtx);
11138 }
11139 }
11140
11141 /* If there are any SSE registers to restore, then we have to do it
11142 via moves, since there's obviously no pop for SSE regs. */
11143 if (frame.nsseregs)
11144 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
11145 style == 2);
11146
11147 if (restore_regs_via_mov)
11148 {
11149 rtx t;
11150
11151 if (frame.nregs)
11152 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
11153
11154 /* eh_return epilogues need %ecx added to the stack pointer. */
11155 if (style == 2)
11156 {
11157 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
11158
11159 /* Stack align doesn't work with eh_return. */
11160 gcc_assert (!stack_realign_drap);
11161 	  /* Neither do regparm nested functions.  */
11162 gcc_assert (!ix86_static_chain_on_stack);
11163
11164 if (frame_pointer_needed)
11165 {
11166 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
11167 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
11168 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
11169
11170 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
11171 insn = emit_move_insn (hard_frame_pointer_rtx, t);
11172
11173 /* Note that we use SA as a temporary CFA, as the return
11174 address is at the proper place relative to it. We
11175 pretend this happens at the FP restore insn because
11176 prior to this insn the FP would be stored at the wrong
11177 offset relative to SA, and after this insn we have no
11178 other reasonable register to use for the CFA. We don't
11179 bother resetting the CFA to the SP for the duration of
11180 the return insn. */
11181 add_reg_note (insn, REG_CFA_DEF_CFA,
11182 plus_constant (Pmode, sa, UNITS_PER_WORD));
11183 ix86_add_queued_cfa_restore_notes (insn);
11184 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
11185 RTX_FRAME_RELATED_P (insn) = 1;
11186
11187 m->fs.cfa_reg = sa;
11188 m->fs.cfa_offset = UNITS_PER_WORD;
11189 m->fs.fp_valid = false;
11190
11191 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
11192 const0_rtx, style, false);
11193 }
11194 else
11195 {
11196 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
11197 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
11198 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
11199 ix86_add_queued_cfa_restore_notes (insn);
11200
11201 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
11202 if (m->fs.cfa_offset != UNITS_PER_WORD)
11203 {
11204 m->fs.cfa_offset = UNITS_PER_WORD;
11205 add_reg_note (insn, REG_CFA_DEF_CFA,
11206 plus_constant (Pmode, stack_pointer_rtx,
11207 UNITS_PER_WORD));
11208 RTX_FRAME_RELATED_P (insn) = 1;
11209 }
11210 }
11211 m->fs.sp_offset = UNITS_PER_WORD;
11212 m->fs.sp_valid = true;
11213 }
11214 }
11215 else
11216 {
11217 /* SEH requires that the function end with (1) a stack adjustment
11218 if necessary, (2) a sequence of pops, and (3) a return or
11219 jump instruction. Prevent insns from the function body from
11220 being scheduled into this sequence. */
11221 if (TARGET_SEH)
11222 {
11223 /* Prevent a catch region from being adjacent to the standard
11224 	     epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
11225 	     several other flags that would be interesting to test are
11226 	     set up yet.  */
11227 if (flag_non_call_exceptions)
11228 emit_insn (gen_nops (const1_rtx));
11229 else
11230 emit_insn (gen_blockage ());
11231 }
11232
11233       /* The first step is to deallocate the stack frame so that we can
11234 	 pop the registers.  Also do it on the SEH target for a very large
11235 	 frame, as the emitted instructions aren't allowed by the ABI in
11236 	 epilogues.  */
11237 if (!m->fs.sp_valid
11238 || (TARGET_SEH
11239 && (m->fs.sp_offset - frame.reg_save_offset
11240 >= SEH_MAX_FRAME_SIZE)))
11241 {
11242 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11243 GEN_INT (m->fs.fp_offset
11244 - frame.reg_save_offset),
11245 style, false);
11246 }
11247 else if (m->fs.sp_offset != frame.reg_save_offset)
11248 {
11249 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11250 GEN_INT (m->fs.sp_offset
11251 - frame.reg_save_offset),
11252 style,
11253 m->fs.cfa_reg == stack_pointer_rtx);
11254 }
11255
11256 ix86_emit_restore_regs_using_pop ();
11257 }
11258
11259   /* If we used a frame pointer and haven't already got rid of it,
11260 then do so now. */
11261 if (m->fs.fp_valid)
11262 {
11263 /* If the stack pointer is valid and pointing at the frame
11264 pointer store address, then we only need a pop. */
11265 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11266 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11267 /* Leave results in shorter dependency chains on CPUs that are
11268 able to grok it fast. */
11269 else if (TARGET_USE_LEAVE
11270 || optimize_bb_for_size_p (EXIT_BLOCK_PTR)
11271 || !cfun->machine->use_fast_prologue_epilogue)
11272 ix86_emit_leave ();
11273 else
11274 {
11275 pro_epilogue_adjust_stack (stack_pointer_rtx,
11276 hard_frame_pointer_rtx,
11277 const0_rtx, style, !using_drap);
11278 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11279 }
11280 }
11281
11282 if (using_drap)
11283 {
11284 int param_ptr_offset = UNITS_PER_WORD;
11285 rtx insn;
11286
11287 gcc_assert (stack_realign_drap);
11288
11289 if (ix86_static_chain_on_stack)
11290 param_ptr_offset += UNITS_PER_WORD;
11291 if (!call_used_regs[REGNO (crtl->drap_reg)])
11292 param_ptr_offset += UNITS_PER_WORD;
11293
11294 insn = emit_insn (gen_rtx_SET
11295 (VOIDmode, stack_pointer_rtx,
11296 gen_rtx_PLUS (Pmode,
11297 crtl->drap_reg,
11298 GEN_INT (-param_ptr_offset))));
11299 m->fs.cfa_reg = stack_pointer_rtx;
11300 m->fs.cfa_offset = param_ptr_offset;
11301 m->fs.sp_offset = param_ptr_offset;
11302 m->fs.realigned = false;
11303
11304 add_reg_note (insn, REG_CFA_DEF_CFA,
11305 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11306 GEN_INT (param_ptr_offset)));
11307 RTX_FRAME_RELATED_P (insn) = 1;
11308
11309 if (!call_used_regs[REGNO (crtl->drap_reg)])
11310 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11311 }
11312
11313 /* At this point the stack pointer must be valid, and we must have
11314 restored all of the registers. We may not have deallocated the
11315 entire stack frame. We've delayed this until now because it may
11316 be possible to merge the local stack deallocation with the
11317 deallocation forced by ix86_static_chain_on_stack. */
11318 gcc_assert (m->fs.sp_valid);
11319 gcc_assert (!m->fs.fp_valid);
11320 gcc_assert (!m->fs.realigned);
11321 if (m->fs.sp_offset != UNITS_PER_WORD)
11322 {
11323 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11324 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11325 style, true);
11326 }
11327 else
11328 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11329
11330 /* Sibcall epilogues don't want a return instruction. */
11331 if (style == 0)
11332 {
11333 m->fs = frame_state_save;
11334 return;
11335 }
11336
11337 if (crtl->args.pops_args && crtl->args.size)
11338 {
11339 rtx popc = GEN_INT (crtl->args.pops_args);
11340
11341       /* i386 can only pop 64K bytes.  If asked to pop more, pop the return
11342 	 address, do an explicit add, and jump indirectly to the caller.  */
11343
11344 if (crtl->args.pops_args >= 65536)
11345 {
11346 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11347 rtx insn;
11348
11349 /* There is no "pascal" calling convention in any 64bit ABI. */
11350 gcc_assert (!TARGET_64BIT);
11351
11352 insn = emit_insn (gen_pop (ecx));
11353 m->fs.cfa_offset -= UNITS_PER_WORD;
11354 m->fs.sp_offset -= UNITS_PER_WORD;
11355
11356 add_reg_note (insn, REG_CFA_ADJUST_CFA,
11357 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
11358 add_reg_note (insn, REG_CFA_REGISTER,
11359 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11360 RTX_FRAME_RELATED_P (insn) = 1;
11361
11362 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11363 popc, -1, true);
11364 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11365 }
11366 else
11367 emit_jump_insn (gen_simple_return_pop_internal (popc));
11368 }
11369 else
11370 emit_jump_insn (gen_simple_return_internal ());
11371
11372 /* Restore the state back to the state from the prologue,
11373 so that it's correct for the next epilogue. */
11374 m->fs = frame_state_save;
11375 }
11376
11377 /* Reset from the function's potential modifications. */
11378
11379 static void
11380 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11381 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11382 {
11383 if (pic_offset_table_rtx)
11384 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11385 #if TARGET_MACHO
11386 /* Mach-O doesn't support labels at the end of objects, so if
11387 it looks like we might want one, insert a NOP. */
11388 {
11389 rtx insn = get_last_insn ();
11390 rtx deleted_debug_label = NULL_RTX;
11391 while (insn
11392 && NOTE_P (insn)
11393 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11394 {
11395 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11396 notes only, instead set their CODE_LABEL_NUMBER to -1,
11397 otherwise there would be code generation differences
11398 	   between -g and -g0.  */
11399 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11400 deleted_debug_label = insn;
11401 insn = PREV_INSN (insn);
11402 }
11403 if (insn
11404 && (LABEL_P (insn)
11405 || (NOTE_P (insn)
11406 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11407 fputs ("\tnop\n", file);
11408 else if (deleted_debug_label)
11409 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11410 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11411 CODE_LABEL_NUMBER (insn) = -1;
11412 }
11413 #endif
11414
11415 }
11416
11417 /* Return a scratch register to use in the split stack prologue. The
11418 split stack prologue is used for -fsplit-stack. It is the first
11419 instructions in the function, even before the regular prologue.
11420 The scratch register can be any caller-saved register which is not
11421 used for parameters or for the static chain. */
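/* To summarize the cases handled below: 64-bit code uses %r11; fastcall uses
   %eax (a static chain is not supported there); thiscall uses %edx, or %eax
   when a static chain is present; otherwise %ecx is used, or %edx when a
   static chain is present and fewer than two register parameters are used.
   With three register parameters, or two plus a static chain, no suitable
   register remains and -fsplit-stack is not supported.  */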
11422
11423 static unsigned int
11424 split_stack_prologue_scratch_regno (void)
11425 {
11426 if (TARGET_64BIT)
11427 return R11_REG;
11428 else
11429 {
11430 bool is_fastcall, is_thiscall;
11431 int regparm;
11432
11433 is_fastcall = (lookup_attribute ("fastcall",
11434 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11435 != NULL);
11436 is_thiscall = (lookup_attribute ("thiscall",
11437 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11438 != NULL);
11439 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11440
11441 if (is_fastcall)
11442 {
11443 if (DECL_STATIC_CHAIN (cfun->decl))
11444 {
11445 sorry ("-fsplit-stack does not support fastcall with "
11446 "nested function");
11447 return INVALID_REGNUM;
11448 }
11449 return AX_REG;
11450 }
11451 else if (is_thiscall)
11452 {
11453 if (!DECL_STATIC_CHAIN (cfun->decl))
11454 return DX_REG;
11455 return AX_REG;
11456 }
11457 else if (regparm < 3)
11458 {
11459 if (!DECL_STATIC_CHAIN (cfun->decl))
11460 return CX_REG;
11461 else
11462 {
11463 if (regparm >= 2)
11464 {
11465 sorry ("-fsplit-stack does not support 2 register "
11466 " parameters for a nested function");
11467 return INVALID_REGNUM;
11468 }
11469 return DX_REG;
11470 }
11471 }
11472 else
11473 {
11474 /* FIXME: We could make this work by pushing a register
11475 around the addition and comparison. */
11476 sorry ("-fsplit-stack does not support 3 register parameters");
11477 return INVALID_REGNUM;
11478 }
11479 }
11480 }
11481
11482 /* A SYMBOL_REF for the function which allocates new stack space for
11483 -fsplit-stack. */
11484
11485 static GTY(()) rtx split_stack_fn;
11486
11487 /* A SYMBOL_REF for the __morestack function when using the large
11488 model. */
11489
11490 static GTY(()) rtx split_stack_fn_large;
11491
11492 /* Handle -fsplit-stack. These are the first instructions in the
11493 function, even before the regular prologue. */
11494
11495 void
11496 ix86_expand_split_stack_prologue (void)
11497 {
11498 struct ix86_frame frame;
11499 HOST_WIDE_INT allocate;
11500 unsigned HOST_WIDE_INT args_size;
11501 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11502 rtx scratch_reg = NULL_RTX;
11503 rtx varargs_label = NULL_RTX;
11504 rtx fn;
11505
11506 gcc_assert (flag_split_stack && reload_completed);
11507
11508 ix86_finalize_stack_realign_flags ();
11509 ix86_compute_frame_layout (&frame);
11510 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11511
11512 /* This is the label we will branch to if we have enough stack
11513 space. We expect the basic block reordering pass to reverse this
11514 branch if optimizing, so that we branch in the unlikely case. */
11515 label = gen_label_rtx ();
11516
11517 /* We need to compare the stack pointer minus the frame size with
11518 the stack boundary in the TCB. The stack boundary always gives
11519 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11520 can compare directly. Otherwise we need to do an addition. */
11521
11522 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11523 UNSPEC_STACK_CHECK);
11524 limit = gen_rtx_CONST (Pmode, limit);
11525 limit = gen_rtx_MEM (Pmode, limit);
11526 if (allocate < SPLIT_STACK_AVAILABLE)
11527 current = stack_pointer_rtx;
11528 else
11529 {
11530 unsigned int scratch_regno;
11531 rtx offset;
11532
11533 /* We need a scratch register to hold the stack pointer minus
11534 the required frame size. Since this is the very start of the
11535 function, the scratch register can be any caller-saved
11536 register which is not used for parameters. */
11537 offset = GEN_INT (- allocate);
11538 scratch_regno = split_stack_prologue_scratch_regno ();
11539 if (scratch_regno == INVALID_REGNUM)
11540 return;
11541 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11542 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11543 {
11544 /* We don't use ix86_gen_add3 in this case because it will
11545 want to split to lea, but when not optimizing the insn
11546 will not be split after this point. */
11547 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11548 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11549 offset)));
11550 }
11551 else
11552 {
11553 emit_move_insn (scratch_reg, offset);
11554 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11555 stack_pointer_rtx));
11556 }
11557 current = scratch_reg;
11558 }
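/* For illustration: when ALLOCATE is not below SPLIT_STACK_AVAILABLE, say
   allocate == 0x20000, the scratch register just computed holds
   sp - 0x20000, and it is that value, rather than the stack pointer itself,
   which is compared (GEU) against the boundary fetched from the TCB
   below.  */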
11559
11560 ix86_expand_branch (GEU, current, limit, label);
11561 jump_insn = get_last_insn ();
11562 JUMP_LABEL (jump_insn) = label;
11563
11564 /* Mark the jump as very likely to be taken. */
11565 add_int_reg_note (jump_insn, REG_BR_PROB,
11566 REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
11567
11568 if (split_stack_fn == NULL_RTX)
11569 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11570 fn = split_stack_fn;
11571
11572 /* Get more stack space. We pass in the desired stack space and the
11573 size of the arguments to copy to the new stack. In 32-bit mode
11574 we push the parameters; __morestack will return on a new stack
11575 anyhow. In 64-bit mode we pass the parameters in r10 and
11576 r11. */
11577 allocate_rtx = GEN_INT (allocate);
11578 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11579 call_fusage = NULL_RTX;
11580 if (TARGET_64BIT)
11581 {
11582 rtx reg10, reg11;
11583
11584 reg10 = gen_rtx_REG (Pmode, R10_REG);
11585 reg11 = gen_rtx_REG (Pmode, R11_REG);
11586
11587 /* If this function uses a static chain, it will be in %r10.
11588 Preserve it across the call to __morestack. */
11589 if (DECL_STATIC_CHAIN (cfun->decl))
11590 {
11591 rtx rax;
11592
11593 rax = gen_rtx_REG (word_mode, AX_REG);
11594 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11595 use_reg (&call_fusage, rax);
11596 }
11597
11598 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11599 && !TARGET_PECOFF)
11600 {
11601 HOST_WIDE_INT argval;
11602
11603 gcc_assert (Pmode == DImode);
11604 /* When using the large model we need to load the address
11605 into a register, and we've run out of registers. So we
11606 switch to a different calling convention, and we call a
11607 different function: __morestack_large. We pass the
11608 argument size in the upper 32 bits of r10 and pass the
11609 frame size in the lower 32 bits. */
11610 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11611 gcc_assert ((args_size & 0xffffffff) == args_size);
11612
11613 if (split_stack_fn_large == NULL_RTX)
11614 split_stack_fn_large =
11615 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11616
11617 if (ix86_cmodel == CM_LARGE_PIC)
11618 {
11619 rtx label, x;
11620
11621 label = gen_label_rtx ();
11622 emit_label (label);
11623 LABEL_PRESERVE_P (label) = 1;
11624 emit_insn (gen_set_rip_rex64 (reg10, label));
11625 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11626 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11627 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11628 UNSPEC_GOT);
11629 x = gen_rtx_CONST (Pmode, x);
11630 emit_move_insn (reg11, x);
11631 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11632 x = gen_const_mem (Pmode, x);
11633 emit_move_insn (reg11, x);
11634 }
11635 else
11636 emit_move_insn (reg11, split_stack_fn_large);
11637
11638 fn = reg11;
11639
11640 argval = ((args_size << 16) << 16) + allocate;
11641 emit_move_insn (reg10, GEN_INT (argval));
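	  /* Illustrative values: with args_size == 32 and allocate == 0x1000,
	     argval == 0x0000002000001000, i.e. the argument size in the
	     upper 32 bits and the frame size in the lower 32 bits.  */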
11642 }
11643 else
11644 {
11645 emit_move_insn (reg10, allocate_rtx);
11646 emit_move_insn (reg11, GEN_INT (args_size));
11647 use_reg (&call_fusage, reg11);
11648 }
11649
11650 use_reg (&call_fusage, reg10);
11651 }
11652 else
11653 {
11654 emit_insn (gen_push (GEN_INT (args_size)));
11655 emit_insn (gen_push (allocate_rtx));
11656 }
11657 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11658 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11659 NULL_RTX, false);
11660 add_function_usage_to (call_insn, call_fusage);
11661
11662 /* In order to make call/return prediction work right, we now need
11663 to execute a return instruction. See
11664 libgcc/config/i386/morestack.S for the details on how this works.
11665
11666 For flow purposes gcc must not see this as a return
11667 instruction--we need control flow to continue at the subsequent
11668 label. Therefore, we use an unspec. */
11669 gcc_assert (crtl->args.pops_args < 65536);
11670 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11671
11672 /* If we are in 64-bit mode and this function uses a static chain,
11673      we saved %r10 in %rax before calling __morestack.  */
11674 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11675 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11676 gen_rtx_REG (word_mode, AX_REG));
11677
11678 /* If this function calls va_start, we need to store a pointer to
11679 the arguments on the old stack, because they may not have been
11680 all copied to the new stack. At this point the old stack can be
11681 found at the frame pointer value used by __morestack, because
11682 __morestack has set that up before calling back to us. Here we
11683 store that pointer in a scratch register, and in
11684 ix86_expand_prologue we store the scratch register in a stack
11685 slot. */
11686 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11687 {
11688 unsigned int scratch_regno;
11689 rtx frame_reg;
11690 int words;
11691
11692 scratch_regno = split_stack_prologue_scratch_regno ();
11693 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11694 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11695
11696 /* 64-bit:
11697 fp -> old fp value
11698 return address within this function
11699 return address of caller of this function
11700 stack arguments
11701 So we add three words to get to the stack arguments.
11702
11703 32-bit:
11704 fp -> old fp value
11705 return address within this function
11706 first argument to __morestack
11707 second argument to __morestack
11708 return address of caller of this function
11709 stack arguments
11710 So we add five words to get to the stack arguments.
11711 */
11712 words = TARGET_64BIT ? 3 : 5;
11713 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11714 gen_rtx_PLUS (Pmode, frame_reg,
11715 GEN_INT (words * UNITS_PER_WORD))));
11716
11717 varargs_label = gen_label_rtx ();
11718 emit_jump_insn (gen_jump (varargs_label));
11719 JUMP_LABEL (get_last_insn ()) = varargs_label;
11720
11721 emit_barrier ();
11722 }
11723
11724 emit_label (label);
11725 LABEL_NUSES (label) = 1;
11726
11727 /* If this function calls va_start, we now have to set the scratch
11728 register for the case where we do not call __morestack. In this
11729 case we need to set it based on the stack pointer. */
11730 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11731 {
11732 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11733 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11734 GEN_INT (UNITS_PER_WORD))));
11735
11736 emit_label (varargs_label);
11737 LABEL_NUSES (varargs_label) = 1;
11738 }
11739 }
11740
11741 /* We may have to tell the dataflow pass that the split stack prologue
11742 is initializing a scratch register. */
11743
11744 static void
11745 ix86_live_on_entry (bitmap regs)
11746 {
11747 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11748 {
11749 gcc_assert (flag_split_stack);
11750 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11751 }
11752 }
11753 \f
11754 /* Determine if OP is a suitable SUBREG RTX for an address.  */
11755
11756 static bool
11757 ix86_address_subreg_operand (rtx op)
11758 {
11759 enum machine_mode mode;
11760
11761 if (!REG_P (op))
11762 return false;
11763
11764 mode = GET_MODE (op);
11765
11766 if (GET_MODE_CLASS (mode) != MODE_INT)
11767 return false;
11768
11769 /* Don't allow SUBREGs that span more than a word. It can lead to spill
11770      failures when the register is one word out of a two-word structure.  */
11771 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
11772 return false;
11773
11774 /* Allow only SUBREGs of non-eliminable hard registers. */
11775 return register_no_elim_operand (op, mode);
11776 }
11777
11778 /* Extract the parts of an RTL expression that is a valid memory address
11779 for an instruction. Return 0 if the structure of the address is
11780 grossly off. Return -1 if the address contains ASHIFT, so it is not
11781    strictly valid, but still used for computing the length of the lea instruction.  */
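/* For example, an address such as

     (plus:SI (plus:SI (mult:SI (reg:SI %eax) (const_int 4)) (reg:SI %ebx))
	      (const_int 12))

   decomposes into out->base = %ebx, out->index = %eax, out->scale = 4 and
   out->disp = 12, corresponding to the operand 12(%ebx,%eax,4).  */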
11782
11783 int
11784 ix86_decompose_address (rtx addr, struct ix86_address *out)
11785 {
11786 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11787 rtx base_reg, index_reg;
11788 HOST_WIDE_INT scale = 1;
11789 rtx scale_rtx = NULL_RTX;
11790 rtx tmp;
11791 int retval = 1;
11792 enum ix86_address_seg seg = SEG_DEFAULT;
11793
11794   /* Allow zero-extended SImode addresses;
11795      they will be emitted with the addr32 prefix.  */
11796 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11797 {
11798 if (GET_CODE (addr) == ZERO_EXTEND
11799 && GET_MODE (XEXP (addr, 0)) == SImode)
11800 {
11801 addr = XEXP (addr, 0);
11802 if (CONST_INT_P (addr))
11803 return 0;
11804 }
11805 else if (GET_CODE (addr) == AND
11806 && const_32bit_mask (XEXP (addr, 1), DImode))
11807 {
11808 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11809 if (addr == NULL_RTX)
11810 return 0;
11811
11812 if (CONST_INT_P (addr))
11813 return 0;
11814 }
11815 }
11816
/* Allow SImode subregs of DImode addresses;
they will be emitted with the addr32 prefix. */
11819 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11820 {
11821 if (GET_CODE (addr) == SUBREG
11822 && GET_MODE (SUBREG_REG (addr)) == DImode)
11823 {
11824 addr = SUBREG_REG (addr);
11825 if (CONST_INT_P (addr))
11826 return 0;
11827 }
11828 }
11829
11830 if (REG_P (addr))
11831 base = addr;
11832 else if (GET_CODE (addr) == SUBREG)
11833 {
11834 if (ix86_address_subreg_operand (SUBREG_REG (addr)))
11835 base = addr;
11836 else
11837 return 0;
11838 }
11839 else if (GET_CODE (addr) == PLUS)
11840 {
11841 rtx addends[4], op;
11842 int n = 0, i;
11843
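/* Flatten the chain of PLUS operands into ADDENDS (at most four),
then classify each one below as base, index*scale, displacement
or segment. */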
11844 op = addr;
11845 do
11846 {
11847 if (n >= 4)
11848 return 0;
11849 addends[n++] = XEXP (op, 1);
11850 op = XEXP (op, 0);
11851 }
11852 while (GET_CODE (op) == PLUS);
11853 if (n >= 4)
11854 return 0;
11855 addends[n] = op;
11856
11857 for (i = n; i >= 0; --i)
11858 {
11859 op = addends[i];
11860 switch (GET_CODE (op))
11861 {
11862 case MULT:
11863 if (index)
11864 return 0;
11865 index = XEXP (op, 0);
11866 scale_rtx = XEXP (op, 1);
11867 break;
11868
11869 case ASHIFT:
11870 if (index)
11871 return 0;
11872 index = XEXP (op, 0);
11873 tmp = XEXP (op, 1);
11874 if (!CONST_INT_P (tmp))
11875 return 0;
11876 scale = INTVAL (tmp);
11877 if ((unsigned HOST_WIDE_INT) scale > 3)
11878 return 0;
11879 scale = 1 << scale;
11880 break;
11881
11882 case ZERO_EXTEND:
11883 op = XEXP (op, 0);
11884 if (GET_CODE (op) != UNSPEC)
11885 return 0;
11886 /* FALLTHRU */
11887
11888 case UNSPEC:
11889 if (XINT (op, 1) == UNSPEC_TP
11890 && TARGET_TLS_DIRECT_SEG_REFS
11891 && seg == SEG_DEFAULT)
11892 seg = DEFAULT_TLS_SEG_REG;
11893 else
11894 return 0;
11895 break;
11896
11897 case SUBREG:
11898 if (!ix86_address_subreg_operand (SUBREG_REG (op)))
11899 return 0;
11900 /* FALLTHRU */
11901
11902 case REG:
11903 if (!base)
11904 base = op;
11905 else if (!index)
11906 index = op;
11907 else
11908 return 0;
11909 break;
11910
11911 case CONST:
11912 case CONST_INT:
11913 case SYMBOL_REF:
11914 case LABEL_REF:
11915 if (disp)
11916 return 0;
11917 disp = op;
11918 break;
11919
11920 default:
11921 return 0;
11922 }
11923 }
11924 }
11925 else if (GET_CODE (addr) == MULT)
11926 {
11927 index = XEXP (addr, 0); /* index*scale */
11928 scale_rtx = XEXP (addr, 1);
11929 }
11930 else if (GET_CODE (addr) == ASHIFT)
11931 {
11932 /* We're called for lea too, which implements ashift on occasion. */
11933 index = XEXP (addr, 0);
11934 tmp = XEXP (addr, 1);
11935 if (!CONST_INT_P (tmp))
11936 return 0;
11937 scale = INTVAL (tmp);
11938 if ((unsigned HOST_WIDE_INT) scale > 3)
11939 return 0;
11940 scale = 1 << scale;
11941 retval = -1;
11942 }
11943 else if (CONST_INT_P (addr))
11944 {
11945 if (!x86_64_immediate_operand (addr, VOIDmode))
11946 return 0;
11947
/* Constant addresses are sign extended to 64bit, so we have to
reject addresses from 0x80000000 to 0xffffffff in x32 mode. */
11950 if (TARGET_X32
11951 && val_signbit_known_set_p (SImode, INTVAL (addr)))
11952 return 0;
11953
11954 disp = addr;
11955 }
11956 else
11957 disp = addr; /* displacement */
11958
11959 if (index)
11960 {
11961 if (REG_P (index))
11962 ;
11963 else if (GET_CODE (index) == SUBREG
11964 && ix86_address_subreg_operand (SUBREG_REG (index)))
11965 ;
11966 else
11967 return 0;
11968 }
11969
11970 /* Address override works only on the (%reg) part of %fs:(%reg). */
11971 if (seg != SEG_DEFAULT
11972 && ((base && GET_MODE (base) != word_mode)
11973 || (index && GET_MODE (index) != word_mode)))
11974 return 0;
11975
11976 /* Extract the integral value of scale. */
11977 if (scale_rtx)
11978 {
11979 if (!CONST_INT_P (scale_rtx))
11980 return 0;
11981 scale = INTVAL (scale_rtx);
11982 }
11983
11984 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11985 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11986
11987 /* Avoid useless 0 displacement. */
11988 if (disp == const0_rtx && (base || index))
11989 disp = NULL_RTX;
11990
/* Allow the arg pointer and stack pointer as index if there is no scaling. */
11992 if (base_reg && index_reg && scale == 1
11993 && (index_reg == arg_pointer_rtx
11994 || index_reg == frame_pointer_rtx
11995 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11996 {
11997 rtx tmp;
11998 tmp = base, base = index, index = tmp;
11999 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
12000 }
12001
12002 /* Special case: %ebp cannot be encoded as a base without a displacement.
12003 Similarly %r13. */
12004 if (!disp
12005 && base_reg
12006 && (base_reg == hard_frame_pointer_rtx
12007 || base_reg == frame_pointer_rtx
12008 || base_reg == arg_pointer_rtx
12009 || (REG_P (base_reg)
12010 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
12011 || REGNO (base_reg) == R13_REG))))
12012 disp = const0_rtx;
12013
/* Special case: on K6, [%esi] makes the instruction vector decoded.
Avoid this by transforming it to [%esi+0].
Reload calls address legitimization without cfun defined, so we need
to check that cfun is non-NULL. */
12018 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
12019 && base_reg && !index_reg && !disp
12020 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
12021 disp = const0_rtx;
12022
12023 /* Special case: encode reg+reg instead of reg*2. */
12024 if (!base && index && scale == 2)
12025 base = index, base_reg = index_reg, scale = 1;
12026
12027 /* Special case: scaling cannot be encoded without base or displacement. */
12028 if (!base && !disp && index && scale != 1)
12029 disp = const0_rtx;
12030
12031 out->base = base;
12032 out->index = index;
12033 out->disp = disp;
12034 out->scale = scale;
12035 out->seg = seg;
12036
12037 return retval;
12038 }
12039 \f
/* Return cost of the memory address X.
For i386, it is better to use a complex address than to let gcc copy
the address into a reg and make a new pseudo. But not if the address
requires two regs - that would mean more pseudos with longer
lifetimes. */
12045 static int
12046 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
12047 addr_space_t as ATTRIBUTE_UNUSED,
12048 bool speed ATTRIBUTE_UNUSED)
12049 {
12050 struct ix86_address parts;
12051 int cost = 1;
12052 int ok = ix86_decompose_address (x, &parts);
12053
12054 gcc_assert (ok);
12055
12056 if (parts.base && GET_CODE (parts.base) == SUBREG)
12057 parts.base = SUBREG_REG (parts.base);
12058 if (parts.index && GET_CODE (parts.index) == SUBREG)
12059 parts.index = SUBREG_REG (parts.index);
12060
12061 /* Attempt to minimize number of registers in the address. */
12062 if ((parts.base
12063 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
12064 || (parts.index
12065 && (!REG_P (parts.index)
12066 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
12067 cost++;
12068
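/* Charge one more when both the base and the index are distinct
operands that are not already hard registers, since such an
address keeps two registers live. */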
12069 if (parts.base
12070 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
12071 && parts.index
12072 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
12073 && parts.base != parts.index)
12074 cost++;
12075
/* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
since its predecode logic can't detect the length of instructions
and decoding degenerates to vector decoding. Increase the cost of such
addresses here. The penalty is at least 2 cycles. It may be worthwhile
to split such addresses or even to refuse them altogether.
12081
The following addressing modes are affected:
12083 [base+scale*index]
12084 [scale*index+disp]
12085 [base+index]
12086
The first and last cases may be avoidable by explicitly coding the zero
into the memory address, but I don't have an AMD-K6 machine handy to check
this theory. */
12090
12091 if (TARGET_K6
12092 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
12093 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
12094 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
12095 cost += 10;
12096
12097 return cost;
12098 }
12099 \f
/* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used to form addresses to local data when -fPIC is in
use. */
12103
12104 static bool
12105 darwin_local_data_pic (rtx disp)
12106 {
12107 return (GET_CODE (disp) == UNSPEC
12108 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
12109 }
12110
12111 /* Determine if a given RTX is a valid constant. We already know this
12112 satisfies CONSTANT_P. */
12113
12114 static bool
12115 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
12116 {
12117 switch (GET_CODE (x))
12118 {
12119 case CONST:
12120 x = XEXP (x, 0);
12121
12122 if (GET_CODE (x) == PLUS)
12123 {
12124 if (!CONST_INT_P (XEXP (x, 1)))
12125 return false;
12126 x = XEXP (x, 0);
12127 }
12128
12129 if (TARGET_MACHO && darwin_local_data_pic (x))
12130 return true;
12131
12132 /* Only some unspecs are valid as "constants". */
12133 if (GET_CODE (x) == UNSPEC)
12134 switch (XINT (x, 1))
12135 {
12136 case UNSPEC_GOT:
12137 case UNSPEC_GOTOFF:
12138 case UNSPEC_PLTOFF:
12139 return TARGET_64BIT;
12140 case UNSPEC_TPOFF:
12141 case UNSPEC_NTPOFF:
12142 x = XVECEXP (x, 0, 0);
12143 return (GET_CODE (x) == SYMBOL_REF
12144 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12145 case UNSPEC_DTPOFF:
12146 x = XVECEXP (x, 0, 0);
12147 return (GET_CODE (x) == SYMBOL_REF
12148 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
12149 default:
12150 return false;
12151 }
12152
12153 /* We must have drilled down to a symbol. */
12154 if (GET_CODE (x) == LABEL_REF)
12155 return true;
12156 if (GET_CODE (x) != SYMBOL_REF)
12157 return false;
12158 /* FALLTHRU */
12159
12160 case SYMBOL_REF:
12161 /* TLS symbols are never valid. */
12162 if (SYMBOL_REF_TLS_MODEL (x))
12163 return false;
12164
12165 /* DLLIMPORT symbols are never valid. */
12166 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
12167 && SYMBOL_REF_DLLIMPORT_P (x))
12168 return false;
12169
12170 #if TARGET_MACHO
12171 /* mdynamic-no-pic */
12172 if (MACHO_DYNAMIC_NO_PIC_P)
12173 return machopic_symbol_defined_p (x);
12174 #endif
12175 break;
12176
12177 case CONST_DOUBLE:
12178 if (GET_MODE (x) == TImode
12179 && x != CONST0_RTX (TImode)
12180 && !TARGET_64BIT)
12181 return false;
12182 break;
12183
12184 case CONST_VECTOR:
12185 if (!standard_sse_constant_p (x))
12186 return false;
12187
12188 default:
12189 break;
12190 }
12191
12192 /* Otherwise we handle everything else in the move patterns. */
12193 return true;
12194 }
12195
12196 /* Determine if it's legal to put X into the constant pool. This
12197 is not possible for the address of thread-local symbols, which
12198 is checked above. */
12199
12200 static bool
12201 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
12202 {
12203 /* We can always put integral constants and vectors in memory. */
12204 switch (GET_CODE (x))
12205 {
12206 case CONST_INT:
12207 case CONST_DOUBLE:
12208 case CONST_VECTOR:
12209 return false;
12210
12211 default:
12212 break;
12213 }
12214 return !ix86_legitimate_constant_p (mode, x);
12215 }
12216
/* Return true if the symbol is marked as dllimport, or as a stub-variable,
otherwise return false. */
12219
12220 static bool
12221 is_imported_p (rtx x)
12222 {
12223 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
12224 || GET_CODE (x) != SYMBOL_REF)
12225 return false;
12226
12227 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
12228 }
12229
12230
12231 /* Nonzero if the constant value X is a legitimate general operand
12232 when generating PIC code. It is given that flag_pic is on and
12233 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
12234
12235 bool
12236 legitimate_pic_operand_p (rtx x)
12237 {
12238 rtx inner;
12239
12240 switch (GET_CODE (x))
12241 {
12242 case CONST:
12243 inner = XEXP (x, 0);
12244 if (GET_CODE (inner) == PLUS
12245 && CONST_INT_P (XEXP (inner, 1)))
12246 inner = XEXP (inner, 0);
12247
12248 /* Only some unspecs are valid as "constants". */
12249 if (GET_CODE (inner) == UNSPEC)
12250 switch (XINT (inner, 1))
12251 {
12252 case UNSPEC_GOT:
12253 case UNSPEC_GOTOFF:
12254 case UNSPEC_PLTOFF:
12255 return TARGET_64BIT;
12256 case UNSPEC_TPOFF:
12257 x = XVECEXP (inner, 0, 0);
12258 return (GET_CODE (x) == SYMBOL_REF
12259 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
12260 case UNSPEC_MACHOPIC_OFFSET:
12261 return legitimate_pic_address_disp_p (x);
12262 default:
12263 return false;
12264 }
12265 /* FALLTHRU */
12266
12267 case SYMBOL_REF:
12268 case LABEL_REF:
12269 return legitimate_pic_address_disp_p (x);
12270
12271 default:
12272 return true;
12273 }
12274 }
12275
12276 /* Determine if a given CONST RTX is a valid memory displacement
12277 in PIC mode. */
12278
12279 bool
12280 legitimate_pic_address_disp_p (rtx disp)
12281 {
12282 bool saw_plus;
12283
12284 /* In 64bit mode we can allow direct addresses of symbols and labels
12285 when they are not dynamic symbols. */
12286 if (TARGET_64BIT)
12287 {
12288 rtx op0 = disp, op1;
12289
12290 switch (GET_CODE (disp))
12291 {
12292 case LABEL_REF:
12293 return true;
12294
12295 case CONST:
12296 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12297 break;
12298 op0 = XEXP (XEXP (disp, 0), 0);
12299 op1 = XEXP (XEXP (disp, 0), 1);
12300 if (!CONST_INT_P (op1)
12301 || INTVAL (op1) >= 16*1024*1024
12302 || INTVAL (op1) < -16*1024*1024)
12303 break;
12304 if (GET_CODE (op0) == LABEL_REF)
12305 return true;
12306 if (GET_CODE (op0) == CONST
12307 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12308 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12309 return true;
12310 if (GET_CODE (op0) == UNSPEC
12311 && XINT (op0, 1) == UNSPEC_PCREL)
12312 return true;
12313 if (GET_CODE (op0) != SYMBOL_REF)
12314 break;
12315 /* FALLTHRU */
12316
12317 case SYMBOL_REF:
/* TLS references should always be enclosed in UNSPEC.
The dllimported symbol always needs to be resolved. */
12320 if (SYMBOL_REF_TLS_MODEL (op0)
12321 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
12322 return false;
12323
12324 if (TARGET_PECOFF)
12325 {
12326 if (is_imported_p (op0))
12327 return true;
12328
12329 if (SYMBOL_REF_FAR_ADDR_P (op0)
12330 || !SYMBOL_REF_LOCAL_P (op0))
12331 break;
12332
/* Function symbols need to be resolved only for
the large model.
For the small model we don't need to resolve anything
here. */
12337 if ((ix86_cmodel != CM_LARGE_PIC
12338 && SYMBOL_REF_FUNCTION_P (op0))
12339 || ix86_cmodel == CM_SMALL_PIC)
12340 return true;
/* Non-external symbols don't need to be resolved for
the large and medium models. */
12343 if ((ix86_cmodel == CM_LARGE_PIC
12344 || ix86_cmodel == CM_MEDIUM_PIC)
12345 && !SYMBOL_REF_EXTERNAL_P (op0))
12346 return true;
12347 }
12348 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
12349 && SYMBOL_REF_LOCAL_P (op0)
12350 && ix86_cmodel != CM_LARGE_PIC)
12351 return true;
12352 break;
12353
12354 default:
12355 break;
12356 }
12357 }
12358 if (GET_CODE (disp) != CONST)
12359 return false;
12360 disp = XEXP (disp, 0);
12361
12362 if (TARGET_64BIT)
12363 {
/* It is unsafe to allow PLUS expressions. This limits the allowed distance
of GOT table references. We should not need these anyway. */
12366 if (GET_CODE (disp) != UNSPEC
12367 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12368 && XINT (disp, 1) != UNSPEC_GOTOFF
12369 && XINT (disp, 1) != UNSPEC_PCREL
12370 && XINT (disp, 1) != UNSPEC_PLTOFF))
12371 return false;
12372
12373 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12374 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12375 return false;
12376 return true;
12377 }
12378
12379 saw_plus = false;
12380 if (GET_CODE (disp) == PLUS)
12381 {
12382 if (!CONST_INT_P (XEXP (disp, 1)))
12383 return false;
12384 disp = XEXP (disp, 0);
12385 saw_plus = true;
12386 }
12387
12388 if (TARGET_MACHO && darwin_local_data_pic (disp))
12389 return true;
12390
12391 if (GET_CODE (disp) != UNSPEC)
12392 return false;
12393
12394 switch (XINT (disp, 1))
12395 {
12396 case UNSPEC_GOT:
12397 if (saw_plus)
12398 return false;
12399 /* We need to check for both symbols and labels because VxWorks loads
12400 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12401 details. */
12402 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12403 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12404 case UNSPEC_GOTOFF:
/* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
While the ABI also specifies a 32bit relocation, we don't produce it in
the small PIC model at all. */
12408 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12409 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12410 && !TARGET_64BIT)
12411 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12412 return false;
12413 case UNSPEC_GOTTPOFF:
12414 case UNSPEC_GOTNTPOFF:
12415 case UNSPEC_INDNTPOFF:
12416 if (saw_plus)
12417 return false;
12418 disp = XVECEXP (disp, 0, 0);
12419 return (GET_CODE (disp) == SYMBOL_REF
12420 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12421 case UNSPEC_NTPOFF:
12422 disp = XVECEXP (disp, 0, 0);
12423 return (GET_CODE (disp) == SYMBOL_REF
12424 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12425 case UNSPEC_DTPOFF:
12426 disp = XVECEXP (disp, 0, 0);
12427 return (GET_CODE (disp) == SYMBOL_REF
12428 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12429 }
12430
12431 return false;
12432 }
12433
12434 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12435 replace the input X, or the original X if no replacement is called for.
12436 The output parameter *WIN is 1 if the calling macro should goto WIN,
12437 0 if it should not. */
12438
12439 bool
12440 ix86_legitimize_reload_address (rtx x,
12441 enum machine_mode mode ATTRIBUTE_UNUSED,
12442 int opnum, int type,
12443 int ind_levels ATTRIBUTE_UNUSED)
12444 {
12445 /* Reload can generate:
12446
12447 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12448 (reg:DI 97))
12449 (reg:DI 2 cx))
12450
This RTX is rejected by ix86_legitimate_address_p due to
non-strictness of base register 97. Following this rejection,
reload pushes all three components into separate registers,
creating an invalid memory address RTX.

The following code reloads only the invalid part of the
memory address RTX. */
12458
12459 if (GET_CODE (x) == PLUS
12460 && REG_P (XEXP (x, 1))
12461 && GET_CODE (XEXP (x, 0)) == PLUS
12462 && REG_P (XEXP (XEXP (x, 0), 1)))
12463 {
12464 rtx base, index;
12465 bool something_reloaded = false;
12466
12467 base = XEXP (XEXP (x, 0), 1);
12468 if (!REG_OK_FOR_BASE_STRICT_P (base))
12469 {
12470 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12471 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12472 opnum, (enum reload_type) type);
12473 something_reloaded = true;
12474 }
12475
12476 index = XEXP (x, 1);
12477 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12478 {
12479 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12480 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12481 opnum, (enum reload_type) type);
12482 something_reloaded = true;
12483 }
12484
12485 gcc_assert (something_reloaded);
12486 return true;
12487 }
12488
12489 return false;
12490 }
12491
12492 /* Recognizes RTL expressions that are valid memory addresses for an
12493 instruction. The MODE argument is the machine mode for the MEM
12494 expression that wants to use this address.
12495
It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12497 convert common non-canonical forms to canonical form so that they will
12498 be recognized. */
12499
12500 static bool
12501 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12502 rtx addr, bool strict)
12503 {
12504 struct ix86_address parts;
12505 rtx base, index, disp;
12506 HOST_WIDE_INT scale;
12507
12508 if (ix86_decompose_address (addr, &parts) <= 0)
12509 /* Decomposition failed. */
12510 return false;
12511
12512 base = parts.base;
12513 index = parts.index;
12514 disp = parts.disp;
12515 scale = parts.scale;
12516
12517 /* Validate base register. */
12518 if (base)
12519 {
12520 rtx reg;
12521
12522 if (REG_P (base))
12523 reg = base;
12524 else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base)))
12525 reg = SUBREG_REG (base);
12526 else
12527 /* Base is not a register. */
12528 return false;
12529
12530 if (GET_MODE (base) != SImode && GET_MODE (base) != DImode)
12531 return false;
12532
12533 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12534 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12535 /* Base is not valid. */
12536 return false;
12537 }
12538
12539 /* Validate index register. */
12540 if (index)
12541 {
12542 rtx reg;
12543
12544 if (REG_P (index))
12545 reg = index;
12546 else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index)))
12547 reg = SUBREG_REG (index);
12548 else
12549 /* Index is not a register. */
12550 return false;
12551
12552 if (GET_MODE (index) != SImode && GET_MODE (index) != DImode)
12553 return false;
12554
12555 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12556 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12557 /* Index is not valid. */
12558 return false;
12559 }
12560
12561 /* Index and base should have the same mode. */
12562 if (base && index
12563 && GET_MODE (base) != GET_MODE (index))
12564 return false;
12565
12566 /* Validate scale factor. */
12567 if (scale != 1)
12568 {
12569 if (!index)
12570 /* Scale without index. */
12571 return false;
12572
12573 if (scale != 2 && scale != 4 && scale != 8)
12574 /* Scale is not a valid multiplier. */
12575 return false;
12576 }
12577
12578 /* Validate displacement. */
12579 if (disp)
12580 {
12581 if (GET_CODE (disp) == CONST
12582 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12583 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12584 switch (XINT (XEXP (disp, 0), 1))
12585 {
/* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
used. While the ABI also specifies 32bit relocations, we don't produce
them at all and use IP-relative addressing instead. */
12589 case UNSPEC_GOT:
12590 case UNSPEC_GOTOFF:
12591 gcc_assert (flag_pic);
12592 if (!TARGET_64BIT)
12593 goto is_legitimate_pic;
12594
12595 /* 64bit address unspec. */
12596 return false;
12597
12598 case UNSPEC_GOTPCREL:
12599 case UNSPEC_PCREL:
12600 gcc_assert (flag_pic);
12601 goto is_legitimate_pic;
12602
12603 case UNSPEC_GOTTPOFF:
12604 case UNSPEC_GOTNTPOFF:
12605 case UNSPEC_INDNTPOFF:
12606 case UNSPEC_NTPOFF:
12607 case UNSPEC_DTPOFF:
12608 break;
12609
12610 case UNSPEC_STACK_CHECK:
12611 gcc_assert (flag_split_stack);
12612 break;
12613
12614 default:
12615 /* Invalid address unspec. */
12616 return false;
12617 }
12618
12619 else if (SYMBOLIC_CONST (disp)
12620 && (flag_pic
12621 || (TARGET_MACHO
12622 #if TARGET_MACHO
12623 && MACHOPIC_INDIRECT
12624 && !machopic_operand_p (disp)
12625 #endif
12626 )))
12627 {
12628
12629 is_legitimate_pic:
12630 if (TARGET_64BIT && (index || base))
12631 {
12632 /* foo@dtpoff(%rX) is ok. */
12633 if (GET_CODE (disp) != CONST
12634 || GET_CODE (XEXP (disp, 0)) != PLUS
12635 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12636 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12637 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12638 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12639 /* Non-constant pic memory reference. */
12640 return false;
12641 }
12642 else if ((!TARGET_MACHO || flag_pic)
12643 && ! legitimate_pic_address_disp_p (disp))
12644 /* Displacement is an invalid pic construct. */
12645 return false;
12646 #if TARGET_MACHO
12647 else if (MACHO_DYNAMIC_NO_PIC_P
12648 && !ix86_legitimate_constant_p (Pmode, disp))
/* displacement must be referenced via non_lazy_pointer */
12650 return false;
12651 #endif
12652
/* This code used to verify that a symbolic pic displacement
includes the pic_offset_table_rtx register.

While this is a good idea, unfortunately these constructs may
be created by the "adds using lea" optimization for incorrect
code like:

int a;
int foo(int i)
{
return *(&a+i);
}

This code is nonsensical, but results in addressing the
GOT table with a pic_offset_table_rtx base. We can't
just refuse it easily, since it gets matched by the
"addsi3" pattern, which later gets split to lea in the
case the output register differs from the input. While this
could be handled by a separate addsi pattern for this case
that never results in lea, disabling this test seems to be the
easier and correct fix for the crash. */
12674 }
12675 else if (GET_CODE (disp) != LABEL_REF
12676 && !CONST_INT_P (disp)
12677 && (GET_CODE (disp) != CONST
12678 || !ix86_legitimate_constant_p (Pmode, disp))
12679 && (GET_CODE (disp) != SYMBOL_REF
12680 || !ix86_legitimate_constant_p (Pmode, disp)))
12681 /* Displacement is not constant. */
12682 return false;
12683 else if (TARGET_64BIT
12684 && !x86_64_immediate_operand (disp, VOIDmode))
12685 /* Displacement is out of range. */
12686 return false;
12687 }
12688
12689 /* Everything looks valid. */
12690 return true;
12691 }
12692
12693 /* Determine if a given RTX is a valid constant address. */
12694
12695 bool
12696 constant_address_p (rtx x)
12697 {
12698 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12699 }
12700 \f
12701 /* Return a unique alias set for the GOT. */
12702
12703 static alias_set_type
12704 ix86_GOT_alias_set (void)
12705 {
12706 static alias_set_type set = -1;
12707 if (set == -1)
12708 set = new_alias_set ();
12709 return set;
12710 }
12711
12712 /* Return a legitimate reference for ORIG (an address) using the
12713 register REG. If REG is 0, a new pseudo is generated.
12714
12715 There are two types of references that must be handled:
12716
12717 1. Global data references must load the address from the GOT, via
12718 the PIC reg. An insn is emitted to do this load, and the reg is
12719 returned.
12720
12721 2. Static data references, constant pool addresses, and code labels
12722 compute the address as an offset from the GOT, whose base is in
12723 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12724 differentiate them from global data objects. The returned
12725 address is the PIC reg + an unspec constant.
12726
12727 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12728 reg also appears in the address. */
12729
12730 static rtx
12731 legitimize_pic_address (rtx orig, rtx reg)
12732 {
12733 rtx addr = orig;
12734 rtx new_rtx = orig;
12735
12736 #if TARGET_MACHO
12737 if (TARGET_MACHO && !TARGET_64BIT)
12738 {
12739 if (reg == 0)
12740 reg = gen_reg_rtx (Pmode);
12741 /* Use the generic Mach-O PIC machinery. */
12742 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12743 }
12744 #endif
12745
12746 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12747 {
12748 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12749 if (tmp)
12750 return tmp;
12751 }
12752
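/* In 64bit mode a symbol that is already a legitimate PIC displacement
can be used directly (RIP-relative), so no transformation is needed. */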
12753 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12754 new_rtx = addr;
12755 else if (TARGET_64BIT && !TARGET_PECOFF
12756 && ix86_cmodel != CM_SMALL_PIC && gotoff_operand (addr, Pmode))
12757 {
12758 rtx tmpreg;
12759 /* This symbol may be referenced via a displacement from the PIC
12760 base address (@GOTOFF). */
12761
12762 if (reload_in_progress)
12763 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12764 if (GET_CODE (addr) == CONST)
12765 addr = XEXP (addr, 0);
12766 if (GET_CODE (addr) == PLUS)
12767 {
12768 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12769 UNSPEC_GOTOFF);
12770 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12771 }
12772 else
12773 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12774 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12775 if (!reg)
12776 tmpreg = gen_reg_rtx (Pmode);
12777 else
12778 tmpreg = reg;
12779 emit_move_insn (tmpreg, new_rtx);
12780
12781 if (reg != 0)
12782 {
12783 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12784 tmpreg, 1, OPTAB_DIRECT);
12785 new_rtx = reg;
12786 }
12787 else
12788 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12789 }
12790 else if (!TARGET_64BIT && !TARGET_PECOFF && gotoff_operand (addr, Pmode))
12791 {
12792 /* This symbol may be referenced via a displacement from the PIC
12793 base address (@GOTOFF). */
12794
12795 if (reload_in_progress)
12796 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12797 if (GET_CODE (addr) == CONST)
12798 addr = XEXP (addr, 0);
12799 if (GET_CODE (addr) == PLUS)
12800 {
12801 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12802 UNSPEC_GOTOFF);
12803 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12804 }
12805 else
12806 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12807 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12808 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12809
12810 if (reg != 0)
12811 {
12812 emit_move_insn (reg, new_rtx);
12813 new_rtx = reg;
12814 }
12815 }
12816 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12817 /* We can't use @GOTOFF for text labels on VxWorks;
12818 see gotoff_operand. */
12819 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12820 {
12821 rtx tmp = legitimize_pe_coff_symbol (addr, true);
12822 if (tmp)
12823 return tmp;
12824
/* For x64 PE-COFF there is no GOT table, so we use the address
directly. */
12827 if (TARGET_64BIT && TARGET_PECOFF)
12828 {
12829 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12830 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12831
12832 if (reg == 0)
12833 reg = gen_reg_rtx (Pmode);
12834 emit_move_insn (reg, new_rtx);
12835 new_rtx = reg;
12836 }
12837 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12838 {
12839 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12840 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12841 new_rtx = gen_const_mem (Pmode, new_rtx);
12842 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12843
12844 if (reg == 0)
12845 reg = gen_reg_rtx (Pmode);
/* Use gen_movsi directly, otherwise the address is loaded
into a register for CSE. We don't want to CSE these addresses;
instead we CSE addresses from the GOT table, so skip this. */
12849 emit_insn (gen_movsi (reg, new_rtx));
12850 new_rtx = reg;
12851 }
12852 else
12853 {
12854 /* This symbol must be referenced via a load from the
12855 Global Offset Table (@GOT). */
12856
12857 if (reload_in_progress)
12858 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12859 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12860 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12861 if (TARGET_64BIT)
12862 new_rtx = force_reg (Pmode, new_rtx);
12863 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12864 new_rtx = gen_const_mem (Pmode, new_rtx);
12865 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12866
12867 if (reg == 0)
12868 reg = gen_reg_rtx (Pmode);
12869 emit_move_insn (reg, new_rtx);
12870 new_rtx = reg;
12871 }
12872 }
12873 else
12874 {
12875 if (CONST_INT_P (addr)
12876 && !x86_64_immediate_operand (addr, VOIDmode))
12877 {
12878 if (reg)
12879 {
12880 emit_move_insn (reg, addr);
12881 new_rtx = reg;
12882 }
12883 else
12884 new_rtx = force_reg (Pmode, addr);
12885 }
12886 else if (GET_CODE (addr) == CONST)
12887 {
12888 addr = XEXP (addr, 0);
12889
/* We must match what we generated before. Assume the only
unspecs that can get here are ours. Not that we could do
anything with them anyway... */
12893 if (GET_CODE (addr) == UNSPEC
12894 || (GET_CODE (addr) == PLUS
12895 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12896 return orig;
12897 gcc_assert (GET_CODE (addr) == PLUS);
12898 }
12899 if (GET_CODE (addr) == PLUS)
12900 {
12901 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12902
12903 /* Check first to see if this is a constant offset from a @GOTOFF
12904 symbol reference. */
12905 if (!TARGET_PECOFF && gotoff_operand (op0, Pmode)
12906 && CONST_INT_P (op1))
12907 {
12908 if (!TARGET_64BIT)
12909 {
12910 if (reload_in_progress)
12911 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12912 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12913 UNSPEC_GOTOFF);
12914 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12915 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12916 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12917
12918 if (reg != 0)
12919 {
12920 emit_move_insn (reg, new_rtx);
12921 new_rtx = reg;
12922 }
12923 }
12924 else
12925 {
12926 if (INTVAL (op1) < -16*1024*1024
12927 || INTVAL (op1) >= 16*1024*1024)
12928 {
12929 if (!x86_64_immediate_operand (op1, Pmode))
12930 op1 = force_reg (Pmode, op1);
12931 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12932 }
12933 }
12934 }
12935 else
12936 {
12937 rtx base = legitimize_pic_address (op0, reg);
12938 enum machine_mode mode = GET_MODE (base);
12939 new_rtx
12940 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12941
12942 if (CONST_INT_P (new_rtx))
12943 {
12944 if (INTVAL (new_rtx) < -16*1024*1024
12945 || INTVAL (new_rtx) >= 16*1024*1024)
12946 {
12947 if (!x86_64_immediate_operand (new_rtx, mode))
12948 new_rtx = force_reg (mode, new_rtx);
12949 new_rtx
12950 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12951 }
12952 else
12953 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12954 }
12955 else
12956 {
12957 if (GET_CODE (new_rtx) == PLUS
12958 && CONSTANT_P (XEXP (new_rtx, 1)))
12959 {
12960 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12961 new_rtx = XEXP (new_rtx, 1);
12962 }
12963 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12964 }
12965 }
12966 }
12967 }
12968 return new_rtx;
12969 }
12970 \f
12971 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12972
12973 static rtx
12974 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12975 {
12976 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12977
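/* The UNSPEC_TP reference is created in ptr_mode (SImode for x32);
zero-extend it when a DImode value is requested. */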
12978 if (GET_MODE (tp) != tp_mode)
12979 {
12980 gcc_assert (GET_MODE (tp) == SImode);
12981 gcc_assert (tp_mode == DImode);
12982
12983 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12984 }
12985
12986 if (to_reg)
12987 tp = copy_to_mode_reg (tp_mode, tp);
12988
12989 return tp;
12990 }
12991
12992 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12993
12994 static GTY(()) rtx ix86_tls_symbol;
12995
12996 static rtx
12997 ix86_tls_get_addr (void)
12998 {
12999 if (!ix86_tls_symbol)
13000 {
13001 const char *sym
13002 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
13003 ? "___tls_get_addr" : "__tls_get_addr");
13004
13005 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
13006 }
13007
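/* In the large PIC model the address of __tls_get_addr itself must be
formed as pic_offset_table_rtx plus an @PLTOFF reference. */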
13008 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
13009 {
13010 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
13011 UNSPEC_PLTOFF);
13012 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
13013 gen_rtx_CONST (Pmode, unspec));
13014 }
13015
13016 return ix86_tls_symbol;
13017 }
13018
13019 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
13020
13021 static GTY(()) rtx ix86_tls_module_base_symbol;
13022
13023 rtx
13024 ix86_tls_module_base (void)
13025 {
13026 if (!ix86_tls_module_base_symbol)
13027 {
13028 ix86_tls_module_base_symbol
13029 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
13030
13031 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
13032 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
13033 }
13034
13035 return ix86_tls_module_base_symbol;
13036 }
13037
13038 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
13039 false if we expect this to be used for a memory address and true if
13040 we expect to load the address into a register. */
13041
13042 static rtx
13043 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
13044 {
13045 rtx dest, base, off;
13046 rtx pic = NULL_RTX, tp = NULL_RTX;
13047 enum machine_mode tp_mode = Pmode;
13048 int type;
13049
13050 switch (model)
13051 {
13052 case TLS_MODEL_GLOBAL_DYNAMIC:
13053 dest = gen_reg_rtx (Pmode);
13054
13055 if (!TARGET_64BIT)
13056 {
13057 if (flag_pic && !TARGET_PECOFF)
13058 pic = pic_offset_table_rtx;
13059 else
13060 {
13061 pic = gen_reg_rtx (Pmode);
13062 emit_insn (gen_set_got (pic));
13063 }
13064 }
13065
13066 if (TARGET_GNU2_TLS)
13067 {
13068 if (TARGET_64BIT)
13069 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
13070 else
13071 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
13072
13073 tp = get_thread_pointer (Pmode, true);
13074 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
13075
13076 if (GET_MODE (x) != Pmode)
13077 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13078
13079 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13080 }
13081 else
13082 {
13083 rtx caddr = ix86_tls_get_addr ();
13084
13085 if (TARGET_64BIT)
13086 {
13087 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13088 rtx insns;
13089
13090 start_sequence ();
13091 emit_call_insn
13092 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
13093 insns = get_insns ();
13094 end_sequence ();
13095
13096 if (GET_MODE (x) != Pmode)
13097 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13098
13099 RTL_CONST_CALL_P (insns) = 1;
13100 emit_libcall_block (insns, dest, rax, x);
13101 }
13102 else
13103 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
13104 }
13105 break;
13106
13107 case TLS_MODEL_LOCAL_DYNAMIC:
13108 base = gen_reg_rtx (Pmode);
13109
13110 if (!TARGET_64BIT)
13111 {
13112 if (flag_pic)
13113 pic = pic_offset_table_rtx;
13114 else
13115 {
13116 pic = gen_reg_rtx (Pmode);
13117 emit_insn (gen_set_got (pic));
13118 }
13119 }
13120
13121 if (TARGET_GNU2_TLS)
13122 {
13123 rtx tmp = ix86_tls_module_base ();
13124
13125 if (TARGET_64BIT)
13126 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
13127 else
13128 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
13129
13130 tp = get_thread_pointer (Pmode, true);
13131 set_unique_reg_note (get_last_insn (), REG_EQUAL,
13132 gen_rtx_MINUS (Pmode, tmp, tp));
13133 }
13134 else
13135 {
13136 rtx caddr = ix86_tls_get_addr ();
13137
13138 if (TARGET_64BIT)
13139 {
13140 rtx rax = gen_rtx_REG (Pmode, AX_REG);
13141 rtx insns, eqv;
13142
13143 start_sequence ();
13144 emit_call_insn
13145 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
13146 insns = get_insns ();
13147 end_sequence ();
13148
13149 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
13150 share the LD_BASE result with other LD model accesses. */
13151 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
13152 UNSPEC_TLS_LD_BASE);
13153
13154 RTL_CONST_CALL_P (insns) = 1;
13155 emit_libcall_block (insns, base, rax, eqv);
13156 }
13157 else
13158 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
13159 }
13160
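/* Add the @dtpoff offset of X to the module base computed above. */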
13161 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
13162 off = gen_rtx_CONST (Pmode, off);
13163
13164 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
13165
13166 if (TARGET_GNU2_TLS)
13167 {
13168 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
13169
13170 if (GET_MODE (x) != Pmode)
13171 x = gen_rtx_ZERO_EXTEND (Pmode, x);
13172
13173 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
13174 }
13175 break;
13176
13177 case TLS_MODEL_INITIAL_EXEC:
13178 if (TARGET_64BIT)
13179 {
13180 if (TARGET_SUN_TLS && !TARGET_X32)
13181 {
13182 /* The Sun linker took the AMD64 TLS spec literally
13183 and can only handle %rax as destination of the
13184 initial executable code sequence. */
13185
13186 dest = gen_reg_rtx (DImode);
13187 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
13188 return dest;
13189 }
13190
/* Generate DImode references to avoid %fs:(%reg32)
problems and the linker IE->LE relaxation bug. */
13193 tp_mode = DImode;
13194 pic = NULL;
13195 type = UNSPEC_GOTNTPOFF;
13196 }
13197 else if (flag_pic)
13198 {
13199 if (reload_in_progress)
13200 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
13201 pic = pic_offset_table_rtx;
13202 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
13203 }
13204 else if (!TARGET_ANY_GNU_TLS)
13205 {
13206 pic = gen_reg_rtx (Pmode);
13207 emit_insn (gen_set_got (pic));
13208 type = UNSPEC_GOTTPOFF;
13209 }
13210 else
13211 {
13212 pic = NULL;
13213 type = UNSPEC_INDNTPOFF;
13214 }
13215
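/* Load the TP offset from the GOT (or via @indntpoff) and combine it
with the thread pointer: added for 64bit or GNU TLS, subtracted
otherwise. */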
13216 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
13217 off = gen_rtx_CONST (tp_mode, off);
13218 if (pic)
13219 off = gen_rtx_PLUS (tp_mode, pic, off);
13220 off = gen_const_mem (tp_mode, off);
13221 set_mem_alias_set (off, ix86_GOT_alias_set ());
13222
13223 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13224 {
13225 base = get_thread_pointer (tp_mode,
13226 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13227 off = force_reg (tp_mode, off);
13228 return gen_rtx_PLUS (tp_mode, base, off);
13229 }
13230 else
13231 {
13232 base = get_thread_pointer (Pmode, true);
13233 dest = gen_reg_rtx (Pmode);
13234 emit_insn (ix86_gen_sub3 (dest, base, off));
13235 }
13236 break;
13237
13238 case TLS_MODEL_LOCAL_EXEC:
13239 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
13240 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13241 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
13242 off = gen_rtx_CONST (Pmode, off);
13243
13244 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
13245 {
13246 base = get_thread_pointer (Pmode,
13247 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
13248 return gen_rtx_PLUS (Pmode, base, off);
13249 }
13250 else
13251 {
13252 base = get_thread_pointer (Pmode, true);
13253 dest = gen_reg_rtx (Pmode);
13254 emit_insn (ix86_gen_sub3 (dest, base, off));
13255 }
13256 break;
13257
13258 default:
13259 gcc_unreachable ();
13260 }
13261
13262 return dest;
13263 }
13264
13265 /* Create or return the unique __imp_DECL dllimport symbol corresponding
13266 to symbol DECL if BEIMPORT is true. Otherwise create or return the
13267 unique refptr-DECL symbol corresponding to symbol DECL. */
13268
13269 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
13270 htab_t dllimport_map;
13271
13272 static tree
13273 get_dllimport_decl (tree decl, bool beimport)
13274 {
13275 struct tree_map *h, in;
13276 void **loc;
13277 const char *name;
13278 const char *prefix;
13279 size_t namelen, prefixlen;
13280 char *imp_name;
13281 tree to;
13282 rtx rtl;
13283
13284 if (!dllimport_map)
13285 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13286
13287 in.hash = htab_hash_pointer (decl);
13288 in.base.from = decl;
13289 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13290 h = (struct tree_map *) *loc;
13291 if (h)
13292 return h->to;
13293
13294 *loc = h = ggc_alloc_tree_map ();
13295 h->hash = in.hash;
13296 h->base.from = decl;
13297 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13298 VAR_DECL, NULL, ptr_type_node);
13299 DECL_ARTIFICIAL (to) = 1;
13300 DECL_IGNORED_P (to) = 1;
13301 DECL_EXTERNAL (to) = 1;
13302 TREE_READONLY (to) = 1;
13303
13304 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13305 name = targetm.strip_name_encoding (name);
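/* Pick the prefix for the indirection symbol. The leading '*' keeps the
assembler name from receiving another user label prefix, so the extra
'_' in "*__imp__" supplies the prefix that NAME would otherwise have
received. */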
13306 if (beimport)
13307 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13308 ? "*__imp_" : "*__imp__";
13309 else
13310 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
13311 namelen = strlen (name);
13312 prefixlen = strlen (prefix);
13313 imp_name = (char *) alloca (namelen + prefixlen + 1);
13314 memcpy (imp_name, prefix, prefixlen);
13315 memcpy (imp_name + prefixlen, name, namelen + 1);
13316
13317 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13318 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13319 SET_SYMBOL_REF_DECL (rtl, to);
13320 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
13321 if (!beimport)
13322 {
13323 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
13324 #ifdef SUB_TARGET_RECORD_STUB
13325 SUB_TARGET_RECORD_STUB (name);
13326 #endif
13327 }
13328
13329 rtl = gen_const_mem (Pmode, rtl);
13330 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13331
13332 SET_DECL_RTL (to, rtl);
13333 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13334
13335 return to;
13336 }
13337
/* Expand SYMBOL into its corresponding far-address symbol.
WANT_REG is true if we require the result to be a register. */
13340
13341 static rtx
13342 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
13343 {
13344 tree imp_decl;
13345 rtx x;
13346
13347 gcc_assert (SYMBOL_REF_DECL (symbol));
13348 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
13349
13350 x = DECL_RTL (imp_decl);
13351 if (want_reg)
13352 x = force_reg (Pmode, x);
13353 return x;
13354 }
13355
/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
true if we require the result to be a register. */
13358
13359 static rtx
13360 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13361 {
13362 tree imp_decl;
13363 rtx x;
13364
13365 gcc_assert (SYMBOL_REF_DECL (symbol));
13366 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
13367
13368 x = DECL_RTL (imp_decl);
13369 if (want_reg)
13370 x = force_reg (Pmode, x);
13371 return x;
13372 }
13373
/* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
is true if we require the result to be a register. */
13376
13377 static rtx
13378 legitimize_pe_coff_symbol (rtx addr, bool inreg)
13379 {
13380 if (!TARGET_PECOFF)
13381 return NULL_RTX;
13382
13383 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13384 {
13385 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
13386 return legitimize_dllimport_symbol (addr, inreg);
13387 if (GET_CODE (addr) == CONST
13388 && GET_CODE (XEXP (addr, 0)) == PLUS
13389 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13390 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
13391 {
13392 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
13393 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13394 }
13395 }
13396
13397 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
13398 return NULL_RTX;
13399 if (GET_CODE (addr) == SYMBOL_REF
13400 && !is_imported_p (addr)
13401 && SYMBOL_REF_EXTERNAL_P (addr)
13402 && SYMBOL_REF_DECL (addr))
13403 return legitimize_pe_coff_extern_decl (addr, inreg);
13404
13405 if (GET_CODE (addr) == CONST
13406 && GET_CODE (XEXP (addr, 0)) == PLUS
13407 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
13408 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
13409 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
13410 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
13411 {
13412 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
13413 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
13414 }
13415 return NULL_RTX;
13416 }
13417
13418 /* Try machine-dependent ways of modifying an illegitimate address
13419 to be legitimate. If we find one, return the new, valid address.
13420 This macro is used in only one place: `memory_address' in explow.c.
13421
13422 OLDX is the address as it was before break_out_memory_refs was called.
13423 In some cases it is useful to look at this to decide what needs to be done.
13424
13425 It is always safe for this macro to do nothing. It exists to recognize
13426 opportunities to optimize the output.
13427
13428 For the 80386, we handle X+REG by loading X into a register R and
13429 using R+REG. R will go in a general reg and indexing will be used.
13430 However, if REG is a broken-out memory address or multiplication,
13431 nothing needs to be done because REG can certainly go in a general reg.
13432
13433 When -fpic is used, special handling is needed for symbolic references.
13434 See comments by legitimize_pic_address in i386.c for details. */
13435
13436 static rtx
13437 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13438 enum machine_mode mode)
13439 {
13440 int changed = 0;
13441 unsigned log;
13442
13443 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13444 if (log)
13445 return legitimize_tls_address (x, (enum tls_model) log, false);
13446 if (GET_CODE (x) == CONST
13447 && GET_CODE (XEXP (x, 0)) == PLUS
13448 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13449 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13450 {
13451 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13452 (enum tls_model) log, false);
13453 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13454 }
13455
13456 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13457 {
13458 rtx tmp = legitimize_pe_coff_symbol (x, true);
13459 if (tmp)
13460 return tmp;
13461 }
13462
13463 if (flag_pic && SYMBOLIC_CONST (x))
13464 return legitimize_pic_address (x, 0);
13465
13466 #if TARGET_MACHO
13467 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13468 return machopic_indirect_data_reference (x, 0);
13469 #endif
13470
/* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13472 if (GET_CODE (x) == ASHIFT
13473 && CONST_INT_P (XEXP (x, 1))
13474 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13475 {
13476 changed = 1;
13477 log = INTVAL (XEXP (x, 1));
13478 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13479 GEN_INT (1 << log));
13480 }
13481
13482 if (GET_CODE (x) == PLUS)
13483 {
13484 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13485
13486 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13487 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13488 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13489 {
13490 changed = 1;
13491 log = INTVAL (XEXP (XEXP (x, 0), 1));
13492 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13493 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13494 GEN_INT (1 << log));
13495 }
13496
13497 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13498 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13499 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13500 {
13501 changed = 1;
13502 log = INTVAL (XEXP (XEXP (x, 1), 1));
13503 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13504 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13505 GEN_INT (1 << log));
13506 }
13507
13508 /* Put multiply first if it isn't already. */
13509 if (GET_CODE (XEXP (x, 1)) == MULT)
13510 {
13511 rtx tmp = XEXP (x, 0);
13512 XEXP (x, 0) = XEXP (x, 1);
13513 XEXP (x, 1) = tmp;
13514 changed = 1;
13515 }
13516
13517 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13518 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13519 created by virtual register instantiation, register elimination, and
13520 similar optimizations. */
13521 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13522 {
13523 changed = 1;
13524 x = gen_rtx_PLUS (Pmode,
13525 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13526 XEXP (XEXP (x, 1), 0)),
13527 XEXP (XEXP (x, 1), 1));
13528 }
13529
13530 /* Canonicalize
13531 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13532 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13533 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13534 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13535 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13536 && CONSTANT_P (XEXP (x, 1)))
13537 {
13538 rtx constant;
13539 rtx other = NULL_RTX;
13540
13541 if (CONST_INT_P (XEXP (x, 1)))
13542 {
13543 constant = XEXP (x, 1);
13544 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13545 }
13546 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13547 {
13548 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13549 other = XEXP (x, 1);
13550 }
13551 else
13552 constant = 0;
13553
13554 if (constant)
13555 {
13556 changed = 1;
13557 x = gen_rtx_PLUS (Pmode,
13558 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13559 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13560 plus_constant (Pmode, other,
13561 INTVAL (constant)));
13562 }
13563 }
13564
13565 if (changed && ix86_legitimate_address_p (mode, x, false))
13566 return x;
13567
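/* Force any remaining MULT operands into registers so the address
can become a simple reg+reg form. */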
13568 if (GET_CODE (XEXP (x, 0)) == MULT)
13569 {
13570 changed = 1;
13571 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13572 }
13573
13574 if (GET_CODE (XEXP (x, 1)) == MULT)
13575 {
13576 changed = 1;
13577 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13578 }
13579
13580 if (changed
13581 && REG_P (XEXP (x, 1))
13582 && REG_P (XEXP (x, 0)))
13583 return x;
13584
13585 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13586 {
13587 changed = 1;
13588 x = legitimize_pic_address (x, 0);
13589 }
13590
13591 if (changed && ix86_legitimate_address_p (mode, x, false))
13592 return x;
13593
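/* As a last resort, force the non-register operand into a fresh
Pmode register. */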
13594 if (REG_P (XEXP (x, 0)))
13595 {
13596 rtx temp = gen_reg_rtx (Pmode);
13597 rtx val = force_operand (XEXP (x, 1), temp);
13598 if (val != temp)
13599 {
13600 val = convert_to_mode (Pmode, val, 1);
13601 emit_move_insn (temp, val);
13602 }
13603
13604 XEXP (x, 1) = temp;
13605 return x;
13606 }
13607
13608 else if (REG_P (XEXP (x, 1)))
13609 {
13610 rtx temp = gen_reg_rtx (Pmode);
13611 rtx val = force_operand (XEXP (x, 0), temp);
13612 if (val != temp)
13613 {
13614 val = convert_to_mode (Pmode, val, 1);
13615 emit_move_insn (temp, val);
13616 }
13617
13618 XEXP (x, 0) = temp;
13619 return x;
13620 }
13621 }
13622
13623 return x;
13624 }
13625 \f
13626 /* Print an integer constant expression in assembler syntax. Addition
13627 and subtraction are the only arithmetic that may appear in these
13628 expressions. FILE is the stdio stream to write to, X is the rtx, and
13629 CODE is the operand print code from the output string. */
13630
13631 static void
13632 output_pic_addr_const (FILE *file, rtx x, int code)
13633 {
13634 char buf[256];
13635
13636 switch (GET_CODE (x))
13637 {
13638 case PC:
13639 gcc_assert (flag_pic);
13640 putc ('.', file);
13641 break;
13642
13643 case SYMBOL_REF:
13644 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13645 output_addr_const (file, x);
13646 else
13647 {
13648 const char *name = XSTR (x, 0);
13649
13650 /* Mark the decl as referenced so that cgraph will
13651 output the function. */
13652 if (SYMBOL_REF_DECL (x))
13653 mark_decl_referenced (SYMBOL_REF_DECL (x));
13654
13655 #if TARGET_MACHO
13656 if (MACHOPIC_INDIRECT
13657 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13658 name = machopic_indirection_name (x, /*stub_p=*/true);
13659 #endif
13660 assemble_name (file, name);
13661 }
13662 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
13663 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13664 fputs ("@PLT", file);
13665 break;
13666
13667 case LABEL_REF:
13668 x = XEXP (x, 0);
13669 /* FALLTHRU */
13670 case CODE_LABEL:
13671 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13672 assemble_name (asm_out_file, buf);
13673 break;
13674
13675 case CONST_INT:
13676 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13677 break;
13678
13679 case CONST:
13680 /* This used to output parentheses around the expression,
13681 but that does not work on the 386 (either ATT or BSD assembler). */
13682 output_pic_addr_const (file, XEXP (x, 0), code);
13683 break;
13684
13685 case CONST_DOUBLE:
13686 if (GET_MODE (x) == VOIDmode)
13687 {
13688 /* We can use %d if the number is <32 bits and positive. */
13689 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13690 fprintf (file, "0x%lx%08lx",
13691 (unsigned long) CONST_DOUBLE_HIGH (x),
13692 (unsigned long) CONST_DOUBLE_LOW (x));
13693 else
13694 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13695 }
13696 else
13697 /* We can't handle floating point constants;
13698 TARGET_PRINT_OPERAND must handle them. */
13699 output_operand_lossage ("floating constant misused");
13700 break;
13701
13702 case PLUS:
13703 /* Some assemblers need integer constants to appear first. */
13704 if (CONST_INT_P (XEXP (x, 0)))
13705 {
13706 output_pic_addr_const (file, XEXP (x, 0), code);
13707 putc ('+', file);
13708 output_pic_addr_const (file, XEXP (x, 1), code);
13709 }
13710 else
13711 {
13712 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13713 output_pic_addr_const (file, XEXP (x, 1), code);
13714 putc ('+', file);
13715 output_pic_addr_const (file, XEXP (x, 0), code);
13716 }
13717 break;
13718
13719 case MINUS:
13720 if (!TARGET_MACHO)
13721 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13722 output_pic_addr_const (file, XEXP (x, 0), code);
13723 putc ('-', file);
13724 output_pic_addr_const (file, XEXP (x, 1), code);
13725 if (!TARGET_MACHO)
13726 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13727 break;
13728
13729 case UNSPEC:
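/* UNSPEC_STACK_CHECK is printed entirely by
i386_asm_output_addr_const_extra. */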
13730 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13731 {
13732 bool f = i386_asm_output_addr_const_extra (file, x);
13733 gcc_assert (f);
13734 break;
13735 }
13736
13737 gcc_assert (XVECLEN (x, 0) == 1);
13738 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13739 switch (XINT (x, 1))
13740 {
13741 case UNSPEC_GOT:
13742 fputs ("@GOT", file);
13743 break;
13744 case UNSPEC_GOTOFF:
13745 fputs ("@GOTOFF", file);
13746 break;
13747 case UNSPEC_PLTOFF:
13748 fputs ("@PLTOFF", file);
13749 break;
13750 case UNSPEC_PCREL:
13751 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13752 "(%rip)" : "[rip]", file);
13753 break;
13754 case UNSPEC_GOTPCREL:
13755 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13756 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13757 break;
13758 case UNSPEC_GOTTPOFF:
13759 /* FIXME: This might be @TPOFF in Sun ld too. */
13760 fputs ("@gottpoff", file);
13761 break;
13762 case UNSPEC_TPOFF:
13763 fputs ("@tpoff", file);
13764 break;
13765 case UNSPEC_NTPOFF:
13766 if (TARGET_64BIT)
13767 fputs ("@tpoff", file);
13768 else
13769 fputs ("@ntpoff", file);
13770 break;
13771 case UNSPEC_DTPOFF:
13772 fputs ("@dtpoff", file);
13773 break;
13774 case UNSPEC_GOTNTPOFF:
13775 if (TARGET_64BIT)
13776 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13777 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13778 else
13779 fputs ("@gotntpoff", file);
13780 break;
13781 case UNSPEC_INDNTPOFF:
13782 fputs ("@indntpoff", file);
13783 break;
13784 #if TARGET_MACHO
13785 case UNSPEC_MACHOPIC_OFFSET:
13786 putc ('-', file);
13787 machopic_output_function_base_name (file);
13788 break;
13789 #endif
13790 default:
13791 output_operand_lossage ("invalid UNSPEC as operand");
13792 break;
13793 }
13794 break;
13795
13796 default:
13797 output_operand_lossage ("invalid expression as operand");
13798 }
13799 }
13800
13801 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13802 We need to emit DTP-relative relocations. */
13803
13804 static void ATTRIBUTE_UNUSED
13805 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13806 {
13807 fputs (ASM_LONG, file);
13808 output_addr_const (file, x);
13809 fputs ("@dtpoff", file);
13810 switch (size)
13811 {
13812 case 4:
13813 break;
13814 case 8:
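      /* Emit zero as the upper four bytes of the 8-byte DTP-relative value. */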
13815 fputs (", 0", file);
13816 break;
13817 default:
13818 gcc_unreachable ();
13819 }
13820 }
13821
13822 /* Return true if X is a representation of the PIC register. This copes
13823 with calls from ix86_find_base_term, where the register might have
13824 been replaced by a cselib value. */
13825
13826 static bool
13827 ix86_pic_register_p (rtx x)
13828 {
13829 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13830 return (pic_offset_table_rtx
13831 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13832 else
13833 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13834 }
13835
13836 /* Helper function for ix86_delegitimize_address.
13837 Attempt to delegitimize TLS local-exec accesses. */
13838
13839 static rtx
13840 ix86_delegitimize_tls_address (rtx orig_x)
13841 {
13842 rtx x = orig_x, unspec;
13843 struct ix86_address addr;
13844
13845 if (!TARGET_TLS_DIRECT_SEG_REFS)
13846 return orig_x;
13847 if (MEM_P (x))
13848 x = XEXP (x, 0);
13849 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13850 return orig_x;
13851 if (ix86_decompose_address (x, &addr) == 0
13852 || addr.seg != DEFAULT_TLS_SEG_REG
13853 || addr.disp == NULL_RTX
13854 || GET_CODE (addr.disp) != CONST)
13855 return orig_x;
13856 unspec = XEXP (addr.disp, 0);
13857 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13858 unspec = XEXP (unspec, 0);
13859 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13860 return orig_x;
13861 x = XVECEXP (unspec, 0, 0);
13862 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13863 if (unspec != XEXP (addr.disp, 0))
13864 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13865 if (addr.index)
13866 {
13867 rtx idx = addr.index;
13868 if (addr.scale != 1)
13869 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13870 x = gen_rtx_PLUS (Pmode, idx, x);
13871 }
13872 if (addr.base)
13873 x = gen_rtx_PLUS (Pmode, addr.base, x);
13874 if (MEM_P (orig_x))
13875 x = replace_equiv_address_nv (orig_x, x);
13876 return x;
13877 }
13878
13879 /* In the name of slightly smaller debug output, and to cater to
13880 general assembler lossage, recognize PIC+GOTOFF and turn it back
13881 into a direct symbol reference.
13882
13883 On Darwin, this is necessary to avoid a crash, because Darwin
13884 has a different PIC label for each routine but the DWARF debugging
13885 information is not associated with any particular routine, so it's
13886 necessary to remove references to the PIC label from RTL stored by
13887 the DWARF output code. */
13888
13889 static rtx
13890 ix86_delegitimize_address (rtx x)
13891 {
13892 rtx orig_x = delegitimize_mem_from_attrs (x);
13893 /* addend is NULL or some rtx if x is something+GOTOFF where
13894 something doesn't include the PIC register. */
13895 rtx addend = NULL_RTX;
13896 /* reg_addend is NULL or a multiple of some register. */
13897 rtx reg_addend = NULL_RTX;
13898 /* const_addend is NULL or a const_int. */
13899 rtx const_addend = NULL_RTX;
13900 /* This is the result, or NULL. */
13901 rtx result = NULL_RTX;
13902
13903 x = orig_x;
13904
13905 if (MEM_P (x))
13906 x = XEXP (x, 0);
13907
13908 if (TARGET_64BIT)
13909 {
13910 if (GET_CODE (x) == CONST
13911 && GET_CODE (XEXP (x, 0)) == PLUS
13912 && GET_MODE (XEXP (x, 0)) == Pmode
13913 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13914 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13915 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13916 {
13917 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13918 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13919 if (MEM_P (orig_x))
13920 x = replace_equiv_address_nv (orig_x, x);
13921 return x;
13922 }
13923
13924 if (GET_CODE (x) == CONST
13925 && GET_CODE (XEXP (x, 0)) == UNSPEC
13926 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13927 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13928 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13929 {
13930 x = XVECEXP (XEXP (x, 0), 0, 0);
13931 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13932 {
13933 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13934 GET_MODE (x), 0);
13935 if (x == NULL_RTX)
13936 return orig_x;
13937 }
13938 return x;
13939 }
13940
13941 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13942 return ix86_delegitimize_tls_address (orig_x);
13943
13944 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13945 and -mcmodel=medium -fpic. */
13946 }
13947
13948 if (GET_CODE (x) != PLUS
13949 || GET_CODE (XEXP (x, 1)) != CONST)
13950 return ix86_delegitimize_tls_address (orig_x);
13951
13952 if (ix86_pic_register_p (XEXP (x, 0)))
13953 /* %ebx + GOT/GOTOFF */
13954 ;
13955 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13956 {
13957 /* %ebx + %reg * scale + GOT/GOTOFF */
13958 reg_addend = XEXP (x, 0);
13959 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13960 reg_addend = XEXP (reg_addend, 1);
13961 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13962 reg_addend = XEXP (reg_addend, 0);
13963 else
13964 {
13965 reg_addend = NULL_RTX;
13966 addend = XEXP (x, 0);
13967 }
13968 }
13969 else
13970 addend = XEXP (x, 0);
13971
13972 x = XEXP (XEXP (x, 1), 0);
13973 if (GET_CODE (x) == PLUS
13974 && CONST_INT_P (XEXP (x, 1)))
13975 {
13976 const_addend = XEXP (x, 1);
13977 x = XEXP (x, 0);
13978 }
13979
13980 if (GET_CODE (x) == UNSPEC
13981 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13982 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13983 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13984 && !MEM_P (orig_x) && !addend)))
13985 result = XVECEXP (x, 0, 0);
13986
13987 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13988 && !MEM_P (orig_x))
13989 result = XVECEXP (x, 0, 0);
13990
13991 if (! result)
13992 return ix86_delegitimize_tls_address (orig_x);
13993
13994 if (const_addend)
13995 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13996 if (reg_addend)
13997 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13998 if (addend)
13999 {
14000 /* If the rest of original X doesn't involve the PIC register, add
14001 addend and subtract pic_offset_table_rtx. This can happen e.g.
14002 for code like:
14003 leal (%ebx, %ecx, 4), %ecx
14004 ...
14005 movl foo@GOTOFF(%ecx), %edx
14006 in which case we return (%ecx - %ebx) + foo. */
14007 if (pic_offset_table_rtx)
14008 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
14009 pic_offset_table_rtx),
14010 result);
14011 else
14012 return orig_x;
14013 }
14014 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
14015 {
14016 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
14017 if (result == NULL_RTX)
14018 return orig_x;
14019 }
14020 return result;
14021 }
14022
14023 /* If X is a machine specific address (i.e. a symbol or label being
14024 referenced as a displacement from the GOT implemented using an
14025 UNSPEC), then return the base term. Otherwise return X. */
14026
14027 rtx
14028 ix86_find_base_term (rtx x)
14029 {
14030 rtx term;
14031
14032 if (TARGET_64BIT)
14033 {
14034 if (GET_CODE (x) != CONST)
14035 return x;
14036 term = XEXP (x, 0);
14037 if (GET_CODE (term) == PLUS
14038 && (CONST_INT_P (XEXP (term, 1))
14039 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
14040 term = XEXP (term, 0);
14041 if (GET_CODE (term) != UNSPEC
14042 || (XINT (term, 1) != UNSPEC_GOTPCREL
14043 && XINT (term, 1) != UNSPEC_PCREL))
14044 return x;
14045
14046 return XVECEXP (term, 0, 0);
14047 }
14048
14049 return ix86_delegitimize_address (x);
14050 }
14051 \f
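/* Print to FILE the suffix of a setcc/cmov/jcc instruction that tests
   comparison CODE applied to a flags value of mode MODE.  If REVERSE is
   true, print the suffix for the reversed condition; FP selects the
   spellings preferred for the floating-point forms (e.g. "nbe" vs "a"). */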
14052 static void
14053 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
14054 bool fp, FILE *file)
14055 {
14056 const char *suffix;
14057
14058 if (mode == CCFPmode || mode == CCFPUmode)
14059 {
14060 code = ix86_fp_compare_code_to_integer (code);
14061 mode = CCmode;
14062 }
14063 if (reverse)
14064 code = reverse_condition (code);
14065
14066 switch (code)
14067 {
14068 case EQ:
14069 switch (mode)
14070 {
14071 case CCAmode:
14072 suffix = "a";
14073 break;
14074
14075 case CCCmode:
14076 suffix = "c";
14077 break;
14078
14079 case CCOmode:
14080 suffix = "o";
14081 break;
14082
14083 case CCSmode:
14084 suffix = "s";
14085 break;
14086
14087 default:
14088 suffix = "e";
14089 }
14090 break;
14091 case NE:
14092 switch (mode)
14093 {
14094 case CCAmode:
14095 suffix = "na";
14096 break;
14097
14098 case CCCmode:
14099 suffix = "nc";
14100 break;
14101
14102 case CCOmode:
14103 suffix = "no";
14104 break;
14105
14106 case CCSmode:
14107 suffix = "ns";
14108 break;
14109
14110 default:
14111 suffix = "ne";
14112 }
14113 break;
14114 case GT:
14115 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
14116 suffix = "g";
14117 break;
14118 case GTU:
14119 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
14120 Those same assemblers have the same but opposite lossage on cmov. */
14121 if (mode == CCmode)
14122 suffix = fp ? "nbe" : "a";
14123 else
14124 gcc_unreachable ();
14125 break;
14126 case LT:
14127 switch (mode)
14128 {
14129 case CCNOmode:
14130 case CCGOCmode:
14131 suffix = "s";
14132 break;
14133
14134 case CCmode:
14135 case CCGCmode:
14136 suffix = "l";
14137 break;
14138
14139 default:
14140 gcc_unreachable ();
14141 }
14142 break;
14143 case LTU:
14144 if (mode == CCmode)
14145 suffix = "b";
14146 else if (mode == CCCmode)
14147 suffix = "c";
14148 else
14149 gcc_unreachable ();
14150 break;
14151 case GE:
14152 switch (mode)
14153 {
14154 case CCNOmode:
14155 case CCGOCmode:
14156 suffix = "ns";
14157 break;
14158
14159 case CCmode:
14160 case CCGCmode:
14161 suffix = "ge";
14162 break;
14163
14164 default:
14165 gcc_unreachable ();
14166 }
14167 break;
14168 case GEU:
14169 if (mode == CCmode)
14170 suffix = fp ? "nb" : "ae";
14171 else if (mode == CCCmode)
14172 suffix = "nc";
14173 else
14174 gcc_unreachable ();
14175 break;
14176 case LE:
14177 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
14178 suffix = "le";
14179 break;
14180 case LEU:
14181 if (mode == CCmode)
14182 suffix = "be";
14183 else
14184 gcc_unreachable ();
14185 break;
14186 case UNORDERED:
14187 suffix = fp ? "u" : "p";
14188 break;
14189 case ORDERED:
14190 suffix = fp ? "nu" : "np";
14191 break;
14192 default:
14193 gcc_unreachable ();
14194 }
14195 fputs (suffix, file);
14196 }
14197
14198 /* Print the name of register X to FILE based on its machine mode and number.
14199 If CODE is 'w', pretend the mode is HImode.
14200 If CODE is 'b', pretend the mode is QImode.
14201 If CODE is 'k', pretend the mode is SImode.
14202 If CODE is 'q', pretend the mode is DImode.
14203 If CODE is 'x', pretend the mode is V4SFmode.
14204 If CODE is 't', pretend the mode is V8SFmode.
14205 If CODE is 'g', pretend the mode is V16SFmode.
14206 If CODE is 'h', pretend the reg is the 'high' byte register.
14207 If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.
14208 If CODE is 'd', duplicate the operand for AVX instruction.
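   For example, hard register 0 (the "ax" register) is printed as "al" for
   code 'b', "eax" for code 'k', and (in 64-bit mode) "rax" for code 'q',
   each preceded by '%' in AT&T syntax.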
14209 */
14210
14211 void
14212 print_reg (rtx x, int code, FILE *file)
14213 {
14214 const char *reg;
14215 unsigned int regno;
14216 bool duplicated = code == 'd' && TARGET_AVX;
14217
14218 if (ASSEMBLER_DIALECT == ASM_ATT)
14219 putc ('%', file);
14220
14221 if (x == pc_rtx)
14222 {
14223 gcc_assert (TARGET_64BIT);
14224 fputs ("rip", file);
14225 return;
14226 }
14227
14228 regno = true_regnum (x);
14229 gcc_assert (regno != ARG_POINTER_REGNUM
14230 && regno != FRAME_POINTER_REGNUM
14231 && regno != FLAGS_REG
14232 && regno != FPSR_REG
14233 && regno != FPCR_REG);
14234
14235 if (code == 'w' || MMX_REG_P (x))
14236 code = 2;
14237 else if (code == 'b')
14238 code = 1;
14239 else if (code == 'k')
14240 code = 4;
14241 else if (code == 'q')
14242 code = 8;
14243 else if (code == 'y')
14244 code = 3;
14245 else if (code == 'h')
14246 code = 0;
14247 else if (code == 'x')
14248 code = 16;
14249 else if (code == 't')
14250 code = 32;
14251 else if (code == 'g')
14252 code = 64;
14253 else
14254 code = GET_MODE_SIZE (GET_MODE (x));
14255
14256 /* Irritatingly, the AMD extended registers use a different naming
14257 convention from the normal registers: "r%d[bwd]". */
14258 if (REX_INT_REGNO_P (regno))
14259 {
14260 gcc_assert (TARGET_64BIT);
14261 putc ('r', file);
14262 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
14263 switch (code)
14264 {
14265 case 0:
14266 error ("extended registers have no high halves");
14267 break;
14268 case 1:
14269 putc ('b', file);
14270 break;
14271 case 2:
14272 putc ('w', file);
14273 break;
14274 case 4:
14275 putc ('d', file);
14276 break;
14277 case 8:
14278 /* no suffix */
14279 break;
14280 default:
14281 error ("unsupported operand size for extended register");
14282 break;
14283 }
14284 return;
14285 }
14286
14287 reg = NULL;
14288 switch (code)
14289 {
14290 case 3:
14291 if (STACK_TOP_P (x))
14292 {
14293 reg = "st(0)";
14294 break;
14295 }
14296 /* FALLTHRU */
14297 case 8:
14298 case 4:
14299 case 12:
14300 if (! ANY_FP_REG_P (x) && ! ANY_BND_REG_P (x))
14301 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
14302 /* FALLTHRU */
14303 case 16:
14304 case 2:
14305 normal:
14306 reg = hi_reg_name[regno];
14307 break;
14308 case 1:
14309 if (regno >= ARRAY_SIZE (qi_reg_name))
14310 goto normal;
14311 reg = qi_reg_name[regno];
14312 break;
14313 case 0:
14314 if (regno >= ARRAY_SIZE (qi_high_reg_name))
14315 goto normal;
14316 reg = qi_high_reg_name[regno];
14317 break;
14318 case 32:
14319 if (SSE_REG_P (x))
14320 {
14321 gcc_assert (!duplicated);
14322 putc ('y', file);
14323 fputs (hi_reg_name[regno] + 1, file);
14324 return;
14325 }
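      /* Non-SSE operands fall through to the 64-byte case below. */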
14326 case 64:
14327 if (SSE_REG_P (x))
14328 {
14329 gcc_assert (!duplicated);
14330 putc ('z', file);
14331 fputs (hi_reg_name[REGNO (x)] + 1, file);
14332 return;
14333 }
14334 break;
14335 default:
14336 gcc_unreachable ();
14337 }
14338
14339 fputs (reg, file);
14340 if (duplicated)
14341 {
14342 if (ASSEMBLER_DIALECT == ASM_ATT)
14343 fprintf (file, ", %%%s", reg);
14344 else
14345 fprintf (file, ", %s", reg);
14346 }
14347 }
14348
14349 /* Locate some local-dynamic symbol still in use by this function
14350 so that we can print its name in some tls_local_dynamic_base
14351 pattern. */
14352
14353 static int
14354 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14355 {
14356 rtx x = *px;
14357
14358 if (GET_CODE (x) == SYMBOL_REF
14359 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14360 {
14361 cfun->machine->some_ld_name = XSTR (x, 0);
14362 return 1;
14363 }
14364
14365 return 0;
14366 }
14367
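/* Return the name of some local-dynamic TLS symbol referenced by the
   current function, caching it in cfun->machine->some_ld_name, or NULL
   if no such symbol is referenced. */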
14368 static const char *
14369 get_some_local_dynamic_name (void)
14370 {
14371 rtx insn;
14372
14373 if (cfun->machine->some_ld_name)
14374 return cfun->machine->some_ld_name;
14375
14376 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14377 if (NONDEBUG_INSN_P (insn)
14378 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14379 return cfun->machine->some_ld_name;
14380
14381 return NULL;
14382 }
14383
14384 /* Meaning of CODE:
14385 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14386 C -- print opcode suffix for set/cmov insn.
14387 c -- like C, but print reversed condition
14388 F,f -- likewise, but for floating-point.
14389 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14390 otherwise nothing
14391 R -- print the prefix for register names.
14392 z -- print the opcode suffix for the size of the current operand.
14393 Z -- likewise, with special suffixes for x87 instructions.
14394 * -- print a star (in certain assembler syntax)
14395 A -- print an absolute memory reference.
14396 E -- print address with DImode register names if TARGET_64BIT.
14397 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14398 s -- print a shift double count, followed by the assembler's argument
14399 delimiter.
14400 b -- print the QImode name of the register for the indicated operand.
14401 %b0 would print %al if operands[0] is reg 0.
14402 w -- likewise, print the HImode name of the register.
14403 k -- likewise, print the SImode name of the register.
14404 q -- likewise, print the DImode name of the register.
14405 x -- likewise, print the V4SFmode name of the register.
14406 t -- likewise, print the V8SFmode name of the register.
14407 g -- likewise, print the V16SFmode name of the register.
14408 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14409 y -- print "st(0)" instead of "st" as a register.
14410 d -- print duplicated register operand for AVX instruction.
14411 D -- print condition for SSE cmp instruction.
14412 P -- if PIC, print an @PLT suffix.
14413 p -- print raw symbol name.
14414 X -- don't print any sort of PIC '@' suffix for a symbol.
14415 & -- print some in-use local-dynamic symbol name.
14416 H -- print a memory address offset by 8; used for sse high-parts
14417 Y -- print condition for XOP pcom* instruction.
14418 + -- print a branch hint as 'cs' or 'ds' prefix
14419 ; -- print a semicolon (after prefixes due to bug in older gas).
14420 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14421 @ -- print a segment register of thread base pointer load
14422 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
14423 ! -- print MPX prefix for jxx/call/ret instructions if required.
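   For example, "%z0" emits the 'l' opcode suffix for an SImode operand in
   AT&T syntax, while "%k1" prints operand 1 using its SImode (32-bit)
   register name.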
14424 */
14425
14426 void
14427 ix86_print_operand (FILE *file, rtx x, int code)
14428 {
14429 if (code)
14430 {
14431 switch (code)
14432 {
14433 case 'A':
14434 switch (ASSEMBLER_DIALECT)
14435 {
14436 case ASM_ATT:
14437 putc ('*', file);
14438 break;
14439
14440 case ASM_INTEL:
14441 /* Intel syntax. For absolute addresses, registers should not
14442 be surrounded by brackets. */
14443 if (!REG_P (x))
14444 {
14445 putc ('[', file);
14446 ix86_print_operand (file, x, 0);
14447 putc (']', file);
14448 return;
14449 }
14450 break;
14451
14452 default:
14453 gcc_unreachable ();
14454 }
14455
14456 ix86_print_operand (file, x, 0);
14457 return;
14458
14459 case 'E':
14460 /* Wrap address in an UNSPEC to declare special handling. */
14461 if (TARGET_64BIT)
14462 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14463
14464 output_address (x);
14465 return;
14466
14467 case 'L':
14468 if (ASSEMBLER_DIALECT == ASM_ATT)
14469 putc ('l', file);
14470 return;
14471
14472 case 'W':
14473 if (ASSEMBLER_DIALECT == ASM_ATT)
14474 putc ('w', file);
14475 return;
14476
14477 case 'B':
14478 if (ASSEMBLER_DIALECT == ASM_ATT)
14479 putc ('b', file);
14480 return;
14481
14482 case 'Q':
14483 if (ASSEMBLER_DIALECT == ASM_ATT)
14484 putc ('l', file);
14485 return;
14486
14487 case 'S':
14488 if (ASSEMBLER_DIALECT == ASM_ATT)
14489 putc ('s', file);
14490 return;
14491
14492 case 'T':
14493 if (ASSEMBLER_DIALECT == ASM_ATT)
14494 putc ('t', file);
14495 return;
14496
14497 case 'O':
14498 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14499 if (ASSEMBLER_DIALECT != ASM_ATT)
14500 return;
14501
14502 switch (GET_MODE_SIZE (GET_MODE (x)))
14503 {
14504 case 2:
14505 putc ('w', file);
14506 break;
14507
14508 case 4:
14509 putc ('l', file);
14510 break;
14511
14512 case 8:
14513 putc ('q', file);
14514 break;
14515
14516 default:
14517 output_operand_lossage
14518 ("invalid operand size for operand code 'O'");
14519 return;
14520 }
14521
14522 putc ('.', file);
14523 #endif
14524 return;
14525
14526 case 'z':
14527 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14528 {
14529 /* Opcodes don't get size suffixes if using Intel opcodes. */
14530 if (ASSEMBLER_DIALECT == ASM_INTEL)
14531 return;
14532
14533 switch (GET_MODE_SIZE (GET_MODE (x)))
14534 {
14535 case 1:
14536 putc ('b', file);
14537 return;
14538
14539 case 2:
14540 putc ('w', file);
14541 return;
14542
14543 case 4:
14544 putc ('l', file);
14545 return;
14546
14547 case 8:
14548 putc ('q', file);
14549 return;
14550
14551 default:
14552 output_operand_lossage
14553 ("invalid operand size for operand code 'z'");
14554 return;
14555 }
14556 }
14557
14558 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14559 warning
14560 (0, "non-integer operand used with operand code 'z'");
14561 /* FALLTHRU */
14562
14563 case 'Z':
14564 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
14565 if (ASSEMBLER_DIALECT == ASM_INTEL)
14566 return;
14567
14568 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14569 {
14570 switch (GET_MODE_SIZE (GET_MODE (x)))
14571 {
14572 case 2:
14573 #ifdef HAVE_AS_IX86_FILDS
14574 putc ('s', file);
14575 #endif
14576 return;
14577
14578 case 4:
14579 putc ('l', file);
14580 return;
14581
14582 case 8:
14583 #ifdef HAVE_AS_IX86_FILDQ
14584 putc ('q', file);
14585 #else
14586 fputs ("ll", file);
14587 #endif
14588 return;
14589
14590 default:
14591 break;
14592 }
14593 }
14594 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14595 {
14596 /* 387 opcodes don't get size suffixes
14597 if the operands are registers. */
14598 if (STACK_REG_P (x))
14599 return;
14600
14601 switch (GET_MODE_SIZE (GET_MODE (x)))
14602 {
14603 case 4:
14604 putc ('s', file);
14605 return;
14606
14607 case 8:
14608 putc ('l', file);
14609 return;
14610
14611 case 12:
14612 case 16:
14613 putc ('t', file);
14614 return;
14615
14616 default:
14617 break;
14618 }
14619 }
14620 else
14621 {
14622 output_operand_lossage
14623 ("invalid operand type used with operand code 'Z'");
14624 return;
14625 }
14626
14627 output_operand_lossage
14628 ("invalid operand size for operand code 'Z'");
14629 return;
14630
14631 case 'd':
14632 case 'b':
14633 case 'w':
14634 case 'k':
14635 case 'q':
14636 case 'h':
14637 case 't':
14638 case 'g':
14639 case 'y':
14640 case 'x':
14641 case 'X':
14642 case 'P':
14643 case 'p':
14644 break;
14645
14646 case 's':
14647 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14648 {
14649 ix86_print_operand (file, x, 0);
14650 fputs (", ", file);
14651 }
14652 return;
14653
14654 case 'Y':
14655 switch (GET_CODE (x))
14656 {
14657 case NE:
14658 fputs ("neq", file);
14659 break;
14660 case EQ:
14661 fputs ("eq", file);
14662 break;
14663 case GE:
14664 case GEU:
14665 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14666 break;
14667 case GT:
14668 case GTU:
14669 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14670 break;
14671 case LE:
14672 case LEU:
14673 fputs ("le", file);
14674 break;
14675 case LT:
14676 case LTU:
14677 fputs ("lt", file);
14678 break;
14679 case UNORDERED:
14680 fputs ("unord", file);
14681 break;
14682 case ORDERED:
14683 fputs ("ord", file);
14684 break;
14685 case UNEQ:
14686 fputs ("ueq", file);
14687 break;
14688 case UNGE:
14689 fputs ("nlt", file);
14690 break;
14691 case UNGT:
14692 fputs ("nle", file);
14693 break;
14694 case UNLE:
14695 fputs ("ule", file);
14696 break;
14697 case UNLT:
14698 fputs ("ult", file);
14699 break;
14700 case LTGT:
14701 fputs ("une", file);
14702 break;
14703 default:
14704 output_operand_lossage ("operand is not a condition code, "
14705 "invalid operand code 'Y'");
14706 return;
14707 }
14708 return;
14709
14710 case 'D':
14711 /* Little bit of braindamage here. The SSE compare instructions
14712 use completely different names for the comparisons than the
14713 fp conditional moves do. */
14714 switch (GET_CODE (x))
14715 {
14716 case UNEQ:
14717 if (TARGET_AVX)
14718 {
14719 fputs ("eq_us", file);
14720 break;
14721 }
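	  /* FALLTHRU */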
14722 case EQ:
14723 fputs ("eq", file);
14724 break;
14725 case UNLT:
14726 if (TARGET_AVX)
14727 {
14728 fputs ("nge", file);
14729 break;
14730 }
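	  /* FALLTHRU */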
14731 case LT:
14732 fputs ("lt", file);
14733 break;
14734 case UNLE:
14735 if (TARGET_AVX)
14736 {
14737 fputs ("ngt", file);
14738 break;
14739 }
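	  /* FALLTHRU */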
14740 case LE:
14741 fputs ("le", file);
14742 break;
14743 case UNORDERED:
14744 fputs ("unord", file);
14745 break;
14746 case LTGT:
14747 if (TARGET_AVX)
14748 {
14749 fputs ("neq_oq", file);
14750 break;
14751 }
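	  /* FALLTHRU */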
14752 case NE:
14753 fputs ("neq", file);
14754 break;
14755 case GE:
14756 if (TARGET_AVX)
14757 {
14758 fputs ("ge", file);
14759 break;
14760 }
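	  /* FALLTHRU */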
14761 case UNGE:
14762 fputs ("nlt", file);
14763 break;
14764 case GT:
14765 if (TARGET_AVX)
14766 {
14767 fputs ("gt", file);
14768 break;
14769 }
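	  /* FALLTHRU */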
14770 case UNGT:
14771 fputs ("nle", file);
14772 break;
14773 case ORDERED:
14774 fputs ("ord", file);
14775 break;
14776 default:
14777 output_operand_lossage ("operand is not a condition code, "
14778 "invalid operand code 'D'");
14779 return;
14780 }
14781 return;
14782
14783 case 'F':
14784 case 'f':
14785 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14786 if (ASSEMBLER_DIALECT == ASM_ATT)
14787 putc ('.', file);
14788 #endif
14789
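	  /* FALLTHRU */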
14790 case 'C':
14791 case 'c':
14792 if (!COMPARISON_P (x))
14793 {
14794 output_operand_lossage ("operand is not a condition code, "
14795 "invalid operand code '%c'", code);
14796 return;
14797 }
14798 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14799 code == 'c' || code == 'f',
14800 code == 'F' || code == 'f',
14801 file);
14802 return;
14803
14804 case 'H':
14805 if (!offsettable_memref_p (x))
14806 {
14807 output_operand_lossage ("operand is not an offsettable memory "
14808 "reference, invalid operand code 'H'");
14809 return;
14810 }
14811 /* It doesn't actually matter what mode we use here, as we're
14812 only going to use this for printing. */
14813 x = adjust_address_nv (x, DImode, 8);
14814 /* Output 'qword ptr' for intel assembler dialect. */
14815 if (ASSEMBLER_DIALECT == ASM_INTEL)
14816 code = 'q';
14817 break;
14818
14819 case 'K':
14820 gcc_assert (CONST_INT_P (x));
14821
14822 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14823 #ifdef HAVE_AS_IX86_HLE
14824 fputs ("xacquire ", file);
14825 #else
14826 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14827 #endif
14828 else if (INTVAL (x) & IX86_HLE_RELEASE)
14829 #ifdef HAVE_AS_IX86_HLE
14830 fputs ("xrelease ", file);
14831 #else
14832 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14833 #endif
14834 /* We do not want to print the value of the operand. */
14835 return;
14836
14837 case '*':
14838 if (ASSEMBLER_DIALECT == ASM_ATT)
14839 putc ('*', file);
14840 return;
14841
14842 case '&':
14843 {
14844 const char *name = get_some_local_dynamic_name ();
14845 if (name == NULL)
14846 output_operand_lossage ("'%%&' used without any "
14847 "local dynamic TLS references");
14848 else
14849 assemble_name (file, name);
14850 return;
14851 }
14852
14853 case '+':
14854 {
14855 rtx x;
14856
14857 if (!optimize
14858 || optimize_function_for_size_p (cfun)
14859 || !TARGET_BRANCH_PREDICTION_HINTS)
14860 return;
14861
14862 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14863 if (x)
14864 {
14865 int pred_val = XINT (x, 0);
14866
14867 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14868 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14869 {
14870 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14871 bool cputaken
14872 = final_forward_branch_p (current_output_insn) == 0;
14873
14874 /* Emit hints only when the default branch prediction
14875 heuristics would fail. */
14876 if (taken != cputaken)
14877 {
14878 /* We use 3e (DS) prefix for taken branches and
14879 2e (CS) prefix for not taken branches. */
14880 if (taken)
14881 fputs ("ds ; ", file);
14882 else
14883 fputs ("cs ; ", file);
14884 }
14885 }
14886 }
14887 return;
14888 }
14889
14890 case ';':
14891 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14892 putc (';', file);
14893 #endif
14894 return;
14895
14896 case '@':
14897 if (ASSEMBLER_DIALECT == ASM_ATT)
14898 putc ('%', file);
14899
14900 /* The kernel uses a different segment register for performance
14901 reasons; a system call would not have to trash the userspace
14902 segment register, which would be expensive. */
14903 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14904 fputs ("fs", file);
14905 else
14906 fputs ("gs", file);
14907 return;
14908
14909 case '~':
14910 putc (TARGET_AVX2 ? 'i' : 'f', file);
14911 return;
14912
14913 case '^':
14914 if (TARGET_64BIT && Pmode != word_mode)
14915 fputs ("addr32 ", file);
14916 return;
14917
14918 case '!':
14919 if (ix86_bnd_prefixed_insn_p (NULL_RTX))
14920 fputs ("bnd ", file);
14921 return;
14922
14923 default:
14924 output_operand_lossage ("invalid operand code '%c'", code);
14925 }
14926 }
14927
14928 if (REG_P (x))
14929 print_reg (x, code, file);
14930
14931 else if (MEM_P (x))
14932 {
14933 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14934 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14935 && GET_MODE (x) != BLKmode)
14936 {
14937 const char * size;
14938 switch (GET_MODE_SIZE (GET_MODE (x)))
14939 {
14940 case 1: size = "BYTE"; break;
14941 case 2: size = "WORD"; break;
14942 case 4: size = "DWORD"; break;
14943 case 8: size = "QWORD"; break;
14944 case 12: size = "TBYTE"; break;
14945 case 16:
14946 if (GET_MODE (x) == XFmode)
14947 size = "TBYTE";
14948 else
14949 size = "XMMWORD";
14950 break;
14951 case 32: size = "YMMWORD"; break;
14952 case 64: size = "ZMMWORD"; break;
14953 default:
14954 gcc_unreachable ();
14955 }
14956
14957 /* Check for explicit size override (codes 'b', 'w', 'k',
14958 'q' and 'x') */
14959 if (code == 'b')
14960 size = "BYTE";
14961 else if (code == 'w')
14962 size = "WORD";
14963 else if (code == 'k')
14964 size = "DWORD";
14965 else if (code == 'q')
14966 size = "QWORD";
14967 else if (code == 'x')
14968 size = "XMMWORD";
14969
14970 fputs (size, file);
14971 fputs (" PTR ", file);
14972 }
14973
14974 x = XEXP (x, 0);
14975 /* Avoid (%rip) for call operands. */
14976 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14977 && !CONST_INT_P (x))
14978 output_addr_const (file, x);
14979 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14980 output_operand_lossage ("invalid constraints for operand");
14981 else
14982 output_address (x);
14983 }
14984
14985 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14986 {
14987 REAL_VALUE_TYPE r;
14988 long l;
14989
14990 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14991 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14992
14993 if (ASSEMBLER_DIALECT == ASM_ATT)
14994 putc ('$', file);
14995 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14996 if (code == 'q')
14997 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14998 (unsigned long long) (int) l);
14999 else
15000 fprintf (file, "0x%08x", (unsigned int) l);
15001 }
15002
15003 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
15004 {
15005 REAL_VALUE_TYPE r;
15006 long l[2];
15007
15008 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
15009 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
15010
15011 if (ASSEMBLER_DIALECT == ASM_ATT)
15012 putc ('$', file);
15013 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
15014 }
15015
15016 /* These float cases don't actually occur as immediate operands. */
15017 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
15018 {
15019 char dstr[30];
15020
15021 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
15022 fputs (dstr, file);
15023 }
15024
15025 else
15026 {
15027 /* We have patterns that allow zero sets of memory, for instance.
15028 In 64-bit mode, we should probably support all 8-byte vectors,
15029 since we can in fact encode that into an immediate. */
15030 if (GET_CODE (x) == CONST_VECTOR)
15031 {
15032 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
15033 x = const0_rtx;
15034 }
15035
15036 if (code != 'P' && code != 'p')
15037 {
15038 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
15039 {
15040 if (ASSEMBLER_DIALECT == ASM_ATT)
15041 putc ('$', file);
15042 }
15043 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
15044 || GET_CODE (x) == LABEL_REF)
15045 {
15046 if (ASSEMBLER_DIALECT == ASM_ATT)
15047 putc ('$', file);
15048 else
15049 fputs ("OFFSET FLAT:", file);
15050 }
15051 }
15052 if (CONST_INT_P (x))
15053 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
15054 else if (flag_pic || MACHOPIC_INDIRECT)
15055 output_pic_addr_const (file, x, code);
15056 else
15057 output_addr_const (file, x);
15058 }
15059 }
15060
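/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P: return true if CODE is
   one of the punctuation characters handled by ix86_print_operand. */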
15061 static bool
15062 ix86_print_operand_punct_valid_p (unsigned char code)
15063 {
15064 return (code == '@' || code == '*' || code == '+' || code == '&'
15065 || code == ';' || code == '~' || code == '^' || code == '!');
15066 }
15067 \f
15068 /* Print a memory operand whose address is ADDR. */
15069
15070 static void
15071 ix86_print_operand_address (FILE *file, rtx addr)
15072 {
15073 struct ix86_address parts;
15074 rtx base, index, disp;
15075 int scale;
15076 int ok;
15077 bool vsib = false;
15078 int code = 0;
15079
15080 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
15081 {
15082 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15083 gcc_assert (parts.index == NULL_RTX);
15084 parts.index = XVECEXP (addr, 0, 1);
15085 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
15086 addr = XVECEXP (addr, 0, 0);
15087 vsib = true;
15088 }
15089 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
15090 {
15091 gcc_assert (TARGET_64BIT);
15092 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15093 code = 'q';
15094 }
15095 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
15096 {
15097 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
15098 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
15099 if (parts.base != NULL_RTX)
15100 {
15101 parts.index = parts.base;
15102 parts.scale = 1;
15103 }
15104 parts.base = XVECEXP (addr, 0, 0);
15105 addr = XVECEXP (addr, 0, 0);
15106 }
15107 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
15108 {
15109 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
15110 gcc_assert (parts.index == NULL_RTX);
15111 parts.index = XVECEXP (addr, 0, 1);
15112 addr = XVECEXP (addr, 0, 0);
15113 }
15114 else
15115 ok = ix86_decompose_address (addr, &parts);
15116
15117 gcc_assert (ok);
15118
15119 base = parts.base;
15120 index = parts.index;
15121 disp = parts.disp;
15122 scale = parts.scale;
15123
15124 switch (parts.seg)
15125 {
15126 case SEG_DEFAULT:
15127 break;
15128 case SEG_FS:
15129 case SEG_GS:
15130 if (ASSEMBLER_DIALECT == ASM_ATT)
15131 putc ('%', file);
15132 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
15133 break;
15134 default:
15135 gcc_unreachable ();
15136 }
15137
15138 /* Use one byte shorter RIP relative addressing for 64bit mode. */
15139 if (TARGET_64BIT && !base && !index)
15140 {
15141 rtx symbol = disp;
15142
15143 if (GET_CODE (disp) == CONST
15144 && GET_CODE (XEXP (disp, 0)) == PLUS
15145 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15146 symbol = XEXP (XEXP (disp, 0), 0);
15147
15148 if (GET_CODE (symbol) == LABEL_REF
15149 || (GET_CODE (symbol) == SYMBOL_REF
15150 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
15151 base = pc_rtx;
15152 }
15153 if (!base && !index)
15154 {
15155 /* A displacement-only address requires special attention. */
15156
15157 if (CONST_INT_P (disp))
15158 {
15159 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
15160 fputs ("ds:", file);
15161 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
15162 }
15163 else if (flag_pic)
15164 output_pic_addr_const (file, disp, 0);
15165 else
15166 output_addr_const (file, disp);
15167 }
15168 else
15169 {
15170 /* Print SImode register names to force addr32 prefix. */
15171 if (SImode_address_operand (addr, VOIDmode))
15172 {
15173 #ifdef ENABLE_CHECKING
15174 gcc_assert (TARGET_64BIT);
15175 switch (GET_CODE (addr))
15176 {
15177 case SUBREG:
15178 gcc_assert (GET_MODE (addr) == SImode);
15179 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
15180 break;
15181 case ZERO_EXTEND:
15182 case AND:
15183 gcc_assert (GET_MODE (addr) == DImode);
15184 break;
15185 default:
15186 gcc_unreachable ();
15187 }
15188 #endif
15189 gcc_assert (!code);
15190 code = 'k';
15191 }
15192 else if (code == 0
15193 && TARGET_X32
15194 && disp
15195 && CONST_INT_P (disp)
15196 && INTVAL (disp) < -16*1024*1024)
15197 {
15198 /* X32 runs in 64-bit mode, where displacement, DISP, in
15199 address DISP(%r64), is encoded as 32-bit immediate sign-
15200 extended from 32-bit to 64-bit. For -0x40000300(%r64),
15201 address is %r64 + 0xffffffffbffffd00. When %r64 <
15202 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
15203 which is invalid for x32. The correct address is %r64
15204 - 0x40000300 == 0xf7ffdd64. To properly encode
15205 -0x40000300(%r64) for x32, we zero-extend negative
15206 displacement by forcing addr32 prefix which truncates
15207 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
15208 zero-extend all negative displacements, including -1(%rsp).
15209 However, for small negative displacements, sign-extension
15210 won't cause overflow. We only zero-extend negative
15211 displacements if they are < -16*1024*1024, which is also used
15212 to check legitimate address displacements for PIC. */
15213 code = 'k';
15214 }
15215
15216 if (ASSEMBLER_DIALECT == ASM_ATT)
15217 {
15218 if (disp)
15219 {
15220 if (flag_pic)
15221 output_pic_addr_const (file, disp, 0);
15222 else if (GET_CODE (disp) == LABEL_REF)
15223 output_asm_label (disp);
15224 else
15225 output_addr_const (file, disp);
15226 }
15227
15228 putc ('(', file);
15229 if (base)
15230 print_reg (base, code, file);
15231 if (index)
15232 {
15233 putc (',', file);
15234 print_reg (index, vsib ? 0 : code, file);
15235 if (scale != 1 || vsib)
15236 fprintf (file, ",%d", scale);
15237 }
15238 putc (')', file);
15239 }
15240 else
15241 {
15242 rtx offset = NULL_RTX;
15243
15244 if (disp)
15245 {
15246 /* Pull out the offset of a symbol; print any symbol itself. */
15247 if (GET_CODE (disp) == CONST
15248 && GET_CODE (XEXP (disp, 0)) == PLUS
15249 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
15250 {
15251 offset = XEXP (XEXP (disp, 0), 1);
15252 disp = gen_rtx_CONST (VOIDmode,
15253 XEXP (XEXP (disp, 0), 0));
15254 }
15255
15256 if (flag_pic)
15257 output_pic_addr_const (file, disp, 0);
15258 else if (GET_CODE (disp) == LABEL_REF)
15259 output_asm_label (disp);
15260 else if (CONST_INT_P (disp))
15261 offset = disp;
15262 else
15263 output_addr_const (file, disp);
15264 }
15265
15266 putc ('[', file);
15267 if (base)
15268 {
15269 print_reg (base, code, file);
15270 if (offset)
15271 {
15272 if (INTVAL (offset) >= 0)
15273 putc ('+', file);
15274 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15275 }
15276 }
15277 else if (offset)
15278 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
15279 else
15280 putc ('0', file);
15281
15282 if (index)
15283 {
15284 putc ('+', file);
15285 print_reg (index, vsib ? 0 : code, file);
15286 if (scale != 1 || vsib)
15287 fprintf (file, "*%d", scale);
15288 }
15289 putc (']', file);
15290 }
15291 }
15292 }
15293
15294 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
15295
15296 static bool
15297 i386_asm_output_addr_const_extra (FILE *file, rtx x)
15298 {
15299 rtx op;
15300
15301 if (GET_CODE (x) != UNSPEC)
15302 return false;
15303
15304 op = XVECEXP (x, 0, 0);
15305 switch (XINT (x, 1))
15306 {
15307 case UNSPEC_GOTTPOFF:
15308 output_addr_const (file, op);
15309 /* FIXME: This might be @TPOFF in Sun ld. */
15310 fputs ("@gottpoff", file);
15311 break;
15312 case UNSPEC_TPOFF:
15313 output_addr_const (file, op);
15314 fputs ("@tpoff", file);
15315 break;
15316 case UNSPEC_NTPOFF:
15317 output_addr_const (file, op);
15318 if (TARGET_64BIT)
15319 fputs ("@tpoff", file);
15320 else
15321 fputs ("@ntpoff", file);
15322 break;
15323 case UNSPEC_DTPOFF:
15324 output_addr_const (file, op);
15325 fputs ("@dtpoff", file);
15326 break;
15327 case UNSPEC_GOTNTPOFF:
15328 output_addr_const (file, op);
15329 if (TARGET_64BIT)
15330 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
15331 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
15332 else
15333 fputs ("@gotntpoff", file);
15334 break;
15335 case UNSPEC_INDNTPOFF:
15336 output_addr_const (file, op);
15337 fputs ("@indntpoff", file);
15338 break;
15339 #if TARGET_MACHO
15340 case UNSPEC_MACHOPIC_OFFSET:
15341 output_addr_const (file, op);
15342 putc ('-', file);
15343 machopic_output_function_base_name (file);
15344 break;
15345 #endif
15346
15347 case UNSPEC_STACK_CHECK:
15348 {
15349 int offset;
15350
15351 gcc_assert (flag_split_stack);
15352
15353 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
15354 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
15355 #else
15356 gcc_unreachable ();
15357 #endif
15358
15359 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
15360 }
15361 break;
15362
15363 default:
15364 return false;
15365 }
15366
15367 return true;
15368 }
15369 \f
15370 /* Split one or more double-mode RTL references into pairs of half-mode
15371 references. The RTL can be REG, offsettable MEM, integer constant, or
15372 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
15373 split and "num" is its length. lo_half and hi_half are output arrays
15374 that parallel "operands". */
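/* For a DImode operand on this little-endian target, for example, the low
   SImode half is taken at byte offset 0 and the high half at offset 4. */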
15375
15376 void
15377 split_double_mode (enum machine_mode mode, rtx operands[],
15378 int num, rtx lo_half[], rtx hi_half[])
15379 {
15380 enum machine_mode half_mode;
15381 unsigned int byte;
15382
15383 switch (mode)
15384 {
15385 case TImode:
15386 half_mode = DImode;
15387 break;
15388 case DImode:
15389 half_mode = SImode;
15390 break;
15391 default:
15392 gcc_unreachable ();
15393 }
15394
15395 byte = GET_MODE_SIZE (half_mode);
15396
15397 while (num--)
15398 {
15399 rtx op = operands[num];
15400
15401 /* simplify_subreg refuses to split volatile memory addresses,
15402 but we still have to handle them. */
15403 if (MEM_P (op))
15404 {
15405 lo_half[num] = adjust_address (op, half_mode, 0);
15406 hi_half[num] = adjust_address (op, half_mode, byte);
15407 }
15408 else
15409 {
15410 lo_half[num] = simplify_gen_subreg (half_mode, op,
15411 GET_MODE (op) == VOIDmode
15412 ? mode : GET_MODE (op), 0);
15413 hi_half[num] = simplify_gen_subreg (half_mode, op,
15414 GET_MODE (op) == VOIDmode
15415 ? mode : GET_MODE (op), byte);
15416 }
15417 }
15418 }
15419 \f
15420 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15421 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15422 is the expression of the binary operation. The output may either be
15423 emitted here, or returned to the caller, like all output_* functions.
15424
15425 There is no guarantee that the operands are the same mode, as they
15426 might be within FLOAT or FLOAT_EXTEND expressions. */
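/* For example, an SFmode SSE add when AVX is enabled returns the template
   "vaddss\t{%2, %1, %0|%0, %1, %2}", built from the "vadd" stem below. */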
15427
15428 #ifndef SYSV386_COMPAT
15429 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15430 wants to fix the assemblers because that causes incompatibility
15431 with gcc. No-one wants to fix gcc because that causes
15432 incompatibility with assemblers... You can use the option of
15433 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15434 #define SYSV386_COMPAT 1
15435 #endif
15436
15437 const char *
15438 output_387_binary_op (rtx insn, rtx *operands)
15439 {
15440 static char buf[40];
15441 const char *p;
15442 const char *ssep;
15443 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15444
15445 #ifdef ENABLE_CHECKING
15446 /* Even if we do not want to check the inputs, this documents the input
15447 constraints, which helps in understanding the following code. */
15448 if (STACK_REG_P (operands[0])
15449 && ((REG_P (operands[1])
15450 && REGNO (operands[0]) == REGNO (operands[1])
15451 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15452 || (REG_P (operands[2])
15453 && REGNO (operands[0]) == REGNO (operands[2])
15454 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15455 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15456 ; /* ok */
15457 else
15458 gcc_assert (is_sse);
15459 #endif
15460
15461 switch (GET_CODE (operands[3]))
15462 {
15463 case PLUS:
15464 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15465 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15466 p = "fiadd";
15467 else
15468 p = "fadd";
15469 ssep = "vadd";
15470 break;
15471
15472 case MINUS:
15473 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15474 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15475 p = "fisub";
15476 else
15477 p = "fsub";
15478 ssep = "vsub";
15479 break;
15480
15481 case MULT:
15482 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15483 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15484 p = "fimul";
15485 else
15486 p = "fmul";
15487 ssep = "vmul";
15488 break;
15489
15490 case DIV:
15491 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15492 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15493 p = "fidiv";
15494 else
15495 p = "fdiv";
15496 ssep = "vdiv";
15497 break;
15498
15499 default:
15500 gcc_unreachable ();
15501 }
15502
15503 if (is_sse)
15504 {
15505 if (TARGET_AVX)
15506 {
15507 strcpy (buf, ssep);
15508 if (GET_MODE (operands[0]) == SFmode)
15509 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15510 else
15511 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15512 }
15513 else
15514 {
15515 strcpy (buf, ssep + 1);
15516 if (GET_MODE (operands[0]) == SFmode)
15517 strcat (buf, "ss\t{%2, %0|%0, %2}");
15518 else
15519 strcat (buf, "sd\t{%2, %0|%0, %2}");
15520 }
15521 return buf;
15522 }
15523 strcpy (buf, p);
15524
15525 switch (GET_CODE (operands[3]))
15526 {
15527 case MULT:
15528 case PLUS:
15529 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15530 {
15531 rtx temp = operands[2];
15532 operands[2] = operands[1];
15533 operands[1] = temp;
15534 }
15535
15536 /* At this point we know operands[0] == operands[1]. */
15537
15538 if (MEM_P (operands[2]))
15539 {
15540 p = "%Z2\t%2";
15541 break;
15542 }
15543
15544 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15545 {
15546 if (STACK_TOP_P (operands[0]))
15547 /* How is it that we are storing to a dead operand[2]?
15548 Well, presumably operands[1] is dead too. We can't
15549 store the result to st(0) as st(0) gets popped on this
15550 instruction. Instead store to operands[2] (which I
15551 think has to be st(1)). st(1) will be popped later.
15552 gcc <= 2.8.1 didn't have this check and generated
15553 assembly code that the Unixware assembler rejected. */
15554 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15555 else
15556 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15557 break;
15558 }
15559
15560 if (STACK_TOP_P (operands[0]))
15561 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15562 else
15563 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15564 break;
15565
15566 case MINUS:
15567 case DIV:
15568 if (MEM_P (operands[1]))
15569 {
15570 p = "r%Z1\t%1";
15571 break;
15572 }
15573
15574 if (MEM_P (operands[2]))
15575 {
15576 p = "%Z2\t%2";
15577 break;
15578 }
15579
15580 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15581 {
15582 #if SYSV386_COMPAT
15583 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15584 derived assemblers, confusingly reverse the direction of
15585 the operation for fsub{r} and fdiv{r} when the
15586 destination register is not st(0). The Intel assembler
15587 doesn't have this brain damage. Read !SYSV386_COMPAT to
15588 figure out what the hardware really does. */
15589 if (STACK_TOP_P (operands[0]))
15590 p = "{p\t%0, %2|rp\t%2, %0}";
15591 else
15592 p = "{rp\t%2, %0|p\t%0, %2}";
15593 #else
15594 if (STACK_TOP_P (operands[0]))
15595 /* As above for fmul/fadd, we can't store to st(0). */
15596 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15597 else
15598 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15599 #endif
15600 break;
15601 }
15602
15603 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15604 {
15605 #if SYSV386_COMPAT
15606 if (STACK_TOP_P (operands[0]))
15607 p = "{rp\t%0, %1|p\t%1, %0}";
15608 else
15609 p = "{p\t%1, %0|rp\t%0, %1}";
15610 #else
15611 if (STACK_TOP_P (operands[0]))
15612 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15613 else
15614 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15615 #endif
15616 break;
15617 }
15618
15619 if (STACK_TOP_P (operands[0]))
15620 {
15621 if (STACK_TOP_P (operands[1]))
15622 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15623 else
15624 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15625 break;
15626 }
15627 else if (STACK_TOP_P (operands[1]))
15628 {
15629 #if SYSV386_COMPAT
15630 p = "{\t%1, %0|r\t%0, %1}";
15631 #else
15632 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15633 #endif
15634 }
15635 else
15636 {
15637 #if SYSV386_COMPAT
15638 p = "{r\t%2, %0|\t%0, %2}";
15639 #else
15640 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15641 #endif
15642 }
15643 break;
15644
15645 default:
15646 gcc_unreachable ();
15647 }
15648
15649 strcat (buf, p);
15650 return buf;
15651 }
15652
15653 /* Check if a 256bit AVX register is referenced inside of EXP. */
15654
15655 static int
15656 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15657 {
15658 rtx exp = *pexp;
15659
15660 if (GET_CODE (exp) == SUBREG)
15661 exp = SUBREG_REG (exp);
15662
15663 if (REG_P (exp)
15664 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15665 return 1;
15666
15667 return 0;
15668 }
15669
15670 /* Return needed mode for entity in optimize_mode_switching pass. */
15671
15672 static int
15673 ix86_avx_u128_mode_needed (rtx insn)
15674 {
15675 if (CALL_P (insn))
15676 {
15677 rtx link;
15678
15679 /* Needed mode is set to AVX_U128_CLEAN if there are
15680 no 256bit modes used in function arguments. */
15681 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15682 link;
15683 link = XEXP (link, 1))
15684 {
15685 if (GET_CODE (XEXP (link, 0)) == USE)
15686 {
15687 rtx arg = XEXP (XEXP (link, 0), 0);
15688
15689 if (ix86_check_avx256_register (&arg, NULL))
15690 return AVX_U128_ANY;
15691 }
15692 }
15693
15694 return AVX_U128_CLEAN;
15695 }
15696
15697 /* Require DIRTY mode if a 256bit AVX register is referenced. The hardware
15698 changes state only when a 256bit register is written to, but we need
15699 to prevent the compiler from moving the optimal insertion point above
15700 an eventual read from a 256bit register. */
15701 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15702 return AVX_U128_DIRTY;
15703
15704 return AVX_U128_ANY;
15705 }
15706
15707 /* Return mode that i387 must be switched into
15708 prior to the execution of insn. */
15709
15710 static int
15711 ix86_i387_mode_needed (int entity, rtx insn)
15712 {
15713 enum attr_i387_cw mode;
15714
15715 /* The mode UNINITIALIZED is used to store the control word after a
15716 function call or ASM pattern. The mode ANY specifies that the function
15717 has no requirements on the control word and makes no changes in the
15718 bits we are interested in. */
15719
15720 if (CALL_P (insn)
15721 || (NONJUMP_INSN_P (insn)
15722 && (asm_noperands (PATTERN (insn)) >= 0
15723 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15724 return I387_CW_UNINITIALIZED;
15725
15726 if (recog_memoized (insn) < 0)
15727 return I387_CW_ANY;
15728
15729 mode = get_attr_i387_cw (insn);
15730
15731 switch (entity)
15732 {
15733 case I387_TRUNC:
15734 if (mode == I387_CW_TRUNC)
15735 return mode;
15736 break;
15737
15738 case I387_FLOOR:
15739 if (mode == I387_CW_FLOOR)
15740 return mode;
15741 break;
15742
15743 case I387_CEIL:
15744 if (mode == I387_CW_CEIL)
15745 return mode;
15746 break;
15747
15748 case I387_MASK_PM:
15749 if (mode == I387_CW_MASK_PM)
15750 return mode;
15751 break;
15752
15753 default:
15754 gcc_unreachable ();
15755 }
15756
15757 return I387_CW_ANY;
15758 }
15759
15760 /* Return mode that entity must be switched into
15761 prior to the execution of insn. */
15762
15763 int
15764 ix86_mode_needed (int entity, rtx insn)
15765 {
15766 switch (entity)
15767 {
15768 case AVX_U128:
15769 return ix86_avx_u128_mode_needed (insn);
15770 case I387_TRUNC:
15771 case I387_FLOOR:
15772 case I387_CEIL:
15773 case I387_MASK_PM:
15774 return ix86_i387_mode_needed (entity, insn);
15775 default:
15776 gcc_unreachable ();
15777 }
15778 return 0;
15779 }
15780
15781 /* Check if a 256bit AVX register is referenced in stores. */
15782
15783 static void
15784 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15785 {
15786 if (ix86_check_avx256_register (&dest, NULL))
15787 {
15788 bool *used = (bool *) data;
15789 *used = true;
15790 }
15791 }
15792
15793 /* Calculate mode of upper 128bit AVX registers after the insn. */
15794
15795 static int
15796 ix86_avx_u128_mode_after (int mode, rtx insn)
15797 {
15798 rtx pat = PATTERN (insn);
15799
15800 if (vzeroupper_operation (pat, VOIDmode)
15801 || vzeroall_operation (pat, VOIDmode))
15802 return AVX_U128_CLEAN;
15803
15804 /* We know that the state is clean after a CALL insn if there are no
15805 256bit registers used in the function return register. */
15806 if (CALL_P (insn))
15807 {
15808 bool avx_reg256_found = false;
15809 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15810 if (!avx_reg256_found)
15811 return AVX_U128_CLEAN;
15812 }
15813
15814 /* Otherwise, return current mode. Remember that if insn
15815 references AVX 256bit registers, the mode was already changed
15816 to DIRTY from MODE_NEEDED. */
15817 return mode;
15818 }
15819
15820 /* Return the mode that an insn results in. */
15821
15822 int
15823 ix86_mode_after (int entity, int mode, rtx insn)
15824 {
15825 switch (entity)
15826 {
15827 case AVX_U128:
15828 return ix86_avx_u128_mode_after (mode, insn);
15829 case I387_TRUNC:
15830 case I387_FLOOR:
15831 case I387_CEIL:
15832 case I387_MASK_PM:
15833 return mode;
15834 default:
15835 gcc_unreachable ();
15836 }
15837 }
15838
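/* Return the mode the upper 128bits of the AVX registers are assumed to
   be in on entry to the current function, judging from the modes of the
   incoming arguments. */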
15839 static int
15840 ix86_avx_u128_mode_entry (void)
15841 {
15842 tree arg;
15843
15844 /* Entry mode is set to AVX_U128_DIRTY if there are
15845 256bit modes used in function arguments. */
15846 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15847 arg = TREE_CHAIN (arg))
15848 {
15849 rtx incoming = DECL_INCOMING_RTL (arg);
15850
15851 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15852 return AVX_U128_DIRTY;
15853 }
15854
15855 return AVX_U128_CLEAN;
15856 }
15857
15858 /* Return a mode that ENTITY is assumed to be
15859 switched to at function entry. */
15860
15861 int
15862 ix86_mode_entry (int entity)
15863 {
15864 switch (entity)
15865 {
15866 case AVX_U128:
15867 return ix86_avx_u128_mode_entry ();
15868 case I387_TRUNC:
15869 case I387_FLOOR:
15870 case I387_CEIL:
15871 case I387_MASK_PM:
15872 return I387_CW_ANY;
15873 default:
15874 gcc_unreachable ();
15875 }
15876 }
15877
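/* Return the mode the upper 128bits of the AVX registers are assumed to
   be in at function exit, judging from the function return value. */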
15878 static int
15879 ix86_avx_u128_mode_exit (void)
15880 {
15881 rtx reg = crtl->return_rtx;
15882
15883 /* Exit mode is set to AVX_U128_DIRTY if there are
15884 256bit modes used in the function return register. */
15885 if (reg && ix86_check_avx256_register (&reg, NULL))
15886 return AVX_U128_DIRTY;
15887
15888 return AVX_U128_CLEAN;
15889 }
15890
15891 /* Return a mode that ENTITY is assumed to be
15892 switched to at function exit. */
15893
15894 int
15895 ix86_mode_exit (int entity)
15896 {
15897 switch (entity)
15898 {
15899 case AVX_U128:
15900 return ix86_avx_u128_mode_exit ();
15901 case I387_TRUNC:
15902 case I387_FLOOR:
15903 case I387_CEIL:
15904 case I387_MASK_PM:
15905 return I387_CW_ANY;
15906 default:
15907 gcc_unreachable ();
15908 }
15909 }
15910
15911 /* Output code to initialize control word copies used by trunc?f?i and
15912    rounding patterns.  MODE selects the rounding/masking variant; a modified
15913    copy of the current control word is stored in the matching stack slot.  */
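/* For reference (informal note, not from the original comment): in the x87
   control word the rounding-control field occupies bits 10-11
   (00 = nearest, 01 = down, 10 = up, 11 = truncate), which is why the code
   below uses the 0x0c00, 0x0400 and 0x0800 masks, and the precision
   exception mask is bit 5 (0x0020).  */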
15914
15915 static void
15916 emit_i387_cw_initialization (int mode)
15917 {
15918 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15919 rtx new_mode;
15920
15921 enum ix86_stack_slot slot;
15922
15923 rtx reg = gen_reg_rtx (HImode);
15924
15925 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15926 emit_move_insn (reg, copy_rtx (stored_mode));
15927
15928 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15929 || optimize_insn_for_size_p ())
15930 {
15931 switch (mode)
15932 {
15933 case I387_CW_TRUNC:
15934 /* round toward zero (truncate) */
15935 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15936 slot = SLOT_CW_TRUNC;
15937 break;
15938
15939 case I387_CW_FLOOR:
15940 /* round down toward -oo */
15941 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15942 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15943 slot = SLOT_CW_FLOOR;
15944 break;
15945
15946 case I387_CW_CEIL:
15947 /* round up toward +oo */
15948 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15949 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15950 slot = SLOT_CW_CEIL;
15951 break;
15952
15953 case I387_CW_MASK_PM:
15954 /* mask precision exception for nearbyint() */
15955 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15956 slot = SLOT_CW_MASK_PM;
15957 break;
15958
15959 default:
15960 gcc_unreachable ();
15961 }
15962 }
15963 else
15964 {
15965 switch (mode)
15966 {
15967 case I387_CW_TRUNC:
15968 /* round toward zero (truncate) */
15969 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15970 slot = SLOT_CW_TRUNC;
15971 break;
15972
15973 case I387_CW_FLOOR:
15974 /* round down toward -oo */
15975 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15976 slot = SLOT_CW_FLOOR;
15977 break;
15978
15979 case I387_CW_CEIL:
15980 /* round up toward +oo */
15981 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15982 slot = SLOT_CW_CEIL;
15983 break;
15984
15985 case I387_CW_MASK_PM:
15986 /* mask precision exception for nearbyint() */
15987 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15988 slot = SLOT_CW_MASK_PM;
15989 break;
15990
15991 default:
15992 gcc_unreachable ();
15993 }
15994 }
15995
15996 gcc_assert (slot < MAX_386_STACK_LOCALS);
15997
15998 new_mode = assign_386_stack_local (HImode, slot);
15999 emit_move_insn (new_mode, reg);
16000 }
16001
16002 /* Emit vzeroupper. */
16003
16004 void
16005 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
16006 {
16007 int i;
16008
16009 /* Cancel automatic vzeroupper insertion if there are
16010 live call-saved SSE registers at the insertion point. */
16011
16012 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
16013 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16014 return;
16015
16016 if (TARGET_64BIT)
16017 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
16018 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
16019 return;
16020
16021 emit_insn (gen_avx_vzeroupper ());
16022 }
16023
16024 /* Generate one or more insns to set ENTITY to MODE. */
16025
16026 void
16027 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
16028 {
16029 switch (entity)
16030 {
16031 case AVX_U128:
16032 if (mode == AVX_U128_CLEAN)
16033 ix86_avx_emit_vzeroupper (regs_live);
16034 break;
16035 case I387_TRUNC:
16036 case I387_FLOOR:
16037 case I387_CEIL:
16038 case I387_MASK_PM:
16039 if (mode != I387_CW_ANY
16040 && mode != I387_CW_UNINITIALIZED)
16041 emit_i387_cw_initialization (mode);
16042 break;
16043 default:
16044 gcc_unreachable ();
16045 }
16046 }
16047
16048 /* Output code for INSN to convert a float to a signed int. OPERANDS
16049 are the insn operands. The output may be [HSD]Imode and the input
16050 operand may be [SDX]Fmode. */
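/* Illustrative only: for the non-fisttp path the emitted sequence is
   roughly "fldcw <new-cw>; fist(p) <dst>; fldcw <saved-cw>", where operand 3
   holds the modified control word and operand 2 the saved original, both
   living in the stack slots prepared by emit_i387_cw_initialization.  */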
16051
16052 const char *
16053 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
16054 {
16055 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16056 int dimode_p = GET_MODE (operands[0]) == DImode;
16057 int round_mode = get_attr_i387_cw (insn);
16058
16059 /* Jump through a hoop or two for DImode, since the hardware has no
16060 non-popping instruction. We used to do this a different way, but
16061 that was somewhat fragile and broke with post-reload splitters. */
16062 if ((dimode_p || fisttp) && !stack_top_dies)
16063 output_asm_insn ("fld\t%y1", operands);
16064
16065 gcc_assert (STACK_TOP_P (operands[1]));
16066 gcc_assert (MEM_P (operands[0]));
16067 gcc_assert (GET_MODE (operands[1]) != TFmode);
16068
16069 if (fisttp)
16070 output_asm_insn ("fisttp%Z0\t%0", operands);
16071 else
16072 {
16073 if (round_mode != I387_CW_ANY)
16074 output_asm_insn ("fldcw\t%3", operands);
16075 if (stack_top_dies || dimode_p)
16076 output_asm_insn ("fistp%Z0\t%0", operands);
16077 else
16078 output_asm_insn ("fist%Z0\t%0", operands);
16079 if (round_mode != I387_CW_ANY)
16080 output_asm_insn ("fldcw\t%2", operands);
16081 }
16082
16083 return "";
16084 }
16085
16086 /* Output code for x87 ffreep insn. The OPNO argument, which may only
16087 have the values zero or one, indicates the ffreep insn's operand
16088 from the OPERANDS array. */
16089
16090 static const char *
16091 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
16092 {
16093 if (TARGET_USE_FFREEP)
16094 #ifdef HAVE_AS_IX86_FFREEP
16095 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
16096 #else
16097 {
16098 static char retval[32];
16099 int regno = REGNO (operands[opno]);
16100
16101 gcc_assert (STACK_REGNO_P (regno));
16102
16103 regno -= FIRST_STACK_REG;
16104
16105 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
16106 return retval;
16107 }
16108 #endif
16109
16110 return opno ? "fstp\t%y1" : "fstp\t%y0";
16111 }
16112
16113
16114 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
16115 should be used. UNORDERED_P is true when fucom should be used. */
16116
16117 const char *
16118 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
16119 {
16120 int stack_top_dies;
16121 rtx cmp_op0, cmp_op1;
16122 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
16123
16124 if (eflags_p)
16125 {
16126 cmp_op0 = operands[0];
16127 cmp_op1 = operands[1];
16128 }
16129 else
16130 {
16131 cmp_op0 = operands[1];
16132 cmp_op1 = operands[2];
16133 }
16134
16135 if (is_sse)
16136 {
16137 if (GET_MODE (operands[0]) == SFmode)
16138 if (unordered_p)
16139 return "%vucomiss\t{%1, %0|%0, %1}";
16140 else
16141 return "%vcomiss\t{%1, %0|%0, %1}";
16142 else
16143 if (unordered_p)
16144 return "%vucomisd\t{%1, %0|%0, %1}";
16145 else
16146 return "%vcomisd\t{%1, %0|%0, %1}";
16147 }
16148
16149 gcc_assert (STACK_TOP_P (cmp_op0));
16150
16151 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
16152
16153 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
16154 {
16155 if (stack_top_dies)
16156 {
16157 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
16158 return output_387_ffreep (operands, 1);
16159 }
16160 else
16161 return "ftst\n\tfnstsw\t%0";
16162 }
16163
16164 if (STACK_REG_P (cmp_op1)
16165 && stack_top_dies
16166 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
16167 && REGNO (cmp_op1) != FIRST_STACK_REG)
16168 {
16169 	  /* If both the top of the 387 stack and the other operand (also a
16170 	     stack register) die, then this must be a `fcompp' float
16171 	     compare.  */
16172
16173 if (eflags_p)
16174 {
16175 /* There is no double popping fcomi variant. Fortunately,
16176 eflags is immune from the fstp's cc clobbering. */
16177 if (unordered_p)
16178 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
16179 else
16180 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
16181 return output_387_ffreep (operands, 0);
16182 }
16183 else
16184 {
16185 if (unordered_p)
16186 return "fucompp\n\tfnstsw\t%0";
16187 else
16188 return "fcompp\n\tfnstsw\t%0";
16189 }
16190 }
16191 else
16192 {
16193 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
16194
16195 static const char * const alt[16] =
16196 {
16197 "fcom%Z2\t%y2\n\tfnstsw\t%0",
16198 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
16199 "fucom%Z2\t%y2\n\tfnstsw\t%0",
16200 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
16201
16202 "ficom%Z2\t%y2\n\tfnstsw\t%0",
16203 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
16204 NULL,
16205 NULL,
16206
16207 "fcomi\t{%y1, %0|%0, %y1}",
16208 "fcomip\t{%y1, %0|%0, %y1}",
16209 "fucomi\t{%y1, %0|%0, %y1}",
16210 "fucomip\t{%y1, %0|%0, %y1}",
16211
16212 NULL,
16213 NULL,
16214 NULL,
16215 NULL
16216 };
16217
16218 int mask;
16219 const char *ret;
16220
16221 mask = eflags_p << 3;
16222 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
16223 mask |= unordered_p << 1;
16224 mask |= stack_top_dies;
16225
16226 gcc_assert (mask < 16);
16227 ret = alt[mask];
16228 gcc_assert (ret);
16229
16230 return ret;
16231 }
16232 }
16233
16234 void
16235 ix86_output_addr_vec_elt (FILE *file, int value)
16236 {
16237 const char *directive = ASM_LONG;
16238
16239 #ifdef ASM_QUAD
16240 if (TARGET_LP64)
16241 directive = ASM_QUAD;
16242 #else
16243 gcc_assert (!TARGET_64BIT);
16244 #endif
16245
16246 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
16247 }
16248
16249 void
16250 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
16251 {
16252 const char *directive = ASM_LONG;
16253
16254 #ifdef ASM_QUAD
16255 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
16256 directive = ASM_QUAD;
16257 #else
16258 gcc_assert (!TARGET_64BIT);
16259 #endif
16260 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
16261 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
16262 fprintf (file, "%s%s%d-%s%d\n",
16263 directive, LPREFIX, value, LPREFIX, rel);
16264 else if (HAVE_AS_GOTOFF_IN_DATA)
16265 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
16266 #if TARGET_MACHO
16267 else if (TARGET_MACHO)
16268 {
16269 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
16270 machopic_output_function_base_name (file);
16271 putc ('\n', file);
16272 }
16273 #endif
16274 else
16275 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
16276 GOT_SYMBOL_NAME, LPREFIX, value);
16277 }
16278 \f
16279 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
16280 for the target. */
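/* Added note: the xor form is shorter but clobbers the flags, which is why
   the expander below wraps the set in a PARALLEL with a FLAGS_REG clobber;
   the plain "mov $0" form is only used when TARGET_USE_MOV0 is set and the
   insn is not being optimized for speed.  */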
16281
16282 void
16283 ix86_expand_clear (rtx dest)
16284 {
16285 rtx tmp;
16286
16287 /* We play register width games, which are only valid after reload. */
16288 gcc_assert (reload_completed);
16289
16290 /* Avoid HImode and its attendant prefix byte. */
16291 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
16292 dest = gen_rtx_REG (SImode, REGNO (dest));
16293 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
16294
16295 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
16296 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
16297 {
16298 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16299 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
16300 }
16301
16302 emit_insn (tmp);
16303 }
16304
16305 /* X is an unchanging MEM. If it is a constant pool reference, return
16306 the constant pool rtx, else NULL. */
16307
16308 rtx
16309 maybe_get_pool_constant (rtx x)
16310 {
16311 x = ix86_delegitimize_address (XEXP (x, 0));
16312
16313 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
16314 return get_pool_constant (x);
16315
16316 return NULL_RTX;
16317 }
16318
16319 void
16320 ix86_expand_move (enum machine_mode mode, rtx operands[])
16321 {
16322 rtx op0, op1;
16323 enum tls_model model;
16324
16325 op0 = operands[0];
16326 op1 = operands[1];
16327
16328 if (GET_CODE (op1) == SYMBOL_REF)
16329 {
16330 rtx tmp;
16331
16332 model = SYMBOL_REF_TLS_MODEL (op1);
16333 if (model)
16334 {
16335 op1 = legitimize_tls_address (op1, model, true);
16336 op1 = force_operand (op1, op0);
16337 if (op1 == op0)
16338 return;
16339 op1 = convert_to_mode (mode, op1, 1);
16340 }
16341 else if ((tmp = legitimize_pe_coff_symbol (op1, false)) != NULL_RTX)
16342 op1 = tmp;
16343 }
16344 else if (GET_CODE (op1) == CONST
16345 && GET_CODE (XEXP (op1, 0)) == PLUS
16346 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
16347 {
16348 rtx addend = XEXP (XEXP (op1, 0), 1);
16349 rtx symbol = XEXP (XEXP (op1, 0), 0);
16350 rtx tmp;
16351
16352 model = SYMBOL_REF_TLS_MODEL (symbol);
16353 if (model)
16354 tmp = legitimize_tls_address (symbol, model, true);
16355 else
16356 tmp = legitimize_pe_coff_symbol (symbol, true);
16357
16358 if (tmp)
16359 {
16360 tmp = force_operand (tmp, NULL);
16361 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
16362 op0, 1, OPTAB_DIRECT);
16363 if (tmp == op0)
16364 return;
16365 op1 = convert_to_mode (mode, tmp, 1);
16366 }
16367 }
16368
16369 if ((flag_pic || MACHOPIC_INDIRECT)
16370 && symbolic_operand (op1, mode))
16371 {
16372 if (TARGET_MACHO && !TARGET_64BIT)
16373 {
16374 #if TARGET_MACHO
16375 /* dynamic-no-pic */
16376 if (MACHOPIC_INDIRECT)
16377 {
16378 rtx temp = ((reload_in_progress
16379 || ((op0 && REG_P (op0))
16380 && mode == Pmode))
16381 ? op0 : gen_reg_rtx (Pmode));
16382 op1 = machopic_indirect_data_reference (op1, temp);
16383 if (MACHOPIC_PURE)
16384 op1 = machopic_legitimize_pic_address (op1, mode,
16385 temp == op1 ? 0 : temp);
16386 }
16387 if (op0 != op1 && GET_CODE (op0) != MEM)
16388 {
16389 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16390 emit_insn (insn);
16391 return;
16392 }
16393 if (GET_CODE (op0) == MEM)
16394 op1 = force_reg (Pmode, op1);
16395 else
16396 {
16397 rtx temp = op0;
16398 if (GET_CODE (temp) != REG)
16399 temp = gen_reg_rtx (Pmode);
16400 temp = legitimize_pic_address (op1, temp);
16401 if (temp == op0)
16402 return;
16403 op1 = temp;
16404 }
16405 /* dynamic-no-pic */
16406 #endif
16407 }
16408 else
16409 {
16410 if (MEM_P (op0))
16411 op1 = force_reg (mode, op1);
16412 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16413 {
16414 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16415 op1 = legitimize_pic_address (op1, reg);
16416 if (op0 == op1)
16417 return;
16418 op1 = convert_to_mode (mode, op1, 1);
16419 }
16420 }
16421 }
16422 else
16423 {
16424 if (MEM_P (op0)
16425 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16426 || !push_operand (op0, mode))
16427 && MEM_P (op1))
16428 op1 = force_reg (mode, op1);
16429
16430 if (push_operand (op0, mode)
16431 && ! general_no_elim_operand (op1, mode))
16432 op1 = copy_to_mode_reg (mode, op1);
16433
16434 /* Force large constants in 64bit compilation into register
16435 to get them CSEed. */
16436 if (can_create_pseudo_p ()
16437 && (mode == DImode) && TARGET_64BIT
16438 && immediate_operand (op1, mode)
16439 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16440 && !register_operand (op0, mode)
16441 && optimize)
16442 op1 = copy_to_mode_reg (mode, op1);
16443
16444 if (can_create_pseudo_p ()
16445 && FLOAT_MODE_P (mode)
16446 && GET_CODE (op1) == CONST_DOUBLE)
16447 {
16448 /* If we are loading a floating point constant to a register,
16449 force the value to memory now, since we'll get better code
16450 out the back end. */
16451
16452 op1 = validize_mem (force_const_mem (mode, op1));
16453 if (!register_operand (op0, mode))
16454 {
16455 rtx temp = gen_reg_rtx (mode);
16456 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16457 emit_move_insn (op0, temp);
16458 return;
16459 }
16460 }
16461 }
16462
16463 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16464 }
16465
16466 void
16467 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16468 {
16469 rtx op0 = operands[0], op1 = operands[1];
16470 unsigned int align = GET_MODE_ALIGNMENT (mode);
16471
16472 /* Force constants other than zero into memory. We do not know how
16473 the instructions used to build constants modify the upper 64 bits
16474      of the register; once we have that information we may be able
16475 to handle some of them more efficiently. */
16476 if (can_create_pseudo_p ()
16477 && register_operand (op0, mode)
16478 && (CONSTANT_P (op1)
16479 || (GET_CODE (op1) == SUBREG
16480 && CONSTANT_P (SUBREG_REG (op1))))
16481 && !standard_sse_constant_p (op1))
16482 op1 = validize_mem (force_const_mem (mode, op1));
16483
16484 /* We need to check memory alignment for SSE mode since attribute
16485 can make operands unaligned. */
16486 if (can_create_pseudo_p ()
16487 && SSE_REG_MODE_P (mode)
16488 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16489 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16490 {
16491 rtx tmp[2];
16492
16493 /* ix86_expand_vector_move_misalign() does not like constants ... */
16494 if (CONSTANT_P (op1)
16495 || (GET_CODE (op1) == SUBREG
16496 && CONSTANT_P (SUBREG_REG (op1))))
16497 op1 = validize_mem (force_const_mem (mode, op1));
16498
16499 /* ... nor both arguments in memory. */
16500 if (!register_operand (op0, mode)
16501 && !register_operand (op1, mode))
16502 op1 = force_reg (mode, op1);
16503
16504 tmp[0] = op0; tmp[1] = op1;
16505 ix86_expand_vector_move_misalign (mode, tmp);
16506 return;
16507 }
16508
16509 /* Make operand1 a register if it isn't already. */
16510 if (can_create_pseudo_p ()
16511 && !register_operand (op0, mode)
16512 && !register_operand (op1, mode))
16513 {
16514 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16515 return;
16516 }
16517
16518 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16519 }
16520
16521 /* Split 32-byte AVX unaligned load and store if needed. */
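/* Rough sketch of the strategy below: when the corresponding tuning flag is
   set, an unaligned 256-bit load becomes a 128-bit load of the low half
   followed by a VEC_CONCAT with the high half (effectively a vinsertf128),
   and an unaligned 256-bit store becomes two vextractf128 stores; otherwise
   a single unaligned move is emitted.  */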
16522
16523 static void
16524 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16525 {
16526 rtx m;
16527 rtx (*extract) (rtx, rtx, rtx);
16528 rtx (*load_unaligned) (rtx, rtx);
16529 rtx (*store_unaligned) (rtx, rtx);
16530 enum machine_mode mode;
16531
16532 switch (GET_MODE (op0))
16533 {
16534 default:
16535 gcc_unreachable ();
16536 case V32QImode:
16537 extract = gen_avx_vextractf128v32qi;
16538 load_unaligned = gen_avx_loaddquv32qi;
16539 store_unaligned = gen_avx_storedquv32qi;
16540 mode = V16QImode;
16541 break;
16542 case V8SFmode:
16543 extract = gen_avx_vextractf128v8sf;
16544 load_unaligned = gen_avx_loadups256;
16545 store_unaligned = gen_avx_storeups256;
16546 mode = V4SFmode;
16547 break;
16548 case V4DFmode:
16549 extract = gen_avx_vextractf128v4df;
16550 load_unaligned = gen_avx_loadupd256;
16551 store_unaligned = gen_avx_storeupd256;
16552 mode = V2DFmode;
16553 break;
16554 }
16555
16556 if (MEM_P (op1))
16557 {
16558 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16559 {
16560 rtx r = gen_reg_rtx (mode);
16561 m = adjust_address (op1, mode, 0);
16562 emit_move_insn (r, m);
16563 m = adjust_address (op1, mode, 16);
16564 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16565 emit_move_insn (op0, r);
16566 }
16567 /* Normal *mov<mode>_internal pattern will handle
16568 unaligned loads just fine if misaligned_operand
16569 is true, and without the UNSPEC it can be combined
16570 with arithmetic instructions. */
16571 else if (misaligned_operand (op1, GET_MODE (op1)))
16572 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16573 else
16574 emit_insn (load_unaligned (op0, op1));
16575 }
16576 else if (MEM_P (op0))
16577 {
16578 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16579 {
16580 m = adjust_address (op0, mode, 0);
16581 emit_insn (extract (m, op1, const0_rtx));
16582 m = adjust_address (op0, mode, 16);
16583 emit_insn (extract (m, op1, const1_rtx));
16584 }
16585 else
16586 emit_insn (store_unaligned (op0, op1));
16587 }
16588 else
16589 gcc_unreachable ();
16590 }
16591
16592 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16593 straight to ix86_expand_vector_move. */
16594 /* Code generation for scalar reg-reg moves of single and double precision data:
16595 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16596 movaps reg, reg
16597 else
16598 movss reg, reg
16599 if (x86_sse_partial_reg_dependency == true)
16600 movapd reg, reg
16601 else
16602 movsd reg, reg
16603
16604 Code generation for scalar loads of double precision data:
16605 if (x86_sse_split_regs == true)
16606 movlpd mem, reg (gas syntax)
16607 else
16608 movsd mem, reg
16609
16610 Code generation for unaligned packed loads of single precision data
16611 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16612 if (x86_sse_unaligned_move_optimal)
16613 movups mem, reg
16614
16615 if (x86_sse_partial_reg_dependency == true)
16616 {
16617 xorps reg, reg
16618 movlps mem, reg
16619 movhps mem+8, reg
16620 }
16621 else
16622 {
16623 movlps mem, reg
16624 movhps mem+8, reg
16625 }
16626
16627 Code generation for unaligned packed loads of double precision data
16628 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16629 if (x86_sse_unaligned_move_optimal)
16630 movupd mem, reg
16631
16632 if (x86_sse_split_regs == true)
16633 {
16634 movlpd mem, reg
16635 movhpd mem+8, reg
16636 }
16637 else
16638 {
16639 movsd mem, reg
16640 movhpd mem+8, reg
16641 }
16642 */
16643
16644 void
16645 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16646 {
16647 rtx op0, op1, orig_op0 = NULL_RTX, m;
16648 rtx (*load_unaligned) (rtx, rtx);
16649 rtx (*store_unaligned) (rtx, rtx);
16650
16651 op0 = operands[0];
16652 op1 = operands[1];
16653
16654 if (GET_MODE_SIZE (mode) == 64)
16655 {
16656 switch (GET_MODE_CLASS (mode))
16657 {
16658 case MODE_VECTOR_INT:
16659 case MODE_INT:
16660 if (GET_MODE (op0) != V16SImode)
16661 {
16662 if (!MEM_P (op0))
16663 {
16664 orig_op0 = op0;
16665 op0 = gen_reg_rtx (V16SImode);
16666 }
16667 else
16668 op0 = gen_lowpart (V16SImode, op0);
16669 }
16670 op1 = gen_lowpart (V16SImode, op1);
16671 /* FALLTHRU */
16672
16673 case MODE_VECTOR_FLOAT:
16674 switch (GET_MODE (op0))
16675 {
16676 default:
16677 gcc_unreachable ();
16678 case V16SImode:
16679 load_unaligned = gen_avx512f_loaddquv16si;
16680 store_unaligned = gen_avx512f_storedquv16si;
16681 break;
16682 case V16SFmode:
16683 load_unaligned = gen_avx512f_loadups512;
16684 store_unaligned = gen_avx512f_storeups512;
16685 break;
16686 case V8DFmode:
16687 load_unaligned = gen_avx512f_loadupd512;
16688 store_unaligned = gen_avx512f_storeupd512;
16689 break;
16690 }
16691
16692 if (MEM_P (op1))
16693 emit_insn (load_unaligned (op0, op1));
16694 else if (MEM_P (op0))
16695 emit_insn (store_unaligned (op0, op1));
16696 else
16697 gcc_unreachable ();
16698 if (orig_op0)
16699 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16700 break;
16701
16702 default:
16703 gcc_unreachable ();
16704 }
16705
16706 return;
16707 }
16708
16709 if (TARGET_AVX
16710 && GET_MODE_SIZE (mode) == 32)
16711 {
16712 switch (GET_MODE_CLASS (mode))
16713 {
16714 case MODE_VECTOR_INT:
16715 case MODE_INT:
16716 if (GET_MODE (op0) != V32QImode)
16717 {
16718 if (!MEM_P (op0))
16719 {
16720 orig_op0 = op0;
16721 op0 = gen_reg_rtx (V32QImode);
16722 }
16723 else
16724 op0 = gen_lowpart (V32QImode, op0);
16725 }
16726 op1 = gen_lowpart (V32QImode, op1);
16727 /* FALLTHRU */
16728
16729 case MODE_VECTOR_FLOAT:
16730 ix86_avx256_split_vector_move_misalign (op0, op1);
16731 if (orig_op0)
16732 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16733 break;
16734
16735 default:
16736 gcc_unreachable ();
16737 }
16738
16739 return;
16740 }
16741
16742 if (MEM_P (op1))
16743 {
16744 /* Normal *mov<mode>_internal pattern will handle
16745 unaligned loads just fine if misaligned_operand
16746 is true, and without the UNSPEC it can be combined
16747 with arithmetic instructions. */
16748 if (TARGET_AVX
16749 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16750 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16751 && misaligned_operand (op1, GET_MODE (op1)))
16752 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16753 /* ??? If we have typed data, then it would appear that using
16754 movdqu is the only way to get unaligned data loaded with
16755 integer type. */
16756 else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16757 {
16758 if (GET_MODE (op0) != V16QImode)
16759 {
16760 orig_op0 = op0;
16761 op0 = gen_reg_rtx (V16QImode);
16762 }
16763 op1 = gen_lowpart (V16QImode, op1);
16764 /* We will eventually emit movups based on insn attributes. */
16765 emit_insn (gen_sse2_loaddquv16qi (op0, op1));
16766 if (orig_op0)
16767 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
16768 }
16769 else if (TARGET_SSE2 && mode == V2DFmode)
16770 {
16771 rtx zero;
16772
16773 if (TARGET_AVX
16774 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16775 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16776 || optimize_insn_for_size_p ())
16777 {
16778 /* We will eventually emit movups based on insn attributes. */
16779 emit_insn (gen_sse2_loadupd (op0, op1));
16780 return;
16781 }
16782
16783 /* When SSE registers are split into halves, we can avoid
16784 writing to the top half twice. */
16785 if (TARGET_SSE_SPLIT_REGS)
16786 {
16787 emit_clobber (op0);
16788 zero = op0;
16789 }
16790 else
16791 {
16792 /* ??? Not sure about the best option for the Intel chips.
16793 The following would seem to satisfy; the register is
16794 entirely cleared, breaking the dependency chain. We
16795 then store to the upper half, with a dependency depth
16796 of one. A rumor has it that Intel recommends two movsd
16797 followed by an unpacklpd, but this is unconfirmed. And
16798 given that the dependency depth of the unpacklpd would
16799 still be one, I'm not sure why this would be better. */
16800 zero = CONST0_RTX (V2DFmode);
16801 }
16802
16803 m = adjust_address (op1, DFmode, 0);
16804 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16805 m = adjust_address (op1, DFmode, 8);
16806 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16807 }
16808 else
16809 {
16810 rtx t;
16811
16812 if (TARGET_AVX
16813 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16814 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16815 || optimize_insn_for_size_p ())
16816 {
16817 if (GET_MODE (op0) != V4SFmode)
16818 {
16819 orig_op0 = op0;
16820 op0 = gen_reg_rtx (V4SFmode);
16821 }
16822 op1 = gen_lowpart (V4SFmode, op1);
16823 emit_insn (gen_sse_loadups (op0, op1));
16824 if (orig_op0)
16825 emit_move_insn (orig_op0,
16826 gen_lowpart (GET_MODE (orig_op0), op0));
16827 return;
16828 }
16829
16830 if (mode != V4SFmode)
16831 t = gen_reg_rtx (V4SFmode);
16832 else
16833 t = op0;
16834
16835 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16836 emit_move_insn (t, CONST0_RTX (V4SFmode));
16837 else
16838 emit_clobber (t);
16839
16840 m = adjust_address (op1, V2SFmode, 0);
16841 emit_insn (gen_sse_loadlps (t, t, m));
16842 m = adjust_address (op1, V2SFmode, 8);
16843 emit_insn (gen_sse_loadhps (t, t, m));
16844 if (mode != V4SFmode)
16845 emit_move_insn (op0, gen_lowpart (mode, t));
16846 }
16847 }
16848 else if (MEM_P (op0))
16849 {
16850 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16851 {
16852 op0 = gen_lowpart (V16QImode, op0);
16853 op1 = gen_lowpart (V16QImode, op1);
16854 /* We will eventually emit movups based on insn attributes. */
16855 emit_insn (gen_sse2_storedquv16qi (op0, op1));
16856 }
16857 else if (TARGET_SSE2 && mode == V2DFmode)
16858 {
16859 if (TARGET_AVX
16860 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16861 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16862 || optimize_insn_for_size_p ())
16863 /* We will eventually emit movups based on insn attributes. */
16864 emit_insn (gen_sse2_storeupd (op0, op1));
16865 else
16866 {
16867 m = adjust_address (op0, DFmode, 0);
16868 emit_insn (gen_sse2_storelpd (m, op1));
16869 m = adjust_address (op0, DFmode, 8);
16870 emit_insn (gen_sse2_storehpd (m, op1));
16871 }
16872 }
16873 else
16874 {
16875 if (mode != V4SFmode)
16876 op1 = gen_lowpart (V4SFmode, op1);
16877
16878 if (TARGET_AVX
16879 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16880 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16881 || optimize_insn_for_size_p ())
16882 {
16883 op0 = gen_lowpart (V4SFmode, op0);
16884 emit_insn (gen_sse_storeups (op0, op1));
16885 }
16886 else
16887 {
16888 m = adjust_address (op0, V2SFmode, 0);
16889 emit_insn (gen_sse_storelps (m, op1));
16890 m = adjust_address (op0, V2SFmode, 8);
16891 emit_insn (gen_sse_storehps (m, op1));
16892 }
16893 }
16894 }
16895 else
16896 gcc_unreachable ();
16897 }
16898
16899 /* Expand a push in MODE. This is some mode for which we do not support
16900 proper push instructions, at least from the registers that we expect
16901 the value to live in. */
16902
16903 void
16904 ix86_expand_push (enum machine_mode mode, rtx x)
16905 {
16906 rtx tmp;
16907
16908 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16909 GEN_INT (-GET_MODE_SIZE (mode)),
16910 stack_pointer_rtx, 1, OPTAB_DIRECT);
16911 if (tmp != stack_pointer_rtx)
16912 emit_move_insn (stack_pointer_rtx, tmp);
16913
16914 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16915
16916 /* When we push an operand onto stack, it has to be aligned at least
16917 at the function argument boundary. However since we don't have
16918 the argument type, we can't determine the actual argument
16919 boundary. */
16920 emit_move_insn (tmp, x);
16921 }
16922
16923 /* Helper function of ix86_fixup_binary_operands to canonicalize
16924 operand order. Returns true if the operands should be swapped. */
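/* Added note: x86 two-address arithmetic requires the destination to match
   the first source operand, so for commutative operators swapping when the
   destination already matches the second source (or when an immediate or
   memory operand would otherwise come first) avoids an extra move.  */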
16925
16926 static bool
16927 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16928 rtx operands[])
16929 {
16930 rtx dst = operands[0];
16931 rtx src1 = operands[1];
16932 rtx src2 = operands[2];
16933
16934 /* If the operation is not commutative, we can't do anything. */
16935 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16936 return false;
16937
16938 /* Highest priority is that src1 should match dst. */
16939 if (rtx_equal_p (dst, src1))
16940 return false;
16941 if (rtx_equal_p (dst, src2))
16942 return true;
16943
16944 /* Next highest priority is that immediate constants come second. */
16945 if (immediate_operand (src2, mode))
16946 return false;
16947 if (immediate_operand (src1, mode))
16948 return true;
16949
16950 /* Lowest priority is that memory references should come second. */
16951 if (MEM_P (src2))
16952 return false;
16953 if (MEM_P (src1))
16954 return true;
16955
16956 return false;
16957 }
16958
16959
16960 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16961 destination to use for the operation. If different from the true
16962 destination in operands[0], a copy operation will be required. */
16963
16964 rtx
16965 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16966 rtx operands[])
16967 {
16968 rtx dst = operands[0];
16969 rtx src1 = operands[1];
16970 rtx src2 = operands[2];
16971
16972 /* Canonicalize operand order. */
16973 if (ix86_swap_binary_operands_p (code, mode, operands))
16974 {
16975 rtx temp;
16976
16977 /* It is invalid to swap operands of different modes. */
16978 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16979
16980 temp = src1;
16981 src1 = src2;
16982 src2 = temp;
16983 }
16984
16985 /* Both source operands cannot be in memory. */
16986 if (MEM_P (src1) && MEM_P (src2))
16987 {
16988 /* Optimization: Only read from memory once. */
16989 if (rtx_equal_p (src1, src2))
16990 {
16991 src2 = force_reg (mode, src2);
16992 src1 = src2;
16993 }
16994 else if (rtx_equal_p (dst, src1))
16995 src2 = force_reg (mode, src2);
16996 else
16997 src1 = force_reg (mode, src1);
16998 }
16999
17000 /* If the destination is memory, and we do not have matching source
17001 operands, do things in registers. */
17002 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17003 dst = gen_reg_rtx (mode);
17004
17005 /* Source 1 cannot be a constant. */
17006 if (CONSTANT_P (src1))
17007 src1 = force_reg (mode, src1);
17008
17009 /* Source 1 cannot be a non-matching memory. */
17010 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17011 src1 = force_reg (mode, src1);
17012
17013 /* Improve address combine. */
17014 if (code == PLUS
17015 && GET_MODE_CLASS (mode) == MODE_INT
17016 && MEM_P (src2))
17017 src2 = force_reg (mode, src2);
17018
17019 operands[1] = src1;
17020 operands[2] = src2;
17021 return dst;
17022 }
17023
17024 /* Similarly, but assume that the destination has already been
17025 set up properly. */
17026
17027 void
17028 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
17029 enum machine_mode mode, rtx operands[])
17030 {
17031 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
17032 gcc_assert (dst == operands[0]);
17033 }
17034
17035 /* Attempt to expand a binary operator.  Make the expansion closer to the
17036    actual machine than just general_operand, which would allow 3 separate
17037    memory references (one output, two input) in a single insn.  */
17038
17039 void
17040 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
17041 rtx operands[])
17042 {
17043 rtx src1, src2, dst, op, clob;
17044
17045 dst = ix86_fixup_binary_operands (code, mode, operands);
17046 src1 = operands[1];
17047 src2 = operands[2];
17048
17049 /* Emit the instruction. */
17050
17051 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
17052 if (reload_in_progress)
17053 {
17054 /* Reload doesn't know about the flags register, and doesn't know that
17055 it doesn't want to clobber it. We can only do this with PLUS. */
17056 gcc_assert (code == PLUS);
17057 emit_insn (op);
17058 }
17059 else if (reload_completed
17060 && code == PLUS
17061 && !rtx_equal_p (dst, src1))
17062 {
17063 /* This is going to be an LEA; avoid splitting it later. */
17064 emit_insn (op);
17065 }
17066 else
17067 {
17068 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17069 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17070 }
17071
17072 /* Fix up the destination if needed. */
17073 if (dst != operands[0])
17074 emit_move_insn (operands[0], dst);
17075 }
17076
17077 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
17078 the given OPERANDS. */
17079
17080 void
17081 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
17082 rtx operands[])
17083 {
17084 rtx op1 = NULL_RTX, op2 = NULL_RTX;
17085 if (GET_CODE (operands[1]) == SUBREG)
17086 {
17087 op1 = operands[1];
17088 op2 = operands[2];
17089 }
17090 else if (GET_CODE (operands[2]) == SUBREG)
17091 {
17092 op1 = operands[2];
17093 op2 = operands[1];
17094 }
17095 /* Optimize (__m128i) d | (__m128i) e and similar code
17096 when d and e are float vectors into float vector logical
17097 insn. In C/C++ without using intrinsics there is no other way
17098 to express vector logical operation on float vectors than
17099 to cast them temporarily to integer vectors. */
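/* Illustrative example (not from the original sources): for
     __m128 a, b;  ...  (__m128) ((__m128i) a & (__m128i) b)
   the transformation below lets us emit andps instead of pand, keeping the
   values in the floating-point domain and avoiding a domain-crossing
   penalty on targets where that matters.  */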
17100 if (op1
17101 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
17102 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
17103 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
17104 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
17105 && SUBREG_BYTE (op1) == 0
17106 && (GET_CODE (op2) == CONST_VECTOR
17107 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
17108 && SUBREG_BYTE (op2) == 0))
17109 && can_create_pseudo_p ())
17110 {
17111 rtx dst;
17112 switch (GET_MODE (SUBREG_REG (op1)))
17113 {
17114 case V4SFmode:
17115 case V8SFmode:
17116 case V2DFmode:
17117 case V4DFmode:
17118 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
17119 if (GET_CODE (op2) == CONST_VECTOR)
17120 {
17121 op2 = gen_lowpart (GET_MODE (dst), op2);
17122 op2 = force_reg (GET_MODE (dst), op2);
17123 }
17124 else
17125 {
17126 op1 = operands[1];
17127 op2 = SUBREG_REG (operands[2]);
17128 if (!nonimmediate_operand (op2, GET_MODE (dst)))
17129 op2 = force_reg (GET_MODE (dst), op2);
17130 }
17131 op1 = SUBREG_REG (op1);
17132 if (!nonimmediate_operand (op1, GET_MODE (dst)))
17133 op1 = force_reg (GET_MODE (dst), op1);
17134 emit_insn (gen_rtx_SET (VOIDmode, dst,
17135 gen_rtx_fmt_ee (code, GET_MODE (dst),
17136 op1, op2)));
17137 emit_move_insn (operands[0], gen_lowpart (mode, dst));
17138 return;
17139 default:
17140 break;
17141 }
17142 }
17143 if (!nonimmediate_operand (operands[1], mode))
17144 operands[1] = force_reg (mode, operands[1]);
17145 if (!nonimmediate_operand (operands[2], mode))
17146 operands[2] = force_reg (mode, operands[2]);
17147 ix86_fixup_binary_operands_no_copy (code, mode, operands);
17148 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
17149 gen_rtx_fmt_ee (code, mode, operands[1],
17150 operands[2])));
17151 }
17152
17153 /* Return TRUE or FALSE depending on whether the binary operator meets the
17154 appropriate constraints. */
17155
17156 bool
17157 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
17158 rtx operands[3])
17159 {
17160 rtx dst = operands[0];
17161 rtx src1 = operands[1];
17162 rtx src2 = operands[2];
17163
17164 /* Both source operands cannot be in memory. */
17165 if (MEM_P (src1) && MEM_P (src2))
17166 return false;
17167
17168 /* Canonicalize operand order for commutative operators. */
17169 if (ix86_swap_binary_operands_p (code, mode, operands))
17170 {
17171 rtx temp = src1;
17172 src1 = src2;
17173 src2 = temp;
17174 }
17175
17176 /* If the destination is memory, we must have a matching source operand. */
17177 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
17178 return false;
17179
17180 /* Source 1 cannot be a constant. */
17181 if (CONSTANT_P (src1))
17182 return false;
17183
17184 /* Source 1 cannot be a non-matching memory. */
17185 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
17186 /* Support "andhi/andsi/anddi" as a zero-extending move. */
17187 return (code == AND
17188 && (mode == HImode
17189 || mode == SImode
17190 || (TARGET_64BIT && mode == DImode))
17191 && satisfies_constraint_L (src2));
17192
17193 return true;
17194 }
17195
17196 /* Attempt to expand a unary operator.  Make the expansion closer to the
17197    actual machine than just general_operand, which would allow 2 separate
17198    memory references (one output, one input) in a single insn.  */
17199
17200 void
17201 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
17202 rtx operands[])
17203 {
17204 int matching_memory;
17205 rtx src, dst, op, clob;
17206
17207 dst = operands[0];
17208 src = operands[1];
17209
17210 /* If the destination is memory, and we do not have matching source
17211 operands, do things in registers. */
17212 matching_memory = 0;
17213 if (MEM_P (dst))
17214 {
17215 if (rtx_equal_p (dst, src))
17216 matching_memory = 1;
17217 else
17218 dst = gen_reg_rtx (mode);
17219 }
17220
17221 /* When source operand is memory, destination must match. */
17222 if (MEM_P (src) && !matching_memory)
17223 src = force_reg (mode, src);
17224
17225 /* Emit the instruction. */
17226
17227 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
17228 if (reload_in_progress || code == NOT)
17229 {
17230 /* Reload doesn't know about the flags register, and doesn't know that
17231 it doesn't want to clobber it. */
17232 gcc_assert (code == NOT);
17233 emit_insn (op);
17234 }
17235 else
17236 {
17237 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17238 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17239 }
17240
17241 /* Fix up the destination if needed. */
17242 if (dst != operands[0])
17243 emit_move_insn (operands[0], dst);
17244 }
17245
17246 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
17247 divisor are within the range [0-255]. */
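/* Sketch of the generated control flow (informal): the dividend and divisor
   are ORed into a scratch register and tested against ~0xff; if no bit above
   bit 7 is set we branch to a fast path that uses the 8-bit divide (quotient
   in AL, remainder in AH), otherwise the ordinary full-width divmod pattern
   is used, and both paths join at a common end label.  */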
17248
17249 void
17250 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
17251 bool signed_p)
17252 {
17253 rtx end_label, qimode_label;
17254 rtx insn, div, mod;
17255 rtx scratch, tmp0, tmp1, tmp2;
17256 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
17257 rtx (*gen_zero_extend) (rtx, rtx);
17258 rtx (*gen_test_ccno_1) (rtx, rtx);
17259
17260 switch (mode)
17261 {
17262 case SImode:
17263 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
17264 gen_test_ccno_1 = gen_testsi_ccno_1;
17265 gen_zero_extend = gen_zero_extendqisi2;
17266 break;
17267 case DImode:
17268 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
17269 gen_test_ccno_1 = gen_testdi_ccno_1;
17270 gen_zero_extend = gen_zero_extendqidi2;
17271 break;
17272 default:
17273 gcc_unreachable ();
17274 }
17275
17276 end_label = gen_label_rtx ();
17277 qimode_label = gen_label_rtx ();
17278
17279 scratch = gen_reg_rtx (mode);
17280
17281   /* Use 8bit unsigned divmod if dividend and divisor are within
17282      the range [0-255].  */
17283 emit_move_insn (scratch, operands[2]);
17284 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
17285 scratch, 1, OPTAB_DIRECT);
17286 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
17287 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
17288 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
17289 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
17290 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
17291 pc_rtx);
17292 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
17293 predict_jump (REG_BR_PROB_BASE * 50 / 100);
17294 JUMP_LABEL (insn) = qimode_label;
17295
17296   /* Generate original signed/unsigned divmod.  */
17297 div = gen_divmod4_1 (operands[0], operands[1],
17298 operands[2], operands[3]);
17299 emit_insn (div);
17300
17301 /* Branch to the end. */
17302 emit_jump_insn (gen_jump (end_label));
17303 emit_barrier ();
17304
17305 /* Generate 8bit unsigned divide. */
17306 emit_label (qimode_label);
17307 /* Don't use operands[0] for result of 8bit divide since not all
17308 registers support QImode ZERO_EXTRACT. */
17309 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
17310 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
17311 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
17312 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
17313
17314 if (signed_p)
17315 {
17316 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
17317 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
17318 }
17319 else
17320 {
17321 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
17322 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
17323 }
17324
17325 /* Extract remainder from AH. */
17326 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
17327 if (REG_P (operands[1]))
17328 insn = emit_move_insn (operands[1], tmp1);
17329 else
17330 {
17331 /* Need a new scratch register since the old one has result
17332 of 8bit divide. */
17333 scratch = gen_reg_rtx (mode);
17334 emit_move_insn (scratch, tmp1);
17335 insn = emit_move_insn (operands[1], scratch);
17336 }
17337 set_unique_reg_note (insn, REG_EQUAL, mod);
17338
17339 /* Zero extend quotient from AL. */
17340 tmp1 = gen_lowpart (QImode, tmp0);
17341 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
17342 set_unique_reg_note (insn, REG_EQUAL, div);
17343
17344 emit_label (end_label);
17345 }
17346
17347 #define LEA_MAX_STALL (3)
17348 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
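/* Added note: the distances computed below are accumulated in half-cycles
   (see increase_distance and the final ">> 1" conversions), so
   LEA_SEARCH_THRESHOLD is simply LEA_MAX_STALL expressed in half-cycle
   units.  */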
17349
17350 /* Increase given DISTANCE in half-cycles according to
17351 dependencies between PREV and NEXT instructions.
17352 Add 1 half-cycle if there is no dependency and
17353    go to the next cycle if there is a dependency.  */
17354
17355 static unsigned int
17356 increase_distance (rtx prev, rtx next, unsigned int distance)
17357 {
17358 df_ref *use_rec;
17359 df_ref *def_rec;
17360
17361 if (!prev || !next)
17362 return distance + (distance & 1) + 2;
17363
17364 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
17365 return distance + 1;
17366
17367 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
17368 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
17369 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
17370 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
17371 return distance + (distance & 1) + 2;
17372
17373 return distance + 1;
17374 }
17375
17376 /* Function checks if instruction INSN defines register number
17377 REGNO1 or REGNO2. */
17378
17379 static bool
17380 insn_defines_reg (unsigned int regno1, unsigned int regno2,
17381 rtx insn)
17382 {
17383 df_ref *def_rec;
17384
17385 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
17386 if (DF_REF_REG_DEF_P (*def_rec)
17387 && !DF_REF_IS_ARTIFICIAL (*def_rec)
17388 && (regno1 == DF_REF_REGNO (*def_rec)
17389 || regno2 == DF_REF_REGNO (*def_rec)))
17390 {
17391 return true;
17392 }
17393
17394 return false;
17395 }
17396
17397 /* Function checks if instruction INSN uses register number
17398 REGNO as a part of address expression. */
17399
17400 static bool
17401 insn_uses_reg_mem (unsigned int regno, rtx insn)
17402 {
17403 df_ref *use_rec;
17404
17405 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
17406 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
17407 return true;
17408
17409 return false;
17410 }
17411
17412 /* Search backward for non-agu definition of register number REGNO1
17413 or register number REGNO2 in basic block starting from instruction
17414 START up to head of basic block or instruction INSN.
17415
17416 Function puts true value into *FOUND var if definition was found
17417 and false otherwise.
17418
17419 Distance in half-cycles between START and found instruction or head
17420 of BB is added to DISTANCE and returned. */
17421
17422 static int
17423 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
17424 rtx insn, int distance,
17425 rtx start, bool *found)
17426 {
17427 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17428 rtx prev = start;
17429 rtx next = NULL;
17430
17431 *found = false;
17432
17433 while (prev
17434 && prev != insn
17435 && distance < LEA_SEARCH_THRESHOLD)
17436 {
17437 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
17438 {
17439 distance = increase_distance (prev, next, distance);
17440 if (insn_defines_reg (regno1, regno2, prev))
17441 {
17442 if (recog_memoized (prev) < 0
17443 || get_attr_type (prev) != TYPE_LEA)
17444 {
17445 *found = true;
17446 return distance;
17447 }
17448 }
17449
17450 next = prev;
17451 }
17452 if (prev == BB_HEAD (bb))
17453 break;
17454
17455 prev = PREV_INSN (prev);
17456 }
17457
17458 return distance;
17459 }
17460
17461 /* Search backward for non-agu definition of register number REGNO1
17462 or register number REGNO2 in INSN's basic block until
17463 1. Pass LEA_SEARCH_THRESHOLD instructions, or
17464 2. Reach neighbour BBs boundary, or
17465 3. Reach agu definition.
17466 Returns the distance between the non-agu definition point and INSN.
17467 If no definition point, returns -1. */
17468
17469 static int
17470 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
17471 rtx insn)
17472 {
17473 basic_block bb = BLOCK_FOR_INSN (insn);
17474 int distance = 0;
17475 bool found = false;
17476
17477 if (insn != BB_HEAD (bb))
17478 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
17479 distance, PREV_INSN (insn),
17480 &found);
17481
17482 if (!found && distance < LEA_SEARCH_THRESHOLD)
17483 {
17484 edge e;
17485 edge_iterator ei;
17486 bool simple_loop = false;
17487
17488 FOR_EACH_EDGE (e, ei, bb->preds)
17489 if (e->src == bb)
17490 {
17491 simple_loop = true;
17492 break;
17493 }
17494
17495 if (simple_loop)
17496 distance = distance_non_agu_define_in_bb (regno1, regno2,
17497 insn, distance,
17498 BB_END (bb), &found);
17499 else
17500 {
17501 int shortest_dist = -1;
17502 bool found_in_bb = false;
17503
17504 FOR_EACH_EDGE (e, ei, bb->preds)
17505 {
17506 int bb_dist
17507 = distance_non_agu_define_in_bb (regno1, regno2,
17508 insn, distance,
17509 BB_END (e->src),
17510 &found_in_bb);
17511 if (found_in_bb)
17512 {
17513 if (shortest_dist < 0)
17514 shortest_dist = bb_dist;
17515 else if (bb_dist > 0)
17516 shortest_dist = MIN (bb_dist, shortest_dist);
17517
17518 found = true;
17519 }
17520 }
17521
17522 distance = shortest_dist;
17523 }
17524 }
17525
17526 /* get_attr_type may modify recog data. We want to make sure
17527 that recog data is valid for instruction INSN, on which
17528 distance_non_agu_define is called. INSN is unchanged here. */
17529 extract_insn_cached (insn);
17530
17531 if (!found)
17532 return -1;
17533
17534 return distance >> 1;
17535 }
17536
17537 /* Return the distance in half-cycles between INSN and the next
17538    insn that uses register number REGNO in a memory address, added
17539    to DISTANCE.  Return -1 if REGNO is redefined first.
17540
17541 Put true value into *FOUND if register usage was found and
17542 false otherwise.
17543 Put true value into *REDEFINED if register redefinition was
17544 found and false otherwise. */
17545
17546 static int
17547 distance_agu_use_in_bb (unsigned int regno,
17548 rtx insn, int distance, rtx start,
17549 bool *found, bool *redefined)
17550 {
17551 basic_block bb = NULL;
17552 rtx next = start;
17553 rtx prev = NULL;
17554
17555 *found = false;
17556 *redefined = false;
17557
17558 if (start != NULL_RTX)
17559 {
17560 bb = BLOCK_FOR_INSN (start);
17561 if (start != BB_HEAD (bb))
17562 /* If insn and start belong to the same bb, set prev to insn,
17563 so the call to increase_distance will increase the distance
17564 between insns by 1. */
17565 prev = insn;
17566 }
17567
17568 while (next
17569 && next != insn
17570 && distance < LEA_SEARCH_THRESHOLD)
17571 {
17572 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17573 {
17574 distance = increase_distance(prev, next, distance);
17575 if (insn_uses_reg_mem (regno, next))
17576 {
17577 /* Return DISTANCE if OP0 is used in memory
17578 address in NEXT. */
17579 *found = true;
17580 return distance;
17581 }
17582
17583 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17584 {
17585 /* Return -1 if OP0 is set in NEXT. */
17586 *redefined = true;
17587 return -1;
17588 }
17589
17590 prev = next;
17591 }
17592
17593 if (next == BB_END (bb))
17594 break;
17595
17596 next = NEXT_INSN (next);
17597 }
17598
17599 return distance;
17600 }
17601
17602 /* Return the distance between INSN and the next insn that uses
17603    register number REGNO0 in a memory address.  Return -1 if no such
17604    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
17605
17606 static int
17607 distance_agu_use (unsigned int regno0, rtx insn)
17608 {
17609 basic_block bb = BLOCK_FOR_INSN (insn);
17610 int distance = 0;
17611 bool found = false;
17612 bool redefined = false;
17613
17614 if (insn != BB_END (bb))
17615 distance = distance_agu_use_in_bb (regno0, insn, distance,
17616 NEXT_INSN (insn),
17617 &found, &redefined);
17618
17619 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17620 {
17621 edge e;
17622 edge_iterator ei;
17623 bool simple_loop = false;
17624
17625 FOR_EACH_EDGE (e, ei, bb->succs)
17626 if (e->dest == bb)
17627 {
17628 simple_loop = true;
17629 break;
17630 }
17631
17632 if (simple_loop)
17633 distance = distance_agu_use_in_bb (regno0, insn,
17634 distance, BB_HEAD (bb),
17635 &found, &redefined);
17636 else
17637 {
17638 int shortest_dist = -1;
17639 bool found_in_bb = false;
17640 bool redefined_in_bb = false;
17641
17642 FOR_EACH_EDGE (e, ei, bb->succs)
17643 {
17644 int bb_dist
17645 = distance_agu_use_in_bb (regno0, insn,
17646 distance, BB_HEAD (e->dest),
17647 &found_in_bb, &redefined_in_bb);
17648 if (found_in_bb)
17649 {
17650 if (shortest_dist < 0)
17651 shortest_dist = bb_dist;
17652 else if (bb_dist > 0)
17653 shortest_dist = MIN (bb_dist, shortest_dist);
17654
17655 found = true;
17656 }
17657 }
17658
17659 distance = shortest_dist;
17660 }
17661 }
17662
17663 if (!found || redefined)
17664 return -1;
17665
17666 return distance >> 1;
17667 }
17668
17669 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17670    there is a choice between LEA and ADD.
17671    Negative value: ADD is preferred over LEA.
17672    Zero: Neutral.
17673    Positive value: LEA is preferred over ADD.  */
17674 #define IX86_LEA_PRIORITY 0
17675
17676 /* Return true if using lea INSN has a performance advantage over the
17677    equivalent sequence of instructions.  That sequence has SPLIT_COST
17678    cycles higher latency than the lea itself.  */
17679
17680 static bool
17681 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17682 unsigned int regno2, int split_cost, bool has_scale)
17683 {
17684 int dist_define, dist_use;
17685
17686   /* On Silvermont, a 2-source or 3-source LEA is justified if it is
17687      used for a non-destructive destination or because the scaled
17688      index is needed.  */
17689 if (ix86_tune == PROCESSOR_SLM)
17690 {
17691 if (has_scale)
17692 return true;
17693 if (split_cost < 1)
17694 return false;
17695 if (regno0 == regno1 || regno0 == regno2)
17696 return false;
17697 return true;
17698 }
17699
17700 dist_define = distance_non_agu_define (regno1, regno2, insn);
17701 dist_use = distance_agu_use (regno0, insn);
17702
17703 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17704 {
17705       /* If there is no non-AGU definition of an operand, no AGU
17706 	 use of the result, and the split cost is 0, then the lea
17707 	 and non-lea variants have the same priority.  Currently
17708 	 we prefer lea for 64-bit code and non-lea for 32-bit
17709 	 code.  */
17710 if (dist_use < 0 && split_cost == 0)
17711 return TARGET_64BIT || IX86_LEA_PRIORITY;
17712 else
17713 return true;
17714 }
17715
17716   /* With a longer definition distance, lea is preferable.
17717      Adjust the distance to take the splitting cost and
17718      lea priority into account.  */
17719 dist_define += split_cost + IX86_LEA_PRIORITY;
17720
17721   /* If there is no use in a memory address then we just check
17722      that the adjusted definition distance exceeds the AGU stall.  */
17723 if (dist_use < 0)
17724 return dist_define > LEA_MAX_STALL;
17725
17726 /* If this insn has both backward non-agu dependence and forward
17727 agu dependence, the one with short distance takes effect. */
17728 return dist_define >= dist_use;
17729 }
17730
17731 /* Return true if it is legal to clobber flags by INSN and
17732 false otherwise. */
17733
17734 static bool
17735 ix86_ok_to_clobber_flags (rtx insn)
17736 {
17737 basic_block bb = BLOCK_FOR_INSN (insn);
17738 df_ref *use;
17739 bitmap live;
17740
17741 while (insn)
17742 {
17743 if (NONDEBUG_INSN_P (insn))
17744 {
17745 for (use = DF_INSN_USES (insn); *use; use++)
17746 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17747 return false;
17748
17749 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17750 return true;
17751 }
17752
17753 if (insn == BB_END (bb))
17754 break;
17755
17756 insn = NEXT_INSN (insn);
17757 }
17758
17759 live = df_get_live_out(bb);
17760 return !REGNO_REG_SET_P (live, FLAGS_REG);
17761 }
17762
17763 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17764 move and add to avoid AGU stalls. */
17765
17766 bool
17767 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17768 {
17769 unsigned int regno0, regno1, regno2;
17770
17771 /* Check if we need to optimize. */
17772 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17773 return false;
17774
17775 /* Check it is correct to split here. */
17776 if (!ix86_ok_to_clobber_flags(insn))
17777 return false;
17778
17779 regno0 = true_regnum (operands[0]);
17780 regno1 = true_regnum (operands[1]);
17781 regno2 = true_regnum (operands[2]);
17782
17783 /* We need to split only adds with non destructive
17784 destination operand. */
17785 if (regno0 == regno1 || regno0 == regno2)
17786 return false;
17787 else
17788 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
17789 }
17790
17791 /* Return true if we should emit lea instruction instead of mov
17792 instruction. */
17793
17794 bool
17795 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17796 {
17797 unsigned int regno0, regno1;
17798
17799 /* Check if we need to optimize. */
17800 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17801 return false;
17802
17803 /* Use lea for reg to reg moves only. */
17804 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17805 return false;
17806
17807 regno0 = true_regnum (operands[0]);
17808 regno1 = true_regnum (operands[1]);
17809
17810 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
17811 }
17812
17813 /* Return true if we need to split lea into a sequence of
17814 instructions to avoid AGU stalls. */
17815
17816 bool
17817 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17818 {
17819 unsigned int regno0, regno1, regno2;
17820 int split_cost;
17821 struct ix86_address parts;
17822 int ok;
17823
17824 /* Check we need to optimize. */
17825 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17826 return false;
17827
17828 /* Check it is correct to split here. */
17829 if (!ix86_ok_to_clobber_flags(insn))
17830 return false;
17831
17832 ok = ix86_decompose_address (operands[1], &parts);
17833 gcc_assert (ok);
17834
17835 /* There should be at least two components in the address. */
17836 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17837 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17838 return false;
17839
17840 /* We should not split into add if a non-legitimate PIC
17841 operand is used as the displacement. */
17842 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17843 return false;
17844
17845 regno0 = true_regnum (operands[0]);
17846 regno1 = INVALID_REGNUM;
17847 regno2 = INVALID_REGNUM;
17848
17849 if (parts.base)
17850 regno1 = true_regnum (parts.base);
17851 if (parts.index)
17852 regno2 = true_regnum (parts.index);
17853
17854 split_cost = 0;
17855
17856 /* Compute how many cycles we will add to execution time
17857 if we split the lea into a sequence of instructions. */
17858 if (parts.base || parts.index)
17859 {
17860 /* Have to use a mov instruction if the non-destructive
17861 destination form is used. */
17862 if (regno1 != regno0 && regno2 != regno0)
17863 split_cost += 1;
17864
17865 /* Have to add index to base if both exist. */
17866 if (parts.base && parts.index)
17867 split_cost += 1;
17868
17869 /* Have to use shift and adds if scale is 2 or greater. */
17870 if (parts.scale > 1)
17871 {
17872 if (regno0 != regno1)
17873 split_cost += 1;
17874 else if (regno2 == regno0)
17875 split_cost += 4;
17876 else
17877 split_cost += parts.scale;
17878 }
17879
17880 /* Have to use an add instruction with an immediate if
17881 disp is non-zero. */
17882 if (parts.disp && parts.disp != const0_rtx)
17883 split_cost += 1;
17884
17885 /* Subtract the price of lea. */
17886 split_cost -= 1;
17887 }
17888
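/* Illustrative tally (a sketch): for "lea r0, [r1 + r2*4 + 8]" with r0,
   r1 and r2 all distinct, the additions above give mov (1) + shift (1)
   + add of the base (1) + add of the displacement (1) - lea (1), i.e. a
   split cost of 3 extra instructions.  */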
17889 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
17890 parts.scale > 1);
17891 }
17892
17893 /* Emit x86 binary operator CODE in mode MODE, where the first operand
17894 matches the destination. The emitted RTX includes a clobber of FLAGS_REG. */
17895
17896 static void
17897 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17898 rtx dst, rtx src)
17899 {
17900 rtx op, clob;
17901
17902 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17903 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17904
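/* For example (a sketch), with CODE == PLUS and MODE == SImode the
   parallel emitted below is
     (parallel [(set dst (plus:SI dst src))
                (clobber (reg:CC FLAGS_REG))])  */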
17905 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17906 }
17907
17908 /* Return true if the definition of regno1 is nearer to the insn than that of regno2. */
17909
17910 static bool
17911 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17912 {
17913 rtx prev = insn;
17914 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17915
17916 if (insn == start)
17917 return false;
17918 while (prev && prev != start)
17919 {
17920 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17921 {
17922 prev = PREV_INSN (prev);
17923 continue;
17924 }
17925 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17926 return true;
17927 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17928 return false;
17929 prev = PREV_INSN (prev);
17930 }
17931
17932 /* None of the regs is defined in the bb. */
17933 return false;
17934 }
17935
17936 /* Split lea instructions into a sequence of instructions
17937 which are executed on the ALU to avoid AGU stalls.
17938 It is assumed that the flags register may be clobbered
17939 at the lea position. */
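/* As an illustration (a sketch, not taken from the sources): for SImode,
   "lea r0, [r1 + r2*4 + 8]" with r0, r1 and r2 all distinct is rewritten
   below roughly as
     mov r0, r2
     sal r0, 2
     add r0, r1
     add r0, 8
   while "lea r1, [r1 + r2*2]" (destination equal to the base) becomes two
   "add r1, r2" instructions, all of which execute on the ALU.  */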
17940
17941 void
17942 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17943 {
17944 unsigned int regno0, regno1, regno2;
17945 struct ix86_address parts;
17946 rtx target, tmp;
17947 int ok, adds;
17948
17949 ok = ix86_decompose_address (operands[1], &parts);
17950 gcc_assert (ok);
17951
17952 target = gen_lowpart (mode, operands[0]);
17953
17954 regno0 = true_regnum (target);
17955 regno1 = INVALID_REGNUM;
17956 regno2 = INVALID_REGNUM;
17957
17958 if (parts.base)
17959 {
17960 parts.base = gen_lowpart (mode, parts.base);
17961 regno1 = true_regnum (parts.base);
17962 }
17963
17964 if (parts.index)
17965 {
17966 parts.index = gen_lowpart (mode, parts.index);
17967 regno2 = true_regnum (parts.index);
17968 }
17969
17970 if (parts.disp)
17971 parts.disp = gen_lowpart (mode, parts.disp);
17972
17973 if (parts.scale > 1)
17974 {
17975 /* Case r1 = r1 + ... */
17976 if (regno1 == regno0)
17977 {
17978 /* If we have a case like r1 = r1 + C * r1 then we
17979 would need a multiplication, which is very
17980 expensive. Assume the cost model is wrong if we
17981 get such a case here. */
17982 gcc_assert (regno2 != regno0);
17983
17984 for (adds = parts.scale; adds > 0; adds--)
17985 ix86_emit_binop (PLUS, mode, target, parts.index);
17986 }
17987 else
17988 {
17989 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17990 if (regno0 != regno2)
17991 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17992
17993 /* Use shift for scaling. */
17994 ix86_emit_binop (ASHIFT, mode, target,
17995 GEN_INT (exact_log2 (parts.scale)));
17996
17997 if (parts.base)
17998 ix86_emit_binop (PLUS, mode, target, parts.base);
17999
18000 if (parts.disp && parts.disp != const0_rtx)
18001 ix86_emit_binop (PLUS, mode, target, parts.disp);
18002 }
18003 }
18004 else if (!parts.base && !parts.index)
18005 {
18006 gcc_assert (parts.disp);
18007 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
18008 }
18009 else
18010 {
18011 if (!parts.base)
18012 {
18013 if (regno0 != regno2)
18014 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
18015 }
18016 else if (!parts.index)
18017 {
18018 if (regno0 != regno1)
18019 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
18020 }
18021 else
18022 {
18023 if (regno0 == regno1)
18024 tmp = parts.index;
18025 else if (regno0 == regno2)
18026 tmp = parts.base;
18027 else
18028 {
18029 rtx tmp1;
18030
18031 /* Find better operand for SET instruction, depending
18032 on which definition is farther from the insn. */
18033 if (find_nearest_reg_def (insn, regno1, regno2))
18034 tmp = parts.index, tmp1 = parts.base;
18035 else
18036 tmp = parts.base, tmp1 = parts.index;
18037
18038 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
18039
18040 if (parts.disp && parts.disp != const0_rtx)
18041 ix86_emit_binop (PLUS, mode, target, parts.disp);
18042
18043 ix86_emit_binop (PLUS, mode, target, tmp1);
18044 return;
18045 }
18046
18047 ix86_emit_binop (PLUS, mode, target, tmp);
18048 }
18049
18050 if (parts.disp && parts.disp != const0_rtx)
18051 ix86_emit_binop (PLUS, mode, target, parts.disp);
18052 }
18053 }
18054
18055 /* Return true if it is ok to optimize an ADD operation to LEA
18056 operation to avoid flag register consumption. For most processors,
18057 ADD is faster than LEA. For processors like ATOM, if the
18058 destination register of LEA holds an actual address which will be
18059 used soon, LEA is better; otherwise ADD is better. */
18060
18061 bool
18062 ix86_lea_for_add_ok (rtx insn, rtx operands[])
18063 {
18064 unsigned int regno0 = true_regnum (operands[0]);
18065 unsigned int regno1 = true_regnum (operands[1]);
18066 unsigned int regno2 = true_regnum (operands[2]);
18067
18068 /* If a = b + c, (a != b && a != c), we must use the lea form. */
18069 if (regno0 != regno1 && regno0 != regno2)
18070 return true;
18071
18072 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
18073 return false;
18074
18075 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
18076 }
18077
18078 /* Return true if destination reg of SET_BODY is shift count of
18079 USE_BODY. */
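/* For example (a hypothetical sketch), with
     SET_BODY  (set (reg:QI cx) ...)
     USE_BODY  (set (reg:SI ax) (ashift:SI (reg:SI ax) (reg:QI cx)))
   the destination of SET_BODY is the shift count of USE_BODY, so the
   function returns true.  */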
18080
18081 static bool
18082 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
18083 {
18084 rtx set_dest;
18085 rtx shift_rtx;
18086 int i;
18087
18088 /* Retrieve destination of SET_BODY. */
18089 switch (GET_CODE (set_body))
18090 {
18091 case SET:
18092 set_dest = SET_DEST (set_body);
18093 if (!set_dest || !REG_P (set_dest))
18094 return false;
18095 break;
18096 case PARALLEL:
18097 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
18098 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
18099 use_body))
18100 return true;
18101 default:
18102 return false;
18103 break;
18104 }
18105
18106 /* Retrieve shift count of USE_BODY. */
18107 switch (GET_CODE (use_body))
18108 {
18109 case SET:
18110 shift_rtx = XEXP (use_body, 1);
18111 break;
18112 case PARALLEL:
18113 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
18114 if (ix86_dep_by_shift_count_body (set_body,
18115 XVECEXP (use_body, 0, i)))
18116 return true;
18117 default:
18118 return false;
18119 break;
18120 }
18121
18122 if (shift_rtx
18123 && (GET_CODE (shift_rtx) == ASHIFT
18124 || GET_CODE (shift_rtx) == LSHIFTRT
18125 || GET_CODE (shift_rtx) == ASHIFTRT
18126 || GET_CODE (shift_rtx) == ROTATE
18127 || GET_CODE (shift_rtx) == ROTATERT))
18128 {
18129 rtx shift_count = XEXP (shift_rtx, 1);
18130
18131 /* Return true if shift count is dest of SET_BODY. */
18132 if (REG_P (shift_count))
18133 {
18134 /* Add this check since we can be invoked before register
18135 allocation by the pre-reload scheduler. */
18136 if (reload_completed
18137 && true_regnum (set_dest) == true_regnum (shift_count))
18138 return true;
18139 else if (REGNO (set_dest) == REGNO (shift_count))
18140 return true;
18141 }
18142 }
18143
18144 return false;
18145 }
18146
18147 /* Return true if destination reg of SET_INSN is shift count of
18148 USE_INSN. */
18149
18150 bool
18151 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
18152 {
18153 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
18154 PATTERN (use_insn));
18155 }
18156
18157 /* Return TRUE or FALSE depending on whether the unary operator meets the
18158 appropriate constraints. */
18159
18160 bool
18161 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
18162 enum machine_mode mode ATTRIBUTE_UNUSED,
18163 rtx operands[2])
18164 {
18165 /* If one of operands is memory, source and destination must match. */
18166 if ((MEM_P (operands[0])
18167 || MEM_P (operands[1]))
18168 && ! rtx_equal_p (operands[0], operands[1]))
18169 return false;
18170 return true;
18171 }
18172
18173 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
18174 are ok, keeping in mind the possible movddup alternative. */
18175
18176 bool
18177 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
18178 {
18179 if (MEM_P (operands[0]))
18180 return rtx_equal_p (operands[0], operands[1 + high]);
18181 if (MEM_P (operands[1]) && MEM_P (operands[2]))
18182 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
18183 return true;
18184 }
18185
18186 /* Post-reload splitter for converting an SF or DFmode value in an
18187 SSE register into an unsigned SImode. */
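/* The branchless idea implemented below, sketched in scalar form:
     if (x < 2^31)  result = (int) x;
     else           result = (int) (x - 2^31) ^ 0x80000000;
   where the comparison mask selects between subtracting 0 or 2^31 and
   the final xor restores the high bit for large inputs.  */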
18188
18189 void
18190 ix86_split_convert_uns_si_sse (rtx operands[])
18191 {
18192 enum machine_mode vecmode;
18193 rtx value, large, zero_or_two31, input, two31, x;
18194
18195 large = operands[1];
18196 zero_or_two31 = operands[2];
18197 input = operands[3];
18198 two31 = operands[4];
18199 vecmode = GET_MODE (large);
18200 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
18201
18202 /* Load up the value into the low element. We must ensure that the other
18203 elements are valid floats -- zero is the easiest such value. */
18204 if (MEM_P (input))
18205 {
18206 if (vecmode == V4SFmode)
18207 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
18208 else
18209 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
18210 }
18211 else
18212 {
18213 input = gen_rtx_REG (vecmode, REGNO (input));
18214 emit_move_insn (value, CONST0_RTX (vecmode));
18215 if (vecmode == V4SFmode)
18216 emit_insn (gen_sse_movss (value, value, input));
18217 else
18218 emit_insn (gen_sse2_movsd (value, value, input));
18219 }
18220
18221 emit_move_insn (large, two31);
18222 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
18223
18224 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
18225 emit_insn (gen_rtx_SET (VOIDmode, large, x));
18226
18227 x = gen_rtx_AND (vecmode, zero_or_two31, large);
18228 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
18229
18230 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
18231 emit_insn (gen_rtx_SET (VOIDmode, value, x));
18232
18233 large = gen_rtx_REG (V4SImode, REGNO (large));
18234 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
18235
18236 x = gen_rtx_REG (V4SImode, REGNO (value));
18237 if (vecmode == V4SFmode)
18238 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
18239 else
18240 emit_insn (gen_sse2_cvttpd2dq (x, value));
18241 value = x;
18242
18243 emit_insn (gen_xorv4si3 (value, value, large));
18244 }
18245
18246 /* Convert an unsigned DImode value into a DFmode, using only SSE.
18247 Expects the 64-bit DImode to be supplied in a pair of integral
18248 registers. Requires SSE2; will use SSE3 if available. For x86_32,
18249 -mfpmath=sse, !optimize_size only. */
18250
18251 void
18252 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
18253 {
18254 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
18255 rtx int_xmm, fp_xmm;
18256 rtx biases, exponents;
18257 rtx x;
18258
18259 int_xmm = gen_reg_rtx (V4SImode);
18260 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
18261 emit_insn (gen_movdi_to_sse (int_xmm, input));
18262 else if (TARGET_SSE_SPLIT_REGS)
18263 {
18264 emit_clobber (int_xmm);
18265 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
18266 }
18267 else
18268 {
18269 x = gen_reg_rtx (V2DImode);
18270 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
18271 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
18272 }
18273
18274 x = gen_rtx_CONST_VECTOR (V4SImode,
18275 gen_rtvec (4, GEN_INT (0x43300000UL),
18276 GEN_INT (0x45300000UL),
18277 const0_rtx, const0_rtx));
18278 exponents = validize_mem (force_const_mem (V4SImode, x));
18279
18280 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
18281 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
18282
18283 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
18284 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
18285 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
18286 (0x1.0p84 + double(fp_value_hi_xmm)).
18287 Note these exponents differ by 32. */
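/* Worked example (a sketch): for the input 0x100000005 the two doubles
   built here are 2^52 + 5 (low word) and 2^84 + 1*2^32 (high word);
   subtracting the biases below leaves 5.0 and 4294967296.0, and the final
   add produces 4294967301.0 == (double) 0x100000005.  */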
18288
18289 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
18290
18291 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
18292 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
18293 real_ldexp (&bias_lo_rvt, &dconst1, 52);
18294 real_ldexp (&bias_hi_rvt, &dconst1, 84);
18295 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
18296 x = const_double_from_real_value (bias_hi_rvt, DFmode);
18297 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
18298 biases = validize_mem (force_const_mem (V2DFmode, biases));
18299 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
18300
18301 /* Add the upper and lower DFmode values together. */
18302 if (TARGET_SSE3)
18303 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
18304 else
18305 {
18306 x = copy_to_mode_reg (V2DFmode, fp_xmm);
18307 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
18308 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
18309 }
18310
18311 ix86_expand_vector_extract (false, target, fp_xmm, 0);
18312 }
18313
18314 /* Not used, but eases macroization of patterns. */
18315 void
18316 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
18317 rtx input ATTRIBUTE_UNUSED)
18318 {
18319 gcc_unreachable ();
18320 }
18321
18322 /* Convert an unsigned SImode value into a DFmode. Only currently used
18323 for SSE, but applicable anywhere. */
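/* Scalar sketch of the transformation performed below (an illustration
   only): re-bias the input into the signed range, convert, then add the
   bias back as a double:
     result = (double) (int) (x - 0x80000000u) + 2147483648.0
   relying on the usual x86 two's complement wrap-around.  */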
18324
18325 void
18326 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
18327 {
18328 REAL_VALUE_TYPE TWO31r;
18329 rtx x, fp;
18330
18331 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
18332 NULL, 1, OPTAB_DIRECT);
18333
18334 fp = gen_reg_rtx (DFmode);
18335 emit_insn (gen_floatsidf2 (fp, x));
18336
18337 real_ldexp (&TWO31r, &dconst1, 31);
18338 x = const_double_from_real_value (TWO31r, DFmode);
18339
18340 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
18341 if (x != target)
18342 emit_move_insn (target, x);
18343 }
18344
18345 /* Convert a signed DImode value into a DFmode. Only used for SSE in
18346 32-bit mode; otherwise we have a direct convert instruction. */
18347
18348 void
18349 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
18350 {
18351 REAL_VALUE_TYPE TWO32r;
18352 rtx fp_lo, fp_hi, x;
18353
18354 fp_lo = gen_reg_rtx (DFmode);
18355 fp_hi = gen_reg_rtx (DFmode);
18356
18357 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
18358
18359 real_ldexp (&TWO32r, &dconst1, 32);
18360 x = const_double_from_real_value (TWO32r, DFmode);
18361 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
18362
18363 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
18364
18365 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
18366 0, OPTAB_DIRECT);
18367 if (x != target)
18368 emit_move_insn (target, x);
18369 }
18370
18371 /* Convert an unsigned SImode value into a SFmode, using only SSE.
18372 For x86_32, -mfpmath=sse, !optimize_size only. */
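/* Scalar sketch (illustration only): convert the two 16-bit halves
   separately and recombine:
     result = (float) (x >> 16) * 65536.0f + (float) (x & 0xffff)
   which only requires signed SImode to SFmode conversions.  */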
18373 void
18374 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
18375 {
18376 REAL_VALUE_TYPE ONE16r;
18377 rtx fp_hi, fp_lo, int_hi, int_lo, x;
18378
18379 real_ldexp (&ONE16r, &dconst1, 16);
18380 x = const_double_from_real_value (ONE16r, SFmode);
18381 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
18382 NULL, 0, OPTAB_DIRECT);
18383 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
18384 NULL, 0, OPTAB_DIRECT);
18385 fp_hi = gen_reg_rtx (SFmode);
18386 fp_lo = gen_reg_rtx (SFmode);
18387 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
18388 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
18389 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
18390 0, OPTAB_DIRECT);
18391 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
18392 0, OPTAB_DIRECT);
18393 if (!rtx_equal_p (target, fp_hi))
18394 emit_move_insn (target, fp_hi);
18395 }
18396
18397 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
18398 a vector of unsigned ints VAL to vector of floats TARGET. */
18399
18400 void
18401 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
18402 {
18403 rtx tmp[8];
18404 REAL_VALUE_TYPE TWO16r;
18405 enum machine_mode intmode = GET_MODE (val);
18406 enum machine_mode fltmode = GET_MODE (target);
18407 rtx (*cvt) (rtx, rtx);
18408
18409 if (intmode == V4SImode)
18410 cvt = gen_floatv4siv4sf2;
18411 else
18412 cvt = gen_floatv8siv8sf2;
18413 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
18414 tmp[0] = force_reg (intmode, tmp[0]);
18415 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
18416 OPTAB_DIRECT);
18417 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
18418 NULL_RTX, 1, OPTAB_DIRECT);
18419 tmp[3] = gen_reg_rtx (fltmode);
18420 emit_insn (cvt (tmp[3], tmp[1]));
18421 tmp[4] = gen_reg_rtx (fltmode);
18422 emit_insn (cvt (tmp[4], tmp[2]));
18423 real_ldexp (&TWO16r, &dconst1, 16);
18424 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
18425 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
18426 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
18427 OPTAB_DIRECT);
18428 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
18429 OPTAB_DIRECT);
18430 if (tmp[7] != target)
18431 emit_move_insn (target, tmp[7]);
18432 }
18433
18434 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
18435 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
18436 This is done by doing just signed conversion if < 0x1p31, and otherwise by
18437 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
18438
18439 rtx
18440 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
18441 {
18442 REAL_VALUE_TYPE TWO31r;
18443 rtx two31r, tmp[4];
18444 enum machine_mode mode = GET_MODE (val);
18445 enum machine_mode scalarmode = GET_MODE_INNER (mode);
18446 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
18447 rtx (*cmp) (rtx, rtx, rtx, rtx);
18448 int i;
18449
18450 for (i = 0; i < 3; i++)
18451 tmp[i] = gen_reg_rtx (mode);
18452 real_ldexp (&TWO31r, &dconst1, 31);
18453 two31r = const_double_from_real_value (TWO31r, scalarmode);
18454 two31r = ix86_build_const_vector (mode, 1, two31r);
18455 two31r = force_reg (mode, two31r);
18456 switch (mode)
18457 {
18458 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
18459 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
18460 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
18461 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
18462 default: gcc_unreachable ();
18463 }
18464 tmp[3] = gen_rtx_LE (mode, two31r, val);
18465 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
18466 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
18467 0, OPTAB_DIRECT);
18468 if (intmode == V4SImode || TARGET_AVX2)
18469 *xorp = expand_simple_binop (intmode, ASHIFT,
18470 gen_lowpart (intmode, tmp[0]),
18471 GEN_INT (31), NULL_RTX, 0,
18472 OPTAB_DIRECT);
18473 else
18474 {
18475 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
18476 two31 = ix86_build_const_vector (intmode, 1, two31);
18477 *xorp = expand_simple_binop (intmode, AND,
18478 gen_lowpart (intmode, tmp[0]),
18479 two31, NULL_RTX, 0,
18480 OPTAB_DIRECT);
18481 }
18482 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
18483 0, OPTAB_DIRECT);
18484 }
18485
18486 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
18487 then replicate the value for all elements of the vector
18488 register. */
18489
18490 rtx
18491 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
18492 {
18493 int i, n_elt;
18494 rtvec v;
18495 enum machine_mode scalar_mode;
18496
18497 switch (mode)
18498 {
18499 case V32QImode:
18500 case V16QImode:
18501 case V16HImode:
18502 case V8HImode:
18503 case V8SImode:
18504 case V4SImode:
18505 case V4DImode:
18506 case V2DImode:
18507 gcc_assert (vect);
18508 case V8SFmode:
18509 case V4SFmode:
18510 case V4DFmode:
18511 case V2DFmode:
18512 n_elt = GET_MODE_NUNITS (mode);
18513 v = rtvec_alloc (n_elt);
18514 scalar_mode = GET_MODE_INNER (mode);
18515
18516 RTVEC_ELT (v, 0) = value;
18517
18518 for (i = 1; i < n_elt; ++i)
18519 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18520
18521 return gen_rtx_CONST_VECTOR (mode, v);
18522
18523 default:
18524 gcc_unreachable ();
18525 }
18526 }
18527
18528 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18529 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18530 for an SSE register. If VECT is true, then replicate the mask for
18531 all elements of the vector register. If INVERT is true, then create
18532 a mask excluding the sign bit. */
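/* For example (a sketch): for V4SFmode with VECT true and INVERT false
   this returns a register holding { 0x80000000, 0x80000000, 0x80000000,
   0x80000000 }; with INVERT true each element is 0x7fffffff instead.  */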
18533
18534 rtx
18535 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18536 {
18537 enum machine_mode vec_mode, imode;
18538 HOST_WIDE_INT hi, lo;
18539 int shift = 63;
18540 rtx v;
18541 rtx mask;
18542
18543 /* Find the sign bit, sign extended to 2*HWI. */
18544 switch (mode)
18545 {
18546 case V8SImode:
18547 case V4SImode:
18548 case V8SFmode:
18549 case V4SFmode:
18550 vec_mode = mode;
18551 mode = GET_MODE_INNER (mode);
18552 imode = SImode;
18553 lo = 0x80000000, hi = lo < 0;
18554 break;
18555
18556 case V4DImode:
18557 case V2DImode:
18558 case V4DFmode:
18559 case V2DFmode:
18560 vec_mode = mode;
18561 mode = GET_MODE_INNER (mode);
18562 imode = DImode;
18563 if (HOST_BITS_PER_WIDE_INT >= 64)
18564 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18565 else
18566 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18567 break;
18568
18569 case TImode:
18570 case TFmode:
18571 vec_mode = VOIDmode;
18572 if (HOST_BITS_PER_WIDE_INT >= 64)
18573 {
18574 imode = TImode;
18575 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18576 }
18577 else
18578 {
18579 rtvec vec;
18580
18581 imode = DImode;
18582 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18583
18584 if (invert)
18585 {
18586 lo = ~lo, hi = ~hi;
18587 v = constm1_rtx;
18588 }
18589 else
18590 v = const0_rtx;
18591
18592 mask = immed_double_const (lo, hi, imode);
18593
18594 vec = gen_rtvec (2, v, mask);
18595 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18596 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18597
18598 return v;
18599 }
18600 break;
18601
18602 default:
18603 gcc_unreachable ();
18604 }
18605
18606 if (invert)
18607 lo = ~lo, hi = ~hi;
18608
18609 /* Force this value into the low part of a fp vector constant. */
18610 mask = immed_double_const (lo, hi, imode);
18611 mask = gen_lowpart (mode, mask);
18612
18613 if (vec_mode == VOIDmode)
18614 return force_reg (mode, mask);
18615
18616 v = ix86_build_const_vector (vec_mode, vect, mask);
18617 return force_reg (vec_mode, v);
18618 }
18619
18620 /* Generate code for floating point ABS or NEG. */
18621
18622 void
18623 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18624 rtx operands[])
18625 {
18626 rtx mask, set, dst, src;
18627 bool use_sse = false;
18628 bool vector_mode = VECTOR_MODE_P (mode);
18629 enum machine_mode vmode = mode;
18630
18631 if (vector_mode)
18632 use_sse = true;
18633 else if (mode == TFmode)
18634 use_sse = true;
18635 else if (TARGET_SSE_MATH)
18636 {
18637 use_sse = SSE_FLOAT_MODE_P (mode);
18638 if (mode == SFmode)
18639 vmode = V4SFmode;
18640 else if (mode == DFmode)
18641 vmode = V2DFmode;
18642 }
18643
18644 /* NEG and ABS performed with SSE use bitwise mask operations.
18645 Create the appropriate mask now. */
18646 if (use_sse)
18647 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18648 else
18649 mask = NULL_RTX;
18650
18651 dst = operands[0];
18652 src = operands[1];
18653
18654 set = gen_rtx_fmt_e (code, mode, src);
18655 set = gen_rtx_SET (VOIDmode, dst, set);
18656
18657 if (mask)
18658 {
18659 rtx use, clob;
18660 rtvec par;
18661
18662 use = gen_rtx_USE (VOIDmode, mask);
18663 if (vector_mode)
18664 par = gen_rtvec (2, set, use);
18665 else
18666 {
18667 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18668 par = gen_rtvec (3, set, use, clob);
18669 }
18670 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18671 }
18672 else
18673 emit_insn (set);
18674 }
18675
18676 /* Expand a copysign operation. Special case operand 0 being a constant. */
18677
18678 void
18679 ix86_expand_copysign (rtx operands[])
18680 {
18681 enum machine_mode mode, vmode;
18682 rtx dest, op0, op1, mask, nmask;
18683
18684 dest = operands[0];
18685 op0 = operands[1];
18686 op1 = operands[2];
18687
18688 mode = GET_MODE (dest);
18689
18690 if (mode == SFmode)
18691 vmode = V4SFmode;
18692 else if (mode == DFmode)
18693 vmode = V2DFmode;
18694 else
18695 vmode = mode;
18696
18697 if (GET_CODE (op0) == CONST_DOUBLE)
18698 {
18699 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18700
18701 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18702 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18703
18704 if (mode == SFmode || mode == DFmode)
18705 {
18706 if (op0 == CONST0_RTX (mode))
18707 op0 = CONST0_RTX (vmode);
18708 else
18709 {
18710 rtx v = ix86_build_const_vector (vmode, false, op0);
18711
18712 op0 = force_reg (vmode, v);
18713 }
18714 }
18715 else if (op0 != CONST0_RTX (mode))
18716 op0 = force_reg (mode, op0);
18717
18718 mask = ix86_build_signbit_mask (vmode, 0, 0);
18719
18720 if (mode == SFmode)
18721 copysign_insn = gen_copysignsf3_const;
18722 else if (mode == DFmode)
18723 copysign_insn = gen_copysigndf3_const;
18724 else
18725 copysign_insn = gen_copysigntf3_const;
18726
18727 emit_insn (copysign_insn (dest, op0, op1, mask));
18728 }
18729 else
18730 {
18731 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18732
18733 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18734 mask = ix86_build_signbit_mask (vmode, 0, 0);
18735
18736 if (mode == SFmode)
18737 copysign_insn = gen_copysignsf3_var;
18738 else if (mode == DFmode)
18739 copysign_insn = gen_copysigndf3_var;
18740 else
18741 copysign_insn = gen_copysigntf3_var;
18742
18743 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18744 }
18745 }
18746
18747 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18748 be a constant, and so has already been expanded into a vector constant. */
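/* In effect the splitter computes (a sketch of the idea)
     dest = (dest & sign_mask) | abs_constant
   where operands[3] is the sign-bit mask and operands[1] already holds
   the absolute value of the constant as a vector.  */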
18749
18750 void
18751 ix86_split_copysign_const (rtx operands[])
18752 {
18753 enum machine_mode mode, vmode;
18754 rtx dest, op0, mask, x;
18755
18756 dest = operands[0];
18757 op0 = operands[1];
18758 mask = operands[3];
18759
18760 mode = GET_MODE (dest);
18761 vmode = GET_MODE (mask);
18762
18763 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18764 x = gen_rtx_AND (vmode, dest, mask);
18765 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18766
18767 if (op0 != CONST0_RTX (vmode))
18768 {
18769 x = gen_rtx_IOR (vmode, dest, op0);
18770 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18771 }
18772 }
18773
18774 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18775 so we have to do two masks. */
18776
18777 void
18778 ix86_split_copysign_var (rtx operands[])
18779 {
18780 enum machine_mode mode, vmode;
18781 rtx dest, scratch, op0, op1, mask, nmask, x;
18782
18783 dest = operands[0];
18784 scratch = operands[1];
18785 op0 = operands[2];
18786 op1 = operands[3];
18787 nmask = operands[4];
18788 mask = operands[5];
18789
18790 mode = GET_MODE (dest);
18791 vmode = GET_MODE (mask);
18792
18793 if (rtx_equal_p (op0, op1))
18794 {
18795 /* Shouldn't happen often (it's useless, obviously), but when it does
18796 we'd generate incorrect code if we continue below. */
18797 emit_move_insn (dest, op0);
18798 return;
18799 }
18800
18801 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18802 {
18803 gcc_assert (REGNO (op1) == REGNO (scratch));
18804
18805 x = gen_rtx_AND (vmode, scratch, mask);
18806 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18807
18808 dest = mask;
18809 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18810 x = gen_rtx_NOT (vmode, dest);
18811 x = gen_rtx_AND (vmode, x, op0);
18812 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18813 }
18814 else
18815 {
18816 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18817 {
18818 x = gen_rtx_AND (vmode, scratch, mask);
18819 }
18820 else /* alternative 2,4 */
18821 {
18822 gcc_assert (REGNO (mask) == REGNO (scratch));
18823 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18824 x = gen_rtx_AND (vmode, scratch, op1);
18825 }
18826 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18827
18828 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18829 {
18830 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18831 x = gen_rtx_AND (vmode, dest, nmask);
18832 }
18833 else /* alternative 3,4 */
18834 {
18835 gcc_assert (REGNO (nmask) == REGNO (dest));
18836 dest = nmask;
18837 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18838 x = gen_rtx_AND (vmode, dest, op0);
18839 }
18840 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18841 }
18842
18843 x = gen_rtx_IOR (vmode, dest, scratch);
18844 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18845 }
18846
18847 /* Return TRUE or FALSE depending on whether the first SET in INSN
18848 has source and destination with matching CC modes, and that the
18849 CC mode is at least as constrained as REQ_MODE. */
18850
18851 bool
18852 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18853 {
18854 rtx set;
18855 enum machine_mode set_mode;
18856
18857 set = PATTERN (insn);
18858 if (GET_CODE (set) == PARALLEL)
18859 set = XVECEXP (set, 0, 0);
18860 gcc_assert (GET_CODE (set) == SET);
18861 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18862
18863 set_mode = GET_MODE (SET_DEST (set));
18864 switch (set_mode)
18865 {
18866 case CCNOmode:
18867 if (req_mode != CCNOmode
18868 && (req_mode != CCmode
18869 || XEXP (SET_SRC (set), 1) != const0_rtx))
18870 return false;
18871 break;
18872 case CCmode:
18873 if (req_mode == CCGCmode)
18874 return false;
18875 /* FALLTHRU */
18876 case CCGCmode:
18877 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18878 return false;
18879 /* FALLTHRU */
18880 case CCGOCmode:
18881 if (req_mode == CCZmode)
18882 return false;
18883 /* FALLTHRU */
18884 case CCZmode:
18885 break;
18886
18887 case CCAmode:
18888 case CCCmode:
18889 case CCOmode:
18890 case CCSmode:
18891 if (set_mode != req_mode)
18892 return false;
18893 break;
18894
18895 default:
18896 gcc_unreachable ();
18897 }
18898
18899 return GET_MODE (SET_SRC (set)) == set_mode;
18900 }
18901
18902 /* Generate insn patterns to do an integer compare of OPERANDS. */
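/* For example (a sketch), expanding (EQ a b) emits
     (set (reg:CCZ FLAGS_REG) (compare:CCZ a b))
   and returns (eq (reg:CCZ FLAGS_REG) (const_int 0)) for the flags user;
   the CC mode actually chosen comes from SELECT_CC_MODE.  */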
18903
18904 static rtx
18905 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18906 {
18907 enum machine_mode cmpmode;
18908 rtx tmp, flags;
18909
18910 cmpmode = SELECT_CC_MODE (code, op0, op1);
18911 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18912
18913 /* This is very simple, but making the interface the same as in the
18914 FP case makes the rest of the code easier. */
18915 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18916 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18917
18918 /* Return the test that should be put into the flags user, i.e.
18919 the bcc, scc, or cmov instruction. */
18920 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18921 }
18922
18923 /* Figure out whether to use ordered or unordered fp comparisons.
18924 Return the appropriate mode to use. */
18925
18926 enum machine_mode
18927 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18928 {
18929 /* ??? In order to make all comparisons reversible, we do all comparisons
18930 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18931 all forms of trapping and nontrapping comparisons, we can make inequality
18932 comparisons trapping again, since it results in better code when using
18933 FCOM based compares. */
18934 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18935 }
18936
18937 enum machine_mode
18938 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18939 {
18940 enum machine_mode mode = GET_MODE (op0);
18941
18942 if (SCALAR_FLOAT_MODE_P (mode))
18943 {
18944 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18945 return ix86_fp_compare_mode (code);
18946 }
18947
18948 switch (code)
18949 {
18950 /* Only zero flag is needed. */
18951 case EQ: /* ZF=0 */
18952 case NE: /* ZF!=0 */
18953 return CCZmode;
18954 /* Codes needing carry flag. */
18955 case GEU: /* CF=0 */
18956 case LTU: /* CF=1 */
18957 /* Detect overflow checks. They need just the carry flag. */
18958 if (GET_CODE (op0) == PLUS
18959 && rtx_equal_p (op1, XEXP (op0, 0)))
18960 return CCCmode;
18961 else
18962 return CCmode;
18963 case GTU: /* CF=0 & ZF=0 */
18964 case LEU: /* CF=1 | ZF=1 */
18965 return CCmode;
18966 /* Codes possibly doable only with sign flag when
18967 comparing against zero. */
18968 case GE: /* SF=OF or SF=0 */
18969 case LT: /* SF<>OF or SF=1 */
18970 if (op1 == const0_rtx)
18971 return CCGOCmode;
18972 else
18973 /* For other cases Carry flag is not required. */
18974 return CCGCmode;
18975 /* Codes doable only with the sign flag when comparing
18976 against zero, but for which we lack a jump instruction,
18977 so we need to use relational tests against overflow,
18978 which thus needs to be zero. */
18979 case GT: /* ZF=0 & SF=OF */
18980 case LE: /* ZF=1 | SF<>OF */
18981 if (op1 == const0_rtx)
18982 return CCNOmode;
18983 else
18984 return CCGCmode;
18985 /* The strcmp pattern does (use flags) and combine may ask us for the
18986 proper mode. */
18987 case USE:
18988 return CCmode;
18989 default:
18990 gcc_unreachable ();
18991 }
18992 }
18993
18994 /* Return the fixed registers used for condition codes. */
18995
18996 static bool
18997 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18998 {
18999 *p1 = FLAGS_REG;
19000 *p2 = FPSR_REG;
19001 return true;
19002 }
19003
19004 /* If two condition code modes are compatible, return a condition code
19005 mode which is compatible with both. Otherwise, return
19006 VOIDmode. */
19007
19008 static enum machine_mode
19009 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
19010 {
19011 if (m1 == m2)
19012 return m1;
19013
19014 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
19015 return VOIDmode;
19016
19017 if ((m1 == CCGCmode && m2 == CCGOCmode)
19018 || (m1 == CCGOCmode && m2 == CCGCmode))
19019 return CCGCmode;
19020
19021 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
19022 return m2;
19023 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
19024 return m1;
19025
19026 switch (m1)
19027 {
19028 default:
19029 gcc_unreachable ();
19030
19031 case CCmode:
19032 case CCGCmode:
19033 case CCGOCmode:
19034 case CCNOmode:
19035 case CCAmode:
19036 case CCCmode:
19037 case CCOmode:
19038 case CCSmode:
19039 case CCZmode:
19040 switch (m2)
19041 {
19042 default:
19043 return VOIDmode;
19044
19045 case CCmode:
19046 case CCGCmode:
19047 case CCGOCmode:
19048 case CCNOmode:
19049 case CCAmode:
19050 case CCCmode:
19051 case CCOmode:
19052 case CCSmode:
19053 case CCZmode:
19054 return CCmode;
19055 }
19056
19057 case CCFPmode:
19058 case CCFPUmode:
19059 /* These are only compatible with themselves, which we already
19060 checked above. */
19061 return VOIDmode;
19062 }
19063 }
19064
19065
19066 /* Return a comparison we can do and that it is equivalent to
19067 swap_condition (code) apart possibly from orderedness.
19068 But, never change orderedness if TARGET_IEEE_FP, returning
19069 UNKNOWN in that case if necessary. */
19070
19071 static enum rtx_code
19072 ix86_fp_swap_condition (enum rtx_code code)
19073 {
19074 switch (code)
19075 {
19076 case GT: /* GTU - CF=0 & ZF=0 */
19077 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
19078 case GE: /* GEU - CF=0 */
19079 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
19080 case UNLT: /* LTU - CF=1 */
19081 return TARGET_IEEE_FP ? UNKNOWN : GT;
19082 case UNLE: /* LEU - CF=1 | ZF=1 */
19083 return TARGET_IEEE_FP ? UNKNOWN : GE;
19084 default:
19085 return swap_condition (code);
19086 }
19087 }
19088
19089 /* Return the cost of comparison CODE using the best strategy for performance.
19090 All the following functions use the number of instructions as a cost metric.
19091 In the future this should be tweaked to compute bytes for optimize_size and
19092 take into account the performance of various instructions on various CPUs. */
19093
19094 static int
19095 ix86_fp_comparison_cost (enum rtx_code code)
19096 {
19097 int arith_cost;
19098
19099 /* The cost of code using bit-twiddling on %ah. */
19100 switch (code)
19101 {
19102 case UNLE:
19103 case UNLT:
19104 case LTGT:
19105 case GT:
19106 case GE:
19107 case UNORDERED:
19108 case ORDERED:
19109 case UNEQ:
19110 arith_cost = 4;
19111 break;
19112 case LT:
19113 case NE:
19114 case EQ:
19115 case UNGE:
19116 arith_cost = TARGET_IEEE_FP ? 5 : 4;
19117 break;
19118 case LE:
19119 case UNGT:
19120 arith_cost = TARGET_IEEE_FP ? 6 : 4;
19121 break;
19122 default:
19123 gcc_unreachable ();
19124 }
19125
19126 switch (ix86_fp_comparison_strategy (code))
19127 {
19128 case IX86_FPCMP_COMI:
19129 return arith_cost > 4 ? 3 : 2;
19130 case IX86_FPCMP_SAHF:
19131 return arith_cost > 4 ? 4 : 3;
19132 default:
19133 return arith_cost;
19134 }
19135 }
19136
19137 /* Return strategy to use for floating-point. We assume that fcomi is always
19138 preferable where available, since that is also true when looking at size
19139 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
19140
19141 enum ix86_fpcmp_strategy
19142 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
19143 {
19144 /* Do fcomi/sahf based test when profitable. */
19145
19146 if (TARGET_CMOVE)
19147 return IX86_FPCMP_COMI;
19148
19149 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19150 return IX86_FPCMP_SAHF;
19151
19152 return IX86_FPCMP_ARITH;
19153 }
19154
19155 /* Swap, force into registers, or otherwise massage the two operands
19156 to a fp comparison. The operands are updated in place; the new
19157 comparison code is returned. */
19158
19159 static enum rtx_code
19160 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
19161 {
19162 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
19163 rtx op0 = *pop0, op1 = *pop1;
19164 enum machine_mode op_mode = GET_MODE (op0);
19165 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
19166
19167 /* All of the unordered compare instructions only work on registers.
19168 The same is true of the fcomi compare instructions. The XFmode
19169 compare instructions require registers except when comparing
19170 against zero or when converting operand 1 from fixed point to
19171 floating point. */
19172
19173 if (!is_sse
19174 && (fpcmp_mode == CCFPUmode
19175 || (op_mode == XFmode
19176 && ! (standard_80387_constant_p (op0) == 1
19177 || standard_80387_constant_p (op1) == 1)
19178 && GET_CODE (op1) != FLOAT)
19179 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
19180 {
19181 op0 = force_reg (op_mode, op0);
19182 op1 = force_reg (op_mode, op1);
19183 }
19184 else
19185 {
19186 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
19187 things around if they appear profitable, otherwise force op0
19188 into a register. */
19189
19190 if (standard_80387_constant_p (op0) == 0
19191 || (MEM_P (op0)
19192 && ! (standard_80387_constant_p (op1) == 0
19193 || MEM_P (op1))))
19194 {
19195 enum rtx_code new_code = ix86_fp_swap_condition (code);
19196 if (new_code != UNKNOWN)
19197 {
19198 rtx tmp;
19199 tmp = op0, op0 = op1, op1 = tmp;
19200 code = new_code;
19201 }
19202 }
19203
19204 if (!REG_P (op0))
19205 op0 = force_reg (op_mode, op0);
19206
19207 if (CONSTANT_P (op1))
19208 {
19209 int tmp = standard_80387_constant_p (op1);
19210 if (tmp == 0)
19211 op1 = validize_mem (force_const_mem (op_mode, op1));
19212 else if (tmp == 1)
19213 {
19214 if (TARGET_CMOVE)
19215 op1 = force_reg (op_mode, op1);
19216 }
19217 else
19218 op1 = force_reg (op_mode, op1);
19219 }
19220 }
19221
19222 /* Try to rearrange the comparison to make it cheaper. */
19223 if (ix86_fp_comparison_cost (code)
19224 > ix86_fp_comparison_cost (swap_condition (code))
19225 && (REG_P (op1) || can_create_pseudo_p ()))
19226 {
19227 rtx tmp;
19228 tmp = op0, op0 = op1, op1 = tmp;
19229 code = swap_condition (code);
19230 if (!REG_P (op0))
19231 op0 = force_reg (op_mode, op0);
19232 }
19233
19234 *pop0 = op0;
19235 *pop1 = op1;
19236 return code;
19237 }
19238
19239 /* Convert comparison codes we use to represent FP comparison to integer
19240 code that will result in proper branch. Return UNKNOWN if no such code
19241 is available. */
19242
19243 enum rtx_code
19244 ix86_fp_compare_code_to_integer (enum rtx_code code)
19245 {
19246 switch (code)
19247 {
19248 case GT:
19249 return GTU;
19250 case GE:
19251 return GEU;
19252 case ORDERED:
19253 case UNORDERED:
19254 return code;
19255 break;
19256 case UNEQ:
19257 return EQ;
19258 break;
19259 case UNLT:
19260 return LTU;
19261 break;
19262 case UNLE:
19263 return LEU;
19264 break;
19265 case LTGT:
19266 return NE;
19267 break;
19268 default:
19269 return UNKNOWN;
19270 }
19271 }
19272
19273 /* Generate insn patterns to do a floating point compare of OPERANDS. */
19274
19275 static rtx
19276 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
19277 {
19278 enum machine_mode fpcmp_mode, intcmp_mode;
19279 rtx tmp, tmp2;
19280
19281 fpcmp_mode = ix86_fp_compare_mode (code);
19282 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
19283
19284 /* Do fcomi/sahf based test when profitable. */
19285 switch (ix86_fp_comparison_strategy (code))
19286 {
19287 case IX86_FPCMP_COMI:
19288 intcmp_mode = fpcmp_mode;
19289 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19290 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19291 tmp);
19292 emit_insn (tmp);
19293 break;
19294
19295 case IX86_FPCMP_SAHF:
19296 intcmp_mode = fpcmp_mode;
19297 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19298 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
19299 tmp);
19300
19301 if (!scratch)
19302 scratch = gen_reg_rtx (HImode);
19303 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
19304 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
19305 break;
19306
19307 case IX86_FPCMP_ARITH:
19308 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
19309 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
19310 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
19311 if (!scratch)
19312 scratch = gen_reg_rtx (HImode);
19313 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
19314
19315 /* In the unordered case, we have to check C2 for NaN's, which
19316 doesn't happen to work out to anything nice combination-wise.
19317 So do some bit twiddling on the value we've got in AH to come
19318 up with an appropriate set of condition codes. */
19319
19320 intcmp_mode = CCNOmode;
19321 switch (code)
19322 {
19323 case GT:
19324 case UNGT:
19325 if (code == GT || !TARGET_IEEE_FP)
19326 {
19327 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19328 code = EQ;
19329 }
19330 else
19331 {
19332 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19333 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19334 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
19335 intcmp_mode = CCmode;
19336 code = GEU;
19337 }
19338 break;
19339 case LT:
19340 case UNLT:
19341 if (code == LT && TARGET_IEEE_FP)
19342 {
19343 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19344 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
19345 intcmp_mode = CCmode;
19346 code = EQ;
19347 }
19348 else
19349 {
19350 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
19351 code = NE;
19352 }
19353 break;
19354 case GE:
19355 case UNGE:
19356 if (code == GE || !TARGET_IEEE_FP)
19357 {
19358 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
19359 code = EQ;
19360 }
19361 else
19362 {
19363 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19364 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
19365 code = NE;
19366 }
19367 break;
19368 case LE:
19369 case UNLE:
19370 if (code == LE && TARGET_IEEE_FP)
19371 {
19372 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19373 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
19374 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19375 intcmp_mode = CCmode;
19376 code = LTU;
19377 }
19378 else
19379 {
19380 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
19381 code = NE;
19382 }
19383 break;
19384 case EQ:
19385 case UNEQ:
19386 if (code == EQ && TARGET_IEEE_FP)
19387 {
19388 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19389 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
19390 intcmp_mode = CCmode;
19391 code = EQ;
19392 }
19393 else
19394 {
19395 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19396 code = NE;
19397 }
19398 break;
19399 case NE:
19400 case LTGT:
19401 if (code == NE && TARGET_IEEE_FP)
19402 {
19403 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
19404 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
19405 GEN_INT (0x40)));
19406 code = NE;
19407 }
19408 else
19409 {
19410 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
19411 code = EQ;
19412 }
19413 break;
19414
19415 case UNORDERED:
19416 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19417 code = NE;
19418 break;
19419 case ORDERED:
19420 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
19421 code = EQ;
19422 break;
19423
19424 default:
19425 gcc_unreachable ();
19426 }
19427 break;
19428
19429 default:
19430 gcc_unreachable ();
19431 }
19432
19433 /* Return the test that should be put into the flags user, i.e.
19434 the bcc, scc, or cmov instruction. */
19435 return gen_rtx_fmt_ee (code, VOIDmode,
19436 gen_rtx_REG (intcmp_mode, FLAGS_REG),
19437 const0_rtx);
19438 }
19439
19440 static rtx
19441 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
19442 {
19443 rtx ret;
19444
19445 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
19446 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
19447
19448 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
19449 {
19450 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
19451 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19452 }
19453 else
19454 ret = ix86_expand_int_compare (code, op0, op1);
19455
19456 return ret;
19457 }
19458
19459 void
19460 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
19461 {
19462 enum machine_mode mode = GET_MODE (op0);
19463 rtx tmp;
19464
19465 switch (mode)
19466 {
19467 case SFmode:
19468 case DFmode:
19469 case XFmode:
19470 case QImode:
19471 case HImode:
19472 case SImode:
19473 simple:
19474 tmp = ix86_expand_compare (code, op0, op1);
19475 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
19476 gen_rtx_LABEL_REF (VOIDmode, label),
19477 pc_rtx);
19478 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
19479 return;
19480
19481 case DImode:
19482 if (TARGET_64BIT)
19483 goto simple;
19484 case TImode:
19485 /* Expand DImode branch into multiple compare+branch. */
19486 {
19487 rtx lo[2], hi[2], label2;
19488 enum rtx_code code1, code2, code3;
19489 enum machine_mode submode;
19490
19491 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
19492 {
19493 tmp = op0, op0 = op1, op1 = tmp;
19494 code = swap_condition (code);
19495 }
19496
19497 split_double_mode (mode, &op0, 1, lo+0, hi+0);
19498 split_double_mode (mode, &op1, 1, lo+1, hi+1);
19499
19500 submode = mode == DImode ? SImode : DImode;
19501
19502 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
19503 avoid two branches. This costs one extra insn, so disable when
19504 optimizing for size. */
19505
19506 if ((code == EQ || code == NE)
19507 && (!optimize_insn_for_size_p ()
19508 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19509 {
19510 rtx xor0, xor1;
19511
19512 xor1 = hi[0];
19513 if (hi[1] != const0_rtx)
19514 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19515 NULL_RTX, 0, OPTAB_WIDEN);
19516
19517 xor0 = lo[0];
19518 if (lo[1] != const0_rtx)
19519 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19520 NULL_RTX, 0, OPTAB_WIDEN);
19521
19522 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19523 NULL_RTX, 0, OPTAB_WIDEN);
19524
19525 ix86_expand_branch (code, tmp, const0_rtx, label);
19526 return;
19527 }
19528
19529 /* Otherwise, if we are doing less-than or greater-or-equal-than,
19530 and op1 is a constant whose low word is zero, then we can just
19531 examine the high word. Similarly for a low word of -1 and
19532 less-or-equal-than or greater-than. */
19533
19534 if (CONST_INT_P (hi[1]))
19535 switch (code)
19536 {
19537 case LT: case LTU: case GE: case GEU:
19538 if (lo[1] == const0_rtx)
19539 {
19540 ix86_expand_branch (code, hi[0], hi[1], label);
19541 return;
19542 }
19543 break;
19544 case LE: case LEU: case GT: case GTU:
19545 if (lo[1] == constm1_rtx)
19546 {
19547 ix86_expand_branch (code, hi[0], hi[1], label);
19548 return;
19549 }
19550 break;
19551 default:
19552 break;
19553 }
19554
19555 /* Otherwise, we need two or three jumps. */
19556
19557 label2 = gen_label_rtx ();
19558
19559 code1 = code;
19560 code2 = swap_condition (code);
19561 code3 = unsigned_condition (code);
19562
19563 switch (code)
19564 {
19565 case LT: case GT: case LTU: case GTU:
19566 break;
19567
19568 case LE: code1 = LT; code2 = GT; break;
19569 case GE: code1 = GT; code2 = LT; break;
19570 case LEU: code1 = LTU; code2 = GTU; break;
19571 case GEU: code1 = GTU; code2 = LTU; break;
19572
19573 case EQ: code1 = UNKNOWN; code2 = NE; break;
19574 case NE: code2 = UNKNOWN; break;
19575
19576 default:
19577 gcc_unreachable ();
19578 }
19579
19580 /*
19581 * a < b =>
19582 * if (hi(a) < hi(b)) goto true;
19583 * if (hi(a) > hi(b)) goto false;
19584 * if (lo(a) < lo(b)) goto true;
19585 * false:
19586 */
19587
19588 if (code1 != UNKNOWN)
19589 ix86_expand_branch (code1, hi[0], hi[1], label);
19590 if (code2 != UNKNOWN)
19591 ix86_expand_branch (code2, hi[0], hi[1], label2);
19592
19593 ix86_expand_branch (code3, lo[0], lo[1], label);
19594
19595 if (code2 != UNKNOWN)
19596 emit_label (label2);
19597 return;
19598 }
19599
19600 default:
19601 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19602 goto simple;
19603 }
19604 }
19605
19606 /* Split branch based on floating point condition. */
19607 void
19608 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19609 rtx target1, rtx target2, rtx tmp, rtx pushed)
19610 {
19611 rtx condition;
19612 rtx i;
19613
19614 if (target2 != pc_rtx)
19615 {
19616 rtx tmp = target2;
19617 code = reverse_condition_maybe_unordered (code);
19618 target2 = target1;
19619 target1 = tmp;
19620 }
19621
19622 condition = ix86_expand_fp_compare (code, op1, op2,
19623 tmp);
19624
19625 /* Remove pushed operand from stack. */
19626 if (pushed)
19627 ix86_free_from_memory (GET_MODE (pushed));
19628
19629 i = emit_jump_insn (gen_rtx_SET
19630 (VOIDmode, pc_rtx,
19631 gen_rtx_IF_THEN_ELSE (VOIDmode,
19632 condition, target1, target2)));
19633 if (split_branch_probability >= 0)
19634 add_int_reg_note (i, REG_BR_PROB, split_branch_probability);
19635 }
19636
19637 void
19638 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19639 {
19640 rtx ret;
19641
19642 gcc_assert (GET_MODE (dest) == QImode);
19643
19644 ret = ix86_expand_compare (code, op0, op1);
19645 PUT_MODE (ret, QImode);
19646 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19647 }
19648
19649 /* Expand comparison setting or clearing carry flag. Return true when
19650 successful and set pop for the operation. */
19651 static bool
19652 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19653 {
19654 enum machine_mode mode =
19655 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19656
19657 /* Do not handle double-mode compares that go through the special path. */
19658 if (mode == (TARGET_64BIT ? TImode : DImode))
19659 return false;
19660
19661 if (SCALAR_FLOAT_MODE_P (mode))
19662 {
19663 rtx compare_op, compare_seq;
19664
19665 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19666
19667 /* Shortcut: the following common codes never translate
19668 into carry flag compares. */
19669 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19670 || code == ORDERED || code == UNORDERED)
19671 return false;
19672
19673 /* These comparisons require the zero flag; swap operands so they won't. */
19674 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19675 && !TARGET_IEEE_FP)
19676 {
19677 rtx tmp = op0;
19678 op0 = op1;
19679 op1 = tmp;
19680 code = swap_condition (code);
19681 }
19682
19683 /* Try to expand the comparison and verify that we end up with
19684 a carry flag based comparison. This fails to be true only when
19685 we decide to expand the comparison using arithmetic, which is not
19686 a common scenario. */
19687 start_sequence ();
19688 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19689 compare_seq = get_insns ();
19690 end_sequence ();
19691
19692 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19693 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19694 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19695 else
19696 code = GET_CODE (compare_op);
19697
19698 if (code != LTU && code != GEU)
19699 return false;
19700
19701 emit_insn (compare_seq);
19702 *pop = compare_op;
19703 return true;
19704 }
19705
19706 if (!INTEGRAL_MODE_P (mode))
19707 return false;
19708
19709 switch (code)
19710 {
19711 case LTU:
19712 case GEU:
19713 break;
19714
19715 /* Convert a==0 into (unsigned)a<1. */
19716 case EQ:
19717 case NE:
19718 if (op1 != const0_rtx)
19719 return false;
19720 op1 = const1_rtx;
19721 code = (code == EQ ? LTU : GEU);
19722 break;
19723
19724 /* Convert a>b into b<a or a>=b-1. */
19725 case GTU:
19726 case LEU:
19727 if (CONST_INT_P (op1))
19728 {
19729 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19730 /* Bail out on overflow. We could still swap operands, but that
19731 would force loading of the constant into a register. */
19732 if (op1 == const0_rtx
19733 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19734 return false;
19735 code = (code == GTU ? GEU : LTU);
19736 }
19737 else
19738 {
19739 rtx tmp = op1;
19740 op1 = op0;
19741 op0 = tmp;
19742 code = (code == GTU ? LTU : GEU);
19743 }
19744 break;
19745
19746 /* Convert a>=0 into (unsigned)a<0x80000000. */
19747 case LT:
19748 case GE:
19749 if (mode == DImode || op1 != const0_rtx)
19750 return false;
19751 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19752 code = (code == LT ? GEU : LTU);
19753 break;
19754 case LE:
19755 case GT:
19756 if (mode == DImode || op1 != constm1_rtx)
19757 return false;
19758 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19759 code = (code == LE ? GEU : LTU);
19760 break;
19761
19762 default:
19763 return false;
19764 }
19765 /* Swapping operands may cause a constant to appear as the first operand.  */
19766 if (!nonimmediate_operand (op0, VOIDmode))
19767 {
19768 if (!can_create_pseudo_p ())
19769 return false;
19770 op0 = force_reg (mode, op0);
19771 }
19772 *pop = ix86_expand_compare (code, op0, op1);
19773 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19774 return true;
19775 }
19776
19777 bool
19778 ix86_expand_int_movcc (rtx operands[])
19779 {
19780 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19781 rtx compare_seq, compare_op;
19782 enum machine_mode mode = GET_MODE (operands[0]);
19783 bool sign_bit_compare_p = false;
19784 rtx op0 = XEXP (operands[1], 0);
19785 rtx op1 = XEXP (operands[1], 1);
19786
19787 if (GET_MODE (op0) == TImode
19788 || (GET_MODE (op0) == DImode
19789 && !TARGET_64BIT))
19790 return false;
19791
19792 start_sequence ();
19793 compare_op = ix86_expand_compare (code, op0, op1);
19794 compare_seq = get_insns ();
19795 end_sequence ();
19796
19797 compare_code = GET_CODE (compare_op);
19798
19799 if ((op1 == const0_rtx && (code == GE || code == LT))
19800 || (op1 == constm1_rtx && (code == GT || code == LE)))
19801 sign_bit_compare_p = true;
19802
19803 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19804 HImode insns, we'd be swallowed in word prefix ops. */
19805
19806 if ((mode != HImode || TARGET_FAST_PREFIX)
19807 && (mode != (TARGET_64BIT ? TImode : DImode))
19808 && CONST_INT_P (operands[2])
19809 && CONST_INT_P (operands[3]))
19810 {
19811 rtx out = operands[0];
19812 HOST_WIDE_INT ct = INTVAL (operands[2]);
19813 HOST_WIDE_INT cf = INTVAL (operands[3]);
19814 HOST_WIDE_INT diff;
19815
19816 diff = ct - cf;
19817 /* Sign bit compares are better done using shifts than by using
19818 sbb.  */
19819 if (sign_bit_compare_p
19820 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19821 {
19822 /* Detect overlap between destination and compare sources. */
19823 rtx tmp = out;
19824
19825 if (!sign_bit_compare_p)
19826 {
19827 rtx flags;
19828 bool fpcmp = false;
19829
19830 compare_code = GET_CODE (compare_op);
19831
19832 flags = XEXP (compare_op, 0);
19833
19834 if (GET_MODE (flags) == CCFPmode
19835 || GET_MODE (flags) == CCFPUmode)
19836 {
19837 fpcmp = true;
19838 compare_code
19839 = ix86_fp_compare_code_to_integer (compare_code);
19840 }
19841
19842 /* To simplify rest of code, restrict to the GEU case. */
19843 if (compare_code == LTU)
19844 {
19845 HOST_WIDE_INT tmp = ct;
19846 ct = cf;
19847 cf = tmp;
19848 compare_code = reverse_condition (compare_code);
19849 code = reverse_condition (code);
19850 }
19851 else
19852 {
19853 if (fpcmp)
19854 PUT_CODE (compare_op,
19855 reverse_condition_maybe_unordered
19856 (GET_CODE (compare_op)));
19857 else
19858 PUT_CODE (compare_op,
19859 reverse_condition (GET_CODE (compare_op)));
19860 }
19861 diff = ct - cf;
19862
19863 if (reg_overlap_mentioned_p (out, op0)
19864 || reg_overlap_mentioned_p (out, op1))
19865 tmp = gen_reg_rtx (mode);
19866
19867 if (mode == DImode)
19868 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19869 else
19870 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19871 flags, compare_op));
19872 }
19873 else
19874 {
19875 if (code == GT || code == GE)
19876 code = reverse_condition (code);
19877 else
19878 {
19879 HOST_WIDE_INT tmp = ct;
19880 ct = cf;
19881 cf = tmp;
19882 diff = ct - cf;
19883 }
19884 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
19885 }
19886
19887 if (diff == 1)
19888 {
19889 /*
19890 * cmpl op0,op1
19891 * sbbl dest,dest
19892 * [addl dest, ct]
19893 *
19894 * Size 5 - 8.
19895 */
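/* With both operands equal, sbb computes dest = dest - dest - CF, i.e. -1
   when the carry is set and 0 otherwise.  Since diff == 1 here, cf == ct - 1,
   so the optional add of ct turns that -1/0 mask directly into cf/ct.  */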
19896 if (ct)
19897 tmp = expand_simple_binop (mode, PLUS,
19898 tmp, GEN_INT (ct),
19899 copy_rtx (tmp), 1, OPTAB_DIRECT);
19900 }
19901 else if (cf == -1)
19902 {
19903 /*
19904 * cmpl op0,op1
19905 * sbbl dest,dest
19906 * orl $ct, dest
19907 *
19908 * Size 8.
19909 */
19910 tmp = expand_simple_binop (mode, IOR,
19911 tmp, GEN_INT (ct),
19912 copy_rtx (tmp), 1, OPTAB_DIRECT);
19913 }
19914 else if (diff == -1 && ct)
19915 {
19916 /*
19917 * cmpl op0,op1
19918 * sbbl dest,dest
19919 * notl dest
19920 * [addl dest, cf]
19921 *
19922 * Size 8 - 11.
19923 */
19924 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19925 if (cf)
19926 tmp = expand_simple_binop (mode, PLUS,
19927 copy_rtx (tmp), GEN_INT (cf),
19928 copy_rtx (tmp), 1, OPTAB_DIRECT);
19929 }
19930 else
19931 {
19932 /*
19933 * cmpl op0,op1
19934 * sbbl dest,dest
19935 * [notl dest]
19936 * andl cf - ct, dest
19937 * [addl dest, ct]
19938 *
19939 * Size 8 - 11.
19940 */
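/* A worked instance: with ct == 7 and cf == 3, the sbb mask is -1 or 0,
   AND with (cf - ct) == -4 gives -4 or 0, and the final add of ct yields
   3 or 7 respectively.  */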
19941
19942 if (cf == 0)
19943 {
19944 cf = ct;
19945 ct = 0;
19946 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19947 }
19948
19949 tmp = expand_simple_binop (mode, AND,
19950 copy_rtx (tmp),
19951 gen_int_mode (cf - ct, mode),
19952 copy_rtx (tmp), 1, OPTAB_DIRECT);
19953 if (ct)
19954 tmp = expand_simple_binop (mode, PLUS,
19955 copy_rtx (tmp), GEN_INT (ct),
19956 copy_rtx (tmp), 1, OPTAB_DIRECT);
19957 }
19958
19959 if (!rtx_equal_p (tmp, out))
19960 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19961
19962 return true;
19963 }
19964
19965 if (diff < 0)
19966 {
19967 enum machine_mode cmp_mode = GET_MODE (op0);
19968
19969 HOST_WIDE_INT tmp;
19970 tmp = ct, ct = cf, cf = tmp;
19971 diff = -diff;
19972
19973 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19974 {
19975 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19976
19977 /* We may be reversing an unordered compare to a normal compare, which
19978 is not valid in general (we may convert a non-trapping condition
19979 to a trapping one); however, on i386 we currently emit all
19980 comparisons unordered.  */
19981 compare_code = reverse_condition_maybe_unordered (compare_code);
19982 code = reverse_condition_maybe_unordered (code);
19983 }
19984 else
19985 {
19986 compare_code = reverse_condition (compare_code);
19987 code = reverse_condition (code);
19988 }
19989 }
19990
19991 compare_code = UNKNOWN;
19992 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19993 && CONST_INT_P (op1))
19994 {
19995 if (op1 == const0_rtx
19996 && (code == LT || code == GE))
19997 compare_code = code;
19998 else if (op1 == constm1_rtx)
19999 {
20000 if (code == LE)
20001 compare_code = LT;
20002 else if (code == GT)
20003 compare_code = GE;
20004 }
20005 }
20006
20007 /* Optimize dest = (op0 < 0) ? -1 : cf. */
20008 if (compare_code != UNKNOWN
20009 && GET_MODE (op0) == GET_MODE (out)
20010 && (cf == -1 || ct == -1))
20011 {
20012 /* If lea code below could be used, only optimize
20013 if it results in a 2 insn sequence. */
20014
20015 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
20016 || diff == 3 || diff == 5 || diff == 9)
20017 || (compare_code == LT && ct == -1)
20018 || (compare_code == GE && cf == -1))
20019 {
20020 /*
20021 * notl op1 (if necessary)
20022 * sarl $31, op1
20023 * orl cf, op1
20024 */
20025 if (ct != -1)
20026 {
20027 cf = ct;
20028 ct = -1;
20029 code = reverse_condition (code);
20030 }
20031
20032 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20033
20034 out = expand_simple_binop (mode, IOR,
20035 out, GEN_INT (cf),
20036 out, 1, OPTAB_DIRECT);
20037 if (out != operands[0])
20038 emit_move_insn (operands[0], out);
20039
20040 return true;
20041 }
20042 }
20043
20044
20045 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
20046 || diff == 3 || diff == 5 || diff == 9)
20047 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
20048 && (mode != DImode
20049 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
20050 {
20051 /*
20052 * xorl dest,dest
20053 * cmpl op1,op2
20054 * setcc dest
20055 * lea cf(dest*(ct-cf)),dest
20056 *
20057 * Size 14.
20058 *
20059 * This also catches the degenerate setcc-only case.
20060 */
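/* For example, with ct == 5 and cf == 2 (diff == 3) the setcc result is
   0 or 1 and the lea computes 2 + dest * 3, i.e. either cf or ct, in a
   single address calculation.  */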
20061
20062 rtx tmp;
20063 int nops;
20064
20065 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20066
20067 nops = 0;
20068 /* On x86_64 the lea instruction operates on Pmode, so we need
20069 the arithmetic done in the proper mode to match.  */
20070 if (diff == 1)
20071 tmp = copy_rtx (out);
20072 else
20073 {
20074 rtx out1;
20075 out1 = copy_rtx (out);
20076 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
20077 nops++;
20078 if (diff & 1)
20079 {
20080 tmp = gen_rtx_PLUS (mode, tmp, out1);
20081 nops++;
20082 }
20083 }
20084 if (cf != 0)
20085 {
20086 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
20087 nops++;
20088 }
20089 if (!rtx_equal_p (tmp, out))
20090 {
20091 if (nops == 1)
20092 out = force_operand (tmp, copy_rtx (out));
20093 else
20094 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
20095 }
20096 if (!rtx_equal_p (out, operands[0]))
20097 emit_move_insn (operands[0], copy_rtx (out));
20098
20099 return true;
20100 }
20101
20102 /*
20103 * General case: Jumpful:
20104 * xorl dest,dest cmpl op1, op2
20105 * cmpl op1, op2 movl ct, dest
20106 * setcc dest jcc 1f
20107 * decl dest movl cf, dest
20108 * andl (cf-ct),dest 1:
20109 * addl ct,dest
20110 *
20111 * Size 20. Size 14.
20112 *
20113 * This is reasonably steep, but branch mispredict costs are
20114 * high on modern cpus, so consider failing only if optimizing
20115 * for space.
20116 */
20117
20118 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20119 && BRANCH_COST (optimize_insn_for_speed_p (),
20120 false) >= 2)
20121 {
20122 if (cf == 0)
20123 {
20124 enum machine_mode cmp_mode = GET_MODE (op0);
20125
20126 cf = ct;
20127 ct = 0;
20128
20129 if (SCALAR_FLOAT_MODE_P (cmp_mode))
20130 {
20131 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
20132
20133 /* We may be reversing an unordered compare to a normal compare,
20134 which is not valid in general (we may convert a non-trapping
20135 condition to a trapping one); however, on i386 we currently
20136 emit all comparisons unordered.  */
20137 code = reverse_condition_maybe_unordered (code);
20138 }
20139 else
20140 {
20141 code = reverse_condition (code);
20142 if (compare_code != UNKNOWN)
20143 compare_code = reverse_condition (compare_code);
20144 }
20145 }
20146
20147 if (compare_code != UNKNOWN)
20148 {
20149 /* notl op1 (if needed)
20150 sarl $31, op1
20151 andl (cf-ct), op1
20152 addl ct, op1
20153
20154 For x < 0 (resp. x <= -1) there will be no notl,
20155 so if possible swap the constants to get rid of the
20156 complement.
20157 True/false will be -1/0 while code below (store flag
20158 followed by decrement) is 0/-1, so the constants need
20159 to be exchanged once more. */
20160
20161 if (compare_code == GE || !cf)
20162 {
20163 code = reverse_condition (code);
20164 compare_code = LT;
20165 }
20166 else
20167 {
20168 HOST_WIDE_INT tmp = cf;
20169 cf = ct;
20170 ct = tmp;
20171 }
20172
20173 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
20174 }
20175 else
20176 {
20177 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
20178
20179 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
20180 constm1_rtx,
20181 copy_rtx (out), 1, OPTAB_DIRECT);
20182 }
20183
20184 out = expand_simple_binop (mode, AND, copy_rtx (out),
20185 gen_int_mode (cf - ct, mode),
20186 copy_rtx (out), 1, OPTAB_DIRECT);
20187 if (ct)
20188 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
20189 copy_rtx (out), 1, OPTAB_DIRECT);
20190 if (!rtx_equal_p (out, operands[0]))
20191 emit_move_insn (operands[0], copy_rtx (out));
20192
20193 return true;
20194 }
20195 }
20196
20197 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
20198 {
20199 /* Try a few things more with specific constants and a variable. */
20200
20201 optab op;
20202 rtx var, orig_out, out, tmp;
20203
20204 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
20205 return false;
20206
20207 /* If one of the two operands is an interesting constant (0 or -1), use the
20208 code above to load a 0/-1 mask and then mask the variable in with a logical operation.  */
20209
20210 if (CONST_INT_P (operands[2]))
20211 {
20212 var = operands[3];
20213 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
20214 operands[3] = constm1_rtx, op = and_optab;
20215 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
20216 operands[3] = const0_rtx, op = ior_optab;
20217 else
20218 return false;
20219 }
20220 else if (CONST_INT_P (operands[3]))
20221 {
20222 var = operands[2];
20223 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
20224 operands[2] = constm1_rtx, op = and_optab;
20225 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
20226 operands[2] = const0_rtx, op = ior_optab;
20227 else
20228 return false;
20229 }
20230 else
20231 return false;
20232
20233 orig_out = operands[0];
20234 tmp = gen_reg_rtx (mode);
20235 operands[0] = tmp;
20236
20237 /* Recurse to get the constant loaded. */
20238 if (ix86_expand_int_movcc (operands) == 0)
20239 return false;
20240
20241 /* Mask in the interesting variable. */
20242 out = expand_binop (mode, op, var, tmp, orig_out, 0,
20243 OPTAB_WIDEN);
20244 if (!rtx_equal_p (out, orig_out))
20245 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
20246
20247 return true;
20248 }
20249
20250 /*
20251 * For comparison with above,
20252 *
20253 * movl cf,dest
20254 * movl ct,tmp
20255 * cmpl op1,op2
20256 * cmovcc tmp,dest
20257 *
20258 * Size 15.
20259 */
20260
20261 if (! nonimmediate_operand (operands[2], mode))
20262 operands[2] = force_reg (mode, operands[2]);
20263 if (! nonimmediate_operand (operands[3], mode))
20264 operands[3] = force_reg (mode, operands[3]);
20265
20266 if (! register_operand (operands[2], VOIDmode)
20267 && (mode == QImode
20268 || ! register_operand (operands[3], VOIDmode)))
20269 operands[2] = force_reg (mode, operands[2]);
20270
20271 if (mode == QImode
20272 && ! register_operand (operands[3], VOIDmode))
20273 operands[3] = force_reg (mode, operands[3]);
20274
20275 emit_insn (compare_seq);
20276 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20277 gen_rtx_IF_THEN_ELSE (mode,
20278 compare_op, operands[2],
20279 operands[3])));
20280 return true;
20281 }
20282
20283 /* Swap, force into registers, or otherwise massage the two operands
20284 to an sse comparison with a mask result. Thus we differ a bit from
20285 ix86_prepare_fp_compare_args which expects to produce a flags result.
20286
20287 The DEST operand exists to help determine whether to commute commutative
20288 operators. The POP0/POP1 operands are updated in place. The new
20289 comparison code is returned, or UNKNOWN if not implementable. */
20290
20291 static enum rtx_code
20292 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
20293 rtx *pop0, rtx *pop1)
20294 {
20295 rtx tmp;
20296
20297 switch (code)
20298 {
20299 case LTGT:
20300 case UNEQ:
20301 /* AVX supports all the needed comparisons. */
20302 if (TARGET_AVX)
20303 break;
20304 /* We have no LTGT as an operator. We could implement it with
20305 NE & ORDERED, but this requires an extra temporary. It's
20306 not clear that it's worth it. */
20307 return UNKNOWN;
20308
20309 case LT:
20310 case LE:
20311 case UNGT:
20312 case UNGE:
20313 /* These are supported directly. */
20314 break;
20315
20316 case EQ:
20317 case NE:
20318 case UNORDERED:
20319 case ORDERED:
20320 /* AVX has 3 operand comparisons, no need to swap anything. */
20321 if (TARGET_AVX)
20322 break;
20323 /* For commutative operators, try to canonicalize the destination
20324 operand to be first in the comparison - this helps reload to
20325 avoid extra moves. */
20326 if (!dest || !rtx_equal_p (dest, *pop1))
20327 break;
20328 /* FALLTHRU */
20329
20330 case GE:
20331 case GT:
20332 case UNLE:
20333 case UNLT:
20334 /* These are not supported directly before AVX, and furthermore
20335 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
20336 comparison operands to transform into something that is
20337 supported. */
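/* E.g. a GT b is emitted as b LT a, matching the LT/LE/UNORD-style
   predicates that the pre-AVX cmpps/cmppd encodings provide.  */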
20338 tmp = *pop0;
20339 *pop0 = *pop1;
20340 *pop1 = tmp;
20341 code = swap_condition (code);
20342 break;
20343
20344 default:
20345 gcc_unreachable ();
20346 }
20347
20348 return code;
20349 }
20350
20351 /* Detect conditional moves that exactly match min/max operational
20352 semantics. Note that this is IEEE safe, as long as we don't
20353 interchange the operands.
20354
20355 Returns FALSE if this conditional move doesn't match a MIN/MAX,
20356 and TRUE if the operation is successful and instructions are emitted. */
20357
20358 static bool
20359 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
20360 rtx cmp_op1, rtx if_true, rtx if_false)
20361 {
20362 enum machine_mode mode;
20363 bool is_min;
20364 rtx tmp;
20365
20366 if (code == LT)
20367 ;
20368 else if (code == UNGE)
20369 {
20370 tmp = if_true;
20371 if_true = if_false;
20372 if_false = tmp;
20373 }
20374 else
20375 return false;
20376
20377 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
20378 is_min = true;
20379 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
20380 is_min = false;
20381 else
20382 return false;
20383
20384 mode = GET_MODE (dest);
20385
20386 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
20387 but MODE may be a vector mode and thus not appropriate. */
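/* The SSE min/max instructions return the second source operand when either
   input is a NaN and do not treat -0.0 and +0.0 symmetrically, so unless the
   finite/unsafe math flags are given we must preserve the exact operand
   order via the IEEE min/max unspecs below.  */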
20388 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
20389 {
20390 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
20391 rtvec v;
20392
20393 if_true = force_reg (mode, if_true);
20394 v = gen_rtvec (2, if_true, if_false);
20395 tmp = gen_rtx_UNSPEC (mode, v, u);
20396 }
20397 else
20398 {
20399 code = is_min ? SMIN : SMAX;
20400 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
20401 }
20402
20403 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
20404 return true;
20405 }
20406
20407 /* Expand an sse vector comparison. Return the register with the result. */
20408
20409 static rtx
20410 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
20411 rtx op_true, rtx op_false)
20412 {
20413 enum machine_mode mode = GET_MODE (dest);
20414 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
20415 rtx x;
20416
20417 cmp_op0 = force_reg (cmp_mode, cmp_op0);
20418 if (!nonimmediate_operand (cmp_op1, cmp_mode))
20419 cmp_op1 = force_reg (cmp_mode, cmp_op1);
20420
20421 if (optimize
20422 || reg_overlap_mentioned_p (dest, op_true)
20423 || reg_overlap_mentioned_p (dest, op_false))
20424 dest = gen_reg_rtx (mode);
20425
20426 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
20427 if (cmp_mode != mode)
20428 {
20429 x = force_reg (cmp_mode, x);
20430 convert_move (dest, x, false);
20431 }
20432 else
20433 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20434
20435 return dest;
20436 }
20437
20438 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
20439 operations. This is used for both scalar and vector conditional moves. */
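/* When no blend instruction is available, this boils down to
   dest = (cmp & op_true) | (~cmp & op_false), relying on CMP being an
   all-ones or all-zeros mask in each element; the special cases below
   shortcut that when one arm is constant all-zeros or all-ones.  */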
20440
20441 static void
20442 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
20443 {
20444 enum machine_mode mode = GET_MODE (dest);
20445 rtx t2, t3, x;
20446
20447 if (vector_all_ones_operand (op_true, mode)
20448 && rtx_equal_p (op_false, CONST0_RTX (mode)))
20449 {
20450 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
20451 }
20452 else if (op_false == CONST0_RTX (mode))
20453 {
20454 op_true = force_reg (mode, op_true);
20455 x = gen_rtx_AND (mode, cmp, op_true);
20456 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20457 }
20458 else if (op_true == CONST0_RTX (mode))
20459 {
20460 op_false = force_reg (mode, op_false);
20461 x = gen_rtx_NOT (mode, cmp);
20462 x = gen_rtx_AND (mode, x, op_false);
20463 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20464 }
20465 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
20466 {
20467 op_false = force_reg (mode, op_false);
20468 x = gen_rtx_IOR (mode, cmp, op_false);
20469 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20470 }
20471 else if (TARGET_XOP)
20472 {
20473 op_true = force_reg (mode, op_true);
20474
20475 if (!nonimmediate_operand (op_false, mode))
20476 op_false = force_reg (mode, op_false);
20477
20478 emit_insn (gen_rtx_SET (mode, dest,
20479 gen_rtx_IF_THEN_ELSE (mode, cmp,
20480 op_true,
20481 op_false)));
20482 }
20483 else
20484 {
20485 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
20486 rtx d = dest;
20487
20488 if (!nonimmediate_operand (op_true, mode))
20489 op_true = force_reg (mode, op_true);
20490
20491 op_false = force_reg (mode, op_false);
20492
20493 switch (mode)
20494 {
20495 case V4SFmode:
20496 if (TARGET_SSE4_1)
20497 gen = gen_sse4_1_blendvps;
20498 break;
20499 case V2DFmode:
20500 if (TARGET_SSE4_1)
20501 gen = gen_sse4_1_blendvpd;
20502 break;
20503 case V16QImode:
20504 case V8HImode:
20505 case V4SImode:
20506 case V2DImode:
20507 if (TARGET_SSE4_1)
20508 {
20509 gen = gen_sse4_1_pblendvb;
20510 if (mode != V16QImode)
20511 d = gen_reg_rtx (V16QImode);
20512 op_false = gen_lowpart (V16QImode, op_false);
20513 op_true = gen_lowpart (V16QImode, op_true);
20514 cmp = gen_lowpart (V16QImode, cmp);
20515 }
20516 break;
20517 case V8SFmode:
20518 if (TARGET_AVX)
20519 gen = gen_avx_blendvps256;
20520 break;
20521 case V4DFmode:
20522 if (TARGET_AVX)
20523 gen = gen_avx_blendvpd256;
20524 break;
20525 case V32QImode:
20526 case V16HImode:
20527 case V8SImode:
20528 case V4DImode:
20529 if (TARGET_AVX2)
20530 {
20531 gen = gen_avx2_pblendvb;
20532 if (mode != V32QImode)
20533 d = gen_reg_rtx (V32QImode);
20534 op_false = gen_lowpart (V32QImode, op_false);
20535 op_true = gen_lowpart (V32QImode, op_true);
20536 cmp = gen_lowpart (V32QImode, cmp);
20537 }
20538 break;
20539 default:
20540 break;
20541 }
20542
20543 if (gen != NULL)
20544 {
20545 emit_insn (gen (d, op_false, op_true, cmp));
20546 if (d != dest)
20547 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
20548 }
20549 else
20550 {
20551 op_true = force_reg (mode, op_true);
20552
20553 t2 = gen_reg_rtx (mode);
20554 if (optimize)
20555 t3 = gen_reg_rtx (mode);
20556 else
20557 t3 = dest;
20558
20559 x = gen_rtx_AND (mode, op_true, cmp);
20560 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20561
20562 x = gen_rtx_NOT (mode, cmp);
20563 x = gen_rtx_AND (mode, x, op_false);
20564 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20565
20566 x = gen_rtx_IOR (mode, t3, t2);
20567 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20568 }
20569 }
20570 }
20571
20572 /* Expand a floating-point conditional move. Return true if successful. */
20573
20574 bool
20575 ix86_expand_fp_movcc (rtx operands[])
20576 {
20577 enum machine_mode mode = GET_MODE (operands[0]);
20578 enum rtx_code code = GET_CODE (operands[1]);
20579 rtx tmp, compare_op;
20580 rtx op0 = XEXP (operands[1], 0);
20581 rtx op1 = XEXP (operands[1], 1);
20582
20583 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20584 {
20585 enum machine_mode cmode;
20586
20587 /* Since we've no cmove for sse registers, don't force bad register
20588 allocation just to gain access to it. Deny movcc when the
20589 comparison mode doesn't match the move mode. */
20590 cmode = GET_MODE (op0);
20591 if (cmode == VOIDmode)
20592 cmode = GET_MODE (op1);
20593 if (cmode != mode)
20594 return false;
20595
20596 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20597 if (code == UNKNOWN)
20598 return false;
20599
20600 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20601 operands[2], operands[3]))
20602 return true;
20603
20604 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20605 operands[2], operands[3]);
20606 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20607 return true;
20608 }
20609
20610 if (GET_MODE (op0) == TImode
20611 || (GET_MODE (op0) == DImode
20612 && !TARGET_64BIT))
20613 return false;
20614
20615 /* The floating point conditional move instructions don't directly
20616 support conditions resulting from a signed integer comparison. */
20617
20618 compare_op = ix86_expand_compare (code, op0, op1);
20619 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20620 {
20621 tmp = gen_reg_rtx (QImode);
20622 ix86_expand_setcc (tmp, code, op0, op1);
20623
20624 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20625 }
20626
20627 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20628 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20629 operands[2], operands[3])));
20630
20631 return true;
20632 }
20633
20634 /* Expand a floating-point vector conditional move; a vcond operation
20635 rather than a movcc operation. */
20636
20637 bool
20638 ix86_expand_fp_vcond (rtx operands[])
20639 {
20640 enum rtx_code code = GET_CODE (operands[3]);
20641 rtx cmp;
20642
20643 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20644 &operands[4], &operands[5]);
20645 if (code == UNKNOWN)
20646 {
20647 rtx temp;
20648 switch (GET_CODE (operands[3]))
20649 {
20650 case LTGT:
20651 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20652 operands[5], operands[0], operands[0]);
20653 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20654 operands[5], operands[1], operands[2]);
20655 code = AND;
20656 break;
20657 case UNEQ:
20658 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20659 operands[5], operands[0], operands[0]);
20660 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20661 operands[5], operands[1], operands[2]);
20662 code = IOR;
20663 break;
20664 default:
20665 gcc_unreachable ();
20666 }
20667 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20668 OPTAB_DIRECT);
20669 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20670 return true;
20671 }
20672
20673 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20674 operands[5], operands[1], operands[2]))
20675 return true;
20676
20677 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20678 operands[1], operands[2]);
20679 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20680 return true;
20681 }
20682
20683 /* Expand a signed/unsigned integral vector conditional move. */
20684
20685 bool
20686 ix86_expand_int_vcond (rtx operands[])
20687 {
20688 enum machine_mode data_mode = GET_MODE (operands[0]);
20689 enum machine_mode mode = GET_MODE (operands[4]);
20690 enum rtx_code code = GET_CODE (operands[3]);
20691 bool negate = false;
20692 rtx x, cop0, cop1;
20693
20694 cop0 = operands[4];
20695 cop1 = operands[5];
20696
20697 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20698 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
20699 if ((code == LT || code == GE)
20700 && data_mode == mode
20701 && cop1 == CONST0_RTX (mode)
20702 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20703 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20704 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20705 && (GET_MODE_SIZE (data_mode) == 16
20706 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20707 {
20708 rtx negop = operands[2 - (code == LT)];
20709 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20710 if (negop == CONST1_RTX (data_mode))
20711 {
20712 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20713 operands[0], 1, OPTAB_DIRECT);
20714 if (res != operands[0])
20715 emit_move_insn (operands[0], res);
20716 return true;
20717 }
20718 else if (GET_MODE_INNER (data_mode) != DImode
20719 && vector_all_ones_operand (negop, data_mode))
20720 {
20721 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20722 operands[0], 0, OPTAB_DIRECT);
20723 if (res != operands[0])
20724 emit_move_insn (operands[0], res);
20725 return true;
20726 }
20727 }
20728
20729 if (!nonimmediate_operand (cop1, mode))
20730 cop1 = force_reg (mode, cop1);
20731 if (!general_operand (operands[1], data_mode))
20732 operands[1] = force_reg (data_mode, operands[1]);
20733 if (!general_operand (operands[2], data_mode))
20734 operands[2] = force_reg (data_mode, operands[2]);
20735
20736 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20737 if (TARGET_XOP
20738 && (mode == V16QImode || mode == V8HImode
20739 || mode == V4SImode || mode == V2DImode))
20740 ;
20741 else
20742 {
20743 /* Canonicalize the comparison to EQ, GT, GTU. */
20744 switch (code)
20745 {
20746 case EQ:
20747 case GT:
20748 case GTU:
20749 break;
20750
20751 case NE:
20752 case LE:
20753 case LEU:
20754 code = reverse_condition (code);
20755 negate = true;
20756 break;
20757
20758 case GE:
20759 case GEU:
20760 code = reverse_condition (code);
20761 negate = true;
20762 /* FALLTHRU */
20763
20764 case LT:
20765 case LTU:
20766 code = swap_condition (code);
20767 x = cop0, cop0 = cop1, cop1 = x;
20768 break;
20769
20770 default:
20771 gcc_unreachable ();
20772 }
20773
20774 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20775 if (mode == V2DImode)
20776 {
20777 switch (code)
20778 {
20779 case EQ:
20780 /* SSE4.1 supports EQ. */
20781 if (!TARGET_SSE4_1)
20782 return false;
20783 break;
20784
20785 case GT:
20786 case GTU:
20787 /* SSE4.2 supports GT/GTU. */
20788 if (!TARGET_SSE4_2)
20789 return false;
20790 break;
20791
20792 default:
20793 gcc_unreachable ();
20794 }
20795 }
20796
20797 /* Unsigned parallel compare is not supported by the hardware.
20798 Play some tricks to turn this into a signed comparison
20799 against 0. */
20800 if (code == GTU)
20801 {
20802 cop0 = force_reg (mode, cop0);
20803
20804 switch (mode)
20805 {
20806 case V8SImode:
20807 case V4DImode:
20808 case V4SImode:
20809 case V2DImode:
20810 {
20811 rtx t1, t2, mask;
20812 rtx (*gen_sub3) (rtx, rtx, rtx);
20813
20814 switch (mode)
20815 {
20816 case V8SImode: gen_sub3 = gen_subv8si3; break;
20817 case V4DImode: gen_sub3 = gen_subv4di3; break;
20818 case V4SImode: gen_sub3 = gen_subv4si3; break;
20819 case V2DImode: gen_sub3 = gen_subv2di3; break;
20820 default:
20821 gcc_unreachable ();
20822 }
20823 /* Subtract (-(INT MAX) - 1) from both operands to make
20824 them signed. */
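/* Biasing both operands by the sign-bit constant preserves their relative
   order while mapping unsigned order onto signed order, so GTU can be
   carried out with the signed pcmpgt the hardware does provide.  */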
20825 mask = ix86_build_signbit_mask (mode, true, false);
20826 t1 = gen_reg_rtx (mode);
20827 emit_insn (gen_sub3 (t1, cop0, mask));
20828
20829 t2 = gen_reg_rtx (mode);
20830 emit_insn (gen_sub3 (t2, cop1, mask));
20831
20832 cop0 = t1;
20833 cop1 = t2;
20834 code = GT;
20835 }
20836 break;
20837
20838 case V32QImode:
20839 case V16HImode:
20840 case V16QImode:
20841 case V8HImode:
20842 /* Perform a parallel unsigned saturating subtraction. */
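/* a GTU b exactly when the unsigned saturating difference (a -us b) is
   nonzero, so compare that difference against zero with EQ and flip
   NEGATE to get the desired sense.  */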
20843 x = gen_reg_rtx (mode);
20844 emit_insn (gen_rtx_SET (VOIDmode, x,
20845 gen_rtx_US_MINUS (mode, cop0, cop1)));
20846
20847 cop0 = x;
20848 cop1 = CONST0_RTX (mode);
20849 code = EQ;
20850 negate = !negate;
20851 break;
20852
20853 default:
20854 gcc_unreachable ();
20855 }
20856 }
20857 }
20858
20859 /* Allow the comparison to be done in one mode, but the movcc to
20860 happen in another mode. */
20861 if (data_mode == mode)
20862 {
20863 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20864 operands[1+negate], operands[2-negate]);
20865 }
20866 else
20867 {
20868 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20869 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
20870 operands[1+negate], operands[2-negate]);
20871 x = gen_lowpart (data_mode, x);
20872 }
20873
20874 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20875 operands[2-negate]);
20876 return true;
20877 }
20878
20879 /* Expand a variable vector permutation. */
20880
20881 void
20882 ix86_expand_vec_perm (rtx operands[])
20883 {
20884 rtx target = operands[0];
20885 rtx op0 = operands[1];
20886 rtx op1 = operands[2];
20887 rtx mask = operands[3];
20888 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
20889 enum machine_mode mode = GET_MODE (op0);
20890 enum machine_mode maskmode = GET_MODE (mask);
20891 int w, e, i;
20892 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20893
20894 /* Number of elements in the vector. */
20895 w = GET_MODE_NUNITS (mode);
20896 e = GET_MODE_UNIT_SIZE (mode);
20897 gcc_assert (w <= 32);
20898
20899 if (TARGET_AVX2)
20900 {
20901 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20902 {
20903 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20904 a constant shuffle operand.  With a tiny bit of effort we can
20905 use VPERMD instead.  A re-interpretation stall for V4DFmode is
20906 unfortunate but there's no avoiding it.
20907 Similarly for V16HImode we don't have instructions for variable
20908 shuffling, while for V32QImode we can, after preparing suitable
20909 masks, use vpshufb; vpshufb; vpermq; vpor.  */
20910
20911 if (mode == V16HImode)
20912 {
20913 maskmode = mode = V32QImode;
20914 w = 32;
20915 e = 1;
20916 }
20917 else
20918 {
20919 maskmode = mode = V8SImode;
20920 w = 8;
20921 e = 4;
20922 }
20923 t1 = gen_reg_rtx (maskmode);
20924
20925 /* Replicate the low bits of the V4DImode mask into V8SImode:
20926 mask = { A B C D }
20927 t1 = { A A B B C C D D }. */
20928 for (i = 0; i < w / 2; ++i)
20929 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20930 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20931 vt = force_reg (maskmode, vt);
20932 mask = gen_lowpart (maskmode, mask);
20933 if (maskmode == V8SImode)
20934 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20935 else
20936 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20937
20938 /* Multiply the shuffle indices by two.  */
20939 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20940 OPTAB_DIRECT);
20941
20942 /* Add one to the odd shuffle indices:
20943 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20944 for (i = 0; i < w / 2; ++i)
20945 {
20946 vec[i * 2] = const0_rtx;
20947 vec[i * 2 + 1] = const1_rtx;
20948 }
20949 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20950 vt = validize_mem (force_const_mem (maskmode, vt));
20951 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20952 OPTAB_DIRECT);
20953
20954 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20955 operands[3] = mask = t1;
20956 target = gen_reg_rtx (mode);
20957 op0 = gen_lowpart (mode, op0);
20958 op1 = gen_lowpart (mode, op1);
20959 }
20960
20961 switch (mode)
20962 {
20963 case V8SImode:
20964 /* The VPERMD and VPERMPS instructions already properly ignore
20965 the high bits of the shuffle elements. No need for us to
20966 perform an AND ourselves. */
20967 if (one_operand_shuffle)
20968 {
20969 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20970 if (target != operands[0])
20971 emit_move_insn (operands[0],
20972 gen_lowpart (GET_MODE (operands[0]), target));
20973 }
20974 else
20975 {
20976 t1 = gen_reg_rtx (V8SImode);
20977 t2 = gen_reg_rtx (V8SImode);
20978 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20979 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20980 goto merge_two;
20981 }
20982 return;
20983
20984 case V8SFmode:
20985 mask = gen_lowpart (V8SFmode, mask);
20986 if (one_operand_shuffle)
20987 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20988 else
20989 {
20990 t1 = gen_reg_rtx (V8SFmode);
20991 t2 = gen_reg_rtx (V8SFmode);
20992 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20993 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20994 goto merge_two;
20995 }
20996 return;
20997
20998 case V4SImode:
20999 /* By combining the two 128-bit input vectors into one 256-bit
21000 input vector, we can use VPERMD and VPERMPS for the full
21001 two-operand shuffle. */
21002 t1 = gen_reg_rtx (V8SImode);
21003 t2 = gen_reg_rtx (V8SImode);
21004 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
21005 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21006 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
21007 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
21008 return;
21009
21010 case V4SFmode:
21011 t1 = gen_reg_rtx (V8SFmode);
21012 t2 = gen_reg_rtx (V8SImode);
21013 mask = gen_lowpart (V4SImode, mask);
21014 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
21015 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
21016 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
21017 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
21018 return;
21019
21020 case V32QImode:
21021 t1 = gen_reg_rtx (V32QImode);
21022 t2 = gen_reg_rtx (V32QImode);
21023 t3 = gen_reg_rtx (V32QImode);
21024 vt2 = GEN_INT (128);
21025 for (i = 0; i < 32; i++)
21026 vec[i] = vt2;
21027 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21028 vt = force_reg (V32QImode, vt);
21029 for (i = 0; i < 32; i++)
21030 vec[i] = i < 16 ? vt2 : const0_rtx;
21031 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
21032 vt2 = force_reg (V32QImode, vt2);
21033 /* From mask create two adjusted masks, which contain the same
21034 bits as mask in the low 7 bits of each vector element.
21035 The first mask will have the most significant bit clear
21036 if it requests element from the same 128-bit lane
21037 and MSB set if it requests element from the other 128-bit lane.
21038 The second mask will have the opposite values of the MSB,
21039 and additionally will have its 128-bit lanes swapped.
21040 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
21041 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
21042 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
21043 stands for other 12 bytes. */
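/* Recall that vpshufb writes zero to a destination byte whenever the most
   significant bit of the corresponding mask byte is set; the MSB
   manipulation below uses exactly that to let each of the two shuffles
   pick up only same-lane or only cross-lane elements.  */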
21044 /* The bit that tells whether an element comes from the same lane or the
21045 other lane is bit 4, so shift it up by 3 to the MSB position.  */
21046 t5 = gen_reg_rtx (V4DImode);
21047 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
21048 GEN_INT (3)));
21049 /* Clear MSB bits from the mask just in case it had them set. */
21050 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
21051 /* After this t1 will have MSB set for elements from other lane. */
21052 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
21053 /* Clear bits other than MSB. */
21054 emit_insn (gen_andv32qi3 (t1, t1, vt));
21055 /* Or in the lower bits from mask into t3. */
21056 emit_insn (gen_iorv32qi3 (t3, t1, t2));
21057 /* And invert MSB bits in t1, so MSB is set for elements from the same
21058 lane. */
21059 emit_insn (gen_xorv32qi3 (t1, t1, vt));
21060 /* Swap 128-bit lanes in t3. */
21061 t6 = gen_reg_rtx (V4DImode);
21062 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
21063 const2_rtx, GEN_INT (3),
21064 const0_rtx, const1_rtx));
21065 /* And or in the lower bits from mask into t1. */
21066 emit_insn (gen_iorv32qi3 (t1, t1, t2));
21067 if (one_operand_shuffle)
21068 {
21069 /* Each of these shuffles will put 0s in places where
21070 element from the other 128-bit lane is needed, otherwise
21071 will shuffle in the requested value. */
21072 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
21073 gen_lowpart (V32QImode, t6)));
21074 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
21075 /* For t3 the 128-bit lanes are swapped again. */
21076 t7 = gen_reg_rtx (V4DImode);
21077 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
21078 const2_rtx, GEN_INT (3),
21079 const0_rtx, const1_rtx));
21080 /* And oring both together leads to the result. */
21081 emit_insn (gen_iorv32qi3 (target, t1,
21082 gen_lowpart (V32QImode, t7)));
21083 if (target != operands[0])
21084 emit_move_insn (operands[0],
21085 gen_lowpart (GET_MODE (operands[0]), target));
21086 return;
21087 }
21088
21089 t4 = gen_reg_rtx (V32QImode);
21090 /* Similarly to the one_operand_shuffle code above,
21091 just repeated twice, once for each operand.  The merge_two:
21092 code will merge the two results together.  */
21093 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
21094 gen_lowpart (V32QImode, t6)));
21095 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
21096 gen_lowpart (V32QImode, t6)));
21097 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
21098 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
21099 t7 = gen_reg_rtx (V4DImode);
21100 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
21101 const2_rtx, GEN_INT (3),
21102 const0_rtx, const1_rtx));
21103 t8 = gen_reg_rtx (V4DImode);
21104 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
21105 const2_rtx, GEN_INT (3),
21106 const0_rtx, const1_rtx));
21107 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
21108 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
21109 t1 = t4;
21110 t2 = t3;
21111 goto merge_two;
21112
21113 default:
21114 gcc_assert (GET_MODE_SIZE (mode) <= 16);
21115 break;
21116 }
21117 }
21118
21119 if (TARGET_XOP)
21120 {
21121 /* The XOP VPPERM insn supports three inputs. By ignoring the
21122 one_operand_shuffle special case, we avoid creating another
21123 set of constant vectors in memory. */
21124 one_operand_shuffle = false;
21125
21126 /* mask = mask & {2*w-1, ...} */
21127 vt = GEN_INT (2*w - 1);
21128 }
21129 else
21130 {
21131 /* mask = mask & {w-1, ...} */
21132 vt = GEN_INT (w - 1);
21133 }
21134
21135 for (i = 0; i < w; i++)
21136 vec[i] = vt;
21137 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21138 mask = expand_simple_binop (maskmode, AND, mask, vt,
21139 NULL_RTX, 0, OPTAB_DIRECT);
21140
21141 /* For non-QImode operations, convert the word permutation control
21142 into a byte permutation control. */
21143 if (mode != V16QImode)
21144 {
21145 mask = expand_simple_binop (maskmode, ASHIFT, mask,
21146 GEN_INT (exact_log2 (e)),
21147 NULL_RTX, 0, OPTAB_DIRECT);
21148
21149 /* Convert mask to vector of chars. */
21150 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
21151
21152 /* Replicate each of the input bytes into byte positions:
21153 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
21154 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
21155 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
21156 for (i = 0; i < 16; ++i)
21157 vec[i] = GEN_INT (i/e * e);
21158 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21159 vt = validize_mem (force_const_mem (V16QImode, vt));
21160 if (TARGET_XOP)
21161 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
21162 else
21163 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
21164
21165 /* Convert it into the byte positions by doing
21166 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
21167 for (i = 0; i < 16; ++i)
21168 vec[i] = GEN_INT (i % e);
21169 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
21170 vt = validize_mem (force_const_mem (V16QImode, vt));
21171 emit_insn (gen_addv16qi3 (mask, mask, vt));
21172 }
21173
21174 /* The actual shuffle operations all operate on V16QImode. */
21175 op0 = gen_lowpart (V16QImode, op0);
21176 op1 = gen_lowpart (V16QImode, op1);
21177
21178 if (TARGET_XOP)
21179 {
21180 if (GET_MODE (target) != V16QImode)
21181 target = gen_reg_rtx (V16QImode);
21182 emit_insn (gen_xop_pperm (target, op0, op1, mask));
21183 if (target != operands[0])
21184 emit_move_insn (operands[0],
21185 gen_lowpart (GET_MODE (operands[0]), target));
21186 }
21187 else if (one_operand_shuffle)
21188 {
21189 if (GET_MODE (target) != V16QImode)
21190 target = gen_reg_rtx (V16QImode);
21191 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
21192 if (target != operands[0])
21193 emit_move_insn (operands[0],
21194 gen_lowpart (GET_MODE (operands[0]), target));
21195 }
21196 else
21197 {
21198 rtx xops[6];
21199 bool ok;
21200
21201 /* Shuffle the two input vectors independently. */
21202 t1 = gen_reg_rtx (V16QImode);
21203 t2 = gen_reg_rtx (V16QImode);
21204 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
21205 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
21206
21207 merge_two:
21208 /* Then merge them together. The key is whether any given control
21209 element contained a bit set that indicates the second word. */
21210 mask = operands[3];
21211 vt = GEN_INT (w);
21212 if (maskmode == V2DImode && !TARGET_SSE4_1)
21213 {
21214 /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
21215 more shuffle to convert the V2DI input mask into a V4SI
21216 input mask, at which point the masking that expand_int_vcond
21217 does will work as desired.  */
21218 rtx t3 = gen_reg_rtx (V4SImode);
21219 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
21220 const0_rtx, const0_rtx,
21221 const2_rtx, const2_rtx));
21222 mask = t3;
21223 maskmode = V4SImode;
21224 e = w = 4;
21225 }
21226
21227 for (i = 0; i < w; i++)
21228 vec[i] = vt;
21229 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
21230 vt = force_reg (maskmode, vt);
21231 mask = expand_simple_binop (maskmode, AND, mask, vt,
21232 NULL_RTX, 0, OPTAB_DIRECT);
21233
21234 if (GET_MODE (target) != mode)
21235 target = gen_reg_rtx (mode);
21236 xops[0] = target;
21237 xops[1] = gen_lowpart (mode, t2);
21238 xops[2] = gen_lowpart (mode, t1);
21239 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
21240 xops[4] = mask;
21241 xops[5] = vt;
21242 ok = ix86_expand_int_vcond (xops);
21243 gcc_assert (ok);
21244 if (target != operands[0])
21245 emit_move_insn (operands[0],
21246 gen_lowpart (GET_MODE (operands[0]), target));
21247 }
21248 }
21249
21250 /* Unpack SRC into DEST, which has the next wider integer vector type.  UNSIGNED_P is
21251 true if we should do zero extension, else sign extension. HIGH_P is
21252 true if we want the N/2 high elements, else the low elements. */
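/* For example, unpacking a V8HImode source into a V4SImode destination
   widens four of its HImode elements; with SSE4.1 this is a single
   pmovsx/pmovzx, while the fallback below interleaves the source with
   either zeros or a computed sign mask.  */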
21253
21254 void
21255 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
21256 {
21257 enum machine_mode imode = GET_MODE (src);
21258 rtx tmp;
21259
21260 if (TARGET_SSE4_1)
21261 {
21262 rtx (*unpack)(rtx, rtx);
21263 rtx (*extract)(rtx, rtx) = NULL;
21264 enum machine_mode halfmode = BLKmode;
21265
21266 switch (imode)
21267 {
21268 case V32QImode:
21269 if (unsigned_p)
21270 unpack = gen_avx2_zero_extendv16qiv16hi2;
21271 else
21272 unpack = gen_avx2_sign_extendv16qiv16hi2;
21273 halfmode = V16QImode;
21274 extract
21275 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
21276 break;
21277 case V16HImode:
21278 if (unsigned_p)
21279 unpack = gen_avx2_zero_extendv8hiv8si2;
21280 else
21281 unpack = gen_avx2_sign_extendv8hiv8si2;
21282 halfmode = V8HImode;
21283 extract
21284 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
21285 break;
21286 case V8SImode:
21287 if (unsigned_p)
21288 unpack = gen_avx2_zero_extendv4siv4di2;
21289 else
21290 unpack = gen_avx2_sign_extendv4siv4di2;
21291 halfmode = V4SImode;
21292 extract
21293 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
21294 break;
21295 case V16QImode:
21296 if (unsigned_p)
21297 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
21298 else
21299 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
21300 break;
21301 case V8HImode:
21302 if (unsigned_p)
21303 unpack = gen_sse4_1_zero_extendv4hiv4si2;
21304 else
21305 unpack = gen_sse4_1_sign_extendv4hiv4si2;
21306 break;
21307 case V4SImode:
21308 if (unsigned_p)
21309 unpack = gen_sse4_1_zero_extendv2siv2di2;
21310 else
21311 unpack = gen_sse4_1_sign_extendv2siv2di2;
21312 break;
21313 default:
21314 gcc_unreachable ();
21315 }
21316
21317 if (GET_MODE_SIZE (imode) == 32)
21318 {
21319 tmp = gen_reg_rtx (halfmode);
21320 emit_insn (extract (tmp, src));
21321 }
21322 else if (high_p)
21323 {
21324 /* Shift higher 8 bytes to lower 8 bytes. */
21325 tmp = gen_reg_rtx (V1TImode);
21326 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
21327 GEN_INT (64)));
21328 tmp = gen_lowpart (imode, tmp);
21329 }
21330 else
21331 tmp = src;
21332
21333 emit_insn (unpack (dest, tmp));
21334 }
21335 else
21336 {
21337 rtx (*unpack)(rtx, rtx, rtx);
21338
21339 switch (imode)
21340 {
21341 case V16QImode:
21342 if (high_p)
21343 unpack = gen_vec_interleave_highv16qi;
21344 else
21345 unpack = gen_vec_interleave_lowv16qi;
21346 break;
21347 case V8HImode:
21348 if (high_p)
21349 unpack = gen_vec_interleave_highv8hi;
21350 else
21351 unpack = gen_vec_interleave_lowv8hi;
21352 break;
21353 case V4SImode:
21354 if (high_p)
21355 unpack = gen_vec_interleave_highv4si;
21356 else
21357 unpack = gen_vec_interleave_lowv4si;
21358 break;
21359 default:
21360 gcc_unreachable ();
21361 }
21362
21363 if (unsigned_p)
21364 tmp = force_reg (imode, CONST0_RTX (imode));
21365 else
21366 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
21367 src, pc_rtx, pc_rtx);
21368
21369 rtx tmp2 = gen_reg_rtx (imode);
21370 emit_insn (unpack (tmp2, src, tmp));
21371 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
21372 }
21373 }
21374
21375 /* Expand conditional increment or decrement using adc/sbb instructions.
21376 The default case using setcc followed by the conditional move can be
21377 done by generic code. */
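/* A typical case is x = (a < b) ? x + 1 : x with an unsigned compare:
   cmp sets the carry flag and a single adc (or sbb for the decrement)
   folds the conditional increment in, with no setcc or cmov required.  */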
21378 bool
21379 ix86_expand_int_addcc (rtx operands[])
21380 {
21381 enum rtx_code code = GET_CODE (operands[1]);
21382 rtx flags;
21383 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
21384 rtx compare_op;
21385 rtx val = const0_rtx;
21386 bool fpcmp = false;
21387 enum machine_mode mode;
21388 rtx op0 = XEXP (operands[1], 0);
21389 rtx op1 = XEXP (operands[1], 1);
21390
21391 if (operands[3] != const1_rtx
21392 && operands[3] != constm1_rtx)
21393 return false;
21394 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
21395 return false;
21396 code = GET_CODE (compare_op);
21397
21398 flags = XEXP (compare_op, 0);
21399
21400 if (GET_MODE (flags) == CCFPmode
21401 || GET_MODE (flags) == CCFPUmode)
21402 {
21403 fpcmp = true;
21404 code = ix86_fp_compare_code_to_integer (code);
21405 }
21406
21407 if (code != LTU)
21408 {
21409 val = constm1_rtx;
21410 if (fpcmp)
21411 PUT_CODE (compare_op,
21412 reverse_condition_maybe_unordered
21413 (GET_CODE (compare_op)));
21414 else
21415 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
21416 }
21417
21418 mode = GET_MODE (operands[0]);
21419
21420 /* Construct either adc or sbb insn. */
21421 if ((code == LTU) == (operands[3] == constm1_rtx))
21422 {
21423 switch (mode)
21424 {
21425 case QImode:
21426 insn = gen_subqi3_carry;
21427 break;
21428 case HImode:
21429 insn = gen_subhi3_carry;
21430 break;
21431 case SImode:
21432 insn = gen_subsi3_carry;
21433 break;
21434 case DImode:
21435 insn = gen_subdi3_carry;
21436 break;
21437 default:
21438 gcc_unreachable ();
21439 }
21440 }
21441 else
21442 {
21443 switch (mode)
21444 {
21445 case QImode:
21446 insn = gen_addqi3_carry;
21447 break;
21448 case HImode:
21449 insn = gen_addhi3_carry;
21450 break;
21451 case SImode:
21452 insn = gen_addsi3_carry;
21453 break;
21454 case DImode:
21455 insn = gen_adddi3_carry;
21456 break;
21457 default:
21458 gcc_unreachable ();
21459 }
21460 }
21461 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
21462
21463 return true;
21464 }
21465
21466
21467 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
21468 but works for floating point parameters and non-offsettable memories.
21469 For pushes, it returns just stack offsets; the values will be saved
21470 in the right order.  At most four parts are generated.
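/* E.g. on a 32-bit target a DFmode operand yields two SImode parts and an
   XFmode operand three, while on a 64-bit target an XFmode or TFmode operand
   yields a DImode part plus an SImode or DImode upper part.  */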
21471
21472 static int
21473 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
21474 {
21475 int size;
21476
21477 if (!TARGET_64BIT)
21478 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
21479 else
21480 size = (GET_MODE_SIZE (mode) + 4) / 8;
21481
21482 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
21483 gcc_assert (size >= 2 && size <= 4);
21484
21485 /* Optimize constant pool references into immediates.  This is used by fp
21486 moves, which force all constants to memory to allow combining.  */
21487 if (MEM_P (operand) && MEM_READONLY_P (operand))
21488 {
21489 rtx tmp = maybe_get_pool_constant (operand);
21490 if (tmp)
21491 operand = tmp;
21492 }
21493
21494 if (MEM_P (operand) && !offsettable_memref_p (operand))
21495 {
21496 /* The only non-offsettable memories we handle are pushes.  */
21497 int ok = push_operand (operand, VOIDmode);
21498
21499 gcc_assert (ok);
21500
21501 operand = copy_rtx (operand);
21502 PUT_MODE (operand, word_mode);
21503 parts[0] = parts[1] = parts[2] = parts[3] = operand;
21504 return size;
21505 }
21506
21507 if (GET_CODE (operand) == CONST_VECTOR)
21508 {
21509 enum machine_mode imode = int_mode_for_mode (mode);
21510 /* Caution: if we looked through a constant pool memory above,
21511 the operand may actually have a different mode now. That's
21512 ok, since we want to pun this all the way back to an integer. */
21513 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
21514 gcc_assert (operand != NULL);
21515 mode = imode;
21516 }
21517
21518 if (!TARGET_64BIT)
21519 {
21520 if (mode == DImode)
21521 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21522 else
21523 {
21524 int i;
21525
21526 if (REG_P (operand))
21527 {
21528 gcc_assert (reload_completed);
21529 for (i = 0; i < size; i++)
21530 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
21531 }
21532 else if (offsettable_memref_p (operand))
21533 {
21534 operand = adjust_address (operand, SImode, 0);
21535 parts[0] = operand;
21536 for (i = 1; i < size; i++)
21537 parts[i] = adjust_address (operand, SImode, 4 * i);
21538 }
21539 else if (GET_CODE (operand) == CONST_DOUBLE)
21540 {
21541 REAL_VALUE_TYPE r;
21542 long l[4];
21543
21544 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21545 switch (mode)
21546 {
21547 case TFmode:
21548 real_to_target (l, &r, mode);
21549 parts[3] = gen_int_mode (l[3], SImode);
21550 parts[2] = gen_int_mode (l[2], SImode);
21551 break;
21552 case XFmode:
21553 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21554 long double may not be 80-bit. */
21555 real_to_target (l, &r, mode);
21556 parts[2] = gen_int_mode (l[2], SImode);
21557 break;
21558 case DFmode:
21559 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21560 break;
21561 default:
21562 gcc_unreachable ();
21563 }
21564 parts[1] = gen_int_mode (l[1], SImode);
21565 parts[0] = gen_int_mode (l[0], SImode);
21566 }
21567 else
21568 gcc_unreachable ();
21569 }
21570 }
21571 else
21572 {
21573 if (mode == TImode)
21574 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21575 if (mode == XFmode || mode == TFmode)
21576 {
21577 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21578 if (REG_P (operand))
21579 {
21580 gcc_assert (reload_completed);
21581 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21582 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21583 }
21584 else if (offsettable_memref_p (operand))
21585 {
21586 operand = adjust_address (operand, DImode, 0);
21587 parts[0] = operand;
21588 parts[1] = adjust_address (operand, upper_mode, 8);
21589 }
21590 else if (GET_CODE (operand) == CONST_DOUBLE)
21591 {
21592 REAL_VALUE_TYPE r;
21593 long l[4];
21594
21595 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21596 real_to_target (l, &r, mode);
21597
21598 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21599 if (HOST_BITS_PER_WIDE_INT >= 64)
21600 parts[0]
21601 = gen_int_mode
21602 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21603 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21604 DImode);
21605 else
21606 parts[0] = immed_double_const (l[0], l[1], DImode);
21607
21608 if (upper_mode == SImode)
21609 parts[1] = gen_int_mode (l[2], SImode);
21610 else if (HOST_BITS_PER_WIDE_INT >= 64)
21611 parts[1]
21612 = gen_int_mode
21613 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21614 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21615 DImode);
21616 else
21617 parts[1] = immed_double_const (l[2], l[3], DImode);
21618 }
21619 else
21620 gcc_unreachable ();
21621 }
21622 }
21623
21624 return size;
21625 }
21626
21627 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21628 Return false when normal moves are needed; true when all required
21629 insns have been emitted. Operands 2-4 contain the input values
21630 int the correct order; operands 5-7 contain the output values. */
21631
21632 void
21633 ix86_split_long_move (rtx operands[])
21634 {
21635 rtx part[2][4];
21636 int nparts, i, j;
21637 int push = 0;
21638 int collisions = 0;
21639 enum machine_mode mode = GET_MODE (operands[0]);
21640 bool collisionparts[4];
21641
21642 /* The DFmode expanders may ask us to move a double.
21643 For a 64-bit target this is a single move.  By hiding that fact
21644 here we simplify the i386.md splitters.  */
21645 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21646 {
21647 /* Optimize constant pool references into immediates.  This is used by
21648 fp moves, which force all constants to memory to allow combining.  */
21649
21650 if (MEM_P (operands[1])
21651 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21652 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21653 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21654 if (push_operand (operands[0], VOIDmode))
21655 {
21656 operands[0] = copy_rtx (operands[0]);
21657 PUT_MODE (operands[0], word_mode);
21658 }
21659 else
21660 operands[0] = gen_lowpart (DImode, operands[0]);
21661 operands[1] = gen_lowpart (DImode, operands[1]);
21662 emit_move_insn (operands[0], operands[1]);
21663 return;
21664 }
21665
21666 /* The only non-offsettable memory we handle is push. */
21667 if (push_operand (operands[0], VOIDmode))
21668 push = 1;
21669 else
21670 gcc_assert (!MEM_P (operands[0])
21671 || offsettable_memref_p (operands[0]));
21672
21673 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21674 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21675
  /* When emitting a push, watch for source operands that live on the stack.  */
21677 if (push && MEM_P (operands[1])
21678 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21679 {
21680 rtx src_base = XEXP (part[1][nparts - 1], 0);
21681
21682 /* Compensate for the stack decrement by 4. */
21683 if (!TARGET_64BIT && nparts == 3
21684 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21685 src_base = plus_constant (Pmode, src_base, 4);
21686
21687 /* src_base refers to the stack pointer and is
21688 automatically decreased by emitted push. */
21689 for (i = 0; i < nparts; i++)
21690 part[1][i] = change_address (part[1][i],
21691 GET_MODE (part[1][i]), src_base);
21692 }
21693
  /* We need to do the copy in the right order in case an address register
     of the source overlaps the destination.  */
21696 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21697 {
21698 rtx tmp;
21699
21700 for (i = 0; i < nparts; i++)
21701 {
21702 collisionparts[i]
21703 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21704 if (collisionparts[i])
21705 collisions++;
21706 }
21707
21708 /* Collision in the middle part can be handled by reordering. */
21709 if (collisions == 1 && nparts == 3 && collisionparts [1])
21710 {
21711 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21712 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21713 }
21714 else if (collisions == 1
21715 && nparts == 4
21716 && (collisionparts [1] || collisionparts [2]))
21717 {
21718 if (collisionparts [1])
21719 {
21720 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21721 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21722 }
21723 else
21724 {
21725 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21726 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21727 }
21728 }
21729
21730 /* If there are more collisions, we can't handle it by reordering.
21731 Do an lea to the last part and use only one colliding move. */
21732 else if (collisions > 1)
21733 {
21734 rtx base;
21735
21736 collisions = 1;
21737
21738 base = part[0][nparts - 1];
21739
21740 /* Handle the case when the last part isn't valid for lea.
21741 Happens in 64-bit mode storing the 12-byte XFmode. */
21742 if (GET_MODE (base) != Pmode)
21743 base = gen_rtx_REG (Pmode, REGNO (base));
21744
21745 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21746 part[1][0] = replace_equiv_address (part[1][0], base);
21747 for (i = 1; i < nparts; i++)
21748 {
21749 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21750 part[1][i] = replace_equiv_address (part[1][i], tmp);
21751 }
21752 }
21753 }
21754
21755 if (push)
21756 {
21757 if (!TARGET_64BIT)
21758 {
21759 if (nparts == 3)
21760 {
21761 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21762 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21763 stack_pointer_rtx, GEN_INT (-4)));
21764 emit_move_insn (part[0][2], part[1][2]);
21765 }
21766 else if (nparts == 4)
21767 {
21768 emit_move_insn (part[0][3], part[1][3]);
21769 emit_move_insn (part[0][2], part[1][2]);
21770 }
21771 }
21772 else
21773 {
	  /* In 64-bit mode we don't have a 32-bit push available.  In case
	     this is a register, it is OK - we will just use the larger
	     counterpart.  We also retype memory - this comes from an attempt
	     to avoid the REX prefix when moving the second half of a TFmode
	     value.  */
21778 if (GET_MODE (part[1][1]) == SImode)
21779 {
21780 switch (GET_CODE (part[1][1]))
21781 {
21782 case MEM:
21783 part[1][1] = adjust_address (part[1][1], DImode, 0);
21784 break;
21785
21786 case REG:
21787 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21788 break;
21789
21790 default:
21791 gcc_unreachable ();
21792 }
21793
21794 if (GET_MODE (part[1][0]) == SImode)
21795 part[1][0] = part[1][1];
21796 }
21797 }
21798 emit_move_insn (part[0][1], part[1][1]);
21799 emit_move_insn (part[0][0], part[1][0]);
21800 return;
21801 }
21802
  /* Choose the correct order so as not to overwrite the source before it
     is copied.  */
21804 if ((REG_P (part[0][0])
21805 && REG_P (part[1][1])
21806 && (REGNO (part[0][0]) == REGNO (part[1][1])
21807 || (nparts == 3
21808 && REGNO (part[0][0]) == REGNO (part[1][2]))
21809 || (nparts == 4
21810 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21811 || (collisions > 0
21812 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21813 {
21814 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21815 {
21816 operands[2 + i] = part[0][j];
21817 operands[6 + i] = part[1][j];
21818 }
21819 }
21820 else
21821 {
21822 for (i = 0; i < nparts; i++)
21823 {
21824 operands[2 + i] = part[0][i];
21825 operands[6 + i] = part[1][i];
21826 }
21827 }
21828
21829 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21830 if (optimize_insn_for_size_p ())
21831 {
21832 for (j = 0; j < nparts - 1; j++)
21833 if (CONST_INT_P (operands[6 + j])
21834 && operands[6 + j] != const0_rtx
21835 && REG_P (operands[2 + j]))
21836 for (i = j; i < nparts - 1; i++)
21837 if (CONST_INT_P (operands[7 + i])
21838 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21839 operands[7 + i] = operands[2 + j];
21840 }
21841
21842 for (i = 0; i < nparts; i++)
21843 emit_move_insn (operands[2 + i], operands[6 + i]);
21844
21845 return;
21846 }
21847
21848 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21849 left shift by a constant, either using a single shift or
21850 a sequence of add instructions. */
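/* For example, a left shift by 2 of register REG may come out either as a
   single "sall $2, REG" or, when two additions are no more expensive than a
   constant shift on the target CPU (and we are not optimizing for size), as

	addl	REG, REG
	addl	REG, REG

   since adding a register to itself shifts it left by one.  The DImode
   halves on 64-bit targets use addq/salq instead.  */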
21851
21852 static void
21853 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21854 {
21855 rtx (*insn)(rtx, rtx, rtx);
21856
21857 if (count == 1
21858 || (count * ix86_cost->add <= ix86_cost->shift_const
21859 && !optimize_insn_for_size_p ()))
21860 {
21861 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21862 while (count-- > 0)
21863 emit_insn (insn (operand, operand, operand));
21864 }
21865 else
21866 {
21867 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21868 emit_insn (insn (operand, operand, GEN_INT (count)));
21869 }
21870 }
21871
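/* Split a double-word left shift OPERANDS[0] = OPERANDS[1] << OPERANDS[2]
   into operations on the two word-sized halves.  MODE is the double-word
   mode (DImode when the halves are SImode, TImode when they are DImode).
   SCRATCH, if nonzero, is an extra register that may be used when the
   shift count is not a compile-time constant.  */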
21872 void
21873 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21874 {
21875 rtx (*gen_ashl3)(rtx, rtx, rtx);
21876 rtx (*gen_shld)(rtx, rtx, rtx);
21877 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21878
21879 rtx low[2], high[2];
21880 int count;
21881
21882 if (CONST_INT_P (operands[2]))
21883 {
21884 split_double_mode (mode, operands, 2, low, high);
21885 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21886
21887 if (count >= half_width)
21888 {
21889 emit_move_insn (high[0], low[1]);
21890 emit_move_insn (low[0], const0_rtx);
21891
21892 if (count > half_width)
21893 ix86_expand_ashl_const (high[0], count - half_width, mode);
21894 }
21895 else
21896 {
21897 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21898
21899 if (!rtx_equal_p (operands[0], operands[1]))
21900 emit_move_insn (operands[0], operands[1]);
21901
21902 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21903 ix86_expand_ashl_const (low[0], count, mode);
21904 }
21905 return;
21906 }
21907
21908 split_double_mode (mode, operands, 1, low, high);
21909
21910 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21911
21912 if (operands[1] == const1_rtx)
21913 {
      /* Assuming we've chosen QImode-capable registers, 1 << N can be
	 done with two 32/64-bit shifts, no branches, no cmoves.  */
21916 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21917 {
21918 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21919
21920 ix86_expand_clear (low[0]);
21921 ix86_expand_clear (high[0]);
21922 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21923
21924 d = gen_lowpart (QImode, low[0]);
21925 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21926 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21927 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21928
21929 d = gen_lowpart (QImode, high[0]);
21930 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21931 s = gen_rtx_NE (QImode, flags, const0_rtx);
21932 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21933 }
21934
21935 /* Otherwise, we can get the same results by manually performing
21936 a bit extract operation on bit 5/6, and then performing the two
21937 shifts. The two methods of getting 0/1 into low/high are exactly
21938 the same size. Avoiding the shift in the bit extract case helps
21939 pentium4 a bit; no one else seems to care much either way. */
21940 else
21941 {
21942 enum machine_mode half_mode;
21943 rtx (*gen_lshr3)(rtx, rtx, rtx);
21944 rtx (*gen_and3)(rtx, rtx, rtx);
21945 rtx (*gen_xor3)(rtx, rtx, rtx);
21946 HOST_WIDE_INT bits;
21947 rtx x;
21948
21949 if (mode == DImode)
21950 {
21951 half_mode = SImode;
21952 gen_lshr3 = gen_lshrsi3;
21953 gen_and3 = gen_andsi3;
21954 gen_xor3 = gen_xorsi3;
21955 bits = 5;
21956 }
21957 else
21958 {
21959 half_mode = DImode;
21960 gen_lshr3 = gen_lshrdi3;
21961 gen_and3 = gen_anddi3;
21962 gen_xor3 = gen_xordi3;
21963 bits = 6;
21964 }
21965
21966 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21967 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21968 else
21969 x = gen_lowpart (half_mode, operands[2]);
21970 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21971
21972 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21973 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21974 emit_move_insn (low[0], high[0]);
21975 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21976 }
21977
21978 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21979 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21980 return;
21981 }
21982
21983 if (operands[1] == constm1_rtx)
21984 {
21985 /* For -1 << N, we can avoid the shld instruction, because we
21986 know that we're shifting 0...31/63 ones into a -1. */
21987 emit_move_insn (low[0], constm1_rtx);
21988 if (optimize_insn_for_size_p ())
21989 emit_move_insn (high[0], low[0]);
21990 else
21991 emit_move_insn (high[0], constm1_rtx);
21992 }
21993 else
21994 {
21995 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21996
21997 if (!rtx_equal_p (operands[0], operands[1]))
21998 emit_move_insn (operands[0], operands[1]);
21999
22000 split_double_mode (mode, operands, 1, low, high);
22001 emit_insn (gen_shld (high[0], low[0], operands[2]));
22002 }
22003
22004 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
22005
22006 if (TARGET_CMOVE && scratch)
22007 {
22008 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22009 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22010
22011 ix86_expand_clear (scratch);
22012 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
22013 }
22014 else
22015 {
22016 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22017 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22018
22019 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
22020 }
22021 }
22022
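/* Split a double-word arithmetic (sign-propagating) right shift
   OPERANDS[0] = OPERANDS[1] >> OPERANDS[2].  MODE and SCRATCH have the
   same meaning as in ix86_split_ashl above.  */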
22023 void
22024 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
22025 {
22026 rtx (*gen_ashr3)(rtx, rtx, rtx)
22027 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
22028 rtx (*gen_shrd)(rtx, rtx, rtx);
22029 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22030
22031 rtx low[2], high[2];
22032 int count;
22033
22034 if (CONST_INT_P (operands[2]))
22035 {
22036 split_double_mode (mode, operands, 2, low, high);
22037 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22038
22039 if (count == GET_MODE_BITSIZE (mode) - 1)
22040 {
22041 emit_move_insn (high[0], high[1]);
22042 emit_insn (gen_ashr3 (high[0], high[0],
22043 GEN_INT (half_width - 1)));
22044 emit_move_insn (low[0], high[0]);
22045
22046 }
22047 else if (count >= half_width)
22048 {
22049 emit_move_insn (low[0], high[1]);
22050 emit_move_insn (high[0], low[0]);
22051 emit_insn (gen_ashr3 (high[0], high[0],
22052 GEN_INT (half_width - 1)));
22053
22054 if (count > half_width)
22055 emit_insn (gen_ashr3 (low[0], low[0],
22056 GEN_INT (count - half_width)));
22057 }
22058 else
22059 {
22060 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22061
22062 if (!rtx_equal_p (operands[0], operands[1]))
22063 emit_move_insn (operands[0], operands[1]);
22064
22065 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22066 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
22067 }
22068 }
22069 else
22070 {
22071 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22072
22073 if (!rtx_equal_p (operands[0], operands[1]))
22074 emit_move_insn (operands[0], operands[1]);
22075
22076 split_double_mode (mode, operands, 1, low, high);
22077
22078 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22079 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
22080
22081 if (TARGET_CMOVE && scratch)
22082 {
22083 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22084 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22085
22086 emit_move_insn (scratch, high[0]);
22087 emit_insn (gen_ashr3 (scratch, scratch,
22088 GEN_INT (half_width - 1)));
22089 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22090 scratch));
22091 }
22092 else
22093 {
22094 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
22095 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
22096
22097 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
22098 }
22099 }
22100 }
22101
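/* Split a double-word logical (zero-filling) right shift
   OPERANDS[0] = OPERANDS[1] >> OPERANDS[2].  MODE and SCRATCH have the
   same meaning as in ix86_split_ashl above.  */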
22102 void
22103 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
22104 {
22105 rtx (*gen_lshr3)(rtx, rtx, rtx)
22106 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
22107 rtx (*gen_shrd)(rtx, rtx, rtx);
22108 int half_width = GET_MODE_BITSIZE (mode) >> 1;
22109
22110 rtx low[2], high[2];
22111 int count;
22112
22113 if (CONST_INT_P (operands[2]))
22114 {
22115 split_double_mode (mode, operands, 2, low, high);
22116 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
22117
22118 if (count >= half_width)
22119 {
22120 emit_move_insn (low[0], high[1]);
22121 ix86_expand_clear (high[0]);
22122
22123 if (count > half_width)
22124 emit_insn (gen_lshr3 (low[0], low[0],
22125 GEN_INT (count - half_width)));
22126 }
22127 else
22128 {
22129 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22130
22131 if (!rtx_equal_p (operands[0], operands[1]))
22132 emit_move_insn (operands[0], operands[1]);
22133
22134 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
22135 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
22136 }
22137 }
22138 else
22139 {
22140 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
22141
22142 if (!rtx_equal_p (operands[0], operands[1]))
22143 emit_move_insn (operands[0], operands[1]);
22144
22145 split_double_mode (mode, operands, 1, low, high);
22146
22147 emit_insn (gen_shrd (low[0], high[0], operands[2]));
22148 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
22149
22150 if (TARGET_CMOVE && scratch)
22151 {
22152 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
22153 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
22154
22155 ix86_expand_clear (scratch);
22156 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
22157 scratch));
22158 }
22159 else
22160 {
22161 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
22162 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
22163
22164 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
22165 }
22166 }
22167 }
22168
22169 /* Predict just emitted jump instruction to be taken with probability PROB. */
22170 static void
22171 predict_jump (int prob)
22172 {
22173 rtx insn = get_last_insn ();
22174 gcc_assert (JUMP_P (insn));
22175 add_int_reg_note (insn, REG_BR_PROB, prob);
22176 }
22177
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes; if so, jump to the returned label.  */
22180 static rtx
22181 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
22182 {
22183 rtx label = gen_label_rtx ();
22184 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
22185 if (GET_MODE (variable) == DImode)
22186 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
22187 else
22188 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
22189 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
22190 1, label);
22191 if (epilogue)
22192 predict_jump (REG_BR_PROB_BASE * 50 / 100);
22193 else
22194 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22195 return label;
22196 }
22197
/* Decrease COUNTREG by VALUE.  */
22199 static void
22200 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
22201 {
22202 rtx (*gen_add)(rtx, rtx, rtx)
22203 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
22204
22205 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
22206 }
22207
22208 /* Zero extend possibly SImode EXP to Pmode register. */
22209 rtx
22210 ix86_zero_extend_to_Pmode (rtx exp)
22211 {
22212 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
22213 }
22214
22215 /* Divide COUNTREG by SCALE. */
22216 static rtx
22217 scale_counter (rtx countreg, int scale)
22218 {
22219 rtx sc;
22220
22221 if (scale == 1)
22222 return countreg;
22223 if (CONST_INT_P (countreg))
22224 return GEN_INT (INTVAL (countreg) / scale);
22225 gcc_assert (REG_P (countreg));
22226
22227 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
22228 GEN_INT (exact_log2 (scale)),
22229 NULL, 1, OPTAB_DIRECT);
22230 return sc;
22231 }
22232
22233 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
22234 DImode for constant loop counts. */
22235
22236 static enum machine_mode
22237 counter_mode (rtx count_exp)
22238 {
22239 if (GET_MODE (count_exp) != VOIDmode)
22240 return GET_MODE (count_exp);
22241 if (!CONST_INT_P (count_exp))
22242 return Pmode;
22243 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
22244 return DImode;
22245 return SImode;
22246 }
22247
22248 /* Copy the address to a Pmode register. This is used for x32 to
22249 truncate DImode TLS address to a SImode register. */
22250
22251 static rtx
22252 ix86_copy_addr_to_reg (rtx addr)
22253 {
22254 if (GET_MODE (addr) == Pmode)
22255 return copy_addr_to_reg (addr);
22256 else
22257 {
22258 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
22259 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
22260 }
22261 }
22262
/* When ISSETMEM is FALSE, output a simple loop to copy the memory pointed to
   by SRCPTR to DESTPTR in chunks of MODE unrolled UNROLL times; the overall
   size is COUNT, specified in bytes.  When ISSETMEM is TRUE, output the
   equivalent loop that sets memory to VALUE (supposed to be in MODE).

   The size is rounded down to a whole number of chunks moved at once.
   SRCMEM and DESTMEM provide the MEM rtx to feed proper aliasing info.  */
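/* In pseudocode, and assuming no unrolling, the emitted sequence is roughly:

     size = COUNT & ~(chunk_size - 1);
     iter = 0;
     do
       {
	 copy (or set) one MODE-sized chunk at DESTPTR + iter;
	 iter += chunk_size;
       }
     while (iter < size);
     DESTPTR += iter;   (and SRCPTR += iter in the copy case)  */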
22270
22271
22272 static void
22273 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
22274 rtx destptr, rtx srcptr, rtx value,
22275 rtx count, enum machine_mode mode, int unroll,
22276 int expected_size, bool issetmem)
22277 {
22278 rtx out_label, top_label, iter, tmp;
22279 enum machine_mode iter_mode = counter_mode (count);
22280 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
22281 rtx piece_size = GEN_INT (piece_size_n);
22282 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
22283 rtx size;
22284 int i;
22285
22286 top_label = gen_label_rtx ();
22287 out_label = gen_label_rtx ();
22288 iter = gen_reg_rtx (iter_mode);
22289
22290 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
22291 NULL, 1, OPTAB_DIRECT);
22292 /* Those two should combine. */
22293 if (piece_size == const1_rtx)
22294 {
22295 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
22296 true, out_label);
22297 predict_jump (REG_BR_PROB_BASE * 10 / 100);
22298 }
22299 emit_move_insn (iter, const0_rtx);
22300
22301 emit_label (top_label);
22302
22303 tmp = convert_modes (Pmode, iter_mode, iter, true);
22304
  /* This assert could be relaxed - in that case we'd need to compute
     the largest power of two that divides PIECE_SIZE_N and pass it to
     offset_address.  */
22308 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
22309 destmem = offset_address (destmem, tmp, piece_size_n);
22310 destmem = adjust_address (destmem, mode, 0);
22311
22312 if (!issetmem)
22313 {
22314 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
22315 srcmem = adjust_address (srcmem, mode, 0);
22316
      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using a single temporary.
	 Using 4 temporaries is also overkill in 32-bit mode.  */
22320 if (!TARGET_64BIT && 0)
22321 {
22322 for (i = 0; i < unroll; i++)
22323 {
22324 if (i)
22325 {
22326 destmem =
22327 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22328 srcmem =
22329 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22330 }
22331 emit_move_insn (destmem, srcmem);
22332 }
22333 }
22334 else
22335 {
22336 rtx tmpreg[4];
22337 gcc_assert (unroll <= 4);
22338 for (i = 0; i < unroll; i++)
22339 {
22340 tmpreg[i] = gen_reg_rtx (mode);
22341 if (i)
22342 {
22343 srcmem =
22344 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
22345 }
22346 emit_move_insn (tmpreg[i], srcmem);
22347 }
22348 for (i = 0; i < unroll; i++)
22349 {
22350 if (i)
22351 {
22352 destmem =
22353 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22354 }
22355 emit_move_insn (destmem, tmpreg[i]);
22356 }
22357 }
22358 }
22359 else
22360 for (i = 0; i < unroll; i++)
22361 {
22362 if (i)
22363 destmem =
22364 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
22365 emit_move_insn (destmem, value);
22366 }
22367
22368 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
22369 true, OPTAB_LIB_WIDEN);
22370 if (tmp != iter)
22371 emit_move_insn (iter, tmp);
22372
22373 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
22374 true, top_label);
22375 if (expected_size != -1)
22376 {
22377 expected_size /= GET_MODE_SIZE (mode) * unroll;
22378 if (expected_size == 0)
22379 predict_jump (0);
22380 else if (expected_size > REG_BR_PROB_BASE)
22381 predict_jump (REG_BR_PROB_BASE - 1);
22382 else
22383 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
22384 }
22385 else
22386 predict_jump (REG_BR_PROB_BASE * 80 / 100);
22387 iter = ix86_zero_extend_to_Pmode (iter);
22388 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
22389 true, OPTAB_LIB_WIDEN);
22390 if (tmp != destptr)
22391 emit_move_insn (destptr, tmp);
22392 if (!issetmem)
22393 {
22394 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
22395 true, OPTAB_LIB_WIDEN);
22396 if (tmp != srcptr)
22397 emit_move_insn (srcptr, tmp);
22398 }
22399 emit_label (out_label);
22400 }
22401
/* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM
   argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   In the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have the same meaning as for the previous function.  */
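/* For instance, a QImode copy whose constant byte count is a multiple of 4
   is promoted to SImode chunks and is emitted roughly as

	movl	$count/4, %ecx
	rep movsl

   with %esi/%edi holding the source and destination pointers, whereas the
   general QImode case uses "rep movsb"; the setmem path emits "rep stos"
   analogously.  The register names above are only illustrative.  */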
22408
22409 static void
22410 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
22411 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
22412 rtx count,
22413 enum machine_mode mode, bool issetmem)
22414 {
22415 rtx destexp;
22416 rtx srcexp;
22417 rtx countreg;
22418 HOST_WIDE_INT rounded_count;
22419
22420 /* If possible, it is shorter to use rep movs.
22421 TODO: Maybe it is better to move this logic to decide_alg. */
22422 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
22423 && (!issetmem || orig_value == const0_rtx))
22424 mode = SImode;
22425
22426 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
22427 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
22428
22429 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
22430 GET_MODE_SIZE (mode)));
22431 if (mode != QImode)
22432 {
22433 destexp = gen_rtx_ASHIFT (Pmode, countreg,
22434 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22435 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
22436 }
22437 else
22438 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
22439 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
22440 {
22441 rounded_count = (INTVAL (count)
22442 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22443 destmem = shallow_copy_rtx (destmem);
22444 set_mem_size (destmem, rounded_count);
22445 }
22446 else if (MEM_SIZE_KNOWN_P (destmem))
22447 clear_mem_size (destmem);
22448
22449 if (issetmem)
22450 {
22451 value = force_reg (mode, gen_lowpart (mode, value));
22452 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
22453 }
22454 else
22455 {
22456 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
22457 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
22458 if (mode != QImode)
22459 {
22460 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
22461 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
22462 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
22463 }
22464 else
22465 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
22466 if (CONST_INT_P (count))
22467 {
22468 rounded_count = (INTVAL (count)
22469 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
22470 srcmem = shallow_copy_rtx (srcmem);
22471 set_mem_size (srcmem, rounded_count);
22472 }
22473 else
22474 {
22475 if (MEM_SIZE_KNOWN_P (srcmem))
22476 clear_mem_size (srcmem);
22477 }
22478 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
22479 destexp, srcexp));
22480 }
22481 }
22482
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRCMEM is passed by pointer to be updated on return.
   The return value is the updated DESTMEM.  */
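/* For example, SIZE_TO_MOVE == 8 on a 64-bit target comes out as a single
   DImode load into a temporary register followed by a DImode store, with
   DESTPTR and SRCPTR both advanced by 8.  */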
22487 static rtx
22488 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
22489 HOST_WIDE_INT size_to_move)
22490 {
22491 rtx dst = destmem, src = *srcmem, adjust, tempreg;
22492 enum insn_code code;
22493 enum machine_mode move_mode;
22494 int piece_size, i;
22495
  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 not larger than SIZE_TO_MOVE and
     halve it until a move of such size is supported.  */
22499 piece_size = 1 << floor_log2 (size_to_move);
22500 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22501 code = optab_handler (mov_optab, move_mode);
22502 while (code == CODE_FOR_nothing && piece_size > 1)
22503 {
22504 piece_size >>= 1;
22505 move_mode = mode_for_size (piece_size * BITS_PER_UNIT, MODE_INT, 0);
22506 code = optab_handler (mov_optab, move_mode);
22507 }
22508
22509 /* Find the corresponding vector mode with the same size as MOVE_MODE.
22510 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
22511 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
22512 {
22513 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
22514 move_mode = mode_for_vector (word_mode, nunits);
22515 code = optab_handler (mov_optab, move_mode);
22516 if (code == CODE_FOR_nothing)
22517 {
22518 move_mode = word_mode;
22519 piece_size = GET_MODE_SIZE (move_mode);
22520 code = optab_handler (mov_optab, move_mode);
22521 }
22522 }
22523 gcc_assert (code != CODE_FOR_nothing);
22524
22525 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22526 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
22527
22528 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
22529 gcc_assert (size_to_move % piece_size == 0);
22530 adjust = GEN_INT (piece_size);
22531 for (i = 0; i < size_to_move; i += piece_size)
22532 {
22533 /* We move from memory to memory, so we'll need to do it via
22534 a temporary register. */
22535 tempreg = gen_reg_rtx (move_mode);
22536 emit_insn (GEN_FCN (code) (tempreg, src));
22537 emit_insn (GEN_FCN (code) (dst, tempreg));
22538
22539 emit_move_insn (destptr,
22540 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22541 emit_move_insn (srcptr,
22542 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
22543
22544 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22545 piece_size);
22546 src = adjust_automodify_address_nv (src, move_mode, srcptr,
22547 piece_size);
22548 }
22549
22550 /* Update DST and SRC rtx. */
22551 *srcmem = src;
22552 return dst;
22553 }
22554
22555 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
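/* For example, with a constant COUNT, MAX_SIZE == 16 and COUNT % 16 == 11,
   this emits one 8-byte, one 2-byte and one 1-byte move (11 = 8 + 2 + 1).  */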
22556 static void
22557 expand_movmem_epilogue (rtx destmem, rtx srcmem,
22558 rtx destptr, rtx srcptr, rtx count, int max_size)
22559 {
22560 rtx src, dest;
22561 if (CONST_INT_P (count))
22562 {
22563 HOST_WIDE_INT countval = INTVAL (count);
22564 HOST_WIDE_INT epilogue_size = countval % max_size;
22565 int i;
22566
22567 /* For now MAX_SIZE should be a power of 2. This assert could be
22568 relaxed, but it'll require a bit more complicated epilogue
22569 expanding. */
22570 gcc_assert ((max_size & (max_size - 1)) == 0);
22571 for (i = max_size; i >= 1; i >>= 1)
22572 {
22573 if (epilogue_size & i)
22574 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22575 }
22576 return;
22577 }
22578 if (max_size > 8)
22579 {
22580 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22581 count, 1, OPTAB_DIRECT);
22582 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22583 count, QImode, 1, 4, false);
22584 return;
22585 }
22586
  /* When single stringops are available, we can cheaply advance the dest
     and src pointers.  Otherwise we save code size by maintaining an offset
     (zero is readily available from the preceding rep operation) and using
     x86 addressing modes.  */
22591 if (TARGET_SINGLE_STRINGOP)
22592 {
22593 if (max_size > 4)
22594 {
22595 rtx label = ix86_expand_aligntest (count, 4, true);
22596 src = change_address (srcmem, SImode, srcptr);
22597 dest = change_address (destmem, SImode, destptr);
22598 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22599 emit_label (label);
22600 LABEL_NUSES (label) = 1;
22601 }
22602 if (max_size > 2)
22603 {
22604 rtx label = ix86_expand_aligntest (count, 2, true);
22605 src = change_address (srcmem, HImode, srcptr);
22606 dest = change_address (destmem, HImode, destptr);
22607 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22608 emit_label (label);
22609 LABEL_NUSES (label) = 1;
22610 }
22611 if (max_size > 1)
22612 {
22613 rtx label = ix86_expand_aligntest (count, 1, true);
22614 src = change_address (srcmem, QImode, srcptr);
22615 dest = change_address (destmem, QImode, destptr);
22616 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22617 emit_label (label);
22618 LABEL_NUSES (label) = 1;
22619 }
22620 }
22621 else
22622 {
22623 rtx offset = force_reg (Pmode, const0_rtx);
22624 rtx tmp;
22625
22626 if (max_size > 4)
22627 {
22628 rtx label = ix86_expand_aligntest (count, 4, true);
22629 src = change_address (srcmem, SImode, srcptr);
22630 dest = change_address (destmem, SImode, destptr);
22631 emit_move_insn (dest, src);
22632 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22633 true, OPTAB_LIB_WIDEN);
22634 if (tmp != offset)
22635 emit_move_insn (offset, tmp);
22636 emit_label (label);
22637 LABEL_NUSES (label) = 1;
22638 }
22639 if (max_size > 2)
22640 {
22641 rtx label = ix86_expand_aligntest (count, 2, true);
22642 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22643 src = change_address (srcmem, HImode, tmp);
22644 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22645 dest = change_address (destmem, HImode, tmp);
22646 emit_move_insn (dest, src);
22647 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22648 true, OPTAB_LIB_WIDEN);
22649 if (tmp != offset)
22650 emit_move_insn (offset, tmp);
22651 emit_label (label);
22652 LABEL_NUSES (label) = 1;
22653 }
22654 if (max_size > 1)
22655 {
22656 rtx label = ix86_expand_aligntest (count, 1, true);
22657 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22658 src = change_address (srcmem, QImode, tmp);
22659 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22660 dest = change_address (destmem, QImode, tmp);
22661 emit_move_insn (dest, src);
22662 emit_label (label);
22663 LABEL_NUSES (label) = 1;
22664 }
22665 }
22666 }
22667
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   The return value is the updated DESTMEM.  */
22672 static rtx
22673 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
22674 HOST_WIDE_INT size_to_move)
22675 {
22676 rtx dst = destmem, adjust;
22677 enum insn_code code;
22678 enum machine_mode move_mode;
22679 int piece_size, i;
22680
  /* Find the widest mode in which we could perform moves.
     Start with the mode of PROMOTED_VAL and reduce it if SIZE_TO_MOVE is
     smaller than that.  */
22684 move_mode = GET_MODE (promoted_val);
22685 if (move_mode == VOIDmode)
22686 move_mode = QImode;
22687 if (size_to_move < GET_MODE_SIZE (move_mode))
22688 {
22689 move_mode = mode_for_size (size_to_move * BITS_PER_UNIT, MODE_INT, 0);
22690 promoted_val = gen_lowpart (move_mode, promoted_val);
22691 }
22692 piece_size = GET_MODE_SIZE (move_mode);
22693 code = optab_handler (mov_optab, move_mode);
22694 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
22695
22696 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
22697
22698 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
22699 gcc_assert (size_to_move % piece_size == 0);
22700 adjust = GEN_INT (piece_size);
22701 for (i = 0; i < size_to_move; i += piece_size)
22702 {
22703 if (piece_size <= GET_MODE_SIZE (word_mode))
22704 {
22705 emit_insn (gen_strset (destptr, dst, promoted_val));
22706 continue;
22707 }
22708
22709 emit_insn (GEN_FCN (code) (dst, promoted_val));
22710
22711 emit_move_insn (destptr,
22712 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
22713
22714 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
22715 piece_size);
22716 }
22717
22718 /* Update DST rtx. */
22719 return dst;
22720 }
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
22722 static void
22723 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22724 rtx count, int max_size)
22725 {
22726 count =
22727 expand_simple_binop (counter_mode (count), AND, count,
22728 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22729 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22730 gen_lowpart (QImode, value), count, QImode,
22731 1, max_size / 2, true);
22732 }
22733
/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
22735 static void
22736 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
22737 rtx count, int max_size)
22738 {
22739 rtx dest;
22740
22741 if (CONST_INT_P (count))
22742 {
22743 HOST_WIDE_INT countval = INTVAL (count);
22744 HOST_WIDE_INT epilogue_size = countval % max_size;
22745 int i;
22746
22747 /* For now MAX_SIZE should be a power of 2. This assert could be
22748 relaxed, but it'll require a bit more complicated epilogue
22749 expanding. */
22750 gcc_assert ((max_size & (max_size - 1)) == 0);
22751 for (i = max_size; i >= 1; i >>= 1)
22752 {
22753 if (epilogue_size & i)
22754 {
22755 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22756 destmem = emit_memset (destmem, destptr, vec_value, i);
22757 else
22758 destmem = emit_memset (destmem, destptr, value, i);
22759 }
22760 }
22761 return;
22762 }
22763 if (max_size > 32)
22764 {
22765 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22766 return;
22767 }
22768 if (max_size > 16)
22769 {
22770 rtx label = ix86_expand_aligntest (count, 16, true);
22771 if (TARGET_64BIT)
22772 {
22773 dest = change_address (destmem, DImode, destptr);
22774 emit_insn (gen_strset (destptr, dest, value));
22775 emit_insn (gen_strset (destptr, dest, value));
22776 }
22777 else
22778 {
22779 dest = change_address (destmem, SImode, destptr);
22780 emit_insn (gen_strset (destptr, dest, value));
22781 emit_insn (gen_strset (destptr, dest, value));
22782 emit_insn (gen_strset (destptr, dest, value));
22783 emit_insn (gen_strset (destptr, dest, value));
22784 }
22785 emit_label (label);
22786 LABEL_NUSES (label) = 1;
22787 }
22788 if (max_size > 8)
22789 {
22790 rtx label = ix86_expand_aligntest (count, 8, true);
22791 if (TARGET_64BIT)
22792 {
22793 dest = change_address (destmem, DImode, destptr);
22794 emit_insn (gen_strset (destptr, dest, value));
22795 }
22796 else
22797 {
22798 dest = change_address (destmem, SImode, destptr);
22799 emit_insn (gen_strset (destptr, dest, value));
22800 emit_insn (gen_strset (destptr, dest, value));
22801 }
22802 emit_label (label);
22803 LABEL_NUSES (label) = 1;
22804 }
22805 if (max_size > 4)
22806 {
22807 rtx label = ix86_expand_aligntest (count, 4, true);
22808 dest = change_address (destmem, SImode, destptr);
22809 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22810 emit_label (label);
22811 LABEL_NUSES (label) = 1;
22812 }
22813 if (max_size > 2)
22814 {
22815 rtx label = ix86_expand_aligntest (count, 2, true);
22816 dest = change_address (destmem, HImode, destptr);
22817 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22818 emit_label (label);
22819 LABEL_NUSES (label) = 1;
22820 }
22821 if (max_size > 1)
22822 {
22823 rtx label = ix86_expand_aligntest (count, 1, true);
22824 dest = change_address (destmem, QImode, destptr);
22825 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22826 emit_label (label);
22827 LABEL_NUSES (label) = 1;
22828 }
22829 }
22830
/* Depending on ISSETMEM, copy enough bytes from SRCMEM to DESTMEM, or set
   enough bytes of DESTMEM, to align it to DESIRED_ALIGNMENT.  The original
   alignment is ALIGN.
   Depending on ISSETMEM, either the arguments SRCMEM/SRCPTR or
   VALUE/VEC_VALUE are ignored.
   The return value is the updated DESTMEM.  */
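/* For example, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 this emits three
   conditional blocks that copy (or set) 1, 2 and 4 bytes respectively, each
   guarded by a test of the corresponding bit of DESTPTR, after which DESTPTR
   is known to be 8-byte aligned and COUNT has been decreased accordingly.  */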
22836 static rtx
22837 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
22838 rtx destptr, rtx srcptr, rtx value,
22839 rtx vec_value, rtx count, int align,
22840 int desired_alignment, bool issetmem)
22841 {
22842 int i;
22843 for (i = 1; i < desired_alignment; i <<= 1)
22844 {
22845 if (align <= i)
22846 {
22847 rtx label = ix86_expand_aligntest (destptr, i, false);
22848 if (issetmem)
22849 {
22850 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
22851 destmem = emit_memset (destmem, destptr, vec_value, i);
22852 else
22853 destmem = emit_memset (destmem, destptr, value, i);
22854 }
22855 else
22856 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
22857 ix86_adjust_counter (count, i);
22858 emit_label (label);
22859 LABEL_NUSES (label) = 1;
22860 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
22861 }
22862 }
22863 return destmem;
22864 }
22865
/* Test if COUNT & SIZE is nonzero and if so, expand a movmem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
22869 static void
22870 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
22871 rtx destptr, rtx srcptr,
22872 rtx value, rtx vec_value,
22873 rtx count, int size,
22874 rtx done_label, bool issetmem)
22875 {
22876 rtx label = ix86_expand_aligntest (count, size, false);
22877 enum machine_mode mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 1);
22878 rtx modesize;
22879 int n;
22880
  /* If we do not have a vector value to copy, we must reduce the size.  */
22882 if (issetmem)
22883 {
22884 if (!vec_value)
22885 {
22886 if (GET_MODE (value) == VOIDmode && size > 8)
22887 mode = Pmode;
22888 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
22889 mode = GET_MODE (value);
22890 }
22891 else
22892 mode = GET_MODE (vec_value), value = vec_value;
22893 }
22894 else
22895 {
22896 /* Choose appropriate vector mode. */
22897 if (size >= 32)
22898 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
22899 else if (size >= 16)
22900 mode = TARGET_SSE ? V16QImode : DImode;
22901 srcmem = change_address (srcmem, mode, srcptr);
22902 }
22903 destmem = change_address (destmem, mode, destptr);
22904 modesize = GEN_INT (GET_MODE_SIZE (mode));
22905 gcc_assert (GET_MODE_SIZE (mode) <= size);
22906 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
22907 {
22908 if (issetmem)
22909 emit_move_insn (destmem, gen_lowpart (mode, value));
22910 else
22911 {
22912 emit_move_insn (destmem, srcmem);
22913 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
22914 }
22915 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
22916 }
22917
22918 destmem = offset_address (destmem, count, 1);
22919 destmem = offset_address (destmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22920 GET_MODE_SIZE (mode));
22921 if (issetmem)
22922 emit_move_insn (destmem, gen_lowpart (mode, value));
22923 else
22924 {
22925 srcmem = offset_address (srcmem, count, 1);
22926 srcmem = offset_address (srcmem, GEN_INT (-size - GET_MODE_SIZE (mode)),
22927 GET_MODE_SIZE (mode));
22928 emit_move_insn (destmem, srcmem);
22929 }
22930 emit_jump_insn (gen_jump (done_label));
22931 emit_barrier ();
22932
22933 emit_label (label);
22934 LABEL_NUSES (label) = 1;
22935 }
22936
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying
   SIZE bytes at once.  Do the moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is
   created on demand if *DONE_LABEL is NULL.
   MIN_SIZE is the minimal size of the block copied.  This value gets adjusted
   for the new bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.
22949
22950 In pseudocode we do:
22951
22952 if (COUNT < SIZE)
22953 {
22954 Assume that SIZE is 4. Bigger sizes are handled analogously
22955 if (COUNT & 4)
22956 {
22957 copy 4 bytes from SRCPTR to DESTPTR
22958 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
22959 goto done_label
22960 }
22961 if (!COUNT)
22962 goto done_label;
22963 copy 1 byte from SRCPTR to DESTPTR
22964 if (COUNT & 2)
22965 {
22966 copy 2 bytes from SRCPTR to DESTPTR
22967 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
22968 }
22969 }
22970 else
22971 {
22972 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
22973 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
22974
     OLD_DESTPTR = DESTPTR;
22976 Align DESTPTR up to DESIRED_ALIGN
22977 SRCPTR += DESTPTR - OLD_DESTPTR
     COUNT -= DESTPTR - OLD_DESTPTR
22979 if (DYNAMIC_CHECK)
22980 Round COUNT down to multiple of SIZE
22981 << optional caller supplied zero size guard is here >>
        << optional caller supplied dynamic check is here >>
22983 << caller supplied main copy loop is here >>
22984 }
22985 done_label:
22986 */
22987 static void
22988 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
22989 rtx *destptr, rtx *srcptr,
22990 enum machine_mode mode,
22991 rtx value, rtx vec_value,
22992 rtx *count,
22993 rtx *done_label,
22994 int size,
22995 int desired_align,
22996 int align,
22997 unsigned HOST_WIDE_INT *min_size,
22998 bool dynamic_check,
22999 bool issetmem)
23000 {
23001 rtx loop_label = NULL, label;
23002 int n;
23003 rtx modesize;
23004 int prolog_size = 0;
23005 rtx mode_value;
23006
  /* Choose the proper value to copy.  */
23008 if (issetmem && VECTOR_MODE_P (mode))
23009 mode_value = vec_value;
23010 else
23011 mode_value = value;
23012 gcc_assert (GET_MODE_SIZE (mode) <= size);
23013
23014 /* See if block is big or small, handle small blocks. */
23015 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
23016 {
23017 int size2 = size;
23018 loop_label = gen_label_rtx ();
23019
23020 if (!*done_label)
23021 *done_label = gen_label_rtx ();
23022
23023 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
23024 1, loop_label);
23025 size2 >>= 1;
23026
23027 /* Handle sizes > 3. */
23028 for (;size2 > 2; size2 >>= 1)
23029 expand_small_movmem_or_setmem (destmem, srcmem,
23030 *destptr, *srcptr,
23031 value, vec_value,
23032 *count,
23033 size2, *done_label, issetmem);
23034 /* Nothing to copy? Jump to DONE_LABEL if so */
23035 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
23036 1, *done_label);
23037
23038 /* Do a byte copy. */
23039 destmem = change_address (destmem, QImode, *destptr);
23040 if (issetmem)
23041 emit_move_insn (destmem, gen_lowpart (QImode, value));
23042 else
23043 {
23044 srcmem = change_address (srcmem, QImode, *srcptr);
23045 emit_move_insn (destmem, srcmem);
23046 }
23047
23048 /* Handle sizes 2 and 3. */
23049 label = ix86_expand_aligntest (*count, 2, false);
23050 destmem = change_address (destmem, HImode, *destptr);
23051 destmem = offset_address (destmem, *count, 1);
23052 destmem = offset_address (destmem, GEN_INT (-2), 2);
23053 if (issetmem)
23054 emit_move_insn (destmem, gen_lowpart (HImode, value));
23055 else
23056 {
23057 srcmem = change_address (srcmem, HImode, *srcptr);
23058 srcmem = offset_address (srcmem, *count, 1);
23059 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
23060 emit_move_insn (destmem, srcmem);
23061 }
23062
23063 emit_label (label);
23064 LABEL_NUSES (label) = 1;
23065 emit_jump_insn (gen_jump (*done_label));
23066 emit_barrier ();
23067 }
23068 else
23069 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
23070 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
23071
23072 /* Start memcpy for COUNT >= SIZE. */
23073 if (loop_label)
23074 {
23075 emit_label (loop_label);
23076 LABEL_NUSES (loop_label) = 1;
23077 }
23078
23079 /* Copy first desired_align bytes. */
23080 if (!issetmem)
23081 srcmem = change_address (srcmem, mode, *srcptr);
23082 destmem = change_address (destmem, mode, *destptr);
23083 modesize = GEN_INT (GET_MODE_SIZE (mode));
23084 for (n = 0; prolog_size < desired_align - align; n++)
23085 {
23086 if (issetmem)
23087 emit_move_insn (destmem, mode_value);
23088 else
23089 {
23090 emit_move_insn (destmem, srcmem);
23091 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
23092 }
23093 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
23094 prolog_size += GET_MODE_SIZE (mode);
23095 }
23096
23097
23098 /* Copy last SIZE bytes. */
23099 destmem = offset_address (destmem, *count, 1);
23100 destmem = offset_address (destmem,
23101 GEN_INT (-size - prolog_size),
23102 1);
23103 if (issetmem)
23104 emit_move_insn (destmem, mode_value);
23105 else
23106 {
23107 srcmem = offset_address (srcmem, *count, 1);
23108 srcmem = offset_address (srcmem,
23109 GEN_INT (-size - prolog_size),
23110 1);
23111 emit_move_insn (destmem, srcmem);
23112 }
23113 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
23114 {
23115 destmem = offset_address (destmem, modesize, 1);
23116 if (issetmem)
23117 emit_move_insn (destmem, mode_value);
23118 else
23119 {
23120 srcmem = offset_address (srcmem, modesize, 1);
23121 emit_move_insn (destmem, srcmem);
23122 }
23123 }
23124
23125 /* Align destination. */
23126 if (desired_align > 1 && desired_align > align)
23127 {
23128 rtx saveddest = *destptr;
23129
23130 gcc_assert (desired_align <= size);
      /* Align destptr up, place it in a new register.  */
23132 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
23133 GEN_INT (prolog_size),
23134 NULL_RTX, 1, OPTAB_DIRECT);
23135 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
23136 GEN_INT (-desired_align),
23137 *destptr, 1, OPTAB_DIRECT);
23138 /* See how many bytes we skipped. */
23139 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
23140 *destptr,
23141 saveddest, 1, OPTAB_DIRECT);
23142 /* Adjust srcptr and count. */
23143 if (!issetmem)
23144 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr, saveddest,
23145 *srcptr, 1, OPTAB_DIRECT);
23146 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23147 saveddest, *count, 1, OPTAB_DIRECT);
23148 /* We copied at most size + prolog_size. */
23149 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
23150 *min_size = (*min_size - size) & ~(unsigned HOST_WIDE_INT)(size - 1);
23151 else
23152 *min_size = 0;
23153
      /* Our loops always round down the block size, but for dispatch to
	 library we need the precise value.  */
23156 if (dynamic_check)
23157 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
23158 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
23159 }
23160 else
23161 {
23162 gcc_assert (prolog_size == 0);
23163 /* Decrease count, so we won't end up copying last word twice. */
23164 if (!CONST_INT_P (*count))
23165 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
23166 constm1_rtx, *count, 1, OPTAB_DIRECT);
23167 else
23168 *count = GEN_INT ((UINTVAL (*count) - 1) & ~(unsigned HOST_WIDE_INT)(size - 1));
23169 if (*min_size)
23170 *min_size = (*min_size - 1) & ~(unsigned HOST_WIDE_INT)(size - 1);
23171 }
23172 }
23173
23174
23175 /* This function is like the previous one, except here we know how many bytes
23176 need to be copied. That allows us to update alignment not only of DST, which
23177 is returned, but also of SRC, which is passed as a pointer for that
23178 reason. */
23179 static rtx
23180 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
23181 rtx srcreg, rtx value, rtx vec_value,
23182 int desired_align, int align_bytes,
23183 bool issetmem)
23184 {
23185 rtx src = NULL;
23186 rtx orig_dst = dst;
23187 rtx orig_src = NULL;
23188 int piece_size = 1;
23189 int copied_bytes = 0;
23190
23191 if (!issetmem)
23192 {
23193 gcc_assert (srcp != NULL);
23194 src = *srcp;
23195 orig_src = src;
23196 }
23197
23198 for (piece_size = 1;
23199 piece_size <= desired_align && copied_bytes < align_bytes;
23200 piece_size <<= 1)
23201 {
23202 if (align_bytes & piece_size)
23203 {
23204 if (issetmem)
23205 {
23206 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
23207 dst = emit_memset (dst, destreg, vec_value, piece_size);
23208 else
23209 dst = emit_memset (dst, destreg, value, piece_size);
23210 }
23211 else
23212 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
23213 copied_bytes += piece_size;
23214 }
23215 }
23216 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
23217 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23218 if (MEM_SIZE_KNOWN_P (orig_dst))
23219 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
23220
23221 if (!issetmem)
23222 {
23223 int src_align_bytes = get_mem_align_offset (src, desired_align
23224 * BITS_PER_UNIT);
23225 if (src_align_bytes >= 0)
23226 src_align_bytes = desired_align - src_align_bytes;
23227 if (src_align_bytes >= 0)
23228 {
23229 unsigned int src_align;
23230 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
23231 {
23232 if ((src_align_bytes & (src_align - 1))
23233 == (align_bytes & (src_align - 1)))
23234 break;
23235 }
23236 if (src_align > (unsigned int) desired_align)
23237 src_align = desired_align;
23238 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
23239 set_mem_align (src, src_align * BITS_PER_UNIT);
23240 }
23241 if (MEM_SIZE_KNOWN_P (orig_src))
23242 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
23243 *srcp = src;
23244 }
23245
23246 return dst;
23247 }
23248
23249 /* Return true if ALG can be used in current context.
23250 Assume we expand memset if MEMSET is true. */
23251 static bool
23252 alg_usable_p (enum stringop_alg alg, bool memset)
23253 {
23254 if (alg == no_stringop)
23255 return false;
23256 if (alg == vector_loop)
23257 return TARGET_SSE || TARGET_AVX;
23258 /* Algorithms using the rep prefix want at least edi and ecx;
23259 additionally, memset wants eax and memcpy wants esi. Don't
23260 consider such algorithms if the user has appropriated those
23261 registers for their own purposes. */
23262 if (alg == rep_prefix_1_byte
23263 || alg == rep_prefix_4_byte
23264 || alg == rep_prefix_8_byte)
23265 return !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
23266 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
23267 return true;
23268 }
23269
23270 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
23271 static enum stringop_alg
23272 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
23273 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
23274 bool memset, bool zero_memset, int *dynamic_check, bool *noalign)
23275 {
23276 const struct stringop_algs * algs;
23277 bool optimize_for_speed;
23278 int max = -1;
23279 const struct processor_costs *cost;
23280 int i;
23281 bool any_alg_usable_p = false;
23282
23283 *noalign = false;
23284 *dynamic_check = -1;
23285
23286 /* Even if the string operation call is cold, we still might spend a lot
23287 of time processing large blocks. */
23288 if (optimize_function_for_size_p (cfun)
23289 || (optimize_insn_for_size_p ()
23290 && (max_size < 256
23291 || (expected_size != -1 && expected_size < 256))))
23292 optimize_for_speed = false;
23293 else
23294 optimize_for_speed = true;
23295
23296 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
23297 if (memset)
23298 algs = &cost->memset[TARGET_64BIT != 0];
23299 else
23300 algs = &cost->memcpy[TARGET_64BIT != 0];
23301
23302 /* See maximal size for user defined algorithm. */
23303 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23304 {
23305 enum stringop_alg candidate = algs->size[i].alg;
23306 bool usable = alg_usable_p (candidate, memset);
23307 any_alg_usable_p |= usable;
23308
23309 if (candidate != libcall && candidate && usable)
23310 max = algs->size[i].max;
23311 }
23312
  /* If the expected size is not known but the max size is small enough
     that the inline version is a win, set the expected size into
     the range.  */
23316 if (max > 1 && (unsigned HOST_WIDE_INT)max >= max_size && expected_size == -1)
23317 expected_size = min_size / 2 + max_size / 2;
23318
  /* If the user specified the algorithm, honor it if possible.  */
23320 if (ix86_stringop_alg != no_stringop
23321 && alg_usable_p (ix86_stringop_alg, memset))
23322 return ix86_stringop_alg;
23323 /* rep; movq or rep; movl is the smallest variant. */
23324 else if (!optimize_for_speed)
23325 {
23326 *noalign = true;
23327 if (!count || (count & 3) || (memset && !zero_memset))
23328 return alg_usable_p (rep_prefix_1_byte, memset)
23329 ? rep_prefix_1_byte : loop_1_byte;
23330 else
23331 return alg_usable_p (rep_prefix_4_byte, memset)
23332 ? rep_prefix_4_byte : loop;
23333 }
  /* Very tiny blocks are best handled via the loop; REP is expensive to
     set up.  */
23336 else if (expected_size != -1 && expected_size < 4)
23337 return loop_1_byte;
23338 else if (expected_size != -1)
23339 {
23340 enum stringop_alg alg = libcall;
23341 bool alg_noalign = false;
23342 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
23343 {
23344 /* We get here if the algorithms that were not libcall-based
23345 were rep-prefix based and we are unable to use rep prefixes
23346 based on global register usage. Break out of the loop and
23347 use the heuristic below. */
23348 if (algs->size[i].max == 0)
23349 break;
23350 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
23351 {
23352 enum stringop_alg candidate = algs->size[i].alg;
23353
23354 if (candidate != libcall && alg_usable_p (candidate, memset))
23355 {
23356 alg = candidate;
23357 alg_noalign = algs->size[i].noalign;
23358 }
23359 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
23360 last non-libcall inline algorithm. */
23361 if (TARGET_INLINE_ALL_STRINGOPS)
23362 {
23363 /* When the current size is best to be copied by a libcall,
23364 but we are still forced to inline, run the heuristic below
23365 that will pick code for medium sized blocks. */
23366 if (alg != libcall)
23367 {
23368 *noalign = alg_noalign;
23369 return alg;
23370 }
23371 break;
23372 }
23373 else if (alg_usable_p (candidate, memset))
23374 {
23375 *noalign = algs->size[i].noalign;
23376 return candidate;
23377 }
23378 }
23379 }
23380 }
23381 /* When asked to inline the call anyway, try to pick a meaningful choice.
23382 We look for the maximal size of block that is faster to copy by hand and
23383 take blocks of at most that size, guessing that the average size will
23384 be roughly half of the maximum.
23385 
23386 If this turns out to be bad, we might simply specify the preferred
23387 choice in ix86_costs. */
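/* For example (illustrative numbers only): if the cost table allows inline
   copying of blocks up to 4096 bytes, the recursive call below asks for the
   best algorithm for a 2048-byte block, and with
   -minline-stringops-dynamically a runtime check is emitted so that blocks
   of 4096 bytes or more still go through the library call.  */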
23388 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23389 && (algs->unknown_size == libcall
23390 || !alg_usable_p (algs->unknown_size, memset)))
23391 {
23392 enum stringop_alg alg;
23393
23394 /* If there aren't any usable algorithms, then recursing on
23395 smaller sizes isn't going to find anything. Just return the
23396 simple byte-at-a-time copy loop. */
23397 if (!any_alg_usable_p)
23398 {
23399 /* Pick something reasonable. */
23400 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23401 *dynamic_check = 128;
23402 return loop_1_byte;
23403 }
23404 if (max == -1)
23405 max = 4096;
23406 alg = decide_alg (count, max / 2, min_size, max_size, memset,
23407 zero_memset, dynamic_check, noalign);
23408 gcc_assert (*dynamic_check == -1);
23409 gcc_assert (alg != libcall);
23410 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
23411 *dynamic_check = max;
23412 return alg;
23413 }
23414 return (alg_usable_p (algs->unknown_size, memset)
23415 ? algs->unknown_size : libcall);
23416 }
23417
23418 /* Decide on alignment. We know that the operand is already aligned to ALIGN
23419 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
23420 static int
23421 decide_alignment (int align,
23422 enum stringop_alg alg,
23423 int expected_size,
23424 enum machine_mode move_mode)
23425 {
23426 int desired_align = 0;
23427
23428 gcc_assert (alg != no_stringop);
23429
23430 if (alg == libcall)
23431 return 0;
23432 if (move_mode == VOIDmode)
23433 return 0;
23434
23435 desired_align = GET_MODE_SIZE (move_mode);
23436 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
23437 copying a whole cache line at once. */
23438 if (TARGET_PENTIUMPRO
23439 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
23440 desired_align = 8;
23441
23442 if (optimize_size)
23443 desired_align = 1;
23444 if (desired_align < align)
23445 desired_align = align;
23446 if (expected_size != -1 && expected_size < 4)
23447 desired_align = align;
23448
23449 return desired_align;
23450 }
23451
23452
23453 /* Helper function for memset. For QImode value 0xXY produce
23454 0xXYXYXYXY of the width specified by MODE. This is essentially
23455 a * 0x01010101, but we can do slightly better than
23456 synth_mult by unwinding the sequence by hand on CPUs with
23457 slow multiply. */
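/* For instance (illustrative values only): broadcasting VAL = 0xAB into
   SImode by the shift/or sequence below proceeds as
	0x000000AB -> 0x0000ABAB   (reg |= reg << 8)
		   -> 0xABABABAB   (reg |= reg << 16)
   which is the same result as multiplying 0xAB by 0x01010101.  */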
23458 static rtx
23459 promote_duplicated_reg (enum machine_mode mode, rtx val)
23460 {
23461 enum machine_mode valmode = GET_MODE (val);
23462 rtx tmp;
23463 int nops = mode == DImode ? 3 : 2;
23464
23465 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
23466 if (val == const0_rtx)
23467 return copy_to_mode_reg (mode, CONST0_RTX (mode));
23468 if (CONST_INT_P (val))
23469 {
23470 HOST_WIDE_INT v = INTVAL (val) & 255;
23471
23472 v |= v << 8;
23473 v |= v << 16;
23474 if (mode == DImode)
23475 v |= (v << 16) << 16;
23476 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
23477 }
23478
23479 if (valmode == VOIDmode)
23480 valmode = QImode;
23481 if (valmode != QImode)
23482 val = gen_lowpart (QImode, val);
23483 if (mode == QImode)
23484 return val;
23485 if (!TARGET_PARTIAL_REG_STALL)
23486 nops--;
23487 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
23488 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
23489 <= (ix86_cost->shift_const + ix86_cost->add) * nops
23490 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
23491 {
23492 rtx reg = convert_modes (mode, QImode, val, true);
23493 tmp = promote_duplicated_reg (mode, const1_rtx);
23494 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23495 OPTAB_DIRECT);
23496 }
23497 else
23498 {
23499 rtx reg = convert_modes (mode, QImode, val, true);
23500
23501 if (!TARGET_PARTIAL_REG_STALL)
23502 if (mode == SImode)
23503 emit_insn (gen_movsi_insv_1 (reg, reg));
23504 else
23505 emit_insn (gen_movdi_insv_1 (reg, reg));
23506 else
23507 {
23508 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23509 NULL, 1, OPTAB_DIRECT);
23510 reg =
23511 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23512 }
23513 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23514 NULL, 1, OPTAB_DIRECT);
23515 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23516 if (mode == SImode)
23517 return reg;
23518 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23519 NULL, 1, OPTAB_DIRECT);
23520 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23521 return reg;
23522 }
23523 }
23524
23525 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that
23526 will be needed by the main loop copying SIZE_NEEDED chunks and by the prologue
23527 getting alignment from ALIGN to DESIRED_ALIGN. */
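/* Example (illustrative): with SIZE_NEEDED == 8 on x86-64 the value is
   broadcast into a DImode register; with SIZE_NEEDED == 2 and no extra
   alignment work to do, an HImode broadcast is enough.  */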
23528 static rtx
23529 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
23530 int align)
23531 {
23532 rtx promoted_val;
23533
23534 if (TARGET_64BIT
23535 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23536 promoted_val = promote_duplicated_reg (DImode, val);
23537 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23538 promoted_val = promote_duplicated_reg (SImode, val);
23539 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23540 promoted_val = promote_duplicated_reg (HImode, val);
23541 else
23542 promoted_val = val;
23543
23544 return promoted_val;
23545 }
23546
23547 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
23548 operations when profitable. The code depends upon architecture, block size
23549 and alignment, but always has one of the following overall structures:
23550
23551 Aligned move sequence:
23552
23553 1) Prologue guard: Conditional that jumps up to epilogues for small
23554 blocks that can be handled by epilogue alone. This is faster
23555 but also needed for correctness, since the prologue assumes the block
23556 is larger than the desired alignment.
23557
23558 Optional dynamic check for size and libcall for large
23559 blocks is emitted here too, with -minline-stringops-dynamically.
23560
23561 2) Prologue: copy first few bytes in order to get destination
23562 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
23563 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
23564 copied. We emit either a jump tree on power of two sized
23565 blocks, or a byte loop.
23566
23567 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23568 with specified algorithm.
23569
23570 4) Epilogue: code copying tail of the block that is too small to be
23571 handled by main body (or up to size guarded by prologue guard).
23572
23573 Misaligned move sequence
23574
23575 1) Misaligned move prologue/epilogue containing:
23576 a) Prologue handling small memory blocks and jumping to done_label
23577 (skipped if blocks are known to be large enough)
23578 b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
23579 needed by single possibly misaligned move
23580 (skipped if alignment is not needed)
23581 c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
23582
23583 2) Zero size guard dispatching to done_label, if needed
23584
23585 3) Dispatch to library call, if needed
23586
23587 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
23588 with specified algorithm. */
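/* Schematically, the aligned sequence expanded below has the shape
   (a sketch, not literal output):

	if (count < epilogue_size_needed) goto epilogue;     (1)
	copy bytes until DST is aligned to DESIRED_ALIGN;    (2)
	main loop copying SIZE_NEEDED bytes per iteration;   (3)
     epilogue:
	copy the tail bytes left over from the main loop;    (4)  */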
23589 static bool
23590 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
23591 rtx align_exp, rtx expected_align_exp,
23592 rtx expected_size_exp, bool issetmem)
23593 {
23594 rtx destreg;
23595 rtx srcreg = NULL;
23596 rtx label = NULL;
23597 rtx tmp;
23598 rtx jump_around_label = NULL;
23599 HOST_WIDE_INT align = 1;
23600 unsigned HOST_WIDE_INT count = 0;
23601 HOST_WIDE_INT expected_size = -1;
23602 int size_needed = 0, epilogue_size_needed;
23603 int desired_align = 0, align_bytes = 0;
23604 enum stringop_alg alg;
23605 rtx promoted_val = NULL;
23606 rtx vec_promoted_val = NULL;
23607 bool force_loopy_epilogue = false;
23608 int dynamic_check;
23609 bool need_zero_guard = false;
23610 bool noalign;
23611 enum machine_mode move_mode = VOIDmode;
23612 int unroll_factor = 1;
23613 /* TODO: Once value ranges are available, fill in proper data. */
23614 unsigned HOST_WIDE_INT min_size = 0;
23615 unsigned HOST_WIDE_INT max_size = -1;
23616 bool misaligned_prologue_used = false;
23617
23618 if (CONST_INT_P (align_exp))
23619 align = INTVAL (align_exp);
23620 /* i386 can do misaligned access at a reasonably increased cost. */
23621 if (CONST_INT_P (expected_align_exp)
23622 && INTVAL (expected_align_exp) > align)
23623 align = INTVAL (expected_align_exp);
23624 /* ALIGN is the minimum of destination and source alignment, but we care here
23625 just about destination alignment. */
23626 else if (!issetmem
23627 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
23628 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
23629
23630 if (CONST_INT_P (count_exp))
23631 min_size = max_size = count = expected_size = INTVAL (count_exp);
23632 if (CONST_INT_P (expected_size_exp) && count == 0)
23633 expected_size = INTVAL (expected_size_exp);
23634
23635 /* Make sure we don't need to care about overflow later on. */
23636 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23637 return false;
23638
23639 /* Step 0: Decide on preferred algorithm, desired alignment and
23640 size of chunks to be copied by main loop. */
23641 alg = decide_alg (count, expected_size, min_size, max_size, issetmem,
23642 issetmem && val_exp == const0_rtx,
23643 &dynamic_check, &noalign);
23644 if (alg == libcall)
23645 return false;
23646 gcc_assert (alg != no_stringop);
23647
23648 /* For now the vector version of memset is generated only for memory zeroing,
23649 as creating the promoted vector value is very cheap in this case. */
23650 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
23651 alg = unrolled_loop;
23652
23653 if (!count)
23654 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
23655 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23656 if (!issetmem)
23657 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
23658
23659 unroll_factor = 1;
23660 move_mode = word_mode;
23661 switch (alg)
23662 {
23663 case libcall:
23664 case no_stringop:
23665 case last_alg:
23666 gcc_unreachable ();
23667 case loop_1_byte:
23668 need_zero_guard = true;
23669 move_mode = QImode;
23670 break;
23671 case loop:
23672 need_zero_guard = true;
23673 break;
23674 case unrolled_loop:
23675 need_zero_guard = true;
23676 unroll_factor = (TARGET_64BIT ? 4 : 2);
23677 break;
23678 case vector_loop:
23679 need_zero_guard = true;
23680 unroll_factor = 4;
23681 /* Find the widest supported mode. */
23682 move_mode = word_mode;
23683 while (optab_handler (mov_optab, GET_MODE_WIDER_MODE (move_mode))
23684 != CODE_FOR_nothing)
23685 move_mode = GET_MODE_WIDER_MODE (move_mode);
23686
23687 /* Find the corresponding vector mode with the same size as MOVE_MODE.
23688 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
23689 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
23690 {
23691 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
23692 move_mode = mode_for_vector (word_mode, nunits);
23693 if (optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
23694 move_mode = word_mode;
23695 }
23696 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
23697 break;
23698 case rep_prefix_8_byte:
23699 move_mode = DImode;
23700 break;
23701 case rep_prefix_4_byte:
23702 move_mode = SImode;
23703 break;
23704 case rep_prefix_1_byte:
23705 move_mode = QImode;
23706 break;
23707 }
23708 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
23709 epilogue_size_needed = size_needed;
23710
23711 desired_align = decide_alignment (align, alg, expected_size, move_mode);
23712 if (!TARGET_ALIGN_STRINGOPS || noalign)
23713 align = desired_align;
23714
23715 /* Step 1: Prologue guard. */
23716
23717 /* Alignment code needs count to be in register. */
23718 if (CONST_INT_P (count_exp) && desired_align > align)
23719 {
23720 if (INTVAL (count_exp) > desired_align
23721 && INTVAL (count_exp) > size_needed)
23722 {
23723 align_bytes
23724 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23725 if (align_bytes <= 0)
23726 align_bytes = 0;
23727 else
23728 align_bytes = desired_align - align_bytes;
23729 }
23730 if (align_bytes == 0)
23731 count_exp = force_reg (counter_mode (count_exp), count_exp);
23732 }
23733 gcc_assert (desired_align >= 1 && align >= 1);
23734
23735 /* Misaligned move sequences handle both prologues and epilogues at once.
23736 Default code generation results in smaller code for large alignments and
23737 also avoids redundant work when sizes are known precisely. */
23738 misaligned_prologue_used = (TARGET_MISALIGNED_MOVE_STRING_PROLOGUES
23739 && MAX (desired_align, epilogue_size_needed) <= 32
23740 && ((desired_align > align && !align_bytes)
23741 || (!count && epilogue_size_needed > 1)));
23742
23743 /* Do the cheap promotion to allow better CSE across the
23744 main loop and epilogue (i.e. one load of the big constant in
23745 front of all the code).
23746 For now the misaligned move sequences do not have a fast path
23747 without broadcasting. */
23748 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
23749 {
23750 if (alg == vector_loop)
23751 {
23752 gcc_assert (val_exp == const0_rtx);
23753 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
23754 promoted_val = promote_duplicated_reg_to_size (val_exp,
23755 GET_MODE_SIZE (word_mode),
23756 desired_align, align);
23757 }
23758 else
23759 {
23760 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23761 desired_align, align);
23762 }
23763 }
23764 /* Misaligned move sequences handle both prologues and epilogues at once.
23765 Default code generation results in smaller code for large alignments and
23766 also avoids redundant work when sizes are known precisely. */
23767 if (misaligned_prologue_used)
23768 {
23769 /* The misaligned move prologue handles small blocks by itself. */
23770 misaligned_prologue_used = true;
23771 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
23772 (dst, src, &destreg, &srcreg,
23773 move_mode, promoted_val, vec_promoted_val,
23774 &count_exp,
23775 &jump_around_label,
23776 desired_align < align
23777 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
23778 desired_align, align, &min_size, dynamic_check, issetmem);
23779 if (!issetmem)
23780 src = change_address (src, BLKmode, srcreg);
23781 dst = change_address (dst, BLKmode, destreg);
23782 set_mem_align (dst, desired_align * BITS_PER_UNIT);
23783 epilogue_size_needed = 0;
23784 if (need_zero_guard && !min_size)
23785 {
23786 /* It is possible that we copied enough so the main loop will not
23787 execute. */
23788 gcc_assert (size_needed > 1);
23789 if (jump_around_label == NULL_RTX)
23790 jump_around_label = gen_label_rtx ();
23791 emit_cmp_and_jump_insns (count_exp,
23792 GEN_INT (size_needed),
23793 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
23794 if (expected_size == -1
23795 || expected_size < (desired_align - align) / 2 + size_needed)
23796 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23797 else
23798 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23799 }
23800 }
23801 /* Ensure that alignment prologue won't copy past end of block. */
23802 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23803 {
23804 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23805 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
23806 Make sure it is power of 2. */
23807 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
23808
23809 /* To improve performance of small blocks, we jump around the VAL
23810 promoting code. This means that if the promoted VAL is not constant,
23811 we might not use it in the epilogue and have to use the byte
23812 loop variant. */
23813 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
23814 force_loopy_epilogue = true;
23815 if (count)
23816 {
23817 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23818 {
23819 /* If main algorithm works on QImode, no epilogue is needed.
23820 For small sizes just don't align anything. */
23821 if (size_needed == 1)
23822 desired_align = align;
23823 else
23824 goto epilogue;
23825 }
23826 }
23827 else if (min_size < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23828 {
23829 gcc_assert (max_size >= (unsigned HOST_WIDE_INT)epilogue_size_needed);
23830 label = gen_label_rtx ();
23831 emit_cmp_and_jump_insns (count_exp,
23832 GEN_INT (epilogue_size_needed),
23833 LTU, 0, counter_mode (count_exp), 1, label);
23834 if (expected_size == -1 || expected_size < epilogue_size_needed)
23835 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23836 else
23837 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23838 }
23839 }
23840
23841 /* Emit code to decide on runtime whether library call or inline should be
23842 used. */
23843 if (dynamic_check != -1)
23844 {
23845 if (!issetmem && CONST_INT_P (count_exp))
23846 {
23847 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
23848 {
23849 emit_block_move_via_libcall (dst, src, count_exp, false);
23850 count_exp = const0_rtx;
23851 goto epilogue;
23852 }
23853 }
23854 else
23855 {
23856 rtx hot_label = gen_label_rtx ();
23857 jump_around_label = gen_label_rtx ();
23858 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23859 LEU, 0, GET_MODE (count_exp), 1, hot_label);
23860 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23861 if (issetmem)
23862 set_storage_via_libcall (dst, count_exp, val_exp, false);
23863 else
23864 emit_block_move_via_libcall (dst, src, count_exp, false);
23865 emit_jump (jump_around_label);
23866 emit_label (hot_label);
23867 }
23868 }
23869
23870 /* Step 2: Alignment prologue. */
23871 /* Do the expensive promotion once we branched off the small blocks. */
23872 if (issetmem && !promoted_val)
23873 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23874 desired_align, align);
23875
23876 if (desired_align > align && !misaligned_prologue_used)
23877 {
23878 if (align_bytes == 0)
23879 {
23880 /* Except for the first move in the prologue, we no longer know
23881 the constant offset in aliasing info. It doesn't seem worth
23882 the pain to maintain it for the first move, so throw away
23883 the info early. */
23884 dst = change_address (dst, BLKmode, destreg);
23885 if (!issetmem)
23886 src = change_address (src, BLKmode, srcreg);
23887 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
23888 promoted_val, vec_promoted_val,
23889 count_exp, align, desired_align,
23890 issetmem);
23891 /* At most desired_align - align bytes are copied. */
23892 if (min_size < (unsigned)(desired_align - align))
23893 min_size = 0;
23894 else
23895 min_size -= desired_align - align;
23896 }
23897 else
23898 {
23899 /* If we know how many bytes need to be stored before dst is
23900 sufficiently aligned, maintain aliasing info accurately. */
23901 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
23902 srcreg,
23903 promoted_val,
23904 vec_promoted_val,
23905 desired_align,
23906 align_bytes,
23907 issetmem);
23908
23909 count_exp = plus_constant (counter_mode (count_exp),
23910 count_exp, -align_bytes);
23911 count -= align_bytes;
23912 min_size -= align_bytes;
23913 max_size -= align_bytes;
23914 }
23915 if (need_zero_guard
23916 && !min_size
23917 && (count < (unsigned HOST_WIDE_INT) size_needed
23918 || (align_bytes == 0
23919 && count < ((unsigned HOST_WIDE_INT) size_needed
23920 + desired_align - align))))
23921 {
23922 /* It is possible that we copied enough so the main loop will not
23923 execute. */
23924 gcc_assert (size_needed > 1);
23925 if (label == NULL_RTX)
23926 label = gen_label_rtx ();
23927 emit_cmp_and_jump_insns (count_exp,
23928 GEN_INT (size_needed),
23929 LTU, 0, counter_mode (count_exp), 1, label);
23930 if (expected_size == -1
23931 || expected_size < (desired_align - align) / 2 + size_needed)
23932 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23933 else
23934 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23935 }
23936 }
23937 if (label && size_needed == 1)
23938 {
23939 emit_label (label);
23940 LABEL_NUSES (label) = 1;
23941 label = NULL;
23942 epilogue_size_needed = 1;
23943 if (issetmem)
23944 promoted_val = val_exp;
23945 }
23946 else if (label == NULL_RTX && !misaligned_prologue_used)
23947 epilogue_size_needed = size_needed;
23948
23949 /* Step 3: Main loop. */
23950
23951 switch (alg)
23952 {
23953 case libcall:
23954 case no_stringop:
23955 case last_alg:
23956 gcc_unreachable ();
23957 case loop_1_byte:
23958 case loop:
23959 case unrolled_loop:
23960 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
23961 count_exp, move_mode, unroll_factor,
23962 expected_size, issetmem);
23963 break;
23964 case vector_loop:
23965 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
23966 vec_promoted_val, count_exp, move_mode,
23967 unroll_factor, expected_size, issetmem);
23968 break;
23969 case rep_prefix_8_byte:
23970 case rep_prefix_4_byte:
23971 case rep_prefix_1_byte:
23972 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
23973 val_exp, count_exp, move_mode, issetmem);
23974 break;
23975 }
23976 /* Properly adjust the offset of src and dest memory for aliasing. */
23977 if (CONST_INT_P (count_exp))
23978 {
23979 if (!issetmem)
23980 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
23981 (count / size_needed) * size_needed);
23982 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23983 (count / size_needed) * size_needed);
23984 }
23985 else
23986 {
23987 if (!issetmem)
23988 src = change_address (src, BLKmode, srcreg);
23989 dst = change_address (dst, BLKmode, destreg);
23990 }
23991
23992 /* Step 4: Epilogue to copy the remaining bytes. */
23993 epilogue:
23994 if (label)
23995 {
23996 /* When the main loop is done, COUNT_EXP might hold original count,
23997 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
23998 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
23999 bytes. Compensate if needed. */
24000
24001 if (size_needed < epilogue_size_needed)
24002 {
24003 tmp =
24004 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
24005 GEN_INT (size_needed - 1), count_exp, 1,
24006 OPTAB_DIRECT);
24007 if (tmp != count_exp)
24008 emit_move_insn (count_exp, tmp);
24009 }
24010 emit_label (label);
24011 LABEL_NUSES (label) = 1;
24012 }
24013
24014 if (count_exp != const0_rtx && epilogue_size_needed > 1)
24015 {
24016 if (force_loopy_epilogue)
24017 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
24018 epilogue_size_needed);
24019 else
24020 {
24021 if (issetmem)
24022 expand_setmem_epilogue (dst, destreg, promoted_val,
24023 vec_promoted_val, count_exp,
24024 epilogue_size_needed);
24025 else
24026 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
24027 epilogue_size_needed);
24028 }
24029 }
24030 if (jump_around_label)
24031 emit_label (jump_around_label);
24032 return true;
24033 }
24034
24035 /* Wrapper for ix86_expand_set_or_movmem for memcpy case. */
24036 bool
24037 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
24038 rtx expected_align_exp, rtx expected_size_exp)
24039 {
24040 return ix86_expand_set_or_movmem (dst, src, count_exp, NULL, align_exp,
24041 expected_align_exp, expected_size_exp, false);
24042 }
24043
24044 /* Wrapper for ix86_expand_set_or_movmem for memset case. */
24045 bool
24046 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
24047 rtx expected_align_exp, rtx expected_size_exp)
24048 {
24049 return ix86_expand_set_or_movmem (dst, NULL, count_exp, val_exp, align_exp,
24050 expected_align_exp, expected_size_exp, true);
24051 }
24052
24053
24054 /* Expand the appropriate insns for doing strlen if not just doing
24055 repnz; scasb
24056
24057 out = result, initialized with the start address
24058 align_rtx = alignment of the address.
24059 scratch = scratch register, initialized with the start address when
24060 not aligned, otherwise undefined
24061
24062 This is just the body. It needs the initializations mentioned above and
24063 some address computing at the end. These things are done in i386.md. */
24064
24065 static void
24066 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
24067 {
24068 int align;
24069 rtx tmp;
24070 rtx align_2_label = NULL_RTX;
24071 rtx align_3_label = NULL_RTX;
24072 rtx align_4_label = gen_label_rtx ();
24073 rtx end_0_label = gen_label_rtx ();
24074 rtx mem;
24075 rtx tmpreg = gen_reg_rtx (SImode);
24076 rtx scratch = gen_reg_rtx (SImode);
24077 rtx cmp;
24078
24079 align = 0;
24080 if (CONST_INT_P (align_rtx))
24081 align = INTVAL (align_rtx);
24082
24083 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
24084
24085 /* Is there a known alignment and is it less than 4? */
24086 if (align < 4)
24087 {
24088 rtx scratch1 = gen_reg_rtx (Pmode);
24089 emit_move_insn (scratch1, out);
24090 /* Is there a known alignment and is it not 2? */
24091 if (align != 2)
24092 {
24093 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
24094 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
24095
24096 /* Leave just the 3 lower bits. */
24097 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
24098 NULL_RTX, 0, OPTAB_WIDEN);
24099
24100 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24101 Pmode, 1, align_4_label);
24102 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
24103 Pmode, 1, align_2_label);
24104 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
24105 Pmode, 1, align_3_label);
24106 }
24107 else
24108 {
24109 /* Since the alignment is 2, we have to check 2 or 0 bytes;
24110 check whether it is aligned to 4 bytes. */
24111
24112 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
24113 NULL_RTX, 0, OPTAB_WIDEN);
24114
24115 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
24116 Pmode, 1, align_4_label);
24117 }
24118
24119 mem = change_address (src, QImode, out);
24120
24121 /* Now compare the bytes. */
24122
24123 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
24124 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
24125 QImode, 1, end_0_label);
24126
24127 /* Increment the address. */
24128 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24129
24130 /* Not needed with an alignment of 2 */
24131 if (align != 2)
24132 {
24133 emit_label (align_2_label);
24134
24135 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24136 end_0_label);
24137
24138 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24139
24140 emit_label (align_3_label);
24141 }
24142
24143 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
24144 end_0_label);
24145
24146 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
24147 }
24148
24149 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
24150 align this loop; it only makes programs larger and does not help
24151 to speed them up. */
24152 emit_label (align_4_label);
24153
24154 mem = change_address (src, SImode, out);
24155 emit_move_insn (scratch, mem);
24156 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
24157
24158 /* This formula yields a nonzero result iff one of the bytes is zero.
24159 This saves three branches inside the loop and many cycles. */
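/* Written out, the computation emitted below is

	(x - 0x01010101) & ~x & 0x80808080

   For example, x = 0x12003456 gives x - 0x01010101 = 0x10FF3355 and
   ~x = 0xEDFFCBA9; ANDing all three yields 0x00800000, marking the zero
   byte.  The result is nonzero exactly when x contains a zero byte, and
   the lowest set 0x80 bit marks the first such byte.  */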
24160
24161 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
24162 emit_insn (gen_one_cmplsi2 (scratch, scratch));
24163 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
24164 emit_insn (gen_andsi3 (tmpreg, tmpreg,
24165 gen_int_mode (0x80808080, SImode)));
24166 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
24167 align_4_label);
24168
24169 if (TARGET_CMOVE)
24170 {
24171 rtx reg = gen_reg_rtx (SImode);
24172 rtx reg2 = gen_reg_rtx (Pmode);
24173 emit_move_insn (reg, tmpreg);
24174 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
24175
24176 /* If zero is not in the first two bytes, move two bytes forward. */
24177 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24178 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24179 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24180 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
24181 gen_rtx_IF_THEN_ELSE (SImode, tmp,
24182 reg,
24183 tmpreg)));
24184 /* Emit lea manually to avoid clobbering of flags. */
24185 emit_insn (gen_rtx_SET (SImode, reg2,
24186 gen_rtx_PLUS (Pmode, out, const2_rtx)));
24187
24188 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24189 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
24190 emit_insn (gen_rtx_SET (VOIDmode, out,
24191 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
24192 reg2,
24193 out)));
24194 }
24195 else
24196 {
24197 rtx end_2_label = gen_label_rtx ();
24198 /* Is zero in the first two bytes? */
24199
24200 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
24201 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
24202 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
24203 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
24204 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
24205 pc_rtx);
24206 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
24207 JUMP_LABEL (tmp) = end_2_label;
24208
24209 /* Not in the first two. Move two bytes forward. */
24210 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
24211 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
24212
24213 emit_label (end_2_label);
24214
24215 }
24216
24217 /* Avoid branch in fixing the byte. */
24218 tmpreg = gen_lowpart (QImode, tmpreg);
24219 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
24220 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
24221 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
24222 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
24223
24224 emit_label (end_0_label);
24225 }
24226
24227 /* Expand strlen. */
24228
24229 bool
24230 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
24231 {
24232 rtx addr, scratch1, scratch2, scratch3, scratch4;
24233
24234 /* The generic case of the strlen expander is long. Avoid expanding
24235 it unless TARGET_INLINE_ALL_STRINGOPS. */
24236
24237 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24238 && !TARGET_INLINE_ALL_STRINGOPS
24239 && !optimize_insn_for_size_p ()
24240 && (!CONST_INT_P (align) || INTVAL (align) < 4))
24241 return false;
24242
24243 addr = force_reg (Pmode, XEXP (src, 0));
24244 scratch1 = gen_reg_rtx (Pmode);
24245
24246 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
24247 && !optimize_insn_for_size_p ())
24248 {
24249 /* Well it seems that some optimizer does not combine a call like
24250 foo(strlen(bar), strlen(bar));
24251 when the move and the subtraction are done here. It does calculate
24252 the length just once when these instructions are done inside
24253 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
24254 often used and I use one fewer register for the lifetime of
24255 output_strlen_unroll() this is better. */
24256
24257 emit_move_insn (out, addr);
24258
24259 ix86_expand_strlensi_unroll_1 (out, src, align);
24260
24261 /* strlensi_unroll_1 returns the address of the zero at the end of
24262 the string, like memchr(), so compute the length by subtracting
24263 the start address. */
24264 emit_insn (ix86_gen_sub3 (out, out, addr));
24265 }
24266 else
24267 {
24268 rtx unspec;
24269
24270 /* Can't use this if the user has appropriated eax, ecx, or edi. */
24271 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
24272 return false;
24273
24274 scratch2 = gen_reg_rtx (Pmode);
24275 scratch3 = gen_reg_rtx (Pmode);
24276 scratch4 = force_reg (Pmode, constm1_rtx);
24277
24278 emit_move_insn (scratch3, addr);
24279 eoschar = force_reg (QImode, eoschar);
24280
24281 src = replace_equiv_address_nv (src, scratch3);
24282
24283 /* If .md starts supporting :P, this can be done in .md. */
24284 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
24285 scratch4), UNSPEC_SCAS);
24286 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
24287 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
24288 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
24289 }
24290 return true;
24291 }
24292
24293 /* For a given symbol (function) construct code to compute the address of its
24294 PLT entry in the large x86-64 PIC model. */
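/* Roughly, the code constructed here corresponds to (a sketch, exact
   registers not implied):

	movabs	$symbol@PLTOFF, %tmp
	add	%pic_base, %tmp

   i.e. the PLT entry address is the PIC base register plus the 64-bit
   @PLTOFF offset produced by the UNSPEC_PLTOFF constant below.  */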
24295 static rtx
24296 construct_plt_address (rtx symbol)
24297 {
24298 rtx tmp, unspec;
24299
24300 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
24301 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
24302 gcc_assert (Pmode == DImode);
24303
24304 tmp = gen_reg_rtx (Pmode);
24305 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
24306
24307 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
24308 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
24309 return tmp;
24310 }
24311
24312 rtx
24313 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
24314 rtx callarg2,
24315 rtx pop, bool sibcall)
24316 {
24317 unsigned int const cregs_size
24318 = ARRAY_SIZE (x86_64_ms_sysv_extra_clobbered_registers);
24319 rtx vec[3 + cregs_size];
24320 rtx use = NULL, call;
24321 unsigned int vec_len = 0;
24322
24323 if (pop == const0_rtx)
24324 pop = NULL;
24325 gcc_assert (!TARGET_64BIT || !pop);
24326
24327 if (TARGET_MACHO && !TARGET_64BIT)
24328 {
24329 #if TARGET_MACHO
24330 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
24331 fnaddr = machopic_indirect_call_target (fnaddr);
24332 #endif
24333 }
24334 else
24335 {
24336 /* Static functions and indirect calls don't need the pic register. */
24337 if (flag_pic
24338 && (!TARGET_64BIT
24339 || (ix86_cmodel == CM_LARGE_PIC
24340 && DEFAULT_ABI != MS_ABI))
24341 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24342 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
24343 use_reg (&use, pic_offset_table_rtx);
24344 }
24345
24346 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
24347 {
24348 rtx al = gen_rtx_REG (QImode, AX_REG);
24349 emit_move_insn (al, callarg2);
24350 use_reg (&use, al);
24351 }
24352
24353 if (ix86_cmodel == CM_LARGE_PIC
24354 && !TARGET_PECOFF
24355 && MEM_P (fnaddr)
24356 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
24357 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
24358 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
24359 else if (sibcall
24360 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
24361 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
24362 {
24363 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
24364 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
24365 }
24366
24367 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
24368 if (retval)
24369 call = gen_rtx_SET (VOIDmode, retval, call);
24370 vec[vec_len++] = call;
24371
24372 if (pop)
24373 {
24374 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
24375 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
24376 vec[vec_len++] = pop;
24377 }
24378
24379 if (TARGET_64BIT_MS_ABI
24380 && (!callarg2 || INTVAL (callarg2) != -2))
24381 {
24382 unsigned i;
24383
24384 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
24385 UNSPEC_MS_TO_SYSV_CALL);
24386
24387 for (i = 0; i < cregs_size; i++)
24388 {
24389 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
24390 enum machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
24391
24392 vec[vec_len++]
24393 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (mode, regno));
24394 }
24395 }
24396
24397 if (vec_len > 1)
24398 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
24399 call = emit_call_insn (call);
24400 if (use)
24401 CALL_INSN_FUNCTION_USAGE (call) = use;
24402
24403 return call;
24404 }
24405
24406 /* Output the assembly for a call instruction. */
24407
24408 const char *
24409 ix86_output_call_insn (rtx insn, rtx call_op)
24410 {
24411 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
24412 bool seh_nop_p = false;
24413 const char *xasm;
24414
24415 if (SIBLING_CALL_P (insn))
24416 {
24417 if (direct_p)
24418 xasm = "%!jmp\t%P0";
24419 /* SEH epilogue detection requires the indirect branch case
24420 to include REX.W. */
24421 else if (TARGET_SEH)
24422 xasm = "%!rex.W jmp %A0";
24423 else
24424 xasm = "%!jmp\t%A0";
24425
24426 output_asm_insn (xasm, &call_op);
24427 return "";
24428 }
24429
24430 /* SEH unwinding can require an extra nop to be emitted in several
24431 circumstances. Determine if we have one of those. */
24432 if (TARGET_SEH)
24433 {
24434 rtx i;
24435
24436 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
24437 {
24438 /* If we get to another real insn, we don't need the nop. */
24439 if (INSN_P (i))
24440 break;
24441
24442 /* If we get to the epilogue note, prevent a catch region from
24443 being adjacent to the standard epilogue sequence. If non-
24444 call-exceptions, we'll have done this during epilogue emission. */
24445 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
24446 && !flag_non_call_exceptions
24447 && !can_throw_internal (insn))
24448 {
24449 seh_nop_p = true;
24450 break;
24451 }
24452 }
24453
24454 /* If we didn't find a real insn following the call, prevent the
24455 unwinder from looking into the next function. */
24456 if (i == NULL)
24457 seh_nop_p = true;
24458 }
24459
24460 if (direct_p)
24461 xasm = "%!call\t%P0";
24462 else
24463 xasm = "%!call\t%A0";
24464
24465 output_asm_insn (xasm, &call_op);
24466
24467 if (seh_nop_p)
24468 return "nop";
24469
24470 return "";
24471 }
24472 \f
24473 /* Clear stack slot assignments remembered from previous functions.
24474 This is called from INIT_EXPANDERS once before RTL is emitted for each
24475 function. */
24476
24477 static struct machine_function *
24478 ix86_init_machine_status (void)
24479 {
24480 struct machine_function *f;
24481
24482 f = ggc_alloc_cleared_machine_function ();
24483 f->use_fast_prologue_epilogue_nregs = -1;
24484 f->call_abi = ix86_abi;
24485
24486 return f;
24487 }
24488
24489 /* Return a MEM corresponding to a stack slot with mode MODE.
24490 Allocate a new slot if necessary.
24491
24492 The RTL for a function can have several slots available: N is
24493 which slot to use. */
24494
24495 rtx
24496 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
24497 {
24498 struct stack_local_entry *s;
24499
24500 gcc_assert (n < MAX_386_STACK_LOCALS);
24501
24502 for (s = ix86_stack_locals; s; s = s->next)
24503 if (s->mode == mode && s->n == n)
24504 return validize_mem (copy_rtx (s->rtl));
24505
24506 s = ggc_alloc_stack_local_entry ();
24507 s->n = n;
24508 s->mode = mode;
24509 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
24510
24511 s->next = ix86_stack_locals;
24512 ix86_stack_locals = s;
24513 return validize_mem (s->rtl);
24514 }
24515
24516 static void
24517 ix86_instantiate_decls (void)
24518 {
24519 struct stack_local_entry *s;
24520
24521 for (s = ix86_stack_locals; s; s = s->next)
24522 if (s->rtl != NULL_RTX)
24523 instantiate_decl_rtl (s->rtl);
24524 }
24525 \f
24526 /* Check whether x86 address PARTS is a pc-relative address. */
24527
24528 static bool
24529 rip_relative_addr_p (struct ix86_address *parts)
24530 {
24531 rtx base, index, disp;
24532
24533 base = parts->base;
24534 index = parts->index;
24535 disp = parts->disp;
24536
24537 if (disp && !base && !index)
24538 {
24539 if (TARGET_64BIT)
24540 {
24541 rtx symbol = disp;
24542
24543 if (GET_CODE (disp) == CONST)
24544 symbol = XEXP (disp, 0);
24545 if (GET_CODE (symbol) == PLUS
24546 && CONST_INT_P (XEXP (symbol, 1)))
24547 symbol = XEXP (symbol, 0);
24548
24549 if (GET_CODE (symbol) == LABEL_REF
24550 || (GET_CODE (symbol) == SYMBOL_REF
24551 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
24552 || (GET_CODE (symbol) == UNSPEC
24553 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
24554 || XINT (symbol, 1) == UNSPEC_PCREL
24555 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
24556 return true;
24557 }
24558 }
24559 return false;
24560 }
24561
24562 /* Calculate the length of the memory address in the instruction encoding.
24563 Includes addr32 prefix, does not include the one-byte modrm, opcode,
24564 or other prefixes. We never generate addr32 prefix for LEA insn. */
24565
24566 int
24567 memory_address_length (rtx addr, bool lea)
24568 {
24569 struct ix86_address parts;
24570 rtx base, index, disp;
24571 int len;
24572 int ok;
24573
24574 if (GET_CODE (addr) == PRE_DEC
24575 || GET_CODE (addr) == POST_INC
24576 || GET_CODE (addr) == PRE_MODIFY
24577 || GET_CODE (addr) == POST_MODIFY)
24578 return 0;
24579
24580 ok = ix86_decompose_address (addr, &parts);
24581 gcc_assert (ok);
24582
24583 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
24584
24585 /* If this is not LEA instruction, add the length of addr32 prefix. */
24586 if (TARGET_64BIT && !lea
24587 && (SImode_address_operand (addr, VOIDmode)
24588 || (parts.base && GET_MODE (parts.base) == SImode)
24589 || (parts.index && GET_MODE (parts.index) == SImode)))
24590 len++;
24591
24592 base = parts.base;
24593 index = parts.index;
24594 disp = parts.disp;
24595
24596 if (base && GET_CODE (base) == SUBREG)
24597 base = SUBREG_REG (base);
24598 if (index && GET_CODE (index) == SUBREG)
24599 index = SUBREG_REG (index);
24600
24601 gcc_assert (base == NULL_RTX || REG_P (base));
24602 gcc_assert (index == NULL_RTX || REG_P (index));
24603
24604 /* Rule of thumb:
24605 - esp as the base always wants an index,
24606 - ebp as the base always wants a displacement,
24607 - r12 as the base always wants an index,
24608 - r13 as the base always wants a displacement. */
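/* For example, plain (%esp) cannot be encoded with a one-byte modrm
   (the r/m value 4 selects a SIB byte instead), and plain (%ebp) must
   be encoded as 0(%ebp) because mod 00 with r/m 5 means disp32; r12
   and r13 inherit the same quirks in 64-bit code.  */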
24609
24610 /* Register Indirect. */
24611 if (base && !index && !disp)
24612 {
24613 /* esp (for its index) and ebp (for its displacement) need
24614 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
24615 code. */
24616 if (base == arg_pointer_rtx
24617 || base == frame_pointer_rtx
24618 || REGNO (base) == SP_REG
24619 || REGNO (base) == BP_REG
24620 || REGNO (base) == R12_REG
24621 || REGNO (base) == R13_REG)
24622 len++;
24623 }
24624
24625 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
24626 is not disp32, but disp32(%rip), so for disp32
24627 SIB byte is needed, unless print_operand_address
24628 optimizes it into disp32(%rip) or (%rip) is implied
24629 by UNSPEC. */
24630 else if (disp && !base && !index)
24631 {
24632 len += 4;
24633 if (rip_relative_addr_p (&parts))
24634 len++;
24635 }
24636 else
24637 {
24638 /* Find the length of the displacement constant. */
24639 if (disp)
24640 {
24641 if (base && satisfies_constraint_K (disp))
24642 len += 1;
24643 else
24644 len += 4;
24645 }
24646 /* ebp always wants a displacement. Similarly r13. */
24647 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
24648 len++;
24649
24650 /* An index requires the two-byte modrm form.... */
24651 if (index
24652 /* ...like esp (or r12), which always wants an index. */
24653 || base == arg_pointer_rtx
24654 || base == frame_pointer_rtx
24655 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
24656 len++;
24657 }
24658
24659 return len;
24660 }
24661
24662 /* Compute default value for "length_immediate" attribute. When SHORTFORM
24663 is set, expect that the insn has an 8-bit immediate alternative. */
24664 int
24665 ix86_attr_length_immediate_default (rtx insn, bool shortform)
24666 {
24667 int len = 0;
24668 int i;
24669 extract_insn_cached (insn);
24670 for (i = recog_data.n_operands - 1; i >= 0; --i)
24671 if (CONSTANT_P (recog_data.operand[i]))
24672 {
24673 enum attr_mode mode = get_attr_mode (insn);
24674
24675 gcc_assert (!len);
24676 if (shortform && CONST_INT_P (recog_data.operand[i]))
24677 {
24678 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
24679 switch (mode)
24680 {
24681 case MODE_QI:
24682 len = 1;
24683 continue;
24684 case MODE_HI:
24685 ival = trunc_int_for_mode (ival, HImode);
24686 break;
24687 case MODE_SI:
24688 ival = trunc_int_for_mode (ival, SImode);
24689 break;
24690 default:
24691 break;
24692 }
24693 if (IN_RANGE (ival, -128, 127))
24694 {
24695 len = 1;
24696 continue;
24697 }
24698 }
24699 switch (mode)
24700 {
24701 case MODE_QI:
24702 len = 1;
24703 break;
24704 case MODE_HI:
24705 len = 2;
24706 break;
24707 case MODE_SI:
24708 len = 4;
24709 break;
24710 /* Immediates for DImode instructions are encoded
24711 as 32bit sign extended values. */
24712 case MODE_DI:
24713 len = 4;
24714 break;
24715 default:
24716 fatal_insn ("unknown insn mode", insn);
24717 }
24718 }
24719 return len;
24720 }
24721
24722 /* Compute default value for "length_address" attribute. */
24723 int
24724 ix86_attr_length_address_default (rtx insn)
24725 {
24726 int i;
24727
24728 if (get_attr_type (insn) == TYPE_LEA)
24729 {
24730 rtx set = PATTERN (insn), addr;
24731
24732 if (GET_CODE (set) == PARALLEL)
24733 set = XVECEXP (set, 0, 0);
24734
24735 gcc_assert (GET_CODE (set) == SET);
24736
24737 addr = SET_SRC (set);
24738
24739 return memory_address_length (addr, true);
24740 }
24741
24742 extract_insn_cached (insn);
24743 for (i = recog_data.n_operands - 1; i >= 0; --i)
24744 if (MEM_P (recog_data.operand[i]))
24745 {
24746 constrain_operands_cached (reload_completed);
24747 if (which_alternative != -1)
24748 {
24749 const char *constraints = recog_data.constraints[i];
24750 int alt = which_alternative;
24751
24752 while (*constraints == '=' || *constraints == '+')
24753 constraints++;
24754 while (alt-- > 0)
24755 while (*constraints++ != ',')
24756 ;
24757 /* Skip ignored operands. */
24758 if (*constraints == 'X')
24759 continue;
24760 }
24761 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24762 }
24763 return 0;
24764 }
24765
24766 /* Compute default value for "length_vex" attribute. It includes
24767 2 or 3 byte VEX prefix and 1 opcode byte. */
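/* The 2-byte VEX prefix (0xC5) can only encode the implied 0F opcode map
   and cannot express the VEX.W, VEX.X or VEX.B bits, so any insn that
   needs one of those falls back to the 3-byte 0xC4 form counted below.  */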
24768
24769 int
24770 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24771 {
24772 int i;
24773
24774 /* Only the 0f opcode can use the 2-byte VEX prefix, and the VEX W bit
24775 requires the 3-byte VEX prefix. */
24776 if (!has_0f_opcode || has_vex_w)
24777 return 3 + 1;
24778
24779 /* We can always use 2 byte VEX prefix in 32bit. */
24780 if (!TARGET_64BIT)
24781 return 2 + 1;
24782
24783 extract_insn_cached (insn);
24784
24785 for (i = recog_data.n_operands - 1; i >= 0; --i)
24786 if (REG_P (recog_data.operand[i]))
24787 {
24788 /* REX.W bit uses 3 byte VEX prefix. */
24789 if (GET_MODE (recog_data.operand[i]) == DImode
24790 && GENERAL_REG_P (recog_data.operand[i]))
24791 return 3 + 1;
24792 }
24793 else
24794 {
24795 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24796 if (MEM_P (recog_data.operand[i])
24797 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24798 return 3 + 1;
24799 }
24800
24801 return 2 + 1;
24802 }
24803 \f
24804 /* Return the maximum number of instructions a cpu can issue. */
24805
24806 static int
24807 ix86_issue_rate (void)
24808 {
24809 switch (ix86_tune)
24810 {
24811 case PROCESSOR_PENTIUM:
24812 case PROCESSOR_ATOM:
24813 case PROCESSOR_SLM:
24814 case PROCESSOR_K6:
24815 case PROCESSOR_BTVER2:
24816 case PROCESSOR_PENTIUM4:
24817 case PROCESSOR_NOCONA:
24818 return 2;
24819
24820 case PROCESSOR_PENTIUMPRO:
24821 case PROCESSOR_ATHLON:
24822 case PROCESSOR_K8:
24823 case PROCESSOR_AMDFAM10:
24824 case PROCESSOR_GENERIC:
24825 case PROCESSOR_BDVER1:
24826 case PROCESSOR_BDVER2:
24827 case PROCESSOR_BDVER3:
24828 case PROCESSOR_BTVER1:
24829 return 3;
24830
24831 case PROCESSOR_CORE2:
24832 case PROCESSOR_COREI7:
24833 case PROCESSOR_COREI7_AVX:
24834 case PROCESSOR_HASWELL:
24835 return 4;
24836
24837 default:
24838 return 1;
24839 }
24840 }
24841
24842 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24843 by DEP_INSN and nothing else set by DEP_INSN. */
24844
24845 static bool
24846 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24847 {
24848 rtx set, set2;
24849
24850 /* Simplify the test for uninteresting insns. */
24851 if (insn_type != TYPE_SETCC
24852 && insn_type != TYPE_ICMOV
24853 && insn_type != TYPE_FCMOV
24854 && insn_type != TYPE_IBR)
24855 return false;
24856
24857 if ((set = single_set (dep_insn)) != 0)
24858 {
24859 set = SET_DEST (set);
24860 set2 = NULL_RTX;
24861 }
24862 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24863 && XVECLEN (PATTERN (dep_insn), 0) == 2
24864 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24865 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24866 {
24867 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24868 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24869 }
24870 else
24871 return false;
24872
24873 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24874 return false;
24875
24876 /* This test is true if the dependent insn reads the flags but
24877 not any other potentially set register. */
24878 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24879 return false;
24880
24881 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24882 return false;
24883
24884 return true;
24885 }
24886
24887 /* Return true iff USE_INSN has a memory address with operands set by
24888 SET_INSN. */
24889
24890 bool
24891 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24892 {
24893 int i;
24894 extract_insn_cached (use_insn);
24895 for (i = recog_data.n_operands - 1; i >= 0; --i)
24896 if (MEM_P (recog_data.operand[i]))
24897 {
24898 rtx addr = XEXP (recog_data.operand[i], 0);
24899 return modified_in_p (addr, set_insn) != 0;
24900 }
24901 return false;
24902 }
24903
24904 /* Helper function for exact_store_load_dependency.
24905 Return true if addr is found in insn. */
24906 static bool
24907 exact_dependency_1 (rtx addr, rtx insn)
24908 {
24909 enum rtx_code code;
24910 const char *format_ptr;
24911 int i, j;
24912
24913 code = GET_CODE (insn);
24914 switch (code)
24915 {
24916 case MEM:
24917 if (rtx_equal_p (addr, insn))
24918 return true;
24919 break;
24920 case REG:
24921 CASE_CONST_ANY:
24922 case SYMBOL_REF:
24923 case CODE_LABEL:
24924 case PC:
24925 case CC0:
24926 case EXPR_LIST:
24927 return false;
24928 default:
24929 break;
24930 }
24931
24932 format_ptr = GET_RTX_FORMAT (code);
24933 for (i = 0; i < GET_RTX_LENGTH (code); i++)
24934 {
24935 switch (*format_ptr++)
24936 {
24937 case 'e':
24938 if (exact_dependency_1 (addr, XEXP (insn, i)))
24939 return true;
24940 break;
24941 case 'E':
24942 for (j = 0; j < XVECLEN (insn, i); j++)
24943 if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
24944 return true;
24945 break;
24946 }
24947 }
24948 return false;
24949 }
24950
24951 /* Return true if there exists exact dependency for store & load, i.e.
24952 the same memory address is used in them. */
24953 static bool
24954 exact_store_load_dependency (rtx store, rtx load)
24955 {
24956 rtx set1, set2;
24957
24958 set1 = single_set (store);
24959 if (!set1)
24960 return false;
24961 if (!MEM_P (SET_DEST (set1)))
24962 return false;
24963 set2 = single_set (load);
24964 if (!set2)
24965 return false;
24966 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
24967 return true;
24968 return false;
24969 }
24970
24971 static int
24972 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24973 {
24974 enum attr_type insn_type, dep_insn_type;
24975 enum attr_memory memory;
24976 rtx set, set2;
24977 int dep_insn_code_number;
24978
24979 /* Anti and output dependencies have zero cost on all CPUs. */
24980 if (REG_NOTE_KIND (link) != 0)
24981 return 0;
24982
24983 dep_insn_code_number = recog_memoized (dep_insn);
24984
24985 /* If we can't recognize the insns, we can't really do anything. */
24986 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24987 return cost;
24988
24989 insn_type = get_attr_type (insn);
24990 dep_insn_type = get_attr_type (dep_insn);
24991
24992 switch (ix86_tune)
24993 {
24994 case PROCESSOR_PENTIUM:
24995 /* Address Generation Interlock adds a cycle of latency. */
24996 if (insn_type == TYPE_LEA)
24997 {
24998 rtx addr = PATTERN (insn);
24999
25000 if (GET_CODE (addr) == PARALLEL)
25001 addr = XVECEXP (addr, 0, 0);
25002
25003 gcc_assert (GET_CODE (addr) == SET);
25004
25005 addr = SET_SRC (addr);
25006 if (modified_in_p (addr, dep_insn))
25007 cost += 1;
25008 }
25009 else if (ix86_agi_dependent (dep_insn, insn))
25010 cost += 1;
25011
25012 /* ??? Compares pair with jump/setcc. */
25013 if (ix86_flags_dependent (insn, dep_insn, insn_type))
25014 cost = 0;
25015
25016 /* Floating point stores require value to be ready one cycle earlier. */
25017 if (insn_type == TYPE_FMOV
25018 && get_attr_memory (insn) == MEMORY_STORE
25019 && !ix86_agi_dependent (dep_insn, insn))
25020 cost += 1;
25021 break;
25022
25023 case PROCESSOR_PENTIUMPRO:
25024 memory = get_attr_memory (insn);
25025
25026 /* INT->FP conversion is expensive. */
25027 if (get_attr_fp_int_src (dep_insn))
25028 cost += 5;
25029
25030 /* There is one cycle extra latency between an FP op and a store. */
25031 if (insn_type == TYPE_FMOV
25032 && (set = single_set (dep_insn)) != NULL_RTX
25033 && (set2 = single_set (insn)) != NULL_RTX
25034 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
25035 && MEM_P (SET_DEST (set2)))
25036 cost += 1;
25037
25038 /* The reorder buffer can hide the latency of a load by executing it
25039 in parallel with the previous instruction, in case the
25040 previous instruction is not needed to compute the address. */
25041 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25042 && !ix86_agi_dependent (dep_insn, insn))
25043 {
25044 /* Claim moves to take one cycle, as the core can issue one load
25045 at a time and the next load can start a cycle later. */
25046 if (dep_insn_type == TYPE_IMOV
25047 || dep_insn_type == TYPE_FMOV)
25048 cost = 1;
25049 else if (cost > 1)
25050 cost--;
25051 }
25052 break;
25053
25054 case PROCESSOR_K6:
25055 memory = get_attr_memory (insn);
25056
25057 /* The esp dependency is resolved before the instruction is really
25058 finished. */
25059 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25060 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25061 return 1;
25062
25063 /* INT->FP conversion is expensive. */
25064 if (get_attr_fp_int_src (dep_insn))
25065 cost += 5;
25066
25067 /* The reorder buffer can hide the latency of a load by executing it
25068 in parallel with the previous instruction, in case the
25069 previous instruction is not needed to compute the address. */
25070 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25071 && !ix86_agi_dependent (dep_insn, insn))
25072 {
25073 /* Claim moves to take one cycle, as the core can issue one load
25074 at a time and the next load can start a cycle later. */
25075 if (dep_insn_type == TYPE_IMOV
25076 || dep_insn_type == TYPE_FMOV)
25077 cost = 1;
25078 else if (cost > 2)
25079 cost -= 2;
25080 else
25081 cost = 1;
25082 }
25083 break;
25084
25085 case PROCESSOR_ATHLON:
25086 case PROCESSOR_K8:
25087 case PROCESSOR_AMDFAM10:
25088 case PROCESSOR_BDVER1:
25089 case PROCESSOR_BDVER2:
25090 case PROCESSOR_BDVER3:
25091 case PROCESSOR_BTVER1:
25092 case PROCESSOR_BTVER2:
25093 case PROCESSOR_GENERIC:
25094 memory = get_attr_memory (insn);
25095
25096 /* The stack engine allows push&pop instructions to execute in parallel. */
25097 if (((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25098 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25099 && (ix86_tune != PROCESSOR_ATHLON && ix86_tune != PROCESSOR_K8))
25100 return 0;
25101
25102 /* The reorder buffer can hide the latency of a load by executing it
25103 in parallel with the previous instruction, in case the
25104 previous instruction is not needed to compute the address. */
25105 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25106 && !ix86_agi_dependent (dep_insn, insn))
25107 {
25108 enum attr_unit unit = get_attr_unit (insn);
25109 int loadcost = 3;
25110
25111 /* Because of the difference between the length of integer and
25112 floating unit pipeline preparation stages, the memory operands
25113 for floating point are cheaper.
25114
25115 ??? For Athlon the difference is most probably 2. */
25116 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
25117 loadcost = 3;
25118 else
25119 loadcost = TARGET_ATHLON ? 2 : 0;
25120
25121 if (cost >= loadcost)
25122 cost -= loadcost;
25123 else
25124 cost = 0;
25125 }
25126 break;
25127
25128 case PROCESSOR_CORE2:
25129 case PROCESSOR_COREI7:
25130 case PROCESSOR_COREI7_AVX:
25131 case PROCESSOR_HASWELL:
25132 memory = get_attr_memory (insn);
25133
25134 /* The stack engine allows push&pop instructions to execute in parallel. */
25135 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
25136 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
25137 return 0;
25138
25139       /* Show the ability of the reorder buffer to hide the latency of a load
25140 	 by executing it in parallel with the previous instruction, when the
25141 	 previous instruction is not needed to compute the address.  */
25142 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25143 && !ix86_agi_dependent (dep_insn, insn))
25144 {
25145 if (cost >= 4)
25146 cost -= 4;
25147 else
25148 cost = 0;
25149 }
25150 break;
25151
25152 case PROCESSOR_SLM:
25153 if (!reload_completed)
25154 return cost;
25155
25156 /* Increase cost of integer loads. */
25157 memory = get_attr_memory (dep_insn);
25158 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
25159 {
25160 enum attr_unit unit = get_attr_unit (dep_insn);
25161 if (unit == UNIT_INTEGER && cost == 1)
25162 {
25163 if (memory == MEMORY_LOAD)
25164 cost = 3;
25165 else
25166 {
25167 		/* Increase the cost of ld/st for short int types only
25168 		   because of the store-forwarding issue.  */
25169 rtx set = single_set (dep_insn);
25170 if (set && (GET_MODE (SET_DEST (set)) == QImode
25171 || GET_MODE (SET_DEST (set)) == HImode))
25172 {
25173 		    /* Increase the cost of the store/load insn if an exact
25174 		       dependence exists and INSN is a load insn.  */
25175 enum attr_memory insn_memory = get_attr_memory (insn);
25176 if (insn_memory == MEMORY_LOAD
25177 && exact_store_load_dependency (dep_insn, insn))
25178 cost = 3;
25179 }
25180 }
25181 }
25182 }
25183
25184 default:
25185 break;
25186 }
25187
25188 return cost;
25189 }
25190
25191 /* How many alternative schedules to try. This should be as wide as the
25192 scheduling freedom in the DFA, but no wider. Making this value too
25193    large results in extra work for the scheduler.  */
25194
25195 static int
25196 ia32_multipass_dfa_lookahead (void)
25197 {
25198 switch (ix86_tune)
25199 {
25200 case PROCESSOR_PENTIUM:
25201 return 2;
25202
25203 case PROCESSOR_PENTIUMPRO:
25204 case PROCESSOR_K6:
25205 return 1;
25206
25207 case PROCESSOR_CORE2:
25208 case PROCESSOR_COREI7:
25209 case PROCESSOR_COREI7_AVX:
25210 case PROCESSOR_HASWELL:
25211 case PROCESSOR_ATOM:
25212 case PROCESSOR_SLM:
25213 /* Generally, we want haifa-sched:max_issue() to look ahead as far
25214 	 as the number of instructions that can be executed in one cycle, i.e.,
25215 issue_rate. I wonder why tuning for many CPUs does not do this. */
25216 if (reload_completed)
25217 return ix86_issue_rate ();
25218 /* Don't use lookahead for pre-reload schedule to save compile time. */
25219 return 0;
25220
25221 default:
25222 return 0;
25223 }
25224 }
25225
25226 /* Return true if target platform supports macro-fusion. */
25227
25228 static bool
25229 ix86_macro_fusion_p ()
25230 {
25231 return TARGET_FUSE_CMP_AND_BRANCH;
25232 }
25233
25234 /* Check whether the current microarchitecture supports macro fusion
25235    for the insn pair "CONDGEN + CONDJMP".  Refer to
25236 "Intel Architectures Optimization Reference Manual". */
25237
25238 static bool
25239 ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
25240 {
25241 rtx src, dest;
25242   rtx set = single_set (condgen);
25243 enum rtx_code ccode;
25244 rtx compare_set = NULL_RTX, test_if, cond;
25245 rtx alu_set = NULL_RTX, addr = NULL_RTX;
25246
25247 if (get_attr_type (condgen) != TYPE_TEST
25248 && get_attr_type (condgen) != TYPE_ICMP
25249 && get_attr_type (condgen) != TYPE_INCDEC
25250 && get_attr_type (condgen) != TYPE_ALU)
25251 return false;
25252
25253   if (set == NULL_RTX
25254 && !TARGET_FUSE_ALU_AND_BRANCH)
25255 return false;
25256
25257   if (set != NULL_RTX)
25258     compare_set = set;
25259 else
25260 {
25261 int i;
25262 rtx pat = PATTERN (condgen);
25263 for (i = 0; i < XVECLEN (pat, 0); i++)
25264 if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
25265 {
25266 rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
25267 if (GET_CODE (set_src) == COMPARE)
25268 compare_set = XVECEXP (pat, 0, i);
25269 else
25270 alu_set = XVECEXP (pat, 0, i);
25271 }
25272 }
25273 if (compare_set == NULL_RTX)
25274 return false;
25275 src = SET_SRC (compare_set);
25276 if (GET_CODE (src) != COMPARE)
25277 return false;
25278
25279 /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
25280 supported. */
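  /* Illustrative examples (inferred from the checks in this function, not
     quoted from the manual): a register form such as "cmpl $1, %eax" + "jne"
     can still be considered for fusion, while the memory-immediate form
     "cmpl $1, (%rsp)" + "jne" is rejected right below.  */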
25281 if ((MEM_P (XEXP (src, 0))
25282 && CONST_INT_P (XEXP (src, 1)))
25283 || (MEM_P (XEXP (src, 1))
25284 && CONST_INT_P (XEXP (src, 0))))
25285 return false;
25286
25287 /* No fusion for RIP-relative address. */
25288 if (MEM_P (XEXP (src, 0)))
25289 addr = XEXP (XEXP (src, 0), 0);
25290 else if (MEM_P (XEXP (src, 1)))
25291 addr = XEXP (XEXP (src, 1), 0);
25292
25293 if (addr) {
25294 ix86_address parts;
25295 int ok = ix86_decompose_address (addr, &parts);
25296 gcc_assert (ok);
25297
25298 if (rip_relative_addr_p (&parts))
25299 return false;
25300 }
25301
25302 test_if = SET_SRC (pc_set (condjmp));
25303 cond = XEXP (test_if, 0);
25304 ccode = GET_CODE (cond);
25305   /* Check whether the conditional jump uses the sign or overflow flags.  */
25306 if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
25307 && (ccode == GE
25308 || ccode == GT
25309 || ccode == LE
25310 || ccode == LT))
25311 return false;
25312
25313 /* Return true for TYPE_TEST and TYPE_ICMP. */
25314 if (get_attr_type (condgen) == TYPE_TEST
25315 || get_attr_type (condgen) == TYPE_ICMP)
25316 return true;
25317
25318   /* The following handles the case of macro-fusion for alu + jmp.  */
25319 if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
25320 return false;
25321
25322 /* No fusion for alu op with memory destination operand. */
25323 dest = SET_DEST (alu_set);
25324 if (MEM_P (dest))
25325 return false;
25326
25327 /* Macro-fusion for inc/dec + unsigned conditional jump is not
25328 supported. */
25329 if (get_attr_type (condgen) == TYPE_INCDEC
25330 && (ccode == GEU
25331 || ccode == GTU
25332 || ccode == LEU
25333 || ccode == LTU))
25334 return false;
25335
25336 return true;
25337 }
25338
25339 /* Try to reorder ready list to take advantage of Atom pipelined IMUL
25340 execution. It is applied if
25341    (1) an IMUL instruction is at the top of the list;
25342    (2) the ready list contains the sole producer of an independent
25343    IMUL instruction.
25344 Return index of IMUL producer if it was found and -1 otherwise. */
25345 static int
25346 do_reorder_for_imul (rtx *ready, int n_ready)
25347 {
25348 rtx insn, set, insn1, insn2;
25349 sd_iterator_def sd_it;
25350 dep_t dep;
25351 int index = -1;
25352 int i;
25353
25354 if (ix86_tune != PROCESSOR_ATOM)
25355 return index;
25356
25357   /* Check that an IMUL instruction is at the top of the ready list.  */
25358 insn = ready[n_ready - 1];
25359 set = single_set (insn);
25360 if (!set)
25361 return index;
25362 if (!(GET_CODE (SET_SRC (set)) == MULT
25363 && GET_MODE (SET_SRC (set)) == SImode))
25364 return index;
25365
25366 /* Search for producer of independent IMUL instruction. */
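  /* An illustrative sketch of what the loop below looks for: a ready insn
     (say, an ADD) whose only SImode-MULT consumer has no producer other than
     that insn; its index is returned so the caller can move it to the top
     and let the two IMULs overlap in Atom's pipelined multiplier.  */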
25367 for (i = n_ready - 2; i >= 0; i--)
25368 {
25369 insn = ready[i];
25370 if (!NONDEBUG_INSN_P (insn))
25371 continue;
25372 /* Skip IMUL instruction. */
25373 insn2 = PATTERN (insn);
25374 if (GET_CODE (insn2) == PARALLEL)
25375 insn2 = XVECEXP (insn2, 0, 0);
25376 if (GET_CODE (insn2) == SET
25377 && GET_CODE (SET_SRC (insn2)) == MULT
25378 && GET_MODE (SET_SRC (insn2)) == SImode)
25379 continue;
25380
25381 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
25382 {
25383 rtx con;
25384 con = DEP_CON (dep);
25385 if (!NONDEBUG_INSN_P (con))
25386 continue;
25387 insn1 = PATTERN (con);
25388 if (GET_CODE (insn1) == PARALLEL)
25389 insn1 = XVECEXP (insn1, 0, 0);
25390
25391 if (GET_CODE (insn1) == SET
25392 && GET_CODE (SET_SRC (insn1)) == MULT
25393 && GET_MODE (SET_SRC (insn1)) == SImode)
25394 {
25395 sd_iterator_def sd_it1;
25396 dep_t dep1;
25397 	      /* Check that there is no other producer for the IMUL.  */
25398 index = i;
25399 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
25400 {
25401 rtx pro;
25402 pro = DEP_PRO (dep1);
25403 if (!NONDEBUG_INSN_P (pro))
25404 continue;
25405 if (pro != insn)
25406 index = -1;
25407 }
25408 if (index >= 0)
25409 break;
25410 }
25411 }
25412 if (index >= 0)
25413 break;
25414 }
25415 return index;
25416 }
25417
25418 /* Try to find the best candidate at the top of the ready list if two insns
25419    have the same priority - the better candidate is the one whose producers
25420    were scheduled earlier.  Applied to Silvermont only.
25421    Return true if the top two insns must be interchanged.  */
25422 static bool
25423 swap_top_of_ready_list (rtx *ready, int n_ready)
25424 {
25425 rtx top = ready[n_ready - 1];
25426 rtx next = ready[n_ready - 2];
25427 rtx set;
25428 sd_iterator_def sd_it;
25429 dep_t dep;
25430 int clock1 = -1;
25431 int clock2 = -1;
25432 #define INSN_TICK(INSN) (HID (INSN)->tick)
25433
25434 if (ix86_tune != PROCESSOR_SLM)
25435 return false;
25436
25437 if (!NONDEBUG_INSN_P (top))
25438 return false;
25439 if (!NONJUMP_INSN_P (top))
25440 return false;
25441 if (!NONDEBUG_INSN_P (next))
25442 return false;
25443 if (!NONJUMP_INSN_P (next))
25444 return false;
25445 set = single_set (top);
25446 if (!set)
25447 return false;
25448 set = single_set (next);
25449 if (!set)
25450 return false;
25451
25452 if (INSN_PRIORITY_KNOWN (top) && INSN_PRIORITY_KNOWN (next))
25453 {
25454 if (INSN_PRIORITY (top) != INSN_PRIORITY (next))
25455 return false;
25456       /* Determine the winner more precisely.  */
25457 FOR_EACH_DEP (top, SD_LIST_RES_BACK, sd_it, dep)
25458 {
25459 rtx pro;
25460 pro = DEP_PRO (dep);
25461 if (!NONDEBUG_INSN_P (pro))
25462 continue;
25463 if (INSN_TICK (pro) > clock1)
25464 clock1 = INSN_TICK (pro);
25465 }
25466 FOR_EACH_DEP (next, SD_LIST_RES_BACK, sd_it, dep)
25467 {
25468 rtx pro;
25469 pro = DEP_PRO (dep);
25470 if (!NONDEBUG_INSN_P (pro))
25471 continue;
25472 if (INSN_TICK (pro) > clock2)
25473 clock2 = INSN_TICK (pro);
25474 }
25475
25476 if (clock1 == clock2)
25477 {
25478 /* Determine winner - load must win. */
25479 enum attr_memory memory1, memory2;
25480 memory1 = get_attr_memory (top);
25481 memory2 = get_attr_memory (next);
25482 if (memory2 == MEMORY_LOAD && memory1 != MEMORY_LOAD)
25483 return true;
25484 }
25485 return (bool) (clock2 < clock1);
25486 }
25487 return false;
25488 #undef INSN_TICK
25489 }
25490
25491 /* Perform possible reordering of the ready list, for Atom/Silvermont only.
25492 Return issue rate. */
25493 static int
25494 ix86_sched_reorder (FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
25495 int clock_var)
25496 {
25497 int issue_rate = -1;
25498 int n_ready = *pn_ready;
25499 int i;
25500 rtx insn;
25501 int index = -1;
25502
25503 /* Set up issue rate. */
25504 issue_rate = ix86_issue_rate ();
25505
25506   /* Do reordering for Atom/SLM only.  */
25507 if (ix86_tune != PROCESSOR_ATOM && ix86_tune != PROCESSOR_SLM)
25508 return issue_rate;
25509
25510 /* Nothing to do if ready list contains only 1 instruction. */
25511 if (n_ready <= 1)
25512 return issue_rate;
25513
25514   /* Do reordering for the post-reload scheduler only.  */
25515 if (!reload_completed)
25516 return issue_rate;
25517
25518 if ((index = do_reorder_for_imul (ready, n_ready)) >= 0)
25519 {
25520 if (sched_verbose > 1)
25521 fprintf (dump, ";;\tatom sched_reorder: put %d insn on top\n",
25522 INSN_UID (ready[index]));
25523
25524 /* Put IMUL producer (ready[index]) at the top of ready list. */
25525 insn = ready[index];
25526 for (i = index; i < n_ready - 1; i++)
25527 ready[i] = ready[i + 1];
25528 ready[n_ready - 1] = insn;
25529 return issue_rate;
25530 }
25531 if (clock_var != 0 && swap_top_of_ready_list (ready, n_ready))
25532 {
25533 if (sched_verbose > 1)
25534 fprintf (dump, ";;\tslm sched_reorder: swap %d and %d insns\n",
25535 INSN_UID (ready[n_ready - 1]), INSN_UID (ready[n_ready - 2]));
25536 /* Swap 2 top elements of ready list. */
25537 insn = ready[n_ready - 1];
25538 ready[n_ready - 1] = ready[n_ready - 2];
25539 ready[n_ready - 2] = insn;
25540 }
25541 return issue_rate;
25542 }
25543
25544 static bool
25545 ix86_class_likely_spilled_p (reg_class_t);
25546
25547 /* Return true if the lhs of INSN is a HW function argument register, and
25548    set IS_SPILLED to true if it is a likely-spilled HW register.  */
25549 static bool
25550 insn_is_function_arg (rtx insn, bool* is_spilled)
25551 {
25552 rtx dst;
25553
25554 if (!NONDEBUG_INSN_P (insn))
25555 return false;
25556   /* Call instructions are not movable; ignore them.  */
25557 if (CALL_P (insn))
25558 return false;
25559 insn = PATTERN (insn);
25560 if (GET_CODE (insn) == PARALLEL)
25561 insn = XVECEXP (insn, 0, 0);
25562 if (GET_CODE (insn) != SET)
25563 return false;
25564 dst = SET_DEST (insn);
25565 if (REG_P (dst) && HARD_REGISTER_P (dst)
25566 && ix86_function_arg_regno_p (REGNO (dst)))
25567 {
25568 /* Is it likely spilled HW register? */
25569 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
25570 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
25571 *is_spilled = true;
25572 return true;
25573 }
25574 return false;
25575 }
25576
25577 /* Add output dependencies for a chain of adjacent function arguments, but
25578    only if there is a move to a likely-spilled HW register.  Return the first
25579    argument if at least one dependence was added, or NULL otherwise.  */
25580 static rtx
25581 add_parameter_dependencies (rtx call, rtx head)
25582 {
25583 rtx insn;
25584 rtx last = call;
25585 rtx first_arg = NULL;
25586 bool is_spilled = false;
25587
25588 head = PREV_INSN (head);
25589
25590   /* Find the argument-passing instruction nearest to the call.  */
25591 while (true)
25592 {
25593 last = PREV_INSN (last);
25594 if (last == head)
25595 return NULL;
25596 if (!NONDEBUG_INSN_P (last))
25597 continue;
25598 if (insn_is_function_arg (last, &is_spilled))
25599 break;
25600 return NULL;
25601 }
25602
25603 first_arg = last;
25604 while (true)
25605 {
25606 insn = PREV_INSN (last);
25607 if (!INSN_P (insn))
25608 break;
25609 if (insn == head)
25610 break;
25611 if (!NONDEBUG_INSN_P (insn))
25612 {
25613 last = insn;
25614 continue;
25615 }
25616 if (insn_is_function_arg (insn, &is_spilled))
25617 {
25618 	  /* Add an output dependence between two function arguments if the
25619 	     chain of output arguments contains likely-spilled HW registers.  */
25620 if (is_spilled)
25621 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25622 first_arg = last = insn;
25623 }
25624 else
25625 break;
25626 }
25627 if (!is_spilled)
25628 return NULL;
25629 return first_arg;
25630 }
25631
25632 /* Add output or anti dependency from insn to first_arg to restrict its code
25633 motion. */
25634 static void
25635 avoid_func_arg_motion (rtx first_arg, rtx insn)
25636 {
25637 rtx set;
25638 rtx tmp;
25639
25640 set = single_set (insn);
25641 if (!set)
25642 return;
25643 tmp = SET_DEST (set);
25644 if (REG_P (tmp))
25645 {
25646 /* Add output dependency to the first function argument. */
25647 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
25648 return;
25649 }
25650 /* Add anti dependency. */
25651 add_dependence (first_arg, insn, REG_DEP_ANTI);
25652 }
25653
25654 /* Avoid cross-block motion of a function argument by adding a dependency
25655    from the first non-jump instruction in BB.  */
25656 static void
25657 add_dependee_for_func_arg (rtx arg, basic_block bb)
25658 {
25659 rtx insn = BB_END (bb);
25660
25661 while (insn)
25662 {
25663 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
25664 {
25665 rtx set = single_set (insn);
25666 if (set)
25667 {
25668 avoid_func_arg_motion (arg, insn);
25669 return;
25670 }
25671 }
25672 if (insn == BB_HEAD (bb))
25673 return;
25674 insn = PREV_INSN (insn);
25675 }
25676 }
25677
25678 /* Hook for pre-reload schedule - avoid motion of function arguments
25679 passed in likely spilled HW registers. */
25680 static void
25681 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
25682 {
25683 rtx insn;
25684 rtx first_arg = NULL;
25685 if (reload_completed)
25686 return;
25687 while (head != tail && DEBUG_INSN_P (head))
25688 head = NEXT_INSN (head);
25689 for (insn = tail; insn != head; insn = PREV_INSN (insn))
25690 if (INSN_P (insn) && CALL_P (insn))
25691 {
25692 first_arg = add_parameter_dependencies (insn, head);
25693 if (first_arg)
25694 {
25695 	  /* Add a dependee for the first argument to predecessors, but only
25696 	     if the region contains more than one block.  */
25697 basic_block bb = BLOCK_FOR_INSN (insn);
25698 int rgn = CONTAINING_RGN (bb->index);
25699 int nr_blks = RGN_NR_BLOCKS (rgn);
25700 /* Skip trivial regions and region head blocks that can have
25701 predecessors outside of region. */
25702 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
25703 {
25704 edge e;
25705 edge_iterator ei;
25706 /* Assume that region is SCC, i.e. all immediate predecessors
25707 of non-head block are in the same region. */
25708 FOR_EACH_EDGE (e, ei, bb->preds)
25709 {
25710 		    /* Avoid creating loop-carried dependencies by using the
25711 		       topological ordering in the region.  */
25712 if (BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
25713 add_dependee_for_func_arg (first_arg, e->src);
25714 }
25715 }
25716 insn = first_arg;
25717 if (insn == head)
25718 break;
25719 }
25720 }
25721 else if (first_arg)
25722 avoid_func_arg_motion (first_arg, insn);
25723 }
25724
25725 /* Hook for pre-reload schedule - set priority of moves from likely spilled
25726    HW registers to maximum, to schedule them as soon as possible.  These are
25727 moves from function argument registers at the top of the function entry
25728 and moves from function return value registers after call. */
25729 static int
25730 ix86_adjust_priority (rtx insn, int priority)
25731 {
25732 rtx set;
25733
25734 if (reload_completed)
25735 return priority;
25736
25737 if (!NONDEBUG_INSN_P (insn))
25738 return priority;
25739
25740 set = single_set (insn);
25741 if (set)
25742 {
25743 rtx tmp = SET_SRC (set);
25744 if (REG_P (tmp)
25745 && HARD_REGISTER_P (tmp)
25746 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
25747 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
25748 return current_sched_info->sched_max_insns_priority;
25749 }
25750
25751 return priority;
25752 }
25753
25754 /* Model decoder of Core 2/i7.
25755    The hooks below, for multipass scheduling (see haifa-sched.c:max_issue),
25756 track the instruction fetch block boundaries and make sure that long
25757 (9+ bytes) instructions are assigned to D0. */
25758
25759 /* Maximum length of an insn that can be handled by
25760 a secondary decoder unit. '8' for Core 2/i7. */
25761 static int core2i7_secondary_decoder_max_insn_size;
25762
25763 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
25764 '16' for Core 2/i7. */
25765 static int core2i7_ifetch_block_size;
25766
25767 /* Maximum number of instructions decoder can handle per cycle.
25768 '6' for Core 2/i7. */
25769 static int core2i7_ifetch_block_max_insns;
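/* Illustration of the model above (an inference from the filter below, not a
   statement about the hardware): with a 16-byte ifetch block, three 5-byte
   insns fit in one cycle but a fourth does not; an insn longer than 8 bytes
   is only accepted as the first insn of a decode group, matching the D0
   assignment mentioned above.  */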
25770
25771 typedef struct ix86_first_cycle_multipass_data_ *
25772 ix86_first_cycle_multipass_data_t;
25773 typedef const struct ix86_first_cycle_multipass_data_ *
25774 const_ix86_first_cycle_multipass_data_t;
25775
25776 /* A variable to store target state across calls to max_issue within
25777 one cycle. */
25778 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
25779 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
25780
25781 /* Initialize DATA. */
25782 static void
25783 core2i7_first_cycle_multipass_init (void *_data)
25784 {
25785 ix86_first_cycle_multipass_data_t data
25786 = (ix86_first_cycle_multipass_data_t) _data;
25787
25788 data->ifetch_block_len = 0;
25789 data->ifetch_block_n_insns = 0;
25790 data->ready_try_change = NULL;
25791 data->ready_try_change_size = 0;
25792 }
25793
25794 /* Advancing the cycle; reset ifetch block counts. */
25795 static void
25796 core2i7_dfa_post_advance_cycle (void)
25797 {
25798 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
25799
25800 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25801
25802 data->ifetch_block_len = 0;
25803 data->ifetch_block_n_insns = 0;
25804 }
25805
25806 static int min_insn_size (rtx);
25807
25808 /* Filter out insns from ready_try that the core will not be able to issue
25809    on the current cycle due to decoder restrictions.  */
25810 static void
25811 core2i7_first_cycle_multipass_filter_ready_try
25812 (const_ix86_first_cycle_multipass_data_t data,
25813 char *ready_try, int n_ready, bool first_cycle_insn_p)
25814 {
25815 while (n_ready--)
25816 {
25817 rtx insn;
25818 int insn_size;
25819
25820 if (ready_try[n_ready])
25821 continue;
25822
25823 insn = get_ready_element (n_ready);
25824 insn_size = min_insn_size (insn);
25825
25826       if (/* If this insn is too long for a secondary decoder ... */
25827 (!first_cycle_insn_p
25828 && insn_size > core2i7_secondary_decoder_max_insn_size)
25829 /* ... or it would not fit into the ifetch block ... */
25830 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
25831 /* ... or the decoder is full already ... */
25832 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
25833 /* ... mask the insn out. */
25834 {
25835 ready_try[n_ready] = 1;
25836
25837 if (data->ready_try_change)
25838 bitmap_set_bit (data->ready_try_change, n_ready);
25839 }
25840 }
25841 }
25842
25843 /* Prepare for a new round of multipass lookahead scheduling. */
25844 static void
25845 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
25846 bool first_cycle_insn_p)
25847 {
25848 ix86_first_cycle_multipass_data_t data
25849 = (ix86_first_cycle_multipass_data_t) _data;
25850 const_ix86_first_cycle_multipass_data_t prev_data
25851 = ix86_first_cycle_multipass_data;
25852
25853 /* Restore the state from the end of the previous round. */
25854 data->ifetch_block_len = prev_data->ifetch_block_len;
25855 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
25856
25857 /* Filter instructions that cannot be issued on current cycle due to
25858 decoder restrictions. */
25859 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25860 first_cycle_insn_p);
25861 }
25862
25863 /* INSN is being issued in current solution. Account for its impact on
25864 the decoder model. */
25865 static void
25866 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
25867 rtx insn, const void *_prev_data)
25868 {
25869 ix86_first_cycle_multipass_data_t data
25870 = (ix86_first_cycle_multipass_data_t) _data;
25871 const_ix86_first_cycle_multipass_data_t prev_data
25872 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
25873
25874 int insn_size = min_insn_size (insn);
25875
25876 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
25877 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
25878 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
25879 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
25880
25881 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
25882 if (!data->ready_try_change)
25883 {
25884 data->ready_try_change = sbitmap_alloc (n_ready);
25885 data->ready_try_change_size = n_ready;
25886 }
25887 else if (data->ready_try_change_size < n_ready)
25888 {
25889 data->ready_try_change = sbitmap_resize (data->ready_try_change,
25890 n_ready, 0);
25891 data->ready_try_change_size = n_ready;
25892 }
25893 bitmap_clear (data->ready_try_change);
25894
25895 /* Filter out insns from ready_try that the core will not be able to issue
25896      on the current cycle due to decoder restrictions.  */
25897 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
25898 false);
25899 }
25900
25901 /* Revert the effect on ready_try. */
25902 static void
25903 core2i7_first_cycle_multipass_backtrack (const void *_data,
25904 char *ready_try,
25905 int n_ready ATTRIBUTE_UNUSED)
25906 {
25907 const_ix86_first_cycle_multipass_data_t data
25908 = (const_ix86_first_cycle_multipass_data_t) _data;
25909 unsigned int i = 0;
25910 sbitmap_iterator sbi;
25911
25912 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
25913 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
25914 {
25915 ready_try[i] = 0;
25916 }
25917 }
25918
25919 /* Save the result of multipass lookahead scheduling for the next round. */
25920 static void
25921 core2i7_first_cycle_multipass_end (const void *_data)
25922 {
25923 const_ix86_first_cycle_multipass_data_t data
25924 = (const_ix86_first_cycle_multipass_data_t) _data;
25925 ix86_first_cycle_multipass_data_t next_data
25926 = ix86_first_cycle_multipass_data;
25927
25928 if (data != NULL)
25929 {
25930 next_data->ifetch_block_len = data->ifetch_block_len;
25931 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
25932 }
25933 }
25934
25935 /* Deallocate target data. */
25936 static void
25937 core2i7_first_cycle_multipass_fini (void *_data)
25938 {
25939 ix86_first_cycle_multipass_data_t data
25940 = (ix86_first_cycle_multipass_data_t) _data;
25941
25942 if (data->ready_try_change)
25943 {
25944 sbitmap_free (data->ready_try_change);
25945 data->ready_try_change = NULL;
25946 data->ready_try_change_size = 0;
25947 }
25948 }
25949
25950 /* Prepare for scheduling pass. */
25951 static void
25952 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
25953 int verbose ATTRIBUTE_UNUSED,
25954 int max_uid ATTRIBUTE_UNUSED)
25955 {
25956 /* Install scheduling hooks for current CPU. Some of these hooks are used
25957 in time-critical parts of the scheduler, so we only set them up when
25958 they are actually used. */
25959 switch (ix86_tune)
25960 {
25961 case PROCESSOR_CORE2:
25962 case PROCESSOR_COREI7:
25963 case PROCESSOR_COREI7_AVX:
25964 case PROCESSOR_HASWELL:
25965 /* Do not perform multipass scheduling for pre-reload schedule
25966 to save compile time. */
25967 if (reload_completed)
25968 {
25969 targetm.sched.dfa_post_advance_cycle
25970 = core2i7_dfa_post_advance_cycle;
25971 targetm.sched.first_cycle_multipass_init
25972 = core2i7_first_cycle_multipass_init;
25973 targetm.sched.first_cycle_multipass_begin
25974 = core2i7_first_cycle_multipass_begin;
25975 targetm.sched.first_cycle_multipass_issue
25976 = core2i7_first_cycle_multipass_issue;
25977 targetm.sched.first_cycle_multipass_backtrack
25978 = core2i7_first_cycle_multipass_backtrack;
25979 targetm.sched.first_cycle_multipass_end
25980 = core2i7_first_cycle_multipass_end;
25981 targetm.sched.first_cycle_multipass_fini
25982 = core2i7_first_cycle_multipass_fini;
25983
25984 /* Set decoder parameters. */
25985 core2i7_secondary_decoder_max_insn_size = 8;
25986 core2i7_ifetch_block_size = 16;
25987 core2i7_ifetch_block_max_insns = 6;
25988 break;
25989 }
25990 /* ... Fall through ... */
25991 default:
25992 targetm.sched.dfa_post_advance_cycle = NULL;
25993 targetm.sched.first_cycle_multipass_init = NULL;
25994 targetm.sched.first_cycle_multipass_begin = NULL;
25995 targetm.sched.first_cycle_multipass_issue = NULL;
25996 targetm.sched.first_cycle_multipass_backtrack = NULL;
25997 targetm.sched.first_cycle_multipass_end = NULL;
25998 targetm.sched.first_cycle_multipass_fini = NULL;
25999 break;
26000 }
26001 }
26002
26003 \f
26004 /* Compute the alignment given to a constant that is being placed in memory.
26005 EXP is the constant and ALIGN is the alignment that the object would
26006 ordinarily have.
26007 The value of this function is used instead of that alignment to align
26008 the object. */
26009
26010 int
26011 ix86_constant_alignment (tree exp, int align)
26012 {
26013 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
26014 || TREE_CODE (exp) == INTEGER_CST)
26015 {
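      /* For instance, a 'double' (DFmode) constant that would otherwise get
	 only 32-bit alignment is promoted to 64-bit alignment here.  */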
26016 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
26017 return 64;
26018 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
26019 return 128;
26020 }
26021 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
26022 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
26023 return BITS_PER_WORD;
26024
26025 return align;
26026 }
26027
26028 /* Compute the alignment for a static variable.
26029 TYPE is the data type, and ALIGN is the alignment that
26030 the object would ordinarily have. The value of this function is used
26031 instead of that alignment to align the object. */
26032
26033 int
26034 ix86_data_alignment (tree type, int align, bool opt)
26035 {
26036 int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
26037
26038 if (opt
26039 && AGGREGATE_TYPE_P (type)
26040 && TYPE_SIZE (type)
26041 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26042 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
26043 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
26044 && align < max_align)
26045 align = max_align;
26046
26047   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26048      to a 16-byte boundary.  */
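  /* For example, the check below gives a 100-byte global char array at least
     16-byte alignment on x86-64 (an illustrative case, not wording taken
     from the ABI document).  */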
26049 if (TARGET_64BIT)
26050 {
26051 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
26052 && TYPE_SIZE (type)
26053 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26054 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
26055 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26056 return 128;
26057 }
26058
26059 if (!opt)
26060 return align;
26061
26062 if (TREE_CODE (type) == ARRAY_TYPE)
26063 {
26064 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26065 return 64;
26066 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26067 return 128;
26068 }
26069 else if (TREE_CODE (type) == COMPLEX_TYPE)
26070 {
26071
26072 if (TYPE_MODE (type) == DCmode && align < 64)
26073 return 64;
26074 if ((TYPE_MODE (type) == XCmode
26075 || TYPE_MODE (type) == TCmode) && align < 128)
26076 return 128;
26077 }
26078 else if ((TREE_CODE (type) == RECORD_TYPE
26079 || TREE_CODE (type) == UNION_TYPE
26080 || TREE_CODE (type) == QUAL_UNION_TYPE)
26081 && TYPE_FIELDS (type))
26082 {
26083 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26084 return 64;
26085 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26086 return 128;
26087 }
26088 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26089 || TREE_CODE (type) == INTEGER_TYPE)
26090 {
26091 if (TYPE_MODE (type) == DFmode && align < 64)
26092 return 64;
26093 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26094 return 128;
26095 }
26096
26097 return align;
26098 }
26099
26100 /* Compute the alignment for a local variable or a stack slot. EXP is
26101 the data type or decl itself, MODE is the widest mode available and
26102 ALIGN is the alignment that the object would ordinarily have. The
26103 value of this macro is used instead of that alignment to align the
26104 object. */
26105
26106 unsigned int
26107 ix86_local_alignment (tree exp, enum machine_mode mode,
26108 unsigned int align)
26109 {
26110 tree type, decl;
26111
26112 if (exp && DECL_P (exp))
26113 {
26114 type = TREE_TYPE (exp);
26115 decl = exp;
26116 }
26117 else
26118 {
26119 type = exp;
26120 decl = NULL;
26121 }
26122
26123 /* Don't do dynamic stack realignment for long long objects with
26124 -mpreferred-stack-boundary=2. */
26125 if (!TARGET_64BIT
26126 && align == 64
26127 && ix86_preferred_stack_boundary < 64
26128 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
26129 && (!type || !TYPE_USER_ALIGN (type))
26130 && (!decl || !DECL_USER_ALIGN (decl)))
26131 align = 32;
26132
26133 /* If TYPE is NULL, we are allocating a stack slot for caller-save
26134 register in MODE. We will return the largest alignment of XF
26135 and DF. */
26136 if (!type)
26137 {
26138 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
26139 align = GET_MODE_ALIGNMENT (DFmode);
26140 return align;
26141 }
26142
26143   /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
26144      to a 16-byte boundary.  Exact wording is:
26145
26146 An array uses the same alignment as its elements, except that a local or
26147 global array variable of length at least 16 bytes or
26148 a C99 variable-length array variable always has alignment of at least 16 bytes.
26149
26150      This was added to allow use of aligned SSE instructions on arrays.  This
26151      rule is meant for static storage (where the compiler cannot do the
26152      analysis by itself).  We follow it for automatic variables only when
26153      convenient.  We fully control everything in the function being compiled,
26154      and functions from other units cannot rely on the alignment.
26155
26156      Exclude the va_list type.  It is a common case of a local array where
26157      we cannot benefit from the alignment.
26158
26159 TODO: Probably one should optimize for size only when var is not escaping. */
26160 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
26161 && TARGET_SSE)
26162 {
26163 if (AGGREGATE_TYPE_P (type)
26164 && (va_list_type_node == NULL_TREE
26165 || (TYPE_MAIN_VARIANT (type)
26166 != TYPE_MAIN_VARIANT (va_list_type_node)))
26167 && TYPE_SIZE (type)
26168 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
26169 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
26170 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
26171 return 128;
26172 }
26173 if (TREE_CODE (type) == ARRAY_TYPE)
26174 {
26175 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
26176 return 64;
26177 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
26178 return 128;
26179 }
26180 else if (TREE_CODE (type) == COMPLEX_TYPE)
26181 {
26182 if (TYPE_MODE (type) == DCmode && align < 64)
26183 return 64;
26184 if ((TYPE_MODE (type) == XCmode
26185 || TYPE_MODE (type) == TCmode) && align < 128)
26186 return 128;
26187 }
26188 else if ((TREE_CODE (type) == RECORD_TYPE
26189 || TREE_CODE (type) == UNION_TYPE
26190 || TREE_CODE (type) == QUAL_UNION_TYPE)
26191 && TYPE_FIELDS (type))
26192 {
26193 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
26194 return 64;
26195 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
26196 return 128;
26197 }
26198 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
26199 || TREE_CODE (type) == INTEGER_TYPE)
26200 {
26201
26202 if (TYPE_MODE (type) == DFmode && align < 64)
26203 return 64;
26204 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
26205 return 128;
26206 }
26207 return align;
26208 }
26209
26210 /* Compute the minimum required alignment for dynamic stack realignment
26211 purposes for a local variable, parameter or a stack slot. EXP is
26212 the data type or decl itself, MODE is its mode and ALIGN is the
26213 alignment that the object would ordinarily have. */
26214
26215 unsigned int
26216 ix86_minimum_alignment (tree exp, enum machine_mode mode,
26217 unsigned int align)
26218 {
26219 tree type, decl;
26220
26221 if (exp && DECL_P (exp))
26222 {
26223 type = TREE_TYPE (exp);
26224 decl = exp;
26225 }
26226 else
26227 {
26228 type = exp;
26229 decl = NULL;
26230 }
26231
26232 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
26233 return align;
26234
26235 /* Don't do dynamic stack realignment for long long objects with
26236 -mpreferred-stack-boundary=2. */
26237 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
26238 && (!type || !TYPE_USER_ALIGN (type))
26239 && (!decl || !DECL_USER_ALIGN (decl)))
26240 return 32;
26241
26242 return align;
26243 }
26244 \f
26245 /* Find a location for the static chain incoming to a nested function.
26246 This is a register, unless all free registers are used by arguments. */
26247
26248 static rtx
26249 ix86_static_chain (const_tree fndecl, bool incoming_p)
26250 {
26251 unsigned regno;
26252
26253 if (!DECL_STATIC_CHAIN (fndecl))
26254 return NULL;
26255
26256 if (TARGET_64BIT)
26257 {
26258 /* We always use R10 in 64-bit mode. */
26259 regno = R10_REG;
26260 }
26261 else
26262 {
26263 tree fntype;
26264 unsigned int ccvt;
26265
26266 /* By default in 32-bit mode we use ECX to pass the static chain. */
26267 regno = CX_REG;
26268
26269 fntype = TREE_TYPE (fndecl);
26270 ccvt = ix86_get_callcvt (fntype);
26271 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
26272 {
26273 /* Fastcall functions use ecx/edx for arguments, which leaves
26274 us with EAX for the static chain.
26275 Thiscall functions use ecx for arguments, which also
26276 leaves us with EAX for the static chain. */
26277 regno = AX_REG;
26278 }
26279 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
26280 {
26281 /* Thiscall functions use ecx for arguments, which leaves
26282 us with EAX and EDX for the static chain.
26283 	     We use EAX for ABI compatibility.  */
26284 regno = AX_REG;
26285 }
26286 else if (ix86_function_regparm (fntype, fndecl) == 3)
26287 {
26288 /* For regparm 3, we have no free call-clobbered registers in
26289 which to store the static chain. In order to implement this,
26290 we have the trampoline push the static chain to the stack.
26291 However, we can't push a value below the return address when
26292 we call the nested function directly, so we have to use an
26293 alternate entry point. For this we use ESI, and have the
26294 alternate entry point push ESI, so that things appear the
26295 same once we're executing the nested function. */
26296 if (incoming_p)
26297 {
26298 if (fndecl == current_function_decl)
26299 ix86_static_chain_on_stack = true;
26300 return gen_frame_mem (SImode,
26301 plus_constant (Pmode,
26302 arg_pointer_rtx, -8));
26303 }
26304 regno = SI_REG;
26305 }
26306 }
26307
26308 return gen_rtx_REG (Pmode, regno);
26309 }
26310
26311 /* Emit RTL insns to initialize the variable parts of a trampoline.
26312 FNDECL is the decl of the target address; M_TRAMP is a MEM for
26313 the trampoline, and CHAIN_VALUE is an RTX for the static chain
26314 to be passed to the target function. */
26315
26316 static void
26317 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
26318 {
26319 rtx mem, fnaddr;
26320 int opcode;
26321 int offset = 0;
26322
26323 fnaddr = XEXP (DECL_RTL (fndecl), 0);
26324
26325 if (TARGET_64BIT)
26326 {
26327 int size;
26328
26329       /* Load the function address into r11.  Try to load the address
26330 	 using the shorter movl instead of movabs.  We may want to support
26331 	 movq for kernel mode, but the kernel does not use trampolines at
26332 	 the moment.  FNADDR is a 32-bit address and may not be in
26333 DImode when ptr_mode == SImode. Always use movl in this
26334 case. */
26335 if (ptr_mode == SImode
26336 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
26337 {
26338 fnaddr = copy_addr_to_reg (fnaddr);
26339
26340 mem = adjust_address (m_tramp, HImode, offset);
26341 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
26342
26343 mem = adjust_address (m_tramp, SImode, offset + 2);
26344 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
26345 offset += 6;
26346 }
26347 else
26348 {
26349 mem = adjust_address (m_tramp, HImode, offset);
26350 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
26351
26352 mem = adjust_address (m_tramp, DImode, offset + 2);
26353 emit_move_insn (mem, fnaddr);
26354 offset += 10;
26355 }
26356
26357 /* Load static chain using movabs to r10. Use the shorter movl
26358 instead of movabs when ptr_mode == SImode. */
26359 if (ptr_mode == SImode)
26360 {
26361 opcode = 0xba41;
26362 size = 6;
26363 }
26364 else
26365 {
26366 opcode = 0xba49;
26367 size = 10;
26368 }
26369
26370 mem = adjust_address (m_tramp, HImode, offset);
26371 emit_move_insn (mem, gen_int_mode (opcode, HImode));
26372
26373 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
26374 emit_move_insn (mem, chain_value);
26375 offset += size;
26376
26377 /* Jump to r11; the last (unused) byte is a nop, only there to
26378 pad the write out to a single 32-bit store. */
26379 mem = adjust_address (m_tramp, SImode, offset);
26380 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
26381 offset += 4;
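      /* A sketch of the bytes emitted above when movabs is used
	 (ptr_mode == DImode):
	   49 bb <imm64>   movabs $fnaddr, %r11
	   49 ba <imm64>   movabs $chain,  %r10
	   49 ff e3        jmp    *%r11
	   90              nop (pads the final 32-bit store)  */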
26382 }
26383 else
26384 {
26385 rtx disp, chain;
26386
26387 /* Depending on the static chain location, either load a register
26388 with a constant, or push the constant to the stack. All of the
26389 instructions are the same size. */
26390 chain = ix86_static_chain (fndecl, true);
26391 if (REG_P (chain))
26392 {
26393 switch (REGNO (chain))
26394 {
26395 case AX_REG:
26396 opcode = 0xb8; break;
26397 case CX_REG:
26398 opcode = 0xb9; break;
26399 default:
26400 gcc_unreachable ();
26401 }
26402 }
26403 else
26404 opcode = 0x68;
26405
26406 mem = adjust_address (m_tramp, QImode, offset);
26407 emit_move_insn (mem, gen_int_mode (opcode, QImode));
26408
26409 mem = adjust_address (m_tramp, SImode, offset + 1);
26410 emit_move_insn (mem, chain_value);
26411 offset += 5;
26412
26413 mem = adjust_address (m_tramp, QImode, offset);
26414 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
26415
26416 mem = adjust_address (m_tramp, SImode, offset + 1);
26417
26418 /* Compute offset from the end of the jmp to the target function.
26419 In the case in which the trampoline stores the static chain on
26420 the stack, we need to skip the first insn which pushes the
26421 (call-saved) register static chain; this push is 1 byte. */
26422 offset += 5;
26423 disp = expand_binop (SImode, sub_optab, fnaddr,
26424 plus_constant (Pmode, XEXP (m_tramp, 0),
26425 offset - (MEM_P (chain) ? 1 : 0)),
26426 NULL_RTX, 1, OPTAB_DIRECT);
26427 emit_move_insn (mem, disp);
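      /* A sketch of the 10 bytes emitted above:
	   b9 <imm32>   movl $chain, %ecx   (or b8 for %eax, 68 for pushl)
	   e9 <rel32>   jmp  <fnaddr>  */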
26428 }
26429
26430 gcc_assert (offset <= TRAMPOLINE_SIZE);
26431
26432 #ifdef HAVE_ENABLE_EXECUTE_STACK
26433 #ifdef CHECK_EXECUTE_STACK_ENABLED
26434 if (CHECK_EXECUTE_STACK_ENABLED)
26435 #endif
26436 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
26437 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
26438 #endif
26439 }
26440 \f
26441 /* The following file contains several enumerations and data structures
26442 built from the definitions in i386-builtin-types.def. */
26443
26444 #include "i386-builtin-types.inc"
26445
26446 /* Table for the ix86 builtin non-function types. */
26447 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
26448
26449 /* Retrieve an element from the above table, building some of
26450 the types lazily. */
26451
26452 static tree
26453 ix86_get_builtin_type (enum ix86_builtin_type tcode)
26454 {
26455 unsigned int index;
26456 tree type, itype;
26457
26458 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
26459
26460 type = ix86_builtin_type_tab[(int) tcode];
26461 if (type != NULL)
26462 return type;
26463
26464 gcc_assert (tcode > IX86_BT_LAST_PRIM);
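  /* The type codes are laid out as primitives, then vector types, then plain
     pointer types, then const-qualified pointer types; the index arithmetic
     below relies on that layout (inferred from the checks in this function).  */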
26465 if (tcode <= IX86_BT_LAST_VECT)
26466 {
26467 enum machine_mode mode;
26468
26469 index = tcode - IX86_BT_LAST_PRIM - 1;
26470 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
26471 mode = ix86_builtin_type_vect_mode[index];
26472
26473 type = build_vector_type_for_mode (itype, mode);
26474 }
26475 else
26476 {
26477 int quals;
26478
26479 index = tcode - IX86_BT_LAST_VECT - 1;
26480 if (tcode <= IX86_BT_LAST_PTR)
26481 quals = TYPE_UNQUALIFIED;
26482 else
26483 quals = TYPE_QUAL_CONST;
26484
26485 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
26486 if (quals != TYPE_UNQUALIFIED)
26487 itype = build_qualified_type (itype, quals);
26488
26489 type = build_pointer_type (itype);
26490 }
26491
26492 ix86_builtin_type_tab[(int) tcode] = type;
26493 return type;
26494 }
26495
26496 /* Table for the ix86 builtin function types. */
26497 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
26498
26499 /* Retrieve an element from the above table, building some of
26500 the types lazily. */
26501
26502 static tree
26503 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
26504 {
26505 tree type;
26506
26507 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
26508
26509 type = ix86_builtin_func_type_tab[(int) tcode];
26510 if (type != NULL)
26511 return type;
26512
26513 if (tcode <= IX86_BT_LAST_FUNC)
26514 {
26515 unsigned start = ix86_builtin_func_start[(int) tcode];
26516 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
26517 tree rtype, atype, args = void_list_node;
26518 unsigned i;
26519
26520 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
26521 for (i = after - 1; i > start; --i)
26522 {
26523 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
26524 args = tree_cons (NULL, atype, args);
26525 }
26526
26527 type = build_function_type (rtype, args);
26528 }
26529 else
26530 {
26531 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
26532 enum ix86_builtin_func_type icode;
26533
26534 icode = ix86_builtin_func_alias_base[index];
26535 type = ix86_get_builtin_func_type (icode);
26536 }
26537
26538 ix86_builtin_func_type_tab[(int) tcode] = type;
26539 return type;
26540 }
26541
26542
26543 /* Codes for all the SSE/MMX builtins. */
26544 enum ix86_builtins
26545 {
26546 IX86_BUILTIN_ADDPS,
26547 IX86_BUILTIN_ADDSS,
26548 IX86_BUILTIN_DIVPS,
26549 IX86_BUILTIN_DIVSS,
26550 IX86_BUILTIN_MULPS,
26551 IX86_BUILTIN_MULSS,
26552 IX86_BUILTIN_SUBPS,
26553 IX86_BUILTIN_SUBSS,
26554
26555 IX86_BUILTIN_CMPEQPS,
26556 IX86_BUILTIN_CMPLTPS,
26557 IX86_BUILTIN_CMPLEPS,
26558 IX86_BUILTIN_CMPGTPS,
26559 IX86_BUILTIN_CMPGEPS,
26560 IX86_BUILTIN_CMPNEQPS,
26561 IX86_BUILTIN_CMPNLTPS,
26562 IX86_BUILTIN_CMPNLEPS,
26563 IX86_BUILTIN_CMPNGTPS,
26564 IX86_BUILTIN_CMPNGEPS,
26565 IX86_BUILTIN_CMPORDPS,
26566 IX86_BUILTIN_CMPUNORDPS,
26567 IX86_BUILTIN_CMPEQSS,
26568 IX86_BUILTIN_CMPLTSS,
26569 IX86_BUILTIN_CMPLESS,
26570 IX86_BUILTIN_CMPNEQSS,
26571 IX86_BUILTIN_CMPNLTSS,
26572 IX86_BUILTIN_CMPNLESS,
26573 IX86_BUILTIN_CMPORDSS,
26574 IX86_BUILTIN_CMPUNORDSS,
26575
26576 IX86_BUILTIN_COMIEQSS,
26577 IX86_BUILTIN_COMILTSS,
26578 IX86_BUILTIN_COMILESS,
26579 IX86_BUILTIN_COMIGTSS,
26580 IX86_BUILTIN_COMIGESS,
26581 IX86_BUILTIN_COMINEQSS,
26582 IX86_BUILTIN_UCOMIEQSS,
26583 IX86_BUILTIN_UCOMILTSS,
26584 IX86_BUILTIN_UCOMILESS,
26585 IX86_BUILTIN_UCOMIGTSS,
26586 IX86_BUILTIN_UCOMIGESS,
26587 IX86_BUILTIN_UCOMINEQSS,
26588
26589 IX86_BUILTIN_CVTPI2PS,
26590 IX86_BUILTIN_CVTPS2PI,
26591 IX86_BUILTIN_CVTSI2SS,
26592 IX86_BUILTIN_CVTSI642SS,
26593 IX86_BUILTIN_CVTSS2SI,
26594 IX86_BUILTIN_CVTSS2SI64,
26595 IX86_BUILTIN_CVTTPS2PI,
26596 IX86_BUILTIN_CVTTSS2SI,
26597 IX86_BUILTIN_CVTTSS2SI64,
26598
26599 IX86_BUILTIN_MAXPS,
26600 IX86_BUILTIN_MAXSS,
26601 IX86_BUILTIN_MINPS,
26602 IX86_BUILTIN_MINSS,
26603
26604 IX86_BUILTIN_LOADUPS,
26605 IX86_BUILTIN_STOREUPS,
26606 IX86_BUILTIN_MOVSS,
26607
26608 IX86_BUILTIN_MOVHLPS,
26609 IX86_BUILTIN_MOVLHPS,
26610 IX86_BUILTIN_LOADHPS,
26611 IX86_BUILTIN_LOADLPS,
26612 IX86_BUILTIN_STOREHPS,
26613 IX86_BUILTIN_STORELPS,
26614
26615 IX86_BUILTIN_MASKMOVQ,
26616 IX86_BUILTIN_MOVMSKPS,
26617 IX86_BUILTIN_PMOVMSKB,
26618
26619 IX86_BUILTIN_MOVNTPS,
26620 IX86_BUILTIN_MOVNTQ,
26621
26622 IX86_BUILTIN_LOADDQU,
26623 IX86_BUILTIN_STOREDQU,
26624
26625 IX86_BUILTIN_PACKSSWB,
26626 IX86_BUILTIN_PACKSSDW,
26627 IX86_BUILTIN_PACKUSWB,
26628
26629 IX86_BUILTIN_PADDB,
26630 IX86_BUILTIN_PADDW,
26631 IX86_BUILTIN_PADDD,
26632 IX86_BUILTIN_PADDQ,
26633 IX86_BUILTIN_PADDSB,
26634 IX86_BUILTIN_PADDSW,
26635 IX86_BUILTIN_PADDUSB,
26636 IX86_BUILTIN_PADDUSW,
26637 IX86_BUILTIN_PSUBB,
26638 IX86_BUILTIN_PSUBW,
26639 IX86_BUILTIN_PSUBD,
26640 IX86_BUILTIN_PSUBQ,
26641 IX86_BUILTIN_PSUBSB,
26642 IX86_BUILTIN_PSUBSW,
26643 IX86_BUILTIN_PSUBUSB,
26644 IX86_BUILTIN_PSUBUSW,
26645
26646 IX86_BUILTIN_PAND,
26647 IX86_BUILTIN_PANDN,
26648 IX86_BUILTIN_POR,
26649 IX86_BUILTIN_PXOR,
26650
26651 IX86_BUILTIN_PAVGB,
26652 IX86_BUILTIN_PAVGW,
26653
26654 IX86_BUILTIN_PCMPEQB,
26655 IX86_BUILTIN_PCMPEQW,
26656 IX86_BUILTIN_PCMPEQD,
26657 IX86_BUILTIN_PCMPGTB,
26658 IX86_BUILTIN_PCMPGTW,
26659 IX86_BUILTIN_PCMPGTD,
26660
26661 IX86_BUILTIN_PMADDWD,
26662
26663 IX86_BUILTIN_PMAXSW,
26664 IX86_BUILTIN_PMAXUB,
26665 IX86_BUILTIN_PMINSW,
26666 IX86_BUILTIN_PMINUB,
26667
26668 IX86_BUILTIN_PMULHUW,
26669 IX86_BUILTIN_PMULHW,
26670 IX86_BUILTIN_PMULLW,
26671
26672 IX86_BUILTIN_PSADBW,
26673 IX86_BUILTIN_PSHUFW,
26674
26675 IX86_BUILTIN_PSLLW,
26676 IX86_BUILTIN_PSLLD,
26677 IX86_BUILTIN_PSLLQ,
26678 IX86_BUILTIN_PSRAW,
26679 IX86_BUILTIN_PSRAD,
26680 IX86_BUILTIN_PSRLW,
26681 IX86_BUILTIN_PSRLD,
26682 IX86_BUILTIN_PSRLQ,
26683 IX86_BUILTIN_PSLLWI,
26684 IX86_BUILTIN_PSLLDI,
26685 IX86_BUILTIN_PSLLQI,
26686 IX86_BUILTIN_PSRAWI,
26687 IX86_BUILTIN_PSRADI,
26688 IX86_BUILTIN_PSRLWI,
26689 IX86_BUILTIN_PSRLDI,
26690 IX86_BUILTIN_PSRLQI,
26691
26692 IX86_BUILTIN_PUNPCKHBW,
26693 IX86_BUILTIN_PUNPCKHWD,
26694 IX86_BUILTIN_PUNPCKHDQ,
26695 IX86_BUILTIN_PUNPCKLBW,
26696 IX86_BUILTIN_PUNPCKLWD,
26697 IX86_BUILTIN_PUNPCKLDQ,
26698
26699 IX86_BUILTIN_SHUFPS,
26700
26701 IX86_BUILTIN_RCPPS,
26702 IX86_BUILTIN_RCPSS,
26703 IX86_BUILTIN_RSQRTPS,
26704 IX86_BUILTIN_RSQRTPS_NR,
26705 IX86_BUILTIN_RSQRTSS,
26706 IX86_BUILTIN_RSQRTF,
26707 IX86_BUILTIN_SQRTPS,
26708 IX86_BUILTIN_SQRTPS_NR,
26709 IX86_BUILTIN_SQRTSS,
26710
26711 IX86_BUILTIN_UNPCKHPS,
26712 IX86_BUILTIN_UNPCKLPS,
26713
26714 IX86_BUILTIN_ANDPS,
26715 IX86_BUILTIN_ANDNPS,
26716 IX86_BUILTIN_ORPS,
26717 IX86_BUILTIN_XORPS,
26718
26719 IX86_BUILTIN_EMMS,
26720 IX86_BUILTIN_LDMXCSR,
26721 IX86_BUILTIN_STMXCSR,
26722 IX86_BUILTIN_SFENCE,
26723
26724 IX86_BUILTIN_FXSAVE,
26725 IX86_BUILTIN_FXRSTOR,
26726 IX86_BUILTIN_FXSAVE64,
26727 IX86_BUILTIN_FXRSTOR64,
26728
26729 IX86_BUILTIN_XSAVE,
26730 IX86_BUILTIN_XRSTOR,
26731 IX86_BUILTIN_XSAVE64,
26732 IX86_BUILTIN_XRSTOR64,
26733
26734 IX86_BUILTIN_XSAVEOPT,
26735 IX86_BUILTIN_XSAVEOPT64,
26736
26737 /* 3DNow! Original */
26738 IX86_BUILTIN_FEMMS,
26739 IX86_BUILTIN_PAVGUSB,
26740 IX86_BUILTIN_PF2ID,
26741 IX86_BUILTIN_PFACC,
26742 IX86_BUILTIN_PFADD,
26743 IX86_BUILTIN_PFCMPEQ,
26744 IX86_BUILTIN_PFCMPGE,
26745 IX86_BUILTIN_PFCMPGT,
26746 IX86_BUILTIN_PFMAX,
26747 IX86_BUILTIN_PFMIN,
26748 IX86_BUILTIN_PFMUL,
26749 IX86_BUILTIN_PFRCP,
26750 IX86_BUILTIN_PFRCPIT1,
26751 IX86_BUILTIN_PFRCPIT2,
26752 IX86_BUILTIN_PFRSQIT1,
26753 IX86_BUILTIN_PFRSQRT,
26754 IX86_BUILTIN_PFSUB,
26755 IX86_BUILTIN_PFSUBR,
26756 IX86_BUILTIN_PI2FD,
26757 IX86_BUILTIN_PMULHRW,
26758
26759 /* 3DNow! Athlon Extensions */
26760 IX86_BUILTIN_PF2IW,
26761 IX86_BUILTIN_PFNACC,
26762 IX86_BUILTIN_PFPNACC,
26763 IX86_BUILTIN_PI2FW,
26764 IX86_BUILTIN_PSWAPDSI,
26765 IX86_BUILTIN_PSWAPDSF,
26766
26767 /* SSE2 */
26768 IX86_BUILTIN_ADDPD,
26769 IX86_BUILTIN_ADDSD,
26770 IX86_BUILTIN_DIVPD,
26771 IX86_BUILTIN_DIVSD,
26772 IX86_BUILTIN_MULPD,
26773 IX86_BUILTIN_MULSD,
26774 IX86_BUILTIN_SUBPD,
26775 IX86_BUILTIN_SUBSD,
26776
26777 IX86_BUILTIN_CMPEQPD,
26778 IX86_BUILTIN_CMPLTPD,
26779 IX86_BUILTIN_CMPLEPD,
26780 IX86_BUILTIN_CMPGTPD,
26781 IX86_BUILTIN_CMPGEPD,
26782 IX86_BUILTIN_CMPNEQPD,
26783 IX86_BUILTIN_CMPNLTPD,
26784 IX86_BUILTIN_CMPNLEPD,
26785 IX86_BUILTIN_CMPNGTPD,
26786 IX86_BUILTIN_CMPNGEPD,
26787 IX86_BUILTIN_CMPORDPD,
26788 IX86_BUILTIN_CMPUNORDPD,
26789 IX86_BUILTIN_CMPEQSD,
26790 IX86_BUILTIN_CMPLTSD,
26791 IX86_BUILTIN_CMPLESD,
26792 IX86_BUILTIN_CMPNEQSD,
26793 IX86_BUILTIN_CMPNLTSD,
26794 IX86_BUILTIN_CMPNLESD,
26795 IX86_BUILTIN_CMPORDSD,
26796 IX86_BUILTIN_CMPUNORDSD,
26797
26798 IX86_BUILTIN_COMIEQSD,
26799 IX86_BUILTIN_COMILTSD,
26800 IX86_BUILTIN_COMILESD,
26801 IX86_BUILTIN_COMIGTSD,
26802 IX86_BUILTIN_COMIGESD,
26803 IX86_BUILTIN_COMINEQSD,
26804 IX86_BUILTIN_UCOMIEQSD,
26805 IX86_BUILTIN_UCOMILTSD,
26806 IX86_BUILTIN_UCOMILESD,
26807 IX86_BUILTIN_UCOMIGTSD,
26808 IX86_BUILTIN_UCOMIGESD,
26809 IX86_BUILTIN_UCOMINEQSD,
26810
26811 IX86_BUILTIN_MAXPD,
26812 IX86_BUILTIN_MAXSD,
26813 IX86_BUILTIN_MINPD,
26814 IX86_BUILTIN_MINSD,
26815
26816 IX86_BUILTIN_ANDPD,
26817 IX86_BUILTIN_ANDNPD,
26818 IX86_BUILTIN_ORPD,
26819 IX86_BUILTIN_XORPD,
26820
26821 IX86_BUILTIN_SQRTPD,
26822 IX86_BUILTIN_SQRTSD,
26823
26824 IX86_BUILTIN_UNPCKHPD,
26825 IX86_BUILTIN_UNPCKLPD,
26826
26827 IX86_BUILTIN_SHUFPD,
26828
26829 IX86_BUILTIN_LOADUPD,
26830 IX86_BUILTIN_STOREUPD,
26831 IX86_BUILTIN_MOVSD,
26832
26833 IX86_BUILTIN_LOADHPD,
26834 IX86_BUILTIN_LOADLPD,
26835
26836 IX86_BUILTIN_CVTDQ2PD,
26837 IX86_BUILTIN_CVTDQ2PS,
26838
26839 IX86_BUILTIN_CVTPD2DQ,
26840 IX86_BUILTIN_CVTPD2PI,
26841 IX86_BUILTIN_CVTPD2PS,
26842 IX86_BUILTIN_CVTTPD2DQ,
26843 IX86_BUILTIN_CVTTPD2PI,
26844
26845 IX86_BUILTIN_CVTPI2PD,
26846 IX86_BUILTIN_CVTSI2SD,
26847 IX86_BUILTIN_CVTSI642SD,
26848
26849 IX86_BUILTIN_CVTSD2SI,
26850 IX86_BUILTIN_CVTSD2SI64,
26851 IX86_BUILTIN_CVTSD2SS,
26852 IX86_BUILTIN_CVTSS2SD,
26853 IX86_BUILTIN_CVTTSD2SI,
26854 IX86_BUILTIN_CVTTSD2SI64,
26855
26856 IX86_BUILTIN_CVTPS2DQ,
26857 IX86_BUILTIN_CVTPS2PD,
26858 IX86_BUILTIN_CVTTPS2DQ,
26859
26860 IX86_BUILTIN_MOVNTI,
26861 IX86_BUILTIN_MOVNTI64,
26862 IX86_BUILTIN_MOVNTPD,
26863 IX86_BUILTIN_MOVNTDQ,
26864
26865 IX86_BUILTIN_MOVQ128,
26866
26867 /* SSE2 MMX */
26868 IX86_BUILTIN_MASKMOVDQU,
26869 IX86_BUILTIN_MOVMSKPD,
26870 IX86_BUILTIN_PMOVMSKB128,
26871
26872 IX86_BUILTIN_PACKSSWB128,
26873 IX86_BUILTIN_PACKSSDW128,
26874 IX86_BUILTIN_PACKUSWB128,
26875
26876 IX86_BUILTIN_PADDB128,
26877 IX86_BUILTIN_PADDW128,
26878 IX86_BUILTIN_PADDD128,
26879 IX86_BUILTIN_PADDQ128,
26880 IX86_BUILTIN_PADDSB128,
26881 IX86_BUILTIN_PADDSW128,
26882 IX86_BUILTIN_PADDUSB128,
26883 IX86_BUILTIN_PADDUSW128,
26884 IX86_BUILTIN_PSUBB128,
26885 IX86_BUILTIN_PSUBW128,
26886 IX86_BUILTIN_PSUBD128,
26887 IX86_BUILTIN_PSUBQ128,
26888 IX86_BUILTIN_PSUBSB128,
26889 IX86_BUILTIN_PSUBSW128,
26890 IX86_BUILTIN_PSUBUSB128,
26891 IX86_BUILTIN_PSUBUSW128,
26892
26893 IX86_BUILTIN_PAND128,
26894 IX86_BUILTIN_PANDN128,
26895 IX86_BUILTIN_POR128,
26896 IX86_BUILTIN_PXOR128,
26897
26898 IX86_BUILTIN_PAVGB128,
26899 IX86_BUILTIN_PAVGW128,
26900
26901 IX86_BUILTIN_PCMPEQB128,
26902 IX86_BUILTIN_PCMPEQW128,
26903 IX86_BUILTIN_PCMPEQD128,
26904 IX86_BUILTIN_PCMPGTB128,
26905 IX86_BUILTIN_PCMPGTW128,
26906 IX86_BUILTIN_PCMPGTD128,
26907
26908 IX86_BUILTIN_PMADDWD128,
26909
26910 IX86_BUILTIN_PMAXSW128,
26911 IX86_BUILTIN_PMAXUB128,
26912 IX86_BUILTIN_PMINSW128,
26913 IX86_BUILTIN_PMINUB128,
26914
26915 IX86_BUILTIN_PMULUDQ,
26916 IX86_BUILTIN_PMULUDQ128,
26917 IX86_BUILTIN_PMULHUW128,
26918 IX86_BUILTIN_PMULHW128,
26919 IX86_BUILTIN_PMULLW128,
26920
26921 IX86_BUILTIN_PSADBW128,
26922 IX86_BUILTIN_PSHUFHW,
26923 IX86_BUILTIN_PSHUFLW,
26924 IX86_BUILTIN_PSHUFD,
26925
26926 IX86_BUILTIN_PSLLDQI128,
26927 IX86_BUILTIN_PSLLWI128,
26928 IX86_BUILTIN_PSLLDI128,
26929 IX86_BUILTIN_PSLLQI128,
26930 IX86_BUILTIN_PSRAWI128,
26931 IX86_BUILTIN_PSRADI128,
26932 IX86_BUILTIN_PSRLDQI128,
26933 IX86_BUILTIN_PSRLWI128,
26934 IX86_BUILTIN_PSRLDI128,
26935 IX86_BUILTIN_PSRLQI128,
26936
26937 IX86_BUILTIN_PSLLDQ128,
26938 IX86_BUILTIN_PSLLW128,
26939 IX86_BUILTIN_PSLLD128,
26940 IX86_BUILTIN_PSLLQ128,
26941 IX86_BUILTIN_PSRAW128,
26942 IX86_BUILTIN_PSRAD128,
26943 IX86_BUILTIN_PSRLW128,
26944 IX86_BUILTIN_PSRLD128,
26945 IX86_BUILTIN_PSRLQ128,
26946
26947 IX86_BUILTIN_PUNPCKHBW128,
26948 IX86_BUILTIN_PUNPCKHWD128,
26949 IX86_BUILTIN_PUNPCKHDQ128,
26950 IX86_BUILTIN_PUNPCKHQDQ128,
26951 IX86_BUILTIN_PUNPCKLBW128,
26952 IX86_BUILTIN_PUNPCKLWD128,
26953 IX86_BUILTIN_PUNPCKLDQ128,
26954 IX86_BUILTIN_PUNPCKLQDQ128,
26955
26956 IX86_BUILTIN_CLFLUSH,
26957 IX86_BUILTIN_MFENCE,
26958 IX86_BUILTIN_LFENCE,
26959 IX86_BUILTIN_PAUSE,
26960
26961 IX86_BUILTIN_BSRSI,
26962 IX86_BUILTIN_BSRDI,
26963 IX86_BUILTIN_RDPMC,
26964 IX86_BUILTIN_RDTSC,
26965 IX86_BUILTIN_RDTSCP,
26966 IX86_BUILTIN_ROLQI,
26967 IX86_BUILTIN_ROLHI,
26968 IX86_BUILTIN_RORQI,
26969 IX86_BUILTIN_RORHI,
26970
26971 /* SSE3. */
26972 IX86_BUILTIN_ADDSUBPS,
26973 IX86_BUILTIN_HADDPS,
26974 IX86_BUILTIN_HSUBPS,
26975 IX86_BUILTIN_MOVSHDUP,
26976 IX86_BUILTIN_MOVSLDUP,
26977 IX86_BUILTIN_ADDSUBPD,
26978 IX86_BUILTIN_HADDPD,
26979 IX86_BUILTIN_HSUBPD,
26980 IX86_BUILTIN_LDDQU,
26981
26982 IX86_BUILTIN_MONITOR,
26983 IX86_BUILTIN_MWAIT,
26984
26985 /* SSSE3. */
26986 IX86_BUILTIN_PHADDW,
26987 IX86_BUILTIN_PHADDD,
26988 IX86_BUILTIN_PHADDSW,
26989 IX86_BUILTIN_PHSUBW,
26990 IX86_BUILTIN_PHSUBD,
26991 IX86_BUILTIN_PHSUBSW,
26992 IX86_BUILTIN_PMADDUBSW,
26993 IX86_BUILTIN_PMULHRSW,
26994 IX86_BUILTIN_PSHUFB,
26995 IX86_BUILTIN_PSIGNB,
26996 IX86_BUILTIN_PSIGNW,
26997 IX86_BUILTIN_PSIGND,
26998 IX86_BUILTIN_PALIGNR,
26999 IX86_BUILTIN_PABSB,
27000 IX86_BUILTIN_PABSW,
27001 IX86_BUILTIN_PABSD,
27002
27003 IX86_BUILTIN_PHADDW128,
27004 IX86_BUILTIN_PHADDD128,
27005 IX86_BUILTIN_PHADDSW128,
27006 IX86_BUILTIN_PHSUBW128,
27007 IX86_BUILTIN_PHSUBD128,
27008 IX86_BUILTIN_PHSUBSW128,
27009 IX86_BUILTIN_PMADDUBSW128,
27010 IX86_BUILTIN_PMULHRSW128,
27011 IX86_BUILTIN_PSHUFB128,
27012 IX86_BUILTIN_PSIGNB128,
27013 IX86_BUILTIN_PSIGNW128,
27014 IX86_BUILTIN_PSIGND128,
27015 IX86_BUILTIN_PALIGNR128,
27016 IX86_BUILTIN_PABSB128,
27017 IX86_BUILTIN_PABSW128,
27018 IX86_BUILTIN_PABSD128,
27019
27020 /* AMDFAM10 - SSE4A New Instructions. */
27021 IX86_BUILTIN_MOVNTSD,
27022 IX86_BUILTIN_MOVNTSS,
27023 IX86_BUILTIN_EXTRQI,
27024 IX86_BUILTIN_EXTRQ,
27025 IX86_BUILTIN_INSERTQI,
27026 IX86_BUILTIN_INSERTQ,
27027
27028 /* SSE4.1. */
27029 IX86_BUILTIN_BLENDPD,
27030 IX86_BUILTIN_BLENDPS,
27031 IX86_BUILTIN_BLENDVPD,
27032 IX86_BUILTIN_BLENDVPS,
27033 IX86_BUILTIN_PBLENDVB128,
27034 IX86_BUILTIN_PBLENDW128,
27035
27036 IX86_BUILTIN_DPPD,
27037 IX86_BUILTIN_DPPS,
27038
27039 IX86_BUILTIN_INSERTPS128,
27040
27041 IX86_BUILTIN_MOVNTDQA,
27042 IX86_BUILTIN_MPSADBW128,
27043 IX86_BUILTIN_PACKUSDW128,
27044 IX86_BUILTIN_PCMPEQQ,
27045 IX86_BUILTIN_PHMINPOSUW128,
27046
27047 IX86_BUILTIN_PMAXSB128,
27048 IX86_BUILTIN_PMAXSD128,
27049 IX86_BUILTIN_PMAXUD128,
27050 IX86_BUILTIN_PMAXUW128,
27051
27052 IX86_BUILTIN_PMINSB128,
27053 IX86_BUILTIN_PMINSD128,
27054 IX86_BUILTIN_PMINUD128,
27055 IX86_BUILTIN_PMINUW128,
27056
27057 IX86_BUILTIN_PMOVSXBW128,
27058 IX86_BUILTIN_PMOVSXBD128,
27059 IX86_BUILTIN_PMOVSXBQ128,
27060 IX86_BUILTIN_PMOVSXWD128,
27061 IX86_BUILTIN_PMOVSXWQ128,
27062 IX86_BUILTIN_PMOVSXDQ128,
27063
27064 IX86_BUILTIN_PMOVZXBW128,
27065 IX86_BUILTIN_PMOVZXBD128,
27066 IX86_BUILTIN_PMOVZXBQ128,
27067 IX86_BUILTIN_PMOVZXWD128,
27068 IX86_BUILTIN_PMOVZXWQ128,
27069 IX86_BUILTIN_PMOVZXDQ128,
27070
27071 IX86_BUILTIN_PMULDQ128,
27072 IX86_BUILTIN_PMULLD128,
27073
27074 IX86_BUILTIN_ROUNDSD,
27075 IX86_BUILTIN_ROUNDSS,
27076
27077 IX86_BUILTIN_ROUNDPD,
27078 IX86_BUILTIN_ROUNDPS,
27079
27080 IX86_BUILTIN_FLOORPD,
27081 IX86_BUILTIN_CEILPD,
27082 IX86_BUILTIN_TRUNCPD,
27083 IX86_BUILTIN_RINTPD,
27084 IX86_BUILTIN_ROUNDPD_AZ,
27085
27086 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
27087 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
27088 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
27089
27090 IX86_BUILTIN_FLOORPS,
27091 IX86_BUILTIN_CEILPS,
27092 IX86_BUILTIN_TRUNCPS,
27093 IX86_BUILTIN_RINTPS,
27094 IX86_BUILTIN_ROUNDPS_AZ,
27095
27096 IX86_BUILTIN_FLOORPS_SFIX,
27097 IX86_BUILTIN_CEILPS_SFIX,
27098 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
27099
27100 IX86_BUILTIN_PTESTZ,
27101 IX86_BUILTIN_PTESTC,
27102 IX86_BUILTIN_PTESTNZC,
27103
27104 IX86_BUILTIN_VEC_INIT_V2SI,
27105 IX86_BUILTIN_VEC_INIT_V4HI,
27106 IX86_BUILTIN_VEC_INIT_V8QI,
27107 IX86_BUILTIN_VEC_EXT_V2DF,
27108 IX86_BUILTIN_VEC_EXT_V2DI,
27109 IX86_BUILTIN_VEC_EXT_V4SF,
27110 IX86_BUILTIN_VEC_EXT_V4SI,
27111 IX86_BUILTIN_VEC_EXT_V8HI,
27112 IX86_BUILTIN_VEC_EXT_V2SI,
27113 IX86_BUILTIN_VEC_EXT_V4HI,
27114 IX86_BUILTIN_VEC_EXT_V16QI,
27115 IX86_BUILTIN_VEC_SET_V2DI,
27116 IX86_BUILTIN_VEC_SET_V4SF,
27117 IX86_BUILTIN_VEC_SET_V4SI,
27118 IX86_BUILTIN_VEC_SET_V8HI,
27119 IX86_BUILTIN_VEC_SET_V4HI,
27120 IX86_BUILTIN_VEC_SET_V16QI,
27121
27122 IX86_BUILTIN_VEC_PACK_SFIX,
27123 IX86_BUILTIN_VEC_PACK_SFIX256,
27124
27125 /* SSE4.2. */
27126 IX86_BUILTIN_CRC32QI,
27127 IX86_BUILTIN_CRC32HI,
27128 IX86_BUILTIN_CRC32SI,
27129 IX86_BUILTIN_CRC32DI,
27130
27131 IX86_BUILTIN_PCMPESTRI128,
27132 IX86_BUILTIN_PCMPESTRM128,
27133 IX86_BUILTIN_PCMPESTRA128,
27134 IX86_BUILTIN_PCMPESTRC128,
27135 IX86_BUILTIN_PCMPESTRO128,
27136 IX86_BUILTIN_PCMPESTRS128,
27137 IX86_BUILTIN_PCMPESTRZ128,
27138 IX86_BUILTIN_PCMPISTRI128,
27139 IX86_BUILTIN_PCMPISTRM128,
27140 IX86_BUILTIN_PCMPISTRA128,
27141 IX86_BUILTIN_PCMPISTRC128,
27142 IX86_BUILTIN_PCMPISTRO128,
27143 IX86_BUILTIN_PCMPISTRS128,
27144 IX86_BUILTIN_PCMPISTRZ128,
27145
27146 IX86_BUILTIN_PCMPGTQ,
27147
27148 /* AES instructions */
27149 IX86_BUILTIN_AESENC128,
27150 IX86_BUILTIN_AESENCLAST128,
27151 IX86_BUILTIN_AESDEC128,
27152 IX86_BUILTIN_AESDECLAST128,
27153 IX86_BUILTIN_AESIMC128,
27154 IX86_BUILTIN_AESKEYGENASSIST128,
27155
27156 /* PCLMUL instruction */
27157 IX86_BUILTIN_PCLMULQDQ128,
27158
27159 /* AVX */
27160 IX86_BUILTIN_ADDPD256,
27161 IX86_BUILTIN_ADDPS256,
27162 IX86_BUILTIN_ADDSUBPD256,
27163 IX86_BUILTIN_ADDSUBPS256,
27164 IX86_BUILTIN_ANDPD256,
27165 IX86_BUILTIN_ANDPS256,
27166 IX86_BUILTIN_ANDNPD256,
27167 IX86_BUILTIN_ANDNPS256,
27168 IX86_BUILTIN_BLENDPD256,
27169 IX86_BUILTIN_BLENDPS256,
27170 IX86_BUILTIN_BLENDVPD256,
27171 IX86_BUILTIN_BLENDVPS256,
27172 IX86_BUILTIN_DIVPD256,
27173 IX86_BUILTIN_DIVPS256,
27174 IX86_BUILTIN_DPPS256,
27175 IX86_BUILTIN_HADDPD256,
27176 IX86_BUILTIN_HADDPS256,
27177 IX86_BUILTIN_HSUBPD256,
27178 IX86_BUILTIN_HSUBPS256,
27179 IX86_BUILTIN_MAXPD256,
27180 IX86_BUILTIN_MAXPS256,
27181 IX86_BUILTIN_MINPD256,
27182 IX86_BUILTIN_MINPS256,
27183 IX86_BUILTIN_MULPD256,
27184 IX86_BUILTIN_MULPS256,
27185 IX86_BUILTIN_ORPD256,
27186 IX86_BUILTIN_ORPS256,
27187 IX86_BUILTIN_SHUFPD256,
27188 IX86_BUILTIN_SHUFPS256,
27189 IX86_BUILTIN_SUBPD256,
27190 IX86_BUILTIN_SUBPS256,
27191 IX86_BUILTIN_XORPD256,
27192 IX86_BUILTIN_XORPS256,
27193 IX86_BUILTIN_CMPSD,
27194 IX86_BUILTIN_CMPSS,
27195 IX86_BUILTIN_CMPPD,
27196 IX86_BUILTIN_CMPPS,
27197 IX86_BUILTIN_CMPPD256,
27198 IX86_BUILTIN_CMPPS256,
27199 IX86_BUILTIN_CVTDQ2PD256,
27200 IX86_BUILTIN_CVTDQ2PS256,
27201 IX86_BUILTIN_CVTPD2PS256,
27202 IX86_BUILTIN_CVTPS2DQ256,
27203 IX86_BUILTIN_CVTPS2PD256,
27204 IX86_BUILTIN_CVTTPD2DQ256,
27205 IX86_BUILTIN_CVTPD2DQ256,
27206 IX86_BUILTIN_CVTTPS2DQ256,
27207 IX86_BUILTIN_EXTRACTF128PD256,
27208 IX86_BUILTIN_EXTRACTF128PS256,
27209 IX86_BUILTIN_EXTRACTF128SI256,
27210 IX86_BUILTIN_VZEROALL,
27211 IX86_BUILTIN_VZEROUPPER,
27212 IX86_BUILTIN_VPERMILVARPD,
27213 IX86_BUILTIN_VPERMILVARPS,
27214 IX86_BUILTIN_VPERMILVARPD256,
27215 IX86_BUILTIN_VPERMILVARPS256,
27216 IX86_BUILTIN_VPERMILPD,
27217 IX86_BUILTIN_VPERMILPS,
27218 IX86_BUILTIN_VPERMILPD256,
27219 IX86_BUILTIN_VPERMILPS256,
27220 IX86_BUILTIN_VPERMIL2PD,
27221 IX86_BUILTIN_VPERMIL2PS,
27222 IX86_BUILTIN_VPERMIL2PD256,
27223 IX86_BUILTIN_VPERMIL2PS256,
27224 IX86_BUILTIN_VPERM2F128PD256,
27225 IX86_BUILTIN_VPERM2F128PS256,
27226 IX86_BUILTIN_VPERM2F128SI256,
27227 IX86_BUILTIN_VBROADCASTSS,
27228 IX86_BUILTIN_VBROADCASTSD256,
27229 IX86_BUILTIN_VBROADCASTSS256,
27230 IX86_BUILTIN_VBROADCASTPD256,
27231 IX86_BUILTIN_VBROADCASTPS256,
27232 IX86_BUILTIN_VINSERTF128PD256,
27233 IX86_BUILTIN_VINSERTF128PS256,
27234 IX86_BUILTIN_VINSERTF128SI256,
27235 IX86_BUILTIN_LOADUPD256,
27236 IX86_BUILTIN_LOADUPS256,
27237 IX86_BUILTIN_STOREUPD256,
27238 IX86_BUILTIN_STOREUPS256,
27239 IX86_BUILTIN_LDDQU256,
27240 IX86_BUILTIN_MOVNTDQ256,
27241 IX86_BUILTIN_MOVNTPD256,
27242 IX86_BUILTIN_MOVNTPS256,
27243 IX86_BUILTIN_LOADDQU256,
27244 IX86_BUILTIN_STOREDQU256,
27245 IX86_BUILTIN_MASKLOADPD,
27246 IX86_BUILTIN_MASKLOADPS,
27247 IX86_BUILTIN_MASKSTOREPD,
27248 IX86_BUILTIN_MASKSTOREPS,
27249 IX86_BUILTIN_MASKLOADPD256,
27250 IX86_BUILTIN_MASKLOADPS256,
27251 IX86_BUILTIN_MASKSTOREPD256,
27252 IX86_BUILTIN_MASKSTOREPS256,
27253 IX86_BUILTIN_MOVSHDUP256,
27254 IX86_BUILTIN_MOVSLDUP256,
27255 IX86_BUILTIN_MOVDDUP256,
27256
27257 IX86_BUILTIN_SQRTPD256,
27258 IX86_BUILTIN_SQRTPS256,
27259 IX86_BUILTIN_SQRTPS_NR256,
27260 IX86_BUILTIN_RSQRTPS256,
27261 IX86_BUILTIN_RSQRTPS_NR256,
27262
27263 IX86_BUILTIN_RCPPS256,
27264
27265 IX86_BUILTIN_ROUNDPD256,
27266 IX86_BUILTIN_ROUNDPS256,
27267
27268 IX86_BUILTIN_FLOORPD256,
27269 IX86_BUILTIN_CEILPD256,
27270 IX86_BUILTIN_TRUNCPD256,
27271 IX86_BUILTIN_RINTPD256,
27272 IX86_BUILTIN_ROUNDPD_AZ256,
27273
27274 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
27275 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
27276 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
27277
27278 IX86_BUILTIN_FLOORPS256,
27279 IX86_BUILTIN_CEILPS256,
27280 IX86_BUILTIN_TRUNCPS256,
27281 IX86_BUILTIN_RINTPS256,
27282 IX86_BUILTIN_ROUNDPS_AZ256,
27283
27284 IX86_BUILTIN_FLOORPS_SFIX256,
27285 IX86_BUILTIN_CEILPS_SFIX256,
27286 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
27287
27288 IX86_BUILTIN_UNPCKHPD256,
27289 IX86_BUILTIN_UNPCKLPD256,
27290 IX86_BUILTIN_UNPCKHPS256,
27291 IX86_BUILTIN_UNPCKLPS256,
27292
27293 IX86_BUILTIN_SI256_SI,
27294 IX86_BUILTIN_PS256_PS,
27295 IX86_BUILTIN_PD256_PD,
27296 IX86_BUILTIN_SI_SI256,
27297 IX86_BUILTIN_PS_PS256,
27298 IX86_BUILTIN_PD_PD256,
27299
27300 IX86_BUILTIN_VTESTZPD,
27301 IX86_BUILTIN_VTESTCPD,
27302 IX86_BUILTIN_VTESTNZCPD,
27303 IX86_BUILTIN_VTESTZPS,
27304 IX86_BUILTIN_VTESTCPS,
27305 IX86_BUILTIN_VTESTNZCPS,
27306 IX86_BUILTIN_VTESTZPD256,
27307 IX86_BUILTIN_VTESTCPD256,
27308 IX86_BUILTIN_VTESTNZCPD256,
27309 IX86_BUILTIN_VTESTZPS256,
27310 IX86_BUILTIN_VTESTCPS256,
27311 IX86_BUILTIN_VTESTNZCPS256,
27312 IX86_BUILTIN_PTESTZ256,
27313 IX86_BUILTIN_PTESTC256,
27314 IX86_BUILTIN_PTESTNZC256,
27315
27316 IX86_BUILTIN_MOVMSKPD256,
27317 IX86_BUILTIN_MOVMSKPS256,
27318
27319 /* AVX2 */
27320 IX86_BUILTIN_MPSADBW256,
27321 IX86_BUILTIN_PABSB256,
27322 IX86_BUILTIN_PABSW256,
27323 IX86_BUILTIN_PABSD256,
27324 IX86_BUILTIN_PACKSSDW256,
27325 IX86_BUILTIN_PACKSSWB256,
27326 IX86_BUILTIN_PACKUSDW256,
27327 IX86_BUILTIN_PACKUSWB256,
27328 IX86_BUILTIN_PADDB256,
27329 IX86_BUILTIN_PADDW256,
27330 IX86_BUILTIN_PADDD256,
27331 IX86_BUILTIN_PADDQ256,
27332 IX86_BUILTIN_PADDSB256,
27333 IX86_BUILTIN_PADDSW256,
27334 IX86_BUILTIN_PADDUSB256,
27335 IX86_BUILTIN_PADDUSW256,
27336 IX86_BUILTIN_PALIGNR256,
27337 IX86_BUILTIN_AND256I,
27338 IX86_BUILTIN_ANDNOT256I,
27339 IX86_BUILTIN_PAVGB256,
27340 IX86_BUILTIN_PAVGW256,
27341 IX86_BUILTIN_PBLENDVB256,
27342 IX86_BUILTIN_PBLENDVW256,
27343 IX86_BUILTIN_PCMPEQB256,
27344 IX86_BUILTIN_PCMPEQW256,
27345 IX86_BUILTIN_PCMPEQD256,
27346 IX86_BUILTIN_PCMPEQQ256,
27347 IX86_BUILTIN_PCMPGTB256,
27348 IX86_BUILTIN_PCMPGTW256,
27349 IX86_BUILTIN_PCMPGTD256,
27350 IX86_BUILTIN_PCMPGTQ256,
27351 IX86_BUILTIN_PHADDW256,
27352 IX86_BUILTIN_PHADDD256,
27353 IX86_BUILTIN_PHADDSW256,
27354 IX86_BUILTIN_PHSUBW256,
27355 IX86_BUILTIN_PHSUBD256,
27356 IX86_BUILTIN_PHSUBSW256,
27357 IX86_BUILTIN_PMADDUBSW256,
27358 IX86_BUILTIN_PMADDWD256,
27359 IX86_BUILTIN_PMAXSB256,
27360 IX86_BUILTIN_PMAXSW256,
27361 IX86_BUILTIN_PMAXSD256,
27362 IX86_BUILTIN_PMAXUB256,
27363 IX86_BUILTIN_PMAXUW256,
27364 IX86_BUILTIN_PMAXUD256,
27365 IX86_BUILTIN_PMINSB256,
27366 IX86_BUILTIN_PMINSW256,
27367 IX86_BUILTIN_PMINSD256,
27368 IX86_BUILTIN_PMINUB256,
27369 IX86_BUILTIN_PMINUW256,
27370 IX86_BUILTIN_PMINUD256,
27371 IX86_BUILTIN_PMOVMSKB256,
27372 IX86_BUILTIN_PMOVSXBW256,
27373 IX86_BUILTIN_PMOVSXBD256,
27374 IX86_BUILTIN_PMOVSXBQ256,
27375 IX86_BUILTIN_PMOVSXWD256,
27376 IX86_BUILTIN_PMOVSXWQ256,
27377 IX86_BUILTIN_PMOVSXDQ256,
27378 IX86_BUILTIN_PMOVZXBW256,
27379 IX86_BUILTIN_PMOVZXBD256,
27380 IX86_BUILTIN_PMOVZXBQ256,
27381 IX86_BUILTIN_PMOVZXWD256,
27382 IX86_BUILTIN_PMOVZXWQ256,
27383 IX86_BUILTIN_PMOVZXDQ256,
27384 IX86_BUILTIN_PMULDQ256,
27385 IX86_BUILTIN_PMULHRSW256,
27386 IX86_BUILTIN_PMULHUW256,
27387 IX86_BUILTIN_PMULHW256,
27388 IX86_BUILTIN_PMULLW256,
27389 IX86_BUILTIN_PMULLD256,
27390 IX86_BUILTIN_PMULUDQ256,
27391 IX86_BUILTIN_POR256,
27392 IX86_BUILTIN_PSADBW256,
27393 IX86_BUILTIN_PSHUFB256,
27394 IX86_BUILTIN_PSHUFD256,
27395 IX86_BUILTIN_PSHUFHW256,
27396 IX86_BUILTIN_PSHUFLW256,
27397 IX86_BUILTIN_PSIGNB256,
27398 IX86_BUILTIN_PSIGNW256,
27399 IX86_BUILTIN_PSIGND256,
27400 IX86_BUILTIN_PSLLDQI256,
27401 IX86_BUILTIN_PSLLWI256,
27402 IX86_BUILTIN_PSLLW256,
27403 IX86_BUILTIN_PSLLDI256,
27404 IX86_BUILTIN_PSLLD256,
27405 IX86_BUILTIN_PSLLQI256,
27406 IX86_BUILTIN_PSLLQ256,
27407 IX86_BUILTIN_PSRAWI256,
27408 IX86_BUILTIN_PSRAW256,
27409 IX86_BUILTIN_PSRADI256,
27410 IX86_BUILTIN_PSRAD256,
27411 IX86_BUILTIN_PSRLDQI256,
27412 IX86_BUILTIN_PSRLWI256,
27413 IX86_BUILTIN_PSRLW256,
27414 IX86_BUILTIN_PSRLDI256,
27415 IX86_BUILTIN_PSRLD256,
27416 IX86_BUILTIN_PSRLQI256,
27417 IX86_BUILTIN_PSRLQ256,
27418 IX86_BUILTIN_PSUBB256,
27419 IX86_BUILTIN_PSUBW256,
27420 IX86_BUILTIN_PSUBD256,
27421 IX86_BUILTIN_PSUBQ256,
27422 IX86_BUILTIN_PSUBSB256,
27423 IX86_BUILTIN_PSUBSW256,
27424 IX86_BUILTIN_PSUBUSB256,
27425 IX86_BUILTIN_PSUBUSW256,
27426 IX86_BUILTIN_PUNPCKHBW256,
27427 IX86_BUILTIN_PUNPCKHWD256,
27428 IX86_BUILTIN_PUNPCKHDQ256,
27429 IX86_BUILTIN_PUNPCKHQDQ256,
27430 IX86_BUILTIN_PUNPCKLBW256,
27431 IX86_BUILTIN_PUNPCKLWD256,
27432 IX86_BUILTIN_PUNPCKLDQ256,
27433 IX86_BUILTIN_PUNPCKLQDQ256,
27434 IX86_BUILTIN_PXOR256,
27435 IX86_BUILTIN_MOVNTDQA256,
27436 IX86_BUILTIN_VBROADCASTSS_PS,
27437 IX86_BUILTIN_VBROADCASTSS_PS256,
27438 IX86_BUILTIN_VBROADCASTSD_PD256,
27439 IX86_BUILTIN_VBROADCASTSI256,
27440 IX86_BUILTIN_PBLENDD256,
27441 IX86_BUILTIN_PBLENDD128,
27442 IX86_BUILTIN_PBROADCASTB256,
27443 IX86_BUILTIN_PBROADCASTW256,
27444 IX86_BUILTIN_PBROADCASTD256,
27445 IX86_BUILTIN_PBROADCASTQ256,
27446 IX86_BUILTIN_PBROADCASTB128,
27447 IX86_BUILTIN_PBROADCASTW128,
27448 IX86_BUILTIN_PBROADCASTD128,
27449 IX86_BUILTIN_PBROADCASTQ128,
27450 IX86_BUILTIN_VPERMVARSI256,
27451 IX86_BUILTIN_VPERMDF256,
27452 IX86_BUILTIN_VPERMVARSF256,
27453 IX86_BUILTIN_VPERMDI256,
27454 IX86_BUILTIN_VPERMTI256,
27455 IX86_BUILTIN_VEXTRACT128I256,
27456 IX86_BUILTIN_VINSERT128I256,
27457 IX86_BUILTIN_MASKLOADD,
27458 IX86_BUILTIN_MASKLOADQ,
27459 IX86_BUILTIN_MASKLOADD256,
27460 IX86_BUILTIN_MASKLOADQ256,
27461 IX86_BUILTIN_MASKSTORED,
27462 IX86_BUILTIN_MASKSTOREQ,
27463 IX86_BUILTIN_MASKSTORED256,
27464 IX86_BUILTIN_MASKSTOREQ256,
27465 IX86_BUILTIN_PSLLVV4DI,
27466 IX86_BUILTIN_PSLLVV2DI,
27467 IX86_BUILTIN_PSLLVV8SI,
27468 IX86_BUILTIN_PSLLVV4SI,
27469 IX86_BUILTIN_PSRAVV8SI,
27470 IX86_BUILTIN_PSRAVV4SI,
27471 IX86_BUILTIN_PSRLVV4DI,
27472 IX86_BUILTIN_PSRLVV2DI,
27473 IX86_BUILTIN_PSRLVV8SI,
27474 IX86_BUILTIN_PSRLVV4SI,
27475
27476 IX86_BUILTIN_GATHERSIV2DF,
27477 IX86_BUILTIN_GATHERSIV4DF,
27478 IX86_BUILTIN_GATHERDIV2DF,
27479 IX86_BUILTIN_GATHERDIV4DF,
27480 IX86_BUILTIN_GATHERSIV4SF,
27481 IX86_BUILTIN_GATHERSIV8SF,
27482 IX86_BUILTIN_GATHERDIV4SF,
27483 IX86_BUILTIN_GATHERDIV8SF,
27484 IX86_BUILTIN_GATHERSIV2DI,
27485 IX86_BUILTIN_GATHERSIV4DI,
27486 IX86_BUILTIN_GATHERDIV2DI,
27487 IX86_BUILTIN_GATHERDIV4DI,
27488 IX86_BUILTIN_GATHERSIV4SI,
27489 IX86_BUILTIN_GATHERSIV8SI,
27490 IX86_BUILTIN_GATHERDIV4SI,
27491 IX86_BUILTIN_GATHERDIV8SI,
27492
27493 /* Alternate 4 element gather for the vectorizer where
27494 all operands are 32-byte wide. */
27495 IX86_BUILTIN_GATHERALTSIV4DF,
27496 IX86_BUILTIN_GATHERALTDIV8SF,
27497 IX86_BUILTIN_GATHERALTSIV4DI,
27498 IX86_BUILTIN_GATHERALTDIV8SI,
27499
27500 /* TFmode support builtins. */
27501 IX86_BUILTIN_INFQ,
27502 IX86_BUILTIN_HUGE_VALQ,
27503 IX86_BUILTIN_FABSQ,
27504 IX86_BUILTIN_COPYSIGNQ,
27505
27506 /* Vectorizer support builtins. */
27507 IX86_BUILTIN_CPYSGNPS,
27508 IX86_BUILTIN_CPYSGNPD,
27509 IX86_BUILTIN_CPYSGNPS256,
27510 IX86_BUILTIN_CPYSGNPD256,
27511
27512 /* FMA4 instructions. */
27513 IX86_BUILTIN_VFMADDSS,
27514 IX86_BUILTIN_VFMADDSD,
27515 IX86_BUILTIN_VFMADDPS,
27516 IX86_BUILTIN_VFMADDPD,
27517 IX86_BUILTIN_VFMADDPS256,
27518 IX86_BUILTIN_VFMADDPD256,
27519 IX86_BUILTIN_VFMADDSUBPS,
27520 IX86_BUILTIN_VFMADDSUBPD,
27521 IX86_BUILTIN_VFMADDSUBPS256,
27522 IX86_BUILTIN_VFMADDSUBPD256,
27523
27524 /* FMA3 instructions. */
27525 IX86_BUILTIN_VFMADDSS3,
27526 IX86_BUILTIN_VFMADDSD3,
27527
27528 /* XOP instructions. */
27529 IX86_BUILTIN_VPCMOV,
27530 IX86_BUILTIN_VPCMOV_V2DI,
27531 IX86_BUILTIN_VPCMOV_V4SI,
27532 IX86_BUILTIN_VPCMOV_V8HI,
27533 IX86_BUILTIN_VPCMOV_V16QI,
27534 IX86_BUILTIN_VPCMOV_V4SF,
27535 IX86_BUILTIN_VPCMOV_V2DF,
27536 IX86_BUILTIN_VPCMOV256,
27537 IX86_BUILTIN_VPCMOV_V4DI256,
27538 IX86_BUILTIN_VPCMOV_V8SI256,
27539 IX86_BUILTIN_VPCMOV_V16HI256,
27540 IX86_BUILTIN_VPCMOV_V32QI256,
27541 IX86_BUILTIN_VPCMOV_V8SF256,
27542 IX86_BUILTIN_VPCMOV_V4DF256,
27543
27544 IX86_BUILTIN_VPPERM,
27545
27546 IX86_BUILTIN_VPMACSSWW,
27547 IX86_BUILTIN_VPMACSWW,
27548 IX86_BUILTIN_VPMACSSWD,
27549 IX86_BUILTIN_VPMACSWD,
27550 IX86_BUILTIN_VPMACSSDD,
27551 IX86_BUILTIN_VPMACSDD,
27552 IX86_BUILTIN_VPMACSSDQL,
27553 IX86_BUILTIN_VPMACSSDQH,
27554 IX86_BUILTIN_VPMACSDQL,
27555 IX86_BUILTIN_VPMACSDQH,
27556 IX86_BUILTIN_VPMADCSSWD,
27557 IX86_BUILTIN_VPMADCSWD,
27558
27559 IX86_BUILTIN_VPHADDBW,
27560 IX86_BUILTIN_VPHADDBD,
27561 IX86_BUILTIN_VPHADDBQ,
27562 IX86_BUILTIN_VPHADDWD,
27563 IX86_BUILTIN_VPHADDWQ,
27564 IX86_BUILTIN_VPHADDDQ,
27565 IX86_BUILTIN_VPHADDUBW,
27566 IX86_BUILTIN_VPHADDUBD,
27567 IX86_BUILTIN_VPHADDUBQ,
27568 IX86_BUILTIN_VPHADDUWD,
27569 IX86_BUILTIN_VPHADDUWQ,
27570 IX86_BUILTIN_VPHADDUDQ,
27571 IX86_BUILTIN_VPHSUBBW,
27572 IX86_BUILTIN_VPHSUBWD,
27573 IX86_BUILTIN_VPHSUBDQ,
27574
27575 IX86_BUILTIN_VPROTB,
27576 IX86_BUILTIN_VPROTW,
27577 IX86_BUILTIN_VPROTD,
27578 IX86_BUILTIN_VPROTQ,
27579 IX86_BUILTIN_VPROTB_IMM,
27580 IX86_BUILTIN_VPROTW_IMM,
27581 IX86_BUILTIN_VPROTD_IMM,
27582 IX86_BUILTIN_VPROTQ_IMM,
27583
27584 IX86_BUILTIN_VPSHLB,
27585 IX86_BUILTIN_VPSHLW,
27586 IX86_BUILTIN_VPSHLD,
27587 IX86_BUILTIN_VPSHLQ,
27588 IX86_BUILTIN_VPSHAB,
27589 IX86_BUILTIN_VPSHAW,
27590 IX86_BUILTIN_VPSHAD,
27591 IX86_BUILTIN_VPSHAQ,
27592
27593 IX86_BUILTIN_VFRCZSS,
27594 IX86_BUILTIN_VFRCZSD,
27595 IX86_BUILTIN_VFRCZPS,
27596 IX86_BUILTIN_VFRCZPD,
27597 IX86_BUILTIN_VFRCZPS256,
27598 IX86_BUILTIN_VFRCZPD256,
27599
27600 IX86_BUILTIN_VPCOMEQUB,
27601 IX86_BUILTIN_VPCOMNEUB,
27602 IX86_BUILTIN_VPCOMLTUB,
27603 IX86_BUILTIN_VPCOMLEUB,
27604 IX86_BUILTIN_VPCOMGTUB,
27605 IX86_BUILTIN_VPCOMGEUB,
27606 IX86_BUILTIN_VPCOMFALSEUB,
27607 IX86_BUILTIN_VPCOMTRUEUB,
27608
27609 IX86_BUILTIN_VPCOMEQUW,
27610 IX86_BUILTIN_VPCOMNEUW,
27611 IX86_BUILTIN_VPCOMLTUW,
27612 IX86_BUILTIN_VPCOMLEUW,
27613 IX86_BUILTIN_VPCOMGTUW,
27614 IX86_BUILTIN_VPCOMGEUW,
27615 IX86_BUILTIN_VPCOMFALSEUW,
27616 IX86_BUILTIN_VPCOMTRUEUW,
27617
27618 IX86_BUILTIN_VPCOMEQUD,
27619 IX86_BUILTIN_VPCOMNEUD,
27620 IX86_BUILTIN_VPCOMLTUD,
27621 IX86_BUILTIN_VPCOMLEUD,
27622 IX86_BUILTIN_VPCOMGTUD,
27623 IX86_BUILTIN_VPCOMGEUD,
27624 IX86_BUILTIN_VPCOMFALSEUD,
27625 IX86_BUILTIN_VPCOMTRUEUD,
27626
27627 IX86_BUILTIN_VPCOMEQUQ,
27628 IX86_BUILTIN_VPCOMNEUQ,
27629 IX86_BUILTIN_VPCOMLTUQ,
27630 IX86_BUILTIN_VPCOMLEUQ,
27631 IX86_BUILTIN_VPCOMGTUQ,
27632 IX86_BUILTIN_VPCOMGEUQ,
27633 IX86_BUILTIN_VPCOMFALSEUQ,
27634 IX86_BUILTIN_VPCOMTRUEUQ,
27635
27636 IX86_BUILTIN_VPCOMEQB,
27637 IX86_BUILTIN_VPCOMNEB,
27638 IX86_BUILTIN_VPCOMLTB,
27639 IX86_BUILTIN_VPCOMLEB,
27640 IX86_BUILTIN_VPCOMGTB,
27641 IX86_BUILTIN_VPCOMGEB,
27642 IX86_BUILTIN_VPCOMFALSEB,
27643 IX86_BUILTIN_VPCOMTRUEB,
27644
27645 IX86_BUILTIN_VPCOMEQW,
27646 IX86_BUILTIN_VPCOMNEW,
27647 IX86_BUILTIN_VPCOMLTW,
27648 IX86_BUILTIN_VPCOMLEW,
27649 IX86_BUILTIN_VPCOMGTW,
27650 IX86_BUILTIN_VPCOMGEW,
27651 IX86_BUILTIN_VPCOMFALSEW,
27652 IX86_BUILTIN_VPCOMTRUEW,
27653
27654 IX86_BUILTIN_VPCOMEQD,
27655 IX86_BUILTIN_VPCOMNED,
27656 IX86_BUILTIN_VPCOMLTD,
27657 IX86_BUILTIN_VPCOMLED,
27658 IX86_BUILTIN_VPCOMGTD,
27659 IX86_BUILTIN_VPCOMGED,
27660 IX86_BUILTIN_VPCOMFALSED,
27661 IX86_BUILTIN_VPCOMTRUED,
27662
27663 IX86_BUILTIN_VPCOMEQQ,
27664 IX86_BUILTIN_VPCOMNEQ,
27665 IX86_BUILTIN_VPCOMLTQ,
27666 IX86_BUILTIN_VPCOMLEQ,
27667 IX86_BUILTIN_VPCOMGTQ,
27668 IX86_BUILTIN_VPCOMGEQ,
27669 IX86_BUILTIN_VPCOMFALSEQ,
27670 IX86_BUILTIN_VPCOMTRUEQ,
27671
27672 /* LWP instructions. */
27673 IX86_BUILTIN_LLWPCB,
27674 IX86_BUILTIN_SLWPCB,
27675 IX86_BUILTIN_LWPVAL32,
27676 IX86_BUILTIN_LWPVAL64,
27677 IX86_BUILTIN_LWPINS32,
27678 IX86_BUILTIN_LWPINS64,
27679
27680 IX86_BUILTIN_CLZS,
27681
27682 /* RTM */
27683 IX86_BUILTIN_XBEGIN,
27684 IX86_BUILTIN_XEND,
27685 IX86_BUILTIN_XABORT,
27686 IX86_BUILTIN_XTEST,
27687
27688 /* BMI instructions. */
27689 IX86_BUILTIN_BEXTR32,
27690 IX86_BUILTIN_BEXTR64,
27691 IX86_BUILTIN_CTZS,
27692
27693 /* TBM instructions. */
27694 IX86_BUILTIN_BEXTRI32,
27695 IX86_BUILTIN_BEXTRI64,
27696
27697 /* BMI2 instructions. */
27698 IX86_BUILTIN_BZHI32,
27699 IX86_BUILTIN_BZHI64,
27700 IX86_BUILTIN_PDEP32,
27701 IX86_BUILTIN_PDEP64,
27702 IX86_BUILTIN_PEXT32,
27703 IX86_BUILTIN_PEXT64,
27704
27705 /* ADX instructions. */
27706 IX86_BUILTIN_ADDCARRYX32,
27707 IX86_BUILTIN_ADDCARRYX64,
27708
27709 /* FSGSBASE instructions. */
27710 IX86_BUILTIN_RDFSBASE32,
27711 IX86_BUILTIN_RDFSBASE64,
27712 IX86_BUILTIN_RDGSBASE32,
27713 IX86_BUILTIN_RDGSBASE64,
27714 IX86_BUILTIN_WRFSBASE32,
27715 IX86_BUILTIN_WRFSBASE64,
27716 IX86_BUILTIN_WRGSBASE32,
27717 IX86_BUILTIN_WRGSBASE64,
27718
27719 /* RDRND instructions. */
27720 IX86_BUILTIN_RDRAND16_STEP,
27721 IX86_BUILTIN_RDRAND32_STEP,
27722 IX86_BUILTIN_RDRAND64_STEP,
27723
27724 /* RDSEED instructions. */
27725 IX86_BUILTIN_RDSEED16_STEP,
27726 IX86_BUILTIN_RDSEED32_STEP,
27727 IX86_BUILTIN_RDSEED64_STEP,
27728
27729 /* F16C instructions. */
27730 IX86_BUILTIN_CVTPH2PS,
27731 IX86_BUILTIN_CVTPH2PS256,
27732 IX86_BUILTIN_CVTPS2PH,
27733 IX86_BUILTIN_CVTPS2PH256,
27734
27735 /* CFString built-in for darwin */
27736 IX86_BUILTIN_CFSTRING,
27737
27738 /* Builtins to get CPU type and supported features; see the usage sketch just after this enum. */
27739 IX86_BUILTIN_CPU_INIT,
27740 IX86_BUILTIN_CPU_IS,
27741 IX86_BUILTIN_CPU_SUPPORTS,
27742
27743 IX86_BUILTIN_MAX
27744 };
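/* A minimal usage sketch (illustrative only, not part of this file's
   logic): IX86_BUILTIN_CPU_INIT, IX86_BUILTIN_CPU_IS and
   IX86_BUILTIN_CPU_SUPPORTS back the documented __builtin_cpu_init,
   __builtin_cpu_is and __builtin_cpu_supports builtins, which user code
   can call along the lines of

       __builtin_cpu_init ();
       if (__builtin_cpu_supports ("avx2"))
         do_avx2_version ();
       else
         do_generic_version ();

   where do_avx2_version and do_generic_version stand for hypothetical
   user functions selecting a code path at run time.  */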
27745
27746 /* Table for the ix86 builtin decls. */
27747 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
27748
27749 /* Table of all the builtin functions that are possible with different ISAs
27750 but are waiting to be built until a function is declared to use that
27751 ISA. */
27752 struct builtin_isa {
27753 const char *name; /* function name */
27754 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
27755 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
27756 bool const_p; /* true if the declaration is constant */
27757 bool set_and_not_built_p;
27758 };
27759
27760 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
27761
27762
27763 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
27764 of which isa_flags to use in the ix86_builtins_isa array. Store the
27765 function decl in the ix86_builtins array. Return the function decl,
27766 or NULL_TREE if the builtin was not added.
27767
27768 If the front end has a special hook for builtin functions, delay adding
27769 builtin functions that aren't in the current ISA until the ISA is changed
27770 with function specific optimization. Doing so can save about 300K for the
27771 default compiler. When the builtin is expanded, check at that time whether
27772 it is valid.
27773
27774 If the front end doesn't have a special hook, record all builtins, even if
27775 they aren't in the current ISA, in case the user uses function specific
27776 options for a different ISA, so that we don't get scope errors if a builtin
27777 is added in the middle of a function scope. */
27778
27779 static inline tree
27780 def_builtin (HOST_WIDE_INT mask, const char *name,
27781 enum ix86_builtin_func_type tcode,
27782 enum ix86_builtins code)
27783 {
27784 tree decl = NULL_TREE;
27785
27786 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
27787 {
27788 ix86_builtins_isa[(int) code].isa = mask;
27789
27790 mask &= ~OPTION_MASK_ISA_64BIT;
27791 if (mask == 0
27792 || (mask & ix86_isa_flags) != 0
27793 || (lang_hooks.builtin_function
27794 == lang_hooks.builtin_function_ext_scope))
27796 {
27797 tree type = ix86_get_builtin_func_type (tcode);
27798 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
27799 NULL, NULL_TREE);
27800 ix86_builtins[(int) code] = decl;
27801 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
27802 }
27803 else
27804 {
27805 ix86_builtins[(int) code] = NULL_TREE;
27806 ix86_builtins_isa[(int) code].tcode = tcode;
27807 ix86_builtins_isa[(int) code].name = name;
27808 ix86_builtins_isa[(int) code].const_p = false;
27809 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
27810 }
27811 }
27812
27813 return decl;
27814 }
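/* A rough sketch of how the description tables further down are fed
   through def_builtin (illustrative; the actual registration loops live
   in the builtin-initialization code and may differ in detail):

       const struct builtin_description *d;
       size_t i;

       for (i = 0, d = bdesc_special_args;
            i < ARRAY_SIZE (bdesc_special_args);
            i++, d++)
         {
           enum ix86_builtin_func_type ftype;

           if (d->name == 0)
             continue;

           ftype = (enum ix86_builtin_func_type) d->flag;
           def_builtin (d->mask, d->name, ftype, d->code);
         }

   Builtins whose ISA is not currently enabled are merely recorded in
   ix86_builtins_isa and materialized later by ix86_add_new_builtins.  */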
27815
27816 /* Like def_builtin, but also marks the function decl "const". */
27817
27818 static inline tree
27819 def_builtin_const (HOST_WIDE_INT mask, const char *name,
27820 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
27821 {
27822 tree decl = def_builtin (mask, name, tcode, code);
27823 if (decl)
27824 TREE_READONLY (decl) = 1;
27825 else
27826 ix86_builtins_isa[(int) code].const_p = true;
27827
27828 return decl;
27829 }
27830
27831 /* Add any new builtin functions for a given ISA that may not have been
27832 declared yet. This saves a bit of space compared to adding all of the
27833 declarations to the tree up front, even when they end up unused. */
27834
27835 static void
27836 ix86_add_new_builtins (HOST_WIDE_INT isa)
27837 {
27838 int i;
27839
27840 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
27841 {
27842 if ((ix86_builtins_isa[i].isa & isa) != 0
27843 && ix86_builtins_isa[i].set_and_not_built_p)
27844 {
27845 tree decl, type;
27846
27847 /* Don't define the builtin again. */
27848 ix86_builtins_isa[i].set_and_not_built_p = false;
27849
27850 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
27851 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
27852 type, i, BUILT_IN_MD, NULL,
27853 NULL_TREE);
27854
27855 ix86_builtins[i] = decl;
27856 if (ix86_builtins_isa[i].const_p)
27857 TREE_READONLY (decl) = 1;
27858 }
27859 }
27860 }
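/* Usage sketch (illustrative, not the exact call site): once the ISA
   flags for a function have been switched, e.g. by
   __attribute__((target ("avx2"))), the option-handling code can make
   the deferred declarations visible with

       ix86_add_new_builtins (ix86_isa_flags);

   after which the corresponding ix86_builtins[] slots hold real decls
   that the builtin expanders can use.  */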
27861
27862 /* Bits for builtin_description.flag. */
27863
27864 /* Set when we don't support the comparison natively, and should
27865 swap the comparison operands in order to support it. */
27866 #define BUILTIN_DESC_SWAP_OPERANDS 1
27867
27868 struct builtin_description
27869 {
27870 const HOST_WIDE_INT mask;
27871 const enum insn_code icode;
27872 const char *const name;
27873 const enum ix86_builtins code;
27874 const enum rtx_code comparison;
27875 const int flag;
27876 };
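/* Reading an entry (fields in order: MASK, ICODE, NAME, CODE,
   COMPARISON, FLAG); for example the first bdesc_comi entry below,

       { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq",
         IX86_BUILTIN_COMIEQSS, UNEQ, 0 },

   describes an SSE-only builtin expanded from the sse_comi insn pattern
   with an UNEQ comparison.  FLAG either holds bits such as
   BUILTIN_DESC_SWAP_OPERANDS or, as in the pcmpestr/pcmpistr and
   argument tables below, other per-builtin data (a CC mode or an
   ix86_builtin_func_type) cast to int.  */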
27877
27878 static const struct builtin_description bdesc_comi[] =
27879 {
27880 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
27881 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
27882 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
27883 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
27884 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
27885 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
27886 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
27887 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
27888 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
27889 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
27890 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
27891 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
27892 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
27893 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
27894 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
27895 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
27896 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
27897 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
27898 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
27899 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
27900 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
27901 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
27902 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
27903 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
27904 };
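/* Illustrative mapping (assuming the usual xmmintrin.h wrappers): an
   entry such as __builtin_ia32_comilt above is what the corresponding
   user-level intrinsic expands to, roughly

       int lt = __builtin_ia32_comilt ((__v4sf) a, (__v4sf) b);

   for __m128 values a and b; the expander then uses the ICODE and
   COMPARISON fields to emit the comiss/ucomiss pattern and test the
   resulting flags.  */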
27905
27906 static const struct builtin_description bdesc_pcmpestr[] =
27907 {
27908 /* SSE4.2 */
27909 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
27910 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
27911 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
27912 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
27913 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
27914 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
27915 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
27916 };
27917
27918 static const struct builtin_description bdesc_pcmpistr[] =
27919 {
27920 /* SSE4.2 */
27921 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
27922 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
27923 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
27924 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
27925 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
27926 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
27927 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
27928 };
27929
27930 /* Special builtins with variable number of arguments. */
27931 static const struct builtin_description bdesc_special_args[] =
27932 {
27933 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
27934 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
27935 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
27936
27937 /* MMX */
27938 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27939
27940 /* 3DNow! */
27941 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
27942
27943 /* FXSR, XSAVE and XSAVEOPT */
27944 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
27945 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
27946 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27947 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27948 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27949
27950 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27951 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
27952 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27953 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27954 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
27955
27956 /* SSE */
27957 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27958 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27959 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
27960
27961 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27962 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
27963 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27964 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
27965
27966 /* SSE or 3DNow!A */
27967 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27968 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
27969
27970 /* SSE2 */
27971 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27972 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
27973 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27974 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedquv16qi, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
27975 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27976 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
27977 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
27978 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
27979 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
27980 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddquv16qi, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27981
27982 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27983 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
27984
27985 /* SSE3 */
27986 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
27987
27988 /* SSE4.1 */
27989 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
27990
27991 /* SSE4A */
27992 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
27993 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
27994
27995 /* AVX */
27996 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
27997 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
27998
27999 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
28000 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28001 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28002 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
28003 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
28004
28005 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
28006 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
28007 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28008 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28009 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddquv32qi, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28010 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedquv32qi, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
28011 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
28012
28013 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
28014 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
28015 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
28016
28017 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
28018 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
28019 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
28020 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
28021 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
28022 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
28023 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
28024 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
28025
28026 /* AVX2 */
28027 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
28028 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
28029 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
28030 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
28031 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
28032 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
28033 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
28034 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
28035 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
28036
28037 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
28038 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
28039 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
28040 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
28041 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
28042 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
28043
28044 /* FSGSBASE */
28045 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28046 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28047 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28048 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
28049 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28050 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28051 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
28052 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
28053
28054 /* RTM */
28055 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
28056 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
28057 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
28058 };
28059
28060 /* Builtins with variable number of arguments. */
28061 static const struct builtin_description bdesc_args[] =
28062 {
28063 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
28064 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
28065 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
28066 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28067 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28068 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
28069 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
28070
28071 /* MMX */
28072 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28073 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28074 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28075 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28076 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28077 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28078
28079 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28080 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28081 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28082 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28083 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28084 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28085 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28086 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28087
28088 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28089 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28090
28091 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28092 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28093 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28094 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28095
28096 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28097 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28098 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28099 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28100 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28101 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28102
28103 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28104 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28105 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28106 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28107 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
28108 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
28109
28110 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28111 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
28112 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
28113
28114 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
28115
28116 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28117 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28118 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28119 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28120 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28121 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28122
28123 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28124 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28125 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
28126 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28127 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28128 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
28129
28130 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
28131 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
28132 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
28133 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
28134
28135 /* 3DNow! */
28136 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28137 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28138 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28139 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28140
28141 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28142 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28143 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28144 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28145 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28146 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
28147 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28148 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28149 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28150 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28151 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28152 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28153 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28154 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28155 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28156
28157 /* 3DNow!A */
28158 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
28159 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
28160 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28161 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
28162 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28163 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
28164
28165 /* SSE */
28166 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
28167 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28168 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28169 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28170 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28171 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28172 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28173 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28174 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28175 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
28176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
28177 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
28178
28179 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28180
28181 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28182 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28183 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28184 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28185 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28186 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28187 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28188 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
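  /* The arithmetic entries above back the corresponding <xmmintrin.h>
     intrinsics; e.g. _mm_add_ps is implemented there roughly as

	return (__m128) __builtin_ia32_addps ((__v4sf) __A, (__v4sf) __B);

     so the V4SF_FTYPE_V4SF_V4SF index is what gives that builtin its
     prototype.  (Sketch for orientation only; see xmmintrin.h for the
     exact wrappers.)  */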
28189
28190 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28191 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28192 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28193 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28194 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28195 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28196 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28197 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28198 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28199 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28200 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
28201 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28202 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
28203 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
28204 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
28205 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
28206 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
28207 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
28208 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
28209 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
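  /* Note on the comparison rows: there is no separate "greater" pattern, so
     cmpgtps/cmpgeps reuse the LT/LE mask compare with the _SWAP function
     types (the expander exchanges the operands), and the negated forms map
     to the unordered codes (UNGE/UNGT and so on) so that a NaN operand makes
     the predicate true, matching the CMPPS/CMPSS encodings.  */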
28210
28211 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28212 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28213 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28214 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28215
28216 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28217 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28218 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28219 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28220
28221 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28222
28223 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28224 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28225 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28226 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28227 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28228
28229 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
28230 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
28231 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
28232
28233 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
28234
28235 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28236 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28237 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
28238
28239 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
28240 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
28241
28242 /* SSE MMX or 3DNow!A */
28243 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28244 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28245 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28246
28247 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28248 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28249 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28250 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28251
28252 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
28253 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
28254
28255 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
28256
28257 /* SSE2 */
28258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28259
28260 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
28261 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
28262 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28263 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
28264 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
28265
28266 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28267 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28268 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
28269 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
28270 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
28271
28272 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
28273
28274 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28275 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
28276 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28277 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
28278
28279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_fix_notruncv4sfv4si, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
28281 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28282
28283 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28284 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28285 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28286 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28287 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28288 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28289 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28290 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28291
28292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28293 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28294 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28295 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28298 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
28303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
28305 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
28306 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
28307 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28308 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
28309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
28310 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
28311 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
28312
28313 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28314 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28315 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28316 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28317
28318 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28319 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28320 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28321 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28322
28323 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28324
28325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28326 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28327 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28328
28329 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28330
28331 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28332 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28333 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28334 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28335 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28336 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28337 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28338 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28339
28340 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28341 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28342 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28343 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28344 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28346 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28347 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28348
28349 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28350 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28351
28352 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28353 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28354 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28355 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28356
28357 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28359
28360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28361 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28362 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28363 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28364 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28365 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28366
28367 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28368 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28369 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28370 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28371
28372 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28373 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28374 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28375 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28376 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28377 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28378 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28379 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28380
28381 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28382 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28383 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
28384
28385 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28386 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
28387
28388 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
28389 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28390
28391 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
28392
28393 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
28394 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
28395 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
28396 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
28397
28398 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28399 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28400 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28401 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28402 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28403 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28404 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28405
28406 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
28407 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28408 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28409 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
28410 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28411 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
28412 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
28413
28414 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
28415 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
28416 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
28417 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
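  /* Shift-count conventions visible above: the ...wi128/...di128/...qi128
     rows (_SI_COUNT types) take the count as a plain integer, while the
     ...w128/...d128/...q128 rows (_V*_COUNT types) take it in a vector
     operand, mirroring the immediate and xmm-register forms of
     PSLLx/PSRLx/PSRAx.  The pslldqi128/psrldqi128 rows shift the whole
     128-bit value via the V1TI patterns (hence the _INT_CONVERT type).  */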
28418
28419 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
28420 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28421 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
28422
28423 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
28424
28425 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28426
28427 /* SSE2 MMX */
28428 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28429 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
28430
28431 /* SSE3 */
28432 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28433 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28434
28435 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28436 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28437 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28438 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28439 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
28440 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
28441
28442 /* SSSE3 */
28443 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28444 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
28445 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28446 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
28447 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28448 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
28449
28450 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28451 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28452 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28453 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28454 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28455 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28456 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28457 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28458 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28459 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28460 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28461 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28462 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
28463 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
28464 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28465 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28466 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28467 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28468 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28469 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
28470 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28471 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
28472 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28473 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
28474
28475 /* SSSE3. */
28476 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
28477 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
28478
28479 /* SSE4.1 */
28480 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28481 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28482 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
28483 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
28484 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28485 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28486 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28487 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
28488 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
28489 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
28490
28491 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28492 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28493 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28494 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28495 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28496 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28497 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
28498 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
28499 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
28500 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
28501 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
28502 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
28503 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
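  /* The pmovsx / pmovzx rows above take a full 128-bit source type
     (e.g. V8HI_FTYPE_V16QI) even though only the low elements of the source
     are widened; the remaining elements of the operand are simply
     ignored.  */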
28504
28505 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
28506 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28507 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28508 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28509 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28510 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28511 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
28512 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28513 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28514 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
28515 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
28516 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28517
28518 /* SSE4.1 */
28519 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28520 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28521 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28522 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28523
28524 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
28525 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
28526 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
28527 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
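  /* The floorpd/ceilpd/truncpd/rintpd rows reuse the single roundpd pattern;
     the ROUND_FLOOR/ROUND_CEIL/ROUND_TRUNC/ROUND_MXCSR value carried in the
     comparison slot becomes the rounding-mode immediate when the builtin is
     expanded (the _ROUND function types signal this).  The ps variants below
     use the same scheme.  */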
28528
28529 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28530 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
28531
28532 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
28533 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
28534
28535 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
28536 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
28537 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
28538 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
28539
28540 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
28541 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
28542
28543 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28544 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
28545
28546 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28547 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
28548 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
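  /* All three ptest builtins expand the same PTEST pattern; the rtx code in
     the comparison slot selects which condition the result tests: EQ for
     ptestz (ZF set), LTU for ptestc (CF set) and GTU for ptestnzc (both
     clear).  */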
28549
28550 /* SSE4.2 */
28551 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28552 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
28553 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
28554 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28555 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28556
28557 /* SSE4A */
28558 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
28559 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
28560 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
28561 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28562
28563 /* AES */
28564 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
28565 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28566
28567 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28568 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28569 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28570 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28571
28572 /* PCLMUL */
28573 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
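  /* The FABSQ/COPYSIGNQ, AES and PCLMUL rows above leave the name field 0:
     the table-driven registration loop skips unnamed entries, and the
     user-visible names for these builtins are declared separately elsewhere
     in this file.  */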
28574
28575 /* AVX */
28576 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28577 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28580 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28581 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28584 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28585 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28586 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28588 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28590 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28591 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28592 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28593 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28594 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28595 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28596 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28597 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28598 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28599 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28600 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28601 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28602
28603 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
28604 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
28605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
28606 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28607
28608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
28611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
28612 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28615 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28616 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28617 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
28618 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
28619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
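  /* Unlike the fixed-predicate SSE/SSE2 compare rows above, the AVX
     cmpsd/cmpss/cmppd/cmpps entries (and their 256-bit forms) take the
     comparison predicate as an 8-bit immediate, which is why their function
     types end in _INT; these back the _mm_cmp_ps/_mm_cmp_pd family of
     intrinsics.  */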
28621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
28622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
28623 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
28624 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
28625 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
28626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
28627 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_fix_notruncv8sfv8si, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28628 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
28629 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28630 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
28631 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
28633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
28634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
28636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
28637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
28640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
28641 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
28642
28643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28645 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28646
28647 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28648 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28649 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28650 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28651 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28652
28653 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28654
28655 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28656 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
28657
28658 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
28659 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
28660 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
28661 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
28662
28663 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
28664 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28665
28666 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28667 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
28668
28669 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
28670 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
28671 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
28672 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
28673
28674 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
28675 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
28676
28677 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
28678 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
28679
28680 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28681 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28682 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28683 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28684
28685 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28686 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28687 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28688 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
28689 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
28690 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
28691
28692 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28693 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28694 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
28695 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28696 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28697 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
28698 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28699 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28700 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
28701 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28702 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28703 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
28704 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28705 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28706 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
28707
28708 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
28709 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
28710
28711 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
28712 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
28713
28714 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
28715
28716 /* AVX2 */
28717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
28718 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
28719 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
28720 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
28721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28722 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28723 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
28724 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
28725 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28726 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28727 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28728 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
28734 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28737 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28738 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
28739 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
28740 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28741 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28742 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28743 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28744 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28745 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28746 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28748 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28749 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28750 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28751 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28752 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28753 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28754 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28755 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
28756 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28757 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28758 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28759 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28760 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28761 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28762 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28763 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28764 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28765 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28766 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28767 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
28769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28770 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
28776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
28777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
28778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
28779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
28780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
28781 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28783 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28784 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28785 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28786 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28787 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
28788 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
28790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
28792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
28794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28798 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28799 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28800 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28801 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28802 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28803 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28804 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28805 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28806 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28807 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28808 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
28809 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
28810 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
28811 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
28812 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
28813 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
28814 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
28815 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28816 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28817 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28818 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28819 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28820 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28821 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28822 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28823 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28824 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28825 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28826 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28827 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
28828 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
28829 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28830 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28831 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28832 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
28833 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
28834 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
28835 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28836 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
28837 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
28838 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
28839 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
28840 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
28841 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
28842 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
28843 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
28844 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
28845 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
28846 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28847 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
28848 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
28849 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
28850 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
28851 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
28852 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
28853 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28854 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28855 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28856 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28857 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28858 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28859 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
28860 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
28861 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
28862 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
28863
28864 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28865
28866 /* BMI */
28867 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28868 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28869 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
28870
28871 /* TBM */
28872 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28873 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28874
28875 /* F16C */
28876 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
28877 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
28878 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
28879 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
28880
28881 /* BMI2 */
28882 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28883 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28884 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28885 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28886 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
28887 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
28888 };
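
/* A minimal sketch of how rows in builtin tables like the one above are
   typically consumed: a registration loop hands each entry to a
   def_builtin-style helper.  The table identifier and the helper chosen
   here are assumptions for illustration, since neither is shown in this
   excerpt; roughly:

     for (i = 0, d = table; i < ARRAY_SIZE (table); i++, d++)
       if (d->name)
         def_builtin_const (d->mask, d->name,
                            (enum ix86_builtin_func_type) d->flag, d->code);
*/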
28889
28890 /* FMA4 and XOP. */
28891 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
28892 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
28893 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
28894 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
28895 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
28896 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
28897 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
28898 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
28899 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
28900 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
28901 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
28902 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
28903 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
28904 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
28905 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
28906 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
28907 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
28908 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
28909 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
28910 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
28911 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
28912 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
28913 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
28914 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
28915 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
28916 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
28917 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
28918 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
28919 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
28920 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
28921 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
28922 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
28923 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
28924 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
28925 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
28926 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
28927 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
28928 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
28929 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
28930 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
28931 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
28932 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
28933 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
28934 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
28935 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
28936 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
28937 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
28938 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
28939 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
28940 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
28941 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
28942 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
28943
28944 static const struct builtin_description bdesc_multi_arg[] =
28945 {
28946 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
28947 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
28948 UNKNOWN, (int)MULTI_ARG_3_SF },
28949 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
28950 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
28951 UNKNOWN, (int)MULTI_ARG_3_DF },
28952
28953 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
28954 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
28955 UNKNOWN, (int)MULTI_ARG_3_SF },
28956 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
28957 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
28958 UNKNOWN, (int)MULTI_ARG_3_DF },
28959
28960 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
28961 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
28962 UNKNOWN, (int)MULTI_ARG_3_SF },
28963 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
28964 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
28965 UNKNOWN, (int)MULTI_ARG_3_DF },
28966 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
28967 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
28968 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28969 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
28970 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
28971 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28972
28973 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
28974 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
28975 UNKNOWN, (int)MULTI_ARG_3_SF },
28976 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
28977 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
28978 UNKNOWN, (int)MULTI_ARG_3_DF },
28979 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
28980 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
28981 UNKNOWN, (int)MULTI_ARG_3_SF2 },
28982 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
28983 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
28984 UNKNOWN, (int)MULTI_ARG_3_DF2 },
28985
28986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
28987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
28988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
28989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
28990   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi,  "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
28991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
28992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
28993
28994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28995 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
28996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
28997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
28998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
28999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
29000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
29001
29002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
29003
29004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
29006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
29010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29011 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
29014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
29016
29017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
29019 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
29020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
29021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
29022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
29023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
29024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
29025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
29027 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
29028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
29029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
29030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
29031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
29032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
29033
29034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
29035 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
29036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
29037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
29038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
29039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
29040
29041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29043 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
29049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
29050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29051 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
29052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
29054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
29055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
29056
29057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
29058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29059 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
29060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
29061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
29062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
29063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
29064
29065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
29066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
29068 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
29069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
29070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
29071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
29072
29073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
29074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
29076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
29077 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
29078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
29079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
29080
29081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29082 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29083 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
29084 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
29085 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
29086 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
29087 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
29088
29089 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
29090 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29091 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
29092 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
29093 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
29094 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
29095 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
29096
29097 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
29098 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29099 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
29100 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
29101 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
29102 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
29103 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
29104
29105 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
29106 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29107 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
29108 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
29109 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
29110 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
29111 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
29112
29113 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
29114 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29115 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
29116 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
29117 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
29118 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
29119 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
29120
29121 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29122 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29123 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29124 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29125   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
29126   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3,  "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
29127   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,  "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
29128   { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,  "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
29129
29130 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29131 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29132 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29133 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29134 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
29135 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
29136 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
29137 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
29138
29139 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
29140 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
29141 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
29142 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
29143
29144 };
29145 \f
29146 /* TM vector builtins. */
29147
29148 /* Reuse the existing x86-specific `struct builtin_description' because
29149    we're lazy.  Add casts to make them fit.  */
29150 static const struct builtin_description bdesc_tm[] =
29151 {
29152 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29153 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29154 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
29155 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29156 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29157 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29158 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
29159
29160 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29161 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29162 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
29163 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29164 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29165 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29166 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
29167
29168 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29169 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29170 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
29171 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29172 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29173 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29174 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
29175
29176 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
29177 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
29178 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
29179 };
29180
29181 /* TM callbacks. */
29182
29183 /* Return the builtin decl needed to load a vector of TYPE. */
29184
29185 static tree
29186 ix86_builtin_tm_load (tree type)
29187 {
29188 if (TREE_CODE (type) == VECTOR_TYPE)
29189 {
29190 switch (tree_low_cst (TYPE_SIZE (type), 1))
29191 {
29192 case 64:
29193 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
29194 case 128:
29195 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
29196 case 256:
29197 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
29198 }
29199 }
29200 return NULL_TREE;
29201 }
29202
29203 /* Return the builtin decl needed to store a vector of TYPE. */
29204
29205 static tree
29206 ix86_builtin_tm_store (tree type)
29207 {
29208 if (TREE_CODE (type) == VECTOR_TYPE)
29209 {
29210 switch (tree_low_cst (TYPE_SIZE (type), 1))
29211 {
29212 case 64:
29213 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
29214 case 128:
29215 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
29216 case 256:
29217 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
29218 }
29219 }
29220 return NULL_TREE;
29221 }
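
/* For illustration: given a 128-bit vector type such as the one behind
   __m128, these two hooks return the decls registered above for
   __builtin__ITM_RM128 and __builtin__ITM_WM128, so the trans-mem pass
   can emit a single transactional vector load/store instead of several
   scalar ones.  */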
29222 \f
29223 /* Initialize the transactional memory vector load/store builtins. */
29224
29225 static void
29226 ix86_init_tm_builtins (void)
29227 {
29228 enum ix86_builtin_func_type ftype;
29229 const struct builtin_description *d;
29230 size_t i;
29231 tree decl;
29232 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
29233 tree attrs_log, attrs_type_log;
29234
29235 if (!flag_tm)
29236 return;
29237
29238 /* If there are no builtins defined, we must be compiling in a
29239 language without trans-mem support. */
29240 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
29241 return;
29242
29243 /* Use whatever attributes a normal TM load has. */
29244 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
29245 attrs_load = DECL_ATTRIBUTES (decl);
29246 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29247 /* Use whatever attributes a normal TM store has. */
29248 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
29249 attrs_store = DECL_ATTRIBUTES (decl);
29250 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29251 /* Use whatever attributes a normal TM log has. */
29252 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
29253 attrs_log = DECL_ATTRIBUTES (decl);
29254 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
29255
29256 for (i = 0, d = bdesc_tm;
29257 i < ARRAY_SIZE (bdesc_tm);
29258 i++, d++)
29259 {
29260 if ((d->mask & ix86_isa_flags) != 0
29261 || (lang_hooks.builtin_function
29262 == lang_hooks.builtin_function_ext_scope))
29263 {
29264 tree type, attrs, attrs_type;
29265 enum built_in_function code = (enum built_in_function) d->code;
29266
29267 ftype = (enum ix86_builtin_func_type) d->flag;
29268 type = ix86_get_builtin_func_type (ftype);
29269
29270 if (BUILTIN_TM_LOAD_P (code))
29271 {
29272 attrs = attrs_load;
29273 attrs_type = attrs_type_load;
29274 }
29275 else if (BUILTIN_TM_STORE_P (code))
29276 {
29277 attrs = attrs_store;
29278 attrs_type = attrs_type_store;
29279 }
29280 else
29281 {
29282 attrs = attrs_log;
29283 attrs_type = attrs_type_log;
29284 }
29285 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
29286 /* The builtin without the prefix for
29287 calling it directly. */
29288 d->name + strlen ("__builtin_"),
29289 attrs);
29290 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
29291 set the TYPE_ATTRIBUTES. */
29292 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
29293
29294 set_builtin_decl (code, decl, false);
29295 }
29296 }
29297 }
29298
29299 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
29300    not in the current target ISA, to allow the user to compile particular
29301    modules with target-specific options that differ from the command-line
29302    options.  */
29303 static void
29304 ix86_init_mmx_sse_builtins (void)
29305 {
29306 const struct builtin_description * d;
29307 enum ix86_builtin_func_type ftype;
29308 size_t i;
29309
29310 /* Add all special builtins with variable number of operands. */
29311 for (i = 0, d = bdesc_special_args;
29312 i < ARRAY_SIZE (bdesc_special_args);
29313 i++, d++)
29314 {
29315 if (d->name == 0)
29316 continue;
29317
29318 ftype = (enum ix86_builtin_func_type) d->flag;
29319 def_builtin (d->mask, d->name, ftype, d->code);
29320 }
29321
29322 /* Add all builtins with variable number of operands. */
29323 for (i = 0, d = bdesc_args;
29324 i < ARRAY_SIZE (bdesc_args);
29325 i++, d++)
29326 {
29327 if (d->name == 0)
29328 continue;
29329
29330 ftype = (enum ix86_builtin_func_type) d->flag;
29331 def_builtin_const (d->mask, d->name, ftype, d->code);
29332 }
29333
29334 /* pcmpestr[im] insns. */
29335 for (i = 0, d = bdesc_pcmpestr;
29336 i < ARRAY_SIZE (bdesc_pcmpestr);
29337 i++, d++)
29338 {
29339 if (d->code == IX86_BUILTIN_PCMPESTRM128)
29340 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
29341 else
29342 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
29343 def_builtin_const (d->mask, d->name, ftype, d->code);
29344 }
29345
29346 /* pcmpistr[im] insns. */
29347 for (i = 0, d = bdesc_pcmpistr;
29348 i < ARRAY_SIZE (bdesc_pcmpistr);
29349 i++, d++)
29350 {
29351 if (d->code == IX86_BUILTIN_PCMPISTRM128)
29352 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
29353 else
29354 ftype = INT_FTYPE_V16QI_V16QI_INT;
29355 def_builtin_const (d->mask, d->name, ftype, d->code);
29356 }
29357
29358 /* comi/ucomi insns. */
29359 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
29360 {
29361 if (d->mask == OPTION_MASK_ISA_SSE2)
29362 ftype = INT_FTYPE_V2DF_V2DF;
29363 else
29364 ftype = INT_FTYPE_V4SF_V4SF;
29365 def_builtin_const (d->mask, d->name, ftype, d->code);
29366 }
29367
29368 /* SSE */
29369 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
29370 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
29371 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
29372 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
29373
29374 /* SSE or 3DNow!A */
29375 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29376 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
29377 IX86_BUILTIN_MASKMOVQ);
29378
29379 /* SSE2 */
29380 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
29381 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
29382
29383 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
29384 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
29385 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
29386 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
29387
29388 /* SSE3. */
29389 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
29390 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
29391 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
29392 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
29393
29394 /* AES */
29395 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
29396 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
29397 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
29398 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
29399 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
29400 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
29401 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
29402 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
29403 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
29404 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
29405 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
29406 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
29407
29408 /* PCLMUL */
29409 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
29410 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
29411
29412 /* RDRND */
29413 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
29414 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
29415 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
29416 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
29417 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
29418 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
29419 IX86_BUILTIN_RDRAND64_STEP);
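
/* Illustrative usage sketch (not part of this file): these *_step
   builtins return the carry flag, so a caller can retry when no random
   value was available.  Assuming compilation with -mrdrnd:

     unsigned int r;
     if (__builtin_ia32_rdrand32_step (&r))
       consume (r);        // CF set: r holds a random value
     else
       fall_back ();       // CF clear: no entropy delivered

   consume and fall_back are hypothetical helpers.  */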
29420
29421 /* AVX2 */
29422 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
29423 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
29424 IX86_BUILTIN_GATHERSIV2DF);
29425
29426 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
29427 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
29428 IX86_BUILTIN_GATHERSIV4DF);
29429
29430 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
29431 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
29432 IX86_BUILTIN_GATHERDIV2DF);
29433
29434 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
29435 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
29436 IX86_BUILTIN_GATHERDIV4DF);
29437
29438 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
29439 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
29440 IX86_BUILTIN_GATHERSIV4SF);
29441
29442 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
29443 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
29444 IX86_BUILTIN_GATHERSIV8SF);
29445
29446 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
29447 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
29448 IX86_BUILTIN_GATHERDIV4SF);
29449
29450 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
29451 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
29452 IX86_BUILTIN_GATHERDIV8SF);
29453
29454 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
29455 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
29456 IX86_BUILTIN_GATHERSIV2DI);
29457
29458 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
29459 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
29460 IX86_BUILTIN_GATHERSIV4DI);
29461
29462 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
29463 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
29464 IX86_BUILTIN_GATHERDIV2DI);
29465
29466 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
29467 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
29468 IX86_BUILTIN_GATHERDIV4DI);
29469
29470 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
29471 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
29472 IX86_BUILTIN_GATHERSIV4SI);
29473
29474 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
29475 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
29476 IX86_BUILTIN_GATHERSIV8SI);
29477
29478 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
29479 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
29480 IX86_BUILTIN_GATHERDIV4SI);
29481
29482 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
29483 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
29484 IX86_BUILTIN_GATHERDIV8SI);
29485
29486   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df",
29487 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
29488 IX86_BUILTIN_GATHERALTSIV4DF);
29489
29490   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256",
29491 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
29492 IX86_BUILTIN_GATHERALTDIV8SF);
29493
29494   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di",
29495 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
29496 IX86_BUILTIN_GATHERALTSIV4DI);
29497
29498   def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256",
29499 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
29500 IX86_BUILTIN_GATHERALTDIV8SI);
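
/* These builtins back the AVX2 gather intrinsics in <immintrin.h>; for
   example, _mm256_i32gather_pd is (roughly) a wrapper around
   __builtin_ia32_gathersiv4df with an all-ones mask and the requested
   scale.  The *gatheralt* variants above are used internally by the
   vectorizer when the index and data element widths differ.  */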
29501
29502 /* RTM. */
29503 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
29504 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
29505
29506 /* MMX access to the vec_init patterns. */
29507 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
29508 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
29509
29510 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
29511 V4HI_FTYPE_HI_HI_HI_HI,
29512 IX86_BUILTIN_VEC_INIT_V4HI);
29513
29514 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
29515 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
29516 IX86_BUILTIN_VEC_INIT_V8QI);
29517
29518 /* Access to the vec_extract patterns. */
29519 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
29520 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
29521 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
29522 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
29523 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
29524 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
29525 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
29526 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
29527 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
29528 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
29529
29530 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29531 "__builtin_ia32_vec_ext_v4hi",
29532 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
29533
29534 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
29535 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
29536
29537 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
29538 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
29539
29540 /* Access to the vec_set patterns. */
29541 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
29542 "__builtin_ia32_vec_set_v2di",
29543 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
29544
29545 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
29546 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
29547
29548 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
29549 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
29550
29551 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
29552 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
29553
29554 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
29555 "__builtin_ia32_vec_set_v4hi",
29556 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
29557
29558 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
29559 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
29560
29561 /* RDSEED */
29562 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
29563 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
29564 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
29565 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
29566 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
29567 "__builtin_ia32_rdseed_di_step",
29568 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
29569
29570 /* ADCX */
29571 def_builtin (0, "__builtin_ia32_addcarryx_u32",
29572 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
29573 def_builtin (OPTION_MASK_ISA_64BIT,
29574 "__builtin_ia32_addcarryx_u64",
29575 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
29576 IX86_BUILTIN_ADDCARRYX64);
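
/* A minimal usage sketch (illustrative only): chaining the carry through
   a two-limb addition.  The first argument and the return value are the
   carry byte, matching UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED above:

     unsigned int lo, hi;
     unsigned char c;
     c = __builtin_ia32_addcarryx_u32 (0, a_lo, b_lo, &lo);
     c = __builtin_ia32_addcarryx_u32 (c, a_hi, b_hi, &hi);

   a_lo, b_lo, a_hi and b_hi are hypothetical operands.  */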
29577
29578 /* Add FMA4 multi-arg argument instructions */
29579 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
29580 {
29581 if (d->name == 0)
29582 continue;
29583
29584 ftype = (enum ix86_builtin_func_type) d->flag;
29585 def_builtin_const (d->mask, d->name, ftype, d->code);
29586 }
29587 }
29588
29589 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
29590 to return a pointer to VERSION_DECL if the outcome of the expression
29591 formed by PREDICATE_CHAIN is true. This function will be called during
29592 version dispatch to decide which function version to execute. It returns
29593 the basic block at the end, to which more conditions can be added. */
29594
29595 static basic_block
29596 add_condition_to_bb (tree function_decl, tree version_decl,
29597 tree predicate_chain, basic_block new_bb)
29598 {
29599 gimple return_stmt;
29600 tree convert_expr, result_var;
29601 gimple convert_stmt;
29602 gimple call_cond_stmt;
29603 gimple if_else_stmt;
29604
29605 basic_block bb1, bb2, bb3;
29606 edge e12, e23;
29607
29608 tree cond_var, and_expr_var = NULL_TREE;
29609 gimple_seq gseq;
29610
29611 tree predicate_decl, predicate_arg;
29612
29613 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
29614
29615 gcc_assert (new_bb != NULL);
29616 gseq = bb_seq (new_bb);
29617
29618
29619 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
29620 build_fold_addr_expr (version_decl));
29621 result_var = create_tmp_var (ptr_type_node, NULL);
29622 convert_stmt = gimple_build_assign (result_var, convert_expr);
29623 return_stmt = gimple_build_return (result_var);
29624
29625 if (predicate_chain == NULL_TREE)
29626 {
29627 gimple_seq_add_stmt (&gseq, convert_stmt);
29628 gimple_seq_add_stmt (&gseq, return_stmt);
29629 set_bb_seq (new_bb, gseq);
29630 gimple_set_bb (convert_stmt, new_bb);
29631 gimple_set_bb (return_stmt, new_bb);
29632 pop_cfun ();
29633 return new_bb;
29634 }
29635
29636 while (predicate_chain != NULL)
29637 {
29638 cond_var = create_tmp_var (integer_type_node, NULL);
29639 predicate_decl = TREE_PURPOSE (predicate_chain);
29640 predicate_arg = TREE_VALUE (predicate_chain);
29641 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
29642 gimple_call_set_lhs (call_cond_stmt, cond_var);
29643
29644 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
29645 gimple_set_bb (call_cond_stmt, new_bb);
29646 gimple_seq_add_stmt (&gseq, call_cond_stmt);
29647
29648 predicate_chain = TREE_CHAIN (predicate_chain);
29649
29650 if (and_expr_var == NULL)
29651 and_expr_var = cond_var;
29652 else
29653 {
29654 gimple assign_stmt;
29655 	  /* Use MIN_EXPR to check if any integer is zero:
29656 	     and_expr_var = min_expr <cond_var, and_expr_var>  */
29657 assign_stmt = gimple_build_assign (and_expr_var,
29658 build2 (MIN_EXPR, integer_type_node,
29659 cond_var, and_expr_var));
29660
29661 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
29662 gimple_set_bb (assign_stmt, new_bb);
29663 gimple_seq_add_stmt (&gseq, assign_stmt);
29664 }
29665 }
29666
29667 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
29668 integer_zero_node,
29669 NULL_TREE, NULL_TREE);
29670 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
29671 gimple_set_bb (if_else_stmt, new_bb);
29672 gimple_seq_add_stmt (&gseq, if_else_stmt);
29673
29674 gimple_seq_add_stmt (&gseq, convert_stmt);
29675 gimple_seq_add_stmt (&gseq, return_stmt);
29676 set_bb_seq (new_bb, gseq);
29677
29678 bb1 = new_bb;
29679 e12 = split_block (bb1, if_else_stmt);
29680 bb2 = e12->dest;
29681 e12->flags &= ~EDGE_FALLTHRU;
29682 e12->flags |= EDGE_TRUE_VALUE;
29683
29684 e23 = split_block (bb2, return_stmt);
29685
29686 gimple_set_bb (convert_stmt, bb2);
29687 gimple_set_bb (return_stmt, bb2);
29688
29689 bb3 = e23->dest;
29690 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
29691
29692 remove_edge (e23);
29693 make_edge (bb2, EXIT_BLOCK_PTR, 0);
29694
29695 pop_cfun ();
29696
29697 return bb3;
29698 }
29699
29700 /* This parses the attribute arguments to target in DECL and determines
29701 the right builtin to use to match the platform specification.
29702 It returns the priority value for this version decl. If PREDICATE_LIST
29703 is not NULL, it stores the list of cpu features that need to be checked
29704 before dispatching this function. */
29705
29706 static unsigned int
29707 get_builtin_code_for_version (tree decl, tree *predicate_list)
29708 {
29709 tree attrs;
29710 struct cl_target_option cur_target;
29711 tree target_node;
29712 struct cl_target_option *new_target;
29713 const char *arg_str = NULL;
29714 const char *attrs_str = NULL;
29715 char *tok_str = NULL;
29716 char *token;
29717
29718 /* Priority of i386 features, greater value is higher priority. This is
29719 used to decide the order in which function dispatch must happen. For
29720 instance, a version specialized for SSE4.2 should be checked for dispatch
29721 before a version for SSE3, as SSE4.2 implies SSE3. */
29722 enum feature_priority
29723 {
29724 P_ZERO = 0,
29725 P_MMX,
29726 P_SSE,
29727 P_SSE2,
29728 P_SSE3,
29729 P_SSSE3,
29730 P_PROC_SSSE3,
29731 P_SSE4_a,
29732 P_PROC_SSE4_a,
29733 P_SSE4_1,
29734 P_SSE4_2,
29735 P_PROC_SSE4_2,
29736 P_POPCNT,
29737 P_AVX,
29738 P_AVX2,
29739 P_FMA,
29740 P_PROC_FMA
29741 };
29742
29743 enum feature_priority priority = P_ZERO;
29744
29745 /* These are the target attribute strings for which a dispatcher is
29746 available, from fold_builtin_cpu. */
29747
29748 static struct _feature_list
29749 {
29750 const char *const name;
29751 const enum feature_priority priority;
29752 }
29753 const feature_list[] =
29754 {
29755 {"mmx", P_MMX},
29756 {"sse", P_SSE},
29757 {"sse2", P_SSE2},
29758 {"sse3", P_SSE3},
29759 {"ssse3", P_SSSE3},
29760 {"sse4.1", P_SSE4_1},
29761 {"sse4.2", P_SSE4_2},
29762 {"popcnt", P_POPCNT},
29763 {"avx", P_AVX},
29764 {"avx2", P_AVX2}
29765 };
29766
29767
29768 static unsigned int NUM_FEATURES
29769 = sizeof (feature_list) / sizeof (struct _feature_list);
29770
29771 unsigned int i;
29772
29773 tree predicate_chain = NULL_TREE;
29774 tree predicate_decl, predicate_arg;
29775
29776 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29777 gcc_assert (attrs != NULL);
29778
29779 attrs = TREE_VALUE (TREE_VALUE (attrs));
29780
29781 gcc_assert (TREE_CODE (attrs) == STRING_CST);
29782 attrs_str = TREE_STRING_POINTER (attrs);
29783
29784 /* Return priority zero for default function. */
29785 if (strcmp (attrs_str, "default") == 0)
29786 return 0;
29787
29788 /* Handle arch= if specified. For priority, set it to be 1 more than
29789 the best instruction set the processor can handle. For instance, if
29790 there is a version for atom and a version for ssse3 (the highest ISA
29791 priority for atom), the atom version must be checked for dispatch
29792 before the ssse3 version. */
29793 if (strstr (attrs_str, "arch=") != NULL)
29794 {
29795 cl_target_option_save (&cur_target, &global_options);
29796 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
29797 &global_options_set);
29798
29799 gcc_assert (target_node);
29800 new_target = TREE_TARGET_OPTION (target_node);
29801 gcc_assert (new_target);
29802
29803 if (new_target->arch_specified && new_target->arch > 0)
29804 {
29805 switch (new_target->arch)
29806 {
29807 case PROCESSOR_CORE2:
29808 arg_str = "core2";
29809 priority = P_PROC_SSSE3;
29810 break;
29811 case PROCESSOR_COREI7:
29812 arg_str = "corei7";
29813 priority = P_PROC_SSE4_2;
29814 break;
29815 case PROCESSOR_COREI7_AVX:
29816 arg_str = "corei7-avx";
29817 priority = P_PROC_SSE4_2;
29818 break;
29819 case PROCESSOR_ATOM:
29820 arg_str = "atom";
29821 priority = P_PROC_SSSE3;
29822 break;
29823 case PROCESSOR_AMDFAM10:
29824 arg_str = "amdfam10h";
29825 priority = P_PROC_SSE4_a;
29826 break;
29827 case PROCESSOR_BDVER1:
29828 arg_str = "bdver1";
29829 priority = P_PROC_FMA;
29830 break;
29831 case PROCESSOR_BDVER2:
29832 arg_str = "bdver2";
29833 priority = P_PROC_FMA;
29834 break;
29835 }
29836 }
29837
29838 cl_target_option_restore (&global_options, &cur_target);
29839
29840 if (predicate_list && arg_str == NULL)
29841 {
29842 error_at (DECL_SOURCE_LOCATION (decl),
29843 "No dispatcher found for the versioning attributes");
29844 return 0;
29845 }
29846
29847 if (predicate_list)
29848 {
29849 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
29850 /* For a C string literal the length includes the trailing NULL. */
29851 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
29852 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29853 predicate_chain);
29854 }
29855 }
29856
29857 /* Process feature name. */
29858 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
29859 strcpy (tok_str, attrs_str);
29860 token = strtok (tok_str, ",");
29861 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
29862
29863 while (token != NULL)
29864 {
29865 /* Do not process "arch=" */
29866 if (strncmp (token, "arch=", 5) == 0)
29867 {
29868 token = strtok (NULL, ",");
29869 continue;
29870 }
29871 for (i = 0; i < NUM_FEATURES; ++i)
29872 {
29873 if (strcmp (token, feature_list[i].name) == 0)
29874 {
29875 if (predicate_list)
29876 {
29877 predicate_arg = build_string_literal (
29878 strlen (feature_list[i].name) + 1,
29879 feature_list[i].name);
29880 predicate_chain = tree_cons (predicate_decl, predicate_arg,
29881 predicate_chain);
29882 }
29883 /* Find the maximum priority feature. */
29884 if (feature_list[i].priority > priority)
29885 priority = feature_list[i].priority;
29886
29887 break;
29888 }
29889 }
29890 if (predicate_list && i == NUM_FEATURES)
29891 {
29892 error_at (DECL_SOURCE_LOCATION (decl),
29893 "No dispatcher found for %s", token);
29894 return 0;
29895 }
29896 token = strtok (NULL, ",");
29897 }
29898 free (tok_str);
29899
29900 if (predicate_list && predicate_chain == NULL_TREE)
29901 {
29902 error_at (DECL_SOURCE_LOCATION (decl),
29903 "No dispatcher found for the versioning attributes : %s",
29904 attrs_str);
29905 return 0;
29906 }
29907 else if (predicate_list)
29908 {
29909 predicate_chain = nreverse (predicate_chain);
29910 *predicate_list = predicate_chain;
29911 }
29912
29913 return priority;
29914 }
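
/* For example (illustrative only), given the function versions

     __attribute__ ((target ("default")))      int foo (void);
     __attribute__ ((target ("sse4.2")))       int foo (void);
     __attribute__ ((target ("arch=corei7")))  int foo (void);

   this returns 0 for the default version, P_SSE4_2 for the "sse4.2"
   version and P_PROC_SSE4_2 for the "arch=corei7" version, so the
   arch-specific version is considered for dispatch first.  */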
29915
29916 /* This compares the priority of target features in function DECL1
29917 and DECL2. It returns positive value if DECL1 is higher priority,
29918 negative value if DECL2 is higher priority and 0 if they are the
29919 same. */
29920
29921 static int
29922 ix86_compare_version_priority (tree decl1, tree decl2)
29923 {
29924 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
29925 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
29926
29927 return (int)priority1 - (int)priority2;
29928 }
29929
29930 /* V1 and V2 point to function versions with different priorities
29931 based on the target ISA. This function compares their priorities. */
29932
29933 static int
29934 feature_compare (const void *v1, const void *v2)
29935 {
29936 typedef struct _function_version_info
29937 {
29938 tree version_decl;
29939 tree predicate_chain;
29940 unsigned int dispatch_priority;
29941 } function_version_info;
29942
29943 const function_version_info c1 = *(const function_version_info *)v1;
29944 const function_version_info c2 = *(const function_version_info *)v2;
29945 return (c2.dispatch_priority - c1.dispatch_priority);
29946 }
29947
29948 /* This function generates the dispatch function for
29949 multi-versioned functions. DISPATCH_DECL is the function which will
29950 contain the dispatch logic. FNDECLS are the function choices for
29951 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
29952 in DISPATCH_DECL in which the dispatch code is generated. */
29953
29954 static int
29955 dispatch_function_versions (tree dispatch_decl,
29956 void *fndecls_p,
29957 basic_block *empty_bb)
29958 {
29959 tree default_decl;
29960 gimple ifunc_cpu_init_stmt;
29961 gimple_seq gseq;
29962 int ix;
29963 tree ele;
29964 vec<tree> *fndecls;
29965 unsigned int num_versions = 0;
29966 unsigned int actual_versions = 0;
29967 unsigned int i;
29968
29969 struct _function_version_info
29970 {
29971 tree version_decl;
29972 tree predicate_chain;
29973 unsigned int dispatch_priority;
29974 }*function_version_info;
29975
29976 gcc_assert (dispatch_decl != NULL
29977 && fndecls_p != NULL
29978 && empty_bb != NULL);
29979
29980   /* fndecls_p is actually a vector.  */
29981 fndecls = static_cast<vec<tree> *> (fndecls_p);
29982
29983 /* At least one more version other than the default. */
29984 num_versions = fndecls->length ();
29985 gcc_assert (num_versions >= 2);
29986
29987 function_version_info = (struct _function_version_info *)
29988 XNEWVEC (struct _function_version_info, (num_versions - 1));
29989
29990 /* The first version in the vector is the default decl. */
29991 default_decl = (*fndecls)[0];
29992
29993 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
29994
29995 gseq = bb_seq (*empty_bb);
29996 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
29997      constructors, so explicitly call __builtin_cpu_init here.  */
29998 ifunc_cpu_init_stmt = gimple_build_call_vec (
29999 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
30000 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
30001 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
30002 set_bb_seq (*empty_bb, gseq);
30003
30004 pop_cfun ();
30005
30006
30007 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
30008 {
30009 tree version_decl = ele;
30010 tree predicate_chain = NULL_TREE;
30011 unsigned int priority;
30012 /* Get attribute string, parse it and find the right predicate decl.
30013 The predicate function could be a lengthy combination of many
30014 features, like arch-type and various isa-variants. */
30015 priority = get_builtin_code_for_version (version_decl,
30016 &predicate_chain);
30017
30018 if (predicate_chain == NULL_TREE)
30019 continue;
30020
30021 function_version_info [actual_versions].version_decl = version_decl;
30022 function_version_info [actual_versions].predicate_chain
30023 = predicate_chain;
30024 function_version_info [actual_versions].dispatch_priority = priority;
30025 actual_versions++;
30026 }
30027
30028 /* Sort the versions according to descending order of dispatch priority. The
30029 priority is based on the ISA. This is not a perfect solution. There
30030 could still be ambiguity. If more than one function version is suitable
30031 to execute, which one should be dispatched? In future, allow the user
30032 to specify a dispatch priority next to the version. */
30033 qsort (function_version_info, actual_versions,
30034 sizeof (struct _function_version_info), feature_compare);
30035
30036 for (i = 0; i < actual_versions; ++i)
30037 *empty_bb = add_condition_to_bb (dispatch_decl,
30038 function_version_info[i].version_decl,
30039 function_version_info[i].predicate_chain,
30040 *empty_bb);
30041
30042   /* Dispatch the default version at the end.  */
30043 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
30044 NULL, *empty_bb);
30045
30046 free (function_version_info);
30047 return 0;
30048 }
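
/* Schematically, the resolver body built above amounts to (a sketch,
   with hypothetical version names):

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))        return foo.arch_corei7;
       if (__builtin_cpu_supports ("sse4.2"))  return foo.sse4_2;
       return foo;   // default version, appended last
     }

   The conditions are emitted in descending priority order, as computed
   by get_builtin_code_for_version.  */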
30049
30050 /* Comparator function to be used in qsort routine to sort attribute
30051 specification strings to "target". */
30052
30053 static int
30054 attr_strcmp (const void *v1, const void *v2)
30055 {
30056 const char *c1 = *(char *const*)v1;
30057 const char *c2 = *(char *const*)v2;
30058 return strcmp (c1, c2);
30059 }
30060
30061 /* ARGLIST is the argument to target attribute. This function tokenizes
30062 the comma separated arguments, sorts them and returns a string which
30063 is a unique identifier for the comma separated arguments. It also
30064 replaces non-identifier characters "=,-" with "_". */
30065
30066 static char *
30067 sorted_attr_string (tree arglist)
30068 {
30069 tree arg;
30070 size_t str_len_sum = 0;
30071 char **args = NULL;
30072 char *attr_str, *ret_str;
30073 char *attr = NULL;
30074 unsigned int argnum = 1;
30075 unsigned int i;
30076
30077 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30078 {
30079 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30080 size_t len = strlen (str);
30081 str_len_sum += len + 1;
30082 if (arg != arglist)
30083 argnum++;
30084 for (i = 0; i < strlen (str); i++)
30085 if (str[i] == ',')
30086 argnum++;
30087 }
30088
30089 attr_str = XNEWVEC (char, str_len_sum);
30090 str_len_sum = 0;
30091 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
30092 {
30093 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
30094 size_t len = strlen (str);
30095 memcpy (attr_str + str_len_sum, str, len);
30096 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
30097 str_len_sum += len + 1;
30098 }
30099
30100 /* Replace "=,-" with "_". */
30101 for (i = 0; i < strlen (attr_str); i++)
30102 if (attr_str[i] == '=' || attr_str[i]== '-')
30103 attr_str[i] = '_';
30104
30105 if (argnum == 1)
30106 return attr_str;
30107
30108 args = XNEWVEC (char *, argnum);
30109
30110 i = 0;
30111 attr = strtok (attr_str, ",");
30112 while (attr != NULL)
30113 {
30114 args[i] = attr;
30115 i++;
30116 attr = strtok (NULL, ",");
30117 }
30118
30119 qsort (args, argnum, sizeof (char *), attr_strcmp);
30120
30121 ret_str = XNEWVEC (char, str_len_sum);
30122 str_len_sum = 0;
30123 for (i = 0; i < argnum; i++)
30124 {
30125 size_t len = strlen (args[i]);
30126 memcpy (ret_str + str_len_sum, args[i], len);
30127 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
30128 str_len_sum += len + 1;
30129 }
30130
30131 XDELETEVEC (args);
30132 XDELETEVEC (attr_str);
30133 return ret_str;
30134 }
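
/* For instance (illustrative), the attribute arguments "avx,arch=corei7"
   and "arch=corei7,avx" both yield "arch_corei7_avx": '=' and '-' become
   '_', the comma-separated pieces are sorted, and they are rejoined with
   '_'.  */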
30135
30136 /* This function changes the assembler name for functions that are
30137 versions. If DECL is a function version and has a "target"
30138 attribute, it appends the attribute string to its assembler name. */
30139
30140 static tree
30141 ix86_mangle_function_version_assembler_name (tree decl, tree id)
30142 {
30143 tree version_attr;
30144 const char *orig_name, *version_string;
30145 char *attr_str, *assembler_name;
30146
30147 if (DECL_DECLARED_INLINE_P (decl)
30148 && lookup_attribute ("gnu_inline",
30149 DECL_ATTRIBUTES (decl)))
30150 error_at (DECL_SOURCE_LOCATION (decl),
30151 "Function versions cannot be marked as gnu_inline,"
30152 " bodies have to be generated");
30153
30154 if (DECL_VIRTUAL_P (decl)
30155 || DECL_VINDEX (decl))
30156 sorry ("Virtual function multiversioning not supported");
30157
30158 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30159
30160 /* target attribute string cannot be NULL. */
30161 gcc_assert (version_attr != NULL_TREE);
30162
30163 orig_name = IDENTIFIER_POINTER (id);
30164 version_string
30165 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
30166
30167 if (strcmp (version_string, "default") == 0)
30168 return id;
30169
30170 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
30171 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
30172
30173 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
30174
30175 /* Allow assembler name to be modified if already set. */
30176 if (DECL_ASSEMBLER_NAME_SET_P (decl))
30177 SET_DECL_RTL (decl, NULL);
30178
30179 tree ret = get_identifier (assembler_name);
30180 XDELETEVEC (attr_str);
30181 XDELETEVEC (assembler_name);
30182 return ret;
30183 }
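
/* Thus a version of foo declared with target("avx,arch=corei7") gets the
   assembler name "foo.arch_corei7_avx" (illustrative), while the
   "default" version keeps its original assembler name.  */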
30184
30185 /* This function returns true if FN1 and FN2 are versions of the same function,
30186 that is, the target strings of the function decls are different. This assumes
30187 that FN1 and FN2 have the same signature. */
30188
30189 static bool
30190 ix86_function_versions (tree fn1, tree fn2)
30191 {
30192 tree attr1, attr2;
30193 char *target1, *target2;
30194 bool result;
30195
30196 if (TREE_CODE (fn1) != FUNCTION_DECL
30197 || TREE_CODE (fn2) != FUNCTION_DECL)
30198 return false;
30199
30200 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
30201 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
30202
30203 /* At least one function decl should have the target attribute specified. */
30204 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
30205 return false;
30206
30207 /* Diagnose missing target attribute if one of the decls is already
30208 multi-versioned. */
30209 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
30210 {
30211 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
30212 {
30213 if (attr2 != NULL_TREE)
30214 {
30215 tree tem = fn1;
30216 fn1 = fn2;
30217 fn2 = tem;
30218 attr1 = attr2;
30219 }
30220 error_at (DECL_SOURCE_LOCATION (fn2),
30221 "missing %<target%> attribute for multi-versioned %D",
30222 fn2);
30223 inform (DECL_SOURCE_LOCATION (fn1),
30224 "previous declaration of %D", fn1);
30225 /* Prevent diagnosing of the same error multiple times. */
30226 DECL_ATTRIBUTES (fn2)
30227 = tree_cons (get_identifier ("target"),
30228 copy_node (TREE_VALUE (attr1)),
30229 DECL_ATTRIBUTES (fn2));
30230 }
30231 return false;
30232 }
30233
30234 target1 = sorted_attr_string (TREE_VALUE (attr1));
30235 target2 = sorted_attr_string (TREE_VALUE (attr2));
30236
30237 /* The sorted target strings must be different for fn1 and fn2
30238 to be versions. */
30239 if (strcmp (target1, target2) == 0)
30240 result = false;
30241 else
30242 result = true;
30243
30244 XDELETEVEC (target1);
30245 XDELETEVEC (target2);
30246
30247 return result;
30248 }
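
/* For example (illustrative): a declaration of foo with target("avx")
   and another with target("sse4.2") are treated as versions of each
   other, whereas two declarations both carrying target("avx") are not,
   since their sorted target strings compare equal.  */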
30249
30250 static tree
30251 ix86_mangle_decl_assembler_name (tree decl, tree id)
30252 {
30253 /* For function version, add the target suffix to the assembler name. */
30254 if (TREE_CODE (decl) == FUNCTION_DECL
30255 && DECL_FUNCTION_VERSIONED (decl))
30256 id = ix86_mangle_function_version_assembler_name (decl, id);
30257 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
30258 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
30259 #endif
30260
30261 return id;
30262 }
30263
30264 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
30265 is true, append the full path name of the source file. */
30266
30267 static char *
30268 make_name (tree decl, const char *suffix, bool make_unique)
30269 {
30270 char *global_var_name;
30271 int name_len;
30272 const char *name;
30273 const char *unique_name = NULL;
30274
30275 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
30276
30277 /* Get a unique name that can be used globally without any chances
30278 of collision at link time. */
30279 if (make_unique)
30280 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
30281
30282 name_len = strlen (name) + strlen (suffix) + 2;
30283
30284 if (make_unique)
30285 name_len += strlen (unique_name) + 1;
30286 global_var_name = XNEWVEC (char, name_len);
30287
30288 /* Use '.' to concatenate names as it is demangler friendly. */
30289 if (make_unique)
30290 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
30291 suffix);
30292 else
30293 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
30294
30295 return global_var_name;
30296 }
30297
30298 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30299
30300 /* Make a dispatcher declaration for the multi-versioned function DECL.
30301 Calls to DECL function will be replaced with calls to the dispatcher
30302 by the front-end. Return the decl created. */
30303
30304 static tree
30305 make_dispatcher_decl (const tree decl)
30306 {
30307 tree func_decl;
30308 char *func_name;
30309 tree fn_type, func_type;
30310 bool is_uniq = false;
30311
30312 if (TREE_PUBLIC (decl) == 0)
30313 is_uniq = true;
30314
30315 func_name = make_name (decl, "ifunc", is_uniq);
30316
30317 fn_type = TREE_TYPE (decl);
30318 func_type = build_function_type (TREE_TYPE (fn_type),
30319 TYPE_ARG_TYPES (fn_type));
30320
30321 func_decl = build_fn_decl (func_name, func_type);
30322 XDELETEVEC (func_name);
30323 TREE_USED (func_decl) = 1;
30324 DECL_CONTEXT (func_decl) = NULL_TREE;
30325 DECL_INITIAL (func_decl) = error_mark_node;
30326 DECL_ARTIFICIAL (func_decl) = 1;
30327 /* Mark this func as external, the resolver will flip it again if
30328 it gets generated. */
30329 DECL_EXTERNAL (func_decl) = 1;
30330   /* This will be an IFUNC, and IFUNCs have to be externally visible.  */
30331 TREE_PUBLIC (func_decl) = 1;
30332
30333 return func_decl;
30334 }
30335
30336 #endif
30337
30338 /* Returns true if DECL is multi-versioned and is the default function,
30339    that is, it is not tagged with a target-specific optimization.  */
30340
30341 static bool
30342 is_function_default_version (const tree decl)
30343 {
30344 if (TREE_CODE (decl) != FUNCTION_DECL
30345 || !DECL_FUNCTION_VERSIONED (decl))
30346 return false;
30347 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
30348 gcc_assert (attr);
30349 attr = TREE_VALUE (TREE_VALUE (attr));
30350 return (TREE_CODE (attr) == STRING_CST
30351 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
30352 }
30353
30354 /* Make a dispatcher declaration for the multi-versioned function DECL.
30355 Calls to DECL function will be replaced with calls to the dispatcher
30356 by the front-end. Returns the decl of the dispatcher function. */
30357
30358 static tree
30359 ix86_get_function_versions_dispatcher (void *decl)
30360 {
30361 tree fn = (tree) decl;
30362 struct cgraph_node *node = NULL;
30363 struct cgraph_node *default_node = NULL;
30364 struct cgraph_function_version_info *node_v = NULL;
30365 struct cgraph_function_version_info *first_v = NULL;
30366
30367 tree dispatch_decl = NULL;
30368
30369 struct cgraph_function_version_info *default_version_info = NULL;
30370
30371 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
30372
30373 node = cgraph_get_node (fn);
30374 gcc_assert (node != NULL);
30375
30376 node_v = get_cgraph_node_version (node);
30377 gcc_assert (node_v != NULL);
30378
30379 if (node_v->dispatcher_resolver != NULL)
30380 return node_v->dispatcher_resolver;
30381
30382 /* Find the default version and make it the first node. */
30383 first_v = node_v;
30384 /* Go to the beginning of the chain. */
30385 while (first_v->prev != NULL)
30386 first_v = first_v->prev;
30387 default_version_info = first_v;
30388 while (default_version_info != NULL)
30389 {
30390 if (is_function_default_version
30391 (default_version_info->this_node->decl))
30392 break;
30393 default_version_info = default_version_info->next;
30394 }
30395
30396 /* If there is no default node, just return NULL. */
30397 if (default_version_info == NULL)
30398 return NULL;
30399
30400 /* Make default info the first node. */
30401 if (first_v != default_version_info)
30402 {
30403 default_version_info->prev->next = default_version_info->next;
30404 if (default_version_info->next)
30405 default_version_info->next->prev = default_version_info->prev;
30406 first_v->prev = default_version_info;
30407 default_version_info->next = first_v;
30408 default_version_info->prev = NULL;
30409 }
30410
30411 default_node = default_version_info->this_node;
30412
30413 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
30414 if (targetm.has_ifunc_p ())
30415 {
30416 struct cgraph_function_version_info *it_v = NULL;
30417 struct cgraph_node *dispatcher_node = NULL;
30418 struct cgraph_function_version_info *dispatcher_version_info = NULL;
30419
30420 /* Right now, the dispatching is done via ifunc. */
30421 dispatch_decl = make_dispatcher_decl (default_node->decl);
30422
30423 dispatcher_node = cgraph_get_create_node (dispatch_decl);
30424 gcc_assert (dispatcher_node != NULL);
30425 dispatcher_node->dispatcher_function = 1;
30426 dispatcher_version_info
30427 = insert_new_cgraph_node_version (dispatcher_node);
30428 dispatcher_version_info->next = default_version_info;
30429 dispatcher_node->definition = 1;
30430
30431 /* Set the dispatcher for all the versions. */
30432 it_v = default_version_info;
30433 while (it_v != NULL)
30434 {
30435 it_v->dispatcher_resolver = dispatch_decl;
30436 it_v = it_v->next;
30437 }
30438 }
30439 else
30440 #endif
30441 {
30442 error_at (DECL_SOURCE_LOCATION (default_node->decl),
30443 "multiversioning needs ifunc which is not supported "
30444 "on this target");
30445 }
30446
30447 return dispatch_decl;
30448 }
30449
30450 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
30451 it to CHAIN. */
30452
30453 static tree
30454 make_attribute (const char *name, const char *arg_name, tree chain)
30455 {
30456 tree attr_name;
30457 tree attr_arg_name;
30458 tree attr_args;
30459 tree attr;
30460
30461 attr_name = get_identifier (name);
30462 attr_arg_name = build_string (strlen (arg_name), arg_name);
30463 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
30464 attr = tree_cons (attr_name, attr_args, chain);
30465 return attr;
30466 }
30467
30468 /* Make the resolver function decl to dispatch the versions of
30469 a multi-versioned function, DEFAULT_DECL. Create an
30470 empty basic block in the resolver and store the pointer in
30471 EMPTY_BB. Return the decl of the resolver function. */
30472
30473 static tree
30474 make_resolver_func (const tree default_decl,
30475 const tree dispatch_decl,
30476 basic_block *empty_bb)
30477 {
30478 char *resolver_name;
30479 tree decl, type, decl_name, t;
30480 bool is_uniq = false;
30481
30482   /* IFUNCs have to be globally visible.  So, if the default_decl is
30483      not, then the name of the IFUNC should be made unique.  */
30484 if (TREE_PUBLIC (default_decl) == 0)
30485 is_uniq = true;
30486
30487 /* Append the filename to the resolver function if the versions are
30488 not externally visible. This is because the resolver function has
30489 to be externally visible for the loader to find it. So, appending
30490 the filename will prevent conflicts with a resolver function from
30491 another module which is based on the same version name. */
30492 resolver_name = make_name (default_decl, "resolver", is_uniq);
30493
30494 /* The resolver function should return a (void *). */
30495 type = build_function_type_list (ptr_type_node, NULL_TREE);
30496
30497 decl = build_fn_decl (resolver_name, type);
30498 decl_name = get_identifier (resolver_name);
30499 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
30500
30501 DECL_NAME (decl) = decl_name;
30502 TREE_USED (decl) = 1;
30503 DECL_ARTIFICIAL (decl) = 1;
30504 DECL_IGNORED_P (decl) = 0;
30505 /* IFUNC resolvers have to be externally visible. */
30506 TREE_PUBLIC (decl) = 1;
30507 DECL_UNINLINABLE (decl) = 1;
30508
30509 /* Resolver is not external, body is generated. */
30510 DECL_EXTERNAL (decl) = 0;
30511 DECL_EXTERNAL (dispatch_decl) = 0;
30512
30513 DECL_CONTEXT (decl) = NULL_TREE;
30514 DECL_INITIAL (decl) = make_node (BLOCK);
30515 DECL_STATIC_CONSTRUCTOR (decl) = 0;
30516
30517 if (DECL_COMDAT_GROUP (default_decl)
30518 || TREE_PUBLIC (default_decl))
30519 {
30520 /* In this case, each translation unit with a call to this
30521 versioned function will put out a resolver. Ensure it
30522 is comdat to keep just one copy. */
30523 DECL_COMDAT (decl) = 1;
30524 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
30525 }
30526 /* Build result decl and add to function_decl. */
30527 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
30528 DECL_ARTIFICIAL (t) = 1;
30529 DECL_IGNORED_P (t) = 1;
30530 DECL_RESULT (decl) = t;
30531
30532 gimplify_function_tree (decl);
30533 push_cfun (DECL_STRUCT_FUNCTION (decl));
30534 *empty_bb = init_lowered_empty_function (decl, false);
30535
30536 cgraph_add_new_function (decl, true);
30537 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
30538
30539 pop_cfun ();
30540
30541 gcc_assert (dispatch_decl != NULL);
30542 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
30543 DECL_ATTRIBUTES (dispatch_decl)
30544 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
30545
30546 /* Create the alias for dispatch to resolver here. */
30547 /*cgraph_create_function_alias (dispatch_decl, decl);*/
30548 cgraph_same_body_alias (NULL, dispatch_decl, decl);
30549 XDELETEVEC (resolver_name);
30550 return decl;
30551 }
30552
30553 /* Generate the dispatching code body to dispatch multi-versioned function
30554 DECL. The target hook is called to process the "target" attributes and
30555 provide the code to dispatch the right function at run-time. NODE points
30556 to the dispatcher decl whose body will be created. */
30557
30558 static tree
30559 ix86_generate_version_dispatcher_body (void *node_p)
30560 {
30561 tree resolver_decl;
30562 basic_block empty_bb;
30563 vec<tree> fn_ver_vec = vNULL;
30564 tree default_ver_decl;
30565 struct cgraph_node *versn;
30566 struct cgraph_node *node;
30567
30568 struct cgraph_function_version_info *node_version_info = NULL;
30569 struct cgraph_function_version_info *versn_info = NULL;
30570
30571 node = (cgraph_node *)node_p;
30572
30573 node_version_info = get_cgraph_node_version (node);
30574 gcc_assert (node->dispatcher_function
30575 && node_version_info != NULL);
30576
30577 if (node_version_info->dispatcher_resolver)
30578 return node_version_info->dispatcher_resolver;
30579
30580 /* The first version in the chain corresponds to the default version. */
30581 default_ver_decl = node_version_info->next->this_node->decl;
30582
30583 /* node is going to be an alias, so remove the finalized bit. */
30584 node->definition = false;
30585
30586 resolver_decl = make_resolver_func (default_ver_decl,
30587 node->decl, &empty_bb);
30588
30589 node_version_info->dispatcher_resolver = resolver_decl;
30590
30591 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
30592
30593 fn_ver_vec.create (2);
30594
30595 for (versn_info = node_version_info->next; versn_info;
30596 versn_info = versn_info->next)
30597 {
30598 versn = versn_info->this_node;
30599 /* Check for virtual functions here again, as by this time it should
30600 have been determined if this function needs a vtable index or
30601 not. This happens for methods in derived classes that override
30602 virtual methods in base classes but are not explicitly marked as
30603 virtual. */
30604 if (DECL_VINDEX (versn->decl))
30605 sorry ("Virtual function multiversioning not supported");
30606
30607 fn_ver_vec.safe_push (versn->decl);
30608 }
30609
30610 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
30611 fn_ver_vec.release ();
30612 rebuild_cgraph_edges ();
30613 pop_cfun ();
30614 return resolver_decl;
30615 }
30616 /* This builds the processor_model struct type defined in
30617 libgcc/config/i386/cpuinfo.c */
30618
30619 static tree
30620 build_processor_model_struct (void)
30621 {
30622 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
30623 "__cpu_features"};
30624 tree field = NULL_TREE, field_chain = NULL_TREE;
30625 int i;
30626 tree type = make_node (RECORD_TYPE);
30627
30628 /* The first 3 fields are unsigned int. */
30629 for (i = 0; i < 3; ++i)
30630 {
30631 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30632 get_identifier (field_name[i]), unsigned_type_node);
30633 if (field_chain != NULL_TREE)
30634 DECL_CHAIN (field) = field_chain;
30635 field_chain = field;
30636 }
30637
30638 /* The last field is an array of unsigned integers of size one. */
30639 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
30640 get_identifier (field_name[3]),
30641 build_array_type (unsigned_type_node,
30642 build_index_type (size_one_node)));
30643 if (field_chain != NULL_TREE)
30644 DECL_CHAIN (field) = field_chain;
30645 field_chain = field;
30646
30647 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
30648 return type;
30649 }
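
/* The layout built above is meant to mirror the libgcc definition
   (a sketch of libgcc/config/i386/cpuinfo.c as of this writing):

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/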
30650
30651 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME.  */
30652
30653 static tree
30654 make_var_decl (tree type, const char *name)
30655 {
30656 tree new_decl;
30657
30658 new_decl = build_decl (UNKNOWN_LOCATION,
30659 VAR_DECL,
30660 get_identifier(name),
30661 type);
30662
30663 DECL_EXTERNAL (new_decl) = 1;
30664 TREE_STATIC (new_decl) = 1;
30665 TREE_PUBLIC (new_decl) = 1;
30666 DECL_INITIAL (new_decl) = 0;
30667 DECL_ARTIFICIAL (new_decl) = 0;
30668 DECL_PRESERVE_P (new_decl) = 1;
30669
30670 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
30671 assemble_variable (new_decl, 0, 0, 0);
30672
30673 return new_decl;
30674 }
30675
30676 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
30677 into an integer defined in libgcc/config/i386/cpuinfo.c */
30678
30679 static tree
30680 fold_builtin_cpu (tree fndecl, tree *args)
30681 {
30682 unsigned int i;
30683 enum ix86_builtins fn_code = (enum ix86_builtins)
30684 DECL_FUNCTION_CODE (fndecl);
30685 tree param_string_cst = NULL;
30686
30687 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
30688 enum processor_features
30689 {
30690 F_CMOV = 0,
30691 F_MMX,
30692 F_POPCNT,
30693 F_SSE,
30694 F_SSE2,
30695 F_SSE3,
30696 F_SSSE3,
30697 F_SSE4_1,
30698 F_SSE4_2,
30699 F_AVX,
30700 F_AVX2,
30701 F_MAX
30702 };
30703
30704 /* These are the values for the vendor, cpu type and cpu subtype
30705 enumerations in cpuinfo.c.  Cpu types and subtypes must have the
30706 corresponding start value subtracted before use. */
30707 enum processor_model
30708 {
30709 M_INTEL = 1,
30710 M_AMD,
30711 M_CPU_TYPE_START,
30712 M_INTEL_ATOM,
30713 M_INTEL_CORE2,
30714 M_INTEL_COREI7,
30715 M_AMDFAM10H,
30716 M_AMDFAM15H,
30717 M_INTEL_SLM,
30718 M_CPU_SUBTYPE_START,
30719 M_INTEL_COREI7_NEHALEM,
30720 M_INTEL_COREI7_WESTMERE,
30721 M_INTEL_COREI7_SANDYBRIDGE,
30722 M_AMDFAM10H_BARCELONA,
30723 M_AMDFAM10H_SHANGHAI,
30724 M_AMDFAM10H_ISTANBUL,
30725 M_AMDFAM15H_BDVER1,
30726 M_AMDFAM15H_BDVER2,
30727 M_AMDFAM15H_BDVER3
30728 };
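/* Worked example using the enum values above: "corei7" maps to
   M_INTEL_COREI7 (6), a cpu type, so it is compared against __cpu_type
   as 6 - M_CPU_TYPE_START = 3; "nehalem" maps to
   M_INTEL_COREI7_NEHALEM (11), a cpu subtype, so it is compared against
   __cpu_subtype as 11 - M_CPU_SUBTYPE_START = 1.  */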
30729
30730 static struct _arch_names_table
30731 {
30732 const char *const name;
30733 const enum processor_model model;
30734 }
30735 const arch_names_table[] =
30736 {
30737 {"amd", M_AMD},
30738 {"intel", M_INTEL},
30739 {"atom", M_INTEL_ATOM},
30740 {"slm", M_INTEL_SLM},
30741 {"core2", M_INTEL_CORE2},
30742 {"corei7", M_INTEL_COREI7},
30743 {"nehalem", M_INTEL_COREI7_NEHALEM},
30744 {"westmere", M_INTEL_COREI7_WESTMERE},
30745 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
30746 {"amdfam10h", M_AMDFAM10H},
30747 {"barcelona", M_AMDFAM10H_BARCELONA},
30748 {"shanghai", M_AMDFAM10H_SHANGHAI},
30749 {"istanbul", M_AMDFAM10H_ISTANBUL},
30750 {"amdfam15h", M_AMDFAM15H},
30751 {"bdver1", M_AMDFAM15H_BDVER1},
30752 {"bdver2", M_AMDFAM15H_BDVER2},
30753 {"bdver3", M_AMDFAM15H_BDVER3},
30754 };
30755
30756 static struct _isa_names_table
30757 {
30758 const char *const name;
30759 const enum processor_features feature;
30760 }
30761 const isa_names_table[] =
30762 {
30763 {"cmov", F_CMOV},
30764 {"mmx", F_MMX},
30765 {"popcnt", F_POPCNT},
30766 {"sse", F_SSE},
30767 {"sse2", F_SSE2},
30768 {"sse3", F_SSE3},
30769 {"ssse3", F_SSSE3},
30770 {"sse4.1", F_SSE4_1},
30771 {"sse4.2", F_SSE4_2},
30772 {"avx", F_AVX},
30773 {"avx2", F_AVX2}
30774 };
30775
30776 tree __processor_model_type = build_processor_model_struct ();
30777 tree __cpu_model_var = make_var_decl (__processor_model_type,
30778 "__cpu_model");
30779
30780
30781 varpool_add_new_variable (__cpu_model_var);
30782
30783 gcc_assert ((args != NULL) && (*args != NULL));
30784
30785 param_string_cst = *args;
30786 while (param_string_cst
30787 && TREE_CODE (param_string_cst) != STRING_CST)
30788 {
30789 /* *args must be an expr that can contain other EXPRs leading to a
30790 STRING_CST. */
30791 if (!EXPR_P (param_string_cst))
30792 {
30793 error ("Parameter to builtin must be a string constant or literal");
30794 return integer_zero_node;
30795 }
30796 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
30797 }
30798
30799 gcc_assert (param_string_cst);
30800
30801 if (fn_code == IX86_BUILTIN_CPU_IS)
30802 {
30803 tree ref;
30804 tree field;
30805 tree final;
30806
30807 unsigned int field_val = 0;
30808 unsigned int NUM_ARCH_NAMES
30809 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
30810
30811 for (i = 0; i < NUM_ARCH_NAMES; i++)
30812 if (strcmp (arch_names_table[i].name,
30813 TREE_STRING_POINTER (param_string_cst)) == 0)
30814 break;
30815
30816 if (i == NUM_ARCH_NAMES)
30817 {
30818 error ("Parameter to builtin not valid: %s",
30819 TREE_STRING_POINTER (param_string_cst));
30820 return integer_zero_node;
30821 }
30822
30823 field = TYPE_FIELDS (__processor_model_type);
30824 field_val = arch_names_table[i].model;
30825
30826 /* CPU types are stored in the next field. */
30827 if (field_val > M_CPU_TYPE_START
30828 && field_val < M_CPU_SUBTYPE_START)
30829 {
30830 field = DECL_CHAIN (field);
30831 field_val -= M_CPU_TYPE_START;
30832 }
30833
30834 /* CPU subtypes are stored two fields down, in __cpu_subtype. */
30835 if (field_val > M_CPU_SUBTYPE_START)
30836 {
30837 field = DECL_CHAIN (DECL_CHAIN (field));
30838 field_val -= M_CPU_SUBTYPE_START;
30839 }
30840
30841 /* Get the appropriate field in __cpu_model. */
30842 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30843 field, NULL_TREE);
30844
30845 /* Check the value. */
30846 final = build2 (EQ_EXPR, unsigned_type_node, ref,
30847 build_int_cstu (unsigned_type_node, field_val));
30848 return build1 (CONVERT_EXPR, integer_type_node, final);
30849 }
30850 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30851 {
30852 tree ref;
30853 tree array_elt;
30854 tree field;
30855 tree final;
30856
30857 unsigned int field_val = 0;
30858 unsigned int NUM_ISA_NAMES
30859 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
30860
30861 for (i = 0; i < NUM_ISA_NAMES; i++)
30862 if (strcmp (isa_names_table[i].name,
30863 TREE_STRING_POINTER (param_string_cst)) == 0)
30864 break;
30865
30866 if (i == NUM_ISA_NAMES)
30867 {
30868 error ("Parameter to builtin not valid: %s",
30869 TREE_STRING_POINTER (param_string_cst));
30870 return integer_zero_node;
30871 }
30872
30873 field = TYPE_FIELDS (__processor_model_type);
30874 /* Get the last field, which is __cpu_features. */
30875 while (DECL_CHAIN (field))
30876 field = DECL_CHAIN (field);
30877
30878 /* Get the appropriate field: __cpu_model.__cpu_features */
30879 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
30880 field, NULL_TREE);
30881
30882 /* Access the 0th element of __cpu_features array. */
30883 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
30884 integer_zero_node, NULL_TREE, NULL_TREE);
30885
30886 field_val = (1 << isa_names_table[i].feature);
30887 /* Return __cpu_model.__cpu_features[0] & field_val */
30888 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
30889 build_int_cstu (unsigned_type_node, field_val));
30890 return build1 (CONVERT_EXPR, integer_type_node, final);
30891 }
30892 gcc_unreachable ();
30893 }
30894
30895 static tree
30896 ix86_fold_builtin (tree fndecl, int n_args,
30897 tree *args, bool ignore ATTRIBUTE_UNUSED)
30898 {
30899 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
30900 {
30901 enum ix86_builtins fn_code = (enum ix86_builtins)
30902 DECL_FUNCTION_CODE (fndecl);
30903 if (fn_code == IX86_BUILTIN_CPU_IS
30904 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
30905 {
30906 gcc_assert (n_args == 1);
30907 return fold_builtin_cpu (fndecl, args);
30908 }
30909 }
30910
30911 #ifdef SUBTARGET_FOLD_BUILTIN
30912 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
30913 #endif
30914
30915 return NULL_TREE;
30916 }
30917
30918 /* Make builtins to detect cpu type and features supported.  NAME is
30919 the builtin name, CODE is the builtin code, FTYPE is the function type
30920 of the builtin, and IS_CONST says whether to mark it TREE_READONLY. */
30921
30922 static void
30923 make_cpu_type_builtin (const char* name, int code,
30924 enum ix86_builtin_func_type ftype, bool is_const)
30925 {
30926 tree decl;
30927 tree type;
30928
30929 type = ix86_get_builtin_func_type (ftype);
30930 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
30931 NULL, NULL_TREE);
30932 gcc_assert (decl != NULL_TREE);
30933 ix86_builtins[(int) code] = decl;
30934 TREE_READONLY (decl) = is_const;
30935 }
30936
30937 /* Make builtins to get CPU type and features supported. The created
30938 builtins are:
30939
30940 __builtin_cpu_init (), to detect cpu type and features,
30941 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
30942 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
30943 */
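/* Illustrative user-level usage of these builtins (not part of this
   file; the function name is hypothetical):

     int
     pick_impl (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))
         return 1;
       return __builtin_cpu_supports ("avx2");
     }
*/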
30944
30945 static void
30946 ix86_init_platform_type_builtins (void)
30947 {
30948 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
30949 INT_FTYPE_VOID, false);
30950 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
30951 INT_FTYPE_PCCHAR, true);
30952 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
30953 INT_FTYPE_PCCHAR, true);
30954 }
30955
30956 /* Internal method for ix86_init_builtins. */
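/* The function below registers the ms_abi/sysv_abi varargs builtins.
   A hedged usage sketch (the function name is hypothetical; the
   builtins and attribute are the ones registered here):

     void __attribute__ ((ms_abi))
     use_ms_varargs (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       ... consume arguments with __builtin_va_arg (ap, type) ...
       __builtin_ms_va_end (ap);
     }
*/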
30957
30958 static void
30959 ix86_init_builtins_va_builtins_abi (void)
30960 {
30961 tree ms_va_ref, sysv_va_ref;
30962 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
30963 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
30964 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
30965 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
30966
30967 if (!TARGET_64BIT)
30968 return;
30969 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
30970 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
30971 ms_va_ref = build_reference_type (ms_va_list_type_node);
30972 sysv_va_ref =
30973 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
30974
30975 fnvoid_va_end_ms =
30976 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30977 fnvoid_va_start_ms =
30978 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
30979 fnvoid_va_end_sysv =
30980 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
30981 fnvoid_va_start_sysv =
30982 build_varargs_function_type_list (void_type_node, sysv_va_ref,
30983 NULL_TREE);
30984 fnvoid_va_copy_ms =
30985 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
30986 NULL_TREE);
30987 fnvoid_va_copy_sysv =
30988 build_function_type_list (void_type_node, sysv_va_ref,
30989 sysv_va_ref, NULL_TREE);
30990
30991 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
30992 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
30993 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
30994 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
30995 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
30996 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
30997 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
30998 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
30999 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
31000 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31001 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
31002 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
31003 }
31004
31005 static void
31006 ix86_init_builtin_types (void)
31007 {
31008 tree float128_type_node, float80_type_node;
31009
31010 /* The __float80 type. */
31011 float80_type_node = long_double_type_node;
31012 if (TYPE_MODE (float80_type_node) != XFmode)
31013 {
31014 /* long double is not the 80-bit extended type here, so build __float80 as a separate REAL_TYPE. */
31015 float80_type_node = make_node (REAL_TYPE);
31016
31017 TYPE_PRECISION (float80_type_node) = 80;
31018 layout_type (float80_type_node);
31019 }
31020 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
31021
31022 /* The __float128 type. */
31023 float128_type_node = make_node (REAL_TYPE);
31024 TYPE_PRECISION (float128_type_node) = 128;
31025 layout_type (float128_type_node);
31026 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
31027
31028 /* This macro is built by i386-builtin-types.awk. */
31029 DEFINE_BUILTIN_PRIMITIVE_TYPES;
31030 }
31031
31032 static void
31033 ix86_init_builtins (void)
31034 {
31035 tree t;
31036
31037 ix86_init_builtin_types ();
31038
31039 /* Builtins to get CPU type and features. */
31040 ix86_init_platform_type_builtins ();
31041
31042 /* TFmode support builtins. */
31043 def_builtin_const (0, "__builtin_infq",
31044 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
31045 def_builtin_const (0, "__builtin_huge_valq",
31046 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
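  /* Illustrative usage of the two builtins just defined (not part of
     this file):

       __float128 inf = __builtin_infq ();
       __float128 huge = __builtin_huge_valq ();
  */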
31047
31048 /* We will expand them to a normal call if SSE isn't available, since
31049 they are used by libgcc. */
31050 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
31051 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
31052 BUILT_IN_MD, "__fabstf2", NULL_TREE);
31053 TREE_READONLY (t) = 1;
31054 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
31055
31056 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
31057 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
31058 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
31059 TREE_READONLY (t) = 1;
31060 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
31061
31062 ix86_init_tm_builtins ();
31063 ix86_init_mmx_sse_builtins ();
31064
31065 if (TARGET_LP64)
31066 ix86_init_builtins_va_builtins_abi ();
31067
31068 #ifdef SUBTARGET_INIT_BUILTINS
31069 SUBTARGET_INIT_BUILTINS;
31070 #endif
31071 }
31072
31073 /* Return the ix86 builtin for CODE. */
31074
31075 static tree
31076 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
31077 {
31078 if (code >= IX86_BUILTIN_MAX)
31079 return error_mark_node;
31080
31081 return ix86_builtins[code];
31082 }
31083
31084 /* Errors in the source file can cause expand_expr to return const0_rtx
31085 where we expect a vector. To avoid crashing, use one of the vector
31086 clear instructions. */
31087 static rtx
31088 safe_vector_operand (rtx x, enum machine_mode mode)
31089 {
31090 if (x == const0_rtx)
31091 x = CONST0_RTX (mode);
31092 return x;
31093 }
31094
31095 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
31096
31097 static rtx
31098 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
31099 {
31100 rtx pat;
31101 tree arg0 = CALL_EXPR_ARG (exp, 0);
31102 tree arg1 = CALL_EXPR_ARG (exp, 1);
31103 rtx op0 = expand_normal (arg0);
31104 rtx op1 = expand_normal (arg1);
31105 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31106 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31107 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
31108
31109 if (VECTOR_MODE_P (mode0))
31110 op0 = safe_vector_operand (op0, mode0);
31111 if (VECTOR_MODE_P (mode1))
31112 op1 = safe_vector_operand (op1, mode1);
31113
31114 if (optimize || !target
31115 || GET_MODE (target) != tmode
31116 || !insn_data[icode].operand[0].predicate (target, tmode))
31117 target = gen_reg_rtx (tmode);
31118
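  /* If the insn wants a TImode operand but the argument was expanded in
     SImode, load it into a V4SI register and use its low TImode part.  */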
31119 if (GET_MODE (op1) == SImode && mode1 == TImode)
31120 {
31121 rtx x = gen_reg_rtx (V4SImode);
31122 emit_insn (gen_sse2_loadd (x, op1));
31123 op1 = gen_lowpart (TImode, x);
31124 }
31125
31126 if (!insn_data[icode].operand[1].predicate (op0, mode0))
31127 op0 = copy_to_mode_reg (mode0, op0);
31128 if (!insn_data[icode].operand[2].predicate (op1, mode1))
31129 op1 = copy_to_mode_reg (mode1, op1);
31130
31131 pat = GEN_FCN (icode) (target, op0, op1);
31132 if (! pat)
31133 return 0;
31134
31135 emit_insn (pat);
31136
31137 return target;
31138 }
31139
31140 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
31141
31142 static rtx
31143 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
31144 enum ix86_builtin_func_type m_type,
31145 enum rtx_code sub_code)
31146 {
31147 rtx pat;
31148 int i;
31149 int nargs;
31150 bool comparison_p = false;
31151 bool tf_p = false;
31152 bool last_arg_constant = false;
31153 int num_memory = 0;
31154 struct {
31155 rtx op;
31156 enum machine_mode mode;
31157 } args[4];
31158
31159 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31160
31161 switch (m_type)
31162 {
31163 case MULTI_ARG_4_DF2_DI_I:
31164 case MULTI_ARG_4_DF2_DI_I1:
31165 case MULTI_ARG_4_SF2_SI_I:
31166 case MULTI_ARG_4_SF2_SI_I1:
31167 nargs = 4;
31168 last_arg_constant = true;
31169 break;
31170
31171 case MULTI_ARG_3_SF:
31172 case MULTI_ARG_3_DF:
31173 case MULTI_ARG_3_SF2:
31174 case MULTI_ARG_3_DF2:
31175 case MULTI_ARG_3_DI:
31176 case MULTI_ARG_3_SI:
31177 case MULTI_ARG_3_SI_DI:
31178 case MULTI_ARG_3_HI:
31179 case MULTI_ARG_3_HI_SI:
31180 case MULTI_ARG_3_QI:
31181 case MULTI_ARG_3_DI2:
31182 case MULTI_ARG_3_SI2:
31183 case MULTI_ARG_3_HI2:
31184 case MULTI_ARG_3_QI2:
31185 nargs = 3;
31186 break;
31187
31188 case MULTI_ARG_2_SF:
31189 case MULTI_ARG_2_DF:
31190 case MULTI_ARG_2_DI:
31191 case MULTI_ARG_2_SI:
31192 case MULTI_ARG_2_HI:
31193 case MULTI_ARG_2_QI:
31194 nargs = 2;
31195 break;
31196
31197 case MULTI_ARG_2_DI_IMM:
31198 case MULTI_ARG_2_SI_IMM:
31199 case MULTI_ARG_2_HI_IMM:
31200 case MULTI_ARG_2_QI_IMM:
31201 nargs = 2;
31202 last_arg_constant = true;
31203 break;
31204
31205 case MULTI_ARG_1_SF:
31206 case MULTI_ARG_1_DF:
31207 case MULTI_ARG_1_SF2:
31208 case MULTI_ARG_1_DF2:
31209 case MULTI_ARG_1_DI:
31210 case MULTI_ARG_1_SI:
31211 case MULTI_ARG_1_HI:
31212 case MULTI_ARG_1_QI:
31213 case MULTI_ARG_1_SI_DI:
31214 case MULTI_ARG_1_HI_DI:
31215 case MULTI_ARG_1_HI_SI:
31216 case MULTI_ARG_1_QI_DI:
31217 case MULTI_ARG_1_QI_SI:
31218 case MULTI_ARG_1_QI_HI:
31219 nargs = 1;
31220 break;
31221
31222 case MULTI_ARG_2_DI_CMP:
31223 case MULTI_ARG_2_SI_CMP:
31224 case MULTI_ARG_2_HI_CMP:
31225 case MULTI_ARG_2_QI_CMP:
31226 nargs = 2;
31227 comparison_p = true;
31228 break;
31229
31230 case MULTI_ARG_2_SF_TF:
31231 case MULTI_ARG_2_DF_TF:
31232 case MULTI_ARG_2_DI_TF:
31233 case MULTI_ARG_2_SI_TF:
31234 case MULTI_ARG_2_HI_TF:
31235 case MULTI_ARG_2_QI_TF:
31236 nargs = 2;
31237 tf_p = true;
31238 break;
31239
31240 default:
31241 gcc_unreachable ();
31242 }
31243
31244 if (optimize || !target
31245 || GET_MODE (target) != tmode
31246 || !insn_data[icode].operand[0].predicate (target, tmode))
31247 target = gen_reg_rtx (tmode);
31248
31249 gcc_assert (nargs <= 4);
31250
31251 for (i = 0; i < nargs; i++)
31252 {
31253 tree arg = CALL_EXPR_ARG (exp, i);
31254 rtx op = expand_normal (arg);
31255 int adjust = (comparison_p) ? 1 : 0;
31256 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
31257
31258 if (last_arg_constant && i == nargs - 1)
31259 {
31260 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
31261 {
31262 enum insn_code new_icode = icode;
31263 switch (icode)
31264 {
31265 case CODE_FOR_xop_vpermil2v2df3:
31266 case CODE_FOR_xop_vpermil2v4sf3:
31267 case CODE_FOR_xop_vpermil2v4df3:
31268 case CODE_FOR_xop_vpermil2v8sf3:
31269 error ("the last argument must be a 2-bit immediate");
31270 return gen_reg_rtx (tmode);
31271 case CODE_FOR_xop_rotlv2di3:
31272 new_icode = CODE_FOR_rotlv2di3;
31273 goto xop_rotl;
31274 case CODE_FOR_xop_rotlv4si3:
31275 new_icode = CODE_FOR_rotlv4si3;
31276 goto xop_rotl;
31277 case CODE_FOR_xop_rotlv8hi3:
31278 new_icode = CODE_FOR_rotlv8hi3;
31279 goto xop_rotl;
31280 case CODE_FOR_xop_rotlv16qi3:
31281 new_icode = CODE_FOR_rotlv16qi3;
31282 xop_rotl:
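		  /* A constant rotate count is reduced to the element bit
		     width via the mask below; a non-constant count switches
		     to the generic rotate pattern, which accepts a register
		     count.  */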
31283 if (CONST_INT_P (op))
31284 {
31285 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
31286 op = GEN_INT (INTVAL (op) & mask);
31287 gcc_checking_assert
31288 (insn_data[icode].operand[i + 1].predicate (op, mode));
31289 }
31290 else
31291 {
31292 gcc_checking_assert
31293 (nargs == 2
31294 && insn_data[new_icode].operand[0].mode == tmode
31295 && insn_data[new_icode].operand[1].mode == tmode
31296 && insn_data[new_icode].operand[2].mode == mode
31297 && insn_data[new_icode].operand[0].predicate
31298 == insn_data[icode].operand[0].predicate
31299 && insn_data[new_icode].operand[1].predicate
31300 == insn_data[icode].operand[1].predicate);
31301 icode = new_icode;
31302 goto non_constant;
31303 }
31304 break;
31305 default:
31306 gcc_unreachable ();
31307 }
31308 }
31309 }
31310 else
31311 {
31312 non_constant:
31313 if (VECTOR_MODE_P (mode))
31314 op = safe_vector_operand (op, mode);
31315
31316 /* If we aren't optimizing, only allow one memory operand to be
31317 generated. */
31318 if (memory_operand (op, mode))
31319 num_memory++;
31320
31321 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
31322
31323 if (optimize
31324 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
31325 || num_memory > 1)
31326 op = force_reg (mode, op);
31327 }
31328
31329 args[i].op = op;
31330 args[i].mode = mode;
31331 }
31332
31333 switch (nargs)
31334 {
31335 case 1:
31336 pat = GEN_FCN (icode) (target, args[0].op);
31337 break;
31338
31339 case 2:
31340 if (tf_p)
31341 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
31342 GEN_INT ((int)sub_code));
31343 else if (! comparison_p)
31344 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31345 else
31346 {
31347 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
31348 args[0].op,
31349 args[1].op);
31350
31351 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
31352 }
31353 break;
31354
31355 case 3:
31356 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31357 break;
31358
31359 case 4:
31360 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
31361 break;
31362
31363 default:
31364 gcc_unreachable ();
31365 }
31366
31367 if (! pat)
31368 return 0;
31369
31370 emit_insn (pat);
31371 return target;
31372 }
31373
31374 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
31375 insns with vec_merge. */
31376
31377 static rtx
31378 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
31379 rtx target)
31380 {
31381 rtx pat;
31382 tree arg0 = CALL_EXPR_ARG (exp, 0);
31383 rtx op1, op0 = expand_normal (arg0);
31384 enum machine_mode tmode = insn_data[icode].operand[0].mode;
31385 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
31386
31387 if (optimize || !target
31388 || GET_MODE (target) != tmode
31389 || !insn_data[icode].operand[0].predicate (target, tmode))
31390 target = gen_reg_rtx (tmode);
31391
31392 if (VECTOR_MODE_P (mode0))
31393 op0 = safe_vector_operand (op0, mode0);
31394
31395 if ((optimize && !register_operand (op0, mode0))
31396 || !insn_data[icode].operand[1].predicate (op0, mode0))
31397 op0 = copy_to_mode_reg (mode0, op0);
31398
31399 op1 = op0;
31400 if (!insn_data[icode].operand[2].predicate (op1, mode0))
31401 op1 = copy_to_mode_reg (mode0, op1);
31402
31403 pat = GEN_FCN (icode) (target, op0, op1);
31404 if (! pat)
31405 return 0;
31406 emit_insn (pat);
31407 return target;
31408 }
31409
31410 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
31411
31412 static rtx
31413 ix86_expand_sse_compare (const struct builtin_description *d,
31414 tree exp, rtx target, bool swap)
31415 {
31416 rtx pat;
31417 tree arg0 = CALL_EXPR_ARG (exp, 0);
31418 tree arg1 = CALL_EXPR_ARG (exp, 1);
31419 rtx op0 = expand_normal (arg0);
31420 rtx op1 = expand_normal (arg1);
31421 rtx op2;
31422 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31423 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31424 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31425 enum rtx_code comparison = d->comparison;
31426
31427 if (VECTOR_MODE_P (mode0))
31428 op0 = safe_vector_operand (op0, mode0);
31429 if (VECTOR_MODE_P (mode1))
31430 op1 = safe_vector_operand (op1, mode1);
31431
31432 /* Swap operands if we have a comparison that isn't available in
31433 hardware, e.g. a GT compare is emitted as LT with the operands swapped. */
31434 if (swap)
31435 {
31436 rtx tmp = gen_reg_rtx (mode1);
31437 emit_move_insn (tmp, op1);
31438 op1 = op0;
31439 op0 = tmp;
31440 }
31441
31442 if (optimize || !target
31443 || GET_MODE (target) != tmode
31444 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31445 target = gen_reg_rtx (tmode);
31446
31447 if ((optimize && !register_operand (op0, mode0))
31448 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
31449 op0 = copy_to_mode_reg (mode0, op0);
31450 if ((optimize && !register_operand (op1, mode1))
31451 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
31452 op1 = copy_to_mode_reg (mode1, op1);
31453
31454 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
31455 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31456 if (! pat)
31457 return 0;
31458 emit_insn (pat);
31459 return target;
31460 }
31461
31462 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
31463
31464 static rtx
31465 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
31466 rtx target)
31467 {
31468 rtx pat;
31469 tree arg0 = CALL_EXPR_ARG (exp, 0);
31470 tree arg1 = CALL_EXPR_ARG (exp, 1);
31471 rtx op0 = expand_normal (arg0);
31472 rtx op1 = expand_normal (arg1);
31473 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31474 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31475 enum rtx_code comparison = d->comparison;
31476
31477 if (VECTOR_MODE_P (mode0))
31478 op0 = safe_vector_operand (op0, mode0);
31479 if (VECTOR_MODE_P (mode1))
31480 op1 = safe_vector_operand (op1, mode1);
31481
31482 /* Swap operands if we have a comparison that isn't available in
31483 hardware. */
31484 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
31485 {
31486 rtx tmp = op1;
31487 op1 = op0;
31488 op0 = tmp;
31489 }
31490
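  /* Materialize the boolean result: zero an SImode pseudo and set its
     low QImode part from the flags comparison emitted below.  */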
31491 target = gen_reg_rtx (SImode);
31492 emit_move_insn (target, const0_rtx);
31493 target = gen_rtx_SUBREG (QImode, target, 0);
31494
31495 if ((optimize && !register_operand (op0, mode0))
31496 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31497 op0 = copy_to_mode_reg (mode0, op0);
31498 if ((optimize && !register_operand (op1, mode1))
31499 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31500 op1 = copy_to_mode_reg (mode1, op1);
31501
31502 pat = GEN_FCN (d->icode) (op0, op1);
31503 if (! pat)
31504 return 0;
31505 emit_insn (pat);
31506 emit_insn (gen_rtx_SET (VOIDmode,
31507 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31508 gen_rtx_fmt_ee (comparison, QImode,
31509 SET_DEST (pat),
31510 const0_rtx)));
31511
31512 return SUBREG_REG (target);
31513 }
31514
31515 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
31516
31517 static rtx
31518 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
31519 rtx target)
31520 {
31521 rtx pat;
31522 tree arg0 = CALL_EXPR_ARG (exp, 0);
31523 rtx op1, op0 = expand_normal (arg0);
31524 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31525 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31526
31527 if (optimize || target == 0
31528 || GET_MODE (target) != tmode
31529 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31530 target = gen_reg_rtx (tmode);
31531
31532 if (VECTOR_MODE_P (mode0))
31533 op0 = safe_vector_operand (op0, mode0);
31534
31535 if ((optimize && !register_operand (op0, mode0))
31536 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31537 op0 = copy_to_mode_reg (mode0, op0);
31538
31539 op1 = GEN_INT (d->comparison);
31540
31541 pat = GEN_FCN (d->icode) (target, op0, op1);
31542 if (! pat)
31543 return 0;
31544 emit_insn (pat);
31545 return target;
31546 }
31547
31548 static rtx
31549 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
31550 tree exp, rtx target)
31551 {
31552 rtx pat;
31553 tree arg0 = CALL_EXPR_ARG (exp, 0);
31554 tree arg1 = CALL_EXPR_ARG (exp, 1);
31555 rtx op0 = expand_normal (arg0);
31556 rtx op1 = expand_normal (arg1);
31557 rtx op2;
31558 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
31559 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
31560 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
31561
31562 if (optimize || target == 0
31563 || GET_MODE (target) != tmode
31564 || !insn_data[d->icode].operand[0].predicate (target, tmode))
31565 target = gen_reg_rtx (tmode);
31566
31567 op0 = safe_vector_operand (op0, mode0);
31568 op1 = safe_vector_operand (op1, mode1);
31569
31570 if ((optimize && !register_operand (op0, mode0))
31571 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31572 op0 = copy_to_mode_reg (mode0, op0);
31573 if ((optimize && !register_operand (op1, mode1))
31574 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31575 op1 = copy_to_mode_reg (mode1, op1);
31576
31577 op2 = GEN_INT (d->comparison);
31578
31579 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
31580 if (! pat)
31581 return 0;
31582 emit_insn (pat);
31583 return target;
31584 }
31585
31586 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
31587
31588 static rtx
31589 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
31590 rtx target)
31591 {
31592 rtx pat;
31593 tree arg0 = CALL_EXPR_ARG (exp, 0);
31594 tree arg1 = CALL_EXPR_ARG (exp, 1);
31595 rtx op0 = expand_normal (arg0);
31596 rtx op1 = expand_normal (arg1);
31597 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
31598 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
31599 enum rtx_code comparison = d->comparison;
31600
31601 if (VECTOR_MODE_P (mode0))
31602 op0 = safe_vector_operand (op0, mode0);
31603 if (VECTOR_MODE_P (mode1))
31604 op1 = safe_vector_operand (op1, mode1);
31605
31606 target = gen_reg_rtx (SImode);
31607 emit_move_insn (target, const0_rtx);
31608 target = gen_rtx_SUBREG (QImode, target, 0);
31609
31610 if ((optimize && !register_operand (op0, mode0))
31611 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
31612 op0 = copy_to_mode_reg (mode0, op0);
31613 if ((optimize && !register_operand (op1, mode1))
31614 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
31615 op1 = copy_to_mode_reg (mode1, op1);
31616
31617 pat = GEN_FCN (d->icode) (op0, op1);
31618 if (! pat)
31619 return 0;
31620 emit_insn (pat);
31621 emit_insn (gen_rtx_SET (VOIDmode,
31622 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31623 gen_rtx_fmt_ee (comparison, QImode,
31624 SET_DEST (pat),
31625 const0_rtx)));
31626
31627 return SUBREG_REG (target);
31628 }
31629
31630 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
31631
31632 static rtx
31633 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
31634 tree exp, rtx target)
31635 {
31636 rtx pat;
31637 tree arg0 = CALL_EXPR_ARG (exp, 0);
31638 tree arg1 = CALL_EXPR_ARG (exp, 1);
31639 tree arg2 = CALL_EXPR_ARG (exp, 2);
31640 tree arg3 = CALL_EXPR_ARG (exp, 3);
31641 tree arg4 = CALL_EXPR_ARG (exp, 4);
31642 rtx scratch0, scratch1;
31643 rtx op0 = expand_normal (arg0);
31644 rtx op1 = expand_normal (arg1);
31645 rtx op2 = expand_normal (arg2);
31646 rtx op3 = expand_normal (arg3);
31647 rtx op4 = expand_normal (arg4);
31648 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
31649
31650 tmode0 = insn_data[d->icode].operand[0].mode;
31651 tmode1 = insn_data[d->icode].operand[1].mode;
31652 modev2 = insn_data[d->icode].operand[2].mode;
31653 modei3 = insn_data[d->icode].operand[3].mode;
31654 modev4 = insn_data[d->icode].operand[4].mode;
31655 modei5 = insn_data[d->icode].operand[5].mode;
31656 modeimm = insn_data[d->icode].operand[6].mode;
31657
31658 if (VECTOR_MODE_P (modev2))
31659 op0 = safe_vector_operand (op0, modev2);
31660 if (VECTOR_MODE_P (modev4))
31661 op2 = safe_vector_operand (op2, modev4);
31662
31663 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31664 op0 = copy_to_mode_reg (modev2, op0);
31665 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
31666 op1 = copy_to_mode_reg (modei3, op1);
31667 if ((optimize && !register_operand (op2, modev4))
31668 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
31669 op2 = copy_to_mode_reg (modev4, op2);
31670 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
31671 op3 = copy_to_mode_reg (modei5, op3);
31672
31673 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
31674 {
31675 error ("the fifth argument must be an 8-bit immediate");
31676 return const0_rtx;
31677 }
31678
31679 if (d->code == IX86_BUILTIN_PCMPESTRI128)
31680 {
31681 if (optimize || !target
31682 || GET_MODE (target) != tmode0
31683 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31684 target = gen_reg_rtx (tmode0);
31685
31686 scratch1 = gen_reg_rtx (tmode1);
31687
31688 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
31689 }
31690 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
31691 {
31692 if (optimize || !target
31693 || GET_MODE (target) != tmode1
31694 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31695 target = gen_reg_rtx (tmode1);
31696
31697 scratch0 = gen_reg_rtx (tmode0);
31698
31699 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
31700 }
31701 else
31702 {
31703 gcc_assert (d->flag);
31704
31705 scratch0 = gen_reg_rtx (tmode0);
31706 scratch1 = gen_reg_rtx (tmode1);
31707
31708 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
31709 }
31710
31711 if (! pat)
31712 return 0;
31713
31714 emit_insn (pat);
31715
31716 if (d->flag)
31717 {
31718 target = gen_reg_rtx (SImode);
31719 emit_move_insn (target, const0_rtx);
31720 target = gen_rtx_SUBREG (QImode, target, 0);
31721
31722 emit_insn
31723 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31724 gen_rtx_fmt_ee (EQ, QImode,
31725 gen_rtx_REG ((enum machine_mode) d->flag,
31726 FLAGS_REG),
31727 const0_rtx)));
31728 return SUBREG_REG (target);
31729 }
31730 else
31731 return target;
31732 }
31733
31734
31735 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
31736
31737 static rtx
31738 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
31739 tree exp, rtx target)
31740 {
31741 rtx pat;
31742 tree arg0 = CALL_EXPR_ARG (exp, 0);
31743 tree arg1 = CALL_EXPR_ARG (exp, 1);
31744 tree arg2 = CALL_EXPR_ARG (exp, 2);
31745 rtx scratch0, scratch1;
31746 rtx op0 = expand_normal (arg0);
31747 rtx op1 = expand_normal (arg1);
31748 rtx op2 = expand_normal (arg2);
31749 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
31750
31751 tmode0 = insn_data[d->icode].operand[0].mode;
31752 tmode1 = insn_data[d->icode].operand[1].mode;
31753 modev2 = insn_data[d->icode].operand[2].mode;
31754 modev3 = insn_data[d->icode].operand[3].mode;
31755 modeimm = insn_data[d->icode].operand[4].mode;
31756
31757 if (VECTOR_MODE_P (modev2))
31758 op0 = safe_vector_operand (op0, modev2);
31759 if (VECTOR_MODE_P (modev3))
31760 op1 = safe_vector_operand (op1, modev3);
31761
31762 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
31763 op0 = copy_to_mode_reg (modev2, op0);
31764 if ((optimize && !register_operand (op1, modev3))
31765 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
31766 op1 = copy_to_mode_reg (modev3, op1);
31767
31768 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
31769 {
31770 error ("the third argument must be an 8-bit immediate");
31771 return const0_rtx;
31772 }
31773
31774 if (d->code == IX86_BUILTIN_PCMPISTRI128)
31775 {
31776 if (optimize || !target
31777 || GET_MODE (target) != tmode0
31778 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
31779 target = gen_reg_rtx (tmode0);
31780
31781 scratch1 = gen_reg_rtx (tmode1);
31782
31783 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
31784 }
31785 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
31786 {
31787 if (optimize || !target
31788 || GET_MODE (target) != tmode1
31789 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
31790 target = gen_reg_rtx (tmode1);
31791
31792 scratch0 = gen_reg_rtx (tmode0);
31793
31794 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
31795 }
31796 else
31797 {
31798 gcc_assert (d->flag);
31799
31800 scratch0 = gen_reg_rtx (tmode0);
31801 scratch1 = gen_reg_rtx (tmode1);
31802
31803 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
31804 }
31805
31806 if (! pat)
31807 return 0;
31808
31809 emit_insn (pat);
31810
31811 if (d->flag)
31812 {
31813 target = gen_reg_rtx (SImode);
31814 emit_move_insn (target, const0_rtx);
31815 target = gen_rtx_SUBREG (QImode, target, 0);
31816
31817 emit_insn
31818 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
31819 gen_rtx_fmt_ee (EQ, QImode,
31820 gen_rtx_REG ((enum machine_mode) d->flag,
31821 FLAGS_REG),
31822 const0_rtx)));
31823 return SUBREG_REG (target);
31824 }
31825 else
31826 return target;
31827 }
31828
31829 /* Subroutine of ix86_expand_builtin to take care of insns with
31830 a variable number of operands. */
31831
31832 static rtx
31833 ix86_expand_args_builtin (const struct builtin_description *d,
31834 tree exp, rtx target)
31835 {
31836 rtx pat, real_target;
31837 unsigned int i, nargs;
31838 unsigned int nargs_constant = 0;
31839 int num_memory = 0;
31840 struct
31841 {
31842 rtx op;
31843 enum machine_mode mode;
31844 } args[4];
31845 bool last_arg_count = false;
31846 enum insn_code icode = d->icode;
31847 const struct insn_data_d *insn_p = &insn_data[icode];
31848 enum machine_mode tmode = insn_p->operand[0].mode;
31849 enum machine_mode rmode = VOIDmode;
31850 bool swap = false;
31851 enum rtx_code comparison = d->comparison;
31852
31853 switch ((enum ix86_builtin_func_type) d->flag)
31854 {
31855 case V2DF_FTYPE_V2DF_ROUND:
31856 case V4DF_FTYPE_V4DF_ROUND:
31857 case V4SF_FTYPE_V4SF_ROUND:
31858 case V8SF_FTYPE_V8SF_ROUND:
31859 case V4SI_FTYPE_V4SF_ROUND:
31860 case V8SI_FTYPE_V8SF_ROUND:
31861 return ix86_expand_sse_round (d, exp, target);
31862 case V4SI_FTYPE_V2DF_V2DF_ROUND:
31863 case V8SI_FTYPE_V4DF_V4DF_ROUND:
31864 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
31865 case INT_FTYPE_V8SF_V8SF_PTEST:
31866 case INT_FTYPE_V4DI_V4DI_PTEST:
31867 case INT_FTYPE_V4DF_V4DF_PTEST:
31868 case INT_FTYPE_V4SF_V4SF_PTEST:
31869 case INT_FTYPE_V2DI_V2DI_PTEST:
31870 case INT_FTYPE_V2DF_V2DF_PTEST:
31871 return ix86_expand_sse_ptest (d, exp, target);
31872 case FLOAT128_FTYPE_FLOAT128:
31873 case FLOAT_FTYPE_FLOAT:
31874 case INT_FTYPE_INT:
31875 case UINT64_FTYPE_INT:
31876 case UINT16_FTYPE_UINT16:
31877 case INT64_FTYPE_INT64:
31878 case INT64_FTYPE_V4SF:
31879 case INT64_FTYPE_V2DF:
31880 case INT_FTYPE_V16QI:
31881 case INT_FTYPE_V8QI:
31882 case INT_FTYPE_V8SF:
31883 case INT_FTYPE_V4DF:
31884 case INT_FTYPE_V4SF:
31885 case INT_FTYPE_V2DF:
31886 case INT_FTYPE_V32QI:
31887 case V16QI_FTYPE_V16QI:
31888 case V8SI_FTYPE_V8SF:
31889 case V8SI_FTYPE_V4SI:
31890 case V8HI_FTYPE_V8HI:
31891 case V8HI_FTYPE_V16QI:
31892 case V8QI_FTYPE_V8QI:
31893 case V8SF_FTYPE_V8SF:
31894 case V8SF_FTYPE_V8SI:
31895 case V8SF_FTYPE_V4SF:
31896 case V8SF_FTYPE_V8HI:
31897 case V4SI_FTYPE_V4SI:
31898 case V4SI_FTYPE_V16QI:
31899 case V4SI_FTYPE_V4SF:
31900 case V4SI_FTYPE_V8SI:
31901 case V4SI_FTYPE_V8HI:
31902 case V4SI_FTYPE_V4DF:
31903 case V4SI_FTYPE_V2DF:
31904 case V4HI_FTYPE_V4HI:
31905 case V4DF_FTYPE_V4DF:
31906 case V4DF_FTYPE_V4SI:
31907 case V4DF_FTYPE_V4SF:
31908 case V4DF_FTYPE_V2DF:
31909 case V4SF_FTYPE_V4SF:
31910 case V4SF_FTYPE_V4SI:
31911 case V4SF_FTYPE_V8SF:
31912 case V4SF_FTYPE_V4DF:
31913 case V4SF_FTYPE_V8HI:
31914 case V4SF_FTYPE_V2DF:
31915 case V2DI_FTYPE_V2DI:
31916 case V2DI_FTYPE_V16QI:
31917 case V2DI_FTYPE_V8HI:
31918 case V2DI_FTYPE_V4SI:
31919 case V2DF_FTYPE_V2DF:
31920 case V2DF_FTYPE_V4SI:
31921 case V2DF_FTYPE_V4DF:
31922 case V2DF_FTYPE_V4SF:
31923 case V2DF_FTYPE_V2SI:
31924 case V2SI_FTYPE_V2SI:
31925 case V2SI_FTYPE_V4SF:
31926 case V2SI_FTYPE_V2SF:
31927 case V2SI_FTYPE_V2DF:
31928 case V2SF_FTYPE_V2SF:
31929 case V2SF_FTYPE_V2SI:
31930 case V32QI_FTYPE_V32QI:
31931 case V32QI_FTYPE_V16QI:
31932 case V16HI_FTYPE_V16HI:
31933 case V16HI_FTYPE_V8HI:
31934 case V8SI_FTYPE_V8SI:
31935 case V16HI_FTYPE_V16QI:
31936 case V8SI_FTYPE_V16QI:
31937 case V4DI_FTYPE_V16QI:
31938 case V8SI_FTYPE_V8HI:
31939 case V4DI_FTYPE_V8HI:
31940 case V4DI_FTYPE_V4SI:
31941 case V4DI_FTYPE_V2DI:
31942 nargs = 1;
31943 break;
31944 case V4SF_FTYPE_V4SF_VEC_MERGE:
31945 case V2DF_FTYPE_V2DF_VEC_MERGE:
31946 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
31947 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
31948 case V16QI_FTYPE_V16QI_V16QI:
31949 case V16QI_FTYPE_V8HI_V8HI:
31950 case V8QI_FTYPE_V8QI_V8QI:
31951 case V8QI_FTYPE_V4HI_V4HI:
31952 case V8HI_FTYPE_V8HI_V8HI:
31953 case V8HI_FTYPE_V16QI_V16QI:
31954 case V8HI_FTYPE_V4SI_V4SI:
31955 case V8SF_FTYPE_V8SF_V8SF:
31956 case V8SF_FTYPE_V8SF_V8SI:
31957 case V4SI_FTYPE_V4SI_V4SI:
31958 case V4SI_FTYPE_V8HI_V8HI:
31959 case V4SI_FTYPE_V4SF_V4SF:
31960 case V4SI_FTYPE_V2DF_V2DF:
31961 case V4HI_FTYPE_V4HI_V4HI:
31962 case V4HI_FTYPE_V8QI_V8QI:
31963 case V4HI_FTYPE_V2SI_V2SI:
31964 case V4DF_FTYPE_V4DF_V4DF:
31965 case V4DF_FTYPE_V4DF_V4DI:
31966 case V4SF_FTYPE_V4SF_V4SF:
31967 case V4SF_FTYPE_V4SF_V4SI:
31968 case V4SF_FTYPE_V4SF_V2SI:
31969 case V4SF_FTYPE_V4SF_V2DF:
31970 case V4SF_FTYPE_V4SF_DI:
31971 case V4SF_FTYPE_V4SF_SI:
31972 case V2DI_FTYPE_V2DI_V2DI:
31973 case V2DI_FTYPE_V16QI_V16QI:
31974 case V2DI_FTYPE_V4SI_V4SI:
31975 case V2UDI_FTYPE_V4USI_V4USI:
31976 case V2DI_FTYPE_V2DI_V16QI:
31977 case V2DI_FTYPE_V2DF_V2DF:
31978 case V2SI_FTYPE_V2SI_V2SI:
31979 case V2SI_FTYPE_V4HI_V4HI:
31980 case V2SI_FTYPE_V2SF_V2SF:
31981 case V2DF_FTYPE_V2DF_V2DF:
31982 case V2DF_FTYPE_V2DF_V4SF:
31983 case V2DF_FTYPE_V2DF_V2DI:
31984 case V2DF_FTYPE_V2DF_DI:
31985 case V2DF_FTYPE_V2DF_SI:
31986 case V2SF_FTYPE_V2SF_V2SF:
31987 case V1DI_FTYPE_V1DI_V1DI:
31988 case V1DI_FTYPE_V8QI_V8QI:
31989 case V1DI_FTYPE_V2SI_V2SI:
31990 case V32QI_FTYPE_V16HI_V16HI:
31991 case V16HI_FTYPE_V8SI_V8SI:
31992 case V32QI_FTYPE_V32QI_V32QI:
31993 case V16HI_FTYPE_V32QI_V32QI:
31994 case V16HI_FTYPE_V16HI_V16HI:
31995 case V8SI_FTYPE_V4DF_V4DF:
31996 case V8SI_FTYPE_V8SI_V8SI:
31997 case V8SI_FTYPE_V16HI_V16HI:
31998 case V4DI_FTYPE_V4DI_V4DI:
31999 case V4DI_FTYPE_V8SI_V8SI:
32000 case V4UDI_FTYPE_V8USI_V8USI:
32001 if (comparison == UNKNOWN)
32002 return ix86_expand_binop_builtin (icode, exp, target);
32003 nargs = 2;
32004 break;
32005 case V4SF_FTYPE_V4SF_V4SF_SWAP:
32006 case V2DF_FTYPE_V2DF_V2DF_SWAP:
32007 gcc_assert (comparison != UNKNOWN);
32008 nargs = 2;
32009 swap = true;
32010 break;
32011 case V16HI_FTYPE_V16HI_V8HI_COUNT:
32012 case V16HI_FTYPE_V16HI_SI_COUNT:
32013 case V8SI_FTYPE_V8SI_V4SI_COUNT:
32014 case V8SI_FTYPE_V8SI_SI_COUNT:
32015 case V4DI_FTYPE_V4DI_V2DI_COUNT:
32016 case V4DI_FTYPE_V4DI_INT_COUNT:
32017 case V8HI_FTYPE_V8HI_V8HI_COUNT:
32018 case V8HI_FTYPE_V8HI_SI_COUNT:
32019 case V4SI_FTYPE_V4SI_V4SI_COUNT:
32020 case V4SI_FTYPE_V4SI_SI_COUNT:
32021 case V4HI_FTYPE_V4HI_V4HI_COUNT:
32022 case V4HI_FTYPE_V4HI_SI_COUNT:
32023 case V2DI_FTYPE_V2DI_V2DI_COUNT:
32024 case V2DI_FTYPE_V2DI_SI_COUNT:
32025 case V2SI_FTYPE_V2SI_V2SI_COUNT:
32026 case V2SI_FTYPE_V2SI_SI_COUNT:
32027 case V1DI_FTYPE_V1DI_V1DI_COUNT:
32028 case V1DI_FTYPE_V1DI_SI_COUNT:
32029 nargs = 2;
32030 last_arg_count = true;
32031 break;
32032 case UINT64_FTYPE_UINT64_UINT64:
32033 case UINT_FTYPE_UINT_UINT:
32034 case UINT_FTYPE_UINT_USHORT:
32035 case UINT_FTYPE_UINT_UCHAR:
32036 case UINT16_FTYPE_UINT16_INT:
32037 case UINT8_FTYPE_UINT8_INT:
32038 nargs = 2;
32039 break;
32040 case V2DI_FTYPE_V2DI_INT_CONVERT:
32041 nargs = 2;
32042 rmode = V1TImode;
32043 nargs_constant = 1;
32044 break;
32045 case V4DI_FTYPE_V4DI_INT_CONVERT:
32046 nargs = 2;
32047 rmode = V2TImode;
32048 nargs_constant = 1;
32049 break;
32050 case V8HI_FTYPE_V8HI_INT:
32051 case V8HI_FTYPE_V8SF_INT:
32052 case V8HI_FTYPE_V4SF_INT:
32053 case V8SF_FTYPE_V8SF_INT:
32054 case V4SI_FTYPE_V4SI_INT:
32055 case V4SI_FTYPE_V8SI_INT:
32056 case V4HI_FTYPE_V4HI_INT:
32057 case V4DF_FTYPE_V4DF_INT:
32058 case V4SF_FTYPE_V4SF_INT:
32059 case V4SF_FTYPE_V8SF_INT:
32060 case V2DI_FTYPE_V2DI_INT:
32061 case V2DF_FTYPE_V2DF_INT:
32062 case V2DF_FTYPE_V4DF_INT:
32063 case V16HI_FTYPE_V16HI_INT:
32064 case V8SI_FTYPE_V8SI_INT:
32065 case V4DI_FTYPE_V4DI_INT:
32066 case V2DI_FTYPE_V4DI_INT:
32067 nargs = 2;
32068 nargs_constant = 1;
32069 break;
32070 case V16QI_FTYPE_V16QI_V16QI_V16QI:
32071 case V8SF_FTYPE_V8SF_V8SF_V8SF:
32072 case V4DF_FTYPE_V4DF_V4DF_V4DF:
32073 case V4SF_FTYPE_V4SF_V4SF_V4SF:
32074 case V2DF_FTYPE_V2DF_V2DF_V2DF:
32075 case V32QI_FTYPE_V32QI_V32QI_V32QI:
32076 nargs = 3;
32077 break;
32078 case V32QI_FTYPE_V32QI_V32QI_INT:
32079 case V16HI_FTYPE_V16HI_V16HI_INT:
32080 case V16QI_FTYPE_V16QI_V16QI_INT:
32081 case V4DI_FTYPE_V4DI_V4DI_INT:
32082 case V8HI_FTYPE_V8HI_V8HI_INT:
32083 case V8SI_FTYPE_V8SI_V8SI_INT:
32084 case V8SI_FTYPE_V8SI_V4SI_INT:
32085 case V8SF_FTYPE_V8SF_V8SF_INT:
32086 case V8SF_FTYPE_V8SF_V4SF_INT:
32087 case V4SI_FTYPE_V4SI_V4SI_INT:
32088 case V4DF_FTYPE_V4DF_V4DF_INT:
32089 case V4DF_FTYPE_V4DF_V2DF_INT:
32090 case V4SF_FTYPE_V4SF_V4SF_INT:
32091 case V2DI_FTYPE_V2DI_V2DI_INT:
32092 case V4DI_FTYPE_V4DI_V2DI_INT:
32093 case V2DF_FTYPE_V2DF_V2DF_INT:
32094 nargs = 3;
32095 nargs_constant = 1;
32096 break;
32097 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
32098 nargs = 3;
32099 rmode = V4DImode;
32100 nargs_constant = 1;
32101 break;
32102 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
32103 nargs = 3;
32104 rmode = V2DImode;
32105 nargs_constant = 1;
32106 break;
32107 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
32108 nargs = 3;
32109 rmode = DImode;
32110 nargs_constant = 1;
32111 break;
32112 case V2DI_FTYPE_V2DI_UINT_UINT:
32113 nargs = 3;
32114 nargs_constant = 2;
32115 break;
32116 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
32117 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
32118 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
32119 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
32120 nargs = 4;
32121 nargs_constant = 1;
32122 break;
32123 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
32124 nargs = 4;
32125 nargs_constant = 2;
32126 break;
32127 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
32128 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
32129 nargs = 4;
32130 break;
32131 default:
32132 gcc_unreachable ();
32133 }
32134
32135 gcc_assert (nargs <= ARRAY_SIZE (args));
32136
32137 if (comparison != UNKNOWN)
32138 {
32139 gcc_assert (nargs == 2);
32140 return ix86_expand_sse_compare (d, exp, target, swap);
32141 }
32142
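  /* When RMODE differs from the insn's destination mode TMODE (the
     ..._CONVERT cases above), emit the insn into a fresh TMODE register
     and return that value viewed in RMODE through a subreg.  */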
32143 if (rmode == VOIDmode || rmode == tmode)
32144 {
32145 if (optimize
32146 || target == 0
32147 || GET_MODE (target) != tmode
32148 || !insn_p->operand[0].predicate (target, tmode))
32149 target = gen_reg_rtx (tmode);
32150 real_target = target;
32151 }
32152 else
32153 {
32154 real_target = gen_reg_rtx (tmode);
32155 target = simplify_gen_subreg (rmode, real_target, tmode, 0);
32156 }
32157
32158 for (i = 0; i < nargs; i++)
32159 {
32160 tree arg = CALL_EXPR_ARG (exp, i);
32161 rtx op = expand_normal (arg);
32162 enum machine_mode mode = insn_p->operand[i + 1].mode;
32163 bool match = insn_p->operand[i + 1].predicate (op, mode);
32164
32165 if (last_arg_count && (i + 1) == nargs)
32166 {
32167 /* SIMD shift insns take either an 8-bit immediate or a
32168 register as the count, but the builtin functions take an int.
32169 If the count doesn't match the predicate, put it in a register. */
32170 if (!match)
32171 {
32172 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
32173 if (!insn_p->operand[i + 1].predicate (op, mode))
32174 op = copy_to_reg (op);
32175 }
32176 }
32177 else if ((nargs - i) <= nargs_constant)
32178 {
32179 if (!match)
32180 switch (icode)
32181 {
32182 case CODE_FOR_avx2_inserti128:
32183 case CODE_FOR_avx2_extracti128:
32184 error ("the last argument must be an 1-bit immediate");
32185 return const0_rtx;
32186
32187 case CODE_FOR_sse4_1_roundsd:
32188 case CODE_FOR_sse4_1_roundss:
32189
32190 case CODE_FOR_sse4_1_roundpd:
32191 case CODE_FOR_sse4_1_roundps:
32192 case CODE_FOR_avx_roundpd256:
32193 case CODE_FOR_avx_roundps256:
32194
32195 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
32196 case CODE_FOR_sse4_1_roundps_sfix:
32197 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
32198 case CODE_FOR_avx_roundps_sfix256:
32199
32200 case CODE_FOR_sse4_1_blendps:
32201 case CODE_FOR_avx_blendpd256:
32202 case CODE_FOR_avx_vpermilv4df:
32203 error ("the last argument must be a 4-bit immediate");
32204 return const0_rtx;
32205
32206 case CODE_FOR_sse4_1_blendpd:
32207 case CODE_FOR_avx_vpermilv2df:
32208 case CODE_FOR_xop_vpermil2v2df3:
32209 case CODE_FOR_xop_vpermil2v4sf3:
32210 case CODE_FOR_xop_vpermil2v4df3:
32211 case CODE_FOR_xop_vpermil2v8sf3:
32212 error ("the last argument must be a 2-bit immediate");
32213 return const0_rtx;
32214
32215 case CODE_FOR_avx_vextractf128v4df:
32216 case CODE_FOR_avx_vextractf128v8sf:
32217 case CODE_FOR_avx_vextractf128v8si:
32218 case CODE_FOR_avx_vinsertf128v4df:
32219 case CODE_FOR_avx_vinsertf128v8sf:
32220 case CODE_FOR_avx_vinsertf128v8si:
32221 error ("the last argument must be a 1-bit immediate");
32222 return const0_rtx;
32223
32224 case CODE_FOR_avx_vmcmpv2df3:
32225 case CODE_FOR_avx_vmcmpv4sf3:
32226 case CODE_FOR_avx_cmpv2df3:
32227 case CODE_FOR_avx_cmpv4sf3:
32228 case CODE_FOR_avx_cmpv4df3:
32229 case CODE_FOR_avx_cmpv8sf3:
32230 error ("the last argument must be a 5-bit immediate");
32231 return const0_rtx;
32232
32233 default:
32234 switch (nargs_constant)
32235 {
32236 case 2:
32237 if ((nargs - i) == nargs_constant)
32238 {
32239 error ("the next to last argument must be an 8-bit immediate");
32240 break;
32241 }
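		  /* FALLTHRU */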
32242 case 1:
32243 error ("the last argument must be an 8-bit immediate");
32244 break;
32245 default:
32246 gcc_unreachable ();
32247 }
32248 return const0_rtx;
32249 }
32250 }
32251 else
32252 {
32253 if (VECTOR_MODE_P (mode))
32254 op = safe_vector_operand (op, mode);
32255
32256 /* If we aren't optimizing, only allow one memory operand to
32257 be generated. */
32258 if (memory_operand (op, mode))
32259 num_memory++;
32260
32261 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
32262 {
32263 if (optimize || !match || num_memory > 1)
32264 op = copy_to_mode_reg (mode, op);
32265 }
32266 else
32267 {
32268 op = copy_to_reg (op);
32269 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
32270 }
32271 }
32272
32273 args[i].op = op;
32274 args[i].mode = mode;
32275 }
32276
32277 switch (nargs)
32278 {
32279 case 1:
32280 pat = GEN_FCN (icode) (real_target, args[0].op);
32281 break;
32282 case 2:
32283 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
32284 break;
32285 case 3:
32286 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32287 args[2].op);
32288 break;
32289 case 4:
32290 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
32291 args[2].op, args[3].op);
32292 break;
32293 default:
32294 gcc_unreachable ();
32295 }
32296
32297 if (! pat)
32298 return 0;
32299
32300 emit_insn (pat);
32301 return target;
32302 }
32303
32304 /* Subroutine of ix86_expand_builtin to take care of special insns
32305 with a variable number of operands. */
32306
32307 static rtx
32308 ix86_expand_special_args_builtin (const struct builtin_description *d,
32309 tree exp, rtx target)
32310 {
32311 tree arg;
32312 rtx pat, op;
32313 unsigned int i, nargs, arg_adjust, memory;
32314 struct
32315 {
32316 rtx op;
32317 enum machine_mode mode;
32318 } args[3];
32319 enum insn_code icode = d->icode;
32320 bool last_arg_constant = false;
32321 const struct insn_data_d *insn_p = &insn_data[icode];
32322 enum machine_mode tmode = insn_p->operand[0].mode;
32323 enum { load, store } klass;
32324
32325 switch ((enum ix86_builtin_func_type) d->flag)
32326 {
32327 case VOID_FTYPE_VOID:
32328 emit_insn (GEN_FCN (icode) (target));
32329 return 0;
32330 case VOID_FTYPE_UINT64:
32331 case VOID_FTYPE_UNSIGNED:
32332 nargs = 0;
32333 klass = store;
32334 memory = 0;
32335 break;
32336
32337 case INT_FTYPE_VOID:
32338 case UINT64_FTYPE_VOID:
32339 case UNSIGNED_FTYPE_VOID:
32340 nargs = 0;
32341 klass = load;
32342 memory = 0;
32343 break;
32344 case UINT64_FTYPE_PUNSIGNED:
32345 case V2DI_FTYPE_PV2DI:
32346 case V4DI_FTYPE_PV4DI:
32347 case V32QI_FTYPE_PCCHAR:
32348 case V16QI_FTYPE_PCCHAR:
32349 case V8SF_FTYPE_PCV4SF:
32350 case V8SF_FTYPE_PCFLOAT:
32351 case V4SF_FTYPE_PCFLOAT:
32352 case V4DF_FTYPE_PCV2DF:
32353 case V4DF_FTYPE_PCDOUBLE:
32354 case V2DF_FTYPE_PCDOUBLE:
32355 case VOID_FTYPE_PVOID:
32356 nargs = 1;
32357 klass = load;
32358 memory = 0;
32359 break;
32360 case VOID_FTYPE_PV2SF_V4SF:
32361 case VOID_FTYPE_PV4DI_V4DI:
32362 case VOID_FTYPE_PV2DI_V2DI:
32363 case VOID_FTYPE_PCHAR_V32QI:
32364 case VOID_FTYPE_PCHAR_V16QI:
32365 case VOID_FTYPE_PFLOAT_V8SF:
32366 case VOID_FTYPE_PFLOAT_V4SF:
32367 case VOID_FTYPE_PDOUBLE_V4DF:
32368 case VOID_FTYPE_PDOUBLE_V2DF:
32369 case VOID_FTYPE_PLONGLONG_LONGLONG:
32370 case VOID_FTYPE_PULONGLONG_ULONGLONG:
32371 case VOID_FTYPE_PINT_INT:
32372 nargs = 1;
32373 klass = store;
32374 /* Reserve memory operand for target. */
32375 memory = ARRAY_SIZE (args);
32376 break;
32377 case V4SF_FTYPE_V4SF_PCV2SF:
32378 case V2DF_FTYPE_V2DF_PCDOUBLE:
32379 nargs = 2;
32380 klass = load;
32381 memory = 1;
32382 break;
32383 case V8SF_FTYPE_PCV8SF_V8SI:
32384 case V4DF_FTYPE_PCV4DF_V4DI:
32385 case V4SF_FTYPE_PCV4SF_V4SI:
32386 case V2DF_FTYPE_PCV2DF_V2DI:
32387 case V8SI_FTYPE_PCV8SI_V8SI:
32388 case V4DI_FTYPE_PCV4DI_V4DI:
32389 case V4SI_FTYPE_PCV4SI_V4SI:
32390 case V2DI_FTYPE_PCV2DI_V2DI:
32391 nargs = 2;
32392 klass = load;
32393 memory = 0;
32394 break;
32395 case VOID_FTYPE_PV8SF_V8SI_V8SF:
32396 case VOID_FTYPE_PV4DF_V4DI_V4DF:
32397 case VOID_FTYPE_PV4SF_V4SI_V4SF:
32398 case VOID_FTYPE_PV2DF_V2DI_V2DF:
32399 case VOID_FTYPE_PV8SI_V8SI_V8SI:
32400 case VOID_FTYPE_PV4DI_V4DI_V4DI:
32401 case VOID_FTYPE_PV4SI_V4SI_V4SI:
32402 case VOID_FTYPE_PV2DI_V2DI_V2DI:
32403 nargs = 2;
32404 klass = store;
32405 /* Reserve memory operand for target. */
32406 memory = ARRAY_SIZE (args);
32407 break;
32408 case VOID_FTYPE_UINT_UINT_UINT:
32409 case VOID_FTYPE_UINT64_UINT_UINT:
32410 case UCHAR_FTYPE_UINT_UINT_UINT:
32411 case UCHAR_FTYPE_UINT64_UINT_UINT:
32412 nargs = 3;
32413 klass = load;
32414 memory = ARRAY_SIZE (args);
32415 last_arg_constant = true;
32416 break;
32417 default:
32418 gcc_unreachable ();
32419 }
32420
32421 gcc_assert (nargs <= ARRAY_SIZE (args));
32422
32423 if (klass == store)
32424 {
32425 arg = CALL_EXPR_ARG (exp, 0);
32426 op = expand_normal (arg);
32427 gcc_assert (target == 0);
32428 if (memory)
32429 {
32430 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32431 target = gen_rtx_MEM (tmode, op);
32432 }
32433 else
32434 target = force_reg (tmode, op);
32435 arg_adjust = 1;
32436 }
32437 else
32438 {
32439 arg_adjust = 0;
32440 if (optimize
32441 || target == 0
32442 || !register_operand (target, tmode)
32443 || GET_MODE (target) != tmode)
32444 target = gen_reg_rtx (tmode);
32445 }
32446
32447 for (i = 0; i < nargs; i++)
32448 {
32449 enum machine_mode mode = insn_p->operand[i + 1].mode;
32450 bool match;
32451
32452 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
32453 op = expand_normal (arg);
32454 match = insn_p->operand[i + 1].predicate (op, mode);
32455
32456 if (last_arg_constant && (i + 1) == nargs)
32457 {
32458 if (!match)
32459 {
32460 if (icode == CODE_FOR_lwp_lwpvalsi3
32461 || icode == CODE_FOR_lwp_lwpinssi3
32462 || icode == CODE_FOR_lwp_lwpvaldi3
32463 || icode == CODE_FOR_lwp_lwpinsdi3)
32464 error ("the last argument must be a 32-bit immediate");
32465 else
32466 error ("the last argument must be an 8-bit immediate");
32467 return const0_rtx;
32468 }
32469 }
32470 else
32471 {
32472 if (i == memory)
32473 {
32474 /* This must be the memory operand. */
32475 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
32476 op = gen_rtx_MEM (mode, op);
32477 gcc_assert (GET_MODE (op) == mode
32478 || GET_MODE (op) == VOIDmode);
32479 }
32480 else
32481 {
32482 /* This must be a register. */
32483 if (VECTOR_MODE_P (mode))
32484 op = safe_vector_operand (op, mode);
32485
32486 gcc_assert (GET_MODE (op) == mode
32487 || GET_MODE (op) == VOIDmode);
32488 op = copy_to_mode_reg (mode, op);
32489 }
32490 }
32491
32492 args[i].op = op;
32493 args[i].mode = mode;
32494 }
32495
32496 switch (nargs)
32497 {
32498 case 0:
32499 pat = GEN_FCN (icode) (target);
32500 break;
32501 case 1:
32502 pat = GEN_FCN (icode) (target, args[0].op);
32503 break;
32504 case 2:
32505 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32506 break;
32507 case 3:
32508 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32509 break;
32510 default:
32511 gcc_unreachable ();
32512 }
32513
32514 if (! pat)
32515 return 0;
32516 emit_insn (pat);
32517 return klass == store ? 0 : target;
32518 }
32519
32520 /* Return the integer constant in ARG. Constrain it to be in the range
32521 of the subparts of VEC_TYPE; issue an error if not. */
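/* For example, for a 4-element vector type such as __m128 the accepted
   selectors are 0 .. 3.  */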
32522
32523 static int
32524 get_element_number (tree vec_type, tree arg)
32525 {
32526 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
32527
32528 if (!host_integerp (arg, 1)
32529 || (elt = tree_low_cst (arg, 1), elt > max))
32530 {
32531 error ("selector must be an integer constant in the range 0..%wi", max);
32532 return 0;
32533 }
32534
32535 return elt;
32536 }
32537
32538 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32539 ix86_expand_vector_init. We DO have language-level syntax for this, in
32540 the form of (type){ init-list }. Except that since we can't place emms
32541 instructions from inside the compiler, we can't allow the use of MMX
32542 registers unless the user explicitly asks for it. So we do *not* define
32543 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
32544 we have builtins invoked by mmintrin.h that give us license to emit
32545 these sorts of instructions. */
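/* Illustrative example (assumed builtin spelling; the real callers are
   the mmintrin.h wrappers): a call such as
   __builtin_ia32_vec_init_v2si (1, 2) reaches this function and is
   expanded through ix86_expand_vector_init.  */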
32546
32547 static rtx
32548 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
32549 {
32550 enum machine_mode tmode = TYPE_MODE (type);
32551 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
32552 int i, n_elt = GET_MODE_NUNITS (tmode);
32553 rtvec v = rtvec_alloc (n_elt);
32554
32555 gcc_assert (VECTOR_MODE_P (tmode));
32556 gcc_assert (call_expr_nargs (exp) == n_elt);
32557
32558 for (i = 0; i < n_elt; ++i)
32559 {
32560 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
32561 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
32562 }
32563
32564 if (!target || !register_operand (target, tmode))
32565 target = gen_reg_rtx (tmode);
32566
32567 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
32568 return target;
32569 }
32570
32571 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32572 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
32573 had a language-level syntax for referencing vector elements. */
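/* For instance, _mm_extract_epi16 in emmintrin.h is implemented in terms
   of __builtin_ia32_vec_ext_v8hi and ends up here. */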
32574
32575 static rtx
32576 ix86_expand_vec_ext_builtin (tree exp, rtx target)
32577 {
32578 enum machine_mode tmode, mode0;
32579 tree arg0, arg1;
32580 int elt;
32581 rtx op0;
32582
32583 arg0 = CALL_EXPR_ARG (exp, 0);
32584 arg1 = CALL_EXPR_ARG (exp, 1);
32585
32586 op0 = expand_normal (arg0);
32587 elt = get_element_number (TREE_TYPE (arg0), arg1);
32588
32589 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32590 mode0 = TYPE_MODE (TREE_TYPE (arg0));
32591 gcc_assert (VECTOR_MODE_P (mode0));
32592
32593 op0 = force_reg (mode0, op0);
32594
32595 if (optimize || !target || !register_operand (target, tmode))
32596 target = gen_reg_rtx (tmode);
32597
32598 ix86_expand_vector_extract (true, target, op0, elt);
32599
32600 return target;
32601 }
32602
32603 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
32604 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
32605 a language-level syntax for referencing vector elements. */
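/* For instance, _mm_insert_epi16 in emmintrin.h is implemented in terms
   of __builtin_ia32_vec_set_v8hi and ends up here. */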
32606
32607 static rtx
32608 ix86_expand_vec_set_builtin (tree exp)
32609 {
32610 enum machine_mode tmode, mode1;
32611 tree arg0, arg1, arg2;
32612 int elt;
32613 rtx op0, op1, target;
32614
32615 arg0 = CALL_EXPR_ARG (exp, 0);
32616 arg1 = CALL_EXPR_ARG (exp, 1);
32617 arg2 = CALL_EXPR_ARG (exp, 2);
32618
32619 tmode = TYPE_MODE (TREE_TYPE (arg0));
32620 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
32621 gcc_assert (VECTOR_MODE_P (tmode));
32622
32623 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
32624 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
32625 elt = get_element_number (TREE_TYPE (arg0), arg2);
32626
32627 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
32628 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
32629
32630 op0 = force_reg (tmode, op0);
32631 op1 = force_reg (mode1, op1);
32632
32633 /* OP0 is the source of these builtin functions and shouldn't be
32634 modified. Create a copy, use it and return it as target. */
32635 target = gen_reg_rtx (tmode);
32636 emit_move_insn (target, op0);
32637 ix86_expand_vector_set (true, target, op1, elt);
32638
32639 return target;
32640 }
32641
32642 /* Expand an expression EXP that calls a built-in function,
32643 with result going to TARGET if that's convenient
32644 (and in mode MODE if that's convenient).
32645 SUBTARGET may be used as the target for computing one of EXP's operands.
32646 IGNORE is nonzero if the value is to be ignored. */
32647
32648 static rtx
32649 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
32650 enum machine_mode mode, int ignore)
32651 {
32652 const struct builtin_description *d;
32653 size_t i;
32654 enum insn_code icode;
32655 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
32656 tree arg0, arg1, arg2, arg3, arg4;
32657 rtx op0, op1, op2, op3, op4, pat, insn;
32658 enum machine_mode mode0, mode1, mode2, mode3, mode4;
32659 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
32660
32661 /* For CPU builtins that can be folded, fold first and expand the fold. */
32662 switch (fcode)
32663 {
32664 case IX86_BUILTIN_CPU_INIT:
32665 {
32666 /* Make it call __cpu_indicator_init in libgcc. */
32667 tree call_expr, fndecl, type;
32668 type = build_function_type_list (integer_type_node, NULL_TREE);
32669 fndecl = build_fn_decl ("__cpu_indicator_init", type);
32670 call_expr = build_call_expr (fndecl, 0);
32671 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
32672 }
32673 case IX86_BUILTIN_CPU_IS:
32674 case IX86_BUILTIN_CPU_SUPPORTS:
32675 {
32676 tree arg0 = CALL_EXPR_ARG (exp, 0);
32677 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
32678 gcc_assert (fold_expr != NULL_TREE);
32679 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
32680 }
32681 }
32682
32683 /* Determine whether the builtin function is available under the current ISA.
32684 Originally the builtin was not created if it wasn't applicable to the
32685 current ISA based on the command line switches. With function specific
32686 options, we need to check in the context of the function making the call
32687 whether it is supported. */
32688 if (ix86_builtins_isa[fcode].isa
32689 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
32690 {
32691 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
32692 NULL, (enum fpmath_unit) 0, false);
32693
32694 if (!opts)
32695 error ("%qE needs unknown isa option", fndecl);
32696 else
32697 {
32698 gcc_assert (opts != NULL);
32699 error ("%qE needs isa option %s", fndecl, opts);
32700 free (opts);
32701 }
32702 return const0_rtx;
32703 }
32704
32705 switch (fcode)
32706 {
32707 case IX86_BUILTIN_MASKMOVQ:
32708 case IX86_BUILTIN_MASKMOVDQU:
32709 icode = (fcode == IX86_BUILTIN_MASKMOVQ
32710 ? CODE_FOR_mmx_maskmovq
32711 : CODE_FOR_sse2_maskmovdqu);
32712 /* Note the arg order is different from the operand order. */
32713 arg1 = CALL_EXPR_ARG (exp, 0);
32714 arg2 = CALL_EXPR_ARG (exp, 1);
32715 arg0 = CALL_EXPR_ARG (exp, 2);
32716 op0 = expand_normal (arg0);
32717 op1 = expand_normal (arg1);
32718 op2 = expand_normal (arg2);
32719 mode0 = insn_data[icode].operand[0].mode;
32720 mode1 = insn_data[icode].operand[1].mode;
32721 mode2 = insn_data[icode].operand[2].mode;
32722
32723 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32724 op0 = gen_rtx_MEM (mode1, op0);
32725
32726 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32727 op0 = copy_to_mode_reg (mode0, op0);
32728 if (!insn_data[icode].operand[1].predicate (op1, mode1))
32729 op1 = copy_to_mode_reg (mode1, op1);
32730 if (!insn_data[icode].operand[2].predicate (op2, mode2))
32731 op2 = copy_to_mode_reg (mode2, op2);
32732 pat = GEN_FCN (icode) (op0, op1, op2);
32733 if (! pat)
32734 return 0;
32735 emit_insn (pat);
32736 return 0;
32737
32738 case IX86_BUILTIN_LDMXCSR:
32739 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
32740 target = assign_386_stack_local (SImode, SLOT_TEMP);
32741 emit_move_insn (target, op0);
32742 emit_insn (gen_sse_ldmxcsr (target));
32743 return 0;
32744
32745 case IX86_BUILTIN_STMXCSR:
32746 target = assign_386_stack_local (SImode, SLOT_TEMP);
32747 emit_insn (gen_sse_stmxcsr (target));
32748 return copy_to_mode_reg (SImode, target);
32749
32750 case IX86_BUILTIN_CLFLUSH:
32751 arg0 = CALL_EXPR_ARG (exp, 0);
32752 op0 = expand_normal (arg0);
32753 icode = CODE_FOR_sse2_clflush;
32754 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
32755 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32756
32757 emit_insn (gen_sse2_clflush (op0));
32758 return 0;
32759
32760 case IX86_BUILTIN_MONITOR:
32761 arg0 = CALL_EXPR_ARG (exp, 0);
32762 arg1 = CALL_EXPR_ARG (exp, 1);
32763 arg2 = CALL_EXPR_ARG (exp, 2);
32764 op0 = expand_normal (arg0);
32765 op1 = expand_normal (arg1);
32766 op2 = expand_normal (arg2);
32767 if (!REG_P (op0))
32768 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
32769 if (!REG_P (op1))
32770 op1 = copy_to_mode_reg (SImode, op1);
32771 if (!REG_P (op2))
32772 op2 = copy_to_mode_reg (SImode, op2);
32773 emit_insn (ix86_gen_monitor (op0, op1, op2));
32774 return 0;
32775
32776 case IX86_BUILTIN_MWAIT:
32777 arg0 = CALL_EXPR_ARG (exp, 0);
32778 arg1 = CALL_EXPR_ARG (exp, 1);
32779 op0 = expand_normal (arg0);
32780 op1 = expand_normal (arg1);
32781 if (!REG_P (op0))
32782 op0 = copy_to_mode_reg (SImode, op0);
32783 if (!REG_P (op1))
32784 op1 = copy_to_mode_reg (SImode, op1);
32785 emit_insn (gen_sse3_mwait (op0, op1));
32786 return 0;
32787
32788 case IX86_BUILTIN_VEC_INIT_V2SI:
32789 case IX86_BUILTIN_VEC_INIT_V4HI:
32790 case IX86_BUILTIN_VEC_INIT_V8QI:
32791 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
32792
32793 case IX86_BUILTIN_VEC_EXT_V2DF:
32794 case IX86_BUILTIN_VEC_EXT_V2DI:
32795 case IX86_BUILTIN_VEC_EXT_V4SF:
32796 case IX86_BUILTIN_VEC_EXT_V4SI:
32797 case IX86_BUILTIN_VEC_EXT_V8HI:
32798 case IX86_BUILTIN_VEC_EXT_V2SI:
32799 case IX86_BUILTIN_VEC_EXT_V4HI:
32800 case IX86_BUILTIN_VEC_EXT_V16QI:
32801 return ix86_expand_vec_ext_builtin (exp, target);
32802
32803 case IX86_BUILTIN_VEC_SET_V2DI:
32804 case IX86_BUILTIN_VEC_SET_V4SF:
32805 case IX86_BUILTIN_VEC_SET_V4SI:
32806 case IX86_BUILTIN_VEC_SET_V8HI:
32807 case IX86_BUILTIN_VEC_SET_V4HI:
32808 case IX86_BUILTIN_VEC_SET_V16QI:
32809 return ix86_expand_vec_set_builtin (exp);
32810
32811 case IX86_BUILTIN_INFQ:
32812 case IX86_BUILTIN_HUGE_VALQ:
32813 {
32814 REAL_VALUE_TYPE inf;
32815 rtx tmp;
32816
32817 real_inf (&inf);
32818 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
32819
32820 tmp = validize_mem (force_const_mem (mode, tmp));
32821
32822 if (target == 0)
32823 target = gen_reg_rtx (mode);
32824
32825 emit_move_insn (target, tmp);
32826 return target;
32827 }
32828
32829 case IX86_BUILTIN_RDPMC:
32830 case IX86_BUILTIN_RDTSC:
32831 case IX86_BUILTIN_RDTSCP:
32832
32833 op0 = gen_reg_rtx (DImode);
32834 op1 = gen_reg_rtx (DImode);
32835
32836 if (fcode == IX86_BUILTIN_RDPMC)
32837 {
32838 arg0 = CALL_EXPR_ARG (exp, 0);
32839 op2 = expand_normal (arg0);
32840 if (!register_operand (op2, SImode))
32841 op2 = copy_to_mode_reg (SImode, op2);
32842
32843 insn = (TARGET_64BIT
32844 ? gen_rdpmc_rex64 (op0, op1, op2)
32845 : gen_rdpmc (op0, op2));
32846 emit_insn (insn);
32847 }
32848 else if (fcode == IX86_BUILTIN_RDTSC)
32849 {
32850 insn = (TARGET_64BIT
32851 ? gen_rdtsc_rex64 (op0, op1)
32852 : gen_rdtsc (op0));
32853 emit_insn (insn);
32854 }
32855 else
32856 {
32857 op2 = gen_reg_rtx (SImode);
32858
32859 insn = (TARGET_64BIT
32860 ? gen_rdtscp_rex64 (op0, op1, op2)
32861 : gen_rdtscp (op0, op2));
32862 emit_insn (insn);
32863
32864 arg0 = CALL_EXPR_ARG (exp, 0);
32865 op4 = expand_normal (arg0);
32866 if (!address_operand (op4, VOIDmode))
32867 {
32868 op4 = convert_memory_address (Pmode, op4);
32869 op4 = copy_addr_to_reg (op4);
32870 }
32871 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
32872 }
32873
32874 if (target == 0)
32875 {
32876 /* mode is VOIDmode if __builtin_rd* has been called
32877 without an lhs. */
32878 if (mode == VOIDmode)
32879 return target;
32880 target = gen_reg_rtx (mode);
32881 }
32882
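/* On 64-bit targets the patterns above return the counter split across
   two registers: the low 32 bits in op0 and the high 32 bits in op1.
   Combine them into a single DImode value. */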
32883 if (TARGET_64BIT)
32884 {
32885 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
32886 op1, 1, OPTAB_DIRECT);
32887 op0 = expand_simple_binop (DImode, IOR, op0, op1,
32888 op0, 1, OPTAB_DIRECT);
32889 }
32890
32891 emit_move_insn (target, op0);
32892 return target;
32893
32894 case IX86_BUILTIN_FXSAVE:
32895 case IX86_BUILTIN_FXRSTOR:
32896 case IX86_BUILTIN_FXSAVE64:
32897 case IX86_BUILTIN_FXRSTOR64:
32898 switch (fcode)
32899 {
32900 case IX86_BUILTIN_FXSAVE:
32901 icode = CODE_FOR_fxsave;
32902 break;
32903 case IX86_BUILTIN_FXRSTOR:
32904 icode = CODE_FOR_fxrstor;
32905 break;
32906 case IX86_BUILTIN_FXSAVE64:
32907 icode = CODE_FOR_fxsave64;
32908 break;
32909 case IX86_BUILTIN_FXRSTOR64:
32910 icode = CODE_FOR_fxrstor64;
32911 break;
32912 default:
32913 gcc_unreachable ();
32914 }
32915
32916 arg0 = CALL_EXPR_ARG (exp, 0);
32917 op0 = expand_normal (arg0);
32918
32919 if (!address_operand (op0, VOIDmode))
32920 {
32921 op0 = convert_memory_address (Pmode, op0);
32922 op0 = copy_addr_to_reg (op0);
32923 }
32924 op0 = gen_rtx_MEM (BLKmode, op0);
32925
32926 pat = GEN_FCN (icode) (op0);
32927 if (pat)
32928 emit_insn (pat);
32929 return 0;
32930
32931 case IX86_BUILTIN_XSAVE:
32932 case IX86_BUILTIN_XRSTOR:
32933 case IX86_BUILTIN_XSAVE64:
32934 case IX86_BUILTIN_XRSTOR64:
32935 case IX86_BUILTIN_XSAVEOPT:
32936 case IX86_BUILTIN_XSAVEOPT64:
32937 arg0 = CALL_EXPR_ARG (exp, 0);
32938 arg1 = CALL_EXPR_ARG (exp, 1);
32939 op0 = expand_normal (arg0);
32940 op1 = expand_normal (arg1);
32941
32942 if (!address_operand (op0, VOIDmode))
32943 {
32944 op0 = convert_memory_address (Pmode, op0);
32945 op0 = copy_addr_to_reg (op0);
32946 }
32947 op0 = gen_rtx_MEM (BLKmode, op0);
32948
32949 op1 = force_reg (DImode, op1);
32950
32951 if (TARGET_64BIT)
32952 {
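/* The xsave/xrstor patterns want the 64-bit feature mask split into two
   SImode halves (the instructions take it in EDX:EAX): op1 keeps the
   low 32 bits and op2 receives the high 32 bits. */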
32953 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
32954 NULL, 1, OPTAB_DIRECT);
32955 switch (fcode)
32956 {
32957 case IX86_BUILTIN_XSAVE:
32958 icode = CODE_FOR_xsave_rex64;
32959 break;
32960 case IX86_BUILTIN_XRSTOR:
32961 icode = CODE_FOR_xrstor_rex64;
32962 break;
32963 case IX86_BUILTIN_XSAVE64:
32964 icode = CODE_FOR_xsave64;
32965 break;
32966 case IX86_BUILTIN_XRSTOR64:
32967 icode = CODE_FOR_xrstor64;
32968 break;
32969 case IX86_BUILTIN_XSAVEOPT:
32970 icode = CODE_FOR_xsaveopt_rex64;
32971 break;
32972 case IX86_BUILTIN_XSAVEOPT64:
32973 icode = CODE_FOR_xsaveopt64;
32974 break;
32975 default:
32976 gcc_unreachable ();
32977 }
32978
32979 op2 = gen_lowpart (SImode, op2);
32980 op1 = gen_lowpart (SImode, op1);
32981 pat = GEN_FCN (icode) (op0, op1, op2);
32982 }
32983 else
32984 {
32985 switch (fcode)
32986 {
32987 case IX86_BUILTIN_XSAVE:
32988 icode = CODE_FOR_xsave;
32989 break;
32990 case IX86_BUILTIN_XRSTOR:
32991 icode = CODE_FOR_xrstor;
32992 break;
32993 case IX86_BUILTIN_XSAVEOPT:
32994 icode = CODE_FOR_xsaveopt;
32995 break;
32996 default:
32997 gcc_unreachable ();
32998 }
32999 pat = GEN_FCN (icode) (op0, op1);
33000 }
33001
33002 if (pat)
33003 emit_insn (pat);
33004 return 0;
33005
33006 case IX86_BUILTIN_LLWPCB:
33007 arg0 = CALL_EXPR_ARG (exp, 0);
33008 op0 = expand_normal (arg0);
33009 icode = CODE_FOR_lwp_llwpcb;
33010 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
33011 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
33012 emit_insn (gen_lwp_llwpcb (op0));
33013 return 0;
33014
33015 case IX86_BUILTIN_SLWPCB:
33016 icode = CODE_FOR_lwp_slwpcb;
33017 if (!target
33018 || !insn_data[icode].operand[0].predicate (target, Pmode))
33019 target = gen_reg_rtx (Pmode);
33020 emit_insn (gen_lwp_slwpcb (target));
33021 return target;
33022
33023 case IX86_BUILTIN_BEXTRI32:
33024 case IX86_BUILTIN_BEXTRI64:
33025 arg0 = CALL_EXPR_ARG (exp, 0);
33026 arg1 = CALL_EXPR_ARG (exp, 1);
33027 op0 = expand_normal (arg0);
33028 op1 = expand_normal (arg1);
33029 icode = (fcode == IX86_BUILTIN_BEXTRI32
33030 ? CODE_FOR_tbm_bextri_si
33031 : CODE_FOR_tbm_bextri_di);
33032 if (!CONST_INT_P (op1))
33033 {
33034 error ("last argument must be an immediate");
33035 return const0_rtx;
33036 }
33037 else
33038 {
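/* The bextri immediate encodes the starting bit position in bits 7:0
   and the field length in bits 15:8; split it into the two separate
   operands the insn pattern expects. */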
33039 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
33040 unsigned char lsb_index = INTVAL (op1) & 0xFF;
33041 op1 = GEN_INT (length);
33042 op2 = GEN_INT (lsb_index);
33043 pat = GEN_FCN (icode) (target, op0, op1, op2);
33044 if (pat)
33045 emit_insn (pat);
33046 return target;
33047 }
33048
33049 case IX86_BUILTIN_RDRAND16_STEP:
33050 icode = CODE_FOR_rdrandhi_1;
33051 mode0 = HImode;
33052 goto rdrand_step;
33053
33054 case IX86_BUILTIN_RDRAND32_STEP:
33055 icode = CODE_FOR_rdrandsi_1;
33056 mode0 = SImode;
33057 goto rdrand_step;
33058
33059 case IX86_BUILTIN_RDRAND64_STEP:
33060 icode = CODE_FOR_rdranddi_1;
33061 mode0 = DImode;
33062
33063 rdrand_step:
33064 op0 = gen_reg_rtx (mode0);
33065 emit_insn (GEN_FCN (icode) (op0));
33066
33067 arg0 = CALL_EXPR_ARG (exp, 0);
33068 op1 = expand_normal (arg0);
33069 if (!address_operand (op1, VOIDmode))
33070 {
33071 op1 = convert_memory_address (Pmode, op1);
33072 op1 = copy_addr_to_reg (op1);
33073 }
33074 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33075
33076 op1 = gen_reg_rtx (SImode);
33077 emit_move_insn (op1, CONST1_RTX (SImode));
33078
33079 /* Emit SImode conditional move. */
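/* rdrand sets CF on success and clears both CF and the destination
   register on failure, so the conditional move below yields 1 (op1)
   when CF is set and the zeroed output (op2) otherwise. */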
33080 if (mode0 == HImode)
33081 {
33082 op2 = gen_reg_rtx (SImode);
33083 emit_insn (gen_zero_extendhisi2 (op2, op0));
33084 }
33085 else if (mode0 == SImode)
33086 op2 = op0;
33087 else
33088 op2 = gen_rtx_SUBREG (SImode, op0, 0);
33089
33090 if (target == 0)
33091 target = gen_reg_rtx (SImode);
33092
33093 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
33094 const0_rtx);
33095 emit_insn (gen_rtx_SET (VOIDmode, target,
33096 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
33097 return target;
33098
33099 case IX86_BUILTIN_RDSEED16_STEP:
33100 icode = CODE_FOR_rdseedhi_1;
33101 mode0 = HImode;
33102 goto rdseed_step;
33103
33104 case IX86_BUILTIN_RDSEED32_STEP:
33105 icode = CODE_FOR_rdseedsi_1;
33106 mode0 = SImode;
33107 goto rdseed_step;
33108
33109 case IX86_BUILTIN_RDSEED64_STEP:
33110 icode = CODE_FOR_rdseeddi_1;
33111 mode0 = DImode;
33112
33113 rdseed_step:
33114 op0 = gen_reg_rtx (mode0);
33115 emit_insn (GEN_FCN (icode) (op0));
33116
33117 arg0 = CALL_EXPR_ARG (exp, 0);
33118 op1 = expand_normal (arg0);
33119 if (!address_operand (op1, VOIDmode))
33120 {
33121 op1 = convert_memory_address (Pmode, op1);
33122 op1 = copy_addr_to_reg (op1);
33123 }
33124 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
33125
33126 op2 = gen_reg_rtx (QImode);
33127
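/* rdseed sets CF on success, so op2 becomes 1 on success and 0 on
   failure; the zero extension below turns it into the int result. */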
33128 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
33129 const0_rtx);
33130 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
33131
33132 if (target == 0)
33133 target = gen_reg_rtx (SImode);
33134
33135 emit_insn (gen_zero_extendqisi2 (target, op2));
33136 return target;
33137
33138 case IX86_BUILTIN_ADDCARRYX32:
33139 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
33140 mode0 = SImode;
33141 goto addcarryx;
33142
33143 case IX86_BUILTIN_ADDCARRYX64:
33144 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
33145 mode0 = DImode;
33146
33147 addcarryx:
33148 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
33149 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
33150 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
33151 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
33152
33153 op0 = gen_reg_rtx (QImode);
33154
33155 /* Generate CF from input operand. */
33156 op1 = expand_normal (arg0);
33157 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
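/* Adding 0xff to the QImode carry-in sets CF exactly when the carry-in
   is nonzero, re-creating the carry flag for the add-with-carry below. */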
33158 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
33159
33160 /* Generate an adcx/adc instruction to compute X+Y+CF. */
33161 op2 = expand_normal (arg1);
33162 op3 = expand_normal (arg2);
33163
33164 if (!REG_P (op2))
33165 op2 = copy_to_mode_reg (mode0, op2);
33166 if (!REG_P (op3))
33167 op3 = copy_to_mode_reg (mode0, op3);
33168
33169 op0 = gen_reg_rtx (mode0);
33170
33171 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
33172 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
33173 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
33174
33175 /* Store the result. */
33176 op4 = expand_normal (arg3);
33177 if (!address_operand (op4, VOIDmode))
33178 {
33179 op4 = convert_memory_address (Pmode, op4);
33180 op4 = copy_addr_to_reg (op4);
33181 }
33182 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
33183
33184 /* Return current CF value. */
33185 if (target == 0)
33186 target = gen_reg_rtx (QImode);
33187
33188 PUT_MODE (pat, QImode);
33189 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
33190 return target;
33191
33192 case IX86_BUILTIN_GATHERSIV2DF:
33193 icode = CODE_FOR_avx2_gathersiv2df;
33194 goto gather_gen;
33195 case IX86_BUILTIN_GATHERSIV4DF:
33196 icode = CODE_FOR_avx2_gathersiv4df;
33197 goto gather_gen;
33198 case IX86_BUILTIN_GATHERDIV2DF:
33199 icode = CODE_FOR_avx2_gatherdiv2df;
33200 goto gather_gen;
33201 case IX86_BUILTIN_GATHERDIV4DF:
33202 icode = CODE_FOR_avx2_gatherdiv4df;
33203 goto gather_gen;
33204 case IX86_BUILTIN_GATHERSIV4SF:
33205 icode = CODE_FOR_avx2_gathersiv4sf;
33206 goto gather_gen;
33207 case IX86_BUILTIN_GATHERSIV8SF:
33208 icode = CODE_FOR_avx2_gathersiv8sf;
33209 goto gather_gen;
33210 case IX86_BUILTIN_GATHERDIV4SF:
33211 icode = CODE_FOR_avx2_gatherdiv4sf;
33212 goto gather_gen;
33213 case IX86_BUILTIN_GATHERDIV8SF:
33214 icode = CODE_FOR_avx2_gatherdiv8sf;
33215 goto gather_gen;
33216 case IX86_BUILTIN_GATHERSIV2DI:
33217 icode = CODE_FOR_avx2_gathersiv2di;
33218 goto gather_gen;
33219 case IX86_BUILTIN_GATHERSIV4DI:
33220 icode = CODE_FOR_avx2_gathersiv4di;
33221 goto gather_gen;
33222 case IX86_BUILTIN_GATHERDIV2DI:
33223 icode = CODE_FOR_avx2_gatherdiv2di;
33224 goto gather_gen;
33225 case IX86_BUILTIN_GATHERDIV4DI:
33226 icode = CODE_FOR_avx2_gatherdiv4di;
33227 goto gather_gen;
33228 case IX86_BUILTIN_GATHERSIV4SI:
33229 icode = CODE_FOR_avx2_gathersiv4si;
33230 goto gather_gen;
33231 case IX86_BUILTIN_GATHERSIV8SI:
33232 icode = CODE_FOR_avx2_gathersiv8si;
33233 goto gather_gen;
33234 case IX86_BUILTIN_GATHERDIV4SI:
33235 icode = CODE_FOR_avx2_gatherdiv4si;
33236 goto gather_gen;
33237 case IX86_BUILTIN_GATHERDIV8SI:
33238 icode = CODE_FOR_avx2_gatherdiv8si;
33239 goto gather_gen;
33240 case IX86_BUILTIN_GATHERALTSIV4DF:
33241 icode = CODE_FOR_avx2_gathersiv4df;
33242 goto gather_gen;
33243 case IX86_BUILTIN_GATHERALTDIV8SF:
33244 icode = CODE_FOR_avx2_gatherdiv8sf;
33245 goto gather_gen;
33246 case IX86_BUILTIN_GATHERALTSIV4DI:
33247 icode = CODE_FOR_avx2_gathersiv4di;
33248 goto gather_gen;
33249 case IX86_BUILTIN_GATHERALTDIV8SI:
33250 icode = CODE_FOR_avx2_gatherdiv8si;
33251 goto gather_gen;
33252
33253 gather_gen:
33254 arg0 = CALL_EXPR_ARG (exp, 0);
33255 arg1 = CALL_EXPR_ARG (exp, 1);
33256 arg2 = CALL_EXPR_ARG (exp, 2);
33257 arg3 = CALL_EXPR_ARG (exp, 3);
33258 arg4 = CALL_EXPR_ARG (exp, 4);
33259 op0 = expand_normal (arg0);
33260 op1 = expand_normal (arg1);
33261 op2 = expand_normal (arg2);
33262 op3 = expand_normal (arg3);
33263 op4 = expand_normal (arg4);
33264 /* Note the arg order is different from the operand order. */
33265 mode0 = insn_data[icode].operand[1].mode;
33266 mode2 = insn_data[icode].operand[3].mode;
33267 mode3 = insn_data[icode].operand[4].mode;
33268 mode4 = insn_data[icode].operand[5].mode;
33269
33270 if (target == NULL_RTX
33271 || GET_MODE (target) != insn_data[icode].operand[0].mode)
33272 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
33273 else
33274 subtarget = target;
33275
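/* The GATHERALT variants take one vector argument that is twice as wide
   as the insn pattern expects: the SIV forms pass a V8SI index, the DIV8
   forms pass full-width source and mask vectors. Only the low halves are
   used, so extract them here. */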
33276 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
33277 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
33278 {
33279 rtx half = gen_reg_rtx (V4SImode);
33280 if (!nonimmediate_operand (op2, V8SImode))
33281 op2 = copy_to_mode_reg (V8SImode, op2);
33282 emit_insn (gen_vec_extract_lo_v8si (half, op2));
33283 op2 = half;
33284 }
33285 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
33286 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
33287 {
33288 rtx (*gen) (rtx, rtx);
33289 rtx half = gen_reg_rtx (mode0);
33290 if (mode0 == V4SFmode)
33291 gen = gen_vec_extract_lo_v8sf;
33292 else
33293 gen = gen_vec_extract_lo_v8si;
33294 if (!nonimmediate_operand (op0, GET_MODE (op0)))
33295 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
33296 emit_insn (gen (half, op0));
33297 op0 = half;
33298 if (!nonimmediate_operand (op3, GET_MODE (op3)))
33299 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
33300 emit_insn (gen (half, op3));
33301 op3 = half;
33302 }
33303
33304 /* Force the memory operand into a base register here. We don't
33305 want to do this to the memory operands of other builtin
33306 functions. */
33307 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
33308
33309 if (!insn_data[icode].operand[1].predicate (op0, mode0))
33310 op0 = copy_to_mode_reg (mode0, op0);
33311 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
33312 op1 = copy_to_mode_reg (Pmode, op1);
33313 if (!insn_data[icode].operand[3].predicate (op2, mode2))
33314 op2 = copy_to_mode_reg (mode2, op2);
33315 if (!insn_data[icode].operand[4].predicate (op3, mode3))
33316 op3 = copy_to_mode_reg (mode3, op3);
33317 if (!insn_data[icode].operand[5].predicate (op4, mode4))
33318 {
33319 error ("last argument must be scale 1, 2, 4, 8");
33320 return const0_rtx;
33321 }
33322
33323 /* Optimize. If mask is known to have all high bits set,
33324 replace op0 with pc_rtx to signal that the instruction
33325 overwrites the whole destination and doesn't use its
33326 previous contents. */
33327 if (optimize)
33328 {
33329 if (TREE_CODE (arg3) == VECTOR_CST)
33330 {
33331 unsigned int negative = 0;
33332 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
33333 {
33334 tree cst = VECTOR_CST_ELT (arg3, i);
33335 if (TREE_CODE (cst) == INTEGER_CST
33336 && tree_int_cst_sign_bit (cst))
33337 negative++;
33338 else if (TREE_CODE (cst) == REAL_CST
33339 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
33340 negative++;
33341 }
33342 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
33343 op0 = pc_rtx;
33344 }
33345 else if (TREE_CODE (arg3) == SSA_NAME)
33346 {
33347 /* Recognize also when mask is like:
33348 __v2df src = _mm_setzero_pd ();
33349 __v2df mask = _mm_cmpeq_pd (src, src);
33350 or
33351 __v8sf src = _mm256_setzero_ps ();
33352 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
33353 as that is a cheaper way to load all ones into
33354 a register than having to load a constant from
33355 memory. */
33356 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
33357 if (is_gimple_call (def_stmt))
33358 {
33359 tree fndecl = gimple_call_fndecl (def_stmt);
33360 if (fndecl
33361 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
33362 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
33363 {
33364 case IX86_BUILTIN_CMPPD:
33365 case IX86_BUILTIN_CMPPS:
33366 case IX86_BUILTIN_CMPPD256:
33367 case IX86_BUILTIN_CMPPS256:
33368 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
33369 break;
33370 /* FALLTHRU */
33371 case IX86_BUILTIN_CMPEQPD:
33372 case IX86_BUILTIN_CMPEQPS:
33373 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
33374 && initializer_zerop (gimple_call_arg (def_stmt,
33375 1)))
33376 op0 = pc_rtx;
33377 break;
33378 default:
33379 break;
33380 }
33381 }
33382 }
33383 }
33384
33385 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
33386 if (! pat)
33387 return const0_rtx;
33388 emit_insn (pat);
33389
33390 if (fcode == IX86_BUILTIN_GATHERDIV8SF
33391 || fcode == IX86_BUILTIN_GATHERDIV8SI)
33392 {
33393 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
33394 ? V4SFmode : V4SImode;
33395 if (target == NULL_RTX)
33396 target = gen_reg_rtx (tmode);
33397 if (tmode == V4SFmode)
33398 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
33399 else
33400 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
33401 }
33402 else
33403 target = subtarget;
33404
33405 return target;
33406
33407 case IX86_BUILTIN_XABORT:
33408 icode = CODE_FOR_xabort;
33409 arg0 = CALL_EXPR_ARG (exp, 0);
33410 op0 = expand_normal (arg0);
33411 mode0 = insn_data[icode].operand[0].mode;
33412 if (!insn_data[icode].operand[0].predicate (op0, mode0))
33413 {
33414 error ("the xabort's argument must be an 8-bit immediate");
33415 return const0_rtx;
33416 }
33417 emit_insn (gen_xabort (op0));
33418 return 0;
33419
33420 default:
33421 break;
33422 }
33423
33424 for (i = 0, d = bdesc_special_args;
33425 i < ARRAY_SIZE (bdesc_special_args);
33426 i++, d++)
33427 if (d->code == fcode)
33428 return ix86_expand_special_args_builtin (d, exp, target);
33429
33430 for (i = 0, d = bdesc_args;
33431 i < ARRAY_SIZE (bdesc_args);
33432 i++, d++)
33433 if (d->code == fcode)
33434 switch (fcode)
33435 {
33436 case IX86_BUILTIN_FABSQ:
33437 case IX86_BUILTIN_COPYSIGNQ:
33438 if (!TARGET_SSE)
33439 /* Emit a normal call if SSE isn't available. */
33440 return expand_call (exp, target, ignore);
33441 default:
33442 return ix86_expand_args_builtin (d, exp, target);
33443 }
33444
33445 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
33446 if (d->code == fcode)
33447 return ix86_expand_sse_comi (d, exp, target);
33448
33449 for (i = 0, d = bdesc_pcmpestr;
33450 i < ARRAY_SIZE (bdesc_pcmpestr);
33451 i++, d++)
33452 if (d->code == fcode)
33453 return ix86_expand_sse_pcmpestr (d, exp, target);
33454
33455 for (i = 0, d = bdesc_pcmpistr;
33456 i < ARRAY_SIZE (bdesc_pcmpistr);
33457 i++, d++)
33458 if (d->code == fcode)
33459 return ix86_expand_sse_pcmpistr (d, exp, target);
33460
33461 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
33462 if (d->code == fcode)
33463 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
33464 (enum ix86_builtin_func_type)
33465 d->flag, d->comparison);
33466
33467 gcc_unreachable ();
33468 }
33469
33470 /* Returns a function decl for a vectorized version of the builtin function
33471 FNDECL with result vector type TYPE_OUT and argument vector type TYPE_IN,
33472 or NULL_TREE if it is not available. */
33473
33474 static tree
33475 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
33476 tree type_in)
33477 {
33478 enum machine_mode in_mode, out_mode;
33479 int in_n, out_n;
33480 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
33481
33482 if (TREE_CODE (type_out) != VECTOR_TYPE
33483 || TREE_CODE (type_in) != VECTOR_TYPE
33484 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
33485 return NULL_TREE;
33486
33487 out_mode = TYPE_MODE (TREE_TYPE (type_out));
33488 out_n = TYPE_VECTOR_SUBPARTS (type_out);
33489 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33490 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33491
33492 switch (fn)
33493 {
33494 case BUILT_IN_SQRT:
33495 if (out_mode == DFmode && in_mode == DFmode)
33496 {
33497 if (out_n == 2 && in_n == 2)
33498 return ix86_builtins[IX86_BUILTIN_SQRTPD];
33499 else if (out_n == 4 && in_n == 4)
33500 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
33501 }
33502 break;
33503
33504 case BUILT_IN_SQRTF:
33505 if (out_mode == SFmode && in_mode == SFmode)
33506 {
33507 if (out_n == 4 && in_n == 4)
33508 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
33509 else if (out_n == 8 && in_n == 8)
33510 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
33511 }
33512 break;
33513
33514 case BUILT_IN_IFLOOR:
33515 case BUILT_IN_LFLOOR:
33516 case BUILT_IN_LLFLOOR:
33517 /* The round insn does not trap on denormals. */
33518 if (flag_trapping_math || !TARGET_ROUND)
33519 break;
33520
33521 if (out_mode == SImode && in_mode == DFmode)
33522 {
33523 if (out_n == 4 && in_n == 2)
33524 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
33525 else if (out_n == 8 && in_n == 4)
33526 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
33527 }
33528 break;
33529
33530 case BUILT_IN_IFLOORF:
33531 case BUILT_IN_LFLOORF:
33532 case BUILT_IN_LLFLOORF:
33533 /* The round insn does not trap on denormals. */
33534 if (flag_trapping_math || !TARGET_ROUND)
33535 break;
33536
33537 if (out_mode == SImode && in_mode == SFmode)
33538 {
33539 if (out_n == 4 && in_n == 4)
33540 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
33541 else if (out_n == 8 && in_n == 8)
33542 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
33543 }
33544 break;
33545
33546 case BUILT_IN_ICEIL:
33547 case BUILT_IN_LCEIL:
33548 case BUILT_IN_LLCEIL:
33549 /* The round insn does not trap on denormals. */
33550 if (flag_trapping_math || !TARGET_ROUND)
33551 break;
33552
33553 if (out_mode == SImode && in_mode == DFmode)
33554 {
33555 if (out_n == 4 && in_n == 2)
33556 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
33557 else if (out_n == 8 && in_n == 4)
33558 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
33559 }
33560 break;
33561
33562 case BUILT_IN_ICEILF:
33563 case BUILT_IN_LCEILF:
33564 case BUILT_IN_LLCEILF:
33565 /* The round insn does not trap on denormals. */
33566 if (flag_trapping_math || !TARGET_ROUND)
33567 break;
33568
33569 if (out_mode == SImode && in_mode == SFmode)
33570 {
33571 if (out_n == 4 && in_n == 4)
33572 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
33573 else if (out_n == 8 && in_n == 8)
33574 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
33575 }
33576 break;
33577
33578 case BUILT_IN_IRINT:
33579 case BUILT_IN_LRINT:
33580 case BUILT_IN_LLRINT:
33581 if (out_mode == SImode && in_mode == DFmode)
33582 {
33583 if (out_n == 4 && in_n == 2)
33584 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
33585 else if (out_n == 8 && in_n == 4)
33586 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
33587 }
33588 break;
33589
33590 case BUILT_IN_IRINTF:
33591 case BUILT_IN_LRINTF:
33592 case BUILT_IN_LLRINTF:
33593 if (out_mode == SImode && in_mode == SFmode)
33594 {
33595 if (out_n == 4 && in_n == 4)
33596 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
33597 else if (out_n == 8 && in_n == 8)
33598 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
33599 }
33600 break;
33601
33602 case BUILT_IN_IROUND:
33603 case BUILT_IN_LROUND:
33604 case BUILT_IN_LLROUND:
33605 /* The round insn does not trap on denormals. */
33606 if (flag_trapping_math || !TARGET_ROUND)
33607 break;
33608
33609 if (out_mode == SImode && in_mode == DFmode)
33610 {
33611 if (out_n == 4 && in_n == 2)
33612 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
33613 else if (out_n == 8 && in_n == 4)
33614 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
33615 }
33616 break;
33617
33618 case BUILT_IN_IROUNDF:
33619 case BUILT_IN_LROUNDF:
33620 case BUILT_IN_LLROUNDF:
33621 /* The round insn does not trap on denormals. */
33622 if (flag_trapping_math || !TARGET_ROUND)
33623 break;
33624
33625 if (out_mode == SImode && in_mode == SFmode)
33626 {
33627 if (out_n == 4 && in_n == 4)
33628 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
33629 else if (out_n == 8 && in_n == 8)
33630 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
33631 }
33632 break;
33633
33634 case BUILT_IN_COPYSIGN:
33635 if (out_mode == DFmode && in_mode == DFmode)
33636 {
33637 if (out_n == 2 && in_n == 2)
33638 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
33639 else if (out_n == 4 && in_n == 4)
33640 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
33641 }
33642 break;
33643
33644 case BUILT_IN_COPYSIGNF:
33645 if (out_mode == SFmode && in_mode == SFmode)
33646 {
33647 if (out_n == 4 && in_n == 4)
33648 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
33649 else if (out_n == 8 && in_n == 8)
33650 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
33651 }
33652 break;
33653
33654 case BUILT_IN_FLOOR:
33655 /* The round insn does not trap on denormals. */
33656 if (flag_trapping_math || !TARGET_ROUND)
33657 break;
33658
33659 if (out_mode == DFmode && in_mode == DFmode)
33660 {
33661 if (out_n == 2 && in_n == 2)
33662 return ix86_builtins[IX86_BUILTIN_FLOORPD];
33663 else if (out_n == 4 && in_n == 4)
33664 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
33665 }
33666 break;
33667
33668 case BUILT_IN_FLOORF:
33669 /* The round insn does not trap on denormals. */
33670 if (flag_trapping_math || !TARGET_ROUND)
33671 break;
33672
33673 if (out_mode == SFmode && in_mode == SFmode)
33674 {
33675 if (out_n == 4 && in_n == 4)
33676 return ix86_builtins[IX86_BUILTIN_FLOORPS];
33677 else if (out_n == 8 && in_n == 8)
33678 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
33679 }
33680 break;
33681
33682 case BUILT_IN_CEIL:
33683 /* The round insn does not trap on denormals. */
33684 if (flag_trapping_math || !TARGET_ROUND)
33685 break;
33686
33687 if (out_mode == DFmode && in_mode == DFmode)
33688 {
33689 if (out_n == 2 && in_n == 2)
33690 return ix86_builtins[IX86_BUILTIN_CEILPD];
33691 else if (out_n == 4 && in_n == 4)
33692 return ix86_builtins[IX86_BUILTIN_CEILPD256];
33693 }
33694 break;
33695
33696 case BUILT_IN_CEILF:
33697 /* The round insn does not trap on denormals. */
33698 if (flag_trapping_math || !TARGET_ROUND)
33699 break;
33700
33701 if (out_mode == SFmode && in_mode == SFmode)
33702 {
33703 if (out_n == 4 && in_n == 4)
33704 return ix86_builtins[IX86_BUILTIN_CEILPS];
33705 else if (out_n == 8 && in_n == 8)
33706 return ix86_builtins[IX86_BUILTIN_CEILPS256];
33707 }
33708 break;
33709
33710 case BUILT_IN_TRUNC:
33711 /* The round insn does not trap on denormals. */
33712 if (flag_trapping_math || !TARGET_ROUND)
33713 break;
33714
33715 if (out_mode == DFmode && in_mode == DFmode)
33716 {
33717 if (out_n == 2 && in_n == 2)
33718 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
33719 else if (out_n == 4 && in_n == 4)
33720 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
33721 }
33722 break;
33723
33724 case BUILT_IN_TRUNCF:
33725 /* The round insn does not trap on denormals. */
33726 if (flag_trapping_math || !TARGET_ROUND)
33727 break;
33728
33729 if (out_mode == SFmode && in_mode == SFmode)
33730 {
33731 if (out_n == 4 && in_n == 4)
33732 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
33733 else if (out_n == 8 && in_n == 8)
33734 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
33735 }
33736 break;
33737
33738 case BUILT_IN_RINT:
33739 /* The round insn does not trap on denormals. */
33740 if (flag_trapping_math || !TARGET_ROUND)
33741 break;
33742
33743 if (out_mode == DFmode && in_mode == DFmode)
33744 {
33745 if (out_n == 2 && in_n == 2)
33746 return ix86_builtins[IX86_BUILTIN_RINTPD];
33747 else if (out_n == 4 && in_n == 4)
33748 return ix86_builtins[IX86_BUILTIN_RINTPD256];
33749 }
33750 break;
33751
33752 case BUILT_IN_RINTF:
33753 /* The round insn does not trap on denormals. */
33754 if (flag_trapping_math || !TARGET_ROUND)
33755 break;
33756
33757 if (out_mode == SFmode && in_mode == SFmode)
33758 {
33759 if (out_n == 4 && in_n == 4)
33760 return ix86_builtins[IX86_BUILTIN_RINTPS];
33761 else if (out_n == 8 && in_n == 8)
33762 return ix86_builtins[IX86_BUILTIN_RINTPS256];
33763 }
33764 break;
33765
33766 case BUILT_IN_ROUND:
33767 /* The round insn does not trap on denormals. */
33768 if (flag_trapping_math || !TARGET_ROUND)
33769 break;
33770
33771 if (out_mode == DFmode && in_mode == DFmode)
33772 {
33773 if (out_n == 2 && in_n == 2)
33774 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
33775 else if (out_n == 4 && in_n == 4)
33776 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
33777 }
33778 break;
33779
33780 case BUILT_IN_ROUNDF:
33781 /* The round insn does not trap on denormals. */
33782 if (flag_trapping_math || !TARGET_ROUND)
33783 break;
33784
33785 if (out_mode == SFmode && in_mode == SFmode)
33786 {
33787 if (out_n == 4 && in_n == 4)
33788 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
33789 else if (out_n == 8 && in_n == 8)
33790 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
33791 }
33792 break;
33793
33794 case BUILT_IN_FMA:
33795 if (out_mode == DFmode && in_mode == DFmode)
33796 {
33797 if (out_n == 2 && in_n == 2)
33798 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
33799 if (out_n == 4 && in_n == 4)
33800 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
33801 }
33802 break;
33803
33804 case BUILT_IN_FMAF:
33805 if (out_mode == SFmode && in_mode == SFmode)
33806 {
33807 if (out_n == 4 && in_n == 4)
33808 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
33809 if (out_n == 8 && in_n == 8)
33810 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
33811 }
33812 break;
33813
33814 default:
33815 break;
33816 }
33817
33818 /* Dispatch to a handler for a vectorization library. */
33819 if (ix86_veclib_handler)
33820 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
33821 type_in);
33822
33823 return NULL_TREE;
33824 }
33825
33826 /* Handler for an SVML-style interface to
33827 a library with vectorized intrinsics. */
33828
33829 static tree
33830 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
33831 {
33832 char name[20];
33833 tree fntype, new_fndecl, args;
33834 unsigned arity;
33835 const char *bname;
33836 enum machine_mode el_mode, in_mode;
33837 int n, in_n;
33838
33839 /* SVML is suitable for unsafe math only. */
33840 if (!flag_unsafe_math_optimizations)
33841 return NULL_TREE;
33842
33843 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33844 n = TYPE_VECTOR_SUBPARTS (type_out);
33845 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33846 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33847 if (el_mode != in_mode
33848 || n != in_n)
33849 return NULL_TREE;
33850
33851 switch (fn)
33852 {
33853 case BUILT_IN_EXP:
33854 case BUILT_IN_LOG:
33855 case BUILT_IN_LOG10:
33856 case BUILT_IN_POW:
33857 case BUILT_IN_TANH:
33858 case BUILT_IN_TAN:
33859 case BUILT_IN_ATAN:
33860 case BUILT_IN_ATAN2:
33861 case BUILT_IN_ATANH:
33862 case BUILT_IN_CBRT:
33863 case BUILT_IN_SINH:
33864 case BUILT_IN_SIN:
33865 case BUILT_IN_ASINH:
33866 case BUILT_IN_ASIN:
33867 case BUILT_IN_COSH:
33868 case BUILT_IN_COS:
33869 case BUILT_IN_ACOSH:
33870 case BUILT_IN_ACOS:
33871 if (el_mode != DFmode || n != 2)
33872 return NULL_TREE;
33873 break;
33874
33875 case BUILT_IN_EXPF:
33876 case BUILT_IN_LOGF:
33877 case BUILT_IN_LOG10F:
33878 case BUILT_IN_POWF:
33879 case BUILT_IN_TANHF:
33880 case BUILT_IN_TANF:
33881 case BUILT_IN_ATANF:
33882 case BUILT_IN_ATAN2F:
33883 case BUILT_IN_ATANHF:
33884 case BUILT_IN_CBRTF:
33885 case BUILT_IN_SINHF:
33886 case BUILT_IN_SINF:
33887 case BUILT_IN_ASINHF:
33888 case BUILT_IN_ASINF:
33889 case BUILT_IN_COSHF:
33890 case BUILT_IN_COSF:
33891 case BUILT_IN_ACOSHF:
33892 case BUILT_IN_ACOSF:
33893 if (el_mode != SFmode || n != 4)
33894 return NULL_TREE;
33895 break;
33896
33897 default:
33898 return NULL_TREE;
33899 }
33900
33901 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
33902
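/* E.g. for BUILT_IN_SINF, bname is "__builtin_sinf"; the code below
   builds "vmlssin4", and the uppercasing step turns it into
   "vmlsSin4". */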
33903 if (fn == BUILT_IN_LOGF)
33904 strcpy (name, "vmlsLn4");
33905 else if (fn == BUILT_IN_LOG)
33906 strcpy (name, "vmldLn2");
33907 else if (n == 4)
33908 {
33909 sprintf (name, "vmls%s", bname+10);
33910 name[strlen (name)-1] = '4';
33911 }
33912 else
33913 sprintf (name, "vmld%s2", bname+10);
33914
33915 /* Convert the first letter of the function name to uppercase. */
33916 name[4] &= ~0x20;
33917
33918 arity = 0;
33919 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
33920 args;
33921 args = TREE_CHAIN (args))
33922 arity++;
33923
33924 if (arity == 1)
33925 fntype = build_function_type_list (type_out, type_in, NULL);
33926 else
33927 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
33928
33929 /* Build a function declaration for the vectorized function. */
33930 new_fndecl = build_decl (BUILTINS_LOCATION,
33931 FUNCTION_DECL, get_identifier (name), fntype);
33932 TREE_PUBLIC (new_fndecl) = 1;
33933 DECL_EXTERNAL (new_fndecl) = 1;
33934 DECL_IS_NOVOPS (new_fndecl) = 1;
33935 TREE_READONLY (new_fndecl) = 1;
33936
33937 return new_fndecl;
33938 }
33939
33940 /* Handler for an ACML-style interface to
33941 a library with vectorized intrinsics. */
33942
33943 static tree
33944 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
33945 {
33946 char name[20] = "__vr.._";
33947 tree fntype, new_fndecl, args;
33948 unsigned arity;
33949 const char *bname;
33950 enum machine_mode el_mode, in_mode;
33951 int n, in_n;
33952
33953 /* ACML is 64-bit only and suitable for unsafe math only, as it
33954 does not correctly support parts of IEEE arithmetic with the
33955 required precision, such as denormals. */
33956 if (!TARGET_64BIT
33957 || !flag_unsafe_math_optimizations)
33958 return NULL_TREE;
33959
33960 el_mode = TYPE_MODE (TREE_TYPE (type_out));
33961 n = TYPE_VECTOR_SUBPARTS (type_out);
33962 in_mode = TYPE_MODE (TREE_TYPE (type_in));
33963 in_n = TYPE_VECTOR_SUBPARTS (type_in);
33964 if (el_mode != in_mode
33965 || n != in_n)
33966 return NULL_TREE;
33967
33968 switch (fn)
33969 {
33970 case BUILT_IN_SIN:
33971 case BUILT_IN_COS:
33972 case BUILT_IN_EXP:
33973 case BUILT_IN_LOG:
33974 case BUILT_IN_LOG2:
33975 case BUILT_IN_LOG10:
33976 name[4] = 'd';
33977 name[5] = '2';
33978 if (el_mode != DFmode
33979 || n != 2)
33980 return NULL_TREE;
33981 break;
33982
33983 case BUILT_IN_SINF:
33984 case BUILT_IN_COSF:
33985 case BUILT_IN_EXPF:
33986 case BUILT_IN_POWF:
33987 case BUILT_IN_LOGF:
33988 case BUILT_IN_LOG2F:
33989 case BUILT_IN_LOG10F:
33990 name[4] = 's';
33991 name[5] = '4';
33992 if (el_mode != SFmode
33993 || n != 4)
33994 return NULL_TREE;
33995 break;
33996
33997 default:
33998 return NULL_TREE;
33999 }
34000
34001 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
34002 sprintf (name + 7, "%s", bname+10);
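/* E.g. this yields "__vrd2_sin" for BUILT_IN_SIN and "__vrs4_sinf"
   for BUILT_IN_SINF. */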
34003
34004 arity = 0;
34005 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
34006 args;
34007 args = TREE_CHAIN (args))
34008 arity++;
34009
34010 if (arity == 1)
34011 fntype = build_function_type_list (type_out, type_in, NULL);
34012 else
34013 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
34014
34015 /* Build a function declaration for the vectorized function. */
34016 new_fndecl = build_decl (BUILTINS_LOCATION,
34017 FUNCTION_DECL, get_identifier (name), fntype);
34018 TREE_PUBLIC (new_fndecl) = 1;
34019 DECL_EXTERNAL (new_fndecl) = 1;
34020 DECL_IS_NOVOPS (new_fndecl) = 1;
34021 TREE_READONLY (new_fndecl) = 1;
34022
34023 return new_fndecl;
34024 }
34025
34026 /* Returns a decl of a function that implements a gather load with
34027 memory vector type MEM_VECTYPE, index type INDEX_TYPE and scale SCALE.
34028 Returns NULL_TREE if it is not available. */
34029
34030 static tree
34031 ix86_vectorize_builtin_gather (const_tree mem_vectype,
34032 const_tree index_type, int scale)
34033 {
34034 bool si;
34035 enum ix86_builtins code;
34036
34037 if (! TARGET_AVX2)
34038 return NULL_TREE;
34039
34040 if ((TREE_CODE (index_type) != INTEGER_TYPE
34041 && !POINTER_TYPE_P (index_type))
34042 || (TYPE_MODE (index_type) != SImode
34043 && TYPE_MODE (index_type) != DImode))
34044 return NULL_TREE;
34045
34046 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
34047 return NULL_TREE;
34048
34049 /* The v*gather* insns sign-extend the index to pointer mode. */
34050 if (TYPE_PRECISION (index_type) < POINTER_SIZE
34051 && TYPE_UNSIGNED (index_type))
34052 return NULL_TREE;
34053
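/* The gather insns only support scales of 1, 2, 4 and 8. */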
34054 if (scale <= 0
34055 || scale > 8
34056 || (scale & (scale - 1)) != 0)
34057 return NULL_TREE;
34058
34059 si = TYPE_MODE (index_type) == SImode;
34060 switch (TYPE_MODE (mem_vectype))
34061 {
34062 case V2DFmode:
34063 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
34064 break;
34065 case V4DFmode:
34066 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
34067 break;
34068 case V2DImode:
34069 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
34070 break;
34071 case V4DImode:
34072 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
34073 break;
34074 case V4SFmode:
34075 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
34076 break;
34077 case V8SFmode:
34078 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
34079 break;
34080 case V4SImode:
34081 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
34082 break;
34083 case V8SImode:
34084 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
34085 break;
34086 default:
34087 return NULL_TREE;
34088 }
34089
34090 return ix86_builtins[code];
34091 }
34092
34093 /* Returns the decl of a target-specific builtin that implements the
34094 reciprocal of the function FN, or NULL_TREE if it is not available. */
34095
34096 static tree
34097 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
34098 bool sqrt ATTRIBUTE_UNUSED)
34099 {
34100 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
34101 && flag_finite_math_only && !flag_trapping_math
34102 && flag_unsafe_math_optimizations))
34103 return NULL_TREE;
34104
34105 if (md_fn)
34106 /* Machine dependent builtins. */
34107 switch (fn)
34108 {
34109 /* Vectorized version of sqrt to rsqrt conversion. */
34110 case IX86_BUILTIN_SQRTPS_NR:
34111 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
34112
34113 case IX86_BUILTIN_SQRTPS_NR256:
34114 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
34115
34116 default:
34117 return NULL_TREE;
34118 }
34119 else
34120 /* Normal builtins. */
34121 switch (fn)
34122 {
34123 /* Sqrt to rsqrt conversion. */
34124 case BUILT_IN_SQRTF:
34125 return ix86_builtins[IX86_BUILTIN_RSQRTF];
34126
34127 default:
34128 return NULL_TREE;
34129 }
34130 }
34131 \f
34132 /* Helper for avx_vpermilps256_operand et al. This is also used by
34133 the expansion functions to turn the parallel back into a mask.
34134 The return value is 0 for no match and the imm8+1 for a match. */
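/* For example, for V8SFmode the parallel (1 0 3 2 5 4 7 6), which swaps
   adjacent elements within each lane, encodes as imm8 0xb1 and so
   returns 0xb2. */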
34135
34136 int
34137 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
34138 {
34139 unsigned i, nelt = GET_MODE_NUNITS (mode);
34140 unsigned mask = 0;
34141 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34142
34143 if (XVECLEN (par, 0) != (int) nelt)
34144 return 0;
34145
34146 /* Validate that all of the elements are constants, and not totally
34147 out of range. Copy the data into an integral array to make the
34148 subsequent checks easier. */
34149 for (i = 0; i < nelt; ++i)
34150 {
34151 rtx er = XVECEXP (par, 0, i);
34152 unsigned HOST_WIDE_INT ei;
34153
34154 if (!CONST_INT_P (er))
34155 return 0;
34156 ei = INTVAL (er);
34157 if (ei >= nelt)
34158 return 0;
34159 ipar[i] = ei;
34160 }
34161
34162 switch (mode)
34163 {
34164 case V4DFmode:
34165 /* In the 256-bit DFmode case, we can only move elements within
34166 a 128-bit lane. */
34167 for (i = 0; i < 2; ++i)
34168 {
34169 if (ipar[i] >= 2)
34170 return 0;
34171 mask |= ipar[i] << i;
34172 }
34173 for (i = 2; i < 4; ++i)
34174 {
34175 if (ipar[i] < 2)
34176 return 0;
34177 mask |= (ipar[i] - 2) << i;
34178 }
34179 break;
34180
34181 case V8SFmode:
34182 /* In the 256-bit SFmode case, we have full freedom of movement
34183 within the low 128-bit lane, but the high 128-bit lane must
34184 mirror the exact same pattern. */
34185 for (i = 0; i < 4; ++i)
34186 if (ipar[i] + 4 != ipar[i + 4])
34187 return 0;
34188 nelt = 4;
34189 /* FALLTHRU */
34190
34191 case V2DFmode:
34192 case V4SFmode:
34193 /* In the 128-bit case, we have full freedom in the placement of
34194 the elements from the source operand. */
34195 for (i = 0; i < nelt; ++i)
34196 mask |= ipar[i] << (i * (nelt / 2));
34197 break;
34198
34199 default:
34200 gcc_unreachable ();
34201 }
34202
34203 /* Make sure success has a non-zero value by adding one. */
34204 return mask + 1;
34205 }
34206
34207 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
34208 the expansion functions to turn the parallel back into a mask.
34209 The return value is 0 for no match and the imm8+1 for a match. */
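/* For example, for V4DFmode the parallel (2 3 4 5), i.e. the high lane
   of the first operand followed by the low lane of the second, encodes
   as imm8 0x21 and so returns 0x22. */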
34210
34211 int
34212 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
34213 {
34214 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
34215 unsigned mask = 0;
34216 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
34217
34218 if (XVECLEN (par, 0) != (int) nelt)
34219 return 0;
34220
34221 /* Validate that all of the elements are constants, and not totally
34222 out of range. Copy the data into an integral array to make the
34223 subsequent checks easier. */
34224 for (i = 0; i < nelt; ++i)
34225 {
34226 rtx er = XVECEXP (par, 0, i);
34227 unsigned HOST_WIDE_INT ei;
34228
34229 if (!CONST_INT_P (er))
34230 return 0;
34231 ei = INTVAL (er);
34232 if (ei >= 2 * nelt)
34233 return 0;
34234 ipar[i] = ei;
34235 }
34236
34237 /* Validate that each half of the permute selects consecutive elements. */
34238 for (i = 0; i < nelt2 - 1; ++i)
34239 if (ipar[i] + 1 != ipar[i + 1])
34240 return 0;
34241 for (i = nelt2; i < nelt - 1; ++i)
34242 if (ipar[i] + 1 != ipar[i + 1])
34243 return 0;
34244
34245 /* Reconstruct the mask. */
34246 for (i = 0; i < 2; ++i)
34247 {
34248 unsigned e = ipar[i * nelt2];
34249 if (e % nelt2)
34250 return 0;
34251 e /= nelt2;
34252 mask |= e << (i * 4);
34253 }
34254
34255 /* Make sure success has a non-zero value by adding one. */
34256 return mask + 1;
34257 }
34258 \f
34259 /* Store OPERAND to the memory after reload is completed. This means
34260 that we can't easily use assign_stack_local. */
34261 rtx
34262 ix86_force_to_memory (enum machine_mode mode, rtx operand)
34263 {
34264 rtx result;
34265
34266 gcc_assert (reload_completed);
34267 if (ix86_using_red_zone ())
34268 {
34269 result = gen_rtx_MEM (mode,
34270 gen_rtx_PLUS (Pmode,
34271 stack_pointer_rtx,
34272 GEN_INT (-RED_ZONE_SIZE)));
34273 emit_move_insn (result, operand);
34274 }
34275 else if (TARGET_64BIT)
34276 {
34277 switch (mode)
34278 {
34279 case HImode:
34280 case SImode:
34281 operand = gen_lowpart (DImode, operand);
34282 /* FALLTHRU */
34283 case DImode:
34284 emit_insn (
34285 gen_rtx_SET (VOIDmode,
34286 gen_rtx_MEM (DImode,
34287 gen_rtx_PRE_DEC (DImode,
34288 stack_pointer_rtx)),
34289 operand));
34290 break;
34291 default:
34292 gcc_unreachable ();
34293 }
34294 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34295 }
34296 else
34297 {
34298 switch (mode)
34299 {
34300 case DImode:
34301 {
34302 rtx operands[2];
34303 split_double_mode (mode, &operand, 1, operands, operands + 1);
34304 emit_insn (
34305 gen_rtx_SET (VOIDmode,
34306 gen_rtx_MEM (SImode,
34307 gen_rtx_PRE_DEC (Pmode,
34308 stack_pointer_rtx)),
34309 operands[1]));
34310 emit_insn (
34311 gen_rtx_SET (VOIDmode,
34312 gen_rtx_MEM (SImode,
34313 gen_rtx_PRE_DEC (Pmode,
34314 stack_pointer_rtx)),
34315 operands[0]));
34316 }
34317 break;
34318 case HImode:
34319 /* Store HImodes as SImodes. */
34320 operand = gen_lowpart (SImode, operand);
34321 /* FALLTHRU */
34322 case SImode:
34323 emit_insn (
34324 gen_rtx_SET (VOIDmode,
34325 gen_rtx_MEM (GET_MODE (operand),
34326 gen_rtx_PRE_DEC (SImode,
34327 stack_pointer_rtx)),
34328 operand));
34329 break;
34330 default:
34331 gcc_unreachable ();
34332 }
34333 result = gen_rtx_MEM (mode, stack_pointer_rtx);
34334 }
34335 return result;
34336 }
34337
34338 /* Free operand from the memory. */
34339 void
34340 ix86_free_from_memory (enum machine_mode mode)
34341 {
34342 if (!ix86_using_red_zone ())
34343 {
34344 int size;
34345
34346 if (mode == DImode || TARGET_64BIT)
34347 size = 8;
34348 else
34349 size = 4;
34350 /* Use LEA to deallocate stack space. In peephole2 it will be converted
34351 to a pop or add instruction if registers are available. */
34352 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
34353 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
34354 GEN_INT (size))));
34355 }
34356 }
34357
34358 /* Return a register priority for hard reg REGNO. */
34359 static int
34360 ix86_register_priority (int hard_regno)
34361 {
34362 /* ebp and r13 as the base always want a displacement, and r12 as the
34363 base always wants an index. So discourage their use in an
34364 address. */
34365 if (hard_regno == R12_REG || hard_regno == R13_REG)
34366 return 0;
34367 if (hard_regno == BP_REG)
34368 return 1;
34369 /* New x86-64 int registers result in bigger code size. Discourage
34370 them. */
34371 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
34372 return 2;
34373 /* New x86-64 SSE registers result in bigger code size. Discourage
34374 them. */
34375 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
34376 return 2;
34377 /* Usage of AX register results in smaller code. Prefer it. */
34378 if (hard_regno == 0)
34379 return 4;
34380 return 3;
34381 }
34382
34383 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
34384
34385 Put float CONST_DOUBLE in the constant pool instead of fp regs.
34386 QImode must go into class Q_REGS.
34387 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
34388 movdf to do mem-to-mem moves through integer regs. */
34389
34390 static reg_class_t
34391 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
34392 {
34393 enum machine_mode mode = GET_MODE (x);
34394
34395 /* We're only allowed to return a subclass of CLASS. Many of the
34396 following checks fail for NO_REGS, so eliminate that early. */
34397 if (regclass == NO_REGS)
34398 return NO_REGS;
34399
34400 /* All classes can load zeros. */
34401 if (x == CONST0_RTX (mode))
34402 return regclass;
34403
34404 /* Force constants into memory if we are loading a (nonzero) constant into
34405 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
34406 instructions to load from a constant. */
34407 if (CONSTANT_P (x)
34408 && (MAYBE_MMX_CLASS_P (regclass)
34409 || MAYBE_SSE_CLASS_P (regclass)
34410 || MAYBE_MASK_CLASS_P (regclass)))
34411 return NO_REGS;
34412
34413 /* Prefer SSE regs only, if we can use them for math. */
34414 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
34415 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
34416
34417 /* Floating-point constants need more complex checks. */
34418 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
34419 {
34420 /* General regs can load everything. */
34421 if (reg_class_subset_p (regclass, GENERAL_REGS))
34422 return regclass;
34423
34424 /* Floats can load 0 and 1 plus some others. Note that we eliminated
34425 zero above. We only want to wind up preferring 80387 registers if
34426 we plan on doing computation with them. */
34427 if (TARGET_80387
34428 && standard_80387_constant_p (x) > 0)
34429 {
34430 /* Limit class to non-sse. */
34431 if (regclass == FLOAT_SSE_REGS)
34432 return FLOAT_REGS;
34433 if (regclass == FP_TOP_SSE_REGS)
34434 return FP_TOP_REG;
34435 if (regclass == FP_SECOND_SSE_REGS)
34436 return FP_SECOND_REG;
34437 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
34438 return regclass;
34439 }
34440
34441 return NO_REGS;
34442 }
34443
34444 /* Generally when we see PLUS here, it's the function invariant
34445 (plus soft-fp const_int), which can only be computed into general
34446 regs. */
34447 if (GET_CODE (x) == PLUS)
34448 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
34449
34450 /* QImode constants are easy to load, but non-constant QImode data
34451 must go into Q_REGS. */
34452 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
34453 {
34454 if (reg_class_subset_p (regclass, Q_REGS))
34455 return regclass;
34456 if (reg_class_subset_p (Q_REGS, regclass))
34457 return Q_REGS;
34458 return NO_REGS;
34459 }
34460
34461 return regclass;
34462 }
34463
34464 /* Discourage putting floating-point values in SSE registers unless
34465 SSE math is being used, and likewise for the 387 registers. */
34466 static reg_class_t
34467 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
34468 {
34469 enum machine_mode mode = GET_MODE (x);
34470
34471 /* Restrict the output reload class to the register bank that we are doing
34472 math on. If we cannot return a subset of REGCLASS, reject this
34473 alternative: reload will then fall back to its own choice. */
34475 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
34476 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
34477
34478 if (X87_FLOAT_MODE_P (mode))
34479 {
34480 if (regclass == FP_TOP_SSE_REGS)
34481 return FP_TOP_REG;
34482 else if (regclass == FP_SECOND_SSE_REGS)
34483 return FP_SECOND_REG;
34484 else
34485 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
34486 }
34487
34488 return regclass;
34489 }
34490
34491 static reg_class_t
34492 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
34493 enum machine_mode mode, secondary_reload_info *sri)
34494 {
34495 /* Double-word spills from general registers to non-offsettable memory
34496 references (zero-extended addresses) require special handling. */
34497 if (TARGET_64BIT
34498 && MEM_P (x)
34499 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
34500 && INTEGER_CLASS_P (rclass)
34501 && !offsettable_memref_p (x))
34502 {
34503 sri->icode = (in_p
34504 ? CODE_FOR_reload_noff_load
34505 : CODE_FOR_reload_noff_store);
34506 /* Add the cost of moving address to a temporary. */
34507 sri->extra_cost = 1;
34508
34509 return NO_REGS;
34510 }
34511
34512 /* QImode spills from non-QI registers require an
34513 intermediate register on 32-bit targets. */
34514 if (mode == QImode
34515 && (MAYBE_MASK_CLASS_P (rclass)
34516 || (!TARGET_64BIT && !in_p
34517 && INTEGER_CLASS_P (rclass)
34518 && MAYBE_NON_Q_CLASS_P (rclass))))
34519 {
34520 int regno;
34521
34522 if (REG_P (x))
34523 regno = REGNO (x);
34524 else
34525 regno = -1;
34526
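/* For pseudos and SUBREGs, ask true_regnum for the underlying
   hard register, if any. */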
34527 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
34528 regno = true_regnum (x);
34529
34530 /* Return Q_REGS if the operand is in memory. */
34531 if (regno == -1)
34532 return Q_REGS;
34533 }
34534
34535 /* This condition handles the corner case where an expression involving
34536 pointers gets vectorized. We're trying to use the address of a
34537 stack slot as a vector initializer.
34538
34539 (set (reg:V2DI 74 [ vect_cst_.2 ])
34540 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
34541
34542 Eventually frame gets turned into sp+offset like this:
34543
34544 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34545 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34546 (const_int 392 [0x188]))))
34547
34548 That later gets turned into:
34549
34550 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34551 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
34552 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
34553
34554 We'll have the following reload recorded:
34555
34556 Reload 0: reload_in (DI) =
34557 (plus:DI (reg/f:DI 7 sp)
34558 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
34559 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34560 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
34561 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
34562 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
34563 reload_reg_rtx: (reg:V2DI 22 xmm1)
34564
34565 Which isn't going to work since SSE instructions can't handle scalar
34566 additions. Returning GENERAL_REGS forces the addition into an integer
34567 register, and reload can then handle the subsequent reloads without problems. */
34568
34569 if (in_p && GET_CODE (x) == PLUS
34570 && SSE_CLASS_P (rclass)
34571 && SCALAR_INT_MODE_P (mode))
34572 return GENERAL_REGS;
34573
34574 return NO_REGS;
34575 }
34576
34577 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
34578
34579 static bool
34580 ix86_class_likely_spilled_p (reg_class_t rclass)
34581 {
34582 switch (rclass)
34583 {
34584 case AREG:
34585 case DREG:
34586 case CREG:
34587 case BREG:
34588 case AD_REGS:
34589 case SIREG:
34590 case DIREG:
34591 case SSE_FIRST_REG:
34592 case FP_TOP_REG:
34593 case FP_SECOND_REG:
34594 case BND_REGS:
34595 return true;
34596
34597 default:
34598 break;
34599 }
34600
34601 return false;
34602 }
34603
34604 /* If we are copying between general and FP registers, we need a memory
34605 location. The same is true for SSE and MMX registers.
34606
34607 To keep register_move_cost fast, an inline variant is provided.
34608
34609 The macro can't work reliably when one of the CLASSES is a class containing
34610 registers from multiple units (SSE, MMX, integer). We avoid this by never
34611 combining those units in a single alternative in the machine description.
34612 Ensure that this constraint holds to avoid unexpected surprises.
34613
34614 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
34615 enforce these sanity checks. */
34616
34617 static inline bool
34618 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34619 enum machine_mode mode, int strict)
34620 {
34621 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
34622 return false;
34623 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
34624 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
34625 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
34626 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
34627 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
34628 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
34629 {
34630 gcc_assert (!strict || lra_in_progress);
34631 return true;
34632 }
34633
34634 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
34635 return true;
34636
34637 /* ??? This is a lie. We do have moves between mmx/general, and for
34638 mmx/sse2. But by saying we need secondary memory we discourage the
34639 register allocator from using the mmx registers unless needed. */
34640 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
34641 return true;
34642
34643 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34644 {
34645 /* SSE1 doesn't have any direct moves from other classes. */
34646 if (!TARGET_SSE2)
34647 return true;
34648
34649 /* If the target says that inter-unit moves are more expensive
34650 than moving through memory, then don't generate them. */
34651 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
34652 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
34653 return true;
34654
34655 /* Between SSE and general, we have moves no larger than word size. */
34656 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34657 return true;
34658 }
34659
34660 return false;
34661 }
34662
34663 bool
34664 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
34665 enum machine_mode mode, int strict)
34666 {
34667 return inline_secondary_memory_needed (class1, class2, mode, strict);
34668 }
34669
34670 /* Implement the TARGET_CLASS_MAX_NREGS hook.
34671
34672 On the 80386, this is the size of MODE in words,
34673 except in the FP regs, where a single reg is always enough. */
34674
34675 static unsigned char
34676 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
34677 {
34678 if (MAYBE_INTEGER_CLASS_P (rclass))
34679 {
34680 if (mode == XFmode)
34681 return (TARGET_64BIT ? 2 : 3);
34682 else if (mode == XCmode)
34683 return (TARGET_64BIT ? 4 : 6);
34684 else
34685 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
34686 }
34687 else
34688 {
34689 if (COMPLEX_MODE_P (mode))
34690 return 2;
34691 else
34692 return 1;
34693 }
34694 }
34695
34696 /* Return true if the registers in CLASS cannot represent the change from
34697 modes FROM to TO. */
34698
34699 bool
34700 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
34701 enum reg_class regclass)
34702 {
34703 if (from == to)
34704 return false;
34705
34706 /* x87 registers can't do subreg at all, as all values are reformatted
34707 to extended precision. */
34708 if (MAYBE_FLOAT_CLASS_P (regclass))
34709 return true;
34710
34711 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
34712 {
34713 /* Vector registers do not support QI or HImode loads. If we don't
34714 disallow a change to these modes, reload will assume it's ok to
34715 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
34716 the vec_dupv4hi pattern. */
34717 if (GET_MODE_SIZE (from) < 4)
34718 return true;
34719
34720 /* Vector registers do not support subreg with nonzero offsets, which
34721 are otherwise valid for integer registers. Since we can't see
34722 whether we have a nonzero offset from here, prohibit all
34723 nonparadoxical subregs changing size. */
34724 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
34725 return true;
34726 }
34727
34728 return false;
34729 }
34730
34731 /* Return the cost of moving data of mode M between a
34732 register and memory. A value of 2 is the default; this cost is
34733 relative to those in `REGISTER_MOVE_COST'.
34734
34735 This function is used extensively by register_move_cost, which is used to
34736 build tables at startup. Make it inline in this case.
34737 When IN is 2, return maximum of in and out move cost.
34738
34739 If moving between registers and memory is more expensive than
34740 between two registers, you should define this macro to express the
34741 relative cost.
34742
34743 Also model the increased cost of moving QImode registers in non
34744 Q_REGS classes.
34745 */
34746 static inline int
34747 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
34748 int in)
34749 {
34750 int cost;
34751 if (FLOAT_CLASS_P (regclass))
34752 {
34753 int index;
34754 switch (mode)
34755 {
34756 case SFmode:
34757 index = 0;
34758 break;
34759 case DFmode:
34760 index = 1;
34761 break;
34762 case XFmode:
34763 index = 2;
34764 break;
34765 default:
34766 return 100;
34767 }
34768 if (in == 2)
34769 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
34770 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
34771 }
34772 if (SSE_CLASS_P (regclass))
34773 {
34774 int index;
34775 switch (GET_MODE_SIZE (mode))
34776 {
34777 case 4:
34778 index = 0;
34779 break;
34780 case 8:
34781 index = 1;
34782 break;
34783 case 16:
34784 index = 2;
34785 break;
34786 default:
34787 return 100;
34788 }
34789 if (in == 2)
34790 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
34791 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
34792 }
34793 if (MMX_CLASS_P (regclass))
34794 {
34795 int index;
34796 switch (GET_MODE_SIZE (mode))
34797 {
34798 case 4:
34799 index = 0;
34800 break;
34801 case 8:
34802 index = 1;
34803 break;
34804 default:
34805 return 100;
34806 }
34807 if (in == 2)
34808 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
34809 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
34810 }
34811 switch (GET_MODE_SIZE (mode))
34812 {
34813 case 1:
34814 if (Q_CLASS_P (regclass) || TARGET_64BIT)
34815 {
34816 if (!in)
34817 return ix86_cost->int_store[0];
34818 if (TARGET_PARTIAL_REG_DEPENDENCY
34819 && optimize_function_for_speed_p (cfun))
34820 cost = ix86_cost->movzbl_load;
34821 else
34822 cost = ix86_cost->int_load[0];
34823 if (in == 2)
34824 return MAX (cost, ix86_cost->int_store[0]);
34825 return cost;
34826 }
34827 else
34828 {
34829 if (in == 2)
34830 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
34831 if (in)
34832 return ix86_cost->movzbl_load;
34833 else
34834 return ix86_cost->int_store[0] + 4;
34835 }
34836 break;
34837 case 2:
34838 if (in == 2)
34839 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
34840 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
34841 default:
34842 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
34843 if (mode == TFmode)
34844 mode = XFmode;
34845 if (in == 2)
34846 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
34847 else if (in)
34848 cost = ix86_cost->int_load[2];
34849 else
34850 cost = ix86_cost->int_store[2];
34851 return (cost * (((int) GET_MODE_SIZE (mode)
34852 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
34853 }
34854 }
34855
34856 static int
34857 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
34858 bool in)
34859 {
34860 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
34861 }
34862
34863
34864 /* Return the cost of moving data from a register in class CLASS1 to
34865 one in class CLASS2.
34866
34867 It is not required that the cost always equal 2 when FROM is the same as TO;
34868 on some machines it is expensive to move between registers if they are not
34869 general registers. */
34870
34871 static int
34872 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
34873 reg_class_t class2_i)
34874 {
34875 enum reg_class class1 = (enum reg_class) class1_i;
34876 enum reg_class class2 = (enum reg_class) class2_i;
34877
34878 /* In case we require secondary memory, compute the cost of the store
34879 followed by the load. To avoid bad register allocation choices, we need
34880 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
34881
34882 if (inline_secondary_memory_needed (class1, class2, mode, 0))
34883 {
34884 int cost = 1;
34885
34886 cost += inline_memory_move_cost (mode, class1, 2);
34887 cost += inline_memory_move_cost (mode, class2, 2);
34888
34889 /* When copying from a general purpose register we may emit multiple
34890 stores followed by a single load, causing a memory size mismatch stall.
34891 Count this as an arbitrarily high cost of 20. */
34892 if (targetm.class_max_nregs (class1, mode)
34893 > targetm.class_max_nregs (class2, mode))
34894 cost += 20;
34895
34896 /* In the case of FP/MMX moves, the registers actually overlap, and we
34897 have to switch modes in order to treat them differently. */
34898 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
34899 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
34900 cost += 20;
34901
34902 return cost;
34903 }
34904
34905 /* Moves between SSE/MMX and integer unit are expensive. */
34906 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
34907 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
34908
34909 /* ??? By keeping the returned value relatively high, we limit the number
34910 of moves between integer and MMX/SSE registers for all targets.
34911 Additionally, the high value prevents a problem with x86_modes_tieable_p(),
34912 where integer modes in MMX/SSE registers are not tieable
34913 because of missing QImode and HImode moves to, from or between
34914 MMX/SSE registers. */
34915 return MAX (8, ix86_cost->mmxsse_to_integer);
34916
34917 if (MAYBE_FLOAT_CLASS_P (class1))
34918 return ix86_cost->fp_move;
34919 if (MAYBE_SSE_CLASS_P (class1))
34920 return ix86_cost->sse_move;
34921 if (MAYBE_MMX_CLASS_P (class1))
34922 return ix86_cost->mmx_move;
34923 return 2;
34924 }
34925
34926 /* Return TRUE if hard register REGNO can hold a value of machine-mode
34927 MODE. */
34928
34929 bool
34930 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
34931 {
34932 /* The flags register, and only the flags register, can hold CCmode values. */
34933 if (CC_REGNO_P (regno))
34934 return GET_MODE_CLASS (mode) == MODE_CC;
34935 if (GET_MODE_CLASS (mode) == MODE_CC
34936 || GET_MODE_CLASS (mode) == MODE_RANDOM
34937 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
34938 return false;
34939 if (STACK_REGNO_P (regno))
34940 return VALID_FP_MODE_P (mode);
34941 if (MASK_REGNO_P (regno))
34942 return VALID_MASK_REG_MODE (mode);
34943 if (BND_REGNO_P (regno))
34944 return VALID_BND_REG_MODE (mode);
34945 if (SSE_REGNO_P (regno))
34946 {
34947 /* We implement the move patterns for all vector modes into and
34948 out of SSE registers, even when no operation instructions
34949 are available. */
34950
34951 /* For AVX-512 we allow, regardless of regno:
34952 - XI mode
34953 - any 512-bit wide vector mode
34954 - any scalar mode. */
34955 if (TARGET_AVX512F
34956 && (mode == XImode
34957 || VALID_AVX512F_REG_MODE (mode)
34958 || VALID_AVX512F_SCALAR_MODE (mode)))
34959 return true;
34960
34961 /* xmm16-xmm31 are only available for AVX-512. */
34962 if (EXT_REX_SSE_REGNO_P (regno))
34963 return false;
34964
34965 /* OImode move is available only when AVX is enabled. */
34966 return ((TARGET_AVX && mode == OImode)
34967 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34968 || VALID_SSE_REG_MODE (mode)
34969 || VALID_SSE2_REG_MODE (mode)
34970 || VALID_MMX_REG_MODE (mode)
34971 || VALID_MMX_REG_MODE_3DNOW (mode));
34972 }
34973 if (MMX_REGNO_P (regno))
34974 {
34975 /* We implement the move patterns for 3DNOW modes even in MMX mode,
34976 so if the register is available at all, then we can move data of
34977 the given mode into or out of it. */
34978 return (VALID_MMX_REG_MODE (mode)
34979 || VALID_MMX_REG_MODE_3DNOW (mode));
34980 }
34981
34982 if (mode == QImode)
34983 {
34984 /* Take care with QImode values - they can live in non-QI regs,
34985 but then they do cause partial register stalls. */
34986 if (ANY_QI_REGNO_P (regno))
34987 return true;
34988 if (!TARGET_PARTIAL_REG_STALL)
34989 return true;
34990 /* LRA checks if the hard register is OK for the given mode.
34991 QImode values can live in non-QI regs, so we allow all
34992 registers here. */
34993 if (lra_in_progress)
34994 return true;
34995 return !can_create_pseudo_p ();
34996 }
34997 /* We handle both integer and floats in the general purpose registers. */
34998 else if (VALID_INT_MODE_P (mode))
34999 return true;
35000 else if (VALID_FP_MODE_P (mode))
35001 return true;
35002 else if (VALID_DFP_MODE_P (mode))
35003 return true;
35004 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
35005 on to use that value in smaller contexts, this can easily force a
35006 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
35007 supporting DImode, allow it. */
35008 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
35009 return true;
35010
35011 return false;
35012 }
35013
35014 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
35015 tieable integer mode. */
35016
35017 static bool
35018 ix86_tieable_integer_mode_p (enum machine_mode mode)
35019 {
35020 switch (mode)
35021 {
35022 case HImode:
35023 case SImode:
35024 return true;
35025
35026 case QImode:
35027 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
35028
35029 case DImode:
35030 return TARGET_64BIT;
35031
35032 default:
35033 return false;
35034 }
35035 }
35036
35037 /* Return true if MODE1 is accessible in a register that can hold MODE2
35038 without copying. That is, all register classes that can hold MODE2
35039 can also hold MODE1. */
35040
35041 bool
35042 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
35043 {
35044 if (mode1 == mode2)
35045 return true;
35046
35047 if (ix86_tieable_integer_mode_p (mode1)
35048 && ix86_tieable_integer_mode_p (mode2))
35049 return true;
35050
35051 /* MODE2 being XFmode implies fp stack or general regs, which means we
35052 can tie any smaller floating point modes to it. Note that we do not
35053 tie this with TFmode. */
35054 if (mode2 == XFmode)
35055 return mode1 == SFmode || mode1 == DFmode;
35056
35057 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
35058 that we can tie it with SFmode. */
35059 if (mode2 == DFmode)
35060 return mode1 == SFmode;
35061
35062 /* If MODE2 is only appropriate for an SSE register, then tie with
35063 any other mode acceptable to SSE registers. */
35064 if (GET_MODE_SIZE (mode2) == 32
35065 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35066 return (GET_MODE_SIZE (mode1) == 32
35067 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35068 if (GET_MODE_SIZE (mode2) == 16
35069 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
35070 return (GET_MODE_SIZE (mode1) == 16
35071 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
35072
35073 /* If MODE2 is appropriate for an MMX register, then tie
35074 with any other mode acceptable to MMX registers. */
35075 if (GET_MODE_SIZE (mode2) == 8
35076 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
35077 return (GET_MODE_SIZE (mode1) == 8
35078 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
35079
35080 return false;
35081 }
35082
35083 /* Return the cost of moving between two registers of mode MODE. */
35084
35085 static int
35086 ix86_set_reg_reg_cost (enum machine_mode mode)
35087 {
35088 unsigned int units = UNITS_PER_WORD;
35089
35090 switch (GET_MODE_CLASS (mode))
35091 {
35092 default:
35093 break;
35094
35095 case MODE_CC:
35096 units = GET_MODE_SIZE (CCmode);
35097 break;
35098
35099 case MODE_FLOAT:
35100 if ((TARGET_SSE && mode == TFmode)
35101 || (TARGET_80387 && mode == XFmode)
35102 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
35103 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
35104 units = GET_MODE_SIZE (mode);
35105 break;
35106
35107 case MODE_COMPLEX_FLOAT:
35108 if ((TARGET_SSE && mode == TCmode)
35109 || (TARGET_80387 && mode == XCmode)
35110 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
35111 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
35112 units = GET_MODE_SIZE (mode);
35113 break;
35114
35115 case MODE_VECTOR_INT:
35116 case MODE_VECTOR_FLOAT:
35117 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
35118 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
35119 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
35120 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
35121 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
35122 units = GET_MODE_SIZE (mode);
35123 }
35124
35125 /* Return the cost of moving between two registers of mode MODE,
35126 assuming that the move will be in pieces of at most UNITS bytes. */
35127 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
35128 }
35129
35130 /* Compute a (partial) cost for rtx X. Return true if the complete
35131 cost has been computed, and false if subexpressions should be
35132 scanned. In either case, *TOTAL contains the cost result. */
35133
35134 static bool
35135 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
35136 bool speed)
35137 {
35138 enum rtx_code code = (enum rtx_code) code_i;
35139 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
35140 enum machine_mode mode = GET_MODE (x);
35141 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
35142
35143 switch (code)
35144 {
35145 case SET:
35146 if (register_operand (SET_DEST (x), VOIDmode)
35147 && reg_or_0_operand (SET_SRC (x), VOIDmode))
35148 {
35149 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
35150 return true;
35151 }
35152 return false;
35153
35154 case CONST_INT:
35155 case CONST:
35156 case LABEL_REF:
35157 case SYMBOL_REF:
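/* Roughly: constants that need a full 64-bit immediate (movabs) are the
   most expensive; those valid only as sign-extended 32-bit immediates come
   next; PIC references to possibly non-local symbols cost one; the rest
   are essentially free. */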
35158 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
35159 *total = 3;
35160 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
35161 *total = 2;
35162 else if (flag_pic && SYMBOLIC_CONST (x)
35163 && (!TARGET_64BIT
35164 || (GET_CODE (x) != LABEL_REF
35165 && (GET_CODE (x) != SYMBOL_REF
35166 || !SYMBOL_REF_LOCAL_P (x)))))
35167 *total = 1;
35168 else
35169 *total = 0;
35170 return true;
35171
35172 case CONST_DOUBLE:
35173 if (mode == VOIDmode)
35174 {
35175 *total = 0;
35176 return true;
35177 }
35178 switch (standard_80387_constant_p (x))
35179 {
35180 case 1: /* 0.0 */
35181 *total = 1;
35182 return true;
35183 default: /* Other constants */
35184 *total = 2;
35185 return true;
35186 case 0:
35187 case -1:
35188 break;
35189 }
35190 if (SSE_FLOAT_MODE_P (mode))
35191 {
35192 case CONST_VECTOR:
35193 switch (standard_sse_constant_p (x))
35194 {
35195 case 0:
35196 break;
35197 case 1: /* 0: xor eliminates false dependency */
35198 *total = 0;
35199 return true;
35200 default: /* -1: cmp contains false dependency */
35201 *total = 1;
35202 return true;
35203 }
35204 }
35205 /* Fall back to (MEM (SYMBOL_REF)), since that's where
35206 it'll probably end up. Add a penalty for size. */
35207 *total = (COSTS_N_INSNS (1)
35208 + (flag_pic != 0 && !TARGET_64BIT)
35209 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
35210 return true;
35211
35212 case ZERO_EXTEND:
35213 /* The zero extension is often completely free on x86_64, so make
35214 it as cheap as possible. */
35215 if (TARGET_64BIT && mode == DImode
35216 && GET_MODE (XEXP (x, 0)) == SImode)
35217 *total = 1;
35218 else if (TARGET_ZERO_EXTEND_WITH_AND)
35219 *total = cost->add;
35220 else
35221 *total = cost->movzx;
35222 return false;
35223
35224 case SIGN_EXTEND:
35225 *total = cost->movsx;
35226 return false;
35227
35228 case ASHIFT:
35229 if (SCALAR_INT_MODE_P (mode)
35230 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
35231 && CONST_INT_P (XEXP (x, 1)))
35232 {
35233 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35234 if (value == 1)
35235 {
35236 *total = cost->add;
35237 return false;
35238 }
35239 if ((value == 2 || value == 3)
35240 && cost->lea <= cost->shift_const)
35241 {
35242 *total = cost->lea;
35243 return false;
35244 }
35245 }
35246 /* FALLTHRU */
35247
35248 case ROTATE:
35249 case ASHIFTRT:
35250 case LSHIFTRT:
35251 case ROTATERT:
35252 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35253 {
35254 /* ??? Should be SSE vector operation cost. */
35255 /* At least for published AMD latencies, this really is the same
35256 as the latency for a simple fpu operation like fabs. */
35257 /* V*QImode is emulated with 1-11 insns. */
35258 if (mode == V16QImode || mode == V32QImode)
35259 {
35260 int count = 11;
35261 if (TARGET_XOP && mode == V16QImode)
35262 {
35263 /* For XOP we use vpshab, which requires a broadcast of the
35264 value to the variable shift insn. For constants this
35265 means a V16Q const in mem; even when we can perform the
35266 shift with one insn, set the cost so as to prefer paddb. */
35267 if (CONSTANT_P (XEXP (x, 1)))
35268 {
35269 *total = (cost->fabs
35270 + rtx_cost (XEXP (x, 0), code, 0, speed)
35271 + (speed ? 2 : COSTS_N_BYTES (16)));
35272 return true;
35273 }
35274 count = 3;
35275 }
35276 else if (TARGET_SSSE3)
35277 count = 7;
35278 *total = cost->fabs * count;
35279 }
35280 else
35281 *total = cost->fabs;
35282 }
35283 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35284 {
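/* Double-word shifts. A constant shift is roughly two single-word shifts
   (plus extra insns when the count is above 32); a variable shift needs
   a longer conditional sequence unless the count is already masked by an
   AND. */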
35285 if (CONST_INT_P (XEXP (x, 1)))
35286 {
35287 if (INTVAL (XEXP (x, 1)) > 32)
35288 *total = cost->shift_const + COSTS_N_INSNS (2);
35289 else
35290 *total = cost->shift_const * 2;
35291 }
35292 else
35293 {
35294 if (GET_CODE (XEXP (x, 1)) == AND)
35295 *total = cost->shift_var * 2;
35296 else
35297 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
35298 }
35299 }
35300 else
35301 {
35302 if (CONST_INT_P (XEXP (x, 1)))
35303 *total = cost->shift_const;
35304 else if (GET_CODE (XEXP (x, 1)) == SUBREG
35305 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
35306 {
35307 /* Return the cost after shift-and truncation. */
35308 *total = cost->shift_var;
35309 return true;
35310 }
35311 else
35312 *total = cost->shift_var;
35313 }
35314 return false;
35315
35316 case FMA:
35317 {
35318 rtx sub;
35319
35320 gcc_assert (FLOAT_MODE_P (mode));
35321 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
35322
35323 /* ??? SSE scalar/vector cost should be used here. */
35324 /* ??? Bald assumption that fma has the same cost as fmul. */
35325 *total = cost->fmul;
35326 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
35327
35328 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
35329 sub = XEXP (x, 0);
35330 if (GET_CODE (sub) == NEG)
35331 sub = XEXP (sub, 0);
35332 *total += rtx_cost (sub, FMA, 0, speed);
35333
35334 sub = XEXP (x, 2);
35335 if (GET_CODE (sub) == NEG)
35336 sub = XEXP (sub, 0);
35337 *total += rtx_cost (sub, FMA, 2, speed);
35338 return true;
35339 }
35340
35341 case MULT:
35342 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35343 {
35344 /* ??? SSE scalar cost should be used here. */
35345 *total = cost->fmul;
35346 return false;
35347 }
35348 else if (X87_FLOAT_MODE_P (mode))
35349 {
35350 *total = cost->fmul;
35351 return false;
35352 }
35353 else if (FLOAT_MODE_P (mode))
35354 {
35355 /* ??? SSE vector cost should be used here. */
35356 *total = cost->fmul;
35357 return false;
35358 }
35359 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35360 {
35361 /* V*QImode is emulated with 7-13 insns. */
35362 if (mode == V16QImode || mode == V32QImode)
35363 {
35364 int extra = 11;
35365 if (TARGET_XOP && mode == V16QImode)
35366 extra = 5;
35367 else if (TARGET_SSSE3)
35368 extra = 6;
35369 *total = cost->fmul * 2 + cost->fabs * extra;
35370 }
35371 /* V*DImode is emulated with 5-8 insns. */
35372 else if (mode == V2DImode || mode == V4DImode)
35373 {
35374 if (TARGET_XOP && mode == V2DImode)
35375 *total = cost->fmul * 2 + cost->fabs * 3;
35376 else
35377 *total = cost->fmul * 3 + cost->fabs * 5;
35378 }
35379 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
35380 insns, including two PMULUDQ. */
35381 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
35382 *total = cost->fmul * 2 + cost->fabs * 5;
35383 else
35384 *total = cost->fmul;
35385 return false;
35386 }
35387 else
35388 {
35389 rtx op0 = XEXP (x, 0);
35390 rtx op1 = XEXP (x, 1);
35391 int nbits;
35392 if (CONST_INT_P (XEXP (x, 1)))
35393 {
35394 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
35395 for (nbits = 0; value != 0; value &= value - 1)
35396 nbits++;
35397 }
35398 else
35399 /* This is arbitrary. */
35400 nbits = 7;
35401
35402 /* Compute costs correctly for widening multiplication. */
35403 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
35404 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
35405 == GET_MODE_SIZE (mode))
35406 {
35407 int is_mulwiden = 0;
35408 enum machine_mode inner_mode = GET_MODE (op0);
35409
35410 if (GET_CODE (op0) == GET_CODE (op1))
35411 is_mulwiden = 1, op1 = XEXP (op1, 0);
35412 else if (CONST_INT_P (op1))
35413 {
35414 if (GET_CODE (op0) == SIGN_EXTEND)
35415 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
35416 == INTVAL (op1);
35417 else
35418 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
35419 }
35420
35421 if (is_mulwiden)
35422 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
35423 }
35424
35425 *total = (cost->mult_init[MODE_INDEX (mode)]
35426 + nbits * cost->mult_bit
35427 + rtx_cost (op0, outer_code, opno, speed)
35428 + rtx_cost (op1, outer_code, opno, speed));
35429
35430 return true;
35431 }
35432
35433 case DIV:
35434 case UDIV:
35435 case MOD:
35436 case UMOD:
35437 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35438 /* ??? SSE cost should be used here. */
35439 *total = cost->fdiv;
35440 else if (X87_FLOAT_MODE_P (mode))
35441 *total = cost->fdiv;
35442 else if (FLOAT_MODE_P (mode))
35443 /* ??? SSE vector cost should be used here. */
35444 *total = cost->fdiv;
35445 else
35446 *total = cost->divide[MODE_INDEX (mode)];
35447 return false;
35448
35449 case PLUS:
35450 if (GET_MODE_CLASS (mode) == MODE_INT
35451 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
35452 {
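/* Recognize address-style computations (base + index*scale + displacement)
   that can be carried out by a single LEA. */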
35453 if (GET_CODE (XEXP (x, 0)) == PLUS
35454 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
35455 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
35456 && CONSTANT_P (XEXP (x, 1)))
35457 {
35458 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
35459 if (val == 2 || val == 4 || val == 8)
35460 {
35461 *total = cost->lea;
35462 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35463 outer_code, opno, speed);
35464 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
35465 outer_code, opno, speed);
35466 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35467 return true;
35468 }
35469 }
35470 else if (GET_CODE (XEXP (x, 0)) == MULT
35471 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
35472 {
35473 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
35474 if (val == 2 || val == 4 || val == 8)
35475 {
35476 *total = cost->lea;
35477 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35478 outer_code, opno, speed);
35479 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35480 return true;
35481 }
35482 }
35483 else if (GET_CODE (XEXP (x, 0)) == PLUS)
35484 {
35485 *total = cost->lea;
35486 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
35487 outer_code, opno, speed);
35488 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
35489 outer_code, opno, speed);
35490 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
35491 return true;
35492 }
35493 }
35494 /* FALLTHRU */
35495
35496 case MINUS:
35497 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35498 {
35499 /* ??? SSE cost should be used here. */
35500 *total = cost->fadd;
35501 return false;
35502 }
35503 else if (X87_FLOAT_MODE_P (mode))
35504 {
35505 *total = cost->fadd;
35506 return false;
35507 }
35508 else if (FLOAT_MODE_P (mode))
35509 {
35510 /* ??? SSE vector cost should be used here. */
35511 *total = cost->fadd;
35512 return false;
35513 }
35514 /* FALLTHRU */
35515
35516 case AND:
35517 case IOR:
35518 case XOR:
35519 if (GET_MODE_CLASS (mode) == MODE_INT
35520 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35521 {
35522 *total = (cost->add * 2
35523 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
35524 << (GET_MODE (XEXP (x, 0)) != DImode))
35525 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
35526 << (GET_MODE (XEXP (x, 1)) != DImode)));
35527 return true;
35528 }
35529 /* FALLTHRU */
35530
35531 case NEG:
35532 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35533 {
35534 /* ??? SSE cost should be used here. */
35535 *total = cost->fchs;
35536 return false;
35537 }
35538 else if (X87_FLOAT_MODE_P (mode))
35539 {
35540 *total = cost->fchs;
35541 return false;
35542 }
35543 else if (FLOAT_MODE_P (mode))
35544 {
35545 /* ??? SSE vector cost should be used here. */
35546 *total = cost->fchs;
35547 return false;
35548 }
35549 /* FALLTHRU */
35550
35551 case NOT:
35552 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
35553 {
35554 /* ??? Should be SSE vector operation cost. */
35555 /* At least for published AMD latencies, this really is the same
35556 as the latency for a simple fpu operation like fabs. */
35557 *total = cost->fabs;
35558 }
35559 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
35560 *total = cost->add * 2;
35561 else
35562 *total = cost->add;
35563 return false;
35564
35565 case COMPARE:
35566 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
35567 && XEXP (XEXP (x, 0), 1) == const1_rtx
35568 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
35569 && XEXP (x, 1) == const0_rtx)
35570 {
35571 /* This kind of construct is implemented using test[bwl].
35572 Treat it as if we had an AND. */
35573 *total = (cost->add
35574 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
35575 + rtx_cost (const1_rtx, outer_code, opno, speed));
35576 return true;
35577 }
35578 return false;
35579
35580 case FLOAT_EXTEND:
35581 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
35582 *total = 0;
35583 return false;
35584
35585 case ABS:
35586 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35587 /* ??? SSE cost should be used here. */
35588 *total = cost->fabs;
35589 else if (X87_FLOAT_MODE_P (mode))
35590 *total = cost->fabs;
35591 else if (FLOAT_MODE_P (mode))
35592 /* ??? SSE vector cost should be used here. */
35593 *total = cost->fabs;
35594 return false;
35595
35596 case SQRT:
35597 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
35598 /* ??? SSE cost should be used here. */
35599 *total = cost->fsqrt;
35600 else if (X87_FLOAT_MODE_P (mode))
35601 *total = cost->fsqrt;
35602 else if (FLOAT_MODE_P (mode))
35603 /* ??? SSE vector cost should be used here. */
35604 *total = cost->fsqrt;
35605 return false;
35606
35607 case UNSPEC:
35608 if (XINT (x, 1) == UNSPEC_TP)
35609 *total = 0;
35610 return false;
35611
35612 case VEC_SELECT:
35613 case VEC_CONCAT:
35614 case VEC_MERGE:
35615 case VEC_DUPLICATE:
35616 /* ??? Assume all of these vector manipulation patterns are
35617 recognizable, in which case they all pretty much have the
35618 same cost. */
35619 *total = cost->fabs;
35620 return true;
35621
35622 default:
35623 return false;
35624 }
35625 }
35626
35627 #if TARGET_MACHO
35628
35629 static int current_machopic_label_num;
35630
35631 /* Given a symbol name and its associated stub, write out the
35632 definition of the stub. */
35633
35634 void
35635 machopic_output_stub (FILE *file, const char *symb, const char *stub)
35636 {
35637 unsigned int length;
35638 char *binder_name, *symbol_name, lazy_ptr_name[32];
35639 int label = ++current_machopic_label_num;
35640
35641 /* For 64-bit we shouldn't get here. */
35642 gcc_assert (!TARGET_64BIT);
35643
35644 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
35645 symb = targetm.strip_name_encoding (symb);
35646
35647 length = strlen (stub);
35648 binder_name = XALLOCAVEC (char, length + 32);
35649 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
35650
35651 length = strlen (symb);
35652 symbol_name = XALLOCAVEC (char, length + 32);
35653 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
35654
35655 sprintf (lazy_ptr_name, "L%d$lz", label);
35656
35657 if (MACHOPIC_ATT_STUB)
35658 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
35659 else if (MACHOPIC_PURE)
35660 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
35661 else
35662 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
35663
35664 fprintf (file, "%s:\n", stub);
35665 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35666
35667 if (MACHOPIC_ATT_STUB)
35668 {
35669 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
35670 }
35671 else if (MACHOPIC_PURE)
35672 {
35673 /* PIC stub. */
35674 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35675 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
35676 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
35677 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
35678 label, lazy_ptr_name, label);
35679 fprintf (file, "\tjmp\t*%%ecx\n");
35680 }
35681 else
35682 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
35683
35684 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
35685 it needs no stub-binding-helper. */
35686 if (MACHOPIC_ATT_STUB)
35687 return;
35688
35689 fprintf (file, "%s:\n", binder_name);
35690
35691 if (MACHOPIC_PURE)
35692 {
35693 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
35694 fprintf (file, "\tpushl\t%%ecx\n");
35695 }
35696 else
35697 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
35698
35699 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
35700
35701 /* N.B. Keep the correspondence of these
35702 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
35703 old-pic/new-pic/non-pic stubs; altering this will break
35704 compatibility with existing dylibs. */
35705 if (MACHOPIC_PURE)
35706 {
35707 /* 25-byte PIC stub using "CALL get_pc_thunk". */
35708 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
35709 }
35710 else
35711 /* 16-byte -mdynamic-no-pic stub. */
35712 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
35713
35714 fprintf (file, "%s:\n", lazy_ptr_name);
35715 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
35716 fprintf (file, ASM_LONG "%s\n", binder_name);
35717 }
35718 #endif /* TARGET_MACHO */
35719
35720 /* Order the registers for register allocator. */
35721
35722 void
35723 x86_order_regs_for_local_alloc (void)
35724 {
35725 int pos = 0;
35726 int i;
35727
35728 /* First allocate the local general purpose registers. */
35729 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35730 if (GENERAL_REGNO_P (i) && call_used_regs[i])
35731 reg_alloc_order [pos++] = i;
35732
35733 /* Global general purpose registers. */
35734 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
35735 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
35736 reg_alloc_order [pos++] = i;
35737
35738 /* x87 registers come first in case we are doing FP math
35739 using them. */
35740 if (!TARGET_SSE_MATH)
35741 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35742 reg_alloc_order [pos++] = i;
35743
35744 /* SSE registers. */
35745 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
35746 reg_alloc_order [pos++] = i;
35747 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
35748 reg_alloc_order [pos++] = i;
35749
35750 /* Extended REX SSE registers. */
35751 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
35752 reg_alloc_order [pos++] = i;
35753
35754 /* Mask registers. */
35755 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
35756 reg_alloc_order [pos++] = i;
35757
35758 /* MPX bound registers. */
35759 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
35760 reg_alloc_order [pos++] = i;
35761
35762 /* x87 registers. */
35763 if (TARGET_SSE_MATH)
35764 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
35765 reg_alloc_order [pos++] = i;
35766
35767 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
35768 reg_alloc_order [pos++] = i;
35769
35770 /* Initialize the rest of the array, as we do not allocate some registers
35771 at all. */
35772 while (pos < FIRST_PSEUDO_REGISTER)
35773 reg_alloc_order [pos++] = 0;
35774 }
35775
35776 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
35777 in struct attribute_spec handler. */
35778 static tree
35779 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
35780 tree args,
35781 int flags ATTRIBUTE_UNUSED,
35782 bool *no_add_attrs)
35783 {
35784 if (TREE_CODE (*node) != FUNCTION_TYPE
35785 && TREE_CODE (*node) != METHOD_TYPE
35786 && TREE_CODE (*node) != FIELD_DECL
35787 && TREE_CODE (*node) != TYPE_DECL)
35788 {
35789 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35790 name);
35791 *no_add_attrs = true;
35792 return NULL_TREE;
35793 }
35794 if (TARGET_64BIT)
35795 {
35796 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
35797 name);
35798 *no_add_attrs = true;
35799 return NULL_TREE;
35800 }
35801 if (is_attribute_p ("callee_pop_aggregate_return", name))
35802 {
35803 tree cst;
35804
35805 cst = TREE_VALUE (args);
35806 if (TREE_CODE (cst) != INTEGER_CST)
35807 {
35808 warning (OPT_Wattributes,
35809 "%qE attribute requires an integer constant argument",
35810 name);
35811 *no_add_attrs = true;
35812 }
35813 else if (compare_tree_int (cst, 0) != 0
35814 && compare_tree_int (cst, 1) != 0)
35815 {
35816 warning (OPT_Wattributes,
35817 "argument to %qE attribute is neither zero, nor one",
35818 name);
35819 *no_add_attrs = true;
35820 }
35821
35822 return NULL_TREE;
35823 }
35824
35825 return NULL_TREE;
35826 }
35827
35828 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
35829 struct attribute_spec.handler. */
35830 static tree
35831 ix86_handle_abi_attribute (tree *node, tree name,
35832 tree args ATTRIBUTE_UNUSED,
35833 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35834 {
35835 if (TREE_CODE (*node) != FUNCTION_TYPE
35836 && TREE_CODE (*node) != METHOD_TYPE
35837 && TREE_CODE (*node) != FIELD_DECL
35838 && TREE_CODE (*node) != TYPE_DECL)
35839 {
35840 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35841 name);
35842 *no_add_attrs = true;
35843 return NULL_TREE;
35844 }
35845
35846 /* Can combine regparm with all attributes but fastcall. */
35847 if (is_attribute_p ("ms_abi", name))
35848 {
35849 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
35850 {
35851 error ("ms_abi and sysv_abi attributes are not compatible");
35852 }
35853
35854 return NULL_TREE;
35855 }
35856 else if (is_attribute_p ("sysv_abi", name))
35857 {
35858 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
35859 {
35860 error ("ms_abi and sysv_abi attributes are not compatible");
35861 }
35862
35863 return NULL_TREE;
35864 }
35865
35866 return NULL_TREE;
35867 }
35868
35869 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
35870 struct attribute_spec.handler. */
35871 static tree
35872 ix86_handle_struct_attribute (tree *node, tree name,
35873 tree args ATTRIBUTE_UNUSED,
35874 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35875 {
35876 tree *type = NULL;
35877 if (DECL_P (*node))
35878 {
35879 if (TREE_CODE (*node) == TYPE_DECL)
35880 type = &TREE_TYPE (*node);
35881 }
35882 else
35883 type = node;
35884
35885 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
35886 {
35887 warning (OPT_Wattributes, "%qE attribute ignored",
35888 name);
35889 *no_add_attrs = true;
35890 }
35891
35892 else if ((is_attribute_p ("ms_struct", name)
35893 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
35894 || ((is_attribute_p ("gcc_struct", name)
35895 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
35896 {
35897 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
35898 name);
35899 *no_add_attrs = true;
35900 }
35901
35902 return NULL_TREE;
35903 }
35904
35905 static tree
35906 ix86_handle_fndecl_attribute (tree *node, tree name,
35907 tree args ATTRIBUTE_UNUSED,
35908 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
35909 {
35910 if (TREE_CODE (*node) != FUNCTION_DECL)
35911 {
35912 warning (OPT_Wattributes, "%qE attribute only applies to functions",
35913 name);
35914 *no_add_attrs = true;
35915 }
35916 return NULL_TREE;
35917 }
35918
35919 static bool
35920 ix86_ms_bitfield_layout_p (const_tree record_type)
35921 {
35922 return ((TARGET_MS_BITFIELD_LAYOUT
35923 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
35924 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
35925 }
35926
35927 /* Returns an expression indicating where the this parameter is
35928 located on entry to the FUNCTION. */
35929
35930 static rtx
35931 x86_this_parameter (tree function)
35932 {
35933 tree type = TREE_TYPE (function);
35934 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
35935 int nregs;
35936
35937 if (TARGET_64BIT)
35938 {
35939 const int *parm_regs;
35940
35941 if (ix86_function_type_abi (type) == MS_ABI)
35942 parm_regs = x86_64_ms_abi_int_parameter_registers;
35943 else
35944 parm_regs = x86_64_int_parameter_registers;
35945 return gen_rtx_REG (Pmode, parm_regs[aggr]);
35946 }
35947
35948 nregs = ix86_function_regparm (type, function);
35949
35950 if (nregs > 0 && !stdarg_p (type))
35951 {
35952 int regno;
35953 unsigned int ccvt = ix86_get_callcvt (type);
35954
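/* With fastcall the first arguments arrive in %ecx and %edx; when an
   aggregate is returned in memory, the hidden return pointer takes %ecx,
   so THIS lands in %edx. */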
35955 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
35956 regno = aggr ? DX_REG : CX_REG;
35957 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
35958 {
35959 regno = CX_REG;
35960 if (aggr)
35961 return gen_rtx_MEM (SImode,
35962 plus_constant (Pmode, stack_pointer_rtx, 4));
35963 }
35964 else
35965 {
35966 regno = AX_REG;
35967 if (aggr)
35968 {
35969 regno = DX_REG;
35970 if (nregs == 1)
35971 return gen_rtx_MEM (SImode,
35972 plus_constant (Pmode,
35973 stack_pointer_rtx, 4));
35974 }
35975 }
35976 return gen_rtx_REG (SImode, regno);
35977 }
35978
35979 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
35980 aggr ? 8 : 4));
35981 }
35982
35983 /* Determine whether x86_output_mi_thunk can succeed. */
35984
35985 static bool
35986 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
35987 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
35988 HOST_WIDE_INT vcall_offset, const_tree function)
35989 {
35990 /* 64-bit can handle anything. */
35991 if (TARGET_64BIT)
35992 return true;
35993
35994 /* For 32-bit, everything's fine if we have one free register. */
35995 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
35996 return true;
35997
35998 /* Need a free register for vcall_offset. */
35999 if (vcall_offset)
36000 return false;
36001
36002 /* Need a free register for GOT references. */
36003 if (flag_pic && !targetm.binds_local_p (function))
36004 return false;
36005
36006 /* Otherwise ok. */
36007 return true;
36008 }
36009
36010 /* Output the assembler code for a thunk function. THUNK_DECL is the
36011 declaration for the thunk function itself, FUNCTION is the decl for
36012 the target function. DELTA is an immediate constant offset to be
36013 added to THIS. If VCALL_OFFSET is nonzero, the word at
36014 *(*this + vcall_offset) should be added to THIS. */
36015
36016 static void
36017 x86_output_mi_thunk (FILE *file,
36018 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
36019 HOST_WIDE_INT vcall_offset, tree function)
36020 {
36021 rtx this_param = x86_this_parameter (function);
36022 rtx this_reg, tmp, fnaddr;
36023 unsigned int tmp_regno;
36024
36025 if (TARGET_64BIT)
36026 tmp_regno = R10_REG;
36027 else
36028 {
36029 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
36030 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
36031 tmp_regno = AX_REG;
36032 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
36033 tmp_regno = DX_REG;
36034 else
36035 tmp_regno = CX_REG;
36036 }
36037
36038 emit_note (NOTE_INSN_PROLOGUE_END);
36039
36040 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
36041 pull it in now and let DELTA benefit. */
36042 if (REG_P (this_param))
36043 this_reg = this_param;
36044 else if (vcall_offset)
36045 {
36046 /* Put the this parameter into %eax. */
36047 this_reg = gen_rtx_REG (Pmode, AX_REG);
36048 emit_move_insn (this_reg, this_param);
36049 }
36050 else
36051 this_reg = NULL_RTX;
36052
36053 /* Adjust the this parameter by a fixed constant. */
36054 if (delta)
36055 {
36056 rtx delta_rtx = GEN_INT (delta);
36057 rtx delta_dst = this_reg ? this_reg : this_param;
36058
36059 if (TARGET_64BIT)
36060 {
36061 if (!x86_64_general_operand (delta_rtx, Pmode))
36062 {
36063 tmp = gen_rtx_REG (Pmode, tmp_regno);
36064 emit_move_insn (tmp, delta_rtx);
36065 delta_rtx = tmp;
36066 }
36067 }
36068
36069 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
36070 }
36071
36072 /* Adjust the this parameter by a value stored in the vtable. */
36073 if (vcall_offset)
36074 {
36075 rtx vcall_addr, vcall_mem, this_mem;
36076
36077 tmp = gen_rtx_REG (Pmode, tmp_regno);
36078
36079 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
36080 if (Pmode != ptr_mode)
36081 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
36082 emit_move_insn (tmp, this_mem);
36083
36084 /* Adjust the this parameter. */
36085 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
36086 if (TARGET_64BIT
36087 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
36088 {
36089 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
36090 emit_move_insn (tmp2, GEN_INT (vcall_offset));
36091 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
36092 }
36093
36094 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
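/* For x32, where Pmode is wider than ptr_mode, the 32-bit pointer value
   must be zero-extended while the vtable entry is added. */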
36095 if (Pmode != ptr_mode)
36096 emit_insn (gen_addsi_1_zext (this_reg,
36097 gen_rtx_REG (ptr_mode,
36098 REGNO (this_reg)),
36099 vcall_mem));
36100 else
36101 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
36102 }
36103
36104 /* If necessary, drop THIS back to its stack slot. */
36105 if (this_reg && this_reg != this_param)
36106 emit_move_insn (this_param, this_reg);
36107
36108 fnaddr = XEXP (DECL_RTL (function), 0);
36109 if (TARGET_64BIT)
36110 {
36111 if (!flag_pic || targetm.binds_local_p (function)
36112 || TARGET_PECOFF)
36113 ;
36114 else
36115 {
36116 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
36117 tmp = gen_rtx_CONST (Pmode, tmp);
36118 fnaddr = gen_rtx_MEM (Pmode, tmp);
36119 }
36120 }
36121 else
36122 {
36123 if (!flag_pic || targetm.binds_local_p (function))
36124 ;
36125 #if TARGET_MACHO
36126 else if (TARGET_MACHO)
36127 {
36128 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
36129 fnaddr = XEXP (fnaddr, 0);
36130 }
36131 #endif /* TARGET_MACHO */
36132 else
36133 {
36134 tmp = gen_rtx_REG (Pmode, CX_REG);
36135 output_set_got (tmp, NULL_RTX);
36136
36137 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
36138 fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
36139 fnaddr = gen_rtx_MEM (Pmode, fnaddr);
36140 }
36141 }
36142
36143 /* Our sibling call patterns do not allow memories, because we have no
36144 predicate that can distinguish between frame and non-frame memory.
36145 For our purposes here, we can get away with (ab)using a jump pattern,
36146 because we're going to do no optimization. */
36147 if (MEM_P (fnaddr))
36148 emit_jump_insn (gen_indirect_jump (fnaddr));
36149 else
36150 {
36151 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
36152 fnaddr = legitimize_pic_address (fnaddr,
36153 gen_rtx_REG (Pmode, tmp_regno));
36154
36155 if (!sibcall_insn_operand (fnaddr, word_mode))
36156 {
36157 tmp = gen_rtx_REG (word_mode, tmp_regno);
36158 if (GET_MODE (fnaddr) != word_mode)
36159 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
36160 emit_move_insn (tmp, fnaddr);
36161 fnaddr = tmp;
36162 }
36163
36164 tmp = gen_rtx_MEM (QImode, fnaddr);
36165 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
36166 tmp = emit_call_insn (tmp);
36167 SIBLING_CALL_P (tmp) = 1;
36168 }
36169 emit_barrier ();
36170
36171 /* Emit just enough of rest_of_compilation to get the insns emitted.
36172 Note that use_thunk calls assemble_start_function et al. */
36173 tmp = get_insns ();
36174 shorten_branches (tmp);
36175 final_start_function (tmp, file, 1);
36176 final (tmp, file, 1);
36177 final_end_function ();
36178 }
36179
36180 static void
36181 x86_file_start (void)
36182 {
36183 default_file_start ();
36184 #if TARGET_MACHO
36185 darwin_file_start ();
36186 #endif
36187 if (X86_FILE_START_VERSION_DIRECTIVE)
36188 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
36189 if (X86_FILE_START_FLTUSED)
36190 fputs ("\t.global\t__fltused\n", asm_out_file);
36191 if (ix86_asm_dialect == ASM_INTEL)
36192 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
36193 }
36194
36195 int
36196 x86_field_alignment (tree field, int computed)
36197 {
36198 enum machine_mode mode;
36199 tree type = TREE_TYPE (field);
36200
36201 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
36202 return computed;
36203 mode = TYPE_MODE (strip_array_types (type));
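/* Without -malign-double, the traditional 32-bit psABI aligns double,
   long long and similar types to only 4 bytes inside structures. */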
36204 if (mode == DFmode || mode == DCmode
36205 || GET_MODE_CLASS (mode) == MODE_INT
36206 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
36207 return MIN (32, computed);
36208 return computed;
36209 }
36210
36211 /* Output assembler code to FILE to increment profiler label # LABELNO
36212 for profiling a function entry. */
36213 void
36214 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
36215 {
36216 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
36217 : MCOUNT_NAME);
36218
36219 if (TARGET_64BIT)
36220 {
36221 #ifndef NO_PROFILE_COUNTERS
36222 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
36223 #endif
36224
36225 if (!TARGET_PECOFF && flag_pic)
36226 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
36227 else
36228 fprintf (file, "\tcall\t%s\n", mcount_name);
36229 }
36230 else if (flag_pic)
36231 {
36232 #ifndef NO_PROFILE_COUNTERS
36233 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
36234 LPREFIX, labelno);
36235 #endif
36236 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
36237 }
36238 else
36239 {
36240 #ifndef NO_PROFILE_COUNTERS
36241 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
36242 LPREFIX, labelno);
36243 #endif
36244 fprintf (file, "\tcall\t%s\n", mcount_name);
36245 }
36246 }
36247
36248 /* We don't have exact information about the insn sizes, but we may assume
36249 quite safely that we are informed about all 1 byte insns and memory
36250 address sizes. This is enough to eliminate unnecessary padding in
36251 99% of cases. */
36252
36253 static int
36254 min_insn_size (rtx insn)
36255 {
36256 int l = 0, len;
36257
36258 if (!INSN_P (insn) || !active_insn_p (insn))
36259 return 0;
36260
36261 /* Discard alignments we've emitted, as well as jump instructions. */
36262 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
36263 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
36264 return 0;
36265
36266 /* Important case - calls are always 5 bytes.
36267 It is common to have many calls in a row. */
36268 if (CALL_P (insn)
36269 && symbolic_reference_mentioned_p (PATTERN (insn))
36270 && !SIBLING_CALL_P (insn))
36271 return 5;
36272 len = get_attr_length (insn);
36273 if (len <= 1)
36274 return 1;
36275
36276 /* For normal instructions we rely on get_attr_length being exact,
36277 with a few exceptions. */
36278 if (!JUMP_P (insn))
36279 {
36280 enum attr_type type = get_attr_type (insn);
36281
36282 switch (type)
36283 {
36284 case TYPE_MULTI:
36285 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
36286 || asm_noperands (PATTERN (insn)) >= 0)
36287 return 0;
36288 break;
36289 case TYPE_OTHER:
36290 case TYPE_FCMP:
36291 break;
36292 default:
36293 /* Otherwise trust get_attr_length. */
36294 return len;
36295 }
36296
36297 l = get_attr_length_address (insn);
36298 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
36299 l = 4;
36300 }
36301 if (l)
36302 return 1+l;
36303 else
36304 return 2;
36305 }
36306
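/* Note: the value above is only a conservative lower bound, used by the
   jump-mispredict padding pass below.  For instance a direct call counts as
   exactly 5 bytes, and an instruction whose only known size contribution is
   a 4-byte address or symbolic operand counts as 1 + 4 = 5 bytes.  */
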
36307 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36308
36309 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
36310 16 byte window. */
36311
36312 static void
36313 ix86_avoid_jump_mispredicts (void)
36314 {
36315 rtx insn, start = get_insns ();
36316 int nbytes = 0, njumps = 0;
36317 int isjump = 0;
36318
36319 /* Look for all minimal intervals of instructions containing 4 jumps.
36320 The intervals are bounded by START and INSN. NBYTES is the total
36321 size of the instructions in the interval, including INSN but not
36322 including START. When NBYTES is smaller than 16 bytes, it is possible
36323 that START and INSN end up in the same 16 byte window.
36324
36325 The smallest offset in the window at which INSN can start occurs when
36326 START ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
36327 We add a p2align to the 16 byte window with a max skip of
36328 15 - NBYTES + sizeof (INSN). */
36329 for (insn = start; insn; insn = NEXT_INSN (insn))
36330 {
36331 int min_size;
36332
36333 if (LABEL_P (insn))
36334 {
36335 int align = label_to_alignment (insn);
36336 int max_skip = label_to_max_skip (insn);
36337
36338 if (max_skip > 15)
36339 max_skip = 15;
36340 /* If align > 3, only up to 16 - max_skip - 1 bytes can already be
36341 in the current 16 byte window, because otherwise
36342 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
36343 bytes to reach a 16 byte boundary. */
36344 if (align <= 0
36345 || (align <= 3 && max_skip != (1 << align) - 1))
36346 max_skip = 0;
36347 if (dump_file)
36348 fprintf (dump_file, "Label %i with max_skip %i\n",
36349 INSN_UID (insn), max_skip);
36350 if (max_skip)
36351 {
36352 while (nbytes + max_skip >= 16)
36353 {
36354 start = NEXT_INSN (start);
36355 if (JUMP_P (start) || CALL_P (start))
36356 njumps--, isjump = 1;
36357 else
36358 isjump = 0;
36359 nbytes -= min_insn_size (start);
36360 }
36361 }
36362 continue;
36363 }
36364
36365 min_size = min_insn_size (insn);
36366 nbytes += min_size;
36367 if (dump_file)
36368 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
36369 INSN_UID (insn), min_size);
36370 if (JUMP_P (insn) || CALL_P (insn))
36371 njumps++;
36372 else
36373 continue;
36374
36375 while (njumps > 3)
36376 {
36377 start = NEXT_INSN (start);
36378 if (JUMP_P (start) || CALL_P (start))
36379 njumps--, isjump = 1;
36380 else
36381 isjump = 0;
36382 nbytes -= min_insn_size (start);
36383 }
36384 gcc_assert (njumps >= 0);
36385 if (dump_file)
36386 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
36387 INSN_UID (start), INSN_UID (insn), nbytes);
36388
36389 if (njumps == 3 && isjump && nbytes < 16)
36390 {
36391 int padsize = 15 - nbytes + min_insn_size (insn);
36392
36393 if (dump_file)
36394 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
36395 INSN_UID (insn), padsize);
36396 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
36397 }
36398 }
36399 }
36400 #endif
36401
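/* As a rough example of the arithmetic above: with nbytes = 12 and an INSN
   estimated at 2 bytes, the pad emitted before INSN is a max-skip alignment
   of 15 - 12 + 2 = 5 bytes, enough to keep the fourth jump from sharing a
   16 byte window with the three preceding ones.  */
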
36402 /* AMD Athlon works faster
36403 when RET is not the destination of a conditional jump or directly preceded
36404 by another jump instruction. We avoid the penalty by inserting a NOP just
36405 before the RET instructions in such cases. */
36406 static void
36407 ix86_pad_returns (void)
36408 {
36409 edge e;
36410 edge_iterator ei;
36411
36412 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36413 {
36414 basic_block bb = e->src;
36415 rtx ret = BB_END (bb);
36416 rtx prev;
36417 bool replace = false;
36418
36419 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
36420 || optimize_bb_for_size_p (bb))
36421 continue;
36422 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
36423 if (active_insn_p (prev) || LABEL_P (prev))
36424 break;
36425 if (prev && LABEL_P (prev))
36426 {
36427 edge e;
36428 edge_iterator ei;
36429
36430 FOR_EACH_EDGE (e, ei, bb->preds)
36431 if (EDGE_FREQUENCY (e) && e->src->index >= 0
36432 && !(e->flags & EDGE_FALLTHRU))
36433 {
36434 replace = true;
36435 break;
36436 }
36437 }
36438 if (!replace)
36439 {
36440 prev = prev_active_insn (ret);
36441 if (prev
36442 && ((JUMP_P (prev) && any_condjump_p (prev))
36443 || CALL_P (prev)))
36444 replace = true;
36445 /* Empty functions get a branch mispredict even when
36446 the jump destination is not visible to us. */
36447 if (!prev && !optimize_function_for_size_p (cfun))
36448 replace = true;
36449 }
36450 if (replace)
36451 {
36452 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
36453 delete_insn (ret);
36454 }
36455 }
36456 }
36457
36458 /* Count the minimum number of instructions in BB. Return 4 if the
36459 number of instructions >= 4. */
36460
36461 static int
36462 ix86_count_insn_bb (basic_block bb)
36463 {
36464 rtx insn;
36465 int insn_count = 0;
36466
36467 /* Count number of instructions in this block. Return 4 if the number
36468 of instructions >= 4. */
36469 FOR_BB_INSNS (bb, insn)
36470 {
36471 /* This only happens in exit blocks. */
36472 if (JUMP_P (insn)
36473 && ANY_RETURN_P (PATTERN (insn)))
36474 break;
36475
36476 if (NONDEBUG_INSN_P (insn)
36477 && GET_CODE (PATTERN (insn)) != USE
36478 && GET_CODE (PATTERN (insn)) != CLOBBER)
36479 {
36480 insn_count++;
36481 if (insn_count >= 4)
36482 return insn_count;
36483 }
36484 }
36485
36486 return insn_count;
36487 }
36488
36489
36490 /* Count the minimum number of instructions in a code path ending in BB.
36491 Return 4 if the number of instructions >= 4. */
36492
36493 static int
36494 ix86_count_insn (basic_block bb)
36495 {
36496 edge e;
36497 edge_iterator ei;
36498 int min_prev_count;
36499
36500 /* Only bother counting instructions along paths with no
36501 more than 2 basic blocks between entry and exit. Given
36502 that BB has an edge to exit, determine if a predecessor
36503 of BB has an edge from entry. If so, compute the number
36504 of instructions in the predecessor block. If there
36505 happen to be multiple such blocks, compute the minimum. */
36506 min_prev_count = 4;
36507 FOR_EACH_EDGE (e, ei, bb->preds)
36508 {
36509 edge prev_e;
36510 edge_iterator prev_ei;
36511
36512 if (e->src == ENTRY_BLOCK_PTR)
36513 {
36514 min_prev_count = 0;
36515 break;
36516 }
36517 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
36518 {
36519 if (prev_e->src == ENTRY_BLOCK_PTR)
36520 {
36521 int count = ix86_count_insn_bb (e->src);
36522 if (count < min_prev_count)
36523 min_prev_count = count;
36524 break;
36525 }
36526 }
36527 }
36528
36529 if (min_prev_count < 4)
36530 min_prev_count += ix86_count_insn_bb (bb);
36531
36532 return min_prev_count;
36533 }
36534
36535 /* Pad short functions to 4 instructions. */
36536
36537 static void
36538 ix86_pad_short_function (void)
36539 {
36540 edge e;
36541 edge_iterator ei;
36542
36543 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36544 {
36545 rtx ret = BB_END (e->src);
36546 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
36547 {
36548 int insn_count = ix86_count_insn (e->src);
36549
36550 /* Pad short function. */
36551 if (insn_count < 4)
36552 {
36553 rtx insn = ret;
36554
36555 /* Find epilogue. */
36556 while (insn
36557 && (!NOTE_P (insn)
36558 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
36559 insn = PREV_INSN (insn);
36560
36561 if (!insn)
36562 insn = ret;
36563
36564 /* Two NOPs count as one instruction. */
36565 insn_count = 2 * (4 - insn_count);
36566 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
36567 }
36568 }
36569 }
36570 }
36571
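/* For example, a function whose body contains only two real instructions
   receives 2 * (4 - 2) = 4 NOPs in front of its epilogue; the doubling
   reflects the "two NOPs count as one instruction" rule above.  */
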
36572 /* Fix up a Windows system unwinder issue. If an EH region falls through into
36573 the epilogue, the Windows system unwinder will apply epilogue logic and
36574 produce incorrect offsets. This can be avoided by adding a nop between
36575 the last insn that can throw and the first insn of the epilogue. */
36576
36577 static void
36578 ix86_seh_fixup_eh_fallthru (void)
36579 {
36580 edge e;
36581 edge_iterator ei;
36582
36583 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
36584 {
36585 rtx insn, next;
36586
36587 /* Find the beginning of the epilogue. */
36588 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
36589 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
36590 break;
36591 if (insn == NULL)
36592 continue;
36593
36594 /* We only care about preceding insns that can throw. */
36595 insn = prev_active_insn (insn);
36596 if (insn == NULL || !can_throw_internal (insn))
36597 continue;
36598
36599 /* Do not separate calls from their debug information. */
36600 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
36601 if (NOTE_P (next)
36602 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
36603 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
36604 insn = next;
36605 else
36606 break;
36607
36608 emit_insn_after (gen_nops (const1_rtx), insn);
36609 }
36610 }
36611
36612 /* Implement machine specific optimizations. We implement padding of returns
36613 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
36614 static void
36615 ix86_reorg (void)
36616 {
36617 /* We are freeing block_for_insn in the toplev to keep compatibility
36618 with old MDEP_REORGS that are not CFG based. Recompute it now. */
36619 compute_bb_for_insn ();
36620
36621 if (TARGET_SEH && current_function_has_exception_handlers ())
36622 ix86_seh_fixup_eh_fallthru ();
36623
36624 if (optimize && optimize_function_for_speed_p (cfun))
36625 {
36626 if (TARGET_PAD_SHORT_FUNCTION)
36627 ix86_pad_short_function ();
36628 else if (TARGET_PAD_RETURNS)
36629 ix86_pad_returns ();
36630 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
36631 if (TARGET_FOUR_JUMP_LIMIT)
36632 ix86_avoid_jump_mispredicts ();
36633 #endif
36634 }
36635 }
36636
36637 /* Return nonzero when a QImode register that must be represented via a REX
36638 prefix is used. */
36639 bool
36640 x86_extended_QIreg_mentioned_p (rtx insn)
36641 {
36642 int i;
36643 extract_insn_cached (insn);
36644 for (i = 0; i < recog_data.n_operands; i++)
36645 if (GENERAL_REG_P (recog_data.operand[i])
36646 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
36647 return true;
36648 return false;
36649 }
36650
36651 /* Return nonzero when P points to a register encoded via a REX prefix.
36652 Called via for_each_rtx. */
36653 static int
36654 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
36655 {
36656 unsigned int regno;
36657 if (!REG_P (*p))
36658 return 0;
36659 regno = REGNO (*p);
36660 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
36661 }
36662
36663 /* Return true when INSN mentions a register that must be encoded using a
36664 REX prefix. */
36665 bool
36666 x86_extended_reg_mentioned_p (rtx insn)
36667 {
36668 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
36669 extended_reg_mentioned_1, NULL);
36670 }
36671
36672 /* If profitable, negate (without causing overflow) integer constant
36673 of mode MODE at location LOC. Return true in this case. */
36674 bool
36675 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
36676 {
36677 HOST_WIDE_INT val;
36678
36679 if (!CONST_INT_P (*loc))
36680 return false;
36681
36682 switch (mode)
36683 {
36684 case DImode:
36685 /* DImode x86_64 constants must fit in 32 bits. */
36686 gcc_assert (x86_64_immediate_operand (*loc, mode));
36687
36688 mode = SImode;
36689 break;
36690
36691 case SImode:
36692 case HImode:
36693 case QImode:
36694 break;
36695
36696 default:
36697 gcc_unreachable ();
36698 }
36699
36700 /* Avoid overflows. */
36701 if (mode_signbit_p (mode, *loc))
36702 return false;
36703
36704 val = INTVAL (*loc);
36705
36706 /* Prefer `subl $4,%eax' to `addl $-4,%eax'.
36707 Exception: -128 encodes smaller than 128, so negate 128 but never -128. */
36708 if ((val < 0 && val != -128)
36709 || val == 128)
36710 {
36711 *loc = GEN_INT (-val);
36712 return true;
36713 }
36714
36715 return false;
36716 }
36717
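/* A small worked example of the rule above: (plus:SI (reg) (const_int -4))
   has its constant rewritten to 4 so the caller can emit `subl $4,%eax'
   instead of `addl $-4,%eax'.  128 is likewise negated to -128, since -128
   still fits in a sign-extended 8-bit immediate while +128 needs a 32-bit
   one; -128 itself is left untouched for the same reason.  */
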
36718 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
36719 optabs would emit if we didn't have TFmode patterns. */
36720
36721 void
36722 x86_emit_floatuns (rtx operands[2])
36723 {
36724 rtx neglab, donelab, i0, i1, f0, in, out;
36725 enum machine_mode mode, inmode;
36726
36727 inmode = GET_MODE (operands[1]);
36728 gcc_assert (inmode == SImode || inmode == DImode);
36729
36730 out = operands[0];
36731 in = force_reg (inmode, operands[1]);
36732 mode = GET_MODE (out);
36733 neglab = gen_label_rtx ();
36734 donelab = gen_label_rtx ();
36735 f0 = gen_reg_rtx (mode);
36736
36737 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
36738
36739 expand_float (out, in, 0);
36740
36741 emit_jump_insn (gen_jump (donelab));
36742 emit_barrier ();
36743
36744 emit_label (neglab);
36745
36746 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
36747 1, OPTAB_DIRECT);
36748 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
36749 1, OPTAB_DIRECT);
36750 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
36751
36752 expand_float (f0, i0, 0);
36753
36754 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
36755
36756 emit_label (donelab);
36757 }
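
/* Sketch of the sequence emitted above: a non-negative input is converted
   directly; a negative (high-bit-set) input is halved as (in >> 1) | (in & 1),
   converted, and the result doubled.  Folding the shifted-out bit back in
   keeps the final rounding correct.  */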
36758 \f
36759 /* AVX512F supports 64-byte integer vector operations,
36760 so the longest vector we are faced with is V64QImode. */
36761 #define MAX_VECT_LEN 64
36762
36763 struct expand_vec_perm_d
36764 {
36765 rtx target, op0, op1;
36766 unsigned char perm[MAX_VECT_LEN];
36767 enum machine_mode vmode;
36768 unsigned char nelt;
36769 bool one_operand_p;
36770 bool testing_p;
36771 };
36772
36773 static bool canonicalize_perm (struct expand_vec_perm_d *d);
36774 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
36775 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
36776
36777 /* Get a vector mode of the same size as the original but with elements
36778 twice as wide. This is only guaranteed to apply to integral vectors. */
36779
36780 static inline enum machine_mode
36781 get_mode_wider_vector (enum machine_mode o)
36782 {
36783 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
36784 enum machine_mode n = GET_MODE_WIDER_MODE (o);
36785 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
36786 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
36787 return n;
36788 }
36789
36790 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36791 with all elements equal to VAR. Return true if successful. */
36792
36793 static bool
36794 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
36795 rtx target, rtx val)
36796 {
36797 bool ok;
36798
36799 switch (mode)
36800 {
36801 case V2SImode:
36802 case V2SFmode:
36803 if (!mmx_ok)
36804 return false;
36805 /* FALLTHRU */
36806
36807 case V4DFmode:
36808 case V4DImode:
36809 case V8SFmode:
36810 case V8SImode:
36811 case V2DFmode:
36812 case V2DImode:
36813 case V4SFmode:
36814 case V4SImode:
36815 {
36816 rtx insn, dup;
36817
36818 /* First attempt to recognize VAL as-is. */
36819 dup = gen_rtx_VEC_DUPLICATE (mode, val);
36820 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
36821 if (recog_memoized (insn) < 0)
36822 {
36823 rtx seq;
36824 /* If that fails, force VAL into a register. */
36825
36826 start_sequence ();
36827 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
36828 seq = get_insns ();
36829 end_sequence ();
36830 if (seq)
36831 emit_insn_before (seq, insn);
36832
36833 ok = recog_memoized (insn) >= 0;
36834 gcc_assert (ok);
36835 }
36836 }
36837 return true;
36838
36839 case V4HImode:
36840 if (!mmx_ok)
36841 return false;
36842 if (TARGET_SSE || TARGET_3DNOW_A)
36843 {
36844 rtx x;
36845
36846 val = gen_lowpart (SImode, val);
36847 x = gen_rtx_TRUNCATE (HImode, val);
36848 x = gen_rtx_VEC_DUPLICATE (mode, x);
36849 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36850 return true;
36851 }
36852 goto widen;
36853
36854 case V8QImode:
36855 if (!mmx_ok)
36856 return false;
36857 goto widen;
36858
36859 case V8HImode:
36860 if (TARGET_SSE2)
36861 {
36862 struct expand_vec_perm_d dperm;
36863 rtx tmp1, tmp2;
36864
36865 permute:
36866 memset (&dperm, 0, sizeof (dperm));
36867 dperm.target = target;
36868 dperm.vmode = mode;
36869 dperm.nelt = GET_MODE_NUNITS (mode);
36870 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
36871 dperm.one_operand_p = true;
36872
36873 /* Extend to SImode using a paradoxical SUBREG. */
36874 tmp1 = gen_reg_rtx (SImode);
36875 emit_move_insn (tmp1, gen_lowpart (SImode, val));
36876
36877 /* Insert the SImode value as low element of a V4SImode vector. */
36878 tmp2 = gen_reg_rtx (V4SImode);
36879 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
36880 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
36881
36882 ok = (expand_vec_perm_1 (&dperm)
36883 || expand_vec_perm_broadcast_1 (&dperm));
36884 gcc_assert (ok);
36885 return ok;
36886 }
36887 goto widen;
36888
36889 case V16QImode:
36890 if (TARGET_SSE2)
36891 goto permute;
36892 goto widen;
36893
36894 widen:
36895 /* Replicate the value once into the next wider mode and recurse. */
36896 {
36897 enum machine_mode smode, wsmode, wvmode;
36898 rtx x;
36899
36900 smode = GET_MODE_INNER (mode);
36901 wvmode = get_mode_wider_vector (mode);
36902 wsmode = GET_MODE_INNER (wvmode);
36903
36904 val = convert_modes (wsmode, smode, val, true);
36905 x = expand_simple_binop (wsmode, ASHIFT, val,
36906 GEN_INT (GET_MODE_BITSIZE (smode)),
36907 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36908 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
36909
36910 x = gen_reg_rtx (wvmode);
36911 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
36912 gcc_assert (ok);
36913 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
36914 return ok;
36915 }
36916
36917 case V16HImode:
36918 case V32QImode:
36919 {
36920 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
36921 rtx x = gen_reg_rtx (hvmode);
36922
36923 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
36924 gcc_assert (ok);
36925
36926 x = gen_rtx_VEC_CONCAT (mode, x, x);
36927 emit_insn (gen_rtx_SET (VOIDmode, target, x));
36928 }
36929 return true;
36930
36931 default:
36932 return false;
36933 }
36934 }
36935
36936 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36937 whose ONE_VAR element is VAR, and other elements are zero. Return true
36938 if successful. */
36939
36940 static bool
36941 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
36942 rtx target, rtx var, int one_var)
36943 {
36944 enum machine_mode vsimode;
36945 rtx new_target;
36946 rtx x, tmp;
36947 bool use_vector_set = false;
36948
36949 switch (mode)
36950 {
36951 case V2DImode:
36952 /* For SSE4.1, we normally use vector set. But if the second
36953 element is zero and inter-unit moves are OK, we use movq
36954 instead. */
36955 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
36956 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
36957 && one_var == 0));
36958 break;
36959 case V16QImode:
36960 case V4SImode:
36961 case V4SFmode:
36962 use_vector_set = TARGET_SSE4_1;
36963 break;
36964 case V8HImode:
36965 use_vector_set = TARGET_SSE2;
36966 break;
36967 case V4HImode:
36968 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
36969 break;
36970 case V32QImode:
36971 case V16HImode:
36972 case V8SImode:
36973 case V8SFmode:
36974 case V4DFmode:
36975 use_vector_set = TARGET_AVX;
36976 break;
36977 case V4DImode:
36978 /* Use ix86_expand_vector_set in 64bit mode only. */
36979 use_vector_set = TARGET_AVX && TARGET_64BIT;
36980 break;
36981 default:
36982 break;
36983 }
36984
36985 if (use_vector_set)
36986 {
36987 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
36988 var = force_reg (GET_MODE_INNER (mode), var);
36989 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36990 return true;
36991 }
36992
36993 switch (mode)
36994 {
36995 case V2SFmode:
36996 case V2SImode:
36997 if (!mmx_ok)
36998 return false;
36999 /* FALLTHRU */
37000
37001 case V2DFmode:
37002 case V2DImode:
37003 if (one_var != 0)
37004 return false;
37005 var = force_reg (GET_MODE_INNER (mode), var);
37006 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
37007 emit_insn (gen_rtx_SET (VOIDmode, target, x));
37008 return true;
37009
37010 case V4SFmode:
37011 case V4SImode:
37012 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
37013 new_target = gen_reg_rtx (mode);
37014 else
37015 new_target = target;
37016 var = force_reg (GET_MODE_INNER (mode), var);
37017 x = gen_rtx_VEC_DUPLICATE (mode, var);
37018 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
37019 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
37020 if (one_var != 0)
37021 {
37022 /* We need to shuffle the value to the correct position, so
37023 create a new pseudo to store the intermediate result. */
37024
37025 /* With SSE2, we can use the integer shuffle insns. */
37026 if (mode != V4SFmode && TARGET_SSE2)
37027 {
37028 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
37029 const1_rtx,
37030 GEN_INT (one_var == 1 ? 0 : 1),
37031 GEN_INT (one_var == 2 ? 0 : 1),
37032 GEN_INT (one_var == 3 ? 0 : 1)));
37033 if (target != new_target)
37034 emit_move_insn (target, new_target);
37035 return true;
37036 }
37037
37038 /* Otherwise convert the intermediate result to V4SFmode and
37039 use the SSE1 shuffle instructions. */
37040 if (mode != V4SFmode)
37041 {
37042 tmp = gen_reg_rtx (V4SFmode);
37043 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
37044 }
37045 else
37046 tmp = new_target;
37047
37048 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
37049 const1_rtx,
37050 GEN_INT (one_var == 1 ? 0 : 1),
37051 GEN_INT (one_var == 2 ? 0+4 : 1+4),
37052 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
37053
37054 if (mode != V4SFmode)
37055 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
37056 else if (tmp != target)
37057 emit_move_insn (target, tmp);
37058 }
37059 else if (target != new_target)
37060 emit_move_insn (target, new_target);
37061 return true;
37062
37063 case V8HImode:
37064 case V16QImode:
37065 vsimode = V4SImode;
37066 goto widen;
37067 case V4HImode:
37068 case V8QImode:
37069 if (!mmx_ok)
37070 return false;
37071 vsimode = V2SImode;
37072 goto widen;
37073 widen:
37074 if (one_var != 0)
37075 return false;
37076
37077 /* Zero extend the variable element to SImode and recurse. */
37078 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
37079
37080 x = gen_reg_rtx (vsimode);
37081 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
37082 var, one_var))
37083 gcc_unreachable ();
37084
37085 emit_move_insn (target, gen_lowpart (mode, x));
37086 return true;
37087
37088 default:
37089 return false;
37090 }
37091 }
37092
37093 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
37094 consisting of the values in VALS. It is known that all elements
37095 except ONE_VAR are constants. Return true if successful. */
37096
37097 static bool
37098 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
37099 rtx target, rtx vals, int one_var)
37100 {
37101 rtx var = XVECEXP (vals, 0, one_var);
37102 enum machine_mode wmode;
37103 rtx const_vec, x;
37104
37105 const_vec = copy_rtx (vals);
37106 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
37107 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
37108
37109 switch (mode)
37110 {
37111 case V2DFmode:
37112 case V2DImode:
37113 case V2SFmode:
37114 case V2SImode:
37115 /* For the two element vectors, it's just as easy to use
37116 the general case. */
37117 return false;
37118
37119 case V4DImode:
37120 /* Use ix86_expand_vector_set in 64bit mode only. */
37121 if (!TARGET_64BIT)
37122 return false;
37123 case V4DFmode:
37124 case V8SFmode:
37125 case V8SImode:
37126 case V16HImode:
37127 case V32QImode:
37128 case V4SFmode:
37129 case V4SImode:
37130 case V8HImode:
37131 case V4HImode:
37132 break;
37133
37134 case V16QImode:
37135 if (TARGET_SSE4_1)
37136 break;
37137 wmode = V8HImode;
37138 goto widen;
37139 case V8QImode:
37140 wmode = V4HImode;
37141 goto widen;
37142 widen:
37143 /* There's no way to set one QImode entry easily. Combine
37144 the variable value with its adjacent constant value, and
37145 promote to an HImode set. */
37146 x = XVECEXP (vals, 0, one_var ^ 1);
37147 if (one_var & 1)
37148 {
37149 var = convert_modes (HImode, QImode, var, true);
37150 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
37151 NULL_RTX, 1, OPTAB_LIB_WIDEN);
37152 x = GEN_INT (INTVAL (x) & 0xff);
37153 }
37154 else
37155 {
37156 var = convert_modes (HImode, QImode, var, true);
37157 x = gen_int_mode (INTVAL (x) << 8, HImode);
37158 }
37159 if (x != const0_rtx)
37160 var = expand_simple_binop (HImode, IOR, var, x, var,
37161 1, OPTAB_LIB_WIDEN);
37162
37163 x = gen_reg_rtx (wmode);
37164 emit_move_insn (x, gen_lowpart (wmode, const_vec));
37165 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
37166
37167 emit_move_insn (target, gen_lowpart (mode, x));
37168 return true;
37169
37170 default:
37171 return false;
37172 }
37173
37174 emit_move_insn (target, const_vec);
37175 ix86_expand_vector_set (mmx_ok, target, var, one_var);
37176 return true;
37177 }
37178
37179 /* A subroutine of ix86_expand_vector_init_general. Use vector
37180 concatenate to handle the most general case: all values variable,
37181 and none identical. */
37182
37183 static void
37184 ix86_expand_vector_init_concat (enum machine_mode mode,
37185 rtx target, rtx *ops, int n)
37186 {
37187 enum machine_mode cmode, hmode = VOIDmode;
37188 rtx first[8], second[4];
37189 rtvec v;
37190 int i, j;
37191
37192 switch (n)
37193 {
37194 case 2:
37195 switch (mode)
37196 {
37197 case V8SImode:
37198 cmode = V4SImode;
37199 break;
37200 case V8SFmode:
37201 cmode = V4SFmode;
37202 break;
37203 case V4DImode:
37204 cmode = V2DImode;
37205 break;
37206 case V4DFmode:
37207 cmode = V2DFmode;
37208 break;
37209 case V4SImode:
37210 cmode = V2SImode;
37211 break;
37212 case V4SFmode:
37213 cmode = V2SFmode;
37214 break;
37215 case V2DImode:
37216 cmode = DImode;
37217 break;
37218 case V2SImode:
37219 cmode = SImode;
37220 break;
37221 case V2DFmode:
37222 cmode = DFmode;
37223 break;
37224 case V2SFmode:
37225 cmode = SFmode;
37226 break;
37227 default:
37228 gcc_unreachable ();
37229 }
37230
37231 if (!register_operand (ops[1], cmode))
37232 ops[1] = force_reg (cmode, ops[1]);
37233 if (!register_operand (ops[0], cmode))
37234 ops[0] = force_reg (cmode, ops[0]);
37235 emit_insn (gen_rtx_SET (VOIDmode, target,
37236 gen_rtx_VEC_CONCAT (mode, ops[0],
37237 ops[1])));
37238 break;
37239
37240 case 4:
37241 switch (mode)
37242 {
37243 case V4DImode:
37244 cmode = V2DImode;
37245 break;
37246 case V4DFmode:
37247 cmode = V2DFmode;
37248 break;
37249 case V4SImode:
37250 cmode = V2SImode;
37251 break;
37252 case V4SFmode:
37253 cmode = V2SFmode;
37254 break;
37255 default:
37256 gcc_unreachable ();
37257 }
37258 goto half;
37259
37260 case 8:
37261 switch (mode)
37262 {
37263 case V8SImode:
37264 cmode = V2SImode;
37265 hmode = V4SImode;
37266 break;
37267 case V8SFmode:
37268 cmode = V2SFmode;
37269 hmode = V4SFmode;
37270 break;
37271 default:
37272 gcc_unreachable ();
37273 }
37274 goto half;
37275
37276 half:
37277 /* FIXME: We process inputs backward to help RA. PR 36222. */
37278 i = n - 1;
37279 j = (n >> 1) - 1;
37280 for (; i > 0; i -= 2, j--)
37281 {
37282 first[j] = gen_reg_rtx (cmode);
37283 v = gen_rtvec (2, ops[i - 1], ops[i]);
37284 ix86_expand_vector_init (false, first[j],
37285 gen_rtx_PARALLEL (cmode, v));
37286 }
37287
37288 n >>= 1;
37289 if (n > 2)
37290 {
37291 gcc_assert (hmode != VOIDmode);
37292 for (i = j = 0; i < n; i += 2, j++)
37293 {
37294 second[j] = gen_reg_rtx (hmode);
37295 ix86_expand_vector_init_concat (hmode, second [j],
37296 &first [i], 2);
37297 }
37298 n >>= 1;
37299 ix86_expand_vector_init_concat (mode, target, second, n);
37300 }
37301 else
37302 ix86_expand_vector_init_concat (mode, target, first, n);
37303 break;
37304
37305 default:
37306 gcc_unreachable ();
37307 }
37308 }
37309
37310 /* A subroutine of ix86_expand_vector_init_general. Use vector
37311 interleave to handle the most general case: all values variable,
37312 and none identical. */
37313
37314 static void
37315 ix86_expand_vector_init_interleave (enum machine_mode mode,
37316 rtx target, rtx *ops, int n)
37317 {
37318 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
37319 int i, j;
37320 rtx op0, op1;
37321 rtx (*gen_load_even) (rtx, rtx, rtx);
37322 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
37323 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
37324
37325 switch (mode)
37326 {
37327 case V8HImode:
37328 gen_load_even = gen_vec_setv8hi;
37329 gen_interleave_first_low = gen_vec_interleave_lowv4si;
37330 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37331 inner_mode = HImode;
37332 first_imode = V4SImode;
37333 second_imode = V2DImode;
37334 third_imode = VOIDmode;
37335 break;
37336 case V16QImode:
37337 gen_load_even = gen_vec_setv16qi;
37338 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
37339 gen_interleave_second_low = gen_vec_interleave_lowv4si;
37340 inner_mode = QImode;
37341 first_imode = V8HImode;
37342 second_imode = V4SImode;
37343 third_imode = V2DImode;
37344 break;
37345 default:
37346 gcc_unreachable ();
37347 }
37348
37349 for (i = 0; i < n; i++)
37350 {
37351 /* Extend the odd element to SImode using a paradoxical SUBREG. */
37352 op0 = gen_reg_rtx (SImode);
37353 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
37354
37355 /* Insert the SImode value as low element of V4SImode vector. */
37356 op1 = gen_reg_rtx (V4SImode);
37357 op0 = gen_rtx_VEC_MERGE (V4SImode,
37358 gen_rtx_VEC_DUPLICATE (V4SImode,
37359 op0),
37360 CONST0_RTX (V4SImode),
37361 const1_rtx);
37362 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
37363
37364 /* Cast the V4SImode vector back to a vector in the original mode. */
37365 op0 = gen_reg_rtx (mode);
37366 emit_move_insn (op0, gen_lowpart (mode, op1));
37367
37368 /* Load even elements into the second position. */
37369 emit_insn (gen_load_even (op0,
37370 force_reg (inner_mode,
37371 ops [i + i + 1]),
37372 const1_rtx));
37373
37374 /* Cast vector to FIRST_IMODE vector. */
37375 ops[i] = gen_reg_rtx (first_imode);
37376 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
37377 }
37378
37379 /* Interleave low FIRST_IMODE vectors. */
37380 for (i = j = 0; i < n; i += 2, j++)
37381 {
37382 op0 = gen_reg_rtx (first_imode);
37383 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
37384
37385 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
37386 ops[j] = gen_reg_rtx (second_imode);
37387 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
37388 }
37389
37390 /* Interleave low SECOND_IMODE vectors. */
37391 switch (second_imode)
37392 {
37393 case V4SImode:
37394 for (i = j = 0; i < n / 2; i += 2, j++)
37395 {
37396 op0 = gen_reg_rtx (second_imode);
37397 emit_insn (gen_interleave_second_low (op0, ops[i],
37398 ops[i + 1]));
37399
37400 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
37401 vector. */
37402 ops[j] = gen_reg_rtx (third_imode);
37403 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
37404 }
37405 second_imode = V2DImode;
37406 gen_interleave_second_low = gen_vec_interleave_lowv2di;
37407 /* FALLTHRU */
37408
37409 case V2DImode:
37410 op0 = gen_reg_rtx (second_imode);
37411 emit_insn (gen_interleave_second_low (op0, ops[0],
37412 ops[1]));
37413
37414 /* Cast the SECOND_IMODE vector back to a vector in the original
37415 mode. */
37416 emit_insn (gen_rtx_SET (VOIDmode, target,
37417 gen_lowpart (mode, op0)));
37418 break;
37419
37420 default:
37421 gcc_unreachable ();
37422 }
37423 }
37424
37425 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
37426 all values variable, and none identical. */
37427
37428 static void
37429 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
37430 rtx target, rtx vals)
37431 {
37432 rtx ops[32], op0, op1;
37433 enum machine_mode half_mode = VOIDmode;
37434 int n, i;
37435
37436 switch (mode)
37437 {
37438 case V2SFmode:
37439 case V2SImode:
37440 if (!mmx_ok && !TARGET_SSE)
37441 break;
37442 /* FALLTHRU */
37443
37444 case V8SFmode:
37445 case V8SImode:
37446 case V4DFmode:
37447 case V4DImode:
37448 case V4SFmode:
37449 case V4SImode:
37450 case V2DFmode:
37451 case V2DImode:
37452 n = GET_MODE_NUNITS (mode);
37453 for (i = 0; i < n; i++)
37454 ops[i] = XVECEXP (vals, 0, i);
37455 ix86_expand_vector_init_concat (mode, target, ops, n);
37456 return;
37457
37458 case V32QImode:
37459 half_mode = V16QImode;
37460 goto half;
37461
37462 case V16HImode:
37463 half_mode = V8HImode;
37464 goto half;
37465
37466 half:
37467 n = GET_MODE_NUNITS (mode);
37468 for (i = 0; i < n; i++)
37469 ops[i] = XVECEXP (vals, 0, i);
37470 op0 = gen_reg_rtx (half_mode);
37471 op1 = gen_reg_rtx (half_mode);
37472 ix86_expand_vector_init_interleave (half_mode, op0, ops,
37473 n >> 2);
37474 ix86_expand_vector_init_interleave (half_mode, op1,
37475 &ops [n >> 1], n >> 2);
37476 emit_insn (gen_rtx_SET (VOIDmode, target,
37477 gen_rtx_VEC_CONCAT (mode, op0, op1)));
37478 return;
37479
37480 case V16QImode:
37481 if (!TARGET_SSE4_1)
37482 break;
37483 /* FALLTHRU */
37484
37485 case V8HImode:
37486 if (!TARGET_SSE2)
37487 break;
37488
37489 /* Don't use ix86_expand_vector_init_interleave if we can't
37490 move from GPR to SSE register directly. */
37491 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
37492 break;
37493
37494 n = GET_MODE_NUNITS (mode);
37495 for (i = 0; i < n; i++)
37496 ops[i] = XVECEXP (vals, 0, i);
37497 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
37498 return;
37499
37500 case V4HImode:
37501 case V8QImode:
37502 break;
37503
37504 default:
37505 gcc_unreachable ();
37506 }
37507
37508 {
37509 int i, j, n_elts, n_words, n_elt_per_word;
37510 enum machine_mode inner_mode;
37511 rtx words[4], shift;
37512
37513 inner_mode = GET_MODE_INNER (mode);
37514 n_elts = GET_MODE_NUNITS (mode);
37515 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
37516 n_elt_per_word = n_elts / n_words;
37517 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
37518
37519 for (i = 0; i < n_words; ++i)
37520 {
37521 rtx word = NULL_RTX;
37522
37523 for (j = 0; j < n_elt_per_word; ++j)
37524 {
37525 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
37526 elt = convert_modes (word_mode, inner_mode, elt, true);
37527
37528 if (j == 0)
37529 word = elt;
37530 else
37531 {
37532 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
37533 word, 1, OPTAB_LIB_WIDEN);
37534 word = expand_simple_binop (word_mode, IOR, word, elt,
37535 word, 1, OPTAB_LIB_WIDEN);
37536 }
37537 }
37538
37539 words[i] = word;
37540 }
37541
37542 if (n_words == 1)
37543 emit_move_insn (target, gen_lowpart (mode, words[0]));
37544 else if (n_words == 2)
37545 {
37546 rtx tmp = gen_reg_rtx (mode);
37547 emit_clobber (tmp);
37548 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
37549 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
37550 emit_move_insn (target, tmp);
37551 }
37552 else if (n_words == 4)
37553 {
37554 rtx tmp = gen_reg_rtx (V4SImode);
37555 gcc_assert (word_mode == SImode);
37556 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
37557 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
37558 emit_move_insn (target, gen_lowpart (mode, tmp));
37559 }
37560 else
37561 gcc_unreachable ();
37562 }
37563 }
37564
37565 /* Initialize vector TARGET via VALS. Suppress the use of MMX
37566 instructions unless MMX_OK is true. */
37567
37568 void
37569 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
37570 {
37571 enum machine_mode mode = GET_MODE (target);
37572 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37573 int n_elts = GET_MODE_NUNITS (mode);
37574 int n_var = 0, one_var = -1;
37575 bool all_same = true, all_const_zero = true;
37576 int i;
37577 rtx x;
37578
37579 for (i = 0; i < n_elts; ++i)
37580 {
37581 x = XVECEXP (vals, 0, i);
37582 if (!(CONST_INT_P (x)
37583 || GET_CODE (x) == CONST_DOUBLE
37584 || GET_CODE (x) == CONST_FIXED))
37585 n_var++, one_var = i;
37586 else if (x != CONST0_RTX (inner_mode))
37587 all_const_zero = false;
37588 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
37589 all_same = false;
37590 }
37591
37592 /* Constants are best loaded from the constant pool. */
37593 if (n_var == 0)
37594 {
37595 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
37596 return;
37597 }
37598
37599 /* If all values are identical, broadcast the value. */
37600 if (all_same
37601 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
37602 XVECEXP (vals, 0, 0)))
37603 return;
37604
37605 /* Values where only one element is non-constant are best loaded from
37606 the pool and overwritten via a move later. */
37607 if (n_var == 1)
37608 {
37609 if (all_const_zero
37610 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
37611 XVECEXP (vals, 0, one_var),
37612 one_var))
37613 return;
37614
37615 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
37616 return;
37617 }
37618
37619 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
37620 }
37621
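/* To summarize the dispatch above: all-constant vectors come straight from
   the constant pool, a splat of a single value goes through the duplicate
   path, a single variable element goes through the one-nonzero / one-var
   paths, and everything else falls back to the general initializer.  */
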
37622 void
37623 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
37624 {
37625 enum machine_mode mode = GET_MODE (target);
37626 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37627 enum machine_mode half_mode;
37628 bool use_vec_merge = false;
37629 rtx tmp;
37630 static rtx (*gen_extract[6][2]) (rtx, rtx)
37631 = {
37632 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
37633 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
37634 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
37635 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
37636 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
37637 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
37638 };
37639 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
37640 = {
37641 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
37642 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
37643 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
37644 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
37645 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
37646 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
37647 };
37648 int i, j, n;
37649
37650 switch (mode)
37651 {
37652 case V2SFmode:
37653 case V2SImode:
37654 if (mmx_ok)
37655 {
37656 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37657 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
37658 if (elt == 0)
37659 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37660 else
37661 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37662 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37663 return;
37664 }
37665 break;
37666
37667 case V2DImode:
37668 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
37669 if (use_vec_merge)
37670 break;
37671
37672 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
37673 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
37674 if (elt == 0)
37675 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
37676 else
37677 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
37678 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37679 return;
37680
37681 case V2DFmode:
37682 {
37683 rtx op0, op1;
37684
37685 /* For the two element vectors, we implement a VEC_CONCAT with
37686 the extraction of the other element. */
37687
37688 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
37689 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
37690
37691 if (elt == 0)
37692 op0 = val, op1 = tmp;
37693 else
37694 op0 = tmp, op1 = val;
37695
37696 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
37697 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37698 }
37699 return;
37700
37701 case V4SFmode:
37702 use_vec_merge = TARGET_SSE4_1;
37703 if (use_vec_merge)
37704 break;
37705
37706 switch (elt)
37707 {
37708 case 0:
37709 use_vec_merge = true;
37710 break;
37711
37712 case 1:
37713 /* tmp = target = A B C D */
37714 tmp = copy_to_reg (target);
37715 /* target = A A B B */
37716 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
37717 /* target = X A B B */
37718 ix86_expand_vector_set (false, target, val, 0);
37719 /* target = A X C D */
37720 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37721 const1_rtx, const0_rtx,
37722 GEN_INT (2+4), GEN_INT (3+4)));
37723 return;
37724
37725 case 2:
37726 /* tmp = target = A B C D */
37727 tmp = copy_to_reg (target);
37728 /* tmp = X B C D */
37729 ix86_expand_vector_set (false, tmp, val, 0);
37730 /* target = A B X D */
37731 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37732 const0_rtx, const1_rtx,
37733 GEN_INT (0+4), GEN_INT (3+4)));
37734 return;
37735
37736 case 3:
37737 /* tmp = target = A B C D */
37738 tmp = copy_to_reg (target);
37739 /* tmp = X B C D */
37740 ix86_expand_vector_set (false, tmp, val, 0);
37741 /* target = A B C X */
37742 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
37743 const0_rtx, const1_rtx,
37744 GEN_INT (2+4), GEN_INT (0+4)));
37745 return;
37746
37747 default:
37748 gcc_unreachable ();
37749 }
37750 break;
37751
37752 case V4SImode:
37753 use_vec_merge = TARGET_SSE4_1;
37754 if (use_vec_merge)
37755 break;
37756
37757 /* Element 0 handled by vec_merge below. */
37758 if (elt == 0)
37759 {
37760 use_vec_merge = true;
37761 break;
37762 }
37763
37764 if (TARGET_SSE2)
37765 {
37766 /* With SSE2, use integer shuffles to swap element 0 and ELT,
37767 store into element 0, then shuffle them back. */
37768
37769 rtx order[4];
37770
37771 order[0] = GEN_INT (elt);
37772 order[1] = const1_rtx;
37773 order[2] = const2_rtx;
37774 order[3] = GEN_INT (3);
37775 order[elt] = const0_rtx;
37776
37777 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37778 order[1], order[2], order[3]));
37779
37780 ix86_expand_vector_set (false, target, val, 0);
37781
37782 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
37783 order[1], order[2], order[3]));
37784 }
37785 else
37786 {
37787 /* For SSE1, we have to reuse the V4SF code. */
37788 rtx t = gen_reg_rtx (V4SFmode);
37789 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
37790 emit_move_insn (target, gen_lowpart (mode, t));
37791 }
37792 return;
37793
37794 case V8HImode:
37795 use_vec_merge = TARGET_SSE2;
37796 break;
37797 case V4HImode:
37798 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37799 break;
37800
37801 case V16QImode:
37802 use_vec_merge = TARGET_SSE4_1;
37803 break;
37804
37805 case V8QImode:
37806 break;
37807
37808 case V32QImode:
37809 half_mode = V16QImode;
37810 j = 0;
37811 n = 16;
37812 goto half;
37813
37814 case V16HImode:
37815 half_mode = V8HImode;
37816 j = 1;
37817 n = 8;
37818 goto half;
37819
37820 case V8SImode:
37821 half_mode = V4SImode;
37822 j = 2;
37823 n = 4;
37824 goto half;
37825
37826 case V4DImode:
37827 half_mode = V2DImode;
37828 j = 3;
37829 n = 2;
37830 goto half;
37831
37832 case V8SFmode:
37833 half_mode = V4SFmode;
37834 j = 4;
37835 n = 4;
37836 goto half;
37837
37838 case V4DFmode:
37839 half_mode = V2DFmode;
37840 j = 5;
37841 n = 2;
37842 goto half;
37843
37844 half:
37845 /* Compute offset. */
37846 i = elt / n;
37847 elt %= n;
37848
37849 gcc_assert (i <= 1);
37850
37851 /* Extract the half. */
37852 tmp = gen_reg_rtx (half_mode);
37853 emit_insn (gen_extract[j][i] (tmp, target));
37854
37855 /* Put val in tmp at elt. */
37856 ix86_expand_vector_set (false, tmp, val, elt);
37857
37858 /* Put it back. */
37859 emit_insn (gen_insert[j][i] (target, target, tmp));
37860 return;
37861
37862 default:
37863 break;
37864 }
37865
37866 if (use_vec_merge)
37867 {
37868 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
37869 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
37870 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37871 }
37872 else
37873 {
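/* No single-insn way to insert the element: spill the vector to the
stack, overwrite the element in memory, and reload the whole vector. */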
37874 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37875
37876 emit_move_insn (mem, target);
37877
37878 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37879 emit_move_insn (tmp, val);
37880
37881 emit_move_insn (target, mem);
37882 }
37883 }
37884
37885 void
37886 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
37887 {
37888 enum machine_mode mode = GET_MODE (vec);
37889 enum machine_mode inner_mode = GET_MODE_INNER (mode);
37890 bool use_vec_extr = false;
37891 rtx tmp;
37892
37893 switch (mode)
37894 {
37895 case V2SImode:
37896 case V2SFmode:
37897 if (!mmx_ok)
37898 break;
37899 /* FALLTHRU */
37900
37901 case V2DFmode:
37902 case V2DImode:
37903 use_vec_extr = true;
37904 break;
37905
37906 case V4SFmode:
37907 use_vec_extr = TARGET_SSE4_1;
37908 if (use_vec_extr)
37909 break;
37910
37911 switch (elt)
37912 {
37913 case 0:
37914 tmp = vec;
37915 break;
37916
37917 case 1:
37918 case 3:
37919 tmp = gen_reg_rtx (mode);
37920 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
37921 GEN_INT (elt), GEN_INT (elt),
37922 GEN_INT (elt+4), GEN_INT (elt+4)));
37923 break;
37924
37925 case 2:
37926 tmp = gen_reg_rtx (mode);
37927 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
37928 break;
37929
37930 default:
37931 gcc_unreachable ();
37932 }
37933 vec = tmp;
37934 use_vec_extr = true;
37935 elt = 0;
37936 break;
37937
37938 case V4SImode:
37939 use_vec_extr = TARGET_SSE4_1;
37940 if (use_vec_extr)
37941 break;
37942
37943 if (TARGET_SSE2)
37944 {
37945 switch (elt)
37946 {
37947 case 0:
37948 tmp = vec;
37949 break;
37950
37951 case 1:
37952 case 3:
37953 tmp = gen_reg_rtx (mode);
37954 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
37955 GEN_INT (elt), GEN_INT (elt),
37956 GEN_INT (elt), GEN_INT (elt)));
37957 break;
37958
37959 case 2:
37960 tmp = gen_reg_rtx (mode);
37961 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
37962 break;
37963
37964 default:
37965 gcc_unreachable ();
37966 }
37967 vec = tmp;
37968 use_vec_extr = true;
37969 elt = 0;
37970 }
37971 else
37972 {
37973 /* For SSE1, we have to reuse the V4SF code. */
37974 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
37975 gen_lowpart (V4SFmode, vec), elt);
37976 return;
37977 }
37978 break;
37979
37980 case V8HImode:
37981 use_vec_extr = TARGET_SSE2;
37982 break;
37983 case V4HImode:
37984 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
37985 break;
37986
37987 case V16QImode:
37988 use_vec_extr = TARGET_SSE4_1;
37989 break;
37990
37991 case V8SFmode:
37992 if (TARGET_AVX)
37993 {
37994 tmp = gen_reg_rtx (V4SFmode);
37995 if (elt < 4)
37996 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
37997 else
37998 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
37999 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38000 return;
38001 }
38002 break;
38003
38004 case V4DFmode:
38005 if (TARGET_AVX)
38006 {
38007 tmp = gen_reg_rtx (V2DFmode);
38008 if (elt < 2)
38009 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
38010 else
38011 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
38012 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38013 return;
38014 }
38015 break;
38016
38017 case V32QImode:
38018 if (TARGET_AVX)
38019 {
38020 tmp = gen_reg_rtx (V16QImode);
38021 if (elt < 16)
38022 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
38023 else
38024 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
38025 ix86_expand_vector_extract (false, target, tmp, elt & 15);
38026 return;
38027 }
38028 break;
38029
38030 case V16HImode:
38031 if (TARGET_AVX)
38032 {
38033 tmp = gen_reg_rtx (V8HImode);
38034 if (elt < 8)
38035 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
38036 else
38037 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
38038 ix86_expand_vector_extract (false, target, tmp, elt & 7);
38039 return;
38040 }
38041 break;
38042
38043 case V8SImode:
38044 if (TARGET_AVX)
38045 {
38046 tmp = gen_reg_rtx (V4SImode);
38047 if (elt < 4)
38048 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
38049 else
38050 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
38051 ix86_expand_vector_extract (false, target, tmp, elt & 3);
38052 return;
38053 }
38054 break;
38055
38056 case V4DImode:
38057 if (TARGET_AVX)
38058 {
38059 tmp = gen_reg_rtx (V2DImode);
38060 if (elt < 2)
38061 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
38062 else
38063 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
38064 ix86_expand_vector_extract (false, target, tmp, elt & 1);
38065 return;
38066 }
38067 break;
38068
38069 case V8QImode:
38070 /* ??? Could extract the appropriate HImode element and shift. */
38071 default:
38072 break;
38073 }
38074
38075 if (use_vec_extr)
38076 {
38077 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
38078 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
38079
38080 /* Let the rtl optimizers know about the zero extension performed. */
38081 if (inner_mode == QImode || inner_mode == HImode)
38082 {
38083 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
38084 target = gen_lowpart (SImode, target);
38085 }
38086
38087 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
38088 }
38089 else
38090 {
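/* Likewise for extraction: spill the vector to the stack and load the
requested element back from memory. */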
38091 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
38092
38093 emit_move_insn (mem, vec);
38094
38095 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
38096 emit_move_insn (target, tmp);
38097 }
38098 }
38099
38100 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
38101 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
38102 The upper bits of DEST are undefined, though they shouldn't cause
38103 exceptions (some bits from src or all zeros are ok). */
38104
38105 static void
38106 emit_reduc_half (rtx dest, rtx src, int i)
38107 {
38108 rtx tem, d = dest;
38109 switch (GET_MODE (src))
38110 {
38111 case V4SFmode:
38112 if (i == 128)
38113 tem = gen_sse_movhlps (dest, src, src);
38114 else
38115 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
38116 GEN_INT (1 + 4), GEN_INT (1 + 4));
38117 break;
38118 case V2DFmode:
38119 tem = gen_vec_interleave_highv2df (dest, src, src);
38120 break;
38121 case V16QImode:
38122 case V8HImode:
38123 case V4SImode:
38124 case V2DImode:
38125 d = gen_reg_rtx (V1TImode);
38126 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
38127 GEN_INT (i / 2));
38128 break;
38129 case V8SFmode:
38130 if (i == 256)
38131 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
38132 else
38133 tem = gen_avx_shufps256 (dest, src, src,
38134 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
38135 break;
38136 case V4DFmode:
38137 if (i == 256)
38138 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
38139 else
38140 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
38141 break;
38142 case V32QImode:
38143 case V16HImode:
38144 case V8SImode:
38145 case V4DImode:
38146 if (i == 256)
38147 {
38148 if (GET_MODE (dest) != V4DImode)
38149 d = gen_reg_rtx (V4DImode);
38150 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
38151 gen_lowpart (V4DImode, src),
38152 const1_rtx);
38153 }
38154 else
38155 {
38156 d = gen_reg_rtx (V2TImode);
38157 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
38158 GEN_INT (i / 2));
38159 }
38160 break;
38161 default:
38162 gcc_unreachable ();
38163 }
38164 emit_insn (tem);
38165 if (d != dest)
38166 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
38167 }
38168
38169 /* Expand a vector reduction. FN is the binary pattern to reduce;
38170 DEST is the destination; IN is the input vector. */
38171
38172 void
38173 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
38174 {
38175 rtx half, dst, vec = in;
38176 enum machine_mode mode = GET_MODE (in);
38177 int i;
38178
38179 /* SSE4.1 has a special instruction for V8HImode UMIN reduction. */
38180 if (TARGET_SSE4_1
38181 && mode == V8HImode
38182 && fn == gen_uminv8hi3)
38183 {
38184 emit_insn (gen_sse4_1_phminposuw (dest, in));
38185 return;
38186 }
38187
38188 for (i = GET_MODE_BITSIZE (mode);
38189 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
38190 i >>= 1)
38191 {
38192 half = gen_reg_rtx (mode);
38193 emit_reduc_half (half, vec, i);
38194 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
38195 dst = dest;
38196 else
38197 dst = gen_reg_rtx (mode);
38198 emit_insn (fn (dst, half, vec));
38199 vec = dst;
38200 }
38201 }
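
/* Rough walk-through of the loop above for a V4SImode reduction: it runs
   with i = 128 and then i = 64; each emit_reduc_half brings the upper half
   of the current vector down into the low lanes and FN folds it into the
   running value, so element 0 of DEST ends up holding the reduction of all
   four elements (the other lanes hold partial results).  */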
38202 \f
38203 /* Target hook for scalar_mode_supported_p. */
38204 static bool
38205 ix86_scalar_mode_supported_p (enum machine_mode mode)
38206 {
38207 if (DECIMAL_FLOAT_MODE_P (mode))
38208 return default_decimal_float_supported_p ();
38209 else if (mode == TFmode)
38210 return true;
38211 else
38212 return default_scalar_mode_supported_p (mode);
38213 }
38214
38215 /* Implements target hook vector_mode_supported_p. */
38216 static bool
38217 ix86_vector_mode_supported_p (enum machine_mode mode)
38218 {
38219 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
38220 return true;
38221 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
38222 return true;
38223 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
38224 return true;
38225 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
38226 return true;
38227 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
38228 return true;
38229 return false;
38230 }
38231
38232 /* Target hook for c_mode_for_suffix. */
38233 static enum machine_mode
38234 ix86_c_mode_for_suffix (char suffix)
38235 {
38236 if (suffix == 'q')
38237 return TFmode;
38238 if (suffix == 'w')
38239 return XFmode;
38240
38241 return VOIDmode;
38242 }
38243
38244 /* Worker function for TARGET_MD_ASM_CLOBBERS.
38245
38246 We do this in the new i386 backend to maintain source compatibility
38247 with the old cc0-based compiler. */
38248
38249 static tree
38250 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
38251 tree inputs ATTRIBUTE_UNUSED,
38252 tree clobbers)
38253 {
38254 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
38255 clobbers);
38256 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
38257 clobbers);
38258 return clobbers;
38259 }
38260
38261 /* Implements the targetm.encode_section_info target hook. */
38262
38263 static void ATTRIBUTE_UNUSED
38264 ix86_encode_section_info (tree decl, rtx rtl, int first)
38265 {
38266 default_encode_section_info (decl, rtl, first);
38267
38268 if (TREE_CODE (decl) == VAR_DECL
38269 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
38270 && ix86_in_large_data_p (decl))
38271 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
38272 }
38273
38274 /* Worker function for REVERSE_CONDITION. */
38275
38276 enum rtx_code
38277 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
38278 {
38279 return (mode != CCFPmode && mode != CCFPUmode
38280 ? reverse_condition (code)
38281 : reverse_condition_maybe_unordered (code));
38282 }
38283
38284 /* Output code to perform an x87 FP register move, from OPERANDS[1]
38285 to OPERANDS[0]. */
38286
38287 const char *
38288 output_387_reg_move (rtx insn, rtx *operands)
38289 {
38290 if (REG_P (operands[0]))
38291 {
38292 if (REG_P (operands[1])
38293 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38294 {
38295 if (REGNO (operands[0]) == FIRST_STACK_REG)
38296 return output_387_ffreep (operands, 0);
38297 return "fstp\t%y0";
38298 }
38299 if (STACK_TOP_P (operands[0]))
38300 return "fld%Z1\t%y1";
38301 return "fst\t%y0";
38302 }
38303 else if (MEM_P (operands[0]))
38304 {
38305 gcc_assert (REG_P (operands[1]));
38306 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
38307 return "fstp%Z0\t%y0";
38308 else
38309 {
38310 /* There is no non-popping store to memory for XFmode.
38311 So if we need one, follow the store with a load. */
38312 if (GET_MODE (operands[0]) == XFmode)
38313 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
38314 else
38315 return "fst%Z0\t%y0";
38316 }
38317 }
38318 else
38319 gcc_unreachable();
38320 }
38321
38322 /* Output code to perform a conditional jump to LABEL, if the C2 flag in
38323 the FP status register is set. */
38324
38325 void
38326 ix86_emit_fp_unordered_jump (rtx label)
38327 {
38328 rtx reg = gen_reg_rtx (HImode);
38329 rtx temp;
38330
38331 emit_insn (gen_x86_fnstsw_1 (reg));
38332
38333 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
38334 {
38335 emit_insn (gen_x86_sahf_1 (reg));
38336
38337 temp = gen_rtx_REG (CCmode, FLAGS_REG);
38338 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
38339 }
38340 else
38341 {
38342 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
38343
38344 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
38345 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
38346 }
38347
38348 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
38349 gen_rtx_LABEL_REF (VOIDmode, label),
38350 pc_rtx);
38351 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
38352
38353 emit_jump_insn (temp);
38354 predict_jump (REG_BR_PROB_BASE * 10 / 100);
38355 }
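
/* Illustrative aside, not from the original sources: a rough scalar model of
   the test emitted above, assuming a hypothetical read_x87_status_word ()
   standing in for FNSTSW.  C2 is bit 10 of the status word, i.e. bit 2 of the
   high byte checked by the TEST path; the SAHF path copies that bit into PF
   and the jump becomes a "jump if parity" instead.

       unsigned short sw = read_x87_status_word ();
       if (sw & 0x0400)                 -- C2 set
         goto label;
*/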
38356
38357 /* Output code to perform a log1p XFmode calculation. */
38358
38359 void ix86_emit_i387_log1p (rtx op0, rtx op1)
38360 {
38361 rtx label1 = gen_label_rtx ();
38362 rtx label2 = gen_label_rtx ();
38363
38364 rtx tmp = gen_reg_rtx (XFmode);
38365 rtx tmp2 = gen_reg_rtx (XFmode);
38366 rtx test;
38367
38368 emit_insn (gen_absxf2 (tmp, op1));
38369 test = gen_rtx_GE (VOIDmode, tmp,
38370 CONST_DOUBLE_FROM_REAL_VALUE (
38371 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
38372 XFmode));
38373 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
38374
38375 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38376 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
38377 emit_jump (label2);
38378
38379 emit_label (label1);
38380 emit_move_insn (tmp, CONST1_RTX (XFmode));
38381 emit_insn (gen_addxf3 (tmp, op1, tmp));
38382 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
38383 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
38384
38385 emit_label (label2);
38386 }
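
/* Illustrative aside, not from the original sources: a long-double sketch of
   the sequence emitted above.  fyl2xp1 computes y * log2 (x + 1) and is only
   accurate for small x, hence the switch at |x| >= 1 - sqrt(2)/2; fldln2
   supplies y = ln(2) so the result is a natural logarithm.  log2_plus1 () and
   log2_of () are hypothetical stand-ins for fyl2xp1 and fyl2x.

       const long double ln2 = 0.693147180559945309417232121458L;
       long double log1p_sketch (long double x)
       {
         if (fabsl (x) < 0.29289321881345247561810596348408353L)
           return ln2 * log2_plus1 (x);        -- fyl2xp1 path
         return ln2 * log2_of (1.0L + x);      -- fyl2x path
       }
*/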
38387
38388 /* Emit code for round calculation. */
38389 void ix86_emit_i387_round (rtx op0, rtx op1)
38390 {
38391 enum machine_mode inmode = GET_MODE (op1);
38392 enum machine_mode outmode = GET_MODE (op0);
38393 rtx e1, e2, res, tmp, tmp1, half;
38394 rtx scratch = gen_reg_rtx (HImode);
38395 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
38396 rtx jump_label = gen_label_rtx ();
38397 rtx insn;
38398 rtx (*gen_abs) (rtx, rtx);
38399 rtx (*gen_neg) (rtx, rtx);
38400
38401 switch (inmode)
38402 {
38403 case SFmode:
38404 gen_abs = gen_abssf2;
38405 break;
38406 case DFmode:
38407 gen_abs = gen_absdf2;
38408 break;
38409 case XFmode:
38410 gen_abs = gen_absxf2;
38411 break;
38412 default:
38413 gcc_unreachable ();
38414 }
38415
38416 switch (outmode)
38417 {
38418 case SFmode:
38419 gen_neg = gen_negsf2;
38420 break;
38421 case DFmode:
38422 gen_neg = gen_negdf2;
38423 break;
38424 case XFmode:
38425 gen_neg = gen_negxf2;
38426 break;
38427 case HImode:
38428 gen_neg = gen_neghi2;
38429 break;
38430 case SImode:
38431 gen_neg = gen_negsi2;
38432 break;
38433 case DImode:
38434 gen_neg = gen_negdi2;
38435 break;
38436 default:
38437 gcc_unreachable ();
38438 }
38439
38440 e1 = gen_reg_rtx (inmode);
38441 e2 = gen_reg_rtx (inmode);
38442 res = gen_reg_rtx (outmode);
38443
38444 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
38445
38446 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
38447
38448 /* scratch = fxam(op1) */
38449 emit_insn (gen_rtx_SET (VOIDmode, scratch,
38450 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
38451 UNSPEC_FXAM)));
38452 /* e1 = fabs(op1) */
38453 emit_insn (gen_abs (e1, op1));
38454
38455 /* e2 = e1 + 0.5 */
38456 half = force_reg (inmode, half);
38457 emit_insn (gen_rtx_SET (VOIDmode, e2,
38458 gen_rtx_PLUS (inmode, e1, half)));
38459
38460 /* res = floor(e2) */
38461 if (inmode != XFmode)
38462 {
38463 tmp1 = gen_reg_rtx (XFmode);
38464
38465 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
38466 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
38467 }
38468 else
38469 tmp1 = e2;
38470
38471 switch (outmode)
38472 {
38473 case SFmode:
38474 case DFmode:
38475 {
38476 rtx tmp0 = gen_reg_rtx (XFmode);
38477
38478 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
38479
38480 emit_insn (gen_rtx_SET (VOIDmode, res,
38481 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
38482 UNSPEC_TRUNC_NOOP)));
38483 }
38484 break;
38485 case XFmode:
38486 emit_insn (gen_frndintxf2_floor (res, tmp1));
38487 break;
38488 case HImode:
38489 emit_insn (gen_lfloorxfhi2 (res, tmp1));
38490 break;
38491 case SImode:
38492 emit_insn (gen_lfloorxfsi2 (res, tmp1));
38493 break;
38494 case DImode:
38495 emit_insn (gen_lfloorxfdi2 (res, tmp1));
38496 break;
38497 default:
38498 gcc_unreachable ();
38499 }
38500
38501 /* flags = signbit(a) */
38502 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
38503
38504 /* if (flags) then res = -res */
38505 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
38506 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
38507 gen_rtx_LABEL_REF (VOIDmode, jump_label),
38508 pc_rtx);
38509 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38510 predict_jump (REG_BR_PROB_BASE * 50 / 100);
38511 JUMP_LABEL (insn) = jump_label;
38512
38513 emit_insn (gen_neg (res, res));
38514
38515 emit_label (jump_label);
38516 LABEL_NUSES (jump_label) = 1;
38517
38518 emit_move_insn (op0, res);
38519 }
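
/* Illustrative aside, not from the original sources: the emitted sequence is
   essentially the scalar computation below, with fxam supplying the sign
   (status word bit 0x0200) so that negative inputs are negated back at the
   end.

       double round_sketch (double a)
       {
         double r = floor (fabs (a) + 0.5);    -- frndint with floor rounding
         return signbit (a) ? -r : r;          -- conditional negation
       }
*/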
38520
38521 /* Output code to perform a Newton-Raphson approximation of a single precision
38522 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]; an illustrative scalar sketch follows the function body below. */
38523
38524 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
38525 {
38526 rtx x0, x1, e0, e1;
38527
38528 x0 = gen_reg_rtx (mode);
38529 e0 = gen_reg_rtx (mode);
38530 e1 = gen_reg_rtx (mode);
38531 x1 = gen_reg_rtx (mode);
38532
38533 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
38534
38535 b = force_reg (mode, b);
38536
38537 /* x0 = rcp(b) estimate */
38538 emit_insn (gen_rtx_SET (VOIDmode, x0,
38539 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
38540 UNSPEC_RCP)));
38541 /* e0 = x0 * b */
38542 emit_insn (gen_rtx_SET (VOIDmode, e0,
38543 gen_rtx_MULT (mode, x0, b)));
38544
38545 /* e0 = x0 * e0 */
38546 emit_insn (gen_rtx_SET (VOIDmode, e0,
38547 gen_rtx_MULT (mode, x0, e0)));
38548
38549 /* e1 = x0 + x0 */
38550 emit_insn (gen_rtx_SET (VOIDmode, e1,
38551 gen_rtx_PLUS (mode, x0, x0)));
38552
38553 /* x1 = e1 - e0 */
38554 emit_insn (gen_rtx_SET (VOIDmode, x1,
38555 gen_rtx_MINUS (mode, e1, e0)));
38556
38557 /* res = a * x1 */
38558 emit_insn (gen_rtx_SET (VOIDmode, res,
38559 gen_rtx_MULT (mode, a, x1)));
38560 }
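
/* Illustrative aside, not from the original sources: a scalar sketch of the
   sequence emitted above, assuming a hypothetical rcp_estimate () that models
   the ~12-bit RCPSS/RCPPS reciprocal approximation.  One Newton-Raphson step
   roughly doubles the number of correct bits.

       float swdiv_sketch (float a, float b)
       {
         float x0 = rcp_estimate (b);
         float e0 = x0 * b;            -- b * rcp(b)
         e0 = x0 * e0;                 -- b * rcp(b) * rcp(b)
         float e1 = x0 + x0;           -- 2 * rcp(b)
         float x1 = e1 - e0;           -- refined reciprocal of b
         return a * x1;
       }
*/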
38561
38562 /* Output code to perform a Newton-Raphson approximation of a
38563 single precision floating point [reciprocal] square root; an illustrative scalar sketch follows the function body below. */
38564
38565 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
38566 bool recip)
38567 {
38568 rtx x0, e0, e1, e2, e3, mthree, mhalf;
38569 REAL_VALUE_TYPE r;
38570
38571 x0 = gen_reg_rtx (mode);
38572 e0 = gen_reg_rtx (mode);
38573 e1 = gen_reg_rtx (mode);
38574 e2 = gen_reg_rtx (mode);
38575 e3 = gen_reg_rtx (mode);
38576
38577 real_from_integer (&r, VOIDmode, -3, -1, 0);
38578 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38579
38580 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
38581 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
38582
38583 if (VECTOR_MODE_P (mode))
38584 {
38585 mthree = ix86_build_const_vector (mode, true, mthree);
38586 mhalf = ix86_build_const_vector (mode, true, mhalf);
38587 }
38588
38589 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
38590 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
38591
38592 a = force_reg (mode, a);
38593
38594 /* x0 = rsqrt(a) estimate */
38595 emit_insn (gen_rtx_SET (VOIDmode, x0,
38596 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
38597 UNSPEC_RSQRT)));
38598
38599 /* If a == 0.0, the rsqrt estimate is infinity; mask it to zero so that sqrt (0.0) yields 0.0 rather than NaN. */
38600 if (!recip)
38601 {
38602 rtx zero, mask;
38603
38605 mask = gen_reg_rtx (mode);
38606
38607 zero = force_reg (mode, CONST0_RTX (mode));
38608 emit_insn (gen_rtx_SET (VOIDmode, mask,
38609 gen_rtx_NE (mode, zero, a)));
38610
38611 emit_insn (gen_rtx_SET (VOIDmode, x0,
38612 gen_rtx_AND (mode, x0, mask)));
38613 }
38614
38615 /* e0 = x0 * a */
38616 emit_insn (gen_rtx_SET (VOIDmode, e0,
38617 gen_rtx_MULT (mode, x0, a)));
38618 /* e1 = e0 * x0 */
38619 emit_insn (gen_rtx_SET (VOIDmode, e1,
38620 gen_rtx_MULT (mode, e0, x0)));
38621
38622 /* e2 = e1 - 3. */
38623 mthree = force_reg (mode, mthree);
38624 emit_insn (gen_rtx_SET (VOIDmode, e2,
38625 gen_rtx_PLUS (mode, e1, mthree)));
38626
38627 mhalf = force_reg (mode, mhalf);
38628 if (recip)
38629 /* e3 = -.5 * x0 */
38630 emit_insn (gen_rtx_SET (VOIDmode, e3,
38631 gen_rtx_MULT (mode, x0, mhalf)));
38632 else
38633 /* e3 = -.5 * e0 */
38634 emit_insn (gen_rtx_SET (VOIDmode, e3,
38635 gen_rtx_MULT (mode, e0, mhalf)));
38636 /* ret = e2 * e3 */
38637 emit_insn (gen_rtx_SET (VOIDmode, res,
38638 gen_rtx_MULT (mode, e2, e3)));
38639 }
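
/* Illustrative aside, not from the original sources: a scalar sketch of the
   sequence emitted above, assuming a hypothetical rsqrt_estimate () that
   models the RSQRTSS/RSQRTPS approximation.  RECIP selects between rsqrt and
   sqrt; the a != 0 mask handled above (omitted here) zeroes the estimate so
   that sqrt (0.0) does not become 0 * inf = NaN.

       float swsqrt_sketch (float a, int recip)
       {
         float x0 = rsqrt_estimate (a);
         float e0 = x0 * a;
         float e1 = e0 * x0;                   -- a * x0 * x0
         float e2 = e1 - 3.0f;
         float e3 = (recip ? x0 : e0) * -0.5f;
         return e2 * e3;                       -- one Newton-Raphson step
       }
*/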
38640
38641 #ifdef TARGET_SOLARIS
38642 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
38643
38644 static void
38645 i386_solaris_elf_named_section (const char *name, unsigned int flags,
38646 tree decl)
38647 {
38648 /* With Binutils 2.15, the "@unwind" marker must be specified on
38649 every occurrence of the ".eh_frame" section, not just the first
38650 one. */
38651 if (TARGET_64BIT
38652 && strcmp (name, ".eh_frame") == 0)
38653 {
38654 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
38655 flags & SECTION_WRITE ? "aw" : "a");
38656 return;
38657 }
38658
38659 #ifndef USE_GAS
38660 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
38661 {
38662 solaris_elf_asm_comdat_section (name, flags, decl);
38663 return;
38664 }
38665 #endif
38666
38667 default_elf_asm_named_section (name, flags, decl);
38668 }
38669 #endif /* TARGET_SOLARIS */
38670
38671 /* Return the mangling of TYPE if it is an extended fundamental type. */
38672
38673 static const char *
38674 ix86_mangle_type (const_tree type)
38675 {
38676 type = TYPE_MAIN_VARIANT (type);
38677
38678 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
38679 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
38680 return NULL;
38681
38682 switch (TYPE_MODE (type))
38683 {
38684 case TFmode:
38685 /* __float128 is "g". */
38686 return "g";
38687 case XFmode:
38688 /* "long double" or __float80 is "e". */
38689 return "e";
38690 default:
38691 return NULL;
38692 }
38693 }
38694
38695 /* For 32-bit code we can save PIC register setup by using
38696 the hidden __stack_chk_fail_local function instead of calling
38697 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
38698 register, so it is better to call __stack_chk_fail directly. */
38699
38700 static tree ATTRIBUTE_UNUSED
38701 ix86_stack_protect_fail (void)
38702 {
38703 return TARGET_64BIT
38704 ? default_external_stack_protect_fail ()
38705 : default_hidden_stack_protect_fail ();
38706 }
38707
38708 /* Select a format to encode pointers in exception handling data. CODE
38709 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
38710 true if the symbol may be affected by dynamic relocations.
38711
38712 ??? All x86 object file formats are capable of representing this.
38713 After all, the relocation needed is the same as for the call insn.
38714 Whether or not a particular assembler allows us to enter such, I
38715 guess we'll have to see. */
38716 int
38717 asm_preferred_eh_data_format (int code, int global)
38718 {
38719 if (flag_pic)
38720 {
38721 int type = DW_EH_PE_sdata8;
38722 if (!TARGET_64BIT
38723 || ix86_cmodel == CM_SMALL_PIC
38724 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
38725 type = DW_EH_PE_sdata4;
38726 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
38727 }
38728 if (ix86_cmodel == CM_SMALL
38729 || (ix86_cmodel == CM_MEDIUM && code))
38730 return DW_EH_PE_udata4;
38731 return DW_EH_PE_absptr;
38732 }
38733 \f
38734 /* Expand copysign: copy the sign bit of SIGN onto the non-negative value
38735 ABS_VALUE, storing the result in RESULT. If MASK is non-null, it shall be
38736 a mask that masks out the sign-bit (its complement is used internally). */
38737 static void
38738 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
38739 {
38740 enum machine_mode mode = GET_MODE (sign);
38741 rtx sgn = gen_reg_rtx (mode);
38742 if (mask == NULL_RTX)
38743 {
38744 enum machine_mode vmode;
38745
38746 if (mode == SFmode)
38747 vmode = V4SFmode;
38748 else if (mode == DFmode)
38749 vmode = V2DFmode;
38750 else
38751 vmode = mode;
38752
38753 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
38754 if (!VECTOR_MODE_P (mode))
38755 {
38756 /* We need to generate a scalar mode mask in this case. */
38757 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38758 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38759 mask = gen_reg_rtx (mode);
38760 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38761 }
38762 }
38763 else
38764 mask = gen_rtx_NOT (mode, mask);
38765 emit_insn (gen_rtx_SET (VOIDmode, sgn,
38766 gen_rtx_AND (mode, mask, sign)));
38767 emit_insn (gen_rtx_SET (VOIDmode, result,
38768 gen_rtx_IOR (mode, abs_value, sgn)));
38769 }
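
/* Illustrative aside, not from the original sources: in scalar terms the
   sequence above computes, for IEEE single precision,

       result = abs_value | (sign & 0x80000000)

   where 0x80000000 is the sign-bit mask, either built by
   ix86_build_signbit_mask or obtained by complementing the caller-supplied
   MASK. */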
38770
38771 /* Expand fabs (OP0) and return a new rtx that holds the result. The
38772 mask for masking out the sign-bit is stored in *SMASK, if that is
38773 non-null. */
38774 static rtx
38775 ix86_expand_sse_fabs (rtx op0, rtx *smask)
38776 {
38777 enum machine_mode vmode, mode = GET_MODE (op0);
38778 rtx xa, mask;
38779
38780 xa = gen_reg_rtx (mode);
38781 if (mode == SFmode)
38782 vmode = V4SFmode;
38783 else if (mode == DFmode)
38784 vmode = V2DFmode;
38785 else
38786 vmode = mode;
38787 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
38788 if (!VECTOR_MODE_P (mode))
38789 {
38790 /* We need to generate a scalar mode mask in this case. */
38791 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
38792 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
38793 mask = gen_reg_rtx (mode);
38794 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
38795 }
38796 emit_insn (gen_rtx_SET (VOIDmode, xa,
38797 gen_rtx_AND (mode, op0, mask)));
38798
38799 if (smask)
38800 *smask = mask;
38801
38802 return xa;
38803 }
38804
38805 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
38806 swapping the operands if SWAP_OPERANDS is true. The expanded
38807 code is a forward jump to a newly created label in case the
38808 comparison is true. The generated label rtx is returned. */
38809 static rtx
38810 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
38811 bool swap_operands)
38812 {
38813 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
38814 rtx label, tmp;
38815
38816 if (swap_operands)
38817 {
38818 tmp = op0;
38819 op0 = op1;
38820 op1 = tmp;
38821 }
38822
38823 label = gen_label_rtx ();
38824 tmp = gen_rtx_REG (fpcmp_mode, FLAGS_REG);
38825 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38826 gen_rtx_COMPARE (fpcmp_mode, op0, op1)));
38827 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
38828 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
38829 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
38830 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
38831 JUMP_LABEL (tmp) = label;
38832
38833 return label;
38834 }
38835
38836 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
38837 using comparison code CODE. Operands are swapped for the comparison if
38838 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
38839 static rtx
38840 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
38841 bool swap_operands)
38842 {
38843 rtx (*insn)(rtx, rtx, rtx, rtx);
38844 enum machine_mode mode = GET_MODE (op0);
38845 rtx mask = gen_reg_rtx (mode);
38846
38847 if (swap_operands)
38848 {
38849 rtx tmp = op0;
38850 op0 = op1;
38851 op1 = tmp;
38852 }
38853
38854 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
38855
38856 emit_insn (insn (mask, op0, op1,
38857 gen_rtx_fmt_ee (code, mode, op0, op1)));
38858 return mask;
38859 }
38860
38861 /* Generate and return a rtx of mode MODE for 2**n where n is the number
38862 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
38863 static rtx
38864 ix86_gen_TWO52 (enum machine_mode mode)
38865 {
38866 REAL_VALUE_TYPE TWO52r;
38867 rtx TWO52;
38868
38869 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
38870 TWO52 = const_double_from_real_value (TWO52r, mode);
38871 TWO52 = force_reg (mode, TWO52);
38872
38873 return TWO52;
38874 }
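
/* Illustrative aside, not from the original sources: TWO52 (2**52 for DFmode,
   2**23 for SFmode) is the magic constant behind the "xa + TWO52 - TWO52"
   idiom used by the expanders below.  For 0 <= x < 2**52 the sum x + 2**52
   lies in [2**52, 2**53), where the unit in the last place is 1.0, so the
   addition rounds x to an integer in the current rounding mode and the
   subtraction recovers that integer exactly:

       double rint_nonneg_sketch (double x)    -- assumes 0 <= x < 0x1p52
       {
         return (x + 0x1p52) - 0x1p52;
       }

   This is why the expanders apply the idiom to fabs (x) and copy the sign
   back afterwards. */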
38875
38876 /* Expand SSE sequence for computing lround from OP1 storing
38877 into OP0. */
38878 void
38879 ix86_expand_lround (rtx op0, rtx op1)
38880 {
38881 /* C code for the stuff we're doing below:
38882 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
38883 return (long)tmp;
38884 */
38885 enum machine_mode mode = GET_MODE (op1);
38886 const struct real_format *fmt;
38887 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38888 rtx adj;
38889
38890 /* load nextafter (0.5, 0.0) */
38891 fmt = REAL_MODE_FORMAT (mode);
38892 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38893 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38894
38895 /* adj = copysign (0.5, op1) */
38896 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
38897 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
38898
38899 /* adj = op1 + adj */
38900 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
38901
38902 /* op0 = (imode)adj */
38903 expand_fix (op0, adj, 0);
38904 }
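
/* Illustrative aside, not from the original sources: nextafter (0.5, 0.0)
   rather than 0.5 itself avoids rounding up values just below 0.5.  For
   example, with x = 0x1.fffffffffffffp-2 (the largest double below 0.5),
   x + 0.5 rounds to 1.0 under round-to-nearest-even, so (long) (x + 0.5)
   would give 1, whereas x + nextafter (0.5, 0.0) stays below 1.0 and
   truncates to the correct result 0. */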
38905
38906 /* Expand SSE2 sequence for computing lfloor or lceil (according to
38907 DO_FLOOR) from OP1, storing the result into OP0. */
38908 void
38909 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
38910 {
38911 /* C code for the stuff we're doing below (for do_floor):
38912 xi = (long)op1;
38913 xi -= (double)xi > op1 ? 1 : 0;
38914 return xi;
38915 */
38916 enum machine_mode fmode = GET_MODE (op1);
38917 enum machine_mode imode = GET_MODE (op0);
38918 rtx ireg, freg, label, tmp;
38919
38920 /* reg = (long)op1 */
38921 ireg = gen_reg_rtx (imode);
38922 expand_fix (ireg, op1, 0);
38923
38924 /* freg = (double)reg */
38925 freg = gen_reg_rtx (fmode);
38926 expand_float (freg, ireg, 0);
38927
38928 /* ireg = (freg > op1) ? ireg - 1 : ireg */
38929 label = ix86_expand_sse_compare_and_jump (UNLE,
38930 freg, op1, !do_floor);
38931 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
38932 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
38933 emit_move_insn (ireg, tmp);
38934
38935 emit_label (label);
38936 LABEL_NUSES (label) = 1;
38937
38938 emit_move_insn (op0, ireg);
38939 }
38940
38941 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
38942 result in OPERAND0. */
38943 void
38944 ix86_expand_rint (rtx operand0, rtx operand1)
38945 {
38946 /* C code for the stuff we're doing below:
38947 xa = fabs (operand1);
38948 if (!isless (xa, 2**52))
38949 return operand1;
38950 xa = xa + 2**52 - 2**52;
38951 return copysign (xa, operand1);
38952 */
38953 enum machine_mode mode = GET_MODE (operand0);
38954 rtx res, xa, label, TWO52, mask;
38955
38956 res = gen_reg_rtx (mode);
38957 emit_move_insn (res, operand1);
38958
38959 /* xa = abs (operand1) */
38960 xa = ix86_expand_sse_fabs (res, &mask);
38961
38962 /* if (!isless (xa, TWO52)) goto label; */
38963 TWO52 = ix86_gen_TWO52 (mode);
38964 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38965
38966 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38967 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
38968
38969 ix86_sse_copysign_to_positive (res, xa, res, mask);
38970
38971 emit_label (label);
38972 LABEL_NUSES (label) = 1;
38973
38974 emit_move_insn (operand0, res);
38975 }
38976
38977 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
38978 into OPERAND0. */
38979 void
38980 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
38981 {
38982 /* C code for the stuff we expand below.
38983 double xa = fabs (x), x2;
38984 if (!isless (xa, TWO52))
38985 return x;
38986 xa = xa + TWO52 - TWO52;
38987 x2 = copysign (xa, x);
38988 Compensate. Floor:
38989 if (x2 > x)
38990 x2 -= 1;
38991 Compensate. Ceil:
38992 if (x2 < x)
38993 x2 -= -1;
38994 return x2;
38995 */
38996 enum machine_mode mode = GET_MODE (operand0);
38997 rtx xa, TWO52, tmp, label, one, res, mask;
38998
38999 TWO52 = ix86_gen_TWO52 (mode);
39000
39001 /* Temporary for holding the result, initialized to the input
39002 operand to ease control flow. */
39003 res = gen_reg_rtx (mode);
39004 emit_move_insn (res, operand1);
39005
39006 /* xa = abs (operand1) */
39007 xa = ix86_expand_sse_fabs (res, &mask);
39008
39009 /* if (!isless (xa, TWO52)) goto label; */
39010 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39011
39012 /* xa = xa + TWO52 - TWO52; */
39013 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39014 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
39015
39016 /* xa = copysign (xa, operand1) */
39017 ix86_sse_copysign_to_positive (xa, xa, res, mask);
39018
39019 /* generate 1.0 or -1.0 */
39020 one = force_reg (mode,
39021 const_double_from_real_value (do_floor
39022 ? dconst1 : dconstm1, mode));
39023
39024 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39025 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39026 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39027 gen_rtx_AND (mode, one, tmp)));
39028 /* We always need to subtract here to preserve signed zero. */
39029 tmp = expand_simple_binop (mode, MINUS,
39030 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39031 emit_move_insn (res, tmp);
39032
39033 emit_label (label);
39034 LABEL_NUSES (label) = 1;
39035
39036 emit_move_insn (operand0, res);
39037 }
39038
39039 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
39040 into OPERAND0. */
39041 void
39042 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
39043 {
39044 /* C code for the stuff we expand below.
39045 double xa = fabs (x), x2;
39046 if (!isless (xa, TWO52))
39047 return x;
39048 x2 = (double)(long)x;
39049 Compensate. Floor:
39050 if (x2 > x)
39051 x2 -= 1;
39052 Compensate. Ceil:
39053 if (x2 < x)
39054 x2 += 1;
39055 if (HONOR_SIGNED_ZEROS (mode))
39056 return copysign (x2, x);
39057 return x2;
39058 */
39059 enum machine_mode mode = GET_MODE (operand0);
39060 rtx xa, xi, TWO52, tmp, label, one, res, mask;
39061
39062 TWO52 = ix86_gen_TWO52 (mode);
39063
39064 /* Temporary for holding the result, initialized to the input
39065 operand to ease control flow. */
39066 res = gen_reg_rtx (mode);
39067 emit_move_insn (res, operand1);
39068
39069 /* xa = abs (operand1) */
39070 xa = ix86_expand_sse_fabs (res, &mask);
39071
39072 /* if (!isless (xa, TWO52)) goto label; */
39073 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39074
39075 /* xa = (double)(long)x */
39076 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39077 expand_fix (xi, res, 0);
39078 expand_float (xa, xi, 0);
39079
39080 /* generate 1.0 */
39081 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39082
39083 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
39084 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
39085 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39086 gen_rtx_AND (mode, one, tmp)));
39087 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
39088 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39089 emit_move_insn (res, tmp);
39090
39091 if (HONOR_SIGNED_ZEROS (mode))
39092 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39093
39094 emit_label (label);
39095 LABEL_NUSES (label) = 1;
39096
39097 emit_move_insn (operand0, res);
39098 }
39099
39100 /* Expand SSE sequence for computing round from OPERAND1 storing
39101 into OPERAND0, using a sequence that does not rely on DImode truncation
39102 via cvttsd2siq, which is only available on 64-bit targets. */
39103 void
39104 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
39105 {
39106 /* C code for the stuff we expand below.
39107 double xa = fabs (x), xa2, x2;
39108 if (!isless (xa, TWO52))
39109 return x;
39110 Using the absolute value and copying back sign makes
39111 -0.0 -> -0.0 correct.
39112 xa2 = xa + TWO52 - TWO52;
39113 Compensate.
39114 dxa = xa2 - xa;
39115 if (dxa <= -0.5)
39116 xa2 += 1;
39117 else if (dxa > 0.5)
39118 xa2 -= 1;
39119 x2 = copysign (xa2, x);
39120 return x2;
39121 */
39122 enum machine_mode mode = GET_MODE (operand0);
39123 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
39124
39125 TWO52 = ix86_gen_TWO52 (mode);
39126
39127 /* Temporary for holding the result, initialized to the input
39128 operand to ease control flow. */
39129 res = gen_reg_rtx (mode);
39130 emit_move_insn (res, operand1);
39131
39132 /* xa = abs (operand1) */
39133 xa = ix86_expand_sse_fabs (res, &mask);
39134
39135 /* if (!isless (xa, TWO52)) goto label; */
39136 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39137
39138 /* xa2 = xa + TWO52 - TWO52; */
39139 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39140 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
39141
39142 /* dxa = xa2 - xa; */
39143 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
39144
39145 /* generate 0.5, 1.0 and -0.5 */
39146 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
39147 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
39148 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
39149 0, OPTAB_DIRECT);
39150
39151 /* Compensate. */
39153 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
39154 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
39155 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39156 gen_rtx_AND (mode, one, tmp)));
39157 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39158 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
39159 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
39160 emit_insn (gen_rtx_SET (VOIDmode, tmp,
39161 gen_rtx_AND (mode, one, tmp)));
39162 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
39163
39164 /* res = copysign (xa2, operand1) */
39165 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
39166
39167 emit_label (label);
39168 LABEL_NUSES (label) = 1;
39169
39170 emit_move_insn (operand0, res);
39171 }
39172
39173 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39174 into OPERAND0. */
39175 void
39176 ix86_expand_trunc (rtx operand0, rtx operand1)
39177 {
39178 /* C code for SSE variant we expand below.
39179 double xa = fabs (x), x2;
39180 if (!isless (xa, TWO52))
39181 return x;
39182 x2 = (double)(long)x;
39183 if (HONOR_SIGNED_ZEROS (mode))
39184 return copysign (x2, x);
39185 return x2;
39186 */
39187 enum machine_mode mode = GET_MODE (operand0);
39188 rtx xa, xi, TWO52, label, res, mask;
39189
39190 TWO52 = ix86_gen_TWO52 (mode);
39191
39192 /* Temporary for holding the result, initialized to the input
39193 operand to ease control flow. */
39194 res = gen_reg_rtx (mode);
39195 emit_move_insn (res, operand1);
39196
39197 /* xa = abs (operand1) */
39198 xa = ix86_expand_sse_fabs (res, &mask);
39199
39200 /* if (!isless (xa, TWO52)) goto label; */
39201 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39202
39203 /* x = (double)(long)x */
39204 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39205 expand_fix (xi, res, 0);
39206 expand_float (res, xi, 0);
39207
39208 if (HONOR_SIGNED_ZEROS (mode))
39209 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
39210
39211 emit_label (label);
39212 LABEL_NUSES (label) = 1;
39213
39214 emit_move_insn (operand0, res);
39215 }
39216
39217 /* Expand SSE sequence for computing trunc from OPERAND1 storing
39218 into OPERAND0, using a variant that does not rely on DImode truncation via cvttsd2siq and is therefore usable on 32-bit targets. */
39219 void
39220 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
39221 {
39222 enum machine_mode mode = GET_MODE (operand0);
39223 rtx xa, mask, TWO52, label, one, res, smask, tmp;
39224
39225 /* C code for SSE variant we expand below.
39226 double xa = fabs (x), x2;
39227 if (!isless (xa, TWO52))
39228 return x;
39229 xa2 = xa + TWO52 - TWO52;
39230 Compensate:
39231 if (xa2 > xa)
39232 xa2 -= 1.0;
39233 x2 = copysign (xa2, x);
39234 return x2;
39235 */
39236
39237 TWO52 = ix86_gen_TWO52 (mode);
39238
39239 /* Temporary for holding the result, initialized to the input
39240 operand to ease control flow. */
39241 res = gen_reg_rtx (mode);
39242 emit_move_insn (res, operand1);
39243
39244 /* xa = abs (operand1) */
39245 xa = ix86_expand_sse_fabs (res, &smask);
39246
39247 /* if (!isless (xa, TWO52)) goto label; */
39248 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39249
39250 /* res = xa + TWO52 - TWO52; */
39251 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
39252 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
39253 emit_move_insn (res, tmp);
39254
39255 /* generate 1.0 */
39256 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
39257
39258 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
39259 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
39260 emit_insn (gen_rtx_SET (VOIDmode, mask,
39261 gen_rtx_AND (mode, mask, one)));
39262 tmp = expand_simple_binop (mode, MINUS,
39263 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
39264 emit_move_insn (res, tmp);
39265
39266 /* res = copysign (res, operand1) */
39267 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
39268
39269 emit_label (label);
39270 LABEL_NUSES (label) = 1;
39271
39272 emit_move_insn (operand0, res);
39273 }
39274
39275 /* Expand SSE sequence for computing round from OPERAND1 storing
39276 into OPERAND0. */
39277 void
39278 ix86_expand_round (rtx operand0, rtx operand1)
39279 {
39280 /* C code for the stuff we're doing below:
39281 double xa = fabs (x);
39282 if (!isless (xa, TWO52))
39283 return x;
39284 xa = (double)(long)(xa + nextafter (0.5, 0.0));
39285 return copysign (xa, x);
39286 */
39287 enum machine_mode mode = GET_MODE (operand0);
39288 rtx res, TWO52, xa, label, xi, half, mask;
39289 const struct real_format *fmt;
39290 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39291
39292 /* Temporary for holding the result, initialized to the input
39293 operand to ease control flow. */
39294 res = gen_reg_rtx (mode);
39295 emit_move_insn (res, operand1);
39296
39297 TWO52 = ix86_gen_TWO52 (mode);
39298 xa = ix86_expand_sse_fabs (res, &mask);
39299 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
39300
39301 /* load nextafter (0.5, 0.0) */
39302 fmt = REAL_MODE_FORMAT (mode);
39303 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39304 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39305
39306 /* xa = xa + 0.5 */
39307 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
39308 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
39309
39310 /* xa = (double)(int64_t)xa */
39311 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
39312 expand_fix (xi, xa, 0);
39313 expand_float (xa, xi, 0);
39314
39315 /* res = copysign (xa, operand1) */
39316 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
39317
39318 emit_label (label);
39319 LABEL_NUSES (label) = 1;
39320
39321 emit_move_insn (operand0, res);
39322 }
39323
39324 /* Expand SSE sequence for computing round
39325 from OP1 storing into OP0 using sse4 round insn. */
39326 void
39327 ix86_expand_round_sse4 (rtx op0, rtx op1)
39328 {
39329 enum machine_mode mode = GET_MODE (op0);
39330 rtx e1, e2, res, half;
39331 const struct real_format *fmt;
39332 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
39333 rtx (*gen_copysign) (rtx, rtx, rtx);
39334 rtx (*gen_round) (rtx, rtx, rtx);
39335
39336 switch (mode)
39337 {
39338 case SFmode:
39339 gen_copysign = gen_copysignsf3;
39340 gen_round = gen_sse4_1_roundsf2;
39341 break;
39342 case DFmode:
39343 gen_copysign = gen_copysigndf3;
39344 gen_round = gen_sse4_1_rounddf2;
39345 break;
39346 default:
39347 gcc_unreachable ();
39348 }
39349
39350 /* round (a) = trunc (a + copysign (0.5, a)) */
39351
39352 /* load nextafter (0.5, 0.0) */
39353 fmt = REAL_MODE_FORMAT (mode);
39354 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
39355 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
39356 half = const_double_from_real_value (pred_half, mode);
39357
39358 /* e1 = copysign (0.5, op1) */
39359 e1 = gen_reg_rtx (mode);
39360 emit_insn (gen_copysign (e1, half, op1));
39361
39362 /* e2 = op1 + e1 */
39363 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
39364
39365 /* res = trunc (e2) */
39366 res = gen_reg_rtx (mode);
39367 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
39368
39369 emit_move_insn (op0, res);
39370 }
39371 \f
39372
39373 /* Table of valid machine attributes. */
39374 static const struct attribute_spec ix86_attribute_table[] =
39375 {
39376 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
39377 affects_type_identity } */
39378 /* Stdcall attribute says callee is responsible for popping arguments
39379 if they are not variable. */
39380 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39381 true },
39382 /* Fastcall attribute says callee is responsible for popping arguments
39383 if they are not variable. */
39384 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39385 true },
39386 /* Thiscall attribute says callee is responsible for popping arguments
39387 if they are not variable. */
39388 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39389 true },
39390 /* Cdecl attribute says the callee is a normal C declaration */
39391 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39392 true },
39393 /* Regparm attribute specifies how many integer arguments are to be
39394 passed in registers. */
39395 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
39396 true },
39397 /* Sseregparm attribute says we are using x86_64 calling conventions
39398 for FP arguments. */
39399 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
39400 true },
39401 /* The transactional memory builtins are implicitly regparm or fastcall
39402 depending on the ABI. Override the generic do-nothing attribute that
39403 these builtins were declared with. */
39404 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
39405 true },
39406 /* force_align_arg_pointer says this function realigns the stack at entry. */
39407 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
39408 false, true, true, ix86_handle_cconv_attribute, false },
39409 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
39410 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
39411 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
39412 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
39413 false },
39414 #endif
39415 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39416 false },
39417 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
39418 false },
39419 #ifdef SUBTARGET_ATTRIBUTE_TABLE
39420 SUBTARGET_ATTRIBUTE_TABLE,
39421 #endif
39422 /* ms_abi and sysv_abi calling convention function attributes. */
39423 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39424 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
39425 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
39426 false },
39427 { "callee_pop_aggregate_return", 1, 1, false, true, true,
39428 ix86_handle_callee_pop_aggregate_return, true },
39429 /* End element. */
39430 { NULL, 0, 0, false, false, false, NULL, false }
39431 };
39432
39433 /* Implement targetm.vectorize.builtin_vectorization_cost. */
39434 static int
39435 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
39436 tree vectype,
39437 int misalign ATTRIBUTE_UNUSED)
39438 {
39439 unsigned elements;
39440
39441 switch (type_of_cost)
39442 {
39443 case scalar_stmt:
39444 return ix86_cost->scalar_stmt_cost;
39445
39446 case scalar_load:
39447 return ix86_cost->scalar_load_cost;
39448
39449 case scalar_store:
39450 return ix86_cost->scalar_store_cost;
39451
39452 case vector_stmt:
39453 return ix86_cost->vec_stmt_cost;
39454
39455 case vector_load:
39456 return ix86_cost->vec_align_load_cost;
39457
39458 case vector_store:
39459 return ix86_cost->vec_store_cost;
39460
39461 case vec_to_scalar:
39462 return ix86_cost->vec_to_scalar_cost;
39463
39464 case scalar_to_vec:
39465 return ix86_cost->scalar_to_vec_cost;
39466
39467 case unaligned_load:
39468 case unaligned_store:
39469 return ix86_cost->vec_unalign_load_cost;
39470
39471 case cond_branch_taken:
39472 return ix86_cost->cond_taken_branch_cost;
39473
39474 case cond_branch_not_taken:
39475 return ix86_cost->cond_not_taken_branch_cost;
39476
39477 case vec_perm:
39478 case vec_promote_demote:
39479 return ix86_cost->vec_stmt_cost;
39480
39481 case vec_construct:
39482 elements = TYPE_VECTOR_SUBPARTS (vectype);
39483 return elements / 2 + 1;
39484
39485 default:
39486 gcc_unreachable ();
39487 }
39488 }
39489
39490 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
39491 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
39492 insn every time. */
39493
39494 static GTY(()) rtx vselect_insn;
39495
39496 /* Initialize vselect_insn. */
39497
39498 static void
39499 init_vselect_insn (void)
39500 {
39501 unsigned i;
39502 rtx x;
39503
39504 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
39505 for (i = 0; i < MAX_VECT_LEN; ++i)
39506 XVECEXP (x, 0, i) = const0_rtx;
39507 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
39508 const0_rtx), x);
39509 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
39510 start_sequence ();
39511 vselect_insn = emit_insn (x);
39512 end_sequence ();
39513 }
39514
39515 /* Construct (set target (vec_select op0 (parallel perm))) and
39516 return true if that's a valid instruction in the active ISA. */
39517
39518 static bool
39519 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
39520 unsigned nelt, bool testing_p)
39521 {
39522 unsigned int i;
39523 rtx x, save_vconcat;
39524 int icode;
39525
39526 if (vselect_insn == NULL_RTX)
39527 init_vselect_insn ();
39528
39529 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
39530 PUT_NUM_ELEM (XVEC (x, 0), nelt);
39531 for (i = 0; i < nelt; ++i)
39532 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
39533 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39534 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
39535 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
39536 SET_DEST (PATTERN (vselect_insn)) = target;
39537 icode = recog_memoized (vselect_insn);
39538
39539 if (icode >= 0 && !testing_p)
39540 emit_insn (copy_rtx (PATTERN (vselect_insn)));
39541
39542 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
39543 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
39544 INSN_CODE (vselect_insn) = -1;
39545
39546 return icode >= 0;
39547 }
39548
39549 /* Similar, but generate a vec_concat from op0 and op1 as well. */
39550
39551 static bool
39552 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
39553 const unsigned char *perm, unsigned nelt,
39554 bool testing_p)
39555 {
39556 enum machine_mode v2mode;
39557 rtx x;
39558 bool ok;
39559
39560 if (vselect_insn == NULL_RTX)
39561 init_vselect_insn ();
39562
39563 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
39564 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
39565 PUT_MODE (x, v2mode);
39566 XEXP (x, 0) = op0;
39567 XEXP (x, 1) = op1;
39568 ok = expand_vselect (target, x, perm, nelt, testing_p);
39569 XEXP (x, 0) = const0_rtx;
39570 XEXP (x, 1) = const0_rtx;
39571 return ok;
39572 }
39573
39574 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39575 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
39576
39577 static bool
39578 expand_vec_perm_blend (struct expand_vec_perm_d *d)
39579 {
39580 enum machine_mode vmode = d->vmode;
39581 unsigned i, mask, nelt = d->nelt;
39582 rtx target, op0, op1, x;
39583 rtx rperm[32], vperm;
39584
39585 if (d->one_operand_p)
39586 return false;
39587 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
39588 ;
39589 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
39590 ;
39591 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
39592 ;
39593 else
39594 return false;
39595
39596 /* This is a blend, not a permute. Elements must stay in their
39597 respective lanes. */
39598 for (i = 0; i < nelt; ++i)
39599 {
39600 unsigned e = d->perm[i];
39601 if (!(e == i || e == i + nelt))
39602 return false;
39603 }
39604
39605 if (d->testing_p)
39606 return true;
39607
39608 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
39609 decision should be extracted elsewhere, so that we only try that
39610 sequence once all budget==3 options have been tried. */
39611 target = d->target;
39612 op0 = d->op0;
39613 op1 = d->op1;
39614 mask = 0;
39615
39616 switch (vmode)
39617 {
39618 case V4DFmode:
39619 case V8SFmode:
39620 case V2DFmode:
39621 case V4SFmode:
39622 case V8HImode:
39623 case V8SImode:
39624 for (i = 0; i < nelt; ++i)
39625 mask |= (d->perm[i] >= nelt) << i;
39626 break;
39627
39628 case V2DImode:
39629 for (i = 0; i < 2; ++i)
39630 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
39631 vmode = V8HImode;
39632 goto do_subreg;
39633
39634 case V4SImode:
39635 for (i = 0; i < 4; ++i)
39636 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39637 vmode = V8HImode;
39638 goto do_subreg;
39639
39640 case V16QImode:
39641 /* See if bytes move in pairs so we can use pblendw with
39642 an immediate argument, rather than pblendvb with a vector
39643 argument. */
39644 for (i = 0; i < 16; i += 2)
39645 if (d->perm[i] + 1 != d->perm[i + 1])
39646 {
39647 use_pblendvb:
39648 for (i = 0; i < nelt; ++i)
39649 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
39650
39651 finish_pblendvb:
39652 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
39653 vperm = force_reg (vmode, vperm);
39654
39655 if (GET_MODE_SIZE (vmode) == 16)
39656 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
39657 else
39658 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
39659 if (target != d->target)
39660 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39661 return true;
39662 }
39663
39664 for (i = 0; i < 8; ++i)
39665 mask |= (d->perm[i * 2] >= 16) << i;
39666 vmode = V8HImode;
39667 /* FALLTHRU */
39668
39669 do_subreg:
39670 target = gen_reg_rtx (vmode);
39671 op0 = gen_lowpart (vmode, op0);
39672 op1 = gen_lowpart (vmode, op1);
39673 break;
39674
39675 case V32QImode:
39676 /* See if bytes move in pairs. If not, vpblendvb must be used. */
39677 for (i = 0; i < 32; i += 2)
39678 if (d->perm[i] + 1 != d->perm[i + 1])
39679 goto use_pblendvb;
39680 /* See if bytes move in quadruplets. If yes, vpblendd
39681 with immediate can be used. */
39682 for (i = 0; i < 32; i += 4)
39683 if (d->perm[i] + 2 != d->perm[i + 2])
39684 break;
39685 if (i < 32)
39686 {
39687 /* See if bytes move the same in both lanes. If yes,
39688 vpblendw with immediate can be used. */
39689 for (i = 0; i < 16; i += 2)
39690 if (d->perm[i] + 16 != d->perm[i + 16])
39691 goto use_pblendvb;
39692
39693 /* Use vpblendw. */
39694 for (i = 0; i < 16; ++i)
39695 mask |= (d->perm[i * 2] >= 32) << i;
39696 vmode = V16HImode;
39697 goto do_subreg;
39698 }
39699
39700 /* Use vpblendd. */
39701 for (i = 0; i < 8; ++i)
39702 mask |= (d->perm[i * 4] >= 32) << i;
39703 vmode = V8SImode;
39704 goto do_subreg;
39705
39706 case V16HImode:
39707 /* See if words move in pairs. If yes, vpblendd can be used. */
39708 for (i = 0; i < 16; i += 2)
39709 if (d->perm[i] + 1 != d->perm[i + 1])
39710 break;
39711 if (i < 16)
39712 {
39713 /* See if words move the same in both lanes. If not,
39714 vpblendvb must be used. */
39715 for (i = 0; i < 8; i++)
39716 if (d->perm[i] + 8 != d->perm[i + 8])
39717 {
39718 /* Use vpblendvb. */
39719 for (i = 0; i < 32; ++i)
39720 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
39721
39722 vmode = V32QImode;
39723 nelt = 32;
39724 target = gen_reg_rtx (vmode);
39725 op0 = gen_lowpart (vmode, op0);
39726 op1 = gen_lowpart (vmode, op1);
39727 goto finish_pblendvb;
39728 }
39729
39730 /* Use vpblendw. */
39731 for (i = 0; i < 16; ++i)
39732 mask |= (d->perm[i] >= 16) << i;
39733 break;
39734 }
39735
39736 /* Use vpblendd. */
39737 for (i = 0; i < 8; ++i)
39738 mask |= (d->perm[i * 2] >= 16) << i;
39739 vmode = V8SImode;
39740 goto do_subreg;
39741
39742 case V4DImode:
39743 /* Use vpblendd. */
39744 for (i = 0; i < 4; ++i)
39745 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
39746 vmode = V8SImode;
39747 goto do_subreg;
39748
39749 default:
39750 gcc_unreachable ();
39751 }
39752
39753 /* This matches five different patterns, depending on the mode. */
39754 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
39755 x = gen_rtx_SET (VOIDmode, target, x);
39756 emit_insn (x);
39757 if (target != d->target)
39758 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39759
39760 return true;
39761 }
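
/* Illustrative aside, not from the original sources: the immediate MASK built
   above encodes, for each element position, which operand the blend takes it
   from.  In scalar terms the emitted vec_merge computes

       for (i = 0; i < nelt; i++)
         result[i] = (mask >> i) & 1 ? op1[i] : op0[i];

   which is why the permutation must keep every element in its own position
   (perm[i] == i or perm[i] == i + nelt). */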
39762
39763 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39764 in terms of the variable form of vpermilps.
39765
39766 Note that we will have already failed the immediate input vpermilps,
39767 which requires that the high and low part shuffle be identical; the
39768 variable form doesn't require that. */
39769
39770 static bool
39771 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
39772 {
39773 rtx rperm[8], vperm;
39774 unsigned i;
39775
39776 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
39777 return false;
39778
39779 /* We can only permute within the 128-bit lane. */
39780 for (i = 0; i < 8; ++i)
39781 {
39782 unsigned e = d->perm[i];
39783 if (i < 4 ? e >= 4 : e < 4)
39784 return false;
39785 }
39786
39787 if (d->testing_p)
39788 return true;
39789
39790 for (i = 0; i < 8; ++i)
39791 {
39792 unsigned e = d->perm[i];
39793
39794 /* Within each 128-bit lane, the elements of op0 are numbered
39795 from 0 and the elements of op1 are numbered from 4. */
39796 if (e >= 8 + 4)
39797 e -= 8;
39798 else if (e >= 4)
39799 e -= 4;
39800
39801 rperm[i] = GEN_INT (e);
39802 }
39803
39804 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
39805 vperm = force_reg (V8SImode, vperm);
39806 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
39807
39808 return true;
39809 }
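
/* Illustrative aside, not from the original sources: the variable vpermilps
   used above selects within each 128-bit lane.  With sel[] holding the low
   two bits of each control dword, the operation is roughly

       for (i = 0; i < 8; i++)
         result[i] = op0[(i & 4) + (sel[i] & 3)];

   which is why elements may not cross the lane boundary. */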
39810
39811 /* Return true if the permutation D can be performed as a VMODE
39812 permutation instead. */
39813
39814 static bool
39815 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
39816 {
39817 unsigned int i, j, chunk;
39818
39819 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
39820 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
39821 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
39822 return false;
39823
39824 if (GET_MODE_NUNITS (vmode) >= d->nelt)
39825 return true;
39826
39827 chunk = d->nelt / GET_MODE_NUNITS (vmode);
39828 for (i = 0; i < d->nelt; i += chunk)
39829 if (d->perm[i] & (chunk - 1))
39830 return false;
39831 else
39832 for (j = 1; j < chunk; ++j)
39833 if (d->perm[i] + j != d->perm[i + j])
39834 return false;
39835
39836 return true;
39837 }
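
/* Illustrative aside, not from the original sources: the check above accepts
   a narrow-element permutation exactly when it moves whole VMODE-sized
   chunks.  For example, the V16QImode permutation
   { 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 } moves aligned groups of four
   bytes and is therefore also the V4SImode permutation { 1, 0, 3, 2 }. */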
39838
39839 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39840 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
39841
39842 static bool
39843 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
39844 {
39845 unsigned i, nelt, eltsz, mask;
39846 unsigned char perm[32];
39847 enum machine_mode vmode = V16QImode;
39848 rtx rperm[32], vperm, target, op0, op1;
39849
39850 nelt = d->nelt;
39851
39852 if (!d->one_operand_p)
39853 {
39854 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
39855 {
39856 if (TARGET_AVX2
39857 && valid_perm_using_mode_p (V2TImode, d))
39858 {
39859 if (d->testing_p)
39860 return true;
39861
39862 /* Use vperm2i128 insn. The pattern uses
39863 V4DImode instead of V2TImode. */
39864 target = d->target;
39865 if (d->vmode != V4DImode)
39866 target = gen_reg_rtx (V4DImode);
39867 op0 = gen_lowpart (V4DImode, d->op0);
39868 op1 = gen_lowpart (V4DImode, d->op1);
39869 rperm[0]
39870 = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
39871 | ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
39872 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
39873 if (target != d->target)
39874 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39875 return true;
39876 }
39877 return false;
39878 }
39879 }
39880 else
39881 {
39882 if (GET_MODE_SIZE (d->vmode) == 16)
39883 {
39884 if (!TARGET_SSSE3)
39885 return false;
39886 }
39887 else if (GET_MODE_SIZE (d->vmode) == 32)
39888 {
39889 if (!TARGET_AVX2)
39890 return false;
39891
39892 /* V4DImode should be already handled through
39893 expand_vselect by vpermq instruction. */
39894 gcc_assert (d->vmode != V4DImode);
39895
39896 vmode = V32QImode;
39897 if (d->vmode == V8SImode
39898 || d->vmode == V16HImode
39899 || d->vmode == V32QImode)
39900 {
39901 /* First see if vpermq can be used for
39902 V8SImode/V16HImode/V32QImode. */
39903 if (valid_perm_using_mode_p (V4DImode, d))
39904 {
39905 for (i = 0; i < 4; i++)
39906 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
39907 if (d->testing_p)
39908 return true;
39909 target = gen_reg_rtx (V4DImode);
39910 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
39911 perm, 4, false))
39912 {
39913 emit_move_insn (d->target,
39914 gen_lowpart (d->vmode, target));
39915 return true;
39916 }
39917 return false;
39918 }
39919
39920 /* Next see if vpermd can be used. */
39921 if (valid_perm_using_mode_p (V8SImode, d))
39922 vmode = V8SImode;
39923 }
39924 /* Or if vpermps can be used. */
39925 else if (d->vmode == V8SFmode)
39926 vmode = V8SImode;
39927
39928 if (vmode == V32QImode)
39929 {
39930 /* vpshufb only works within 128-bit lanes; it is not
39931 possible to shuffle bytes between the lanes. */
39932 for (i = 0; i < nelt; ++i)
39933 if ((d->perm[i] ^ i) & (nelt / 2))
39934 return false;
39935 }
39936 }
39937 else
39938 return false;
39939 }
39940
39941 if (d->testing_p)
39942 return true;
39943
39944 if (vmode == V8SImode)
39945 for (i = 0; i < 8; ++i)
39946 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
39947 else
39948 {
39949 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39950 if (!d->one_operand_p)
39951 mask = 2 * nelt - 1;
39952 else if (vmode == V16QImode)
39953 mask = nelt - 1;
39954 else
39955 mask = nelt / 2 - 1;
39956
39957 for (i = 0; i < nelt; ++i)
39958 {
39959 unsigned j, e = d->perm[i] & mask;
39960 for (j = 0; j < eltsz; ++j)
39961 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
39962 }
39963 }
39964
39965 vperm = gen_rtx_CONST_VECTOR (vmode,
39966 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
39967 vperm = force_reg (vmode, vperm);
39968
39969 target = d->target;
39970 if (d->vmode != vmode)
39971 target = gen_reg_rtx (vmode);
39972 op0 = gen_lowpart (vmode, d->op0);
39973 if (d->one_operand_p)
39974 {
39975 if (vmode == V16QImode)
39976 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
39977 else if (vmode == V32QImode)
39978 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
39979 else if (vmode == V8SFmode)
39980 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
39981 else
39982 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
39983 }
39984 else
39985 {
39986 op1 = gen_lowpart (vmode, d->op1);
39987 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
39988 }
39989 if (target != d->target)
39990 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
39991
39992 return true;
39993 }
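
/* Illustrative aside, not from the original sources: the control vector built
   above expands each element index into ELTSZ consecutive byte indices,
   because pshufb (and vpperm) permute at byte granularity:

       for (i = 0; i < 16; i++)
         result[i] = ctrl[i] & 0x80 ? 0 : src[ctrl[i] & 0x0f];

   is the 128-bit pshufb operation; the AVX2 form applies it separately to
   each 128-bit lane, which is why cross-lane one-operand shuffles are
   rejected earlier. */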
39994
39995 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
39996 in a single instruction. */
39997
39998 static bool
39999 expand_vec_perm_1 (struct expand_vec_perm_d *d)
40000 {
40001 unsigned i, nelt = d->nelt;
40002 unsigned char perm2[MAX_VECT_LEN];
40003
40004 /* Check plain VEC_SELECT first, because AVX has instructions that could
40005 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
40006 input where SEL+CONCAT may not. */
40007 if (d->one_operand_p)
40008 {
40009 int mask = nelt - 1;
40010 bool identity_perm = true;
40011 bool broadcast_perm = true;
40012
40013 for (i = 0; i < nelt; i++)
40014 {
40015 perm2[i] = d->perm[i] & mask;
40016 if (perm2[i] != i)
40017 identity_perm = false;
40018 if (perm2[i])
40019 broadcast_perm = false;
40020 }
40021
40022 if (identity_perm)
40023 {
40024 if (!d->testing_p)
40025 emit_move_insn (d->target, d->op0);
40026 return true;
40027 }
40028 else if (broadcast_perm && TARGET_AVX2)
40029 {
40030 /* Use vpbroadcast{b,w,d}. */
40031 rtx (*gen) (rtx, rtx) = NULL;
40032 switch (d->vmode)
40033 {
40034 case V32QImode:
40035 gen = gen_avx2_pbroadcastv32qi_1;
40036 break;
40037 case V16HImode:
40038 gen = gen_avx2_pbroadcastv16hi_1;
40039 break;
40040 case V8SImode:
40041 gen = gen_avx2_pbroadcastv8si_1;
40042 break;
40043 case V16QImode:
40044 gen = gen_avx2_pbroadcastv16qi;
40045 break;
40046 case V8HImode:
40047 gen = gen_avx2_pbroadcastv8hi;
40048 break;
40049 case V8SFmode:
40050 gen = gen_avx2_vec_dupv8sf_1;
40051 break;
40052 /* For other modes, prefer the other shuffles this function creates. */
40053 default: break;
40054 }
40055 if (gen != NULL)
40056 {
40057 if (!d->testing_p)
40058 emit_insn (gen (d->target, d->op0));
40059 return true;
40060 }
40061 }
40062
40063 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
40064 return true;
40065
40066 /* There are plenty of patterns in sse.md that are written for
40067 SEL+CONCAT and are not replicated for a single op. Perhaps
40068 that should be changed, to avoid the nastiness here. */
40069
40070 /* Recognize interleave style patterns, which means incrementing
40071 every other permutation operand. */
40072 for (i = 0; i < nelt; i += 2)
40073 {
40074 perm2[i] = d->perm[i] & mask;
40075 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
40076 }
40077 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40078 d->testing_p))
40079 return true;
40080
40081 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
40082 if (nelt >= 4)
40083 {
40084 for (i = 0; i < nelt; i += 4)
40085 {
40086 perm2[i + 0] = d->perm[i + 0] & mask;
40087 perm2[i + 1] = d->perm[i + 1] & mask;
40088 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
40089 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
40090 }
40091
40092 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
40093 d->testing_p))
40094 return true;
40095 }
40096 }
40097
40098 /* Finally, try the fully general two operand permute. */
40099 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
40100 d->testing_p))
40101 return true;
40102
40103 /* Recognize interleave style patterns with reversed operands. */
40104 if (!d->one_operand_p)
40105 {
40106 for (i = 0; i < nelt; ++i)
40107 {
40108 unsigned e = d->perm[i];
40109 if (e >= nelt)
40110 e -= nelt;
40111 else
40112 e += nelt;
40113 perm2[i] = e;
40114 }
40115
40116 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
40117 d->testing_p))
40118 return true;
40119 }
40120
40121 /* Try the SSE4.1 blend variable merge instructions. */
40122 if (expand_vec_perm_blend (d))
40123 return true;
40124
40125 /* Try one of the AVX vpermil variable permutations. */
40126 if (expand_vec_perm_vpermil (d))
40127 return true;
40128
40129 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
40130 vpshufb, vpermd, vpermps or vpermq variable permutation. */
40131 if (expand_vec_perm_pshufb (d))
40132 return true;
40133
40134 return false;
40135 }
40136
40137 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
40138 in terms of a pair of pshuflw + pshufhw instructions. */
40139
40140 static bool
40141 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
40142 {
40143 unsigned char perm2[MAX_VECT_LEN];
40144 unsigned i;
40145 bool ok;
40146
40147 if (d->vmode != V8HImode || !d->one_operand_p)
40148 return false;
40149
40150 /* The two permutations only operate in 64-bit lanes. */
40151 for (i = 0; i < 4; ++i)
40152 if (d->perm[i] >= 4)
40153 return false;
40154 for (i = 4; i < 8; ++i)
40155 if (d->perm[i] < 4)
40156 return false;
40157
40158 if (d->testing_p)
40159 return true;
40160
40161 /* Emit the pshuflw. */
40162 memcpy (perm2, d->perm, 4);
40163 for (i = 4; i < 8; ++i)
40164 perm2[i] = i;
40165 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
40166 gcc_assert (ok);
40167
40168 /* Emit the pshufhw. */
40169 memcpy (perm2 + 4, d->perm + 4, 4);
40170 for (i = 0; i < 4; ++i)
40171 perm2[i] = i;
40172 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
40173 gcc_assert (ok);
40174
40175 return true;
40176 }
40177
40178 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40179 the permutation using the SSSE3 palignr instruction. This succeeds
40180 when all of the elements in PERM fit within one vector and we merely
40181 need to shift them down so that a single vector permutation has a
40182 chance to succeed. */
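/* For example, if every d->perm[i] lies in the range 5 .. 5 + nelt - 1 of
the (op0, op1) concatenation, the palignr below shifts element 5 into
slot 0 and the remaining single-operand shuffle only needs indices
0 .. nelt - 1. */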
40183
40184 static bool
40185 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
40186 {
40187 unsigned i, nelt = d->nelt;
40188 unsigned min, max;
40189 bool in_order, ok;
40190 rtx shift, target;
40191 struct expand_vec_perm_d dcopy;
40192
40193 /* Even with AVX, palignr only operates on 128-bit vectors. */
40194 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40195 return false;
40196
40197 min = nelt, max = 0;
40198 for (i = 0; i < nelt; ++i)
40199 {
40200 unsigned e = d->perm[i];
40201 if (e < min)
40202 min = e;
40203 if (e > max)
40204 max = e;
40205 }
40206 if (min == 0 || max - min >= nelt)
40207 return false;
40208
40209 /* Given that we have SSSE3, we know we'll be able to implement the
40210 single operand permutation after the palignr with pshufb. */
40211 if (d->testing_p)
40212 return true;
40213
40214 dcopy = *d;
40215 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
40216 target = gen_reg_rtx (TImode);
40217 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
40218 gen_lowpart (TImode, d->op0), shift));
40219
40220 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
40221 dcopy.one_operand_p = true;
40222
40223 in_order = true;
40224 for (i = 0; i < nelt; ++i)
40225 {
40226 unsigned e = dcopy.perm[i] - min;
40227 if (e != i)
40228 in_order = false;
40229 dcopy.perm[i] = e;
40230 }
40231
40232 /* Test for the degenerate case where the alignment by itself
40233 produces the desired permutation. */
40234 if (in_order)
40235 {
40236 emit_move_insn (d->target, dcopy.op0);
40237 return true;
40238 }
40239
40240 ok = expand_vec_perm_1 (&dcopy);
40241 gcc_assert (ok);
40242
40243 return ok;
40244 }
40245
40246 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
40247
40248 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40249 a two vector permutation into a single vector permutation by using
40250 an interleave operation to merge the vectors. */
40251
40252 static bool
40253 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
40254 {
40255 struct expand_vec_perm_d dremap, dfinal;
40256 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
40257 unsigned HOST_WIDE_INT contents;
40258 unsigned char remap[2 * MAX_VECT_LEN];
40259 rtx seq;
40260 bool ok, same_halves = false;
40261
40262 if (GET_MODE_SIZE (d->vmode) == 16)
40263 {
40264 if (d->one_operand_p)
40265 return false;
40266 }
40267 else if (GET_MODE_SIZE (d->vmode) == 32)
40268 {
40269 if (!TARGET_AVX)
40270 return false;
40271 /* For 32-byte modes allow even d->one_operand_p.
40272 The lack of cross-lane shuffling in some instructions
40273 might prevent a single insn shuffle. */
40274 dfinal = *d;
40275 dfinal.testing_p = true;
40276 /* If expand_vec_perm_interleave3 can expand this into
40277 a 3 insn sequence, give up and let it be expanded that
40278 way. While that is one insn longer, it doesn't need a
40279 memory operand, and in the common case where both the
40280 interleave low and interleave high permutations with the
40281 same operands are adjacent it needs only 4 insns for both
40282 after CSE. */
40283 if (expand_vec_perm_interleave3 (&dfinal))
40284 return false;
40285 }
40286 else
40287 return false;
40288
40289 /* Examine from whence the elements come. */
40290 contents = 0;
40291 for (i = 0; i < nelt; ++i)
40292 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
40293
40294 memset (remap, 0xff, sizeof (remap));
40295 dremap = *d;
40296
40297 if (GET_MODE_SIZE (d->vmode) == 16)
40298 {
40299 unsigned HOST_WIDE_INT h1, h2, h3, h4;
40300
40301 /* Split the two input vectors into 4 halves. */
40302 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
40303 h2 = h1 << nelt2;
40304 h3 = h2 << nelt2;
40305 h4 = h3 << nelt2;
40306
40307 /* If the elements are all from the low halves, use interleave low; do
40308 likewise for interleave high. If the elements are from mis-matched
40309 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
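/* For example, with nelt == 4 the masks are h1 == 0x03 (low half of op0),
h2 == 0x0c (high half of op0), h3 == 0x30 (low half of op1) and
h4 == 0xc0 (high half of op1). */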
40310 if ((contents & (h1 | h3)) == contents)
40311 {
40312 /* punpckl* */
40313 for (i = 0; i < nelt2; ++i)
40314 {
40315 remap[i] = i * 2;
40316 remap[i + nelt] = i * 2 + 1;
40317 dremap.perm[i * 2] = i;
40318 dremap.perm[i * 2 + 1] = i + nelt;
40319 }
40320 if (!TARGET_SSE2 && d->vmode == V4SImode)
40321 dremap.vmode = V4SFmode;
40322 }
40323 else if ((contents & (h2 | h4)) == contents)
40324 {
40325 /* punpckh* */
40326 for (i = 0; i < nelt2; ++i)
40327 {
40328 remap[i + nelt2] = i * 2;
40329 remap[i + nelt + nelt2] = i * 2 + 1;
40330 dremap.perm[i * 2] = i + nelt2;
40331 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
40332 }
40333 if (!TARGET_SSE2 && d->vmode == V4SImode)
40334 dremap.vmode = V4SFmode;
40335 }
40336 else if ((contents & (h1 | h4)) == contents)
40337 {
40338 /* shufps */
40339 for (i = 0; i < nelt2; ++i)
40340 {
40341 remap[i] = i;
40342 remap[i + nelt + nelt2] = i + nelt2;
40343 dremap.perm[i] = i;
40344 dremap.perm[i + nelt2] = i + nelt + nelt2;
40345 }
40346 if (nelt != 4)
40347 {
40348 /* shufpd */
40349 dremap.vmode = V2DImode;
40350 dremap.nelt = 2;
40351 dremap.perm[0] = 0;
40352 dremap.perm[1] = 3;
40353 }
40354 }
40355 else if ((contents & (h2 | h3)) == contents)
40356 {
40357 /* shufps */
40358 for (i = 0; i < nelt2; ++i)
40359 {
40360 remap[i + nelt2] = i;
40361 remap[i + nelt] = i + nelt2;
40362 dremap.perm[i] = i + nelt2;
40363 dremap.perm[i + nelt2] = i + nelt;
40364 }
40365 if (nelt != 4)
40366 {
40367 /* shufpd */
40368 dremap.vmode = V2DImode;
40369 dremap.nelt = 2;
40370 dremap.perm[0] = 1;
40371 dremap.perm[1] = 2;
40372 }
40373 }
40374 else
40375 return false;
40376 }
40377 else
40378 {
40379 unsigned int nelt4 = nelt / 4, nzcnt = 0;
40380 unsigned HOST_WIDE_INT q[8];
40381 unsigned int nonzero_halves[4];
40382
40383 /* Split the two input vectors into 8 quarters. */
40384 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
40385 for (i = 1; i < 8; ++i)
40386 q[i] = q[0] << (nelt4 * i);
40387 for (i = 0; i < 4; ++i)
40388 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
40389 {
40390 nonzero_halves[nzcnt] = i;
40391 ++nzcnt;
40392 }
40393
40394 if (nzcnt == 1)
40395 {
40396 gcc_assert (d->one_operand_p);
40397 nonzero_halves[1] = nonzero_halves[0];
40398 same_halves = true;
40399 }
40400 else if (d->one_operand_p)
40401 {
40402 gcc_assert (nonzero_halves[0] == 0);
40403 gcc_assert (nonzero_halves[1] == 1);
40404 }
40405
40406 if (nzcnt <= 2)
40407 {
40408 if (d->perm[0] / nelt2 == nonzero_halves[1])
40409 {
40410 /* Attempt to increase the likelihood that dfinal
40411 shuffle will be intra-lane. */
40412 char tmph = nonzero_halves[0];
40413 nonzero_halves[0] = nonzero_halves[1];
40414 nonzero_halves[1] = tmph;
40415 }
40416
40417 /* vperm2f128 or vperm2i128. */
40418 for (i = 0; i < nelt2; ++i)
40419 {
40420 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
40421 remap[i + nonzero_halves[0] * nelt2] = i;
40422 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
40423 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
40424 }
40425
40426 if (d->vmode != V8SFmode
40427 && d->vmode != V4DFmode
40428 && d->vmode != V8SImode)
40429 {
40430 dremap.vmode = V8SImode;
40431 dremap.nelt = 8;
40432 for (i = 0; i < 4; ++i)
40433 {
40434 dremap.perm[i] = i + nonzero_halves[0] * 4;
40435 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
40436 }
40437 }
40438 }
40439 else if (d->one_operand_p)
40440 return false;
40441 else if (TARGET_AVX2
40442 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
40443 {
40444 /* vpunpckl* */
40445 for (i = 0; i < nelt4; ++i)
40446 {
40447 remap[i] = i * 2;
40448 remap[i + nelt] = i * 2 + 1;
40449 remap[i + nelt2] = i * 2 + nelt2;
40450 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
40451 dremap.perm[i * 2] = i;
40452 dremap.perm[i * 2 + 1] = i + nelt;
40453 dremap.perm[i * 2 + nelt2] = i + nelt2;
40454 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
40455 }
40456 }
40457 else if (TARGET_AVX2
40458 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
40459 {
40460 /* vpunpckh* */
40461 for (i = 0; i < nelt4; ++i)
40462 {
40463 remap[i + nelt4] = i * 2;
40464 remap[i + nelt + nelt4] = i * 2 + 1;
40465 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
40466 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
40467 dremap.perm[i * 2] = i + nelt4;
40468 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
40469 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
40470 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
40471 }
40472 }
40473 else
40474 return false;
40475 }
40476
40477 /* Use the remapping array set up above to move the elements from their
40478 swizzled locations into their final destinations. */
40479 dfinal = *d;
40480 for (i = 0; i < nelt; ++i)
40481 {
40482 unsigned e = remap[d->perm[i]];
40483 gcc_assert (e < nelt);
40484 /* If same_halves is true, both halves of the remapped vector are the
40485 same. Avoid cross-lane accesses if possible. */
40486 if (same_halves && i >= nelt2)
40487 {
40488 gcc_assert (e < nelt2);
40489 dfinal.perm[i] = e + nelt2;
40490 }
40491 else
40492 dfinal.perm[i] = e;
40493 }
40494 dremap.target = gen_reg_rtx (dremap.vmode);
40495 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40496 dfinal.op1 = dfinal.op0;
40497 dfinal.one_operand_p = true;
40498
40499 /* Test if the final remap can be done with a single insn. For V4SFmode or
40500 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
40501 start_sequence ();
40502 ok = expand_vec_perm_1 (&dfinal);
40503 seq = get_insns ();
40504 end_sequence ();
40505
40506 if (!ok)
40507 return false;
40508
40509 if (d->testing_p)
40510 return true;
40511
40512 if (dremap.vmode != dfinal.vmode)
40513 {
40514 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
40515 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
40516 }
40517
40518 ok = expand_vec_perm_1 (&dremap);
40519 gcc_assert (ok);
40520
40521 emit_insn (seq);
40522 return true;
40523 }
40524
40525 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40526 a single vector cross-lane permutation into vpermq followed
40527 by any of the single insn permutations. */
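/* For example, if the low half of the result only uses elements from
64-bit quarters 1 and 3 of the input and the high half only uses
quarter 0, the vpermq below gathers quarters {1, 3, 0, 0} and the
final in-lane shuffle then indexes within the gathered quarters. */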
40528
40529 static bool
40530 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
40531 {
40532 struct expand_vec_perm_d dremap, dfinal;
40533 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
40534 unsigned contents[2];
40535 bool ok;
40536
40537 if (!(TARGET_AVX2
40538 && (d->vmode == V32QImode || d->vmode == V16HImode)
40539 && d->one_operand_p))
40540 return false;
40541
40542 contents[0] = 0;
40543 contents[1] = 0;
40544 for (i = 0; i < nelt2; ++i)
40545 {
40546 contents[0] |= 1u << (d->perm[i] / nelt4);
40547 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
40548 }
40549
40550 for (i = 0; i < 2; ++i)
40551 {
40552 unsigned int cnt = 0;
40553 for (j = 0; j < 4; ++j)
40554 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
40555 return false;
40556 }
40557
40558 if (d->testing_p)
40559 return true;
40560
40561 dremap = *d;
40562 dremap.vmode = V4DImode;
40563 dremap.nelt = 4;
40564 dremap.target = gen_reg_rtx (V4DImode);
40565 dremap.op0 = gen_lowpart (V4DImode, d->op0);
40566 dremap.op1 = dremap.op0;
40567 dremap.one_operand_p = true;
40568 for (i = 0; i < 2; ++i)
40569 {
40570 unsigned int cnt = 0;
40571 for (j = 0; j < 4; ++j)
40572 if ((contents[i] & (1u << j)) != 0)
40573 dremap.perm[2 * i + cnt++] = j;
40574 for (; cnt < 2; ++cnt)
40575 dremap.perm[2 * i + cnt] = 0;
40576 }
40577
40578 dfinal = *d;
40579 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
40580 dfinal.op1 = dfinal.op0;
40581 dfinal.one_operand_p = true;
40582 for (i = 0, j = 0; i < nelt; ++i)
40583 {
40584 if (i == nelt2)
40585 j = 2;
40586 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
40587 if ((d->perm[i] / nelt4) == dremap.perm[j])
40588 ;
40589 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
40590 dfinal.perm[i] |= nelt4;
40591 else
40592 gcc_unreachable ();
40593 }
40594
40595 ok = expand_vec_perm_1 (&dremap);
40596 gcc_assert (ok);
40597
40598 ok = expand_vec_perm_1 (&dfinal);
40599 gcc_assert (ok);
40600
40601 return true;
40602 }
40603
40604 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
40605 a vector permutation using two instructions, vperm2f128 resp.
40606 vperm2i128 followed by any single in-lane permutation. */
40607
40608 static bool
40609 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
40610 {
40611 struct expand_vec_perm_d dfirst, dsecond;
40612 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
40613 bool ok;
40614
40615 if (!TARGET_AVX
40616 || GET_MODE_SIZE (d->vmode) != 32
40617 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
40618 return false;
40619
40620 dsecond = *d;
40621 dsecond.one_operand_p = false;
40622 dsecond.testing_p = true;
40623
40624 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
40625 immediate. For perm < 16 the second permutation uses
40626 d->op0 as first operand, for perm >= 16 it uses d->op1
40627 as first operand. The second operand is the result of
40628 vperm2[fi]128. */
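/* For example, perm == 6 (binary 01 10) asks for op1's low lane in the
result's low lane and op0's high lane in the result's high lane; the
corresponding immediate is ((6 << 2) | 6) & 0x33 == 0x12. */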
40629 for (perm = 0; perm < 32; perm++)
40630 {
40631 /* Ignore permutations which do not move anything cross-lane. */
40632 if (perm < 16)
40633 {
40634 /* The second shuffle for e.g. V4DFmode has
40635 0123 and ABCD operands.
40636 Ignore AB23, as 23 is already in the second lane
40637 of the first operand. */
40638 if ((perm & 0xc) == (1 << 2)) continue;
40639 /* And 01CD, as 01 is in the first lane of the first
40640 operand. */
40641 if ((perm & 3) == 0) continue;
40642 /* And 4567, as then the vperm2[fi]128 doesn't change
40643 anything on the original 4567 second operand. */
40644 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
40645 }
40646 else
40647 {
40648 /* The second shuffle for e.g. V4DFmode has
40649 4567 and ABCD operands.
40650 Ignore AB67, as 67 is already in the second lane
40651 of the first operand. */
40652 if ((perm & 0xc) == (3 << 2)) continue;
40653 /* And 45CD, as 45 is in the first lane of the first
40654 operand. */
40655 if ((perm & 3) == 2) continue;
40656 /* And 0123, as then the vperm2[fi]128 doesn't change
40657 anything on the original 0123 first operand. */
40658 if ((perm & 0xf) == (1 << 2)) continue;
40659 }
40660
40661 for (i = 0; i < nelt; i++)
40662 {
40663 j = d->perm[i] / nelt2;
40664 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
40665 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
40666 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
40667 dsecond.perm[i] = d->perm[i] & (nelt - 1);
40668 else
40669 break;
40670 }
40671
40672 if (i == nelt)
40673 {
40674 start_sequence ();
40675 ok = expand_vec_perm_1 (&dsecond);
40676 end_sequence ();
40677 }
40678 else
40679 ok = false;
40680
40681 if (ok)
40682 {
40683 if (d->testing_p)
40684 return true;
40685
40686 /* Found a usable second shuffle. dfirst will be
40687 vperm2f128 on d->op0 and d->op1. */
40688 dsecond.testing_p = false;
40689 dfirst = *d;
40690 dfirst.target = gen_reg_rtx (d->vmode);
40691 for (i = 0; i < nelt; i++)
40692 dfirst.perm[i] = (i & (nelt2 - 1))
40693 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
40694
40695 ok = expand_vec_perm_1 (&dfirst);
40696 gcc_assert (ok);
40697
40698 /* And dsecond is some single insn shuffle, taking
40699 d->op0 and result of vperm2f128 (if perm < 16) or
40700 d->op1 and result of vperm2f128 (otherwise). */
40701 dsecond.op1 = dfirst.target;
40702 if (perm >= 16)
40703 dsecond.op0 = dfirst.op1;
40704
40705 ok = expand_vec_perm_1 (&dsecond);
40706 gcc_assert (ok);
40707
40708 return true;
40709 }
40710
40711 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
40712 if (d->one_operand_p)
40713 return false;
40714 }
40715
40716 return false;
40717 }
40718
40719 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
40720 a two vector permutation using 2 intra-lane interleave insns
40721 and cross-lane shuffle for 32-byte vectors. */
40722
40723 static bool
40724 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
40725 {
40726 unsigned i, nelt;
40727 rtx (*gen) (rtx, rtx, rtx);
40728
40729 if (d->one_operand_p)
40730 return false;
40731 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
40732 ;
40733 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
40734 ;
40735 else
40736 return false;
40737
40738 nelt = d->nelt;
40739 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
40740 return false;
40741 for (i = 0; i < nelt; i += 2)
40742 if (d->perm[i] != d->perm[0] + i / 2
40743 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
40744 return false;
40745
40746 if (d->testing_p)
40747 return true;
40748
40749 switch (d->vmode)
40750 {
40751 case V32QImode:
40752 if (d->perm[0])
40753 gen = gen_vec_interleave_highv32qi;
40754 else
40755 gen = gen_vec_interleave_lowv32qi;
40756 break;
40757 case V16HImode:
40758 if (d->perm[0])
40759 gen = gen_vec_interleave_highv16hi;
40760 else
40761 gen = gen_vec_interleave_lowv16hi;
40762 break;
40763 case V8SImode:
40764 if (d->perm[0])
40765 gen = gen_vec_interleave_highv8si;
40766 else
40767 gen = gen_vec_interleave_lowv8si;
40768 break;
40769 case V4DImode:
40770 if (d->perm[0])
40771 gen = gen_vec_interleave_highv4di;
40772 else
40773 gen = gen_vec_interleave_lowv4di;
40774 break;
40775 case V8SFmode:
40776 if (d->perm[0])
40777 gen = gen_vec_interleave_highv8sf;
40778 else
40779 gen = gen_vec_interleave_lowv8sf;
40780 break;
40781 case V4DFmode:
40782 if (d->perm[0])
40783 gen = gen_vec_interleave_highv4df;
40784 else
40785 gen = gen_vec_interleave_lowv4df;
40786 break;
40787 default:
40788 gcc_unreachable ();
40789 }
40790
40791 emit_insn (gen (d->target, d->op0, d->op1));
40792 return true;
40793 }
40794
40795 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
40796 a single vector permutation using a single intra-lane vector
40797 permutation, vperm2f128 swapping the lanes and vblend* insn blending
40798 the non-swapped and swapped vectors together. */
40799
40800 static bool
40801 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
40802 {
40803 struct expand_vec_perm_d dfirst, dsecond;
40804 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
40805 rtx seq;
40806 bool ok;
40807 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
40808
40809 if (!TARGET_AVX
40810 || TARGET_AVX2
40811 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
40812 || !d->one_operand_p)
40813 return false;
40814
40815 dfirst = *d;
40816 for (i = 0; i < nelt; i++)
40817 dfirst.perm[i] = 0xff;
40818 for (i = 0, msk = 0; i < nelt; i++)
40819 {
40820 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
40821 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
40822 return false;
40823 dfirst.perm[j] = d->perm[i];
40824 if (j != i)
40825 msk |= (1 << i);
40826 }
40827 for (i = 0; i < nelt; i++)
40828 if (dfirst.perm[i] == 0xff)
40829 dfirst.perm[i] = i;
40830
40831 if (!d->testing_p)
40832 dfirst.target = gen_reg_rtx (dfirst.vmode);
40833
40834 start_sequence ();
40835 ok = expand_vec_perm_1 (&dfirst);
40836 seq = get_insns ();
40837 end_sequence ();
40838
40839 if (!ok)
40840 return false;
40841
40842 if (d->testing_p)
40843 return true;
40844
40845 emit_insn (seq);
40846
40847 dsecond = *d;
40848 dsecond.op0 = dfirst.target;
40849 dsecond.op1 = dfirst.target;
40850 dsecond.one_operand_p = true;
40851 dsecond.target = gen_reg_rtx (dsecond.vmode);
40852 for (i = 0; i < nelt; i++)
40853 dsecond.perm[i] = i ^ nelt2;
40854
40855 ok = expand_vec_perm_1 (&dsecond);
40856 gcc_assert (ok);
40857
40858 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
40859 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
40860 return true;
40861 }
40862
40863 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
40864 permutation using two vperm2f128, followed by a vshufpd insn blending
40865 the two vectors together. */
40866
40867 static bool
40868 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
40869 {
40870 struct expand_vec_perm_d dfirst, dsecond, dthird;
40871 bool ok;
40872
40873 if (!TARGET_AVX || (d->vmode != V4DFmode))
40874 return false;
40875
40876 if (d->testing_p)
40877 return true;
40878
40879 dfirst = *d;
40880 dsecond = *d;
40881 dthird = *d;
40882
40883 dfirst.perm[0] = (d->perm[0] & ~1);
40884 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
40885 dfirst.perm[2] = (d->perm[2] & ~1);
40886 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
40887 dsecond.perm[0] = (d->perm[1] & ~1);
40888 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
40889 dsecond.perm[2] = (d->perm[3] & ~1);
40890 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
40891 dthird.perm[0] = (d->perm[0] % 2);
40892 dthird.perm[1] = (d->perm[1] % 2) + 4;
40893 dthird.perm[2] = (d->perm[2] % 2) + 2;
40894 dthird.perm[3] = (d->perm[3] % 2) + 6;
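/* For example, for d->perm == {2, 5, 1, 6} this sets up
dfirst.perm == {2, 3, 0, 1}, dsecond.perm == {4, 5, 6, 7} and
dthird.perm == {0, 5, 3, 6}; the first two are vperm2f128s and the
last is a single vshufpd. */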
40895
40896 dfirst.target = gen_reg_rtx (dfirst.vmode);
40897 dsecond.target = gen_reg_rtx (dsecond.vmode);
40898 dthird.op0 = dfirst.target;
40899 dthird.op1 = dsecond.target;
40900 dthird.one_operand_p = false;
40901
40902 canonicalize_perm (&dfirst);
40903 canonicalize_perm (&dsecond);
40904
40905 ok = expand_vec_perm_1 (&dfirst)
40906 && expand_vec_perm_1 (&dsecond)
40907 && expand_vec_perm_1 (&dthird);
40908
40909 gcc_assert (ok);
40910
40911 return true;
40912 }
40913
40914 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
40915 permutation with two pshufb insns and an ior. We should have already
40916 failed all two instruction sequences. */
40917
40918 static bool
40919 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
40920 {
40921 rtx rperm[2][16], vperm, l, h, op, m128;
40922 unsigned int i, nelt, eltsz;
40923
40924 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
40925 return false;
40926 gcc_assert (!d->one_operand_p);
40927
40928 nelt = d->nelt;
40929 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40930
40931 /* Generate two permutation masks. If the required element is within
40932 the given vector it is shuffled into the proper lane. If the required
40933 element is in the other vector, force a zero into the lane by setting
40934 bit 7 in the permutation mask. */
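/* For example, for V16QImode with d->perm[0] == 20, element 20 is byte 4
of op1, so the op1 mask gets index 4 in slot 0 while the op0 mask gets
-128 there; the pshufb of op0 then produces a zero byte and the final
ior keeps the byte produced from op1. */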
40935 m128 = GEN_INT (-128);
40936 for (i = 0; i < nelt; ++i)
40937 {
40938 unsigned j, e = d->perm[i];
40939 unsigned which = (e >= nelt);
40940 if (e >= nelt)
40941 e -= nelt;
40942
40943 for (j = 0; j < eltsz; ++j)
40944 {
40945 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
40946 rperm[1-which][i*eltsz + j] = m128;
40947 }
40948 }
40949
40950 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
40951 vperm = force_reg (V16QImode, vperm);
40952
40953 l = gen_reg_rtx (V16QImode);
40954 op = gen_lowpart (V16QImode, d->op0);
40955 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
40956
40957 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
40958 vperm = force_reg (V16QImode, vperm);
40959
40960 h = gen_reg_rtx (V16QImode);
40961 op = gen_lowpart (V16QImode, d->op1);
40962 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
40963
40964 op = d->target;
40965 if (d->vmode != V16QImode)
40966 op = gen_reg_rtx (V16QImode);
40967 emit_insn (gen_iorv16qi3 (op, l, h));
40968 if (op != d->target)
40969 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
40970
40971 return true;
40972 }
40973
40974 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
40975 with two vpshufb insns, vpermq and vpor. We should have already failed
40976 all two or three instruction sequences. */
40977
40978 static bool
40979 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
40980 {
40981 rtx rperm[2][32], vperm, l, h, hp, op, m128;
40982 unsigned int i, nelt, eltsz;
40983
40984 if (!TARGET_AVX2
40985 || !d->one_operand_p
40986 || (d->vmode != V32QImode && d->vmode != V16HImode))
40987 return false;
40988
40989 if (d->testing_p)
40990 return true;
40991
40992 nelt = d->nelt;
40993 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40994
40995 /* Generate two permutation masks. If the required element is within
40996 the same lane, it is shuffled in directly. If the required element is
40997 from the other lane, force a zero by setting bit 7 in the permutation
40998 mask. The other mask gets a non-negative entry for each element that
40999 is requested from the other lane, but placed in the opposite lane, so
41000 that once the two V2TImode halves of that vpshufb result are swapped
41001 the element lands in the requested position. */
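/* For example, for V32QImode with d->perm[0] == 17 (a cross-lane request)
the mask used for the swapped shuffle places index 1 at byte 16, so that
vpshufb, which works within 128-bit lanes, fetches byte 17 of op0 there;
the lane swap below then moves that byte into slot 0, where the other
mask has forced a zero. */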
41002 m128 = GEN_INT (-128);
41003 for (i = 0; i < nelt; ++i)
41004 {
41005 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41006 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41007
41008 for (j = 0; j < eltsz; ++j)
41009 {
41010 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
41011 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
41012 }
41013 }
41014
41015 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41016 vperm = force_reg (V32QImode, vperm);
41017
41018 h = gen_reg_rtx (V32QImode);
41019 op = gen_lowpart (V32QImode, d->op0);
41020 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41021
41022 /* Swap the 128-bit lanes of h into hp. */
41023 hp = gen_reg_rtx (V4DImode);
41024 op = gen_lowpart (V4DImode, h);
41025 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
41026 const1_rtx));
41027
41028 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41029 vperm = force_reg (V32QImode, vperm);
41030
41031 l = gen_reg_rtx (V32QImode);
41032 op = gen_lowpart (V32QImode, d->op0);
41033 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41034
41035 op = d->target;
41036 if (d->vmode != V32QImode)
41037 op = gen_reg_rtx (V32QImode);
41038 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
41039 if (op != d->target)
41040 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41041
41042 return true;
41043 }
41044
41045 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
41046 and extract-odd permutations of two V32QImode or V16HImode operands
41047 with two vpshufb insns, vpor and vpermq. We should have already
41048 failed all two or three instruction sequences. */
41049
41050 static bool
41051 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
41052 {
41053 rtx rperm[2][32], vperm, l, h, ior, op, m128;
41054 unsigned int i, nelt, eltsz;
41055
41056 if (!TARGET_AVX2
41057 || d->one_operand_p
41058 || (d->vmode != V32QImode && d->vmode != V16HImode))
41059 return false;
41060
41061 for (i = 0; i < d->nelt; ++i)
41062 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
41063 return false;
41064
41065 if (d->testing_p)
41066 return true;
41067
41068 nelt = d->nelt;
41069 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41070
41071 /* Generate two permutation masks. In the first permutation mask
41072 the first quarter will contain indexes for the first half
41073 of the op0, the second quarter will contain bit 7 set, third quarter
41074 will contain indexes for the second half of the op0 and the
41075 last quarter bit 7 set. In the second permutation mask
41076 the first quarter will contain bit 7 set, the second quarter
41077 indexes for the first half of the op1, the third quarter bit 7 set
41078 and last quarter indexes for the second half of the op1.
41079 I.e. the first mask e.g. for V32QImode extract even will be:
41080 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
41081 (all values masked with 0xf except for -128) and second mask
41082 for extract even will be
41083 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
41084 m128 = GEN_INT (-128);
41085 for (i = 0; i < nelt; ++i)
41086 {
41087 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41088 unsigned which = d->perm[i] >= nelt;
41089 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
41090
41091 for (j = 0; j < eltsz; ++j)
41092 {
41093 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
41094 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
41095 }
41096 }
41097
41098 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
41099 vperm = force_reg (V32QImode, vperm);
41100
41101 l = gen_reg_rtx (V32QImode);
41102 op = gen_lowpart (V32QImode, d->op0);
41103 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
41104
41105 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
41106 vperm = force_reg (V32QImode, vperm);
41107
41108 h = gen_reg_rtx (V32QImode);
41109 op = gen_lowpart (V32QImode, d->op1);
41110 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
41111
41112 ior = gen_reg_rtx (V32QImode);
41113 emit_insn (gen_iorv32qi3 (ior, l, h));
41114
41115 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
41116 op = gen_reg_rtx (V4DImode);
41117 ior = gen_lowpart (V4DImode, ior);
41118 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
41119 const1_rtx, GEN_INT (3)));
41120 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41121
41122 return true;
41123 }
41124
41125 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
41126 and extract-odd permutations. */
41127
41128 static bool
41129 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
41130 {
41131 rtx t1, t2, t3, t4, t5;
41132
41133 switch (d->vmode)
41134 {
41135 case V4DFmode:
41136 t1 = gen_reg_rtx (V4DFmode);
41137 t2 = gen_reg_rtx (V4DFmode);
41138
41139 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41140 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
41141 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
41142
41143 /* Now an unpck[lh]pd will produce the result required. */
41144 if (odd)
41145 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
41146 else
41147 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
41148 emit_insn (t3);
41149 break;
41150
41151 case V8SFmode:
41152 {
41153 int mask = odd ? 0xdd : 0x88;
41154
41155 t1 = gen_reg_rtx (V8SFmode);
41156 t2 = gen_reg_rtx (V8SFmode);
41157 t3 = gen_reg_rtx (V8SFmode);
41158
41159 /* Shuffle within the 128-bit lanes to produce:
41160 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
41161 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
41162 GEN_INT (mask)));
41163
41164 /* Shuffle the lanes around to produce:
41165 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
41166 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
41167 GEN_INT (0x3)));
41168
41169 /* Shuffle within the 128-bit lanes to produce:
41170 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
41171 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
41172
41173 /* Shuffle within the 128-bit lanes to produce:
41174 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
41175 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
41176
41177 /* Shuffle the lanes around to produce:
41178 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
41179 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
41180 GEN_INT (0x20)));
41181 }
41182 break;
41183
41184 case V2DFmode:
41185 case V4SFmode:
41186 case V2DImode:
41187 case V4SImode:
41188 /* These are always directly implementable by expand_vec_perm_1. */
41189 gcc_unreachable ();
41190
41191 case V8HImode:
41192 if (TARGET_SSSE3)
41193 return expand_vec_perm_pshufb2 (d);
41194 else
41195 {
41196 /* We need 2*log2(N)-1 operations to achieve odd/even
41197 with interleave. */
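/* For V8HImode that is 2 * 3 - 1 == 5 interleave insns, emitted below. */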
41198 t1 = gen_reg_rtx (V8HImode);
41199 t2 = gen_reg_rtx (V8HImode);
41200 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
41201 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
41202 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
41203 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
41204 if (odd)
41205 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
41206 else
41207 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
41208 emit_insn (t3);
41209 }
41210 break;
41211
41212 case V16QImode:
41213 if (TARGET_SSSE3)
41214 return expand_vec_perm_pshufb2 (d);
41215 else
41216 {
41217 t1 = gen_reg_rtx (V16QImode);
41218 t2 = gen_reg_rtx (V16QImode);
41219 t3 = gen_reg_rtx (V16QImode);
41220 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
41221 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
41222 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
41223 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
41224 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
41225 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
41226 if (odd)
41227 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
41228 else
41229 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
41230 emit_insn (t3);
41231 }
41232 break;
41233
41234 case V16HImode:
41235 case V32QImode:
41236 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
41237
41238 case V4DImode:
41239 if (!TARGET_AVX2)
41240 {
41241 struct expand_vec_perm_d d_copy = *d;
41242 d_copy.vmode = V4DFmode;
41243 d_copy.target = gen_reg_rtx (V4DFmode);
41244 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
41245 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
41246 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41247 {
41248 if (!d->testing_p)
41249 emit_move_insn (d->target,
41250 gen_lowpart (V4DImode, d_copy.target));
41251 return true;
41252 }
41253 return false;
41254 }
41255
41256 t1 = gen_reg_rtx (V4DImode);
41257 t2 = gen_reg_rtx (V4DImode);
41258
41259 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
41260 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
41261 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
41262
41263 /* Now an vpunpck[lh]qdq will produce the result required. */
41264 if (odd)
41265 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
41266 else
41267 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
41268 emit_insn (t3);
41269 break;
41270
41271 case V8SImode:
41272 if (!TARGET_AVX2)
41273 {
41274 struct expand_vec_perm_d d_copy = *d;
41275 d_copy.vmode = V8SFmode;
41276 d_copy.target = gen_reg_rtx (V8SFmode);
41277 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
41278 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
41279 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
41280 {
41281 if (!d->testing_p)
41282 emit_move_insn (d->target,
41283 gen_lowpart (V8SImode, d_copy.target));
41284 return true;
41285 }
41286 return false;
41287 }
41288
41289 t1 = gen_reg_rtx (V8SImode);
41290 t2 = gen_reg_rtx (V8SImode);
41291 t3 = gen_reg_rtx (V4DImode);
41292 t4 = gen_reg_rtx (V4DImode);
41293 t5 = gen_reg_rtx (V4DImode);
41294
41295 /* Shuffle the lanes around into
41296 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
41297 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
41298 gen_lowpart (V4DImode, d->op1),
41299 GEN_INT (0x20)));
41300 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
41301 gen_lowpart (V4DImode, d->op1),
41302 GEN_INT (0x31)));
41303
41304 /* Swap the 2nd and 3rd position in each lane into
41305 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
41306 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
41307 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41308 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
41309 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
41310
41311 /* Now an vpunpck[lh]qdq will produce
41312 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
41313 if (odd)
41314 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
41315 gen_lowpart (V4DImode, t2));
41316 else
41317 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
41318 gen_lowpart (V4DImode, t2));
41319 emit_insn (t3);
41320 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
41321 break;
41322
41323 default:
41324 gcc_unreachable ();
41325 }
41326
41327 return true;
41328 }
41329
41330 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41331 extract-even and extract-odd permutations. */
41332
41333 static bool
41334 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
41335 {
41336 unsigned i, odd, nelt = d->nelt;
41337
41338 odd = d->perm[0];
41339 if (odd != 0 && odd != 1)
41340 return false;
41341
41342 for (i = 1; i < nelt; ++i)
41343 if (d->perm[i] != 2 * i + odd)
41344 return false;
41345
41346 return expand_vec_perm_even_odd_1 (d, odd);
41347 }
41348
41349 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
41350 permutations. We assume that expand_vec_perm_1 has already failed. */
41351
41352 static bool
41353 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
41354 {
41355 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
41356 enum machine_mode vmode = d->vmode;
41357 unsigned char perm2[4];
41358 rtx op0 = d->op0, dest;
41359 bool ok;
41360
41361 switch (vmode)
41362 {
41363 case V4DFmode:
41364 case V8SFmode:
41365 /* These are special-cased in sse.md so that we can optionally
41366 use the vbroadcast instruction. They expand to two insns
41367 if the input happens to be in a register. */
41368 gcc_unreachable ();
41369
41370 case V2DFmode:
41371 case V2DImode:
41372 case V4SFmode:
41373 case V4SImode:
41374 /* These are always implementable using standard shuffle patterns. */
41375 gcc_unreachable ();
41376
41377 case V8HImode:
41378 case V16QImode:
41379 /* These can be implemented via interleave. We save one insn by
41380 stopping once we have promoted to V4SImode and then use pshufd. */
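/* For example, to broadcast element 5 of a V8HImode vector, one
interleave-high of op0 with itself yields { 4 4 5 5 6 6 7 7 }, i.e.
element 1 of the widened V4SImode vector, which the final pshufd
then broadcasts. */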
41381 do
41382 {
41383 rtx dest;
41384 rtx (*gen) (rtx, rtx, rtx)
41385 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
41386 : gen_vec_interleave_lowv8hi;
41387
41388 if (elt >= nelt2)
41389 {
41390 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
41391 : gen_vec_interleave_highv8hi;
41392 elt -= nelt2;
41393 }
41394 nelt2 /= 2;
41395
41396 dest = gen_reg_rtx (vmode);
41397 emit_insn (gen (dest, op0, op0));
41398 vmode = get_mode_wider_vector (vmode);
41399 op0 = gen_lowpart (vmode, dest);
41400 }
41401 while (vmode != V4SImode);
41402
41403 memset (perm2, elt, 4);
41404 dest = gen_reg_rtx (V4SImode);
41405 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
41406 gcc_assert (ok);
41407 if (!d->testing_p)
41408 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
41409 return true;
41410
41411 case V32QImode:
41412 case V16HImode:
41413 case V8SImode:
41414 case V4DImode:
41415 /* For AVX2 broadcasts of the first element vpbroadcast* or
41416 vpermq should be used by expand_vec_perm_1. */
41417 gcc_assert (!TARGET_AVX2 || d->perm[0]);
41418 return false;
41419
41420 default:
41421 gcc_unreachable ();
41422 }
41423 }
41424
41425 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
41426 broadcast permutations. */
41427
41428 static bool
41429 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
41430 {
41431 unsigned i, elt, nelt = d->nelt;
41432
41433 if (!d->one_operand_p)
41434 return false;
41435
41436 elt = d->perm[0];
41437 for (i = 1; i < nelt; ++i)
41438 if (d->perm[i] != elt)
41439 return false;
41440
41441 return expand_vec_perm_broadcast_1 (d);
41442 }
41443
41444 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
41445 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
41446 all the shorter instruction sequences. */
41447
41448 static bool
41449 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
41450 {
41451 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
41452 unsigned int i, nelt, eltsz;
41453 bool used[4];
41454
41455 if (!TARGET_AVX2
41456 || d->one_operand_p
41457 || (d->vmode != V32QImode && d->vmode != V16HImode))
41458 return false;
41459
41460 if (d->testing_p)
41461 return true;
41462
41463 nelt = d->nelt;
41464 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
41465
41466 /* Generate 4 permutation masks. If the required element is within
41467 the same lane, it is shuffled in directly. If the required element is
41468 from the other lane, force a zero by setting bit 7 in the permutation
41469 mask. The cross-lane masks get a non-negative entry for each element
41470 that is requested from the other lane, but placed in the opposite lane,
41471 so that once the two V2TImode halves of those vpshufb results are
41472 swapped the elements land in the requested positions. */
41473 m128 = GEN_INT (-128);
41474 for (i = 0; i < 32; ++i)
41475 {
41476 rperm[0][i] = m128;
41477 rperm[1][i] = m128;
41478 rperm[2][i] = m128;
41479 rperm[3][i] = m128;
41480 }
41481 used[0] = false;
41482 used[1] = false;
41483 used[2] = false;
41484 used[3] = false;
41485 for (i = 0; i < nelt; ++i)
41486 {
41487 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
41488 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
41489 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
41490
41491 for (j = 0; j < eltsz; ++j)
41492 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
41493 used[which] = true;
41494 }
41495
41496 for (i = 0; i < 2; ++i)
41497 {
41498 if (!used[2 * i + 1])
41499 {
41500 h[i] = NULL_RTX;
41501 continue;
41502 }
41503 vperm = gen_rtx_CONST_VECTOR (V32QImode,
41504 gen_rtvec_v (32, rperm[2 * i + 1]));
41505 vperm = force_reg (V32QImode, vperm);
41506 h[i] = gen_reg_rtx (V32QImode);
41507 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41508 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
41509 }
41510
41511 /* Swap the 128-bit lanes of h[X]. */
41512 for (i = 0; i < 2; ++i)
41513 {
41514 if (h[i] == NULL_RTX)
41515 continue;
41516 op = gen_reg_rtx (V4DImode);
41517 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
41518 const2_rtx, GEN_INT (3), const0_rtx,
41519 const1_rtx));
41520 h[i] = gen_lowpart (V32QImode, op);
41521 }
41522
41523 for (i = 0; i < 2; ++i)
41524 {
41525 if (!used[2 * i])
41526 {
41527 l[i] = NULL_RTX;
41528 continue;
41529 }
41530 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
41531 vperm = force_reg (V32QImode, vperm);
41532 l[i] = gen_reg_rtx (V32QImode);
41533 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
41534 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
41535 }
41536
41537 for (i = 0; i < 2; ++i)
41538 {
41539 if (h[i] && l[i])
41540 {
41541 op = gen_reg_rtx (V32QImode);
41542 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
41543 l[i] = op;
41544 }
41545 else if (h[i])
41546 l[i] = h[i];
41547 }
41548
41549 gcc_assert (l[0] && l[1]);
41550 op = d->target;
41551 if (d->vmode != V32QImode)
41552 op = gen_reg_rtx (V32QImode);
41553 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
41554 if (op != d->target)
41555 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
41556 return true;
41557 }
41558
41559 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
41560 With all of the interface bits taken care of, perform the expansion
41561 in D and return true on success. */
41562
41563 static bool
41564 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
41565 {
41566 /* Try a single instruction expansion. */
41567 if (expand_vec_perm_1 (d))
41568 return true;
41569
41570 /* Try sequences of two instructions. */
41571
41572 if (expand_vec_perm_pshuflw_pshufhw (d))
41573 return true;
41574
41575 if (expand_vec_perm_palignr (d))
41576 return true;
41577
41578 if (expand_vec_perm_interleave2 (d))
41579 return true;
41580
41581 if (expand_vec_perm_broadcast (d))
41582 return true;
41583
41584 if (expand_vec_perm_vpermq_perm_1 (d))
41585 return true;
41586
41587 if (expand_vec_perm_vperm2f128 (d))
41588 return true;
41589
41590 /* Try sequences of three instructions. */
41591
41592 if (expand_vec_perm_2vperm2f128_vshuf (d))
41593 return true;
41594
41595 if (expand_vec_perm_pshufb2 (d))
41596 return true;
41597
41598 if (expand_vec_perm_interleave3 (d))
41599 return true;
41600
41601 if (expand_vec_perm_vperm2f128_vblend (d))
41602 return true;
41603
41604 /* Try sequences of four instructions. */
41605
41606 if (expand_vec_perm_vpshufb2_vpermq (d))
41607 return true;
41608
41609 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
41610 return true;
41611
41612 /* ??? Look for narrow permutations whose element orderings would
41613 allow the promotion to a wider mode. */
41614
41615 /* ??? Look for sequences of interleave or a wider permute that place
41616 the data into the correct lanes for a half-vector shuffle like
41617 pshuf[lh]w or vpermilps. */
41618
41619 /* ??? Look for sequences of interleave that produce the desired results.
41620 The combinatorics of punpck[lh] get pretty ugly... */
41621
41622 if (expand_vec_perm_even_odd (d))
41623 return true;
41624
41625 /* Even longer sequences. */
41626 if (expand_vec_perm_vpshufb4_vpermq2 (d))
41627 return true;
41628
41629 return false;
41630 }
41631
41632 /* If a permutation only uses one operand, make it clear. Returns true
41633 if the permutation references both operands. */
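/* For example, with nelt == 4 the permutation {4, 5, 6, 7} references only
the second operand; it is rewritten as {0, 1, 2, 3} with op0 replaced by
op1, and false is returned. */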
41634
41635 static bool
41636 canonicalize_perm (struct expand_vec_perm_d *d)
41637 {
41638 int i, which, nelt = d->nelt;
41639
41640 for (i = which = 0; i < nelt; ++i)
41641 which |= (d->perm[i] < nelt ? 1 : 2);
41642
41643 d->one_operand_p = true;
41644 switch (which)
41645 {
41646 default:
41647 gcc_unreachable();
41648
41649 case 3:
41650 if (!rtx_equal_p (d->op0, d->op1))
41651 {
41652 d->one_operand_p = false;
41653 break;
41654 }
41655 /* The elements of PERM do not suggest that only the first operand
41656 is used, but both operands are identical. Allow easier matching
41657 of the permutation by folding the permutation into the single
41658 input vector. */
41659 /* FALLTHRU */
41660
41661 case 2:
41662 for (i = 0; i < nelt; ++i)
41663 d->perm[i] &= nelt - 1;
41664 d->op0 = d->op1;
41665 break;
41666
41667 case 1:
41668 d->op1 = d->op0;
41669 break;
41670 }
41671
41672 return (which == 3);
41673 }
41674
41675 bool
41676 ix86_expand_vec_perm_const (rtx operands[4])
41677 {
41678 struct expand_vec_perm_d d;
41679 unsigned char perm[MAX_VECT_LEN];
41680 int i, nelt;
41681 bool two_args;
41682 rtx sel;
41683
41684 d.target = operands[0];
41685 d.op0 = operands[1];
41686 d.op1 = operands[2];
41687 sel = operands[3];
41688
41689 d.vmode = GET_MODE (d.target);
41690 gcc_assert (VECTOR_MODE_P (d.vmode));
41691 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41692 d.testing_p = false;
41693
41694 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
41695 gcc_assert (XVECLEN (sel, 0) == nelt);
41696 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
41697
41698 for (i = 0; i < nelt; ++i)
41699 {
41700 rtx e = XVECEXP (sel, 0, i);
41701 int ei = INTVAL (e) & (2 * nelt - 1);
41702 d.perm[i] = ei;
41703 perm[i] = ei;
41704 }
41705
41706 two_args = canonicalize_perm (&d);
41707
41708 if (ix86_expand_vec_perm_const_1 (&d))
41709 return true;
41710
41711 /* If the selector says both arguments are needed, but the operands are the
41712 same, the above tried to expand with one_operand_p and flattened selector.
41713 If that didn't work, retry without one_operand_p; we succeeded with that
41714 during testing. */
41715 if (two_args && d.one_operand_p)
41716 {
41717 d.one_operand_p = false;
41718 memcpy (d.perm, perm, sizeof (perm));
41719 return ix86_expand_vec_perm_const_1 (&d);
41720 }
41721
41722 return false;
41723 }
41724
41725 /* Implement targetm.vectorize.vec_perm_const_ok. */
41726
41727 static bool
41728 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
41729 const unsigned char *sel)
41730 {
41731 struct expand_vec_perm_d d;
41732 unsigned int i, nelt, which;
41733 bool ret;
41734
41735 d.vmode = vmode;
41736 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41737 d.testing_p = true;
41738
41739 /* Given sufficient ISA support we can just return true here
41740 for selected vector modes. */
41741 if (GET_MODE_SIZE (d.vmode) == 16)
41742 {
41743 /* All implementable with a single vpperm insn. */
41744 if (TARGET_XOP)
41745 return true;
41746 /* All implementable with 2 pshufb + 1 ior. */
41747 if (TARGET_SSSE3)
41748 return true;
41749 /* All implementable with shufpd or unpck[lh]pd. */
41750 if (d.nelt == 2)
41751 return true;
41752 }
41753
41754 /* Extract the values from the vector CST into the permutation
41755 array in D. */
41756 memcpy (d.perm, sel, nelt);
41757 for (i = which = 0; i < nelt; ++i)
41758 {
41759 unsigned char e = d.perm[i];
41760 gcc_assert (e < 2 * nelt);
41761 which |= (e < nelt ? 1 : 2);
41762 }
41763
41764 /* For all elements from second vector, fold the elements to first. */
41765 if (which == 2)
41766 for (i = 0; i < nelt; ++i)
41767 d.perm[i] -= nelt;
41768
41769 /* Check whether the mask can be applied to the vector type. */
41770 d.one_operand_p = (which != 3);
41771
41772 /* Implementable with shufps or pshufd. */
41773 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
41774 return true;
41775
41776 /* Otherwise we have to go through the motions and see if we can
41777 figure out how to generate the requested permutation. */
41778 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
41779 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
41780 if (!d.one_operand_p)
41781 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
41782
41783 start_sequence ();
41784 ret = ix86_expand_vec_perm_const_1 (&d);
41785 end_sequence ();
41786
41787 return ret;
41788 }
41789
41790 void
41791 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
41792 {
41793 struct expand_vec_perm_d d;
41794 unsigned i, nelt;
41795
41796 d.target = targ;
41797 d.op0 = op0;
41798 d.op1 = op1;
41799 d.vmode = GET_MODE (targ);
41800 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41801 d.one_operand_p = false;
41802 d.testing_p = false;
41803
41804 for (i = 0; i < nelt; ++i)
41805 d.perm[i] = i * 2 + odd;
41806
41807 /* We'll either be able to implement the permutation directly... */
41808 if (expand_vec_perm_1 (&d))
41809 return;
41810
41811 /* ... or we use the special-case patterns. */
41812 expand_vec_perm_even_odd_1 (&d, odd);
41813 }
41814
41815 static void
41816 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
41817 {
41818 struct expand_vec_perm_d d;
41819 unsigned i, nelt, base;
41820 bool ok;
41821
41822 d.target = targ;
41823 d.op0 = op0;
41824 d.op1 = op1;
41825 d.vmode = GET_MODE (targ);
41826 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
41827 d.one_operand_p = false;
41828 d.testing_p = false;
41829
41830 base = high_p ? nelt / 2 : 0;
41831 for (i = 0; i < nelt / 2; ++i)
41832 {
41833 d.perm[i * 2] = i + base;
41834 d.perm[i * 2 + 1] = i + base + nelt;
41835 }
41836
41837 /* Note that for AVX this isn't one instruction. */
41838 ok = ix86_expand_vec_perm_const_1 (&d);
41839 gcc_assert (ok);
41840 }
41841
41842
41843 /* Expand a vector operation CODE for a V*QImode in terms of the
41844 same operation on V*HImode. */
41845
41846 void
41847 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
41848 {
41849 enum machine_mode qimode = GET_MODE (dest);
41850 enum machine_mode himode;
41851 rtx (*gen_il) (rtx, rtx, rtx);
41852 rtx (*gen_ih) (rtx, rtx, rtx);
41853 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
41854 struct expand_vec_perm_d d;
41855 bool ok, full_interleave;
41856 bool uns_p = false;
41857 int i;
41858
41859 switch (qimode)
41860 {
41861 case V16QImode:
41862 himode = V8HImode;
41863 gen_il = gen_vec_interleave_lowv16qi;
41864 gen_ih = gen_vec_interleave_highv16qi;
41865 break;
41866 case V32QImode:
41867 himode = V16HImode;
41868 gen_il = gen_avx2_interleave_lowv32qi;
41869 gen_ih = gen_avx2_interleave_highv32qi;
41870 break;
41871 default:
41872 gcc_unreachable ();
41873 }
41874
41875 op2_l = op2_h = op2;
41876 switch (code)
41877 {
41878 case MULT:
41879 /* Unpack data such that we've got a source byte in each low byte of
41880 each word. We don't care what goes into the high byte of each word.
41881 Rather than trying to get zero in there, most convenient is to let
41882 it be a copy of the low byte. */
41883 op2_l = gen_reg_rtx (qimode);
41884 op2_h = gen_reg_rtx (qimode);
41885 emit_insn (gen_il (op2_l, op2, op2));
41886 emit_insn (gen_ih (op2_h, op2, op2));
41887 /* FALLTHRU */
41888
41889 op1_l = gen_reg_rtx (qimode);
41890 op1_h = gen_reg_rtx (qimode);
41891 emit_insn (gen_il (op1_l, op1, op1));
41892 emit_insn (gen_ih (op1_h, op1, op1));
41893 full_interleave = qimode == V16QImode;
41894 break;
41895
41896 case ASHIFT:
41897 case LSHIFTRT:
41898 uns_p = true;
41899 /* FALLTHRU */
41900 case ASHIFTRT:
41901 op1_l = gen_reg_rtx (himode);
41902 op1_h = gen_reg_rtx (himode);
41903 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
41904 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
41905 full_interleave = true;
41906 break;
41907 default:
41908 gcc_unreachable ();
41909 }
41910
41911 /* Perform the operation. */
41912 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
41913 1, OPTAB_DIRECT);
41914 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
41915 1, OPTAB_DIRECT);
41916 gcc_assert (res_l && res_h);
41917
41918 /* Merge the data back into the right place. */
41919 d.target = dest;
41920 d.op0 = gen_lowpart (qimode, res_l);
41921 d.op1 = gen_lowpart (qimode, res_h);
41922 d.vmode = qimode;
41923 d.nelt = GET_MODE_NUNITS (qimode);
41924 d.one_operand_p = false;
41925 d.testing_p = false;
41926
41927 if (full_interleave)
41928 {
41929 /* For SSE2, we used a full interleave, so the desired
41930 results are in the even elements. */
41931 for (i = 0; i < 32; ++i)
41932 d.perm[i] = i * 2;
41933 }
41934 else
41935 {
41936 /* For AVX2, the interleave used above was not cross-lane, so the
41937 extraction takes the even elements, but with the second and third
41938 quarters swapped. Happily, that is even one insn shorter than plain even extraction. */
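/* For example, result byte 8 of a V32QImode operation is byte 0 of res_h,
i.e. permutation index 8 * 2 + 16 == 32 in the (res_l, res_h)
concatenation. */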
41939 for (i = 0; i < 32; ++i)
41940 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
41941 }
41942
41943 ok = ix86_expand_vec_perm_const_1 (&d);
41944 gcc_assert (ok);
41945
41946 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41947 gen_rtx_fmt_ee (code, qimode, op1, op2));
41948 }
41949
41950 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
41951 if op is CONST_VECTOR with all odd elements equal to their
41952 preceding element. */
41953
41954 static bool
41955 const_vector_equal_evenodd_p (rtx op)
41956 {
41957 enum machine_mode mode = GET_MODE (op);
41958 int i, nunits = GET_MODE_NUNITS (mode);
41959 if (GET_CODE (op) != CONST_VECTOR
41960 || nunits != CONST_VECTOR_NUNITS (op))
41961 return false;
41962 for (i = 0; i < nunits; i += 2)
41963 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
41964 return false;
41965 return true;
41966 }
41967
41968 void
41969 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
41970 bool uns_p, bool odd_p)
41971 {
41972 enum machine_mode mode = GET_MODE (op1);
41973 enum machine_mode wmode = GET_MODE (dest);
41974 rtx x;
41975 rtx orig_op1 = op1, orig_op2 = op2;
41976
41977 if (!nonimmediate_operand (op1, mode))
41978 op1 = force_reg (mode, op1);
41979 if (!nonimmediate_operand (op2, mode))
41980 op2 = force_reg (mode, op2);
41981
41982 /* We only play even/odd games with vectors of SImode. */
41983 gcc_assert (mode == V4SImode || mode == V8SImode);
41984
41985 /* If we're looking for the odd results, shift those members down to
41986 the even slots. For some cpus this is faster than a PSHUFD. */
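/* For V4SImode this is a V2DImode logical shift right by 32 bits, moving
elements 1 and 3 into the even slots that the even-multiply insns read. */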
41987 if (odd_p)
41988 {
41989 /* For XOP use vpmacsdqh, but only for smult, as it is only
41990 signed. */
41991 if (TARGET_XOP && mode == V4SImode && !uns_p)
41992 {
41993 x = force_reg (wmode, CONST0_RTX (wmode));
41994 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
41995 return;
41996 }
41997
41998 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
41999 if (!const_vector_equal_evenodd_p (orig_op1))
42000 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
42001 x, NULL, 1, OPTAB_DIRECT);
42002 if (!const_vector_equal_evenodd_p (orig_op2))
42003 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
42004 x, NULL, 1, OPTAB_DIRECT);
42005 op1 = gen_lowpart (mode, op1);
42006 op2 = gen_lowpart (mode, op2);
42007 }
42008
42009 if (mode == V8SImode)
42010 {
42011 if (uns_p)
42012 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
42013 else
42014 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
42015 }
42016 else if (uns_p)
42017 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
42018 else if (TARGET_SSE4_1)
42019 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
42020 else
42021 {
42022 rtx s1, s2, t0, t1, t2;
42023
42024 /* The easiest way to implement this without PMULDQ is to go through
42025 the motions as if we are performing a full 64-bit multiply. With
42026 the exception that we need to do less shuffling of the elements. */
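      /* A quick derivation (informal sketch, assuming two's complement):
	 writing each sign-extended 64-bit lane as A = lo(A) + hi(A)*2^32,
	 where hi(A) is 0 or 0xffffffff, we have modulo 2^64
	   A*B = lo(A)*lo(B) + ((hi(A)*lo(B) + hi(B)*lo(A)) << 32).
	 The three unsigned even-multiplies below compute exactly these
	 terms.  */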
42027
42028 /* Compute the sign-extension, aka highparts, of the two operands. */
42029 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42030 op1, pc_rtx, pc_rtx);
42031 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
42032 op2, pc_rtx, pc_rtx);
42033
42034 /* Multiply LO(A) * HI(B), and vice-versa. */
42035 t1 = gen_reg_rtx (wmode);
42036 t2 = gen_reg_rtx (wmode);
42037 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
42038 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
42039
42040 /* Multiply LO(A) * LO(B). */
42041 t0 = gen_reg_rtx (wmode);
42042 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
42043
42044 /* Combine and shift the highparts into place. */
42045 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
42046 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
42047 1, OPTAB_DIRECT);
42048
42049 /* Combine high and low parts. */
42050 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
42051 return;
42052 }
42053 emit_insn (x);
42054 }
42055
42056 void
42057 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
42058 bool uns_p, bool high_p)
42059 {
42060 enum machine_mode wmode = GET_MODE (dest);
42061 enum machine_mode mode = GET_MODE (op1);
42062 rtx t1, t2, t3, t4, mask;
42063
42064 switch (mode)
42065 {
42066 case V4SImode:
42067 t1 = gen_reg_rtx (mode);
42068 t2 = gen_reg_rtx (mode);
42069 if (TARGET_XOP && !uns_p)
42070 {
42071 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
42072 shuffle the elements once so that all elements are in the right
42073 place for immediate use: { A C B D }. */
42074 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
42075 const1_rtx, GEN_INT (3)));
42076 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
42077 const1_rtx, GEN_INT (3)));
42078 }
42079 else
42080 {
42081 /* Put the elements into place for the multiply. */
42082 ix86_expand_vec_interleave (t1, op1, op1, high_p);
42083 ix86_expand_vec_interleave (t2, op2, op2, high_p);
42084 high_p = false;
42085 }
42086 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
42087 break;
42088
42089 case V8SImode:
42090 /* Shuffle the elements between the lanes. After this we
42091 have { A B E F | C D G H } for each operand. */
42092 t1 = gen_reg_rtx (V4DImode);
42093 t2 = gen_reg_rtx (V4DImode);
42094 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
42095 const0_rtx, const2_rtx,
42096 const1_rtx, GEN_INT (3)));
42097 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
42098 const0_rtx, const2_rtx,
42099 const1_rtx, GEN_INT (3)));
42100
42101 /* Shuffle the elements within the lanes. After this we
42102 have { A A B B | C C D D } or { E E F F | G G H H }. */
42103 t3 = gen_reg_rtx (V8SImode);
42104 t4 = gen_reg_rtx (V8SImode);
42105 mask = GEN_INT (high_p
42106 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
42107 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
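      /* I.e. the pshufd control byte is 0xfa (elements {2,2,3,3}) when
	 extracting the high halves and 0x50 (elements {0,0,1,1}) for the
	 low halves.  */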
42108 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
42109 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
42110
42111 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
42112 break;
42113
42114 case V8HImode:
42115 case V16HImode:
42116 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
42117 uns_p, OPTAB_DIRECT);
42118 t2 = expand_binop (mode,
42119 uns_p ? umul_highpart_optab : smul_highpart_optab,
42120 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
42121 gcc_assert (t1 && t2);
42122
42123 t3 = gen_reg_rtx (mode);
42124 ix86_expand_vec_interleave (t3, t1, t2, high_p);
42125 emit_move_insn (dest, gen_lowpart (wmode, t3));
42126 break;
42127
42128 case V16QImode:
42129 case V32QImode:
42130 t1 = gen_reg_rtx (wmode);
42131 t2 = gen_reg_rtx (wmode);
42132 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
42133 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
42134
42135 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
42136 break;
42137
42138 default:
42139 gcc_unreachable ();
42140 }
42141 }
42142
42143 void
42144 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
42145 {
42146 rtx res_1, res_2, res_3, res_4;
42147
42148 res_1 = gen_reg_rtx (V4SImode);
42149 res_2 = gen_reg_rtx (V4SImode);
42150 res_3 = gen_reg_rtx (V2DImode);
42151 res_4 = gen_reg_rtx (V2DImode);
42152 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
42153 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
42154
42155 /* Move the results in element 2 down to element 1; we don't care
42156 what goes in elements 2 and 3. Then we can merge the parts
42157 back together with an interleave.
42158
42159 Note that two other sequences were tried:
42160 (1) Use interleaves at the start instead of psrldq, which allows
42161 us to use a single shufps to merge things back at the end.
42162 (2) Use shufps here to combine the two vectors, then pshufd to
42163 put the elements in the correct order.
42164 In both cases the cost of the reformatting stall was too high
42165 and the overall sequence slower. */
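  /* Illustrative data flow (informal): with op1 = {a0,a1,a2,a3} and
     op2 = {b0,b1,b2,b3}, res_3 holds the 64-bit products {a0*b0, a2*b2}
     and res_4 holds {a1*b1, a3*b3}.  The two pshufd's below move the low
     halves of those products into elements 0 and 1, and the final
     interleave yields {a0*b0, a1*b1, a2*b2, a3*b3} truncated to 32 bits.  */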
42166
42167 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
42168 const0_rtx, const2_rtx,
42169 const0_rtx, const0_rtx));
42170 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
42171 const0_rtx, const2_rtx,
42172 const0_rtx, const0_rtx));
42173 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
42174
42175 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
42176 }
42177
42178 void
42179 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
42180 {
42181 enum machine_mode mode = GET_MODE (op0);
42182 rtx t1, t2, t3, t4, t5, t6;
42183
42184 if (TARGET_XOP && mode == V2DImode)
42185 {
42186 /* op1: A,B,C,D, op2: E,F,G,H */
42187 op1 = gen_lowpart (V4SImode, op1);
42188 op2 = gen_lowpart (V4SImode, op2);
42189
42190 t1 = gen_reg_rtx (V4SImode);
42191 t2 = gen_reg_rtx (V4SImode);
42192 t3 = gen_reg_rtx (V2DImode);
42193 t4 = gen_reg_rtx (V2DImode);
42194
42195 /* t1: B,A,D,C */
42196 emit_insn (gen_sse2_pshufd_1 (t1, op1,
42197 GEN_INT (1),
42198 GEN_INT (0),
42199 GEN_INT (3),
42200 GEN_INT (2)));
42201
42202 /* t2: (B*E),(A*F),(D*G),(C*H) */
42203 emit_insn (gen_mulv4si3 (t2, t1, op2));
42204
42205 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
42206 emit_insn (gen_xop_phadddq (t3, t2));
42207
42208 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
42209 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
42210
42211 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
42212 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
42213 }
42214 else
42215 {
42216 enum machine_mode nmode;
42217 rtx (*umul) (rtx, rtx, rtx);
42218
42219 if (mode == V2DImode)
42220 {
42221 umul = gen_vec_widen_umult_even_v4si;
42222 nmode = V4SImode;
42223 }
42224 else if (mode == V4DImode)
42225 {
42226 umul = gen_vec_widen_umult_even_v8si;
42227 nmode = V8SImode;
42228 }
42229 else
42230 gcc_unreachable ();
42231
42232
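      /* The identity used here (informal sketch): for each 64-bit lane,
	   A*B mod 2^64 = lo(A)*lo(B) + ((hi(A)*lo(B) + hi(B)*lo(A)) << 32),
	 so three 32x32->64 unsigned even-multiplies plus an add and a
	 shift suffice.  */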
42233 /* Multiply low parts. */
42234 t1 = gen_reg_rtx (mode);
42235 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
42236
42237 /* Shift input vectors right 32 bits so we can multiply high parts. */
42238 t6 = GEN_INT (32);
42239 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
42240 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
42241
42242 /* Multiply high parts by low parts. */
42243 t4 = gen_reg_rtx (mode);
42244 t5 = gen_reg_rtx (mode);
42245 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
42246 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
42247
42248 /* Combine and shift the highparts back. */
42249 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
42250 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
42251
42252 /* Combine high and low parts. */
42253 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
42254 }
42255
42256 set_unique_reg_note (get_last_insn (), REG_EQUAL,
42257 gen_rtx_MULT (mode, op1, op2));
42258 }
42259
42260 /* Return true if control transfer instruction INSN
42261 should be encoded with the bnd prefix.
42262 If INSN is NULL then return true when control
42263 transfer instructions should be prefixed with
42264 bnd by default for the current function. */
42265
42266 bool
42267 ix86_bnd_prefixed_insn_p (rtx insn ATTRIBUTE_UNUSED)
42268 {
42269 return false;
42270 }
42271
42272 /* Calculate integer abs() using only SSE2 instructions. */
42273
42274 void
42275 ix86_expand_sse2_abs (rtx target, rtx input)
42276 {
42277 enum machine_mode mode = GET_MODE (target);
42278 rtx tmp0, tmp1, x;
42279
42280 switch (mode)
42281 {
42282 /* For 32-bit signed integer X, the best way to calculate the absolute
42283 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
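    /* For example (a sketch of the identity above, W = 32): X = -7 gives
       X >> 31 = -1, (-7 ^ -1) - (-1) = 6 + 1 = 7; X = 7 gives (7 ^ 0) - 0 = 7.  */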
42284 case V4SImode:
42285 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
42286 GEN_INT (GET_MODE_BITSIZE
42287 (GET_MODE_INNER (mode)) - 1),
42288 NULL, 0, OPTAB_DIRECT);
42289 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
42290 NULL, 0, OPTAB_DIRECT);
42291 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
42292 target, 0, OPTAB_DIRECT);
42293 break;
42294
42295 /* For 16-bit signed integer X, the best way to calculate the absolute
42296 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
42297 case V8HImode:
42298 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42299
42300 x = expand_simple_binop (mode, SMAX, tmp0, input,
42301 target, 0, OPTAB_DIRECT);
42302 break;
42303
42304 /* For 8-bit signed integer X, the best way to calculate the absolute
42305 value of X is min ((unsigned char) X, (unsigned char) (-X)),
42306 as SSE2 provides the PMINUB insn. */
42307 case V16QImode:
42308 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
42309
42310 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
42311 target, 0, OPTAB_DIRECT);
42312 break;
42313
42314 default:
42315 gcc_unreachable ();
42316 }
42317
42318 if (x != target)
42319 emit_move_insn (target, x);
42320 }
42321
42322 /* Expand an insert into a vector register through pinsr insn.
42323 Return true if successful. */
42324
42325 bool
42326 ix86_expand_pinsr (rtx *operands)
42327 {
42328 rtx dst = operands[0];
42329 rtx src = operands[3];
42330
42331 unsigned int size = INTVAL (operands[1]);
42332 unsigned int pos = INTVAL (operands[2]);
42333
42334 if (GET_CODE (dst) == SUBREG)
42335 {
42336 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
42337 dst = SUBREG_REG (dst);
42338 }
42339
42340 if (GET_CODE (src) == SUBREG)
42341 src = SUBREG_REG (src);
42342
42343 switch (GET_MODE (dst))
42344 {
42345 case V16QImode:
42346 case V8HImode:
42347 case V4SImode:
42348 case V2DImode:
42349 {
42350 enum machine_mode srcmode, dstmode;
42351 rtx (*pinsr)(rtx, rtx, rtx, rtx);
42352
42353 srcmode = mode_for_size (size, MODE_INT, 0);
42354
42355 switch (srcmode)
42356 {
42357 case QImode:
42358 if (!TARGET_SSE4_1)
42359 return false;
42360 dstmode = V16QImode;
42361 pinsr = gen_sse4_1_pinsrb;
42362 break;
42363
42364 case HImode:
42365 if (!TARGET_SSE2)
42366 return false;
42367 dstmode = V8HImode;
42368 pinsr = gen_sse2_pinsrw;
42369 break;
42370
42371 case SImode:
42372 if (!TARGET_SSE4_1)
42373 return false;
42374 dstmode = V4SImode;
42375 pinsr = gen_sse4_1_pinsrd;
42376 break;
42377
42378 case DImode:
42379 gcc_assert (TARGET_64BIT);
42380 if (!TARGET_SSE4_1)
42381 return false;
42382 dstmode = V2DImode;
42383 pinsr = gen_sse4_1_pinsrq;
42384 break;
42385
42386 default:
42387 return false;
42388 }
42389
42390 rtx d = dst;
42391 if (GET_MODE (dst) != dstmode)
42392 d = gen_reg_rtx (dstmode);
42393 src = gen_lowpart (srcmode, src);
42394
42395 pos /= size;
42396
42397 emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
42398 GEN_INT (1 << pos)));
42399 if (d != dst)
42400 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
42401 return true;
42402 }
42403
42404 default:
42405 return false;
42406 }
42407 }
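/* An illustrative (hypothetical) case of the expansion above: inserting a
   16-bit value at bit position 32 of a V8HImode destination has size = 16
   and pos = 32, so pos /= size selects element 2 and the pattern operand
   becomes GEN_INT (1 << 2).  */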
42408 \f
42409 /* This function returns the calling-ABI-specific va_list type node,
42410 i.e. the FNDECL-specific va_list type. */
42411
42412 static tree
42413 ix86_fn_abi_va_list (tree fndecl)
42414 {
42415 if (!TARGET_64BIT)
42416 return va_list_type_node;
42417 gcc_assert (fndecl != NULL_TREE);
42418
42419 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
42420 return ms_va_list_type_node;
42421 else
42422 return sysv_va_list_type_node;
42423 }
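/* For example (illustrative only): on x86-64, a function declared with
   __attribute__((ms_abi)) gets ms_va_list_type_node here, while other
   functions get sysv_va_list_type_node.  */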
42424
42425 /* Returns the canonical va_list type specified by TYPE. If there
42426 is no valid TYPE provided, it returns NULL_TREE. */
42427
42428 static tree
42429 ix86_canonical_va_list_type (tree type)
42430 {
42431 tree wtype, htype;
42432
42433 /* Resolve references and pointers to va_list type. */
42434 if (TREE_CODE (type) == MEM_REF)
42435 type = TREE_TYPE (type);
42436 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
42437 type = TREE_TYPE (type);
42438 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
42439 type = TREE_TYPE (type);
42440
42441 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
42442 {
42443 wtype = va_list_type_node;
42444 gcc_assert (wtype != NULL_TREE);
42445 htype = type;
42446 if (TREE_CODE (wtype) == ARRAY_TYPE)
42447 {
42448 /* If va_list is an array type, the argument may have decayed
42449 to a pointer type, e.g. by being passed to another function.
42450 In that case, unwrap both types so that we can compare the
42451 underlying records. */
42452 if (TREE_CODE (htype) == ARRAY_TYPE
42453 || POINTER_TYPE_P (htype))
42454 {
42455 wtype = TREE_TYPE (wtype);
42456 htype = TREE_TYPE (htype);
42457 }
42458 }
42459 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42460 return va_list_type_node;
42461 wtype = sysv_va_list_type_node;
42462 gcc_assert (wtype != NULL_TREE);
42463 htype = type;
42464 if (TREE_CODE (wtype) == ARRAY_TYPE)
42465 {
42466 /* If va_list is an array type, the argument may have decayed
42467 to a pointer type, e.g. by being passed to another function.
42468 In that case, unwrap both types so that we can compare the
42469 underlying records. */
42470 if (TREE_CODE (htype) == ARRAY_TYPE
42471 || POINTER_TYPE_P (htype))
42472 {
42473 wtype = TREE_TYPE (wtype);
42474 htype = TREE_TYPE (htype);
42475 }
42476 }
42477 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42478 return sysv_va_list_type_node;
42479 wtype = ms_va_list_type_node;
42480 gcc_assert (wtype != NULL_TREE);
42481 htype = type;
42482 if (TREE_CODE (wtype) == ARRAY_TYPE)
42483 {
42484 /* If va_list is an array type, the argument may have decayed
42485 to a pointer type, e.g. by being passed to another function.
42486 In that case, unwrap both types so that we can compare the
42487 underlying records. */
42488 if (TREE_CODE (htype) == ARRAY_TYPE
42489 || POINTER_TYPE_P (htype))
42490 {
42491 wtype = TREE_TYPE (wtype);
42492 htype = TREE_TYPE (htype);
42493 }
42494 }
42495 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
42496 return ms_va_list_type_node;
42497 return NULL_TREE;
42498 }
42499 return std_canonical_va_list_type (type);
42500 }
42501
42502 /* Iterate through the target-specific builtin types for va_list.
42503 IDX denotes the iterator, *PTREE is set to the result type of
42504 the va_list builtin, and *PNAME to its built-in name.
42505 Returns zero if there is no element for this index, otherwise
42506 IDX should be increased upon the next call.
42507 Note, do not iterate a base builtin's name like __builtin_va_list.
42508 Used from c_common_nodes_and_builtins. */
42509
42510 static int
42511 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
42512 {
42513 if (TARGET_64BIT)
42514 {
42515 switch (idx)
42516 {
42517 default:
42518 break;
42519
42520 case 0:
42521 *ptree = ms_va_list_type_node;
42522 *pname = "__builtin_ms_va_list";
42523 return 1;
42524
42525 case 1:
42526 *ptree = sysv_va_list_type_node;
42527 *pname = "__builtin_sysv_va_list";
42528 return 1;
42529 }
42530 }
42531
42532 return 0;
42533 }
42534
42535 #undef TARGET_SCHED_DISPATCH
42536 #define TARGET_SCHED_DISPATCH has_dispatch
42537 #undef TARGET_SCHED_DISPATCH_DO
42538 #define TARGET_SCHED_DISPATCH_DO do_dispatch
42539 #undef TARGET_SCHED_REASSOCIATION_WIDTH
42540 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
42541 #undef TARGET_SCHED_REORDER
42542 #define TARGET_SCHED_REORDER ix86_sched_reorder
42543 #undef TARGET_SCHED_ADJUST_PRIORITY
42544 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
42545 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
42546 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
42547 ix86_dependencies_evaluation_hook
42548
42549 /* The size of the dispatch window is the total number of bytes of
42550 object code allowed in a window. */
42551 #define DISPATCH_WINDOW_SIZE 16
42552
42553 /* Number of dispatch windows considered for scheduling. */
42554 #define MAX_DISPATCH_WINDOWS 3
42555
42556 /* Maximum number of instructions in a window. */
42557 #define MAX_INSN 4
42558
42559 /* Maximum number of immediate operands in a window. */
42560 #define MAX_IMM 4
42561
42562 /* Maximum number of immediate bits allowed in a window. */
42563 #define MAX_IMM_SIZE 128
42564
42565 /* Maximum number of 32 bit immediates allowed in a window. */
42566 #define MAX_IMM_32 4
42567
42568 /* Maximum number of 64 bit immediates allowed in a window. */
42569 #define MAX_IMM_64 2
42570
42571 /* Maximum total of loads or prefetches allowed in a window. */
42572 #define MAX_LOAD 2
42573
42574 /* Maximum total of stores allowed in a window. */
42575 #define MAX_STORE 1
42576
42577 #undef BIG
42578 #define BIG 100
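/* Taken together (an informal summary): one dispatch window may hold up to
   4 insns and 16 bytes of code, with at most 2 loads, 1 store, 4 immediate
   operands and 128 bits of immediates, where a 64-bit immediate consumes
   two of the four 32-bit immediate slots.  */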
42579
42580
42581 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
42582 enum dispatch_group {
42583 disp_no_group = 0,
42584 disp_load,
42585 disp_store,
42586 disp_load_store,
42587 disp_prefetch,
42588 disp_imm,
42589 disp_imm_32,
42590 disp_imm_64,
42591 disp_branch,
42592 disp_cmp,
42593 disp_jcc,
42594 disp_last
42595 };
42596
42597 /* Number of allowable groups in a dispatch window. It is an array
42598 indexed by dispatch_group enum. 100 is used as a big number,
42599 because the number of these kinds of operations does not have any
42600 effect in a dispatch window, but we still need entries for them in
42601 the table. */
42602 static unsigned int num_allowable_groups[disp_last] = {
42603 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
42604 };
42605
42606 char group_name[disp_last + 1][16] = {
42607 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
42608 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
42609 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
42610 };
42611
42612 /* Instruction path. */
42613 enum insn_path {
42614 no_path = 0,
42615 path_single, /* Single micro op. */
42616 path_double, /* Double micro op. */
42617 path_multi, /* Instructions with more than 2 micro ops. */
42618 last_path
42619 };
42620
42621 /* sched_insn_info describes one instruction scheduled into a dispatch
42622 window of a basic block: the insn itself, its dispatch group, its
42623 decode path, and its code and immediate sizes in bytes.
42624
42625 Windows are allocated for each basic block and are linked
42626 together. */
42627 typedef struct sched_insn_info_s {
42628 rtx insn;
42629 enum dispatch_group group;
42630 enum insn_path path;
42631 int byte_len;
42632 int imm_bytes;
42633 } sched_insn_info;
42634
42635 /* Linked list of dispatch windows. This is a two-way list of
42636 dispatch windows of a basic block. It contains information about
42637 the number of uops in the window and the total number of
42638 instructions and of bytes in the object code for this dispatch
42639 window. */
42640 typedef struct dispatch_windows_s {
42641 int num_insn; /* Number of insn in the window. */
42642 int num_uops; /* Number of uops in the window. */
42643 int window_size; /* Number of bytes in the window. */
42644 int window_num; /* Window number, either 0 or 1. */
42645 int num_imm; /* Number of immediates in an insn. */
42646 int num_imm_32; /* Number of 32 bit immediates in an insn. */
42647 int num_imm_64; /* Number of 64 bit immediates in an insn. */
42648 int imm_size; /* Total immediates in the window. */
42649 int num_loads; /* Total memory loads in the window. */
42650 int num_stores; /* Total memory stores in the window. */
42651 int violation; /* Violation exists in window. */
42652 sched_insn_info *window; /* Pointer to the window. */
42653 struct dispatch_windows_s *next;
42654 struct dispatch_windows_s *prev;
42655 } dispatch_windows;
42656
42657 /* Immediate values used in an insn. */
42658 typedef struct imm_info_s
42659 {
42660 int imm;
42661 int imm32;
42662 int imm64;
42663 } imm_info;
42664
42665 static dispatch_windows *dispatch_window_list;
42666 static dispatch_windows *dispatch_window_list1;
42667
42668 /* Get dispatch group of insn. */
42669
42670 static enum dispatch_group
42671 get_mem_group (rtx insn)
42672 {
42673 enum attr_memory memory;
42674
42675 if (INSN_CODE (insn) < 0)
42676 return disp_no_group;
42677 memory = get_attr_memory (insn);
42678 if (memory == MEMORY_STORE)
42679 return disp_store;
42680
42681 if (memory == MEMORY_LOAD)
42682 return disp_load;
42683
42684 if (memory == MEMORY_BOTH)
42685 return disp_load_store;
42686
42687 return disp_no_group;
42688 }
42689
42690 /* Return true if insn is a compare instruction. */
42691
42692 static bool
42693 is_cmp (rtx insn)
42694 {
42695 enum attr_type type;
42696
42697 type = get_attr_type (insn);
42698 return (type == TYPE_TEST
42699 || type == TYPE_ICMP
42700 || type == TYPE_FCMP
42701 || GET_CODE (PATTERN (insn)) == COMPARE);
42702 }
42703
42704 /* Return true if a dispatch violation was encountered. */
42705
42706 static bool
42707 dispatch_violation (void)
42708 {
42709 if (dispatch_window_list->next)
42710 return dispatch_window_list->next->violation;
42711 return dispatch_window_list->violation;
42712 }
42713
42714 /* Return true if insn is a branch instruction. */
42715
42716 static bool
42717 is_branch (rtx insn)
42718 {
42719 return (CALL_P (insn) || JUMP_P (insn));
42720 }
42721
42722 /* Return true if insn is a prefetch instruction. */
42723
42724 static bool
42725 is_prefetch (rtx insn)
42726 {
42727 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
42728 }
42729
42730 /* This function initializes a dispatch window and the list container holding a
42731 pointer to the window. */
42732
42733 static void
42734 init_window (int window_num)
42735 {
42736 int i;
42737 dispatch_windows *new_list;
42738
42739 if (window_num == 0)
42740 new_list = dispatch_window_list;
42741 else
42742 new_list = dispatch_window_list1;
42743
42744 new_list->num_insn = 0;
42745 new_list->num_uops = 0;
42746 new_list->window_size = 0;
42747 new_list->next = NULL;
42748 new_list->prev = NULL;
42749 new_list->window_num = window_num;
42750 new_list->num_imm = 0;
42751 new_list->num_imm_32 = 0;
42752 new_list->num_imm_64 = 0;
42753 new_list->imm_size = 0;
42754 new_list->num_loads = 0;
42755 new_list->num_stores = 0;
42756 new_list->violation = false;
42757
42758 for (i = 0; i < MAX_INSN; i++)
42759 {
42760 new_list->window[i].insn = NULL;
42761 new_list->window[i].group = disp_no_group;
42762 new_list->window[i].path = no_path;
42763 new_list->window[i].byte_len = 0;
42764 new_list->window[i].imm_bytes = 0;
42765 }
42766 return;
42767 }
42768
42769 /* This function allocates and initializes a dispatch window and the
42770 list container holding a pointer to the window. */
42771
42772 static dispatch_windows *
42773 allocate_window (void)
42774 {
42775 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
42776 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
42777
42778 return new_list;
42779 }
42780
42781 /* This routine initializes the dispatch scheduling information. It
42782 initiates building dispatch scheduler tables and constructs the
42783 first dispatch window. */
42784
42785 static void
42786 init_dispatch_sched (void)
42787 {
42788 /* Allocate a dispatch list and a window. */
42789 dispatch_window_list = allocate_window ();
42790 dispatch_window_list1 = allocate_window ();
42791 init_window (0);
42792 init_window (1);
42793 }
42794
42795 /* This function returns true if a branch is detected. End of a basic block
42796 does not have to be a branch, but here we assume only branches end a
42797 window. */
42798
42799 static bool
42800 is_end_basic_block (enum dispatch_group group)
42801 {
42802 return group == disp_branch;
42803 }
42804
42805 /* This function is called when the end of window processing is reached. */
42806
42807 static void
42808 process_end_window (void)
42809 {
42810 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
42811 if (dispatch_window_list->next)
42812 {
42813 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
42814 gcc_assert (dispatch_window_list->window_size
42815 + dispatch_window_list1->window_size <= 48);
42816 init_window (1);
42817 }
42818 init_window (0);
42819 }
42820
42821 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
42822 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
42823 for 48 bytes of instructions. Note that these windows are not dispatch
42824 windows whose sizes are DISPATCH_WINDOW_SIZE. */
42825
42826 static dispatch_windows *
42827 allocate_next_window (int window_num)
42828 {
42829 if (window_num == 0)
42830 {
42831 if (dispatch_window_list->next)
42832 init_window (1);
42833 init_window (0);
42834 return dispatch_window_list;
42835 }
42836
42837 dispatch_window_list->next = dispatch_window_list1;
42838 dispatch_window_list1->prev = dispatch_window_list;
42839
42840 return dispatch_window_list1;
42841 }
42842
42843 /* Increment the immediate operand counters for a constant found at *IN_RTX. */
42844
42845 static int
42846 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
42847 {
42848 if (*in_rtx == 0)
42849 return 0;
42850
42851 switch (GET_CODE (*in_rtx))
42852 {
42853 case CONST:
42854 case SYMBOL_REF:
42855 case CONST_INT:
42856 (imm_values->imm)++;
42857 if (x86_64_immediate_operand (*in_rtx, SImode))
42858 (imm_values->imm32)++;
42859 else
42860 (imm_values->imm64)++;
42861 break;
42862
42863 case CONST_DOUBLE:
42864 (imm_values->imm)++;
42865 (imm_values->imm64)++;
42866 break;
42867
42868 case CODE_LABEL:
42869 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
42870 {
42871 (imm_values->imm)++;
42872 (imm_values->imm32)++;
42873 }
42874 break;
42875
42876 default:
42877 break;
42878 }
42879
42880 return 0;
42881 }
42882
42883 /* Compute number of immediate operands of an instruction. */
42884
42885 static void
42886 find_constant (rtx in_rtx, imm_info *imm_values)
42887 {
42888 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
42889 (rtx_function) find_constant_1, (void *) imm_values);
42890 }
42891
42892 /* Return the total size in bytes of the immediate operands of an instruction
42893 along with the number of corresponding immediate operands. The counters
42894 are initialized to zero before calling FIND_CONSTANT.
42895 INSN is the input instruction. IMM is the total number of immediates.
42896 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
42897 bit immediates. */
42898
42899 static int
42900 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
42901 {
42902 imm_info imm_values = {0, 0, 0};
42903
42904 find_constant (insn, &imm_values);
42905 *imm = imm_values.imm;
42906 *imm32 = imm_values.imm32;
42907 *imm64 = imm_values.imm64;
42908 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
42909 }
42910
42911 /* This function indicates whether INSN has at least one immediate
42912 operand. */
42913
42914 static bool
42915 has_immediate (rtx insn)
42916 {
42917 int num_imm_operand;
42918 int num_imm32_operand;
42919 int num_imm64_operand;
42920
42921 if (insn)
42922 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42923 &num_imm64_operand);
42924 return false;
42925 }
42926
42927 /* Return the decode path (single, double or multi micro-op) for INSN. */
42928
42929 static enum insn_path
42930 get_insn_path (rtx insn)
42931 {
42932 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
42933
42934 if ((int)path == 0)
42935 return path_single;
42936
42937 if ((int)path == 1)
42938 return path_double;
42939
42940 return path_multi;
42941 }
42942
42943 /* Return insn dispatch group. */
42944
42945 static enum dispatch_group
42946 get_insn_group (rtx insn)
42947 {
42948 enum dispatch_group group = get_mem_group (insn);
42949 if (group)
42950 return group;
42951
42952 if (is_branch (insn))
42953 return disp_branch;
42954
42955 if (is_cmp (insn))
42956 return disp_cmp;
42957
42958 if (has_immediate (insn))
42959 return disp_imm;
42960
42961 if (is_prefetch (insn))
42962 return disp_prefetch;
42963
42964 return disp_no_group;
42965 }
42966
42967 /* Return the dispatch cost of INSN in window WINDOW_LIST: 0 if it is
42968 unrestricted, 1 if it fits, and BIG if it violates a restriction. */
42969
42970 static int
42971 count_num_restricted (rtx insn, dispatch_windows *window_list)
42972 {
42973 enum dispatch_group group = get_insn_group (insn);
42974 int imm_size;
42975 int num_imm_operand;
42976 int num_imm32_operand;
42977 int num_imm64_operand;
42978
42979 if (group == disp_no_group)
42980 return 0;
42981
42982 if (group == disp_imm)
42983 {
42984 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42985 &num_imm64_operand);
42986 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
42987 || num_imm_operand + window_list->num_imm > MAX_IMM
42988 || (num_imm32_operand > 0
42989 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
42990 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
42991 || (num_imm64_operand > 0
42992 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
42993 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
42994 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
42995 && num_imm64_operand > 0
42996 && ((window_list->num_imm_64 > 0
42997 && window_list->num_insn >= 2)
42998 || window_list->num_insn >= 3)))
42999 return BIG;
43000
43001 return 1;
43002 }
43003
43004 if ((group == disp_load_store
43005 && (window_list->num_loads >= MAX_LOAD
43006 || window_list->num_stores >= MAX_STORE))
43007 || ((group == disp_load
43008 || group == disp_prefetch)
43009 && window_list->num_loads >= MAX_LOAD)
43010 || (group == disp_store
43011 && window_list->num_stores >= MAX_STORE))
43012 return BIG;
43013
43014 return 1;
43015 }
43016
43017 /* This function returns true if insn satisfies dispatch rules on the
43018 last window scheduled. */
43019
43020 static bool
43021 fits_dispatch_window (rtx insn)
43022 {
43023 dispatch_windows *window_list = dispatch_window_list;
43024 dispatch_windows *window_list_next = dispatch_window_list->next;
43025 unsigned int num_restrict;
43026 enum dispatch_group group = get_insn_group (insn);
43027 enum insn_path path = get_insn_path (insn);
43028 int sum;
43029
43030 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
43031 instructions should be given the lowest priority in the Haifa
43032 scheduler so that they will be scheduled in the same dispatch
43033 window as the instruction that references them. */
43034 if (group == disp_jcc || group == disp_cmp)
43035 return false;
43036
43037 /* Check nonrestricted. */
43038 if (group == disp_no_group || group == disp_branch)
43039 return true;
43040
43041 /* Get last dispatch window. */
43042 if (window_list_next)
43043 window_list = window_list_next;
43044
43045 if (window_list->window_num == 1)
43046 {
43047 sum = window_list->prev->window_size + window_list->window_size;
43048
43049 if (sum == 32
43050 || (min_insn_size (insn) + sum) >= 48)
43051 /* Window 1 is full. Go for next window. */
43052 return true;
43053 }
43054
43055 num_restrict = count_num_restricted (insn, window_list);
43056
43057 if (num_restrict > num_allowable_groups[group])
43058 return false;
43059
43060 /* See if it fits in the first window. */
43061 if (window_list->window_num == 0)
43062 {
43063 /* The first window should have only single- and double-path
43064 uops. */
43065 if (path == path_double
43066 && (window_list->num_uops + 2) > MAX_INSN)
43067 return false;
43068 else if (path != path_single)
43069 return false;
43070 }
43071 return true;
43072 }
43073
43074 /* Add an instruction INSN with NUM_UOPS micro-operations to the
43075 dispatch window WINDOW_LIST. */
43076
43077 static void
43078 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
43079 {
43080 int byte_len = min_insn_size (insn);
43081 int num_insn = window_list->num_insn;
43082 int imm_size;
43083 sched_insn_info *window = window_list->window;
43084 enum dispatch_group group = get_insn_group (insn);
43085 enum insn_path path = get_insn_path (insn);
43086 int num_imm_operand;
43087 int num_imm32_operand;
43088 int num_imm64_operand;
43089
43090 if (!window_list->violation && group != disp_cmp
43091 && !fits_dispatch_window (insn))
43092 window_list->violation = true;
43093
43094 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43095 &num_imm64_operand);
43096
43097 /* Initialize window with new instruction. */
43098 window[num_insn].insn = insn;
43099 window[num_insn].byte_len = byte_len;
43100 window[num_insn].group = group;
43101 window[num_insn].path = path;
43102 window[num_insn].imm_bytes = imm_size;
43103
43104 window_list->window_size += byte_len;
43105 window_list->num_insn = num_insn + 1;
43106 window_list->num_uops = window_list->num_uops + num_uops;
43107 window_list->imm_size += imm_size;
43108 window_list->num_imm += num_imm_operand;
43109 window_list->num_imm_32 += num_imm32_operand;
43110 window_list->num_imm_64 += num_imm64_operand;
43111
43112 if (group == disp_store)
43113 window_list->num_stores += 1;
43114 else if (group == disp_load
43115 || group == disp_prefetch)
43116 window_list->num_loads += 1;
43117 else if (group == disp_load_store)
43118 {
43119 window_list->num_stores += 1;
43120 window_list->num_loads += 1;
43121 }
43122 }
43123
43124 /* Adds a scheduled instruction, INSN, to the current dispatch window.
43125 If the total bytes of instructions or the number of instructions in
43126 the window exceeds the allowable limit, a new window is allocated. */
43127
43128 static void
43129 add_to_dispatch_window (rtx insn)
43130 {
43131 int byte_len;
43132 dispatch_windows *window_list;
43133 dispatch_windows *next_list;
43134 dispatch_windows *window0_list;
43135 enum insn_path path;
43136 enum dispatch_group insn_group;
43137 bool insn_fits;
43138 int num_insn;
43139 int num_uops;
43140 int window_num;
43141 int insn_num_uops;
43142 int sum;
43143
43144 if (INSN_CODE (insn) < 0)
43145 return;
43146
43147 byte_len = min_insn_size (insn);
43148 window_list = dispatch_window_list;
43149 next_list = window_list->next;
43150 path = get_insn_path (insn);
43151 insn_group = get_insn_group (insn);
43152
43153 /* Get the last dispatch window. */
43154 if (next_list)
43155 window_list = dispatch_window_list->next;
43156
43157 if (path == path_single)
43158 insn_num_uops = 1;
43159 else if (path == path_double)
43160 insn_num_uops = 2;
43161 else
43162 insn_num_uops = (int) path;
43163
43164 /* If the current window is full, get a new window.
43165 Window number zero is full if MAX_INSN uops are scheduled in it.
43166 Window number one is full if window zero's bytes plus window
43167 one's bytes reach 32, or if adding the bytes of the new instruction
43168 makes the total 48 or more, or if it already has MAX_INSN
43169 instructions in it. */
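  /* For example (informal sketch): if window 0 already holds MAX_INSN uops
     or the insn does not fit, we flip to window 1; once the two windows
     together hit the 32- or 48-byte limits above, process_end_window
     reinitializes both and the insn is added to a fresh window 0.  */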
43170 num_insn = window_list->num_insn;
43171 num_uops = window_list->num_uops;
43172 window_num = window_list->window_num;
43173 insn_fits = fits_dispatch_window (insn);
43174
43175 if (num_insn >= MAX_INSN
43176 || num_uops + insn_num_uops > MAX_INSN
43177 || !(insn_fits))
43178 {
43179 window_num = ~window_num & 1;
43180 window_list = allocate_next_window (window_num);
43181 }
43182
43183 if (window_num == 0)
43184 {
43185 add_insn_window (insn, window_list, insn_num_uops);
43186 if (window_list->num_insn >= MAX_INSN
43187 && insn_group == disp_branch)
43188 {
43189 process_end_window ();
43190 return;
43191 }
43192 }
43193 else if (window_num == 1)
43194 {
43195 window0_list = window_list->prev;
43196 sum = window0_list->window_size + window_list->window_size;
43197 if (sum == 32
43198 || (byte_len + sum) >= 48)
43199 {
43200 process_end_window ();
43201 window_list = dispatch_window_list;
43202 }
43203
43204 add_insn_window (insn, window_list, insn_num_uops);
43205 }
43206 else
43207 gcc_unreachable ();
43208
43209 if (is_end_basic_block (insn_group))
43210 {
43211 /* End of basic block is reached; do end-of-basic-block processing. */
43212 process_end_window ();
43213 return;
43214 }
43215 }
43216
43217 /* Print the dispatch window, WINDOW_NUM, to FILE. */
43218
43219 DEBUG_FUNCTION static void
43220 debug_dispatch_window_file (FILE *file, int window_num)
43221 {
43222 dispatch_windows *list;
43223 int i;
43224
43225 if (window_num == 0)
43226 list = dispatch_window_list;
43227 else
43228 list = dispatch_window_list1;
43229
43230 fprintf (file, "Window #%d:\n", list->window_num);
43231 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
43232 list->num_insn, list->num_uops, list->window_size);
43233 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43234 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
43235
43236 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
43237 list->num_stores);
43238 fprintf (file, " insn info:\n");
43239
43240 for (i = 0; i < MAX_INSN; i++)
43241 {
43242 if (!list->window[i].insn)
43243 break;
43244 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
43245 i, group_name[list->window[i].group],
43246 i, (void *)list->window[i].insn,
43247 i, list->window[i].path,
43248 i, list->window[i].byte_len,
43249 i, list->window[i].imm_bytes);
43250 }
43251 }
43252
43253 /* Print to stdout a dispatch window. */
43254
43255 DEBUG_FUNCTION void
43256 debug_dispatch_window (int window_num)
43257 {
43258 debug_dispatch_window_file (stdout, window_num);
43259 }
43260
43261 /* Print INSN dispatch information to FILE. */
43262
43263 DEBUG_FUNCTION static void
43264 debug_insn_dispatch_info_file (FILE *file, rtx insn)
43265 {
43266 int byte_len;
43267 enum insn_path path;
43268 enum dispatch_group group;
43269 int imm_size;
43270 int num_imm_operand;
43271 int num_imm32_operand;
43272 int num_imm64_operand;
43273
43274 if (INSN_CODE (insn) < 0)
43275 return;
43276
43277 byte_len = min_insn_size (insn);
43278 path = get_insn_path (insn);
43279 group = get_insn_group (insn);
43280 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
43281 &num_imm64_operand);
43282
43283 fprintf (file, " insn info:\n");
43284 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
43285 group_name[group], path, byte_len);
43286 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
43287 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
43288 }
43289
43290 /* Print to STDOUT the status of the ready list with respect to
43291 dispatch windows. */
43292
43293 DEBUG_FUNCTION void
43294 debug_ready_dispatch (void)
43295 {
43296 int i;
43297 int no_ready = number_in_ready ();
43298
43299 fprintf (stdout, "Number of ready: %d\n", no_ready);
43300
43301 for (i = 0; i < no_ready; i++)
43302 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
43303 }
43304
43305 /* This routine is the driver of the dispatch scheduler. */
43306
43307 static void
43308 do_dispatch (rtx insn, int mode)
43309 {
43310 if (mode == DISPATCH_INIT)
43311 init_dispatch_sched ();
43312 else if (mode == ADD_TO_DISPATCH_WINDOW)
43313 add_to_dispatch_window (insn);
43314 }
43315
43316 /* Return TRUE if Dispatch Scheduling is supported. */
43317
43318 static bool
43319 has_dispatch (rtx insn, int action)
43320 {
43321 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
43322 && flag_dispatch_scheduler)
43323 switch (action)
43324 {
43325 default:
43326 return false;
43327
43328 case IS_DISPATCH_ON:
43329 return true;
43330 break;
43331
43332 case IS_CMP:
43333 return is_cmp (insn);
43334
43335 case DISPATCH_VIOLATION:
43336 return dispatch_violation ();
43337
43338 case FITS_DISPATCH_WINDOW:
43339 return fits_dispatch_window (insn);
43340 }
43341
43342 return false;
43343 }
43344
43345 /* Implementation of reassociation_width target hook used by
43346 reassoc phase to identify parallelism level in reassociated
43347 tree. The statement's tree code is passed in OPC. The operands'
43348 machine mode is passed in MODE.
43349
43350 Currently parallel reassociation is enabled for Atom
43351 processors only and we set reassociation width to be 2
43352 because Atom may issue up to 2 instructions per cycle.
43353
43354 Return value should be fixed if parallel reassociation is
43355 enabled for other processors. */
43356
43357 static int
43358 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
43359 enum machine_mode mode)
43360 {
43361 int res = 1;
43362
43363 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
43364 res = 2;
43365 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
43366 res = 2;
43367
43368 return res;
43369 }
43370
43371 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
43372 place emms and femms instructions. */
43373
43374 static enum machine_mode
43375 ix86_preferred_simd_mode (enum machine_mode mode)
43376 {
43377 if (!TARGET_SSE)
43378 return word_mode;
43379
43380 switch (mode)
43381 {
43382 case QImode:
43383 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
43384 case HImode:
43385 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
43386 case SImode:
43387 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
43388 case DImode:
43389 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
43390
43391 case SFmode:
43392 if (TARGET_AVX && !TARGET_PREFER_AVX128)
43393 return V8SFmode;
43394 else
43395 return V4SFmode;
43396
43397 case DFmode:
43398 if (!TARGET_VECTORIZE_DOUBLE)
43399 return word_mode;
43400 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
43401 return V4DFmode;
43402 else if (TARGET_SSE2)
43403 return V2DFmode;
43404 /* FALLTHRU */
43405
43406 default:
43407 return word_mode;
43408 }
43409 }
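/* For example: with -mavx (and without -mprefer-avx128) SImode data is
   vectorized in V8SImode, while plain SSE2 uses V4SImode; DFmode falls
   back to word_mode when double vectorization is disabled for the tuning.  */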
43410
43411 /* If AVX is enabled then try vectorizing with both 256bit and 128bit
43412 vectors. */
43413
43414 static unsigned int
43415 ix86_autovectorize_vector_sizes (void)
43416 {
43417 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
43418 }
43419
43420 \f
43421
43422 /* Return class of registers which could be used for pseudo of MODE
43423 and of class RCLASS for spilling instead of memory. Return NO_REGS
43424 if it is not possible or not profitable. */
43425 static reg_class_t
43426 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
43427 {
43428 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
43429 && (mode == SImode || (TARGET_64BIT && mode == DImode))
43430 && INTEGER_CLASS_P (rclass))
43431 return ALL_SSE_REGS;
43432 return NO_REGS;
43433 }
43434
43435 /* Implement targetm.vectorize.init_cost. */
43436
43437 static void *
43438 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
43439 {
43440 unsigned *cost = XNEWVEC (unsigned, 3);
43441 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
43442 return cost;
43443 }
43444
43445 /* Implement targetm.vectorize.add_stmt_cost. */
43446
43447 static unsigned
43448 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
43449 struct _stmt_vec_info *stmt_info, int misalign,
43450 enum vect_cost_model_location where)
43451 {
43452 unsigned *cost = (unsigned *) data;
43453 unsigned retval = 0;
43454
43455 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
43456 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
43457
43458 /* Statements in an inner loop relative to the loop being
43459 vectorized are weighted more heavily. The value here is
43460 arbitrary and could potentially be improved with analysis. */
43461 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
43462 count *= 50; /* FIXME. */
43463
43464 retval = (unsigned) (count * stmt_cost);
43465 cost[where] += retval;
43466
43467 return retval;
43468 }
43469
43470 /* Implement targetm.vectorize.finish_cost. */
43471
43472 static void
43473 ix86_finish_cost (void *data, unsigned *prologue_cost,
43474 unsigned *body_cost, unsigned *epilogue_cost)
43475 {
43476 unsigned *cost = (unsigned *) data;
43477 *prologue_cost = cost[vect_prologue];
43478 *body_cost = cost[vect_body];
43479 *epilogue_cost = cost[vect_epilogue];
43480 }
43481
43482 /* Implement targetm.vectorize.destroy_cost_data. */
43483
43484 static void
43485 ix86_destroy_cost_data (void *data)
43486 {
43487 free (data);
43488 }
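/* These four hooks form the vectorizer's cost-model lifecycle (informal
   summary): init_cost allocates the three accumulators, add_stmt_cost is
   called per statement and charges count * stmt_cost to the prologue, body
   or epilogue bucket, finish_cost reads the totals back, and
   destroy_cost_data frees the accumulators.  */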
43489
43490 /* Validate target specific memory model bits in VAL. */
43491
43492 static unsigned HOST_WIDE_INT
43493 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
43494 {
43495 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
43496 bool strong;
43497
43498 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
43499 |MEMMODEL_MASK)
43500 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
43501 {
43502 warning (OPT_Winvalid_memory_model,
43503 "Unknown architecture specific memory model");
43504 return MEMMODEL_SEQ_CST;
43505 }
43506 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
43507 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
43508 {
43509 warning (OPT_Winvalid_memory_model,
43510 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
43511 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
43512 }
43513 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
43514 {
43515 warning (OPT_Winvalid_memory_model,
43516 "HLE_RELEASE not used with RELEASE or stronger memory model");
43517 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
43518 }
43519 return val;
43520 }
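/* For example (illustrative use of the HLE bits with the __atomic builtins):
     __atomic_fetch_add (&v, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE);
   reaches this hook with IX86_HLE_ACQUIRE set and is accepted, whereas
   combining __ATOMIC_HLE_ACQUIRE with __ATOMIC_RELAXED is diagnosed and
   falls back to MEMMODEL_SEQ_CST (with the HLE bit preserved).  */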
43521
43522 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
43523
43524 static bool
43525 ix86_float_exceptions_rounding_supported_p (void)
43526 {
43527 /* For x87 floating point with standard excess precision handling,
43528 there is no adddf3 pattern (since x87 floating point only has
43529 XFmode operations) so the default hook implementation gets this
43530 wrong. */
43531 return TARGET_80387 || TARGET_SSE_MATH;
43532 }
43533
43534 /* Initialize the GCC target structure. */
43535 #undef TARGET_RETURN_IN_MEMORY
43536 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
43537
43538 #undef TARGET_LEGITIMIZE_ADDRESS
43539 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
43540
43541 #undef TARGET_ATTRIBUTE_TABLE
43542 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
43543 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
43544 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
43545 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43546 # undef TARGET_MERGE_DECL_ATTRIBUTES
43547 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
43548 #endif
43549
43550 #undef TARGET_COMP_TYPE_ATTRIBUTES
43551 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
43552
43553 #undef TARGET_INIT_BUILTINS
43554 #define TARGET_INIT_BUILTINS ix86_init_builtins
43555 #undef TARGET_BUILTIN_DECL
43556 #define TARGET_BUILTIN_DECL ix86_builtin_decl
43557 #undef TARGET_EXPAND_BUILTIN
43558 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
43559
43560 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
43561 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
43562 ix86_builtin_vectorized_function
43563
43564 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
43565 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
43566
43567 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
43568 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
43569
43570 #undef TARGET_VECTORIZE_BUILTIN_GATHER
43571 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
43572
43573 #undef TARGET_BUILTIN_RECIPROCAL
43574 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
43575
43576 #undef TARGET_ASM_FUNCTION_EPILOGUE
43577 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
43578
43579 #undef TARGET_ENCODE_SECTION_INFO
43580 #ifndef SUBTARGET_ENCODE_SECTION_INFO
43581 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
43582 #else
43583 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
43584 #endif
43585
43586 #undef TARGET_ASM_OPEN_PAREN
43587 #define TARGET_ASM_OPEN_PAREN ""
43588 #undef TARGET_ASM_CLOSE_PAREN
43589 #define TARGET_ASM_CLOSE_PAREN ""
43590
43591 #undef TARGET_ASM_BYTE_OP
43592 #define TARGET_ASM_BYTE_OP ASM_BYTE
43593
43594 #undef TARGET_ASM_ALIGNED_HI_OP
43595 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
43596 #undef TARGET_ASM_ALIGNED_SI_OP
43597 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
43598 #ifdef ASM_QUAD
43599 #undef TARGET_ASM_ALIGNED_DI_OP
43600 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
43601 #endif
43602
43603 #undef TARGET_PROFILE_BEFORE_PROLOGUE
43604 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
43605
43606 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
43607 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
43608
43609 #undef TARGET_ASM_UNALIGNED_HI_OP
43610 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
43611 #undef TARGET_ASM_UNALIGNED_SI_OP
43612 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
43613 #undef TARGET_ASM_UNALIGNED_DI_OP
43614 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
43615
43616 #undef TARGET_PRINT_OPERAND
43617 #define TARGET_PRINT_OPERAND ix86_print_operand
43618 #undef TARGET_PRINT_OPERAND_ADDRESS
43619 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
43620 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
43621 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
43622 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
43623 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
43624
43625 #undef TARGET_SCHED_INIT_GLOBAL
43626 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
43627 #undef TARGET_SCHED_ADJUST_COST
43628 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
43629 #undef TARGET_SCHED_ISSUE_RATE
43630 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
43631 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
43632 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
43633 ia32_multipass_dfa_lookahead
43634 #undef TARGET_SCHED_MACRO_FUSION_P
43635 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
43636 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
43637 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
43638
43639 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
43640 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
43641
43642 #undef TARGET_MEMMODEL_CHECK
43643 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
43644
43645 #ifdef HAVE_AS_TLS
43646 #undef TARGET_HAVE_TLS
43647 #define TARGET_HAVE_TLS true
43648 #endif
43649 #undef TARGET_CANNOT_FORCE_CONST_MEM
43650 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
43651 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
43652 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
43653
43654 #undef TARGET_DELEGITIMIZE_ADDRESS
43655 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
43656
43657 #undef TARGET_MS_BITFIELD_LAYOUT_P
43658 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
43659
43660 #if TARGET_MACHO
43661 #undef TARGET_BINDS_LOCAL_P
43662 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
43663 #endif
43664 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
43665 #undef TARGET_BINDS_LOCAL_P
43666 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
43667 #endif
43668
43669 #undef TARGET_ASM_OUTPUT_MI_THUNK
43670 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
43671 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
43672 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
43673
43674 #undef TARGET_ASM_FILE_START
43675 #define TARGET_ASM_FILE_START x86_file_start
43676
43677 #undef TARGET_OPTION_OVERRIDE
43678 #define TARGET_OPTION_OVERRIDE ix86_option_override
43679
43680 #undef TARGET_REGISTER_MOVE_COST
43681 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
43682 #undef TARGET_MEMORY_MOVE_COST
43683 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
43684 #undef TARGET_RTX_COSTS
43685 #define TARGET_RTX_COSTS ix86_rtx_costs
43686 #undef TARGET_ADDRESS_COST
43687 #define TARGET_ADDRESS_COST ix86_address_cost
43688
43689 #undef TARGET_FIXED_CONDITION_CODE_REGS
43690 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
43691 #undef TARGET_CC_MODES_COMPATIBLE
43692 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
43693
43694 #undef TARGET_MACHINE_DEPENDENT_REORG
43695 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
43696
43697 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
43698 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
43699
43700 #undef TARGET_BUILD_BUILTIN_VA_LIST
43701 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
43702
43703 #undef TARGET_FOLD_BUILTIN
43704 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
43705
43706 #undef TARGET_COMPARE_VERSION_PRIORITY
43707 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
43708
43709 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
43710 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
43711 ix86_generate_version_dispatcher_body
43712
43713 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
43714 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
43715 ix86_get_function_versions_dispatcher
43716
43717 #undef TARGET_ENUM_VA_LIST_P
43718 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
43719
43720 #undef TARGET_FN_ABI_VA_LIST
43721 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
43722
43723 #undef TARGET_CANONICAL_VA_LIST_TYPE
43724 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
43725
43726 #undef TARGET_EXPAND_BUILTIN_VA_START
43727 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
43728
43729 #undef TARGET_MD_ASM_CLOBBERS
43730 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
43731
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

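/* Vectorizer hooks: cost model, constant permutation support, and
   preferred SIMD modes.  */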
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data

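/* Hooks for per-function target options, the "target" attribute, and
   function multiversioning.  */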
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_REGISTER_USAGE_LEVELING_P
#define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class

#undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
#define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
  ix86_float_exceptions_rounding_supported_p

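/* Build the target hook vector.  TARGET_INITIALIZER picks up the
   TARGET_* macros redefined above; hooks not overridden here keep the
   defaults declared in target.def.  */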
struct gcc_target targetm = TARGET_INITIALIZER;
\f
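/* gt-i386.h is generated by gengtype and provides the garbage-collector
   root tables for the GTY-marked globals in this file.  */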
#include "gt-i386.h"