1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "backend.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "memmodel.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "cfgloop.h"
30 #include "df.h"
31 #include "tm_p.h"
32 #include "stringpool.h"
33 #include "expmed.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "cgraph.h"
39 #include "diagnostic.h"
40 #include "cfgbuild.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "attribs.h"
44 #include "calls.h"
45 #include "stor-layout.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "insn-attr.h"
49 #include "flags.h"
50 #include "except.h"
51 #include "explow.h"
52 #include "expr.h"
53 #include "cfgrtl.h"
54 #include "common/common-target.h"
55 #include "langhooks.h"
56 #include "reload.h"
57 #include "gimplify.h"
58 #include "dwarf2.h"
59 #include "tm-constrs.h"
60 #include "params.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "tree-chkp.h"
75 #include "rtl-chkp.h"
76 #include "dbgcnt.h"
77 #include "case-cfn-macros.h"
78 #include "regrename.h"
79 #include "dojump.h"
80 #include "fold-const-call.h"
81 #include "tree-vrp.h"
82 #include "tree-ssanames.h"
83 #include "selftest.h"
84 #include "selftest-rtl.h"
85 #include "print-rtl.h"
86 #include "intl.h"
87 #include "ifcvt.h"
88 #include "symbol-summary.h"
89 #include "ipa-prop.h"
90 #include "ipa-fnsummary.h"
91
92 /* This file should be included last. */
93 #include "target-def.h"
94
95 #include "x86-tune-costs.h"
96
97 static rtx legitimize_dllimport_symbol (rtx, bool);
98 static rtx legitimize_pe_coff_extern_decl (rtx, bool);
99 static rtx legitimize_pe_coff_symbol (rtx, bool);
100 static void ix86_print_operand_address_as (FILE *, rtx, addr_space_t, bool);
101 static bool ix86_save_reg (unsigned int, bool, bool);
102 static bool ix86_function_naked (const_tree);
103 static bool ix86_notrack_prefixed_insn_p (rtx);
104 static void ix86_emit_restore_reg_using_pop (rtx);
105
106
107 #ifndef CHECK_STACK_LIMIT
108 #define CHECK_STACK_LIMIT (-1)
109 #endif
110
111 /* Return index of given mode in mult and division cost tables. */
112 #define MODE_INDEX(mode) \
113 ((mode) == QImode ? 0 \
114 : (mode) == HImode ? 1 \
115 : (mode) == SImode ? 2 \
116 : (mode) == DImode ? 3 \
117 : 4)
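/* Illustrative sketch: MODE_INDEX (SImode) evaluates to 2, so a per-mode
   cost array in struct processor_costs can be indexed as, e.g.,
   cost->mult_init[MODE_INDEX (mode)] (field name shown only for
   illustration) to pick the SImode entry; modes other than QI, HI, SI
   and DImode all share the last slot.  */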
118
119
120 /* Set by -mtune. */
121 const struct processor_costs *ix86_tune_cost = NULL;
122
123 /* Set by -mtune or -Os. */
124 const struct processor_costs *ix86_cost = NULL;
125
126 /* Processor feature/optimization bitmasks. */
127 #define m_386 (1U<<PROCESSOR_I386)
128 #define m_486 (1U<<PROCESSOR_I486)
129 #define m_PENT (1U<<PROCESSOR_PENTIUM)
130 #define m_LAKEMONT (1U<<PROCESSOR_LAKEMONT)
131 #define m_PPRO (1U<<PROCESSOR_PENTIUMPRO)
132 #define m_PENT4 (1U<<PROCESSOR_PENTIUM4)
133 #define m_NOCONA (1U<<PROCESSOR_NOCONA)
134 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
135 #define m_CORE2 (1U<<PROCESSOR_CORE2)
136 #define m_NEHALEM (1U<<PROCESSOR_NEHALEM)
137 #define m_SANDYBRIDGE (1U<<PROCESSOR_SANDYBRIDGE)
138 #define m_HASWELL (1U<<PROCESSOR_HASWELL)
139 #define m_CORE_ALL (m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_HASWELL)
140 #define m_BONNELL (1U<<PROCESSOR_BONNELL)
141 #define m_SILVERMONT (1U<<PROCESSOR_SILVERMONT)
142 #define m_KNL (1U<<PROCESSOR_KNL)
143 #define m_KNM (1U<<PROCESSOR_KNM)
144 #define m_SKYLAKE_AVX512 (1U<<PROCESSOR_SKYLAKE_AVX512)
145 #define m_CANNONLAKE (1U<<PROCESSOR_CANNONLAKE)
146 #define m_INTEL (1U<<PROCESSOR_INTEL)
147
148 #define m_GEODE (1U<<PROCESSOR_GEODE)
149 #define m_K6 (1U<<PROCESSOR_K6)
150 #define m_K6_GEODE (m_K6 | m_GEODE)
151 #define m_K8 (1U<<PROCESSOR_K8)
152 #define m_ATHLON (1U<<PROCESSOR_ATHLON)
153 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
154 #define m_AMDFAM10 (1U<<PROCESSOR_AMDFAM10)
155 #define m_BDVER1 (1U<<PROCESSOR_BDVER1)
156 #define m_BDVER2 (1U<<PROCESSOR_BDVER2)
157 #define m_BDVER3 (1U<<PROCESSOR_BDVER3)
158 #define m_BDVER4 (1U<<PROCESSOR_BDVER4)
159 #define m_ZNVER1 (1U<<PROCESSOR_ZNVER1)
160 #define m_BTVER1 (1U<<PROCESSOR_BTVER1)
161 #define m_BTVER2 (1U<<PROCESSOR_BTVER2)
162 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3 | m_BDVER4)
163 #define m_BTVER (m_BTVER1 | m_BTVER2)
164 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER \
165 | m_ZNVER1)
166
167 #define m_GENERIC (1U<<PROCESSOR_GENERIC)
168
169 const char* ix86_tune_feature_names[X86_TUNE_LAST] = {
170 #undef DEF_TUNE
171 #define DEF_TUNE(tune, name, selector) name,
172 #include "x86-tune.def"
173 #undef DEF_TUNE
174 };
175
176 /* Feature tests against the various tunings. */
177 unsigned char ix86_tune_features[X86_TUNE_LAST];
178
179 /* Feature tests against the various tunings used to create ix86_tune_features
180 based on the processor mask. */
181 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
182 #undef DEF_TUNE
183 #define DEF_TUNE(tune, name, selector) selector,
184 #include "x86-tune.def"
185 #undef DEF_TUNE
186 };
187
188 /* Feature tests against the various architecture variations. */
189 unsigned char ix86_arch_features[X86_ARCH_LAST];
190
191 /* Feature tests against the various architecture variations, used to create
192 ix86_arch_features based on the processor mask. */
193 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
194 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
195 ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),
196
197 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
198 ~m_386,
199
200 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
201 ~(m_386 | m_486),
202
203 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
204 ~m_386,
205
206 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
207 ~m_386,
208 };
209
210 /* In case the average insn count for single function invocation is
211 lower than this constant, emit fast (but longer) prologue and
212 epilogue code. */
213 #define FAST_PROLOGUE_INSN_COUNT 20
214
215 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
216 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
217 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
218 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
219
220 /* Array of the smallest class containing reg number REGNO, indexed by
221 REGNO. Used by REGNO_REG_CLASS in i386.h. */
222
223 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
224 {
225 /* ax, dx, cx, bx */
226 AREG, DREG, CREG, BREG,
227 /* si, di, bp, sp */
228 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
229 /* FP registers */
230 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
231 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
232 /* arg pointer */
233 NON_Q_REGS,
234 /* flags, fpsr, fpcr, frame */
235 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
236 /* SSE registers */
237 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
238 SSE_REGS, SSE_REGS,
239 /* MMX registers */
240 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
241 MMX_REGS, MMX_REGS,
242 /* REX registers */
243 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
244 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
245 /* SSE REX registers */
246 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
247 SSE_REGS, SSE_REGS,
248 /* AVX-512 SSE registers */
249 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
250 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
251 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
252 EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS, EVEX_SSE_REGS,
253 /* Mask registers. */
254 MASK_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
255 MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS, MASK_EVEX_REGS,
256 /* MPX bound registers */
257 BND_REGS, BND_REGS, BND_REGS, BND_REGS,
258 };
259
260 /* The "default" register map used in 32bit mode. */
261
262 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
263 {
264 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
265 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
266 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
267 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
268 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
269 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
270 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
271 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
272 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
273 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
274 101, 102, 103, 104, /* bound registers */
275 };
276
277 /* The "default" register map used in 64bit mode. */
278
279 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
280 {
281 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
282 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
283 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
284 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
285 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
286 8,9,10,11,12,13,14,15, /* extended integer registers */
287 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
288 67, 68, 69, 70, 71, 72, 73, 74, /* AVX-512 registers 16-23 */
289 75, 76, 77, 78, 79, 80, 81, 82, /* AVX-512 registers 24-31 */
290 118, 119, 120, 121, 122, 123, 124, 125, /* Mask registers */
291 126, 127, 128, 129, /* bound registers */
292 };
293
294 /* Define the register numbers to be used in Dwarf debugging information.
295 The SVR4 reference port C compiler uses the following register numbers
296 in its Dwarf output code:
297 0 for %eax (gcc regno = 0)
298 1 for %ecx (gcc regno = 2)
299 2 for %edx (gcc regno = 1)
300 3 for %ebx (gcc regno = 3)
301 4 for %esp (gcc regno = 7)
302 5 for %ebp (gcc regno = 6)
303 6 for %esi (gcc regno = 4)
304 7 for %edi (gcc regno = 5)
305 The following three DWARF register numbers are never generated by
306 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
307 believes these numbers have these meanings.
308 8 for %eip (no gcc equivalent)
309 9 for %eflags (gcc regno = 17)
310 10 for %trapno (no gcc equivalent)
311 It is not at all clear how we should number the FP stack registers
312 for the x86 architecture. If the version of SDB on x86/svr4 were
313 a bit less brain dead with respect to floating-point then we would
314 have a precedent to follow with respect to DWARF register numbers
315 for x86 FP registers, but the SDB on x86/svr4 was so completely
316 broken with respect to FP registers that it is hardly worth thinking
317 of it as something to strive for compatibility with.
318 The version of x86/svr4 SDB I had does (partially)
319 seem to believe that DWARF register number 11 is associated with
320 the x86 register %st(0), but that's about all. Higher DWARF
321 register numbers don't seem to be associated with anything in
322 particular, and even for DWARF regno 11, SDB only seemed to under-
323 stand that it should say that a variable lives in %st(0) (when
324 asked via an `=' command) if we said it was in DWARF regno 11,
325 but SDB still printed garbage when asked for the value of the
326 variable in question (via a `/' command).
327 (Also note that the labels SDB printed for various FP stack regs
328 when doing an `x' command were all wrong.)
329 Note that these problems generally don't affect the native SVR4
330 C compiler because it doesn't allow the use of -O with -g and
331 because when it is *not* optimizing, it allocates a memory
332 location for each floating-point variable, and the memory
333 location is what gets described in the DWARF AT_location
334 attribute for the variable in question.
335 Regardless of the severe mental illness of the x86/svr4 SDB, we
336 do something sensible here and we use the following DWARF
337 register numbers. Note that these are all stack-top-relative
338 numbers.
339 11 for %st(0) (gcc regno = 8)
340 12 for %st(1) (gcc regno = 9)
341 13 for %st(2) (gcc regno = 10)
342 14 for %st(3) (gcc regno = 11)
343 15 for %st(4) (gcc regno = 12)
344 16 for %st(5) (gcc regno = 13)
345 17 for %st(6) (gcc regno = 14)
346 18 for %st(7) (gcc regno = 15)
347 */
348 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
349 {
350 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
351 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
352 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
353 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
354 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
355 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
356 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
357 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 16-23*/
358 -1, -1, -1, -1, -1, -1, -1, -1, /* AVX-512 registers 24-31*/
359 93, 94, 95, 96, 97, 98, 99, 100, /* Mask registers */
360 101, 102, 103, 104, /* bound registers */
361 };
362
363 /* Define parameter passing and return registers. */
364
365 static int const x86_64_int_parameter_registers[6] =
366 {
367 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
368 };
369
370 static int const x86_64_ms_abi_int_parameter_registers[4] =
371 {
372 CX_REG, DX_REG, R8_REG, R9_REG
373 };
374
375 static int const x86_64_int_return_registers[4] =
376 {
377 AX_REG, DX_REG, DI_REG, SI_REG
378 };
379
380 /* Additional registers that are clobbered by SYSV calls. */
381
382 #define NUM_X86_64_MS_CLOBBERED_REGS 12
383 static int const x86_64_ms_sysv_extra_clobbered_registers
384 [NUM_X86_64_MS_CLOBBERED_REGS] =
385 {
386 SI_REG, DI_REG,
387 XMM6_REG, XMM7_REG,
388 XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
389 XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
390 };
391
392 enum xlogue_stub {
393 XLOGUE_STUB_SAVE,
394 XLOGUE_STUB_RESTORE,
395 XLOGUE_STUB_RESTORE_TAIL,
396 XLOGUE_STUB_SAVE_HFP,
397 XLOGUE_STUB_RESTORE_HFP,
398 XLOGUE_STUB_RESTORE_HFP_TAIL,
399
400 XLOGUE_STUB_COUNT
401 };
402
403 enum xlogue_stub_sets {
404 XLOGUE_SET_ALIGNED,
405 XLOGUE_SET_ALIGNED_PLUS_8,
406 XLOGUE_SET_HFP_ALIGNED_OR_REALIGN,
407 XLOGUE_SET_HFP_ALIGNED_PLUS_8,
408
409 XLOGUE_SET_COUNT
410 };
411
412 /* Register save/restore layout used by out-of-line stubs. */
413 class xlogue_layout {
414 public:
415 struct reginfo
416 {
417 unsigned regno;
418 HOST_WIDE_INT offset; /* Offset used by stub base pointer (rax or
419 rsi) to where each register is stored. */
420 };
421
422 unsigned get_nregs () const {return m_nregs;}
423 HOST_WIDE_INT get_stack_align_off_in () const {return m_stack_align_off_in;}
424
425 const reginfo &get_reginfo (unsigned reg) const
426 {
427 gcc_assert (reg < m_nregs);
428 return m_regs[reg];
429 }
430
431 static const char *get_stub_name (enum xlogue_stub stub,
432 unsigned n_extra_args);
433
434 /* Returns an rtx for the stub's symbol based upon
435 1.) the specified stub (save, restore or restore_ret) and
436 2.) the value of cfun->machine->call_ms2sysv_extra_regs and
437 3.) whether or not stack alignment is being performed. */
438 static rtx get_stub_rtx (enum xlogue_stub stub);
439
440 /* Returns the amount of stack space (including padding) that the stub
441 needs to store registers based upon data in the machine_function. */
442 HOST_WIDE_INT get_stack_space_used () const
443 {
444 const struct machine_function *m = cfun->machine;
445 unsigned last_reg = m->call_ms2sysv_extra_regs + MIN_REGS - 1;
446
447 gcc_assert (m->call_ms2sysv_extra_regs <= MAX_EXTRA_REGS);
448 return m_regs[last_reg].offset + STUB_INDEX_OFFSET;
449 }
450
451 /* Returns the offset for the base pointer used by the stub. */
452 HOST_WIDE_INT get_stub_ptr_offset () const
453 {
454 return STUB_INDEX_OFFSET + m_stack_align_off_in;
455 }
456
457 static const struct xlogue_layout &get_instance ();
458 static unsigned count_stub_managed_regs ();
459 static bool is_stub_managed_reg (unsigned regno, unsigned count);
460
461 static const HOST_WIDE_INT STUB_INDEX_OFFSET = 0x70;
462 static const unsigned MIN_REGS = NUM_X86_64_MS_CLOBBERED_REGS;
463 static const unsigned MAX_REGS = 18;
464 static const unsigned MAX_EXTRA_REGS = MAX_REGS - MIN_REGS;
465 static const unsigned VARIANT_COUNT = MAX_EXTRA_REGS + 1;
466 static const unsigned STUB_NAME_MAX_LEN = 20;
467 static const char * const STUB_BASE_NAMES[XLOGUE_STUB_COUNT];
468 static const unsigned REG_ORDER[MAX_REGS];
469 static const unsigned REG_ORDER_REALIGN[MAX_REGS];
470
471 private:
472 xlogue_layout ();
473 xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp);
474 xlogue_layout (const xlogue_layout &);
475
476 /* True if hard frame pointer is used. */
477 bool m_hfp;
478
479 /* Max number of registers this layout manages. */
480 unsigned m_nregs;
481
482 /* Incoming offset from 16-byte alignment. */
483 HOST_WIDE_INT m_stack_align_off_in;
484
485 /* Register order and offsets. */
486 struct reginfo m_regs[MAX_REGS];
487
488 /* Lazy-inited cache of symbol names for stubs. */
489 static char s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
490 [STUB_NAME_MAX_LEN];
491
492 static const xlogue_layout s_instances[XLOGUE_SET_COUNT];
493 };
494
495 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
496 "savms64",
497 "resms64",
498 "resms64x",
499 "savms64f",
500 "resms64f",
501 "resms64fx"
502 };
503
504 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
505 /* The below offset values are where each register is stored for the layout
506 relative to incoming stack pointer. The value of each m_regs[].offset will
507 be relative to the incoming base pointer (rax or rsi) used by the stub.
508
509 s_instances: 0 1 2 3
510 Offset: realigned or aligned + 8
511 Register aligned aligned + 8 aligned w/HFP w/HFP */
512 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
513 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
514 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
515 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
516 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
517 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
518 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
519 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
520 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
521 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
522 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
523 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
524 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
525 BP_REG, /* 0xc0 0xc8 N/A N/A */
526 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
527 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
528 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
529 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
530 };
531
532 /* Instantiate static const values. */
533 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
534 const unsigned xlogue_layout::MIN_REGS;
535 const unsigned xlogue_layout::MAX_REGS;
536 const unsigned xlogue_layout::MAX_EXTRA_REGS;
537 const unsigned xlogue_layout::VARIANT_COUNT;
538 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
539
540 /* Initialize xlogue_layout::s_stub_names to zero. */
541 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
542 [STUB_NAME_MAX_LEN];
543
544 /* Instantiates all xlogue_layout instances. */
545 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
546 xlogue_layout (0, false),
547 xlogue_layout (8, false),
548 xlogue_layout (0, true),
549 xlogue_layout (8, true)
550 };
551
552 /* Return an appropriate const instance of xlogue_layout based upon values
553 in cfun->machine and crtl. */
554 const struct xlogue_layout &
555 xlogue_layout::get_instance ()
556 {
557 enum xlogue_stub_sets stub_set;
558 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
559
560 if (stack_realign_fp)
561 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
562 else if (frame_pointer_needed)
563 stub_set = aligned_plus_8
564 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
565 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
566 else
567 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
568
569 return s_instances[stub_set];
570 }
571
572 /* Determine how many clobbered registers can be saved by the stub.
573 Returns the count of registers the stub will save and restore. */
574 unsigned
575 xlogue_layout::count_stub_managed_regs ()
576 {
577 bool hfp = frame_pointer_needed || stack_realign_fp;
578 unsigned i, count;
579 unsigned regno;
580
581 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
582 {
583 regno = REG_ORDER[i];
584 if (regno == BP_REG && hfp)
585 continue;
586 if (!ix86_save_reg (regno, false, false))
587 break;
588 ++count;
589 }
590 return count;
591 }
592
593 /* Determine if register REGNO is a stub managed register given the
594 total COUNT of stub managed registers. */
595 bool
596 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
597 {
598 bool hfp = frame_pointer_needed || stack_realign_fp;
599 unsigned i;
600
601 for (i = 0; i < count; ++i)
602 {
603 gcc_assert (i < MAX_REGS);
604 if (REG_ORDER[i] == BP_REG && hfp)
605 ++count;
606 else if (REG_ORDER[i] == regno)
607 return true;
608 }
609 return false;
610 }
611
612 /* Constructor for xlogue_layout. */
613 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
614 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
615 m_stack_align_off_in (stack_align_off_in)
616 {
617 HOST_WIDE_INT offset = stack_align_off_in;
618 unsigned i, j;
619
620 for (i = j = 0; i < MAX_REGS; ++i)
621 {
622 unsigned regno = REG_ORDER[i];
623
624 if (regno == BP_REG && hfp)
625 continue;
626 if (SSE_REGNO_P (regno))
627 {
628 offset += 16;
629 /* Verify that SSE regs are always aligned. */
630 gcc_assert (!((stack_align_off_in + offset) & 15));
631 }
632 else
633 offset += 8;
634
635 m_regs[j].regno = regno;
636 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
637 }
638 gcc_assert (j == m_nregs);
639 }
640
641 const char *
642 xlogue_layout::get_stub_name (enum xlogue_stub stub,
643 unsigned n_extra_regs)
644 {
645 const int have_avx = TARGET_AVX;
646 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
647
648 /* Lazy init */
649 if (!*name)
650 {
651 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
652 (have_avx ? "avx" : "sse"),
653 STUB_BASE_NAMES[stub],
654 MIN_REGS + n_extra_regs);
655 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
656 }
657
658 return name;
659 }
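/* For example (a sketch derived from the format string above): with
   TARGET_AVX set, stub == XLOGUE_STUB_SAVE and n_extra_regs == 0, the
   resulting name is "__avx_savms64_12", since MIN_REGS is 12; without
   AVX the prefix is "sse" instead of "avx".  */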
660
661 /* Return rtx of a symbol ref for the entry point (based upon
662 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
663 rtx
664 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
665 {
666 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
667 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
668 gcc_assert (stub < XLOGUE_STUB_COUNT);
669 gcc_assert (crtl->stack_realign_finalized);
670
671 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
672 }
673
674 /* Define the structure for the machine field in struct function. */
675
676 struct GTY(()) stack_local_entry {
677 unsigned short mode;
678 unsigned short n;
679 rtx rtl;
680 struct stack_local_entry *next;
681 };
682
683 /* Which cpu are we scheduling for. */
684 enum attr_cpu ix86_schedule;
685
686 /* Which cpu are we optimizing for. */
687 enum processor_type ix86_tune;
688
689 /* Which instruction set architecture to use. */
690 enum processor_type ix86_arch;
691
692 /* True if processor has SSE prefetch instruction. */
693 unsigned char x86_prefetch_sse;
694
695 /* -mstackrealign option */
696 static const char ix86_force_align_arg_pointer_string[]
697 = "force_align_arg_pointer";
698
699 static rtx (*ix86_gen_leave) (void);
700 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
701 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
702 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
703 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
704 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
705 static rtx (*ix86_gen_monitorx) (rtx, rtx, rtx);
706 static rtx (*ix86_gen_clzero) (rtx);
707 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
708 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
709 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
710 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
711 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
712 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
713
714 /* Preferred alignment for stack boundary in bits. */
715 unsigned int ix86_preferred_stack_boundary;
716
717 /* Alignment for incoming stack boundary in bits specified at
718 command line. */
719 static unsigned int ix86_user_incoming_stack_boundary;
720
721 /* Default alignment for incoming stack boundary in bits. */
722 static unsigned int ix86_default_incoming_stack_boundary;
723
724 /* Alignment for incoming stack boundary in bits. */
725 unsigned int ix86_incoming_stack_boundary;
726
727 /* Calling abi specific va_list type nodes. */
728 static GTY(()) tree sysv_va_list_type_node;
729 static GTY(()) tree ms_va_list_type_node;
730
731 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
732 char internal_label_prefix[16];
733 int internal_label_prefix_len;
734
735 /* Fence to use after loop using movnt. */
736 tree x86_mfence;
737
738 /* Register class used for passing a given 64-bit part of the argument.
739 These represent classes as documented by the PS ABI, with the exception
740 of the SSESF and SSEDF classes, which are basically the SSE class; GCC
741 will just use an SF or DFmode move instead of DImode to avoid reformatting penalties.
742
743 Similarly, we play games with INTEGERSI_CLASS to use cheaper SImode moves
744 whenever possible (the upper half does contain padding). */
745 enum x86_64_reg_class
746 {
747 X86_64_NO_CLASS,
748 X86_64_INTEGER_CLASS,
749 X86_64_INTEGERSI_CLASS,
750 X86_64_SSE_CLASS,
751 X86_64_SSESF_CLASS,
752 X86_64_SSEDF_CLASS,
753 X86_64_SSEUP_CLASS,
754 X86_64_X87_CLASS,
755 X86_64_X87UP_CLASS,
756 X86_64_COMPLEX_X87_CLASS,
757 X86_64_MEMORY_CLASS
758 };
759
760 #define MAX_CLASSES 8
761
762 /* Table of constants used by fldpi, fldln2, etc.... */
763 static REAL_VALUE_TYPE ext_80387_constants_table [5];
764 static bool ext_80387_constants_init;
765
766 \f
767 static struct machine_function * ix86_init_machine_status (void);
768 static rtx ix86_function_value (const_tree, const_tree, bool);
769 static bool ix86_function_value_regno_p (const unsigned int);
770 static unsigned int ix86_function_arg_boundary (machine_mode,
771 const_tree);
772 static rtx ix86_static_chain (const_tree, bool);
773 static int ix86_function_regparm (const_tree, const_tree);
774 static void ix86_compute_frame_layout (void);
775 static bool ix86_expand_vector_init_one_nonzero (bool, machine_mode,
776 rtx, rtx, int);
777 static void ix86_add_new_builtins (HOST_WIDE_INT, HOST_WIDE_INT);
778 static tree ix86_canonical_va_list_type (tree);
779 static void predict_jump (int);
780 static unsigned int split_stack_prologue_scratch_regno (void);
781 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
782
783 enum ix86_function_specific_strings
784 {
785 IX86_FUNCTION_SPECIFIC_ARCH,
786 IX86_FUNCTION_SPECIFIC_TUNE,
787 IX86_FUNCTION_SPECIFIC_MAX
788 };
789
790 static char *ix86_target_string (HOST_WIDE_INT, HOST_WIDE_INT, int, int,
791 const char *, const char *, enum fpmath_unit,
792 bool);
793 static void ix86_function_specific_save (struct cl_target_option *,
794 struct gcc_options *opts);
795 static void ix86_function_specific_restore (struct gcc_options *opts,
796 struct cl_target_option *);
797 static void ix86_function_specific_post_stream_in (struct cl_target_option *);
798 static void ix86_function_specific_print (FILE *, int,
799 struct cl_target_option *);
800 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
801 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
802 struct gcc_options *,
803 struct gcc_options *,
804 struct gcc_options *);
805 static bool ix86_can_inline_p (tree, tree);
806 static void ix86_set_current_function (tree);
807 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
808
809 static enum calling_abi ix86_function_abi (const_tree);
810
811 \f
812 #ifndef SUBTARGET32_DEFAULT_CPU
813 #define SUBTARGET32_DEFAULT_CPU "i386"
814 #endif
815
816 /* Whether -mtune= or -march= were specified */
817 static int ix86_tune_defaulted;
818 static int ix86_arch_specified;
819
820 /* Vectorization library interface and handlers. */
821 static tree (*ix86_veclib_handler) (combined_fn, tree, tree);
822
823 static tree ix86_veclibabi_svml (combined_fn, tree, tree);
824 static tree ix86_veclibabi_acml (combined_fn, tree, tree);
825
826 /* Processor target table, indexed by processor number */
827 struct ptt
828 {
829 const char *const name; /* processor name */
830 const struct processor_costs *cost; /* Processor costs */
831 const int align_loop; /* Default alignments. */
832 const int align_loop_max_skip;
833 const int align_jump;
834 const int align_jump_max_skip;
835 const int align_func;
836 };
837
838 /* This table must be in sync with enum processor_type in i386.h. */
839 static const struct ptt processor_target_table[PROCESSOR_max] =
840 {
841 {"generic", &generic_cost, 16, 10, 16, 10, 16},
842 {"i386", &i386_cost, 4, 3, 4, 3, 4},
843 {"i486", &i486_cost, 16, 15, 16, 15, 16},
844 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
845 {"lakemont", &lakemont_cost, 16, 7, 16, 7, 16},
846 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
847 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
848 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
849 {"core2", &core_cost, 16, 10, 16, 10, 16},
850 {"nehalem", &core_cost, 16, 10, 16, 10, 16},
851 {"sandybridge", &core_cost, 16, 10, 16, 10, 16},
852 {"haswell", &core_cost, 16, 10, 16, 10, 16},
853 {"bonnell", &atom_cost, 16, 15, 16, 7, 16},
854 {"silvermont", &slm_cost, 16, 15, 16, 7, 16},
855 {"knl", &slm_cost, 16, 15, 16, 7, 16},
856 {"knm", &slm_cost, 16, 15, 16, 7, 16},
857 {"skylake-avx512", &skylake_cost, 16, 10, 16, 10, 16},
858 {"cannonlake", &core_cost, 16, 10, 16, 10, 16},
859 {"intel", &intel_cost, 16, 15, 16, 7, 16},
860 {"geode", &geode_cost, 0, 0, 0, 0, 0},
861 {"k6", &k6_cost, 32, 7, 32, 7, 32},
862 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
863 {"k8", &k8_cost, 16, 7, 16, 7, 16},
864 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
865 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
866 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
867 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
868 {"bdver4", &bdver4_cost, 16, 10, 16, 7, 11},
869 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
870 {"btver2", &btver2_cost, 16, 10, 16, 7, 11},
871 {"znver1", &znver1_cost, 16, 15, 16, 15, 16}
872 };
873 \f
874 static unsigned int
875 rest_of_handle_insert_vzeroupper (void)
876 {
877 int i;
878
879 /* vzeroupper instructions are inserted immediately after reload to
880 account for possible spills from 256bit or 512bit registers. The pass
881 reuses mode switching infrastructure by re-running mode insertion
882 pass, so disable entities that have already been processed. */
883 for (i = 0; i < MAX_386_ENTITIES; i++)
884 ix86_optimize_mode_switching[i] = 0;
885
886 ix86_optimize_mode_switching[AVX_U128] = 1;
887
888 /* Call optimize_mode_switching. */
889 g->get_passes ()->execute_pass_mode_switching ();
890 return 0;
891 }
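/* Rationale sketch: mixing 256-bit/512-bit AVX code with legacy SSE code
   while the upper halves of the vector registers are live incurs a
   transition penalty on many CPUs; vzeroupper clears those upper halves.
   The mode-switching pass tracks the AVX_U128 state and places the
   vzeroupper instructions where that state must change.  */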
892
893 /* Return 1 if INSN uses or defines a hard register.
894 Hard register uses in a memory address are ignored.
895 Clobbers and flags definitions are ignored. */
896
897 static bool
898 has_non_address_hard_reg (rtx_insn *insn)
899 {
900 df_ref ref;
901 FOR_EACH_INSN_DEF (ref, insn)
902 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
903 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
904 && DF_REF_REGNO (ref) != FLAGS_REG)
905 return true;
906
907 FOR_EACH_INSN_USE (ref, insn)
908 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
909 return true;
910
911 return false;
912 }
913
914 /* Check if comparison INSN may be transformed
915 into a vector comparison. Currently we transform
916 only zero checks which look like:
917
918 (set (reg:CCZ 17 flags)
919 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
920 (subreg:SI (reg:DI x) 0))
921 (const_int 0 [0]))) */
922
923 static bool
924 convertible_comparison_p (rtx_insn *insn)
925 {
926 if (!TARGET_SSE4_1)
927 return false;
928
929 rtx def_set = single_set (insn);
930
931 gcc_assert (def_set);
932
933 rtx src = SET_SRC (def_set);
934 rtx dst = SET_DEST (def_set);
935
936 gcc_assert (GET_CODE (src) == COMPARE);
937
938 if (GET_CODE (dst) != REG
939 || REGNO (dst) != FLAGS_REG
940 || GET_MODE (dst) != CCZmode)
941 return false;
942
943 rtx op1 = XEXP (src, 0);
944 rtx op2 = XEXP (src, 1);
945
946 if (op2 != CONST0_RTX (GET_MODE (op2)))
947 return false;
948
949 if (GET_CODE (op1) != IOR)
950 return false;
951
952 op2 = XEXP (op1, 1);
953 op1 = XEXP (op1, 0);
954
955 if (!SUBREG_P (op1)
956 || !SUBREG_P (op2)
957 || GET_MODE (op1) != SImode
958 || GET_MODE (op2) != SImode
959 || ((SUBREG_BYTE (op1) != 0
960 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
961 && (SUBREG_BYTE (op2) != 0
962 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
963 return false;
964
965 op1 = SUBREG_REG (op1);
966 op2 = SUBREG_REG (op2);
967
968 if (op1 != op2
969 || !REG_P (op1)
970 || GET_MODE (op1) != DImode)
971 return false;
972
973 return true;
974 }
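/* Source-level sketch (hypothetical user code) of where the pattern
   above comes from on !TARGET_64BIT:

     long long x = ...;
     if (x == 0)
       ...

   The 64-bit zero test is expanded as an IOR of the two SImode halves
   of X feeding a CCZmode compare, which is exactly the shape checked
   here.  */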
975
976 /* The DImode version of scalar_to_vector_candidate_p. */
977
978 static bool
979 dimode_scalar_to_vector_candidate_p (rtx_insn *insn)
980 {
981 rtx def_set = single_set (insn);
982
983 if (!def_set)
984 return false;
985
986 if (has_non_address_hard_reg (insn))
987 return false;
988
989 rtx src = SET_SRC (def_set);
990 rtx dst = SET_DEST (def_set);
991
992 if (GET_CODE (src) == COMPARE)
993 return convertible_comparison_p (insn);
994
995 /* We are interested in DImode promotion only. */
996 if ((GET_MODE (src) != DImode
997 && !CONST_INT_P (src))
998 || GET_MODE (dst) != DImode)
999 return false;
1000
1001 if (!REG_P (dst) && !MEM_P (dst))
1002 return false;
1003
1004 switch (GET_CODE (src))
1005 {
1006 case ASHIFTRT:
1007 if (!TARGET_AVX512VL)
1008 return false;
1009 /* FALLTHRU */
1010
1011 case ASHIFT:
1012 case LSHIFTRT:
1013 if (!REG_P (XEXP (src, 1))
1014 && (!SUBREG_P (XEXP (src, 1))
1015 || SUBREG_BYTE (XEXP (src, 1)) != 0
1016 || !REG_P (SUBREG_REG (XEXP (src, 1))))
1017 && (!CONST_INT_P (XEXP (src, 1))
1018 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, 63)))
1019 return false;
1020
1021 if (GET_MODE (XEXP (src, 1)) != QImode
1022 && !CONST_INT_P (XEXP (src, 1)))
1023 return false;
1024 break;
1025
1026 case PLUS:
1027 case MINUS:
1028 case IOR:
1029 case XOR:
1030 case AND:
1031 if (!REG_P (XEXP (src, 1))
1032 && !MEM_P (XEXP (src, 1))
1033 && !CONST_INT_P (XEXP (src, 1)))
1034 return false;
1035
1036 if (GET_MODE (XEXP (src, 1)) != DImode
1037 && !CONST_INT_P (XEXP (src, 1)))
1038 return false;
1039 break;
1040
1041 case NEG:
1042 case NOT:
1043 break;
1044
1045 case REG:
1046 return true;
1047
1048 case MEM:
1049 case CONST_INT:
1050 return REG_P (dst);
1051
1052 default:
1053 return false;
1054 }
1055
1056 if (!REG_P (XEXP (src, 0))
1057 && !MEM_P (XEXP (src, 0))
1058 && !CONST_INT_P (XEXP (src, 0))
1059 /* Check for andnot case. */
1060 && (GET_CODE (src) != AND
1061 || GET_CODE (XEXP (src, 0)) != NOT
1062 || !REG_P (XEXP (XEXP (src, 0), 0))))
1063 return false;
1064
1065 if (GET_MODE (XEXP (src, 0)) != DImode
1066 && !CONST_INT_P (XEXP (src, 0)))
1067 return false;
1068
1069 return true;
1070 }
1071
1072 /* The TImode version of scalar_to_vector_candidate_p. */
1073
1074 static bool
1075 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1076 {
1077 rtx def_set = single_set (insn);
1078
1079 if (!def_set)
1080 return false;
1081
1082 if (has_non_address_hard_reg (insn))
1083 return false;
1084
1085 rtx src = SET_SRC (def_set);
1086 rtx dst = SET_DEST (def_set);
1087
1088 /* Only TImode load and store are allowed. */
1089 if (GET_MODE (dst) != TImode)
1090 return false;
1091
1092 if (MEM_P (dst))
1093 {
1094 /* Check for a store. Memory must be aligned, or an unaligned store
1095 must be optimal. Only support stores from a register, a standard
1096 SSE constant, or a CONST_WIDE_INT generated from a piecewise store.
1097
1098 ??? Verify performance impact before enabling CONST_INT for
1099 __int128 store. */
1100 if (misaligned_operand (dst, TImode)
1101 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1102 return false;
1103
1104 switch (GET_CODE (src))
1105 {
1106 default:
1107 return false;
1108
1109 case REG:
1110 case CONST_WIDE_INT:
1111 return true;
1112
1113 case CONST_INT:
1114 return standard_sse_constant_p (src, TImode);
1115 }
1116 }
1117 else if (MEM_P (src))
1118 {
1119 /* Check for a load. Memory must be aligned, or an unaligned load
1120 must be optimal. */
1121 return (REG_P (dst)
1122 && (!misaligned_operand (src, TImode)
1123 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1124 }
1125
1126 return false;
1127 }
1128
1129 /* Return 1 if INSN may be converted into a vector
1130 instruction. */
1131
1132 static bool
1133 scalar_to_vector_candidate_p (rtx_insn *insn)
1134 {
1135 if (TARGET_64BIT)
1136 return timode_scalar_to_vector_candidate_p (insn);
1137 else
1138 return dimode_scalar_to_vector_candidate_p (insn);
1139 }
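/* Illustrative sketch (hypothetical user code) of a DImode STV candidate
   on !TARGET_64BIT:

     unsigned long long
     f (unsigned long long a, unsigned long long b)
     {
       return a & b;
     }

   The DImode AND is a single_set with register operands, so it passes
   dimode_scalar_to_vector_candidate_p and may later be rewritten to use
   V2DImode SSE registers if the chain's computed gain is positive.  */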
1140
1141 /* The DImode version of remove_non_convertible_regs. */
1142
1143 static void
1144 dimode_remove_non_convertible_regs (bitmap candidates)
1145 {
1146 bitmap_iterator bi;
1147 unsigned id;
1148 bitmap regs = BITMAP_ALLOC (NULL);
1149
1150 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1151 {
1152 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1153 rtx reg = SET_DEST (def_set);
1154
1155 if (!REG_P (reg)
1156 || bitmap_bit_p (regs, REGNO (reg))
1157 || HARD_REGISTER_P (reg))
1158 continue;
1159
1160 for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
1161 def;
1162 def = DF_REF_NEXT_REG (def))
1163 {
1164 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1165 {
1166 if (dump_file)
1167 fprintf (dump_file,
1168 "r%d has non convertible definition in insn %d\n",
1169 REGNO (reg), DF_REF_INSN_UID (def));
1170
1171 bitmap_set_bit (regs, REGNO (reg));
1172 break;
1173 }
1174 }
1175 }
1176
1177 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1178 {
1179 for (df_ref def = DF_REG_DEF_CHAIN (id);
1180 def;
1181 def = DF_REF_NEXT_REG (def))
1182 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1183 {
1184 if (dump_file)
1185 fprintf (dump_file, "Removing insn %d from candidates list\n",
1186 DF_REF_INSN_UID (def));
1187
1188 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1189 }
1190 }
1191
1192 BITMAP_FREE (regs);
1193 }
1194
1195 /* For a register REGNO, scan instructions for its defs and uses.
1196 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1197
1198 static void
1199 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1200 unsigned int regno)
1201 {
1202 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1203 def;
1204 def = DF_REF_NEXT_REG (def))
1205 {
1206 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1207 {
1208 if (dump_file)
1209 fprintf (dump_file,
1210 "r%d has non convertible def in insn %d\n",
1211 regno, DF_REF_INSN_UID (def));
1212
1213 bitmap_set_bit (regs, regno);
1214 break;
1215 }
1216 }
1217
1218 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1219 ref;
1220 ref = DF_REF_NEXT_REG (ref))
1221 {
1222 /* Debug instructions are skipped. */
1223 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1224 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1225 {
1226 if (dump_file)
1227 fprintf (dump_file,
1228 "r%d has non convertible use in insn %d\n",
1229 regno, DF_REF_INSN_UID (ref));
1230
1231 bitmap_set_bit (regs, regno);
1232 break;
1233 }
1234 }
1235 }
1236
1237 /* The TImode version of remove_non_convertible_regs. */
1238
1239 static void
1240 timode_remove_non_convertible_regs (bitmap candidates)
1241 {
1242 bitmap_iterator bi;
1243 unsigned id;
1244 bitmap regs = BITMAP_ALLOC (NULL);
1245
1246 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1247 {
1248 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1249 rtx dest = SET_DEST (def_set);
1250 rtx src = SET_SRC (def_set);
1251
1252 if ((!REG_P (dest)
1253 || bitmap_bit_p (regs, REGNO (dest))
1254 || HARD_REGISTER_P (dest))
1255 && (!REG_P (src)
1256 || bitmap_bit_p (regs, REGNO (src))
1257 || HARD_REGISTER_P (src)))
1258 continue;
1259
1260 if (REG_P (dest))
1261 timode_check_non_convertible_regs (candidates, regs,
1262 REGNO (dest));
1263
1264 if (REG_P (src))
1265 timode_check_non_convertible_regs (candidates, regs,
1266 REGNO (src));
1267 }
1268
1269 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1270 {
1271 for (df_ref def = DF_REG_DEF_CHAIN (id);
1272 def;
1273 def = DF_REF_NEXT_REG (def))
1274 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1275 {
1276 if (dump_file)
1277 fprintf (dump_file, "Removing insn %d from candidates list\n",
1278 DF_REF_INSN_UID (def));
1279
1280 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1281 }
1282
1283 for (df_ref ref = DF_REG_USE_CHAIN (id);
1284 ref;
1285 ref = DF_REF_NEXT_REG (ref))
1286 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1287 {
1288 if (dump_file)
1289 fprintf (dump_file, "Removing insn %d from candidates list\n",
1290 DF_REF_INSN_UID (ref));
1291
1292 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1293 }
1294 }
1295
1296 BITMAP_FREE (regs);
1297 }
1298
1299 /* For a given bitmap of insn UIDs, scan all instructions and
1300 remove an insn from CANDIDATES in case it has both convertible
1301 and non-convertible definitions.
1302
1303 All insns in a bitmap are conversion candidates according to
1304 scalar_to_vector_candidate_p. Currently it implies all insns
1305 are single_set. */
1306
1307 static void
1308 remove_non_convertible_regs (bitmap candidates)
1309 {
1310 if (TARGET_64BIT)
1311 timode_remove_non_convertible_regs (candidates);
1312 else
1313 dimode_remove_non_convertible_regs (candidates);
1314 }
1315
1316 class scalar_chain
1317 {
1318 public:
1319 scalar_chain ();
1320 virtual ~scalar_chain ();
1321
1322 static unsigned max_id;
1323
1324 /* ID of a chain. */
1325 unsigned int chain_id;
1326 /* A queue of instructions to be included into a chain. */
1327 bitmap queue;
1328 /* Instructions included into a chain. */
1329 bitmap insns;
1330 /* All registers defined by a chain. */
1331 bitmap defs;
1332 /* Registers used in both vector and scalar modes. */
1333 bitmap defs_conv;
1334
1335 void build (bitmap candidates, unsigned insn_uid);
1336 virtual int compute_convert_gain () = 0;
1337 int convert ();
1338
1339 protected:
1340 void add_to_queue (unsigned insn_uid);
1341 void emit_conversion_insns (rtx insns, rtx_insn *pos);
1342
1343 private:
1344 void add_insn (bitmap candidates, unsigned insn_uid);
1345 void analyze_register_chain (bitmap candidates, df_ref ref);
1346 virtual void mark_dual_mode_def (df_ref def) = 0;
1347 virtual void convert_insn (rtx_insn *insn) = 0;
1348 virtual void convert_registers () = 0;
1349 };
1350
1351 class dimode_scalar_chain : public scalar_chain
1352 {
1353 public:
1354 int compute_convert_gain ();
1355 private:
1356 void mark_dual_mode_def (df_ref def);
1357 rtx replace_with_subreg (rtx x, rtx reg, rtx subreg);
1358 void replace_with_subreg_in_insn (rtx_insn *insn, rtx reg, rtx subreg);
1359 void convert_insn (rtx_insn *insn);
1360 void convert_op (rtx *op, rtx_insn *insn);
1361 void convert_reg (unsigned regno);
1362 void make_vector_copies (unsigned regno);
1363 void convert_registers ();
1364 int vector_const_cost (rtx exp);
1365 };
1366
1367 class timode_scalar_chain : public scalar_chain
1368 {
1369 public:
1370 /* Converting from TImode to V1TImode is always faster. */
1371 int compute_convert_gain () { return 1; }
1372
1373 private:
1374 void mark_dual_mode_def (df_ref def);
1375 void fix_debug_reg_uses (rtx reg);
1376 void convert_insn (rtx_insn *insn);
1377 /* We don't convert registers to a different size. */
1378 void convert_registers () {}
1379 };
1380
1381 unsigned scalar_chain::max_id = 0;
1382
1383 /* Initialize new chain. */
1384
1385 scalar_chain::scalar_chain ()
1386 {
1387 chain_id = ++max_id;
1388
1389 if (dump_file)
1390 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
1391
1392 bitmap_obstack_initialize (NULL);
1393 insns = BITMAP_ALLOC (NULL);
1394 defs = BITMAP_ALLOC (NULL);
1395 defs_conv = BITMAP_ALLOC (NULL);
1396 queue = NULL;
1397 }
1398
1399 /* Free chain's data. */
1400
1401 scalar_chain::~scalar_chain ()
1402 {
1403 BITMAP_FREE (insns);
1404 BITMAP_FREE (defs);
1405 BITMAP_FREE (defs_conv);
1406 bitmap_obstack_release (NULL);
1407 }
1408
1409 /* Add instruction into the chain's queue. */
1410
1411 void
1412 scalar_chain::add_to_queue (unsigned insn_uid)
1413 {
1414 if (bitmap_bit_p (insns, insn_uid)
1415 || bitmap_bit_p (queue, insn_uid))
1416 return;
1417
1418 if (dump_file)
1419 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
1420 insn_uid, chain_id);
1421 bitmap_set_bit (queue, insn_uid);
1422 }
1423
1424 /* For DImode conversion, mark register defined by DEF as requiring
1425 conversion. */
1426
1427 void
1428 dimode_scalar_chain::mark_dual_mode_def (df_ref def)
1429 {
1430 gcc_assert (DF_REF_REG_DEF_P (def));
1431
1432 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (def)))
1433 return;
1434
1435 if (dump_file)
1436 fprintf (dump_file,
1437 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
1438 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
1439
1440 bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
1441 }
1442
1443 /* For TImode conversion, it is unused. */
1444
1445 void
1446 timode_scalar_chain::mark_dual_mode_def (df_ref)
1447 {
1448 gcc_unreachable ();
1449 }
1450
1451 /* Check REF's chain to add new insns into a queue
1452 and find registers requiring conversion. */
1453
1454 void
1455 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
1456 {
1457 df_link *chain;
1458
1459 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
1460 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
1461 add_to_queue (DF_REF_INSN_UID (ref));
1462
1463 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
1464 {
1465 unsigned uid = DF_REF_INSN_UID (chain->ref);
1466
1467 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
1468 continue;
1469
1470 if (!DF_REF_REG_MEM_P (chain->ref))
1471 {
1472 if (bitmap_bit_p (insns, uid))
1473 continue;
1474
1475 if (bitmap_bit_p (candidates, uid))
1476 {
1477 add_to_queue (uid);
1478 continue;
1479 }
1480 }
1481
1482 if (DF_REF_REG_DEF_P (chain->ref))
1483 {
1484 if (dump_file)
1485 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
1486 DF_REF_REGNO (chain->ref), uid);
1487 mark_dual_mode_def (chain->ref);
1488 }
1489 else
1490 {
1491 if (dump_file)
1492 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
1493 DF_REF_REGNO (chain->ref), uid);
1494 mark_dual_mode_def (ref);
1495 }
1496 }
1497 }
1498
1499 /* Add instruction into a chain. */
1500
1501 void
1502 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
1503 {
1504 if (bitmap_bit_p (insns, insn_uid))
1505 return;
1506
1507 if (dump_file)
1508 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
1509
1510 bitmap_set_bit (insns, insn_uid);
1511
1512 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1513 rtx def_set = single_set (insn);
1514 if (def_set && REG_P (SET_DEST (def_set))
1515 && !HARD_REGISTER_P (SET_DEST (def_set)))
1516 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
1517
1518 df_ref ref;
1519 df_ref def;
1520 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1521 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
1522 for (def = DF_REG_DEF_CHAIN (DF_REF_REGNO (ref));
1523 def;
1524 def = DF_REF_NEXT_REG (def))
1525 analyze_register_chain (candidates, def);
1526 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
1527 if (!DF_REF_REG_MEM_P (ref))
1528 analyze_register_chain (candidates, ref);
1529 }
1530
1531 /* Build new chain starting from insn INSN_UID recursively
1532 adding all dependent uses and definitions. */
1533
1534 void
1535 scalar_chain::build (bitmap candidates, unsigned insn_uid)
1536 {
1537 queue = BITMAP_ALLOC (NULL);
1538 bitmap_set_bit (queue, insn_uid);
1539
1540 if (dump_file)
1541 fprintf (dump_file, "Building chain #%d...\n", chain_id);
1542
1543 while (!bitmap_empty_p (queue))
1544 {
1545 insn_uid = bitmap_first_set_bit (queue);
1546 bitmap_clear_bit (queue, insn_uid);
1547 bitmap_clear_bit (candidates, insn_uid);
1548 add_insn (candidates, insn_uid);
1549 }
1550
1551 if (dump_file)
1552 {
1553 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
1554 fprintf (dump_file, " insns: ");
1555 dump_bitmap (dump_file, insns);
1556 if (!bitmap_empty_p (defs_conv))
1557 {
1558 bitmap_iterator bi;
1559 unsigned id;
1560 const char *comma = "";
1561 fprintf (dump_file, " defs to convert: ");
1562 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1563 {
1564 fprintf (dump_file, "%sr%d", comma, id);
1565 comma = ", ";
1566 }
1567 fprintf (dump_file, "\n");
1568 }
1569 }
1570
1571 BITMAP_FREE (queue);
1572 }
1573
1574 /* Return the cost of building a vector constant
1575 instead of using a scalar one. */
1576
1577 int
1578 dimode_scalar_chain::vector_const_cost (rtx exp)
1579 {
1580 gcc_assert (CONST_INT_P (exp));
1581
1582 if (standard_sse_constant_p (exp, V2DImode))
1583 return COSTS_N_INSNS (1);
1584 return ix86_cost->sse_load[1];
1585 }
1586
1587 /* Compute a gain for chain conversion. */
1588
1589 int
1590 dimode_scalar_chain::compute_convert_gain ()
1591 {
1592 bitmap_iterator bi;
1593 unsigned insn_uid;
1594 int gain = 0;
1595 int cost = 0;
1596
1597 if (dump_file)
1598 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1599
1600 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1601 {
1602 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1603 rtx def_set = single_set (insn);
1604 rtx src = SET_SRC (def_set);
1605 rtx dst = SET_DEST (def_set);
1606
1607 if (REG_P (src) && REG_P (dst))
1608 gain += COSTS_N_INSNS (2) - ix86_cost->xmm_move;
1609 else if (REG_P (src) && MEM_P (dst))
1610 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1611 else if (MEM_P (src) && REG_P (dst))
1612 gain += 2 * ix86_cost->int_load[2] - ix86_cost->sse_load[1];
1613 else if (GET_CODE (src) == ASHIFT
1614 || GET_CODE (src) == ASHIFTRT
1615 || GET_CODE (src) == LSHIFTRT)
1616 {
1617 if (CONST_INT_P (XEXP (src, 0)))
1618 gain -= vector_const_cost (XEXP (src, 0));
1619 if (CONST_INT_P (XEXP (src, 1)))
1620 {
1621 gain += ix86_cost->shift_const;
1622 if (INTVAL (XEXP (src, 1)) >= 32)
1623 gain -= COSTS_N_INSNS (1);
1624 }
1625 else
1626 /* Additional gain for omitting two CMOVs. */
1627 gain += ix86_cost->shift_var + COSTS_N_INSNS (2);
1628 }
1629 else if (GET_CODE (src) == PLUS
1630 || GET_CODE (src) == MINUS
1631 || GET_CODE (src) == IOR
1632 || GET_CODE (src) == XOR
1633 || GET_CODE (src) == AND)
1634 {
1635 gain += ix86_cost->add;
1636 /* Additional gain for andnot for targets without BMI. */
1637 if (GET_CODE (XEXP (src, 0)) == NOT
1638 && !TARGET_BMI)
1639 gain += 2 * ix86_cost->add;
1640
1641 if (CONST_INT_P (XEXP (src, 0)))
1642 gain -= vector_const_cost (XEXP (src, 0));
1643 if (CONST_INT_P (XEXP (src, 1)))
1644 gain -= vector_const_cost (XEXP (src, 1));
1645 }
1646 else if (GET_CODE (src) == NEG
1647 || GET_CODE (src) == NOT)
1648 gain += ix86_cost->add - COSTS_N_INSNS (1);
1649 else if (GET_CODE (src) == COMPARE)
1650 {
1651 /* Assume comparison cost is the same. */
1652 }
1653 else if (CONST_INT_P (src))
1654 {
1655 if (REG_P (dst))
1656 gain += COSTS_N_INSNS (2);
1657 else if (MEM_P (dst))
1658 gain += 2 * ix86_cost->int_store[2] - ix86_cost->sse_store[1];
1659 gain -= vector_const_cost (src);
1660 }
1661 else
1662 gcc_unreachable ();
1663 }
1664
1665 if (dump_file)
1666 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
1667
1668 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, insn_uid, bi)
1669 cost += DF_REG_DEF_COUNT (insn_uid) * ix86_cost->mmxsse_to_integer;
1670
1671 if (dump_file)
1672 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
1673
1674 gain -= cost;
1675
1676 if (dump_file)
1677 fprintf (dump_file, " Total gain: %d\n", gain);
1678
1679 return gain;
1680 }
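/* Worked sketch of the arithmetic above: for a chain consisting of a
   single DImode register-to-register move whose destination register
   also needs a scalar copy, the result is roughly

     gain = (COSTS_N_INSNS (2) - ix86_cost->xmm_move)
            - n_defs * ix86_cost->mmxsse_to_integer;

   i.e. the saving from replacing two 32-bit moves with one xmm move,
   minus the cost of moving the value back out of the vector unit for
   each definition of a register that must exist in both modes.  */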
1681
1682 /* Replace REG in X with a V2DI subreg of NEW_REG. */
1683
1684 rtx
1685 dimode_scalar_chain::replace_with_subreg (rtx x, rtx reg, rtx new_reg)
1686 {
1687 if (x == reg)
1688 return gen_rtx_SUBREG (V2DImode, new_reg, 0);
1689
1690 const char *fmt = GET_RTX_FORMAT (GET_CODE (x));
1691 int i, j;
1692 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
1693 {
1694 if (fmt[i] == 'e')
1695 XEXP (x, i) = replace_with_subreg (XEXP (x, i), reg, new_reg);
1696 else if (fmt[i] == 'E')
1697 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
1698 XVECEXP (x, i, j) = replace_with_subreg (XVECEXP (x, i, j),
1699 reg, new_reg);
1700 }
1701
1702 return x;
1703 }
1704
1705 /* Replace REG in INSN with a V2DI subreg of NEW_REG. */
1706
1707 void
1708 dimode_scalar_chain::replace_with_subreg_in_insn (rtx_insn *insn,
1709 rtx reg, rtx new_reg)
1710 {
1711 replace_with_subreg (single_set (insn), reg, new_reg);
1712 }
1713
1714 /* Insert generated conversion instruction sequence INSNS
1715 after instruction AFTER. A new BB may be required in case
1716 the instruction has an EH region attached. */
1717
1718 void
1719 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
1720 {
1721 if (!control_flow_insn_p (after))
1722 {
1723 emit_insn_after (insns, after);
1724 return;
1725 }
1726
1727 basic_block bb = BLOCK_FOR_INSN (after);
1728 edge e = find_fallthru_edge (bb->succs);
1729 gcc_assert (e);
1730
1731 basic_block new_bb = split_edge (e);
1732 emit_insn_after (insns, BB_HEAD (new_bb));
1733 }
1734
1735 /* Make vector copies for all definitions of register REGNO
1736 and replace its uses in a chain. */
1737
1738 void
1739 dimode_scalar_chain::make_vector_copies (unsigned regno)
1740 {
1741 rtx reg = regno_reg_rtx[regno];
1742 rtx vreg = gen_reg_rtx (DImode);
1743 bool count_reg = false;
1744 df_ref ref;
1745
1746 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1747 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1748 {
1749 df_ref use;
1750
1751 /* Detect the count register of a shift instruction. */
1752 for (use = DF_REG_USE_CHAIN (regno); use; use = DF_REF_NEXT_REG (use))
1753 if (bitmap_bit_p (insns, DF_REF_INSN_UID (use)))
1754 {
1755 rtx_insn *insn = DF_REF_INSN (use);
1756 rtx def_set = single_set (insn);
1757
1758 gcc_assert (def_set);
1759
1760 rtx src = SET_SRC (def_set);
1761
1762 if ((GET_CODE (src) == ASHIFT
1763 || GET_CODE (src) == ASHIFTRT
1764 || GET_CODE (src) == LSHIFTRT)
1765 && !CONST_INT_P (XEXP (src, 1))
1766 && reg_or_subregno (XEXP (src, 1)) == regno)
1767 count_reg = true;
1768 }
1769
1770 start_sequence ();
1771 if (count_reg)
1772 {
1773 rtx qreg = gen_lowpart (QImode, reg);
1774 rtx tmp = gen_reg_rtx (SImode);
1775
1776 if (TARGET_ZERO_EXTEND_WITH_AND
1777 && optimize_function_for_speed_p (cfun))
1778 {
1779 emit_move_insn (tmp, const0_rtx);
1780 emit_insn (gen_movstrictqi
1781 (gen_lowpart (QImode, tmp), qreg));
1782 }
1783 else
1784 emit_insn (gen_rtx_SET
1785 (tmp, gen_rtx_ZERO_EXTEND (SImode, qreg)));
1786
1787 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1788 {
1789 rtx slot = assign_386_stack_local (SImode, SLOT_STV_TEMP);
1790 emit_move_insn (slot, tmp);
1791 tmp = copy_rtx (slot);
1792 }
1793
1794 emit_insn (gen_zero_extendsidi2 (vreg, tmp));
1795 }
1796 else if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
1797 {
1798 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1799 emit_move_insn (adjust_address (tmp, SImode, 0),
1800 gen_rtx_SUBREG (SImode, reg, 0));
1801 emit_move_insn (adjust_address (tmp, SImode, 4),
1802 gen_rtx_SUBREG (SImode, reg, 4));
1803 emit_move_insn (vreg, tmp);
1804 }
1805 else if (TARGET_SSE4_1)
1806 {
1807 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1808 CONST0_RTX (V4SImode),
1809 gen_rtx_SUBREG (SImode, reg, 0)));
1810 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
1811 gen_rtx_SUBREG (V4SImode, vreg, 0),
1812 gen_rtx_SUBREG (SImode, reg, 4),
1813 GEN_INT (2)));
1814 }
1815 else
1816 {
1817 rtx tmp = gen_reg_rtx (DImode);
1818 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
1819 CONST0_RTX (V4SImode),
1820 gen_rtx_SUBREG (SImode, reg, 0)));
1821 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
1822 CONST0_RTX (V4SImode),
1823 gen_rtx_SUBREG (SImode, reg, 4)));
1824 emit_insn (gen_vec_interleave_lowv4si
1825 (gen_rtx_SUBREG (V4SImode, vreg, 0),
1826 gen_rtx_SUBREG (V4SImode, vreg, 0),
1827 gen_rtx_SUBREG (V4SImode, tmp, 0)));
1828 }
1829 rtx_insn *seq = get_insns ();
1830 end_sequence ();
1831 rtx_insn *insn = DF_REF_INSN (ref);
1832 emit_conversion_insns (seq, insn);
1833
1834 if (dump_file)
1835 fprintf (dump_file,
1836 " Copied r%d to a vector register r%d for insn %d\n",
1837 regno, REGNO (vreg), INSN_UID (insn));
1838 }
1839
1840 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1841 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1842 {
1843 rtx_insn *insn = DF_REF_INSN (ref);
1844 if (count_reg)
1845 {
1846 rtx def_set = single_set (insn);
1847 gcc_assert (def_set);
1848
1849 rtx src = SET_SRC (def_set);
1850
1851 if ((GET_CODE (src) == ASHIFT
1852 || GET_CODE (src) == ASHIFTRT
1853 || GET_CODE (src) == LSHIFTRT)
1854 && !CONST_INT_P (XEXP (src, 1))
1855 && reg_or_subregno (XEXP (src, 1)) == regno)
1856 XEXP (src, 1) = vreg;
1857 }
1858 else
1859 replace_with_subreg_in_insn (insn, reg, vreg);
1860
1861 if (dump_file)
1862 fprintf (dump_file, " Replaced r%d with r%d in insn %d\n",
1863 regno, REGNO (vreg), INSN_UID (insn));
1864 }
1865 }
1866
1867 /* Convert all definitions of register REGNO
1868 and fix its uses. Scalar copies may be created
1869 in case the register is used in a non-convertible insn. */
1870
1871 void
1872 dimode_scalar_chain::convert_reg (unsigned regno)
1873 {
1874 bool scalar_copy = bitmap_bit_p (defs_conv, regno);
1875 rtx reg = regno_reg_rtx[regno];
1876 rtx scopy = NULL_RTX;
1877 df_ref ref;
1878 bitmap conv;
1879
1880 conv = BITMAP_ALLOC (NULL);
1881 bitmap_copy (conv, insns);
1882
1883 if (scalar_copy)
1884 scopy = gen_reg_rtx (DImode);
1885
1886 for (ref = DF_REG_DEF_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1887 {
1888 rtx_insn *insn = DF_REF_INSN (ref);
1889 rtx def_set = single_set (insn);
1890 rtx src = SET_SRC (def_set);
1891 rtx reg = DF_REF_REG (ref);
1892
1893 if (!MEM_P (src))
1894 {
1895 replace_with_subreg_in_insn (insn, reg, reg);
1896 bitmap_clear_bit (conv, INSN_UID (insn));
1897 }
1898
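  /* The register is also needed as a scalar value outside the chain, so
     after each definition emit a DImode copy of the (now vector) value;
     the non-convertible uses are redirected to SCOPY further below.  */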
1899 if (scalar_copy)
1900 {
1901 start_sequence ();
1902 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1903 {
1904 rtx tmp = assign_386_stack_local (DImode, SLOT_STV_TEMP);
1905 emit_move_insn (tmp, reg);
1906 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1907 adjust_address (tmp, SImode, 0));
1908 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1909 adjust_address (tmp, SImode, 4));
1910 }
1911 else if (TARGET_SSE4_1)
1912 {
1913 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
1914 emit_insn
1915 (gen_rtx_SET
1916 (gen_rtx_SUBREG (SImode, scopy, 0),
1917 gen_rtx_VEC_SELECT (SImode,
1918 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1919
1920 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1921 emit_insn
1922 (gen_rtx_SET
1923 (gen_rtx_SUBREG (SImode, scopy, 4),
1924 gen_rtx_VEC_SELECT (SImode,
1925 gen_rtx_SUBREG (V4SImode, reg, 0), tmp)));
1926 }
1927 else
1928 {
1929 rtx vcopy = gen_reg_rtx (V2DImode);
1930 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, reg, 0));
1931 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 0),
1932 gen_rtx_SUBREG (SImode, vcopy, 0));
1933 emit_move_insn (vcopy,
1934 gen_rtx_LSHIFTRT (V2DImode, vcopy, GEN_INT (32)));
1935 emit_move_insn (gen_rtx_SUBREG (SImode, scopy, 4),
1936 gen_rtx_SUBREG (SImode, vcopy, 0));
1937 }
1938 rtx_insn *seq = get_insns ();
1939 end_sequence ();
1940 emit_conversion_insns (seq, insn);
1941
1942 if (dump_file)
1943 fprintf (dump_file,
1944 " Copied r%d to a scalar register r%d for insn %d\n",
1945 regno, REGNO (scopy), INSN_UID (insn));
1946 }
1947 }
1948
1949 for (ref = DF_REG_USE_CHAIN (regno); ref; ref = DF_REF_NEXT_REG (ref))
1950 if (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)))
1951 {
1952 if (bitmap_bit_p (conv, DF_REF_INSN_UID (ref)))
1953 {
1954 rtx_insn *insn = DF_REF_INSN (ref);
1955
1956 rtx def_set = single_set (insn);
1957 gcc_assert (def_set);
1958
1959 rtx src = SET_SRC (def_set);
1960 rtx dst = SET_DEST (def_set);
1961
1962 if ((GET_CODE (src) == ASHIFT
1963 || GET_CODE (src) == ASHIFTRT
1964 || GET_CODE (src) == LSHIFTRT)
1965 && !CONST_INT_P (XEXP (src, 1))
1966 && reg_or_subregno (XEXP (src, 1)) == regno)
1967 {
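		  /* The value is used as a shift count.  Zero-extend just
		     its low byte into a fresh vector register so the
		     vector shift only sees the count.  */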
1968 rtx tmp2 = gen_reg_rtx (V2DImode);
1969
1970 start_sequence ();
1971
1972 if (TARGET_SSE4_1)
1973 emit_insn (gen_sse4_1_zero_extendv2qiv2di2
1974 (tmp2, gen_rtx_SUBREG (V16QImode, reg, 0)));
1975 else
1976 {
1977 rtx vec_cst
1978 = gen_rtx_CONST_VECTOR (V2DImode,
1979 gen_rtvec (2, GEN_INT (0xff),
1980 const0_rtx));
1981 vec_cst
1982 = validize_mem (force_const_mem (V2DImode, vec_cst));
1983
1984 emit_insn (gen_rtx_SET
1985 (tmp2,
1986 gen_rtx_AND (V2DImode,
1987 gen_rtx_SUBREG (V2DImode, reg, 0),
1988 vec_cst)));
1989 }
1990 rtx_insn *seq = get_insns ();
1991 end_sequence ();
1992
1993 emit_insn_before (seq, insn);
1994
1995 XEXP (src, 1) = gen_rtx_SUBREG (DImode, tmp2, 0);
1996 }
1997 else if (!MEM_P (dst) || !REG_P (src))
1998 replace_with_subreg_in_insn (insn, reg, reg);
1999
2000 bitmap_clear_bit (conv, INSN_UID (insn));
2001 }
2002 }
2003 /* Skip debug insns and uninitialized uses. */
2004 else if (DF_REF_CHAIN (ref)
2005 && NONDEBUG_INSN_P (DF_REF_INSN (ref)))
2006 {
2007 gcc_assert (scopy);
2008 replace_rtx (DF_REF_INSN (ref), reg, scopy);
2009 df_insn_rescan (DF_REF_INSN (ref));
2010 }
2011
2012 BITMAP_FREE (conv);
2013 }
2014
2015 /* Convert operand OP in INSN. Only memory operands
2016 and uninitialized registers need to be handled here;
2017 all other register uses are converted during
2018 register conversion. */
2019
2020 void
2021 dimode_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
2022 {
2023 *op = copy_rtx_if_shared (*op);
2024
2025 if (GET_CODE (*op) == NOT)
2026 {
2027 convert_op (&XEXP (*op, 0), insn);
2028 PUT_MODE (*op, V2DImode);
2029 }
2030 else if (MEM_P (*op))
2031 {
2032 rtx tmp = gen_reg_rtx (DImode);
2033
2034 emit_insn_before (gen_move_insn (tmp, *op), insn);
2035 *op = gen_rtx_SUBREG (V2DImode, tmp, 0);
2036
2037 if (dump_file)
2038 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
2039 INSN_UID (insn), REGNO (tmp));
2040 }
2041 else if (REG_P (*op))
2042 {
2043 /* We may not have converted the register use in case
2044 this register has no definition. Otherwise it
2045 should have been converted in convert_reg. */
2046 df_ref ref;
2047 FOR_EACH_INSN_USE (ref, insn)
2048 if (DF_REF_REGNO (ref) == REGNO (*op))
2049 {
2050 gcc_assert (!DF_REF_CHAIN (ref));
2051 break;
2052 }
2053 *op = gen_rtx_SUBREG (V2DImode, *op, 0);
2054 }
2055 else if (CONST_INT_P (*op))
2056 {
2057 rtx vec_cst;
2058 rtx tmp = gen_rtx_SUBREG (V2DImode, gen_reg_rtx (DImode), 0);
2059
2060 /* Prefer all ones vector in case of -1. */
2061 if (constm1_operand (*op, GET_MODE (*op)))
2062 vec_cst = CONSTM1_RTX (V2DImode);
2063 else
2064 vec_cst = gen_rtx_CONST_VECTOR (V2DImode,
2065 gen_rtvec (2, *op, const0_rtx));
2066
2067 if (!standard_sse_constant_p (vec_cst, V2DImode))
2068 {
2069 start_sequence ();
2070 vec_cst = validize_mem (force_const_mem (V2DImode, vec_cst));
2071 rtx_insn *seq = get_insns ();
2072 end_sequence ();
2073 emit_insn_before (seq, insn);
2074 }
2075
2076 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
2077 *op = tmp;
2078 }
2079 else
2080 {
2081 gcc_assert (SUBREG_P (*op));
2082 gcc_assert (GET_MODE (*op) == V2DImode);
2083 }
2084 }
2085
2086 /* Convert INSN to vector mode. */
2087
2088 void
2089 dimode_scalar_chain::convert_insn (rtx_insn *insn)
2090 {
2091 rtx def_set = single_set (insn);
2092 rtx src = SET_SRC (def_set);
2093 rtx dst = SET_DEST (def_set);
2094 rtx subreg;
2095
2096 if (MEM_P (dst) && !REG_P (src))
2097 {
2098 /* The converted instruction cannot store the computed value
2099 directly into memory, so a temporary register is required. */
2100 rtx tmp = gen_reg_rtx (DImode);
2101 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
2102 dst = gen_rtx_SUBREG (V2DImode, tmp, 0);
2103 }
2104
2105 switch (GET_CODE (src))
2106 {
2107 case ASHIFT:
2108 case ASHIFTRT:
2109 case LSHIFTRT:
2110 convert_op (&XEXP (src, 0), insn);
2111 PUT_MODE (src, V2DImode);
2112 break;
2113
2114 case PLUS:
2115 case MINUS:
2116 case IOR:
2117 case XOR:
2118 case AND:
2119 convert_op (&XEXP (src, 0), insn);
2120 convert_op (&XEXP (src, 1), insn);
2121 PUT_MODE (src, V2DImode);
2122 break;
2123
2124 case NEG:
2125 src = XEXP (src, 0);
2126 convert_op (&src, insn);
2127 subreg = gen_reg_rtx (V2DImode);
2128 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (V2DImode)), insn);
2129 src = gen_rtx_MINUS (V2DImode, subreg, src);
2130 break;
2131
2132 case NOT:
2133 src = XEXP (src, 0);
2134 convert_op (&src, insn);
2135 subreg = gen_reg_rtx (V2DImode);
2136 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (V2DImode)), insn);
2137 src = gen_rtx_XOR (V2DImode, src, subreg);
2138 break;
2139
2140 case MEM:
2141 if (!REG_P (dst))
2142 convert_op (&src, insn);
2143 break;
2144
2145 case REG:
2146 if (!MEM_P (dst))
2147 convert_op (&src, insn);
2148 break;
2149
2150 case SUBREG:
2151 gcc_assert (GET_MODE (src) == V2DImode);
2152 break;
2153
2154 case COMPARE:
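      /* Only compares of a DImode value against zero reach here.  The
	 value is duplicated into both vector halves with an interleave
	 and the compare is replaced by a PTEST-style unspec that sets
	 FLAGS_REG.  */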
2155 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
2156
2157 gcc_assert ((REG_P (src) && GET_MODE (src) == DImode)
2158 || (SUBREG_P (src) && GET_MODE (src) == V2DImode));
2159
2160 if (REG_P (src))
2161 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
2162 else
2163 subreg = copy_rtx_if_shared (src);
2164 emit_insn_before (gen_vec_interleave_lowv2di (copy_rtx_if_shared (subreg),
2165 copy_rtx_if_shared (subreg),
2166 copy_rtx_if_shared (subreg)),
2167 insn);
2168 dst = gen_rtx_REG (CCmode, FLAGS_REG);
2169 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (src),
2170 copy_rtx_if_shared (src)),
2171 UNSPEC_PTEST);
2172 break;
2173
2174 case CONST_INT:
2175 convert_op (&src, insn);
2176 break;
2177
2178 default:
2179 gcc_unreachable ();
2180 }
2181
2182 SET_SRC (def_set) = src;
2183 SET_DEST (def_set) = dst;
2184
2185 /* Drop possible dead definitions. */
2186 PATTERN (insn) = def_set;
2187
2188 INSN_CODE (insn) = -1;
2189 recog_memoized (insn);
2190 df_insn_rescan (insn);
2191 }
2192
2193 /* Fix uses of converted REG in debug insns. */
2194
2195 void
2196 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
2197 {
2198 if (!flag_var_tracking)
2199 return;
2200
2201 df_ref ref, next;
2202 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
2203 {
2204 rtx_insn *insn = DF_REF_INSN (ref);
2205 /* Make sure the next ref is for a different instruction,
2206 so that we're not affected by the rescan. */
2207 next = DF_REF_NEXT_REG (ref);
2208 while (next && DF_REF_INSN (next) == insn)
2209 next = DF_REF_NEXT_REG (next);
2210
2211 if (DEBUG_INSN_P (insn))
2212 {
2213 /* It may be a debug insn with a TImode variable in
2214 a register. */
2215 bool changed = false;
2216 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
2217 {
2218 rtx *loc = DF_REF_LOC (ref);
2219 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
2220 {
2221 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
2222 changed = true;
2223 }
2224 }
2225 if (changed)
2226 df_insn_rescan (insn);
2227 }
2228 }
2229 }
2230
2231 /* Convert INSN from TImode to V1TImode. */
2232
2233 void
2234 timode_scalar_chain::convert_insn (rtx_insn *insn)
2235 {
2236 rtx def_set = single_set (insn);
2237 rtx src = SET_SRC (def_set);
2238 rtx dst = SET_DEST (def_set);
2239
2240 switch (GET_CODE (dst))
2241 {
2242 case REG:
2243 {
2244 rtx tmp = find_reg_equal_equiv_note (insn);
2245 if (tmp)
2246 PUT_MODE (XEXP (tmp, 0), V1TImode);
2247 PUT_MODE (dst, V1TImode);
2248 fix_debug_reg_uses (dst);
2249 }
2250 break;
2251 case MEM:
2252 PUT_MODE (dst, V1TImode);
2253 break;
2254
2255 default:
2256 gcc_unreachable ();
2257 }
2258
2259 switch (GET_CODE (src))
2260 {
2261 case REG:
2262 PUT_MODE (src, V1TImode);
2263 /* Call fix_debug_reg_uses only if SRC is never defined. */
2264 if (!DF_REG_DEF_CHAIN (REGNO (src)))
2265 fix_debug_reg_uses (src);
2266 break;
2267
2268 case MEM:
2269 PUT_MODE (src, V1TImode);
2270 break;
2271
2272 case CONST_WIDE_INT:
2273 if (NONDEBUG_INSN_P (insn))
2274 {
2275 /* Since there are no instructions to store a 128-bit constant,
2276 temporary register usage is required. */
2277 rtx tmp = gen_reg_rtx (V1TImode);
2278 start_sequence ();
2279 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
2280 src = validize_mem (force_const_mem (V1TImode, src));
2281 rtx_insn *seq = get_insns ();
2282 end_sequence ();
2283 if (seq)
2284 emit_insn_before (seq, insn);
2285 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2286 dst = tmp;
2287 }
2288 break;
2289
2290 case CONST_INT:
2291 switch (standard_sse_constant_p (src, TImode))
2292 {
2293 case 1:
2294 src = CONST0_RTX (GET_MODE (dst));
2295 break;
2296 case 2:
2297 src = CONSTM1_RTX (GET_MODE (dst));
2298 break;
2299 default:
2300 gcc_unreachable ();
2301 }
2302 if (NONDEBUG_INSN_P (insn))
2303 {
2304 rtx tmp = gen_reg_rtx (V1TImode);
2305 /* Since there are no instructions to store a standard SSE
2306 constant, temporary register usage is required. */
2307 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
2308 dst = tmp;
2309 }
2310 break;
2311
2312 default:
2313 gcc_unreachable ();
2314 }
2315
2316 SET_SRC (def_set) = src;
2317 SET_DEST (def_set) = dst;
2318
2319 /* Drop possible dead definitions. */
2320 PATTERN (insn) = def_set;
2321
2322 INSN_CODE (insn) = -1;
2323 recog_memoized (insn);
2324 df_insn_rescan (insn);
2325 }
2326
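/* Convert the registers of the chain: registers defined inside the
   chain are converted in place, while registers that are only needed
   as vector values get explicit vector copies of their definitions.  */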
2327 void
2328 dimode_scalar_chain::convert_registers ()
2329 {
2330 bitmap_iterator bi;
2331 unsigned id;
2332
2333 EXECUTE_IF_SET_IN_BITMAP (defs, 0, id, bi)
2334 convert_reg (id);
2335
2336 EXECUTE_IF_AND_COMPL_IN_BITMAP (defs_conv, defs, 0, id, bi)
2337 make_vector_copies (id);
2338 }
2339
2340 /* Convert the whole chain, creating the required register
2341 conversions and copies. */
2342
2343 int
2344 scalar_chain::convert ()
2345 {
2346 bitmap_iterator bi;
2347 unsigned id;
2348 int converted_insns = 0;
2349
2350 if (!dbg_cnt (stv_conversion))
2351 return 0;
2352
2353 if (dump_file)
2354 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2355
2356 convert_registers ();
2357
2358 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2359 {
2360 convert_insn (DF_INSN_UID_GET (id)->insn);
2361 converted_insns++;
2362 }
2363
2364 return converted_insns;
2365 }
2366
2367 /* Main STV pass function. Find and convert scalar
2368 instructions into vector mode when profitable. */
2369
2370 static unsigned int
2371 convert_scalars_to_vector ()
2372 {
2373 basic_block bb;
2374 bitmap candidates;
2375 int converted_insns = 0;
2376
2377 bitmap_obstack_initialize (NULL);
2378 candidates = BITMAP_ALLOC (NULL);
2379
2380 calculate_dominance_info (CDI_DOMINATORS);
2381 df_set_flags (DF_DEFER_INSN_RESCAN);
2382 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2383 df_md_add_problem ();
2384 df_analyze ();
2385
2386 /* Find all instructions we want to convert into vector mode. */
2387 if (dump_file)
2388 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2389
2390 FOR_EACH_BB_FN (bb, cfun)
2391 {
2392 rtx_insn *insn;
2393 FOR_BB_INSNS (bb, insn)
2394 if (scalar_to_vector_candidate_p (insn))
2395 {
2396 if (dump_file)
2397 fprintf (dump_file, " insn %d is marked as a candidate\n",
2398 INSN_UID (insn));
2399
2400 bitmap_set_bit (candidates, INSN_UID (insn));
2401 }
2402 }
2403
2404 remove_non_convertible_regs (candidates);
2405
2406 if (bitmap_empty_p (candidates))
2407 if (dump_file)
2408 fprintf (dump_file, "There are no candidates for optimization.\n");
2409
2410 while (!bitmap_empty_p (candidates))
2411 {
2412 unsigned uid = bitmap_first_set_bit (candidates);
2413 scalar_chain *chain;
2414
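      /* On 64-bit targets the STV candidates are TImode chains,
	 otherwise DImode chains.  */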
2415 if (TARGET_64BIT)
2416 chain = new timode_scalar_chain;
2417 else
2418 chain = new dimode_scalar_chain;
2419
2420 /* Find the instruction chain we want to convert to vector mode.
2421 Check all uses and definitions to estimate all required
2422 conversions. */
2423 chain->build (candidates, uid);
2424
2425 if (chain->compute_convert_gain () > 0)
2426 converted_insns += chain->convert ();
2427 else
2428 if (dump_file)
2429 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2430 chain->chain_id);
2431
2432 delete chain;
2433 }
2434
2435 if (dump_file)
2436 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2437
2438 BITMAP_FREE (candidates);
2439 bitmap_obstack_release (NULL);
2440 df_process_deferred_rescans ();
2441
2442 /* Conversion means we may have 128-bit register spills/fills,
2443 which require an aligned stack. */
2444 if (converted_insns)
2445 {
2446 if (crtl->stack_alignment_needed < 128)
2447 crtl->stack_alignment_needed = 128;
2448 if (crtl->stack_alignment_estimated < 128)
2449 crtl->stack_alignment_estimated = 128;
2450 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2451 if (TARGET_64BIT)
2452 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2453 parm; parm = DECL_CHAIN (parm))
2454 {
2455 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2456 continue;
2457 if (DECL_RTL_SET_P (parm)
2458 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2459 {
2460 rtx r = DECL_RTL (parm);
2461 if (REG_P (r))
2462 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2463 }
2464 if (DECL_INCOMING_RTL (parm)
2465 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2466 {
2467 rtx r = DECL_INCOMING_RTL (parm);
2468 if (REG_P (r))
2469 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2470 }
2471 }
2472 }
2473
2474 return 0;
2475 }
2476
2477 namespace {
2478
2479 const pass_data pass_data_insert_vzeroupper =
2480 {
2481 RTL_PASS, /* type */
2482 "vzeroupper", /* name */
2483 OPTGROUP_NONE, /* optinfo_flags */
2484 TV_MACH_DEP, /* tv_id */
2485 0, /* properties_required */
2486 0, /* properties_provided */
2487 0, /* properties_destroyed */
2488 0, /* todo_flags_start */
2489 TODO_df_finish, /* todo_flags_finish */
2490 };
2491
2492 class pass_insert_vzeroupper : public rtl_opt_pass
2493 {
2494 public:
2495 pass_insert_vzeroupper(gcc::context *ctxt)
2496 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2497 {}
2498
2499 /* opt_pass methods: */
2500 virtual bool gate (function *)
2501 {
2502 return TARGET_AVX
2503 && TARGET_VZEROUPPER && flag_expensive_optimizations
2504 && !optimize_size;
2505 }
2506
2507 virtual unsigned int execute (function *)
2508 {
2509 return rest_of_handle_insert_vzeroupper ();
2510 }
2511
2512 }; // class pass_insert_vzeroupper
2513
2514 const pass_data pass_data_stv =
2515 {
2516 RTL_PASS, /* type */
2517 "stv", /* name */
2518 OPTGROUP_NONE, /* optinfo_flags */
2519 TV_MACH_DEP, /* tv_id */
2520 0, /* properties_required */
2521 0, /* properties_provided */
2522 0, /* properties_destroyed */
2523 0, /* todo_flags_start */
2524 TODO_df_finish, /* todo_flags_finish */
2525 };
2526
2527 class pass_stv : public rtl_opt_pass
2528 {
2529 public:
2530 pass_stv (gcc::context *ctxt)
2531 : rtl_opt_pass (pass_data_stv, ctxt),
2532 timode_p (false)
2533 {}
2534
2535 /* opt_pass methods: */
2536 virtual bool gate (function *)
2537 {
2538 return (timode_p == !!TARGET_64BIT
2539 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2540 }
2541
2542 virtual unsigned int execute (function *)
2543 {
2544 return convert_scalars_to_vector ();
2545 }
2546
2547 opt_pass *clone ()
2548 {
2549 return new pass_stv (m_ctxt);
2550 }
2551
2552 void set_pass_param (unsigned int n, bool param)
2553 {
2554 gcc_assert (n == 0);
2555 timode_p = param;
2556 }
2557
2558 private:
2559 bool timode_p;
2560 }; // class pass_stv
2561
2562 } // anon namespace
2563
2564 rtl_opt_pass *
2565 make_pass_insert_vzeroupper (gcc::context *ctxt)
2566 {
2567 return new pass_insert_vzeroupper (ctxt);
2568 }
2569
2570 rtl_opt_pass *
2571 make_pass_stv (gcc::context *ctxt)
2572 {
2573 return new pass_stv (ctxt);
2574 }
2575
2576 /* Inserting ENDBRANCH instructions. */
2577
2578 static unsigned int
2579 rest_of_insert_endbranch (void)
2580 {
2581 timevar_push (TV_MACH_DEP);
2582
2583 rtx cet_eb;
2584 rtx_insn *insn;
2585 basic_block bb;
2586
2587 /* Currently emit an EB if the function needs control-flow tracking,
2588 i.e. 'nocf_check' is absent among the function attributes. Later an
2589 optimization will be introduced to analyze whether the address of a
2590 static function is taken. A static function whose address is not
2591 taken will get a nocf_check attribute, reducing the number of EBs. */
2592
2593 if (!lookup_attribute ("nocf_check",
2594 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2595 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
2596 {
2597 cet_eb = gen_nop_endbr ();
2598
2599 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2600 insn = BB_HEAD (bb);
2601 emit_insn_before (cet_eb, insn);
2602 }
2603
2604 bb = 0;
2605 FOR_EACH_BB_FN (bb, cfun)
2606 {
2607 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2608 insn = NEXT_INSN (insn))
2609 {
2610 if (INSN_P (insn) && GET_CODE (insn) == CALL_INSN)
2611 {
2612 rtx_insn *next_insn = insn;
2613
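	  /* Skip over any debug insns, notes and barriers immediately
	     following the call so that the ENDBR (if needed) is emitted
	     after them.  */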
2614 while ((next_insn != BB_END (bb))
2615 && (DEBUG_INSN_P (NEXT_INSN (next_insn))
2616 || NOTE_P (NEXT_INSN (next_insn))
2617 || BARRIER_P (NEXT_INSN (next_insn))))
2618 next_insn = NEXT_INSN (next_insn);
2619
2620 /* Generate ENDBRANCH after a CALL that may return more than
2621 once (setjmp-like functions). */
2622 if (find_reg_note (insn, REG_SETJMP, NULL) != NULL)
2623 {
2624 cet_eb = gen_nop_endbr ();
2625 emit_insn_after (cet_eb, next_insn);
2626 }
2627 continue;
2628 }
2629
2630 if (INSN_P (insn) && JUMP_P (insn) && flag_cet_switch)
2631 {
2632 rtx target = JUMP_LABEL (insn);
2633 if (target == NULL_RTX || ANY_RETURN_P (target))
2634 continue;
2635
2636 /* Check whether the jump is a switch-table jump. */
2637 rtx_insn *label = as_a<rtx_insn *> (target);
2638 rtx_insn *table = next_insn (label);
2639 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2640 continue;
2641
2642 /* For the indirect jump, find all the places it jumps to and insert
2643 an ENDBRANCH there. This is done under a special flag that
2644 controls ENDBRANCH generation for switch stmts. */
2645 edge_iterator ei;
2646 edge e;
2647 basic_block dest_blk;
2648
2649 FOR_EACH_EDGE (e, ei, bb->succs)
2650 {
2651 rtx_insn *insn;
2652
2653 dest_blk = e->dest;
2654 insn = BB_HEAD (dest_blk);
2655 gcc_assert (LABEL_P (insn));
2656 cet_eb = gen_nop_endbr ();
2657 emit_insn_after (cet_eb, insn);
2658 }
2659 continue;
2660 }
2661
2662 if ((LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2663 || (NOTE_P (insn)
2664 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
2665 /* TODO. Check /s bit also. */
2666 {
2667 cet_eb = gen_nop_endbr ();
2668 emit_insn_after (cet_eb, insn);
2669 continue;
2670 }
2671 }
2672 }
2673
2674 timevar_pop (TV_MACH_DEP);
2675 return 0;
2676 }
2677
2678 namespace {
2679
2680 const pass_data pass_data_insert_endbranch =
2681 {
2682 RTL_PASS, /* type. */
2683 "cet", /* name. */
2684 OPTGROUP_NONE, /* optinfo_flags. */
2685 TV_MACH_DEP, /* tv_id. */
2686 0, /* properties_required. */
2687 0, /* properties_provided. */
2688 0, /* properties_destroyed. */
2689 0, /* todo_flags_start. */
2690 0, /* todo_flags_finish. */
2691 };
2692
2693 class pass_insert_endbranch : public rtl_opt_pass
2694 {
2695 public:
2696 pass_insert_endbranch (gcc::context *ctxt)
2697 : rtl_opt_pass (pass_data_insert_endbranch, ctxt)
2698 {}
2699
2700 /* opt_pass methods: */
2701 virtual bool gate (function *)
2702 {
2703 return ((flag_cf_protection & CF_BRANCH) && TARGET_IBT);
2704 }
2705
2706 virtual unsigned int execute (function *)
2707 {
2708 return rest_of_insert_endbranch ();
2709 }
2710
2711 }; // class pass_insert_endbranch
2712
2713 } // anon namespace
2714
2715 rtl_opt_pass *
2716 make_pass_insert_endbranch (gcc::context *ctxt)
2717 {
2718 return new pass_insert_endbranch (ctxt);
2719 }
2720
2721 /* Return true if a red-zone is in use. */
2722
2723 bool
2724 ix86_using_red_zone (void)
2725 {
2726 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2727 }
2728 \f
2729 /* Return a string that documents the current -m options. The caller is
2730 responsible for freeing the string. */
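/* For example, for -m64 -march=skylake -mfpmath=sse this would produce
   something along the lines of
     "-m64 -march=skylake -mtune=generic -mfpmath=sse"
   (illustrative only; the exact string depends on the ISA and flag
   masks that are actually set).  */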
2731
2732 static char *
2733 ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
2734 int flags, int flags2,
2735 const char *arch, const char *tune,
2736 enum fpmath_unit fpmath, bool add_nl_p)
2737 {
2738 struct ix86_target_opts
2739 {
2740 const char *option; /* option string */
2741 HOST_WIDE_INT mask; /* isa mask options */
2742 };
2743
2744 /* This table is ordered so that options like -msse4.2 that imply other
2745 ISAs come first. The target string will be displayed in the same order. */
2746 static struct ix86_target_opts isa2_opts[] =
2747 {
2748 { "-mmpx", OPTION_MASK_ISA_MPX },
2749 { "-mavx512vbmi2", OPTION_MASK_ISA_AVX512VBMI2 },
2750 { "-mrdpid", OPTION_MASK_ISA_RDPID },
2751 { "-msgx", OPTION_MASK_ISA_SGX },
2752 { "-mavx5124vnniw", OPTION_MASK_ISA_AVX5124VNNIW },
2753 { "-mavx5124fmaps", OPTION_MASK_ISA_AVX5124FMAPS },
2754 { "-mavx512vpopcntdq", OPTION_MASK_ISA_AVX512VPOPCNTDQ },
2755 { "-mibt", OPTION_MASK_ISA_IBT },
2756 { "-mshstk", OPTION_MASK_ISA_SHSTK }
2757 };
2758 static struct ix86_target_opts isa_opts[] =
2759 {
2760 { "-mgfni", OPTION_MASK_ISA_GFNI },
2761 { "-mavx512vbmi", OPTION_MASK_ISA_AVX512VBMI },
2762 { "-mavx512ifma", OPTION_MASK_ISA_AVX512IFMA },
2763 { "-mavx512vl", OPTION_MASK_ISA_AVX512VL },
2764 { "-mavx512bw", OPTION_MASK_ISA_AVX512BW },
2765 { "-mavx512dq", OPTION_MASK_ISA_AVX512DQ },
2766 { "-mavx512er", OPTION_MASK_ISA_AVX512ER },
2767 { "-mavx512pf", OPTION_MASK_ISA_AVX512PF },
2768 { "-mavx512cd", OPTION_MASK_ISA_AVX512CD },
2769 { "-mavx512f", OPTION_MASK_ISA_AVX512F },
2770 { "-mavx2", OPTION_MASK_ISA_AVX2 },
2771 { "-mfma", OPTION_MASK_ISA_FMA },
2772 { "-mxop", OPTION_MASK_ISA_XOP },
2773 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2774 { "-mf16c", OPTION_MASK_ISA_F16C },
2775 { "-mavx", OPTION_MASK_ISA_AVX },
2776 /* { "-msse4" OPTION_MASK_ISA_SSE4 }, */
2777 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2778 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2779 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2780 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2781 { "-msse3", OPTION_MASK_ISA_SSE3 },
2782 { "-maes", OPTION_MASK_ISA_AES },
2783 { "-msha", OPTION_MASK_ISA_SHA },
2784 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2785 { "-msse2", OPTION_MASK_ISA_SSE2 },
2786 { "-msse", OPTION_MASK_ISA_SSE },
2787 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2788 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2789 { "-mmmx", OPTION_MASK_ISA_MMX },
2790 { "-mrtm", OPTION_MASK_ISA_RTM },
2791 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2792 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2793 { "-madx", OPTION_MASK_ISA_ADX },
2794 { "-mprefetchwt1", OPTION_MASK_ISA_PREFETCHWT1 },
2795 { "-mclflushopt", OPTION_MASK_ISA_CLFLUSHOPT },
2796 { "-mxsaves", OPTION_MASK_ISA_XSAVES },
2797 { "-mxsavec", OPTION_MASK_ISA_XSAVEC },
2798 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2799 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2800 { "-mabm", OPTION_MASK_ISA_ABM },
2801 { "-mbmi", OPTION_MASK_ISA_BMI },
2802 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2803 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2804 { "-mtbm", OPTION_MASK_ISA_TBM },
2805 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2806 { "-mcx16", OPTION_MASK_ISA_CX16 },
2807 { "-msahf", OPTION_MASK_ISA_SAHF },
2808 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2809 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2810 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2811 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2812 { "-mmwaitx", OPTION_MASK_ISA_MWAITX },
2813 { "-mclzero", OPTION_MASK_ISA_CLZERO },
2814 { "-mpku", OPTION_MASK_ISA_PKU },
2815 { "-mlwp", OPTION_MASK_ISA_LWP },
2816 { "-mhle", OPTION_MASK_ISA_HLE },
2817 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2818 { "-mclwb", OPTION_MASK_ISA_CLWB }
2819 };
2820
2821 /* Flag options. */
2822 static struct ix86_target_opts flag_opts[] =
2823 {
2824 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2825 { "-mlong-double-128", MASK_LONG_DOUBLE_128 },
2826 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2827 { "-m80387", MASK_80387 },
2828 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2829 { "-malign-double", MASK_ALIGN_DOUBLE },
2830 { "-mcld", MASK_CLD },
2831 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2832 { "-mieee-fp", MASK_IEEE_FP },
2833 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2834 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2835 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2836 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2837 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2838 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2839 { "-mno-red-zone", MASK_NO_RED_ZONE },
2840 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2841 { "-mrecip", MASK_RECIP },
2842 { "-mrtd", MASK_RTD },
2843 { "-msseregparm", MASK_SSEREGPARM },
2844 { "-mstack-arg-probe", MASK_STACK_PROBE },
2845 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2846 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2847 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2848 { "-mvzeroupper", MASK_VZEROUPPER },
2849 { "-mstv", MASK_STV },
2850 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
2851 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE },
2852 { "-mcall-ms2sysv-xlogues", MASK_CALL_MS2SYSV_XLOGUES }
2853 };
2854
2855 /* Additional flag options. */
2856 static struct ix86_target_opts flag2_opts[] =
2857 {
2858 { "-mgeneral-regs-only", OPTION_MASK_GENERAL_REGS_ONLY }
2859 };
2860
2861 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (isa2_opts)
2862 + ARRAY_SIZE (flag_opts) + ARRAY_SIZE (flag2_opts) + 6][2];
2863
2864 char isa_other[40];
2865 char isa2_other[40];
2866 char flags_other[40];
2867 char flags2_other[40];
2868 unsigned num = 0;
2869 unsigned i, j;
2870 char *ret;
2871 char *ptr;
2872 size_t len;
2873 size_t line_len;
2874 size_t sep_len;
2875 const char *abi;
2876
2877 memset (opts, '\0', sizeof (opts));
2878
2879 /* Add -march= option. */
2880 if (arch)
2881 {
2882 opts[num][0] = "-march=";
2883 opts[num++][1] = arch;
2884 }
2885
2886 /* Add -mtune= option. */
2887 if (tune)
2888 {
2889 opts[num][0] = "-mtune=";
2890 opts[num++][1] = tune;
2891 }
2892
2893 /* Add -m32/-m64/-mx32. */
2894 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2895 {
2896 if ((isa & OPTION_MASK_ABI_64) != 0)
2897 abi = "-m64";
2898 else
2899 abi = "-mx32";
2900 isa &= ~ (OPTION_MASK_ISA_64BIT
2901 | OPTION_MASK_ABI_64
2902 | OPTION_MASK_ABI_X32);
2903 }
2904 else
2905 abi = "-m32";
2906 opts[num++][0] = abi;
2907
2908 /* Pick out the options in isa2 options. */
2909 for (i = 0; i < ARRAY_SIZE (isa2_opts); i++)
2910 {
2911 if ((isa2 & isa2_opts[i].mask) != 0)
2912 {
2913 opts[num++][0] = isa2_opts[i].option;
2914 isa2 &= ~ isa2_opts[i].mask;
2915 }
2916 }
2917
2918 if (isa2 && add_nl_p)
2919 {
2920 opts[num++][0] = isa2_other;
2921 sprintf (isa2_other, "(other isa2: %#" HOST_WIDE_INT_PRINT "x)", isa2);
2922 }
2923
2924 /* Pick out the options in isa options. */
2925 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2926 {
2927 if ((isa & isa_opts[i].mask) != 0)
2928 {
2929 opts[num++][0] = isa_opts[i].option;
2930 isa &= ~ isa_opts[i].mask;
2931 }
2932 }
2933
2934 if (isa && add_nl_p)
2935 {
2936 opts[num++][0] = isa_other;
2937 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)", isa);
2938 }
2939
2940 /* Add flag options. */
2941 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2942 {
2943 if ((flags & flag_opts[i].mask) != 0)
2944 {
2945 opts[num++][0] = flag_opts[i].option;
2946 flags &= ~ flag_opts[i].mask;
2947 }
2948 }
2949
2950 if (flags && add_nl_p)
2951 {
2952 opts[num++][0] = flags_other;
2953 sprintf (flags_other, "(other flags: %#x)", flags);
2954 }
2955
2956 /* Add additional flag options. */
2957 for (i = 0; i < ARRAY_SIZE (flag2_opts); i++)
2958 {
2959 if ((flags2 & flag2_opts[i].mask) != 0)
2960 {
2961 opts[num++][0] = flag2_opts[i].option;
2962 flags2 &= ~ flag2_opts[i].mask;
2963 }
2964 }
2965
2966 if (flags2 && add_nl_p)
2967 {
2968 opts[num++][0] = flags2_other;
2969 sprintf (flags2_other, "(other flags2: %#x)", flags2);
2970 }
2971
2972 /* Add -fpmath= option. */
2973 if (fpmath)
2974 {
2975 opts[num][0] = "-mfpmath=";
2976 switch ((int) fpmath)
2977 {
2978 case FPMATH_387:
2979 opts[num++][1] = "387";
2980 break;
2981
2982 case FPMATH_SSE:
2983 opts[num++][1] = "sse";
2984 break;
2985
2986 case FPMATH_387 | FPMATH_SSE:
2987 opts[num++][1] = "sse+387";
2988 break;
2989
2990 default:
2991 gcc_unreachable ();
2992 }
2993 }
2994
2995 /* Any options? */
2996 if (num == 0)
2997 return NULL;
2998
2999 gcc_assert (num < ARRAY_SIZE (opts));
3000
3001 /* Size the string. */
3002 len = 0;
3003 sep_len = (add_nl_p) ? 3 : 1;
3004 for (i = 0; i < num; i++)
3005 {
3006 len += sep_len;
3007 for (j = 0; j < 2; j++)
3008 if (opts[i][j])
3009 len += strlen (opts[i][j]);
3010 }
3011
3012 /* Build the string. */
3013 ret = ptr = (char *) xmalloc (len);
3014 line_len = 0;
3015
3016 for (i = 0; i < num; i++)
3017 {
3018 size_t len2[2];
3019
3020 for (j = 0; j < 2; j++)
3021 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
3022
3023 if (i != 0)
3024 {
3025 *ptr++ = ' ';
3026 line_len++;
3027
3028 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
3029 {
3030 *ptr++ = '\\';
3031 *ptr++ = '\n';
3032 line_len = 0;
3033 }
3034 }
3035
3036 for (j = 0; j < 2; j++)
3037 if (opts[i][j])
3038 {
3039 memcpy (ptr, opts[i][j], len2[j]);
3040 ptr += len2[j];
3041 line_len += len2[j];
3042 }
3043 }
3044
3045 *ptr = '\0';
3046 gcc_assert (ret + len >= ptr);
3047
3048 return ret;
3049 }
3050
3051 /* Return true if profiling code should be emitted before the
3052 prologue, and false otherwise.
3053 Note: for x86 the "hotfix" case is sorried (not supported). */
3054 static bool
3055 ix86_profile_before_prologue (void)
3056 {
3057 return flag_fentry != 0;
3058 }
3059
3060 /* Function that is callable from the debugger to print the current
3061 options. */
3062 void ATTRIBUTE_UNUSED
3063 ix86_debug_options (void)
3064 {
3065 char *opts = ix86_target_string (ix86_isa_flags, ix86_isa_flags2,
3066 target_flags, ix86_target_flags,
3067 ix86_arch_string,ix86_tune_string,
3068 ix86_fpmath, true);
3069
3070 if (opts)
3071 {
3072 fprintf (stderr, "%s\n\n", opts);
3073 free (opts);
3074 }
3075 else
3076 fputs ("<no options>\n\n", stderr);
3077
3078 return;
3079 }
3080
3081 /* Return true if T is one of the bytes we should avoid with
3082 -fmitigate-rop. */
3083
3084 static bool
3085 ix86_rop_should_change_byte_p (int t)
3086 {
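  /* 0xc3/0xc2 are the near RET opcodes (without/with an immediate) and
     0xcb/0xca the far RET variants, i.e. the bytes that can terminate a
     ROP gadget.  */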
3087 return t == 0xc2 || t == 0xc3 || t == 0xca || t == 0xcb;
3088 }
3089
3090 static const char *stringop_alg_names[] = {
3091 #define DEF_ENUM
3092 #define DEF_ALG(alg, name) #name,
3093 #include "stringop.def"
3094 #undef DEF_ENUM
3095 #undef DEF_ALG
3096 };
3097
3098 /* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
3099 The string is of the following form (or comma separated list of it):
3100
3101 strategy_alg:max_size:[align|noalign]
3102
3103 where the full size range for the strategy is either [0, max_size] or
3104 [min_size, max_size], in which min_size is the max_size + 1 of the
3105 preceding range. The last size range must have max_size == -1.
3106
3107 Examples:
3108
3109 1.
3110 -mmemcpy-strategy=libcall:-1:noalign
3111
3112 this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
3113
3114
3115 2.
3116 -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
3117
3118 This is to tell the compiler to use the following strategy for memset
3119 1) when the expected size is between [1, 16], use rep_8byte strategy;
3120 2) when the size is between [17, 2048], use vector_loop;
3121 3) when the size is > 2048, use libcall. */
3122
3123 struct stringop_size_range
3124 {
3125 int max;
3126 stringop_alg alg;
3127 bool noalign;
3128 };
3129
3130 static void
3131 ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
3132 {
3133 const struct stringop_algs *default_algs;
3134 stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
3135 char *curr_range_str, *next_range_str;
3136 const char *opt = is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=";
3137 int i = 0, n = 0;
3138
3139 if (is_memset)
3140 default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
3141 else
3142 default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
3143
3144 curr_range_str = strategy_str;
3145
3146 do
3147 {
3148 int maxs;
3149 char alg_name[128];
3150 char align[16];
3151 next_range_str = strchr (curr_range_str, ',');
3152 if (next_range_str)
3153 *next_range_str++ = '\0';
3154
3155 if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
3156 alg_name, &maxs, align))
3157 {
3158 error ("wrong argument %qs to option %qs", curr_range_str, opt);
3159 return;
3160 }
3161
3162 if (n > 0 && (maxs < (input_ranges[n - 1].max + 1) && maxs != -1))
3163 {
3164 error ("size ranges of option %qs should be increasing", opt);
3165 return;
3166 }
3167
3168 for (i = 0; i < last_alg; i++)
3169 if (!strcmp (alg_name, stringop_alg_names[i]))
3170 break;
3171
3172 if (i == last_alg)
3173 {
3174 error ("wrong strategy name %qs specified for option %qs",
3175 alg_name, opt);
3176
3177 auto_vec <const char *> candidates;
3178 for (i = 0; i < last_alg; i++)
3179 if ((stringop_alg) i != rep_prefix_8_byte || TARGET_64BIT)
3180 candidates.safe_push (stringop_alg_names[i]);
3181
3182 char *s;
3183 const char *hint
3184 = candidates_list_and_hint (alg_name, s, candidates);
3185 if (hint)
3186 inform (input_location,
3187 "valid arguments to %qs are: %s; did you mean %qs?",
3188 opt, s, hint);
3189 else
3190 inform (input_location, "valid arguments to %qs are: %s",
3191 opt, s);
3192 XDELETEVEC (s);
3193 return;
3194 }
3195
3196 if ((stringop_alg) i == rep_prefix_8_byte
3197 && !TARGET_64BIT)
3198 {
3199 /* rep; movq isn't available in 32-bit code. */
3200 error ("strategy name %qs specified for option %qs "
3201 "not supported for 32-bit code", alg_name, opt);
3202 return;
3203 }
3204
3205 input_ranges[n].max = maxs;
3206 input_ranges[n].alg = (stringop_alg) i;
3207 if (!strcmp (align, "align"))
3208 input_ranges[n].noalign = false;
3209 else if (!strcmp (align, "noalign"))
3210 input_ranges[n].noalign = true;
3211 else
3212 {
3213 error ("unknown alignment %qs specified for option %qs", align, opt);
3214 return;
3215 }
3216 n++;
3217 curr_range_str = next_range_str;
3218 }
3219 while (curr_range_str);
3220
3221 if (input_ranges[n - 1].max != -1)
3222 {
3223 error ("the max value for the last size range should be -1"
3224 " for option %qs", opt);
3225 return;
3226 }
3227
3228 if (n > MAX_STRINGOP_ALGS)
3229 {
3230 error ("too many size ranges specified in option %qs", opt);
3231 return;
3232 }
3233
3234 /* Now override the default algs array. */
3235 for (i = 0; i < n; i++)
3236 {
3237 *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
3238 *const_cast<stringop_alg *>(&default_algs->size[i].alg)
3239 = input_ranges[i].alg;
3240 *const_cast<int *>(&default_algs->size[i].noalign)
3241 = input_ranges[i].noalign;
3242 }
3243 }
3244
3245 \f
3246 /* Parse the -mtune-ctrl= option. When DUMP is true,
3247 print the features that are explicitly set. */
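/* For example, -mtune-ctrl=^use_incdec,partial_reg_dependency would
   clear the first feature and set the second (feature names here are
   illustrative; the valid set comes from ix86_tune_feature_names).  */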
3248
3249 static void
3250 parse_mtune_ctrl_str (bool dump)
3251 {
3252 if (!ix86_tune_ctrl_string)
3253 return;
3254
3255 char *next_feature_string = NULL;
3256 char *curr_feature_string = xstrdup (ix86_tune_ctrl_string);
3257 char *orig = curr_feature_string;
3258 int i;
3259 do
3260 {
3261 bool clear = false;
3262
3263 next_feature_string = strchr (curr_feature_string, ',');
3264 if (next_feature_string)
3265 *next_feature_string++ = '\0';
3266 if (*curr_feature_string == '^')
3267 {
3268 curr_feature_string++;
3269 clear = true;
3270 }
3271 for (i = 0; i < X86_TUNE_LAST; i++)
3272 {
3273 if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
3274 {
3275 ix86_tune_features[i] = !clear;
3276 if (dump)
3277 fprintf (stderr, "Explicitly %s feature %s\n",
3278 clear ? "clear" : "set", ix86_tune_feature_names[i]);
3279 break;
3280 }
3281 }
3282 if (i == X86_TUNE_LAST)
3283 error ("unknown parameter to option -mtune-ctrl: %s",
3284 clear ? curr_feature_string - 1 : curr_feature_string);
3285 curr_feature_string = next_feature_string;
3286 }
3287 while (curr_feature_string);
3288 free (orig);
3289 }
3290
3291 /* Helper function to set ix86_tune_features. IX86_TUNE is the
3292 processor type. */
3293
3294 static void
3295 set_ix86_tune_features (enum processor_type ix86_tune, bool dump)
3296 {
3297 unsigned int ix86_tune_mask = 1u << ix86_tune;
3298 int i;
3299
3300 for (i = 0; i < X86_TUNE_LAST; ++i)
3301 {
3302 if (ix86_tune_no_default)
3303 ix86_tune_features[i] = 0;
3304 else
3305 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3306 }
3307
3308 if (dump)
3309 {
3310 fprintf (stderr, "List of x86 specific tuning parameter names:\n");
3311 for (i = 0; i < X86_TUNE_LAST; i++)
3312 fprintf (stderr, "%s : %s\n", ix86_tune_feature_names[i],
3313 ix86_tune_features[i] ? "on" : "off");
3314 }
3315
3316 parse_mtune_ctrl_str (dump);
3317 }
3318
3319
3320 /* Default align_* from the processor table. */
3321
3322 static void
3323 ix86_default_align (struct gcc_options *opts)
3324 {
3325 if (opts->x_align_loops == 0)
3326 {
3327 opts->x_align_loops = processor_target_table[ix86_tune].align_loop;
3328 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3329 }
3330 if (opts->x_align_jumps == 0)
3331 {
3332 opts->x_align_jumps = processor_target_table[ix86_tune].align_jump;
3333 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3334 }
3335 if (opts->x_align_functions == 0)
3336 {
3337 opts->x_align_functions = processor_target_table[ix86_tune].align_func;
3338 }
3339 }
3340
3341 /* Implement TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE hook. */
3342
3343 static void
3344 ix86_override_options_after_change (void)
3345 {
3346 ix86_default_align (&global_options);
3347 }
3348
3349 /* Override various settings based on options. If MAIN_ARGS_P, the
3350 options are from the command line, otherwise they are from
3351 attributes. Return true if there's an error related to the
3352 -march option. */
3353
3354 static bool
3355 ix86_option_override_internal (bool main_args_p,
3356 struct gcc_options *opts,
3357 struct gcc_options *opts_set)
3358 {
3359 int i;
3360 unsigned int ix86_arch_mask;
3361 const bool ix86_tune_specified = (opts->x_ix86_tune_string != NULL);
3362
3363 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
3364 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
3365 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
3366 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
3367 #define PTA_AES (HOST_WIDE_INT_1 << 4)
3368 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
3369 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
3370 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
3371 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
3372 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
3373 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
3374 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
3375 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
3376 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
3377 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
3378 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
3379 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
3380 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
3381 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
3382 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
3383 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
3384 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
3385 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
3386 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
3387 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
3388 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
3389 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
3390 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
3391 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
3392 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
3393 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
3394 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
3395 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
3396 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
3397 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
3398 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
3399 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
3400 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
3401 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
3402 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
3403 #define PTA_AVX512F (HOST_WIDE_INT_1 << 40)
3404 #define PTA_AVX512ER (HOST_WIDE_INT_1 << 41)
3405 #define PTA_AVX512PF (HOST_WIDE_INT_1 << 42)
3406 #define PTA_AVX512CD (HOST_WIDE_INT_1 << 43)
3407 #define PTA_MPX (HOST_WIDE_INT_1 << 44)
3408 #define PTA_SHA (HOST_WIDE_INT_1 << 45)
3409 #define PTA_PREFETCHWT1 (HOST_WIDE_INT_1 << 46)
3410 #define PTA_CLFLUSHOPT (HOST_WIDE_INT_1 << 47)
3411 #define PTA_XSAVEC (HOST_WIDE_INT_1 << 48)
3412 #define PTA_XSAVES (HOST_WIDE_INT_1 << 49)
3413 #define PTA_AVX512DQ (HOST_WIDE_INT_1 << 50)
3414 #define PTA_AVX512BW (HOST_WIDE_INT_1 << 51)
3415 #define PTA_AVX512VL (HOST_WIDE_INT_1 << 52)
3416 #define PTA_AVX512IFMA (HOST_WIDE_INT_1 << 53)
3417 #define PTA_AVX512VBMI (HOST_WIDE_INT_1 << 54)
3418 #define PTA_CLWB (HOST_WIDE_INT_1 << 55)
3419 #define PTA_MWAITX (HOST_WIDE_INT_1 << 56)
3420 #define PTA_CLZERO (HOST_WIDE_INT_1 << 57)
3421 #define PTA_NO_80387 (HOST_WIDE_INT_1 << 58)
3422 #define PTA_PKU (HOST_WIDE_INT_1 << 59)
3423 #define PTA_AVX5124VNNIW (HOST_WIDE_INT_1 << 60)
3424 #define PTA_AVX5124FMAPS (HOST_WIDE_INT_1 << 61)
3425 #define PTA_AVX512VPOPCNTDQ (HOST_WIDE_INT_1 << 62)
3426 #define PTA_SGX (HOST_WIDE_INT_1 << 63)
3427
3428 #define PTA_CORE2 \
3429 (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 \
3430 | PTA_CX16 | PTA_FXSR)
3431 #define PTA_NEHALEM \
3432 (PTA_CORE2 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_POPCNT)
3433 #define PTA_WESTMERE \
3434 (PTA_NEHALEM | PTA_AES | PTA_PCLMUL)
3435 #define PTA_SANDYBRIDGE \
3436 (PTA_WESTMERE | PTA_AVX | PTA_XSAVE | PTA_XSAVEOPT)
3437 #define PTA_IVYBRIDGE \
3438 (PTA_SANDYBRIDGE | PTA_FSGSBASE | PTA_RDRND | PTA_F16C)
3439 #define PTA_HASWELL \
3440 (PTA_IVYBRIDGE | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_LZCNT \
3441 | PTA_FMA | PTA_MOVBE | PTA_HLE)
3442 #define PTA_BROADWELL \
3443 (PTA_HASWELL | PTA_ADX | PTA_PRFCHW | PTA_RDSEED)
3444 #define PTA_SKYLAKE \
3445 (PTA_BROADWELL | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES)
3446 #define PTA_SKYLAKE_AVX512 \
3447 (PTA_SKYLAKE | PTA_AVX512F | PTA_AVX512CD | PTA_AVX512VL \
3448 | PTA_AVX512BW | PTA_AVX512DQ | PTA_PKU)
3449 #define PTA_CANNONLAKE \
3450 (PTA_SKYLAKE_AVX512 | PTA_AVX512VBMI | PTA_AVX512IFMA | PTA_SHA | PTA_CLWB)
3451 #define PTA_KNL \
3452 (PTA_BROADWELL | PTA_AVX512PF | PTA_AVX512ER | PTA_AVX512F | PTA_AVX512CD)
3453 #define PTA_BONNELL \
3454 (PTA_CORE2 | PTA_MOVBE)
3455 #define PTA_SILVERMONT \
3456 (PTA_WESTMERE | PTA_MOVBE)
3457 #define PTA_KNM \
3458 (PTA_KNL | PTA_AVX5124VNNIW | PTA_AVX5124FMAPS | PTA_AVX512VPOPCNTDQ)
3459
3460 /* If this reaches 64, we need to widen the struct pta flags field below. */
3461
3462 static struct pta
3463 {
3464 const char *const name; /* processor name or nickname. */
3465 const enum processor_type processor;
3466 const enum attr_cpu schedule;
3467 const unsigned HOST_WIDE_INT flags;
3468 }
3469 const processor_alias_table[] =
3470 {
3471 {"i386", PROCESSOR_I386, CPU_NONE, 0},
3472 {"i486", PROCESSOR_I486, CPU_NONE, 0},
3473 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3474 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
3475 {"lakemont", PROCESSOR_LAKEMONT, CPU_PENTIUM, PTA_NO_80387},
3476 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
3477 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
3478 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3479 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3480 {"samuel-2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
3481 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3482 PTA_MMX | PTA_SSE | PTA_FXSR},
3483 {"nehemiah", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3484 PTA_MMX | PTA_SSE | PTA_FXSR},
3485 {"c7", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3486 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3487 {"esther", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3488 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3489 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3490 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
3491 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
3492 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3493 PTA_MMX | PTA_SSE | PTA_FXSR},
3494 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3495 PTA_MMX | PTA_SSE | PTA_FXSR},
3496 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
3497 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3498 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
3499 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
3500 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
3501 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
3502 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
3503 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3504 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
3505 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3506 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
3507 {"core2", PROCESSOR_CORE2, CPU_CORE2, PTA_CORE2},
3508 {"nehalem", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3509 {"corei7", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_NEHALEM},
3510 {"westmere", PROCESSOR_NEHALEM, CPU_NEHALEM, PTA_WESTMERE},
3511 {"sandybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3512 PTA_SANDYBRIDGE},
3513 {"corei7-avx", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3514 PTA_SANDYBRIDGE},
3515 {"ivybridge", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3516 PTA_IVYBRIDGE},
3517 {"core-avx-i", PROCESSOR_SANDYBRIDGE, CPU_NEHALEM,
3518 PTA_IVYBRIDGE},
3519 {"haswell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3520 {"core-avx2", PROCESSOR_HASWELL, CPU_HASWELL, PTA_HASWELL},
3521 {"broadwell", PROCESSOR_HASWELL, CPU_HASWELL, PTA_BROADWELL},
3522 {"skylake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_SKYLAKE},
3523 {"skylake-avx512", PROCESSOR_SKYLAKE_AVX512, CPU_HASWELL,
3524 PTA_SKYLAKE_AVX512},
3525 {"cannonlake", PROCESSOR_HASWELL, CPU_HASWELL, PTA_CANNONLAKE},
3526 {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3527 {"atom", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL},
3528 {"silvermont", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3529 {"slm", PROCESSOR_SILVERMONT, CPU_SLM, PTA_SILVERMONT},
3530 {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL},
3531 {"knm", PROCESSOR_KNM, CPU_SLM, PTA_KNM},
3532 {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM},
3533 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3534 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3535 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3536 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3537 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3538 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3539 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3540 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3541 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3542 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3543 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3544 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3545 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3546 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3547 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_FXSR},
3548 {"x86-64", PROCESSOR_K8, CPU_K8,
3549 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3550 {"eden-x2", PROCESSOR_K8, CPU_K8,
3551 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
3552 {"nano", PROCESSOR_K8, CPU_K8,
3553 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3554 | PTA_SSSE3 | PTA_FXSR},
3555 {"nano-1000", PROCESSOR_K8, CPU_K8,
3556 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3557 | PTA_SSSE3 | PTA_FXSR},
3558 {"nano-2000", PROCESSOR_K8, CPU_K8,
3559 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3560 | PTA_SSSE3 | PTA_FXSR},
3561 {"nano-3000", PROCESSOR_K8, CPU_K8,
3562 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3563 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3564 {"nano-x2", PROCESSOR_K8, CPU_K8,
3565 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3566 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3567 {"eden-x4", PROCESSOR_K8, CPU_K8,
3568 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3569 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3570 {"nano-x4", PROCESSOR_K8, CPU_K8,
3571 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3572 | PTA_SSSE3 | PTA_SSE4_1 | PTA_FXSR},
3573 {"k8", PROCESSOR_K8, CPU_K8,
3574 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3575 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3576 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3577 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3578 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3579 {"opteron", PROCESSOR_K8, CPU_K8,
3580 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3581 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3582 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3583 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3584 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3585 {"athlon64", PROCESSOR_K8, CPU_K8,
3586 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3587 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3588 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3589 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3590 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_FXSR},
3591 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3592 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3593 | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
3594 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3595 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3596 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3597 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3598 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
3599 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
3600 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3601 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3602 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3603 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3604 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3605 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3606 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3607 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3608 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3609 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3610 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
3611 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
3612 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3613 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3614 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3615 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3616 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
3617 | PTA_XSAVEOPT | PTA_FSGSBASE},
3618 {"bdver4", PROCESSOR_BDVER4, CPU_BDVER4,
3619 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3620 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3621 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3622 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_BMI2
3623 | PTA_TBM | PTA_F16C | PTA_FMA | PTA_PRFCHW | PTA_FXSR
3624 | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE | PTA_RDRND
3625 | PTA_MOVBE | PTA_MWAITX},
3626 {"znver1", PROCESSOR_ZNVER1, CPU_ZNVER1,
3627 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3628 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3629 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_AVX2
3630 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_PRFCHW
3631 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT | PTA_FSGSBASE
3632 | PTA_RDRND | PTA_MOVBE | PTA_MWAITX | PTA_ADX | PTA_RDSEED
3633 | PTA_CLZERO | PTA_CLFLUSHOPT | PTA_XSAVEC | PTA_XSAVES
3634 | PTA_SHA | PTA_LZCNT | PTA_POPCNT},
3635 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC,
3636 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3637 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_PRFCHW
3638 | PTA_FXSR | PTA_XSAVE},
3639 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
3640 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3641 | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16 | PTA_SSE4_1
3642 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3643 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
3644 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
3645
3646 {"generic", PROCESSOR_GENERIC, CPU_GENERIC,
3647 PTA_64BIT
3648 | PTA_HLE /* flags are only used for -march switch. */ },
3649 };
3650
3651 /* -mrecip options. */
3652 static struct
3653 {
3654 const char *string; /* option name */
3655 unsigned int mask; /* mask bits to set */
3656 }
3657 const recip_options[] =
3658 {
3659 { "all", RECIP_MASK_ALL },
3660 { "none", RECIP_MASK_NONE },
3661 { "div", RECIP_MASK_DIV },
3662 { "sqrt", RECIP_MASK_SQRT },
3663 { "vec-div", RECIP_MASK_VEC_DIV },
3664 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3665 };
3666
3667 int const pta_size = ARRAY_SIZE (processor_alias_table);
3668
3669 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3670 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3671 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3672 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3673 #ifdef TARGET_BI_ARCH
3674 else
3675 {
3676 #if TARGET_BI_ARCH == 1
3677 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3678 is on and OPTION_MASK_ABI_X32 is off. We turn off
3679 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3680 -mx32. */
3681 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3682 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3683 #else
3684 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3685 on and OPTION_MASK_ABI_64 is off. We turn off
3686 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3687 -m64 or OPTION_MASK_CODE16 is turned on by -m16. */
3688 if (TARGET_LP64_P (opts->x_ix86_isa_flags)
3689 || TARGET_16BIT_P (opts->x_ix86_isa_flags))
3690 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3691 #endif
3692 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3693 && TARGET_IAMCU_P (opts->x_target_flags))
3694 sorry ("Intel MCU psABI isn%'t supported in %s mode",
3695 TARGET_X32_P (opts->x_ix86_isa_flags) ? "x32" : "64-bit");
3696 }
3697 #endif
3698
3699 if (TARGET_X32_P (opts->x_ix86_isa_flags))
3700 {
3701 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3702 OPTION_MASK_ABI_64 for TARGET_X32. */
3703 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3704 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3705 }
3706 else if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
3707 opts->x_ix86_isa_flags &= ~(OPTION_MASK_ISA_64BIT
3708 | OPTION_MASK_ABI_X32
3709 | OPTION_MASK_ABI_64);
3710 else if (TARGET_LP64_P (opts->x_ix86_isa_flags))
3711 {
3712 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3713 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3714 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3715 opts->x_ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3716 }
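  /* At this point the ABI selection is consistent: -mx32 keeps the 64-bit
     ISA but clears the LP64 ABI bit, -m16 clears the 64-bit ISA and both
     ABI bits, and LP64 keeps the 64-bit ISA while clearing the x32 ABI
     bit, so at most one of the two ABI masks remains set.  */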
3717
3718 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3719 SUBTARGET_OVERRIDE_OPTIONS;
3720 #endif
3721
3722 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3723 SUBSUBTARGET_OVERRIDE_OPTIONS;
3724 #endif
3725
3726 /* -fPIC is the default for x86_64. */
3727 if (TARGET_MACHO && TARGET_64BIT_P (opts->x_ix86_isa_flags))
3728 opts->x_flag_pic = 2;
3729
3730 /* Need to check -mtune=generic first. */
3731 if (opts->x_ix86_tune_string)
3732 {
3733 /* As special support for cross compilers, we read -mtune=native
3734 as -mtune=generic.  With native compilers we won't see -mtune=native,
3735 as it will have been rewritten by the driver. */
3736 if (!strcmp (opts->x_ix86_tune_string, "native"))
3737 {
3738 opts->x_ix86_tune_string = "generic";
3739 }
3740 else if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3741 warning (OPT_Wdeprecated,
3742 main_args_p
3743 ? G_("%<-mtune=x86-64%> is deprecated; use %<-mtune=k8%> "
3744 "or %<-mtune=generic%> instead as appropriate")
3745 : G_("%<target(\"tune=x86-64\")%> is deprecated; use "
3746 "%<target(\"tune=k8\")%> or %<target(\"tune=generic\")%>"
3747 " instead as appropriate"));
3748 }
3749 else
3750 {
3751 if (opts->x_ix86_arch_string)
3752 opts->x_ix86_tune_string = opts->x_ix86_arch_string;
3753 if (!opts->x_ix86_tune_string)
3754 {
3755 opts->x_ix86_tune_string
3756 = processor_target_table[TARGET_CPU_DEFAULT].name;
3757 ix86_tune_defaulted = 1;
3758 }
3759
3760 /* opts->x_ix86_tune_string is set to opts->x_ix86_arch_string
3761 or defaulted. We need to use a sensible tune option. */
3762 if (!strcmp (opts->x_ix86_tune_string, "x86-64"))
3763 {
3764 opts->x_ix86_tune_string = "generic";
3765 }
3766 }
3767
3768 if (opts->x_ix86_stringop_alg == rep_prefix_8_byte
3769 && !TARGET_64BIT_P (opts->x_ix86_isa_flags))
3770 {
3771 /* rep; movq isn't available in 32-bit code. */
3772 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3773 opts->x_ix86_stringop_alg = no_stringop;
3774 }
3775
3776 if (!opts->x_ix86_arch_string)
3777 opts->x_ix86_arch_string
3778 = TARGET_64BIT_P (opts->x_ix86_isa_flags)
3779 ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3780 else
3781 ix86_arch_specified = 1;
3782
3783 if (opts_set->x_ix86_pmode)
3784 {
3785 if ((TARGET_LP64_P (opts->x_ix86_isa_flags)
3786 && opts->x_ix86_pmode == PMODE_SI)
3787 || (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
3788 && opts->x_ix86_pmode == PMODE_DI))
3789 error ("address mode %qs not supported in the %s bit mode",
3790 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "short" : "long",
3791 TARGET_64BIT_P (opts->x_ix86_isa_flags) ? "64" : "32");
3792 }
3793 else
3794 opts->x_ix86_pmode = TARGET_LP64_P (opts->x_ix86_isa_flags)
3795 ? PMODE_DI : PMODE_SI;
3796
3797 if (!opts_set->x_ix86_abi)
3798 opts->x_ix86_abi = DEFAULT_ABI;
3799
3800 if (opts->x_ix86_abi == MS_ABI && TARGET_X32_P (opts->x_ix86_isa_flags))
3801 error ("-mabi=ms not supported with X32 ABI");
3802 gcc_assert (opts->x_ix86_abi == SYSV_ABI || opts->x_ix86_abi == MS_ABI);
3803
3804 /* For targets using the MS ABI, enable MS extensions unless they
3805 were explicitly turned off.  For a non-MS ABI we turn this
3806 option off. */
3807 if (!opts_set->x_flag_ms_extensions)
3808 opts->x_flag_ms_extensions = (MS_ABI == DEFAULT_ABI);
3809
3810 if (opts_set->x_ix86_cmodel)
3811 {
3812 switch (opts->x_ix86_cmodel)
3813 {
3814 case CM_SMALL:
3815 case CM_SMALL_PIC:
3816 if (opts->x_flag_pic)
3817 opts->x_ix86_cmodel = CM_SMALL_PIC;
3818 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3819 error ("code model %qs not supported in the %s bit mode",
3820 "small", "32");
3821 break;
3822
3823 case CM_MEDIUM:
3824 case CM_MEDIUM_PIC:
3825 if (opts->x_flag_pic)
3826 opts->x_ix86_cmodel = CM_MEDIUM_PIC;
3827 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3828 error ("code model %qs not supported in the %s bit mode",
3829 "medium", "32");
3830 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3831 error ("code model %qs not supported in x32 mode",
3832 "medium");
3833 break;
3834
3835 case CM_LARGE:
3836 case CM_LARGE_PIC:
3837 if (opts->x_flag_pic)
3838 opts->x_ix86_cmodel = CM_LARGE_PIC;
3839 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3840 error ("code model %qs not supported in the %s bit mode",
3841 "large", "32");
3842 else if (TARGET_X32_P (opts->x_ix86_isa_flags))
3843 error ("code model %qs not supported in x32 mode",
3844 "large");
3845 break;
3846
3847 case CM_32:
3848 if (opts->x_flag_pic)
3849 error ("code model %s does not support PIC mode", "32");
3850 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3851 error ("code model %qs not supported in the %s bit mode",
3852 "32", "64");
3853 break;
3854
3855 case CM_KERNEL:
3856 if (opts->x_flag_pic)
3857 {
3858 error ("code model %s does not support PIC mode", "kernel");
3859 opts->x_ix86_cmodel = CM_32;
3860 }
3861 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
3862 error ("code model %qs not supported in the %s bit mode",
3863 "kernel", "32");
3864 break;
3865
3866 default:
3867 gcc_unreachable ();
3868 }
3869 }
3870 else
3871 {
3872 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3873 use of rip-relative addressing. This eliminates fixups that
3874 would otherwise be needed if this object is to be placed in a
3875 DLL, and is essentially just as efficient as direct addressing. */
3876 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3877 && (TARGET_RDOS || TARGET_PECOFF))
3878 opts->x_ix86_cmodel = CM_MEDIUM_PIC, opts->x_flag_pic = 1;
3879 else if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
3880 opts->x_ix86_cmodel = opts->x_flag_pic ? CM_SMALL_PIC : CM_SMALL;
3881 else
3882 opts->x_ix86_cmodel = CM_32;
3883 }
3884 if (TARGET_MACHO && opts->x_ix86_asm_dialect == ASM_INTEL)
3885 {
3886 error ("-masm=intel not supported in this configuration");
3887 opts->x_ix86_asm_dialect = ASM_ATT;
3888 }
3889 if ((TARGET_64BIT_P (opts->x_ix86_isa_flags) != 0)
3890 != ((opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3891 sorry ("%i-bit mode not compiled in",
3892 (opts->x_ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3893
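  /* Map each PTA_* capability implied by the selected -march CPU onto the
     corresponding OPTION_MASK_ISA_* bit.  A bit is only turned on when the
     user did not already set or clear that ISA explicitly: the
     x_ix86_isa_flags_explicit mask records which -m<isa> options were
     given, so an explicit -mno-<isa> always wins over the CPU default.  */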
3894 for (i = 0; i < pta_size; i++)
3895 if (! strcmp (opts->x_ix86_arch_string, processor_alias_table[i].name))
3896 {
3897 if (!strcmp (opts->x_ix86_arch_string, "generic"))
3898 {
3899 error (main_args_p
3900 ? G_("%<generic%> CPU can be used only for %<-mtune=%> "
3901 "switch")
3902 : G_("%<generic%> CPU can be used only for "
3903 "%<target(\"tune=\")%> attribute"));
3904 return false;
3905 }
3906 else if (!strcmp (opts->x_ix86_arch_string, "intel"))
3907 {
3908 error (main_args_p
3909 ? G_("%<intel%> CPU can be used only for %<-mtune=%> "
3910 "switch")
3911 : G_("%<intel%> CPU can be used only for "
3912 "%<target(\"tune=\")%> attribute"));
3913 return false;
3914 }
3915
3916 if (TARGET_64BIT_P (opts->x_ix86_isa_flags)
3917 && !(processor_alias_table[i].flags & PTA_64BIT))
3918 {
3919 error ("CPU you selected does not support x86-64 "
3920 "instruction set");
3921 return false;
3922 }
3923
3924 ix86_schedule = processor_alias_table[i].schedule;
3925 ix86_arch = processor_alias_table[i].processor;
3926 /* Default cpu tuning to the architecture. */
3927 ix86_tune = ix86_arch;
3928
3929 if (processor_alias_table[i].flags & PTA_MMX
3930 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3931 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3932 if (processor_alias_table[i].flags & PTA_3DNOW
3933 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3934 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3935 if (processor_alias_table[i].flags & PTA_3DNOW_A
3936 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3937 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3938 if (processor_alias_table[i].flags & PTA_SSE
3939 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3940 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3941 if (processor_alias_table[i].flags & PTA_SSE2
3942 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3943 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3944 if (processor_alias_table[i].flags & PTA_SSE3
3945 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3946 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3947 if (processor_alias_table[i].flags & PTA_SSSE3
3948 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3949 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3950 if (processor_alias_table[i].flags & PTA_SSE4_1
3951 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3952 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3953 if (processor_alias_table[i].flags & PTA_SSE4_2
3954 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3955 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3956 if (processor_alias_table[i].flags & PTA_AVX
3957 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3958 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3959 if (processor_alias_table[i].flags & PTA_AVX2
3960 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3961 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3962 if (processor_alias_table[i].flags & PTA_FMA
3963 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3964 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3965 if (processor_alias_table[i].flags & PTA_SSE4A
3966 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3967 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3968 if (processor_alias_table[i].flags & PTA_FMA4
3969 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3970 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3971 if (processor_alias_table[i].flags & PTA_XOP
3972 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3973 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3974 if (processor_alias_table[i].flags & PTA_LWP
3975 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3976 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3977 if (processor_alias_table[i].flags & PTA_ABM
3978 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3979 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3980 if (processor_alias_table[i].flags & PTA_BMI
3981 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3982 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3983 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3984 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3985 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3986 if (processor_alias_table[i].flags & PTA_TBM
3987 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3988 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3989 if (processor_alias_table[i].flags & PTA_BMI2
3990 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3991 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3992 if (processor_alias_table[i].flags & PTA_CX16
3993 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3994 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3995 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3996 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3997 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3998 if (!(TARGET_64BIT_P (opts->x_ix86_isa_flags)
3999 && (processor_alias_table[i].flags & PTA_NO_SAHF))
4000 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
4001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
4002 if (processor_alias_table[i].flags & PTA_MOVBE
4003 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
4004 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
4005 if (processor_alias_table[i].flags & PTA_AES
4006 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
4007 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES;
4008 if (processor_alias_table[i].flags & PTA_SHA
4009 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_SHA))
4010 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SHA;
4011 if (processor_alias_table[i].flags & PTA_PCLMUL
4012 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
4013 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
4014 if (processor_alias_table[i].flags & PTA_FSGSBASE
4015 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
4016 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
4017 if (processor_alias_table[i].flags & PTA_RDRND
4018 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
4019 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
4020 if (processor_alias_table[i].flags & PTA_F16C
4021 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
4022 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C;
4023 if (processor_alias_table[i].flags & PTA_RTM
4024 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
4025 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RTM;
4026 if (processor_alias_table[i].flags & PTA_HLE
4027 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
4028 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_HLE;
4029 if (processor_alias_table[i].flags & PTA_PRFCHW
4030 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
4031 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
4032 if (processor_alias_table[i].flags & PTA_RDSEED
4033 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
4034 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
4035 if (processor_alias_table[i].flags & PTA_ADX
4036 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
4037 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ADX;
4038 if (processor_alias_table[i].flags & PTA_FXSR
4039 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
4040 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
4041 if (processor_alias_table[i].flags & PTA_XSAVE
4042 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
4043 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
4044 if (processor_alias_table[i].flags & PTA_XSAVEOPT
4045 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
4046 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
4047 if (processor_alias_table[i].flags & PTA_AVX512F
4048 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512F))
4049 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512F;
4050 if (processor_alias_table[i].flags & PTA_AVX512ER
4051 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512ER))
4052 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512ER;
4053 if (processor_alias_table[i].flags & PTA_AVX512PF
4054 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512PF))
4055 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512PF;
4056 if (processor_alias_table[i].flags & PTA_AVX512CD
4057 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512CD))
4058 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512CD;
4059 if (processor_alias_table[i].flags & PTA_PREFETCHWT1
4060 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PREFETCHWT1))
4061 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PREFETCHWT1;
4062 if (processor_alias_table[i].flags & PTA_CLWB
4063 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLWB))
4064 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLWB;
4065 if (processor_alias_table[i].flags & PTA_CLFLUSHOPT
4066 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLFLUSHOPT))
4067 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLFLUSHOPT;
4068 if (processor_alias_table[i].flags & PTA_CLZERO
4069 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_CLZERO))
4070 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CLZERO;
4071 if (processor_alias_table[i].flags & PTA_XSAVEC
4072 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEC))
4073 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVEC;
4074 if (processor_alias_table[i].flags & PTA_XSAVES
4075 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVES))
4076 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XSAVES;
4077 if (processor_alias_table[i].flags & PTA_AVX512DQ
4078 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512DQ))
4079 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512DQ;
4080 if (processor_alias_table[i].flags & PTA_AVX512BW
4081 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512BW))
4082 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512BW;
4083 if (processor_alias_table[i].flags & PTA_AVX512VL
4084 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VL))
4085 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VL;
4086 if (processor_alias_table[i].flags & PTA_MPX
4087 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_MPX))
4088 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_MPX;
4089 if (processor_alias_table[i].flags & PTA_AVX512VBMI
4090 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512VBMI))
4091 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512VBMI;
4092 if (processor_alias_table[i].flags & PTA_AVX512IFMA
4093 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX512IFMA))
4094 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX512IFMA;
4095
4096 if (processor_alias_table[i].flags & PTA_AVX5124VNNIW
4097 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124VNNIW))
4098 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124VNNIW;
4099 if (processor_alias_table[i].flags & PTA_AVX5124FMAPS
4100 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX5124FMAPS))
4101 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX5124FMAPS;
4102 if (processor_alias_table[i].flags & PTA_AVX512VPOPCNTDQ
4103 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_AVX512VPOPCNTDQ))
4104 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_AVX512VPOPCNTDQ;
4105 if (processor_alias_table[i].flags & PTA_SGX
4106 && !(opts->x_ix86_isa_flags2_explicit & OPTION_MASK_ISA_SGX))
4107 opts->x_ix86_isa_flags2 |= OPTION_MASK_ISA_SGX;
4108
4109 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
4110 x86_prefetch_sse = true;
4111 if (processor_alias_table[i].flags & PTA_MWAITX
4112 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_MWAITX))
4113 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MWAITX;
4114 if (processor_alias_table[i].flags & PTA_PKU
4115 && !(opts->x_ix86_isa_flags_explicit & OPTION_MASK_ISA_PKU))
4116 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PKU;
4117
4118 /* Don't enable x87 instructions if only
4119 general registers are allowed. */
4120 if (!(opts_set->x_ix86_target_flags & OPTION_MASK_GENERAL_REGS_ONLY)
4121 && !(opts_set->x_target_flags & MASK_80387))
4122 {
4123 if (processor_alias_table[i].flags & PTA_NO_80387)
4124 opts->x_target_flags &= ~MASK_80387;
4125 else
4126 opts->x_target_flags |= MASK_80387;
4127 }
4128 break;
4129 }
4130
4131 if (TARGET_X32 && (opts->x_ix86_isa_flags2 & OPTION_MASK_ISA_MPX))
4132 error ("Intel MPX does not support x32");
4136
4137 if (i == pta_size)
4138 {
4139 error (main_args_p
4140 ? G_("bad value (%qs) for %<-march=%> switch")
4141 : G_("bad value (%qs) for %<target(\"arch=\")%> attribute"),
4142 opts->x_ix86_arch_string);
4143
4144 auto_vec <const char *> candidates;
4145 for (i = 0; i < pta_size; i++)
4146 if (strcmp (processor_alias_table[i].name, "generic")
4147 && strcmp (processor_alias_table[i].name, "intel")
4148 && (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4149 || (processor_alias_table[i].flags & PTA_64BIT)))
4150 candidates.safe_push (processor_alias_table[i].name);
4151
4152 char *s;
4153 const char *hint
4154 = candidates_list_and_hint (opts->x_ix86_arch_string, s, candidates);
4155 if (hint)
4156 inform (input_location,
4157 main_args_p
4158 ? G_("valid arguments to %<-march=%> switch are: "
4159 "%s; did you mean %qs?")
4160 : G_("valid arguments to %<target(\"arch=\")%> attribute are: "
4161 "%s; did you mean %qs?"), s, hint);
4162 else
4163 inform (input_location,
4164 main_args_p
4165 ? G_("valid arguments to %<-march=%> switch are: %s")
4166 : G_("valid arguments to %<target(\"arch=\")%> attribute "
4167 "are: %s"), s);
4168 XDELETEVEC (s);
4169 }
4170
4171 ix86_arch_mask = 1u << ix86_arch;
4172 for (i = 0; i < X86_ARCH_LAST; ++i)
4173 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
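  /* ix86_arch_features[f] is now true exactly when the bit for the chosen
     -march CPU appears in initial_ix86_arch_features[f], i.e. when that
     architecture-level feature applies to this CPU.  */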
4174
4175 for (i = 0; i < pta_size; i++)
4176 if (! strcmp (opts->x_ix86_tune_string, processor_alias_table[i].name))
4177 {
4178 ix86_schedule = processor_alias_table[i].schedule;
4179 ix86_tune = processor_alias_table[i].processor;
4180 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4181 {
4182 if (!(processor_alias_table[i].flags & PTA_64BIT))
4183 {
4184 if (ix86_tune_defaulted)
4185 {
4186 opts->x_ix86_tune_string = "x86-64";
4187 for (i = 0; i < pta_size; i++)
4188 if (! strcmp (opts->x_ix86_tune_string,
4189 processor_alias_table[i].name))
4190 break;
4191 ix86_schedule = processor_alias_table[i].schedule;
4192 ix86_tune = processor_alias_table[i].processor;
4193 }
4194 else
4195 error ("CPU you selected does not support x86-64 "
4196 "instruction set");
4197 }
4198 }
4199 /* Intel CPUs have always interpreted SSE prefetch instructions as
4200 NOPs; so, we can enable SSE prefetch instructions even when
4201 -mtune (rather than -march) points us to a processor that has them.
4202 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
4203 higher processors. */
4204 if (TARGET_CMOV
4205 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
4206 x86_prefetch_sse = true;
4207 break;
4208 }
4209
4210 if (ix86_tune_specified && i == pta_size)
4211 {
4212 error (main_args_p
4213 ? G_("bad value (%qs) for %<-mtune=%> switch")
4214 : G_("bad value (%qs) for %<target(\"tune=\")%> attribute"),
4215 opts->x_ix86_tune_string);
4216
4217 auto_vec <const char *> candidates;
4218 for (i = 0; i < pta_size; i++)
4219 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags)
4220 || (processor_alias_table[i].flags & PTA_64BIT))
4221 candidates.safe_push (processor_alias_table[i].name);
4222
4223 char *s;
4224 const char *hint
4225 = candidates_list_and_hint (opts->x_ix86_tune_string, s, candidates);
4226 if (hint)
4227 inform (input_location,
4228 main_args_p
4229 ? G_("valid arguments to %<-mtune=%> switch are: "
4230 "%s; did you mean %qs?")
4231 : G_("valid arguments to %<target(\"tune=\")%> attribute are: "
4232 "%s; did you mean %qs?"), s, hint);
4233 else
4234 inform (input_location,
4235 main_args_p
4236 ? G_("valid arguments to %<-mtune=%> switch are: %s")
4237 : G_("valid arguments to %<target(\"tune=\")%> attribute "
4238 "are: %s"), s);
4239 XDELETEVEC (s);
4240 }
4241
4242 set_ix86_tune_features (ix86_tune, opts->x_ix86_dump_tunes);
4243
4244 #ifndef USE_IX86_FRAME_POINTER
4245 #define USE_IX86_FRAME_POINTER 0
4246 #endif
4247
4248 #ifndef USE_X86_64_FRAME_POINTER
4249 #define USE_X86_64_FRAME_POINTER 0
4250 #endif
4251
4252 /* Set the default values for switches whose default depends on TARGET_64BIT
4253 in case they weren't overwritten by command line options. */
4254 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4255 {
4256 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4257 opts->x_flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
4258 if (opts->x_flag_asynchronous_unwind_tables
4259 && !opts_set->x_flag_unwind_tables
4260 && TARGET_64BIT_MS_ABI)
4261 opts->x_flag_unwind_tables = 1;
4262 if (opts->x_flag_asynchronous_unwind_tables == 2)
4263 opts->x_flag_unwind_tables
4264 = opts->x_flag_asynchronous_unwind_tables = 1;
4265 if (opts->x_flag_pcc_struct_return == 2)
4266 opts->x_flag_pcc_struct_return = 0;
4267 }
4268 else
4269 {
4270 if (opts->x_optimize >= 1 && !opts_set->x_flag_omit_frame_pointer)
4271 opts->x_flag_omit_frame_pointer
4272 = !(USE_IX86_FRAME_POINTER || opts->x_optimize_size);
4273 if (opts->x_flag_asynchronous_unwind_tables == 2)
4274 opts->x_flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
4275 if (opts->x_flag_pcc_struct_return == 2)
4276 {
4277 /* Intel MCU psABI specifies that -freg-struct-return should
4278 be on. Instead of setting DEFAULT_PCC_STRUCT_RETURN to 1,
4279 we check -miamcu so that -freg-struct-return is always
4280 turned on if -miamcu is used. */
4281 if (TARGET_IAMCU_P (opts->x_target_flags))
4282 opts->x_flag_pcc_struct_return = 0;
4283 else
4284 opts->x_flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
4285 }
4286 }
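  /* In the defaults above a value of 2 means "not set on the command line":
     flag_asynchronous_unwind_tables and flag_pcc_struct_return start out
     as 2 and are resolved to a real 0/1 value here.  */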
4287
4288 ix86_tune_cost = processor_target_table[ix86_tune].cost;
4289 /* TODO: ix86_cost should be chosen at instruction or function granularity
4290 so that for cold code we can use size_cost even in !optimize_size compilation. */
4291 if (opts->x_optimize_size)
4292 ix86_cost = &ix86_size_cost;
4293 else
4294 ix86_cost = ix86_tune_cost;
4295
4296 /* Arrange to set up i386_stack_locals for all functions. */
4297 init_machine_status = ix86_init_machine_status;
4298
4299 /* Validate -mregparm= value. */
4300 if (opts_set->x_ix86_regparm)
4301 {
4302 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4303 warning (0, "-mregparm is ignored in 64-bit mode");
4304 else if (TARGET_IAMCU_P (opts->x_target_flags))
4305 warning (0, "-mregparm is ignored for Intel MCU psABI");
4306 if (opts->x_ix86_regparm > REGPARM_MAX)
4307 {
4308 error ("-mregparm=%d is not between 0 and %d",
4309 opts->x_ix86_regparm, REGPARM_MAX);
4310 opts->x_ix86_regparm = 0;
4311 }
4312 }
4313 if (TARGET_IAMCU_P (opts->x_target_flags)
4314 || TARGET_64BIT_P (opts->x_ix86_isa_flags))
4315 opts->x_ix86_regparm = REGPARM_MAX;
4316
4317 /* Default align_* from the processor table. */
4318 ix86_default_align (opts);
4319
4320 /* Provide default for -mbranch-cost= value. */
4321 if (!opts_set->x_ix86_branch_cost)
4322 opts->x_ix86_branch_cost = ix86_tune_cost->branch_cost;
4323
4324 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4325 {
4326 opts->x_target_flags
4327 |= TARGET_SUBTARGET64_DEFAULT & ~opts_set->x_target_flags;
4328
4329 /* Enable by default the SSE and MMX builtins. Do allow the user to
4330 explicitly disable any of these. In particular, disabling SSE and
4331 MMX for kernel code is extremely useful. */
4332 if (!ix86_arch_specified)
4333 opts->x_ix86_isa_flags
4334 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
4335 | TARGET_SUBTARGET64_ISA_DEFAULT)
4336 & ~opts->x_ix86_isa_flags_explicit);
4337
4338 if (TARGET_RTD_P (opts->x_target_flags))
4339 warning (0,
4340 main_args_p
4341 ? G_("%<-mrtd%> is ignored in 64bit mode")
4342 : G_("%<target(\"rtd\")%> is ignored in 64bit mode"));
4343 }
4344 else
4345 {
4346 opts->x_target_flags
4347 |= TARGET_SUBTARGET32_DEFAULT & ~opts_set->x_target_flags;
4348
4349 if (!ix86_arch_specified)
4350 opts->x_ix86_isa_flags
4351 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~opts->x_ix86_isa_flags_explicit;
4352
4353 /* The i386 ABI does not specify a red zone.  It still makes sense to use
4354 one when the programmer keeps the area below the stack pointer intact. */
4355 if (!(opts_set->x_target_flags & MASK_NO_RED_ZONE))
4356 opts->x_target_flags |= MASK_NO_RED_ZONE;
4357 }
4358
4359 /* Keep nonleaf frame pointers. */
4360 if (opts->x_flag_omit_frame_pointer)
4361 opts->x_target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
4362 else if (TARGET_OMIT_LEAF_FRAME_POINTER_P (opts->x_target_flags))
4363 opts->x_flag_omit_frame_pointer = 1;
4364
4365 /* If we're doing fast math, we don't care about comparison order
4366 wrt NaNs. This lets us use a shorter comparison sequence. */
4367 if (opts->x_flag_finite_math_only)
4368 opts->x_target_flags &= ~MASK_IEEE_FP;
4369
4370 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
4371 since the insns won't need emulation. */
4372 if (ix86_tune_features [X86_TUNE_ALWAYS_FANCY_MATH_387])
4373 opts->x_target_flags &= ~MASK_NO_FANCY_MATH_387;
4374
4375 /* Likewise, if the target doesn't have a 387, or we've specified
4376 software floating point, don't use 387 inline intrinsics. */
4377 if (!TARGET_80387_P (opts->x_target_flags))
4378 opts->x_target_flags |= MASK_NO_FANCY_MATH_387;
4379
4380 /* Turn on MMX builtins for -msse. */
4381 if (TARGET_SSE_P (opts->x_ix86_isa_flags))
4382 opts->x_ix86_isa_flags
4383 |= OPTION_MASK_ISA_MMX & ~opts->x_ix86_isa_flags_explicit;
4384
4385 /* Enable SSE prefetch. */
4386 if (TARGET_SSE_P (opts->x_ix86_isa_flags)
4387 || (TARGET_PRFCHW_P (opts->x_ix86_isa_flags)
4388 && !TARGET_3DNOW_P (opts->x_ix86_isa_flags))
4389 || TARGET_PREFETCHWT1_P (opts->x_ix86_isa_flags))
4390 x86_prefetch_sse = true;
4391
4392 /* Enable popcnt instruction for -msse4.2 or -mabm. */
4393 if (TARGET_SSE4_2_P (opts->x_ix86_isa_flags)
4394 || TARGET_ABM_P (opts->x_ix86_isa_flags))
4395 opts->x_ix86_isa_flags
4396 |= OPTION_MASK_ISA_POPCNT & ~opts->x_ix86_isa_flags_explicit;
4397
4398 /* Enable lzcnt instruction for -mabm. */
4399 if (TARGET_ABM_P (opts->x_ix86_isa_flags))
4400 opts->x_ix86_isa_flags
4401 |= OPTION_MASK_ISA_LZCNT & ~opts->x_ix86_isa_flags_explicit;
4402
4403 /* Disable BMI, BMI2 and TBM instructions for -m16. */
4404 if (TARGET_16BIT_P (opts->x_ix86_isa_flags))
4405 opts->x_ix86_isa_flags
4406 &= ~((OPTION_MASK_ISA_BMI | OPTION_MASK_ISA_BMI2 | OPTION_MASK_ISA_TBM)
4407 & ~opts->x_ix86_isa_flags_explicit);
4408
4409 /* Validate -mpreferred-stack-boundary= value or default it to
4410 PREFERRED_STACK_BOUNDARY_DEFAULT. */
4411 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
4412 if (opts_set->x_ix86_preferred_stack_boundary_arg)
4413 {
4414 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4415 int max = TARGET_SEH ? 4 : 12;
4416
4417 if (opts->x_ix86_preferred_stack_boundary_arg < min
4418 || opts->x_ix86_preferred_stack_boundary_arg > max)
4419 {
4420 if (min == max)
4421 error ("-mpreferred-stack-boundary is not supported "
4422 "for this target");
4423 else
4424 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
4425 opts->x_ix86_preferred_stack_boundary_arg, min, max);
4426 }
4427 else
4428 ix86_preferred_stack_boundary
4429 = (1 << opts->x_ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
4430 }
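  /* The argument is the log2 of the alignment in bytes, e.g.
     -mpreferred-stack-boundary=4 yields (1 << 4) * BITS_PER_UNIT
     = 128 bits, i.e. the usual 16-byte stack alignment; SEH targets
     cap the argument at 4.  */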
4431
4432 /* Set the default value for -mstackrealign. */
4433 if (!opts_set->x_ix86_force_align_arg_pointer)
4434 opts->x_ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
4435
4436 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
4437
4438 /* Validate -mincoming-stack-boundary= value or default it to
4439 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
4440 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
4441 if (opts_set->x_ix86_incoming_stack_boundary_arg)
4442 {
4443 int min = TARGET_64BIT_P (opts->x_ix86_isa_flags) ? 3 : 2;
4444
4445 if (opts->x_ix86_incoming_stack_boundary_arg < min
4446 || opts->x_ix86_incoming_stack_boundary_arg > 12)
4447 error ("-mincoming-stack-boundary=%d is not between %d and 12",
4448 opts->x_ix86_incoming_stack_boundary_arg, min);
4449 else
4450 {
4451 ix86_user_incoming_stack_boundary
4452 = (1 << opts->x_ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
4453 ix86_incoming_stack_boundary
4454 = ix86_user_incoming_stack_boundary;
4455 }
4456 }
4457
4458 #ifndef NO_PROFILE_COUNTERS
4459 if (flag_nop_mcount)
4460 error ("-mnop-mcount is not compatible with this target");
4461 #endif
4462 if (flag_nop_mcount && flag_pic)
4463 error ("-mnop-mcount is not implemented for -fPIC");
4464
4465 /* Accept -msseregparm only if at least SSE support is enabled. */
4466 if (TARGET_SSEREGPARM_P (opts->x_target_flags)
4467 && ! TARGET_SSE_P (opts->x_ix86_isa_flags))
4468 error (main_args_p
4469 ? G_("%<-msseregparm%> used without SSE enabled")
4470 : G_("%<target(\"sseregparm\")%> used without SSE enabled"));
4471
4472 if (opts_set->x_ix86_fpmath)
4473 {
4474 if (opts->x_ix86_fpmath & FPMATH_SSE)
4475 {
4476 if (!TARGET_SSE_P (opts->x_ix86_isa_flags))
4477 {
4478 if (TARGET_80387_P (opts->x_target_flags))
4479 {
4480 warning (0, "SSE instruction set disabled, using 387 arithmetics");
4481 opts->x_ix86_fpmath = FPMATH_387;
4482 }
4483 }
4484 else if ((opts->x_ix86_fpmath & FPMATH_387)
4485 && !TARGET_80387_P (opts->x_target_flags))
4486 {
4487 warning (0, "387 instruction set disabled, using SSE arithmetics");
4488 opts->x_ix86_fpmath = FPMATH_SSE;
4489 }
4490 }
4491 }
4492 /* For all chips supporting SSE2, -mfpmath=sse performs better than
4493 -mfpmath=387.  The latter is nevertheless the default on many targets,
4494 since the extra 80-bit precision of temporaries is considered part of
4495 the ABI.  Overwrite that default at least for -ffast-math.
4496 TODO: -mfpmath=both seems to produce similarly performing code with
4497 slightly smaller binaries.  It is, however, not clear whether register
4498 allocation is ready for this setting.
4499 Also, -mfpmath=387 is overall considerably more compact (about 4-5%)
4500 than SSE codegen.  We may switch to 387 with -ffast-math for
4501 size-optimized functions. */
4502 else if (fast_math_flags_set_p (&global_options)
4503 && TARGET_SSE2_P (opts->x_ix86_isa_flags))
4504 opts->x_ix86_fpmath = FPMATH_SSE;
4505 else
4506 opts->x_ix86_fpmath = TARGET_FPMATH_DEFAULT_P (opts->x_ix86_isa_flags);
4507
4508 /* Use external vectorized library in vectorizing intrinsics. */
4509 if (opts_set->x_ix86_veclibabi_type)
4510 switch (opts->x_ix86_veclibabi_type)
4511 {
4512 case ix86_veclibabi_type_svml:
4513 ix86_veclib_handler = ix86_veclibabi_svml;
4514 break;
4515
4516 case ix86_veclibabi_type_acml:
4517 ix86_veclib_handler = ix86_veclibabi_acml;
4518 break;
4519
4520 default:
4521 gcc_unreachable ();
4522 }
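  /* ix86_veclib_handler is consulted by the vectorizer to map calls to
     scalar math functions onto entry points in the external SVML or ACML
     vector math libraries selected with -mveclibabi=.  */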
4523
4524 if (ix86_tune_features [X86_TUNE_ACCUMULATE_OUTGOING_ARGS]
4525 && !(opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4526 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4527
4528 /* If stack probes are required, the space used for large function
4529 arguments on the stack must also be probed, so enable
4530 -maccumulate-outgoing-args so this happens in the prologue. */
4531 if (TARGET_STACK_PROBE_P (opts->x_target_flags)
4532 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4533 {
4534 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4535 warning (0,
4536 main_args_p
4537 ? G_("stack probing requires %<-maccumulate-outgoing-args%> "
4538 "for correctness")
4539 : G_("stack probing requires "
4540 "%<target(\"accumulate-outgoing-args\")%> for "
4541 "correctness"));
4542 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4543 }
4544
4545 /* Stack realignment without -maccumulate-outgoing-args requires %ebp,
4546 so enable -maccumulate-outgoing-args when %ebp is fixed. */
4547 if (fixed_regs[BP_REG]
4548 && !(opts->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
4549 {
4550 if (opts_set->x_target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)
4551 warning (0,
4552 main_args_p
4553 ? G_("fixed ebp register requires "
4554 "%<-maccumulate-outgoing-args%>")
4555 : G_("fixed ebp register requires "
4556 "%<target(\"accumulate-outgoing-args\")%>"));
4557 opts->x_target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
4558 }
4559
4560 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
4561 {
4562 char *p;
4563 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
4564 p = strchr (internal_label_prefix, 'X');
4565 internal_label_prefix_len = p - internal_label_prefix;
4566 *p = '\0';
4567 }
4568
4569 /* When the scheduling description is not available, disable the scheduler
4570 pass so it won't slow down compilation and make x87 code slower. */
4571 if (!TARGET_SCHEDULE)
4572 opts->x_flag_schedule_insns_after_reload = opts->x_flag_schedule_insns = 0;
4573
4574 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
4575 ix86_tune_cost->simultaneous_prefetches,
4576 opts->x_param_values,
4577 opts_set->x_param_values);
4578 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
4579 ix86_tune_cost->prefetch_block,
4580 opts->x_param_values,
4581 opts_set->x_param_values);
4582 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
4583 ix86_tune_cost->l1_cache_size,
4584 opts->x_param_values,
4585 opts_set->x_param_values);
4586 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
4587 ix86_tune_cost->l2_cache_size,
4588 opts->x_param_values,
4589 opts_set->x_param_values);
4590
4591 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful. */
4592 if (opts->x_flag_prefetch_loop_arrays < 0
4593 && HAVE_prefetch
4594 && (opts->x_optimize >= 3 || opts->x_flag_profile_use)
4595 && !opts->x_optimize_size
4596 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
4597 opts->x_flag_prefetch_loop_arrays = 1;
4598
4599 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
4600 can be optimized to ap = __builtin_next_arg (0). */
4601 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && !opts->x_flag_split_stack)
4602 targetm.expand_builtin_va_start = NULL;
4603
4604 if (TARGET_64BIT_P (opts->x_ix86_isa_flags))
4605 {
4606 ix86_gen_leave = gen_leave_rex64;
4607 if (Pmode == DImode)
4608 {
4609 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
4610 ix86_gen_tls_local_dynamic_base_64
4611 = gen_tls_local_dynamic_base_64_di;
4612 }
4613 else
4614 {
4615 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
4616 ix86_gen_tls_local_dynamic_base_64
4617 = gen_tls_local_dynamic_base_64_si;
4618 }
4619 }
4620 else
4621 ix86_gen_leave = gen_leave;
4622
4623 if (Pmode == DImode)
4624 {
4625 ix86_gen_add3 = gen_adddi3;
4626 ix86_gen_sub3 = gen_subdi3;
4627 ix86_gen_sub3_carry = gen_subdi3_carry;
4628 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
4629 ix86_gen_andsp = gen_anddi3;
4630 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
4631 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
4632 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
4633 ix86_gen_monitor = gen_sse3_monitor_di;
4634 ix86_gen_monitorx = gen_monitorx_di;
4635 ix86_gen_clzero = gen_clzero_di;
4636 }
4637 else
4638 {
4639 ix86_gen_add3 = gen_addsi3;
4640 ix86_gen_sub3 = gen_subsi3;
4641 ix86_gen_sub3_carry = gen_subsi3_carry;
4642 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
4643 ix86_gen_andsp = gen_andsi3;
4644 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
4645 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
4646 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
4647 ix86_gen_monitor = gen_sse3_monitor_si;
4648 ix86_gen_monitorx = gen_monitorx_si;
4649 ix86_gen_clzero = gen_clzero_si;
4650 }
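  /* The ix86_gen_* hooks bound above select the SImode or DImode RTL
     generators once, so later expansion code can emit pointer-sized adds,
     stack probes, monitor operations etc. without re-checking Pmode.  */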
4651
4652 #ifdef USE_IX86_CLD
4653 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
4654 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags))
4655 opts->x_target_flags |= MASK_CLD & ~opts_set->x_target_flags;
4656 #endif
4657
4658 /* Set the default value for -mfentry. */
4659 if (!opts_set->x_flag_fentry)
4660 opts->x_flag_fentry = TARGET_SEH;
4661 else
4662 {
4663 if (!TARGET_64BIT_P (opts->x_ix86_isa_flags) && opts->x_flag_pic
4664 && opts->x_flag_fentry)
4665 sorry ("-mfentry isn%'t supported for 32-bit in combination "
4666 "with -fpic");
4667 else if (TARGET_SEH && !opts->x_flag_fentry)
4668 sorry ("-mno-fentry isn%'t compatible with SEH");
4669 }
4670
4671 if (TARGET_SEH && TARGET_CALL_MS2SYSV_XLOGUES)
4672 sorry ("-mcall-ms2sysv-xlogues isn%'t currently supported with SEH");
4673
4674 if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
4675 && TARGET_EMIT_VZEROUPPER)
4676 opts->x_target_flags |= MASK_VZEROUPPER;
4677 if (!(opts_set->x_target_flags & MASK_STV))
4678 opts->x_target_flags |= MASK_STV;
4679 /* Disable STV if -mpreferred-stack-boundary={2,3} or
4680 -mincoming-stack-boundary={2,3} or -mstackrealign is used: the needed
4681 stack realignment would be an extra cost the pass does not take into
4682 account, and the pass cannot realign the stack itself. */
4683 if (ix86_preferred_stack_boundary < 128
4684 || ix86_incoming_stack_boundary < 128
4685 || opts->x_ix86_force_align_arg_pointer)
4686 opts->x_target_flags &= ~MASK_STV;
4687 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
4688 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
4689 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
4690 if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
4691 && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
4692 opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
4693
4694 /* Enable 128-bit AVX instruction generation
4695 for the auto-vectorizer. */
4696 if (TARGET_AVX128_OPTIMAL
4697 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4698 opts->x_prefer_vector_width_type = PVW_AVX128;
4699
4700 /* Use 256-bit AVX instruction generation
4701 in the auto-vectorizer. */
4702 if (ix86_tune_features[X86_TUNE_AVX256_OPTIMAL]
4703 && (opts_set->x_prefer_vector_width_type == PVW_NONE))
4704 opts->x_prefer_vector_width_type = PVW_AVX256;
4705
4706 if (opts->x_ix86_recip_name)
4707 {
4708 char *p = ASTRDUP (opts->x_ix86_recip_name);
4709 char *q;
4710 unsigned int mask, i;
4711 bool invert;
4712
4713 while ((q = strtok (p, ",")) != NULL)
4714 {
4715 p = NULL;
4716 if (*q == '!')
4717 {
4718 invert = true;
4719 q++;
4720 }
4721 else
4722 invert = false;
4723
4724 if (!strcmp (q, "default"))
4725 mask = RECIP_MASK_ALL;
4726 else
4727 {
4728 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
4729 if (!strcmp (q, recip_options[i].string))
4730 {
4731 mask = recip_options[i].mask;
4732 break;
4733 }
4734
4735 if (i == ARRAY_SIZE (recip_options))
4736 {
4737 error ("unknown option for -mrecip=%s", q);
4738 invert = false;
4739 mask = RECIP_MASK_NONE;
4740 }
4741 }
4742
4743 opts->x_recip_mask_explicit |= mask;
4744 if (invert)
4745 opts->x_recip_mask &= ~mask;
4746 else
4747 opts->x_recip_mask |= mask;
4748 }
4749 }
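  /* For example, -mrecip=all,!sqrt enables every reciprocal approximation
     except the scalar square-root one: "all" ORs in RECIP_MASK_ALL and the
     leading '!' on "sqrt" then clears RECIP_MASK_SQRT again.  */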
4750
4751 if (TARGET_RECIP_P (opts->x_target_flags))
4752 opts->x_recip_mask |= RECIP_MASK_ALL & ~opts->x_recip_mask_explicit;
4753 else if (opts_set->x_target_flags & MASK_RECIP)
4754 opts->x_recip_mask &= ~(RECIP_MASK_ALL & ~opts->x_recip_mask_explicit);
4755
4756 /* Default long double to 64-bit for 32-bit Bionic and to __float128
4757 for 64-bit Bionic. Also default long double to 64-bit for Intel
4758 MCU psABI. */
4759 if ((TARGET_HAS_BIONIC || TARGET_IAMCU)
4760 && !(opts_set->x_target_flags
4761 & (MASK_LONG_DOUBLE_64 | MASK_LONG_DOUBLE_128)))
4762 opts->x_target_flags |= (TARGET_64BIT
4763 ? MASK_LONG_DOUBLE_128
4764 : MASK_LONG_DOUBLE_64);
4765
4766 /* Only one of them can be active. */
4767 gcc_assert ((opts->x_target_flags & MASK_LONG_DOUBLE_64) == 0
4768 || (opts->x_target_flags & MASK_LONG_DOUBLE_128) == 0);
4769
4770 /* Handle stack protector */
4771 if (!opts_set->x_ix86_stack_protector_guard)
4772 opts->x_ix86_stack_protector_guard
4773 = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
4774
4775 #ifdef TARGET_THREAD_SSP_OFFSET
4776 ix86_stack_protector_guard_offset = TARGET_THREAD_SSP_OFFSET;
4777 #endif
4778
4779 if (global_options_set.x_ix86_stack_protector_guard_offset_str)
4780 {
4781 char *endp;
4782 const char *str = ix86_stack_protector_guard_offset_str;
4783
4784 errno = 0;
4785 int64_t offset;
4786
4787 #if defined(INT64_T_IS_LONG)
4788 offset = strtol (str, &endp, 0);
4789 #else
4790 offset = strtoll (str, &endp, 0);
4791 #endif
4792
4793 if (!*str || *endp || errno)
4794 error ("%qs is not a valid number "
4795 "in -mstack-protector-guard-offset=", str);
4796
4797 if (!IN_RANGE (offset, HOST_WIDE_INT_C (-0x80000000),
4798 HOST_WIDE_INT_C (0x7fffffff)))
4799 error ("%qs is not a valid offset "
4800 "in -mstack-protector-guard-offset=", str);
4801
4802 ix86_stack_protector_guard_offset = offset;
4803 }
4804
4805 ix86_stack_protector_guard_reg = DEFAULT_TLS_SEG_REG;
4806
4807 /* The kernel uses a different segment register for performance
4808 reasons: this way a system call does not have to save and restore the
4809 userspace segment register, which would be expensive. */
4810 if (ix86_cmodel == CM_KERNEL)
4811 ix86_stack_protector_guard_reg = ADDR_SPACE_SEG_GS;
4812
4813 if (global_options_set.x_ix86_stack_protector_guard_reg_str)
4814 {
4815 const char *str = ix86_stack_protector_guard_reg_str;
4816 addr_space_t seg = ADDR_SPACE_GENERIC;
4817
4818 /* Discard optional register prefix. */
4819 if (str[0] == '%')
4820 str++;
4821
4822 if (strlen (str) == 2 && str[1] == 's')
4823 {
4824 if (str[0] == 'f')
4825 seg = ADDR_SPACE_SEG_FS;
4826 else if (str[0] == 'g')
4827 seg = ADDR_SPACE_SEG_GS;
4828 }
4829
4830 if (seg == ADDR_SPACE_GENERIC)
4831 error ("%qs is not a valid base register "
4832 "in -mstack-protector-guard-reg=",
4833 ix86_stack_protector_guard_reg_str);
4834
4835 ix86_stack_protector_guard_reg = seg;
4836 }
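  /* A hypothetical example of how these two options combine: a kernel-style
     build might pass -mstack-protector-guard-reg=gs
     -mstack-protector-guard-offset=40 so the canary is loaded from %gs:40
     instead of the default TLS slot.  */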
4837
4838 /* Handle -mmemcpy-strategy= and -mmemset-strategy= */
4839 if (opts->x_ix86_tune_memcpy_strategy)
4840 {
4841 char *str = xstrdup (opts->x_ix86_tune_memcpy_strategy);
4842 ix86_parse_stringop_strategy_string (str, false);
4843 free (str);
4844 }
4845
4846 if (opts->x_ix86_tune_memset_strategy)
4847 {
4848 char *str = xstrdup (opts->x_ix86_tune_memset_strategy);
4849 ix86_parse_stringop_strategy_string (str, true);
4850 free (str);
4851 }
4852
4853 /* Save the initial options in case the user does function specific
4854 options. */
4855 if (main_args_p)
4856 target_option_default_node = target_option_current_node
4857 = build_target_option_node (opts);
4858
4859 /* Do not support control flow instrumentation if CET is not enabled. */
4860 if (opts->x_flag_cf_protection != CF_NONE)
4861 {
4862 if (!(TARGET_IBT_P (opts->x_ix86_isa_flags2)
4863 || TARGET_SHSTK_P (opts->x_ix86_isa_flags2)))
4864 {
4865 if (flag_cf_protection == CF_FULL)
4866 {
4867 error ("%<-fcf-protection=full%> requires CET support "
4868 "on this target. Use -mcet or one of -mibt, "
4869 "-mshstk options to enable CET");
4870 }
4871 else if (flag_cf_protection == CF_BRANCH)
4872 {
4873 error ("%<-fcf-protection=branch%> requires CET support "
4874 "on this target. Use -mcet or one of -mibt, "
4875 "-mshstk options to enable CET");
4876 }
4877 else if (flag_cf_protection == CF_RETURN)
4878 {
4879 error ("%<-fcf-protection=return%> requires CET support "
4880 "on this target. Use -mcet or one of -mibt, "
4881 "-mshstk options to enable CET");
4882 }
4883 flag_cf_protection = CF_NONE;
4884 return false;
4885 }
4886 opts->x_flag_cf_protection =
4887 (cf_protection_level) (opts->x_flag_cf_protection | CF_SET);
4888 }
4889
4890 return true;
4891 }
4892
4893 /* Implement the TARGET_OPTION_OVERRIDE hook. */
4894
4895 static void
4896 ix86_option_override (void)
4897 {
4898 ix86_option_override_internal (true, &global_options, &global_options_set);
4899 }
4900
4901 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
4902 static char *
4903 ix86_offload_options (void)
4904 {
4905 if (TARGET_LP64)
4906 return xstrdup ("-foffload-abi=lp64");
4907 return xstrdup ("-foffload-abi=ilp32");
4908 }
4909
4910 /* Update register usage after having seen the compiler flags. */
4911
4912 static void
4913 ix86_conditional_register_usage (void)
4914 {
4915 int i, c_mask;
4916
4917 /* If there are no caller-saved registers, preserve all registers
4918 except fixed_regs and the registers used for the function return
4919 value, since aggregate_value_p checks call_used_regs[regno] on the
4920 return value. */
4921 if (cfun && cfun->machine->no_caller_saved_registers)
4922 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4923 if (!fixed_regs[i] && !ix86_function_value_regno_p (i))
4924 call_used_regs[i] = 0;
4925
4926 /* For 32-bit targets, squash the REX registers. */
4927 if (! TARGET_64BIT)
4928 {
4929 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
4930 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4931 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
4932 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4933 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4934 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4935 }
4936
4937 /* See the definition of CALL_USED_REGISTERS in i386.h. */
4938 c_mask = CALL_USED_REGISTERS_MASK (TARGET_64BIT_MS_ABI);
4939
4940 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
4941
4942 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4943 {
4944 /* Set/reset conditionally defined registers from
4945 CALL_USED_REGISTERS initializer. */
4946 if (call_used_regs[i] > 1)
4947 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
4948
4949 /* Calculate registers of CLOBBERED_REGS register set
4950 as call used registers from GENERAL_REGS register set. */
4951 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
4952 && call_used_regs[i])
4953 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
4954 }
4955
4956 /* If MMX is disabled, squash the registers. */
4957 if (! TARGET_MMX)
4958 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4959 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
4960 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4961
4962 /* If SSE is disabled, squash the registers. */
4963 if (! TARGET_SSE)
4964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
4966 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4967
4968 /* If the FPU is disabled, squash the registers. */
4969 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
4970 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4971 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
4972 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4973
4974 /* If AVX512F is disabled, squash the registers. */
4975 if (! TARGET_AVX512F)
4976 {
4977 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
4978 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4979
4980 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
4981 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4982 }
4983
4984 /* If MPX is disabled, squash the registers. */
4985 if (! TARGET_MPX)
4986 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
4987 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
4988 }
4989
4990 /* Canonicalize a comparison from one we don't have to one we do have. */
4991
4992 static void
4993 ix86_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
4994 bool op0_preserve_value)
4995 {
4996 /* The order of operands in x87 ficom compares is forced by combine in
4997 the simplify_comparison () function.  The FLOAT operator is treated as
4998 RTX_OBJ, has precedence over other operators, and is always put in the
4999 first place.  Swap the condition and operands to match the ficom instruction. */
5000 if (!op0_preserve_value
5001 && GET_CODE (*op0) == FLOAT && MEM_P (XEXP (*op0, 0)) && REG_P (*op1))
5002 {
5003 enum rtx_code scode = swap_condition ((enum rtx_code) *code);
5004
5005 /* We are called only for compares that are split to SAHF instruction.
5006 Ensure that we have setcc/jcc insn for the swapped condition. */
5007 if (ix86_fp_compare_code_to_integer (scode) != UNKNOWN)
5008 {
5009 std::swap (*op0, *op1);
5010 *code = (int) scode;
5011 }
5012 }
5013 }
5014 \f
5015 /* Save the current options */
5016
5017 static void
5018 ix86_function_specific_save (struct cl_target_option *ptr,
5019 struct gcc_options *opts)
5020 {
5021 ptr->arch = ix86_arch;
5022 ptr->schedule = ix86_schedule;
5023 ptr->prefetch_sse = x86_prefetch_sse;
5024 ptr->tune = ix86_tune;
5025 ptr->branch_cost = ix86_branch_cost;
5026 ptr->tune_defaulted = ix86_tune_defaulted;
5027 ptr->arch_specified = ix86_arch_specified;
5028 ptr->x_ix86_isa_flags_explicit = opts->x_ix86_isa_flags_explicit;
5029 ptr->x_ix86_isa_flags2_explicit = opts->x_ix86_isa_flags2_explicit;
5030 ptr->x_recip_mask_explicit = opts->x_recip_mask_explicit;
5031 ptr->x_ix86_arch_string = opts->x_ix86_arch_string;
5032 ptr->x_ix86_tune_string = opts->x_ix86_tune_string;
5033 ptr->x_ix86_cmodel = opts->x_ix86_cmodel;
5034 ptr->x_ix86_abi = opts->x_ix86_abi;
5035 ptr->x_ix86_asm_dialect = opts->x_ix86_asm_dialect;
5036 ptr->x_ix86_branch_cost = opts->x_ix86_branch_cost;
5037 ptr->x_ix86_dump_tunes = opts->x_ix86_dump_tunes;
5038 ptr->x_ix86_force_align_arg_pointer = opts->x_ix86_force_align_arg_pointer;
5039 ptr->x_ix86_force_drap = opts->x_ix86_force_drap;
5040 ptr->x_ix86_incoming_stack_boundary_arg = opts->x_ix86_incoming_stack_boundary_arg;
5041 ptr->x_ix86_pmode = opts->x_ix86_pmode;
5042 ptr->x_ix86_preferred_stack_boundary_arg = opts->x_ix86_preferred_stack_boundary_arg;
5043 ptr->x_ix86_recip_name = opts->x_ix86_recip_name;
5044 ptr->x_ix86_regparm = opts->x_ix86_regparm;
5045 ptr->x_ix86_section_threshold = opts->x_ix86_section_threshold;
5046 ptr->x_ix86_sse2avx = opts->x_ix86_sse2avx;
5047 ptr->x_ix86_stack_protector_guard = opts->x_ix86_stack_protector_guard;
5048 ptr->x_ix86_stringop_alg = opts->x_ix86_stringop_alg;
5049 ptr->x_ix86_tls_dialect = opts->x_ix86_tls_dialect;
5050 ptr->x_ix86_tune_ctrl_string = opts->x_ix86_tune_ctrl_string;
5051 ptr->x_ix86_tune_memcpy_strategy = opts->x_ix86_tune_memcpy_strategy;
5052 ptr->x_ix86_tune_memset_strategy = opts->x_ix86_tune_memset_strategy;
5053 ptr->x_ix86_tune_no_default = opts->x_ix86_tune_no_default;
5054 ptr->x_ix86_veclibabi_type = opts->x_ix86_veclibabi_type;
5055
5056 /* The fields are char but the variables are not; make sure the
5057 values fit in the fields. */
5058 gcc_assert (ptr->arch == ix86_arch);
5059 gcc_assert (ptr->schedule == ix86_schedule);
5060 gcc_assert (ptr->tune == ix86_tune);
5061 gcc_assert (ptr->branch_cost == ix86_branch_cost);
5062 }
5063
5064 /* Restore the current options */
5065
5066 static void
5067 ix86_function_specific_restore (struct gcc_options *opts,
5068 struct cl_target_option *ptr)
5069 {
5070 enum processor_type old_tune = ix86_tune;
5071 enum processor_type old_arch = ix86_arch;
5072 unsigned int ix86_arch_mask;
5073 int i;
5074
5075 /* We don't change -fPIC. */
5076 opts->x_flag_pic = flag_pic;
5077
5078 ix86_arch = (enum processor_type) ptr->arch;
5079 ix86_schedule = (enum attr_cpu) ptr->schedule;
5080 ix86_tune = (enum processor_type) ptr->tune;
5081 x86_prefetch_sse = ptr->prefetch_sse;
5082 opts->x_ix86_branch_cost = ptr->branch_cost;
5083 ix86_tune_defaulted = ptr->tune_defaulted;
5084 ix86_arch_specified = ptr->arch_specified;
5085 opts->x_ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
5086 opts->x_ix86_isa_flags2_explicit = ptr->x_ix86_isa_flags2_explicit;
5087 opts->x_recip_mask_explicit = ptr->x_recip_mask_explicit;
5088 opts->x_ix86_arch_string = ptr->x_ix86_arch_string;
5089 opts->x_ix86_tune_string = ptr->x_ix86_tune_string;
5090 opts->x_ix86_cmodel = ptr->x_ix86_cmodel;
5091 opts->x_ix86_abi = ptr->x_ix86_abi;
5092 opts->x_ix86_asm_dialect = ptr->x_ix86_asm_dialect;
5093 opts->x_ix86_branch_cost = ptr->x_ix86_branch_cost;
5094 opts->x_ix86_dump_tunes = ptr->x_ix86_dump_tunes;
5095 opts->x_ix86_force_align_arg_pointer = ptr->x_ix86_force_align_arg_pointer;
5096 opts->x_ix86_force_drap = ptr->x_ix86_force_drap;
5097 opts->x_ix86_incoming_stack_boundary_arg = ptr->x_ix86_incoming_stack_boundary_arg;
5098 opts->x_ix86_pmode = ptr->x_ix86_pmode;
5099 opts->x_ix86_preferred_stack_boundary_arg = ptr->x_ix86_preferred_stack_boundary_arg;
5100 opts->x_ix86_recip_name = ptr->x_ix86_recip_name;
5101 opts->x_ix86_regparm = ptr->x_ix86_regparm;
5102 opts->x_ix86_section_threshold = ptr->x_ix86_section_threshold;
5103 opts->x_ix86_sse2avx = ptr->x_ix86_sse2avx;
5104 opts->x_ix86_stack_protector_guard = ptr->x_ix86_stack_protector_guard;
5105 opts->x_ix86_stringop_alg = ptr->x_ix86_stringop_alg;
5106 opts->x_ix86_tls_dialect = ptr->x_ix86_tls_dialect;
5107 opts->x_ix86_tune_ctrl_string = ptr->x_ix86_tune_ctrl_string;
5108 opts->x_ix86_tune_memcpy_strategy = ptr->x_ix86_tune_memcpy_strategy;
5109 opts->x_ix86_tune_memset_strategy = ptr->x_ix86_tune_memset_strategy;
5110 opts->x_ix86_tune_no_default = ptr->x_ix86_tune_no_default;
5111 opts->x_ix86_veclibabi_type = ptr->x_ix86_veclibabi_type;
5112 ix86_tune_cost = processor_target_table[ix86_tune].cost;
5113 /* TODO: ix86_cost should be chosen at instruction or function granularity
5114 so that for cold code we use size_cost even in !optimize_size compilation. */
5115 if (opts->x_optimize_size)
5116 ix86_cost = &ix86_size_cost;
5117 else
5118 ix86_cost = ix86_tune_cost;
5119
5120 /* Recreate the arch feature tests if the arch changed */
5121 if (old_arch != ix86_arch)
5122 {
5123 ix86_arch_mask = 1u << ix86_arch;
5124 for (i = 0; i < X86_ARCH_LAST; ++i)
5125 ix86_arch_features[i]
5126 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
5127 }
5128
5129 /* Recreate the tune optimization tests */
5130 if (old_tune != ix86_tune)
5131 set_ix86_tune_features (ix86_tune, false);
5132 }
5133
5134 /* Adjust target options after streaming them in. This is mainly about
5135 reconciling them with global options. */
5136
5137 static void
5138 ix86_function_specific_post_stream_in (struct cl_target_option *ptr)
5139 {
5140 /* flag_pic is a global option, but ix86_cmodel is target saved option
5141 partly computed from flag_pic. If flag_pic is on, adjust x_ix86_cmodel
5142 for PIC, or error out. */
5143 if (flag_pic)
5144 switch (ptr->x_ix86_cmodel)
5145 {
5146 case CM_SMALL:
5147 ptr->x_ix86_cmodel = CM_SMALL_PIC;
5148 break;
5149
5150 case CM_MEDIUM:
5151 ptr->x_ix86_cmodel = CM_MEDIUM_PIC;
5152 break;
5153
5154 case CM_LARGE:
5155 ptr->x_ix86_cmodel = CM_LARGE_PIC;
5156 break;
5157
5158 case CM_KERNEL:
5159 error ("code model %s does not support PIC mode", "kernel");
5160 break;
5161
5162 default:
5163 break;
5164 }
5165 else
5166 switch (ptr->x_ix86_cmodel)
5167 {
5168 case CM_SMALL_PIC:
5169 ptr->x_ix86_cmodel = CM_SMALL;
5170 break;
5171
5172 case CM_MEDIUM_PIC:
5173 ptr->x_ix86_cmodel = CM_MEDIUM;
5174 break;
5175
5176 case CM_LARGE_PIC:
5177 ptr->x_ix86_cmodel = CM_LARGE;
5178 break;
5179
5180 default:
5181 break;
5182 }
5183 }
5184
5185 /* Print the current options */
5186
5187 static void
5188 ix86_function_specific_print (FILE *file, int indent,
5189 struct cl_target_option *ptr)
5190 {
5191 char *target_string
5192 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_ix86_isa_flags2,
5193 ptr->x_target_flags, ptr->x_ix86_target_flags,
5194 NULL, NULL, ptr->x_ix86_fpmath, false);
5195
5196 gcc_assert (ptr->arch < PROCESSOR_max);
5197 fprintf (file, "%*sarch = %d (%s)\n",
5198 indent, "",
5199 ptr->arch, processor_target_table[ptr->arch].name);
5200
5201 gcc_assert (ptr->tune < PROCESSOR_max);
5202 fprintf (file, "%*stune = %d (%s)\n",
5203 indent, "",
5204 ptr->tune, processor_target_table[ptr->tune].name);
5205
5206 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
5207
5208 if (target_string)
5209 {
5210 fprintf (file, "%*s%s\n", indent, "", target_string);
5211 free (target_string);
5212 }
5213 }
5214
5215 \f
5216 /* Inner function to process the attribute((target(...))), take an argument and
5217 set the current options from the argument. If we have a list, recursively go
5218 over the list. */
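
/* Illustrative usage only (not part of this file): the attribute accepts a
   comma-separated list of the entries recognized in the table below, with a
   "no-" prefix clearing the corresponding ISA flag, e.g.

     __attribute__((target ("avx2,no-sse4a,fpmath=sse,arch=haswell")))
     void foo (void);

   "arch=haswell" is just an example value for the arch= string option.  */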
5219
5220 static bool
5221 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
5222 struct gcc_options *opts,
5223 struct gcc_options *opts_set,
5224 struct gcc_options *enum_opts_set)
5225 {
5226 char *next_optstr;
5227 bool ret = true;
5228
5229 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
5230 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
5231 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
5232 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
5233 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
5234
5235 enum ix86_opt_type
5236 {
5237 ix86_opt_unknown,
5238 ix86_opt_yes,
5239 ix86_opt_no,
5240 ix86_opt_str,
5241 ix86_opt_enum,
5242 ix86_opt_isa
5243 };
5244
5245 static const struct
5246 {
5247 const char *string;
5248 size_t len;
5249 enum ix86_opt_type type;
5250 int opt;
5251 int mask;
5252 } attrs[] = {
5253 /* isa options */
5254 IX86_ATTR_ISA ("sgx", OPT_msgx),
5255 IX86_ATTR_ISA ("avx5124fmaps", OPT_mavx5124fmaps),
5256 IX86_ATTR_ISA ("avx5124vnniw", OPT_mavx5124vnniw),
5257 IX86_ATTR_ISA ("avx512vpopcntdq", OPT_mavx512vpopcntdq),
5258 IX86_ATTR_ISA ("avx512vbmi2", OPT_mavx512vbmi2),
5259
5260 IX86_ATTR_ISA ("avx512vbmi", OPT_mavx512vbmi),
5261 IX86_ATTR_ISA ("avx512ifma", OPT_mavx512ifma),
5262 IX86_ATTR_ISA ("avx512vl", OPT_mavx512vl),
5263 IX86_ATTR_ISA ("avx512bw", OPT_mavx512bw),
5264 IX86_ATTR_ISA ("avx512dq", OPT_mavx512dq),
5265 IX86_ATTR_ISA ("avx512er", OPT_mavx512er),
5266 IX86_ATTR_ISA ("avx512pf", OPT_mavx512pf),
5267 IX86_ATTR_ISA ("avx512cd", OPT_mavx512cd),
5268 IX86_ATTR_ISA ("avx512f", OPT_mavx512f),
5269 IX86_ATTR_ISA ("avx2", OPT_mavx2),
5270 IX86_ATTR_ISA ("fma", OPT_mfma),
5271 IX86_ATTR_ISA ("xop", OPT_mxop),
5272 IX86_ATTR_ISA ("fma4", OPT_mfma4),
5273 IX86_ATTR_ISA ("f16c", OPT_mf16c),
5274 IX86_ATTR_ISA ("avx", OPT_mavx),
5275 IX86_ATTR_ISA ("sse4", OPT_msse4),
5276 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
5277 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
5278 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
5279 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
5280 IX86_ATTR_ISA ("sse3", OPT_msse3),
5281 IX86_ATTR_ISA ("aes", OPT_maes),
5282 IX86_ATTR_ISA ("sha", OPT_msha),
5283 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
5284 IX86_ATTR_ISA ("sse2", OPT_msse2),
5285 IX86_ATTR_ISA ("sse", OPT_msse),
5286 IX86_ATTR_ISA ("3dnowa", OPT_m3dnowa),
5287 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
5288 IX86_ATTR_ISA ("mmx", OPT_mmmx),
5289 IX86_ATTR_ISA ("rtm", OPT_mrtm),
5290 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
5291 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
5292 IX86_ATTR_ISA ("adx", OPT_madx),
5293 IX86_ATTR_ISA ("prefetchwt1", OPT_mprefetchwt1),
5294 IX86_ATTR_ISA ("clflushopt", OPT_mclflushopt),
5295 IX86_ATTR_ISA ("xsaves", OPT_mxsaves),
5296 IX86_ATTR_ISA ("xsavec", OPT_mxsavec),
5297 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
5298 IX86_ATTR_ISA ("xsave", OPT_mxsave),
5299 IX86_ATTR_ISA ("abm", OPT_mabm),
5300 IX86_ATTR_ISA ("bmi", OPT_mbmi),
5301 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
5302 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
5303 IX86_ATTR_ISA ("tbm", OPT_mtbm),
5304 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
5305 IX86_ATTR_ISA ("cx16", OPT_mcx16),
5306 IX86_ATTR_ISA ("sahf", OPT_msahf),
5307 IX86_ATTR_ISA ("movbe", OPT_mmovbe),
5308 IX86_ATTR_ISA ("crc32", OPT_mcrc32),
5309 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
5310 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
5311 IX86_ATTR_ISA ("mwaitx", OPT_mmwaitx),
5312 IX86_ATTR_ISA ("clzero", OPT_mclzero),
5313 IX86_ATTR_ISA ("pku", OPT_mpku),
5314 IX86_ATTR_ISA ("lwp", OPT_mlwp),
5315 IX86_ATTR_ISA ("hle", OPT_mhle),
5316 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
5317 IX86_ATTR_ISA ("mpx", OPT_mmpx),
5318 IX86_ATTR_ISA ("clwb", OPT_mclwb),
5319 IX86_ATTR_ISA ("rdpid", OPT_mrdpid),
5320 IX86_ATTR_ISA ("gfni", OPT_mgfni),
5321 IX86_ATTR_ISA ("ibt", OPT_mibt),
5322 IX86_ATTR_ISA ("shstk", OPT_mshstk),
5323
5324 /* enum options */
5325 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
5326
5327 /* string options */
5328 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
5329 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
5330
5331 /* flag options */
5332 IX86_ATTR_YES ("cld",
5333 OPT_mcld,
5334 MASK_CLD),
5335
5336 IX86_ATTR_NO ("fancy-math-387",
5337 OPT_mfancy_math_387,
5338 MASK_NO_FANCY_MATH_387),
5339
5340 IX86_ATTR_YES ("ieee-fp",
5341 OPT_mieee_fp,
5342 MASK_IEEE_FP),
5343
5344 IX86_ATTR_YES ("inline-all-stringops",
5345 OPT_minline_all_stringops,
5346 MASK_INLINE_ALL_STRINGOPS),
5347
5348 IX86_ATTR_YES ("inline-stringops-dynamically",
5349 OPT_minline_stringops_dynamically,
5350 MASK_INLINE_STRINGOPS_DYNAMICALLY),
5351
5352 IX86_ATTR_NO ("align-stringops",
5353 OPT_mno_align_stringops,
5354 MASK_NO_ALIGN_STRINGOPS),
5355
5356 IX86_ATTR_YES ("recip",
5357 OPT_mrecip,
5358 MASK_RECIP),
5359
5360 };
5361
5362 /* If this is a list, recurse to get the options. */
5363 if (TREE_CODE (args) == TREE_LIST)
5364 {
5365 bool ret = true;
5366
5367 for (; args; args = TREE_CHAIN (args))
5368 if (TREE_VALUE (args)
5369 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
5370 p_strings, opts, opts_set,
5371 enum_opts_set))
5372 ret = false;
5373
5374 return ret;
5375 }
5376
5377 else if (TREE_CODE (args) != STRING_CST)
5378 {
5379 error ("attribute %<target%> argument not a string");
5380 return false;
5381 }
5382
5383 /* Handle multiple arguments separated by commas. */
5384 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
5385
5386 while (next_optstr && *next_optstr != '\0')
5387 {
5388 char *p = next_optstr;
5389 char *orig_p = p;
5390 char *comma = strchr (next_optstr, ',');
5391 const char *opt_string;
5392 size_t len, opt_len;
5393 int opt;
5394 bool opt_set_p;
5395 char ch;
5396 unsigned i;
5397 enum ix86_opt_type type = ix86_opt_unknown;
5398 int mask = 0;
5399
5400 if (comma)
5401 {
5402 *comma = '\0';
5403 len = comma - next_optstr;
5404 next_optstr = comma + 1;
5405 }
5406 else
5407 {
5408 len = strlen (p);
5409 next_optstr = NULL;
5410 }
5411
5412 /* Recognize no-xxx. */
5413 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
5414 {
5415 opt_set_p = false;
5416 p += 3;
5417 len -= 3;
5418 }
5419 else
5420 opt_set_p = true;
5421
5422 /* Find the option. */
5423 ch = *p;
5424 opt = N_OPTS;
5425 for (i = 0; i < ARRAY_SIZE (attrs); i++)
5426 {
5427 type = attrs[i].type;
5428 opt_len = attrs[i].len;
5429 if (ch == attrs[i].string[0]
5430 && ((type != ix86_opt_str && type != ix86_opt_enum)
5431 ? len == opt_len
5432 : len > opt_len)
5433 && memcmp (p, attrs[i].string, opt_len) == 0)
5434 {
5435 opt = attrs[i].opt;
5436 mask = attrs[i].mask;
5437 opt_string = attrs[i].string;
5438 break;
5439 }
5440 }
5441
5442 /* Process the option. */
5443 if (opt == N_OPTS)
5444 {
5445 error ("attribute(target(\"%s\")) is unknown", orig_p);
5446 ret = false;
5447 }
5448
5449 else if (type == ix86_opt_isa)
5450 {
5451 struct cl_decoded_option decoded;
5452
5453 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
5454 ix86_handle_option (opts, opts_set,
5455 &decoded, input_location);
5456 }
5457
5458 else if (type == ix86_opt_yes || type == ix86_opt_no)
5459 {
5460 if (type == ix86_opt_no)
5461 opt_set_p = !opt_set_p;
5462
5463 if (opt_set_p)
5464 opts->x_target_flags |= mask;
5465 else
5466 opts->x_target_flags &= ~mask;
5467 }
5468
5469 else if (type == ix86_opt_str)
5470 {
5471 if (p_strings[opt])
5472 {
5473 error ("option(\"%s\") was already specified", opt_string);
5474 ret = false;
5475 }
5476 else
5477 p_strings[opt] = xstrdup (p + opt_len);
5478 }
5479
5480 else if (type == ix86_opt_enum)
5481 {
5482 bool arg_ok;
5483 int value;
5484
5485 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
5486 if (arg_ok)
5487 set_option (opts, enum_opts_set, opt, value,
5488 p + opt_len, DK_UNSPECIFIED, input_location,
5489 global_dc);
5490 else
5491 {
5492 error ("attribute(target(\"%s\")) is unknown", orig_p);
5493 ret = false;
5494 }
5495 }
5496
5497 else
5498 gcc_unreachable ();
5499 }
5500
5501 return ret;
5502 }
5503
5504 /* Release allocated strings. */
5505 static void
5506 release_options_strings (char **option_strings)
5507 {
5508 /* Free up memory allocated to hold the strings */
5509 for (unsigned i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
5510 free (option_strings[i]);
5511 }
5512
5513 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
5514
5515 tree
5516 ix86_valid_target_attribute_tree (tree args,
5517 struct gcc_options *opts,
5518 struct gcc_options *opts_set)
5519 {
5520 const char *orig_arch_string = opts->x_ix86_arch_string;
5521 const char *orig_tune_string = opts->x_ix86_tune_string;
5522 enum fpmath_unit orig_fpmath_set = opts_set->x_ix86_fpmath;
5523 int orig_tune_defaulted = ix86_tune_defaulted;
5524 int orig_arch_specified = ix86_arch_specified;
5525 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
5526 tree t = NULL_TREE;
5527 struct cl_target_option *def
5528 = TREE_TARGET_OPTION (target_option_default_node);
5529 struct gcc_options enum_opts_set;
5530
5531 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
5532
5533 /* Process each of the options on the chain. */
5534 if (! ix86_valid_target_attribute_inner_p (args, option_strings, opts,
5535 opts_set, &enum_opts_set))
5536 return error_mark_node;
5537
5538 /* If the changed options are different from the default, rerun
5539 ix86_option_override_internal, and then save the options away.
5540 The string options are attribute options, and will be undone
5541 when we copy the save structure. */
5542 if (opts->x_ix86_isa_flags != def->x_ix86_isa_flags
5543 || opts->x_ix86_isa_flags2 != def->x_ix86_isa_flags2
5544 || opts->x_target_flags != def->x_target_flags
5545 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
5546 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
5547 || enum_opts_set.x_ix86_fpmath)
5548 {
5549 /* If we are using the default tune= or arch=, undo the string assigned,
5550 and use the default. */
5551 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
5552 {
5553 opts->x_ix86_arch_string
5554 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_ARCH]);
5555
5556 /* If arch= is set, clear all bits in x_ix86_isa_flags,
5557 except for ISA_64BIT, ABI_64, ABI_X32, and CODE16. */
5558 opts->x_ix86_isa_flags &= (OPTION_MASK_ISA_64BIT
5559 | OPTION_MASK_ABI_64
5560 | OPTION_MASK_ABI_X32
5561 | OPTION_MASK_CODE16);
5562 opts->x_ix86_isa_flags2 = 0;
5563 }
5564 else if (!orig_arch_specified)
5565 opts->x_ix86_arch_string = NULL;
5566
5567 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
5568 opts->x_ix86_tune_string
5569 = ggc_strdup (option_strings[IX86_FUNCTION_SPECIFIC_TUNE]);
5570 else if (orig_tune_defaulted)
5571 opts->x_ix86_tune_string = NULL;
5572
5573 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
5574 if (enum_opts_set.x_ix86_fpmath)
5575 opts_set->x_ix86_fpmath = (enum fpmath_unit) 1;
5576
5577 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
5578 bool r = ix86_option_override_internal (false, opts, opts_set);
5579 if (!r)
5580 {
5581 release_options_strings (option_strings);
5582 return error_mark_node;
5583 }
5584
5585 /* Add any builtin functions with the new isa if any. */
5586 ix86_add_new_builtins (opts->x_ix86_isa_flags, opts->x_ix86_isa_flags2);
5587
5588 /* Save the current options unless we are validating options for
5589 #pragma. */
5590 t = build_target_option_node (opts);
5591
5592 opts->x_ix86_arch_string = orig_arch_string;
5593 opts->x_ix86_tune_string = orig_tune_string;
5594 opts_set->x_ix86_fpmath = orig_fpmath_set;
5595
5596 release_options_strings (option_strings);
5597 }
5598
5599 return t;
5600 }
5601
5602 /* Hook to validate attribute((target("string"))). */
5603
5604 static bool
5605 ix86_valid_target_attribute_p (tree fndecl,
5606 tree ARG_UNUSED (name),
5607 tree args,
5608 int ARG_UNUSED (flags))
5609 {
5610 struct gcc_options func_options;
5611 tree new_target, new_optimize;
5612 bool ret = true;
5613
5614 /* attribute((target("default"))) does nothing, beyond
5615 affecting multi-versioning. */
5616 if (TREE_VALUE (args)
5617 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
5618 && TREE_CHAIN (args) == NULL_TREE
5619 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
5620 return true;
5621
5622 tree old_optimize = build_optimization_node (&global_options);
5623
5624 /* Get the optimization options of the current function. */
5625 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
5626
5627 if (!func_optimize)
5628 func_optimize = old_optimize;
5629
5630 /* Init func_options. */
5631 memset (&func_options, 0, sizeof (func_options));
5632 init_options_struct (&func_options, NULL);
5633 lang_hooks.init_options_struct (&func_options);
5634
5635 cl_optimization_restore (&func_options,
5636 TREE_OPTIMIZATION (func_optimize));
5637
5638 /* Initialize func_options to the default before its target options can
5639 be set. */
5640 cl_target_option_restore (&func_options,
5641 TREE_TARGET_OPTION (target_option_default_node));
5642
5643 new_target = ix86_valid_target_attribute_tree (args, &func_options,
5644 &global_options_set);
5645
5646 new_optimize = build_optimization_node (&func_options);
5647
5648 if (new_target == error_mark_node)
5649 ret = false;
5650
5651 else if (fndecl && new_target)
5652 {
5653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
5654
5655 if (old_optimize != new_optimize)
5656 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
5657 }
5658
5659 finalize_options_struct (&func_options);
5660
5661 return ret;
5662 }
5663
5664 \f
5665 /* Hook to determine if one function can safely inline another. */
5666
5667 static bool
5668 ix86_can_inline_p (tree caller, tree callee)
5669 {
5670 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
5671 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
5672 if (!callee_tree)
5673 callee_tree = target_option_default_node;
5674 if (!caller_tree)
5675 caller_tree = target_option_default_node;
5676 if (callee_tree == caller_tree)
5677 return true;
5678
5679 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
5680 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
5681 bool ret = false;
5682
5683 /* Callee's ISA options should be a subset of the caller's, i.e. an SSE4
5684 function can inline an SSE2 function but an SSE2 function can't inline
5685 an SSE4 function. */
5686 if (((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
5687 != callee_opts->x_ix86_isa_flags)
5688 || ((caller_opts->x_ix86_isa_flags2 & callee_opts->x_ix86_isa_flags2)
5689 != callee_opts->x_ix86_isa_flags2))
5690 ret = false;
5691
5692 /* See if we have the same non-isa options. */
5693 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
5694 ret = false;
5695
5696 /* See if arch, tune, etc. are the same. */
5697 else if (caller_opts->arch != callee_opts->arch)
5698 ret = false;
5699
5700 else if (caller_opts->tune != callee_opts->tune)
5701 ret = false;
5702
5703 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
5704 /* If the callee doesn't use FP expressions, differences in
5705 ix86_fpmath can be ignored. We are called from FEs
5706 for multi-versioning call optimization, so beware of
5707 ipa_fn_summaries not available. */
5708 && (! ipa_fn_summaries
5709 || ipa_fn_summaries->get
5710 (cgraph_node::get (callee))->fp_expressions))
5711 ret = false;
5712
5713 else if (caller_opts->branch_cost != callee_opts->branch_cost)
5714 ret = false;
5715
5716 else
5717 ret = true;
5718
5719 return ret;
5720 }
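
/* A minimal sketch of the subset rule above (assuming the usual ISA
   implications, e.g. that "avx2" also enables the SSE4.2 flags):

     __attribute__((target ("sse4.2"))) static int callee (int x);
     __attribute__((target ("avx2"))) int caller (int x)
     { return callee (x); }        /* callee's ISA flags are a subset: OK */

   The reverse pairing is rejected, because the callee would require ISA
   flags the caller does not guarantee.  */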
5721
5722 \f
5723 /* Remember the last target of ix86_set_current_function. */
5724 static GTY(()) tree ix86_previous_fndecl;
5725
5726 /* Set targets globals to the default (or current #pragma GCC target
5727 if active). Invalidate ix86_previous_fndecl cache. */
5728
5729 void
5730 ix86_reset_previous_fndecl (void)
5731 {
5732 tree new_tree = target_option_current_node;
5733 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5734 if (TREE_TARGET_GLOBALS (new_tree))
5735 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5736 else if (new_tree == target_option_default_node)
5737 restore_target_globals (&default_target_globals);
5738 else
5739 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5740 ix86_previous_fndecl = NULL_TREE;
5741 }
5742
5743 /* Set the func_type field from the function FNDECL. */
5744
5745 static void
5746 ix86_set_func_type (tree fndecl)
5747 {
5748 if (cfun->machine->func_type == TYPE_UNKNOWN)
5749 {
5750 if (lookup_attribute ("interrupt",
5751 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5752 {
5753 if (ix86_function_naked (fndecl))
5754 error_at (DECL_SOURCE_LOCATION (fndecl),
5755 "interrupt and naked attributes are not compatible");
5756
5757 int nargs = 0;
5758 for (tree arg = DECL_ARGUMENTS (fndecl);
5759 arg;
5760 arg = TREE_CHAIN (arg))
5761 nargs++;
5762 cfun->machine->no_caller_saved_registers = true;
5763 cfun->machine->func_type
5764 = nargs == 2 ? TYPE_EXCEPTION : TYPE_INTERRUPT;
5765
5766 ix86_optimize_mode_switching[X86_DIRFLAG] = 1;
5767
5768 /* Only dwarf2out.c can handle -WORD(AP) as a pointer argument. */
5769 if (write_symbols != NO_DEBUG && write_symbols != DWARF2_DEBUG)
5770 sorry ("Only DWARF debug format is supported for interrupt "
5771 "service routine.");
5772 }
5773 else
5774 {
5775 cfun->machine->func_type = TYPE_NORMAL;
5776 if (lookup_attribute ("no_caller_saved_registers",
5777 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
5778 cfun->machine->no_caller_saved_registers = true;
5779 }
5780 }
5781 }
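
/* For reference, per the GCC x86 interrupt-attribute documentation
   (uword_t is the word-sized unsigned integer type):

     __attribute__((interrupt))
     void isr (struct interrupt_frame *frame);                 /* TYPE_INTERRUPT */

     __attribute__((interrupt))
     void fault (struct interrupt_frame *frame, uword_t error_code);
                                                               /* TYPE_EXCEPTION */

   which is why a two-argument handler is classified as TYPE_EXCEPTION
   above.  */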
5782
5783 /* Establish appropriate back-end context for processing the function
5784 FNDECL. The argument might be NULL to indicate processing at top
5785 level, outside of any function scope. */
5786 static void
5787 ix86_set_current_function (tree fndecl)
5788 {
5789 /* Only change the context if the function changes. This hook is called
5790 several times in the course of compiling a function, and we don't want to
5791 slow things down too much or call target_reinit when it isn't safe. */
5792 if (fndecl == ix86_previous_fndecl)
5793 {
5794 /* There may be 2 function bodies for the same function FNDECL,
5795 one is extern inline and one isn't. Call ix86_set_func_type
5796 to set the func_type field. */
5797 if (fndecl != NULL_TREE)
5798 ix86_set_func_type (fndecl);
5799 return;
5800 }
5801
5802 tree old_tree;
5803 if (ix86_previous_fndecl == NULL_TREE)
5804 old_tree = target_option_current_node;
5805 else if (DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl))
5806 old_tree = DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl);
5807 else
5808 old_tree = target_option_default_node;
5809
5810 if (fndecl == NULL_TREE)
5811 {
5812 if (old_tree != target_option_current_node)
5813 ix86_reset_previous_fndecl ();
5814 return;
5815 }
5816
5817 ix86_set_func_type (fndecl);
5818
5819 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
5820 if (new_tree == NULL_TREE)
5821 new_tree = target_option_default_node;
5822
5823 if (old_tree != new_tree)
5824 {
5825 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
5826 if (TREE_TARGET_GLOBALS (new_tree))
5827 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
5828 else if (new_tree == target_option_default_node)
5829 restore_target_globals (&default_target_globals);
5830 else
5831 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
5832 }
5833 ix86_previous_fndecl = fndecl;
5834
5835 static bool prev_no_caller_saved_registers;
5836
5837 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5838 Avoid expensive re-initialization of init_regs each time we switch
5839 function context. */
5840 if (TARGET_64BIT
5841 && (call_used_regs[SI_REG]
5842 == (cfun->machine->call_abi == MS_ABI)))
5843 reinit_regs ();
5844 /* Need to re-initialize init_regs if caller-saved registers are
5845 changed. */
5846 else if (prev_no_caller_saved_registers
5847 != cfun->machine->no_caller_saved_registers)
5848 reinit_regs ();
5849
5850 if (cfun->machine->func_type != TYPE_NORMAL
5851 || cfun->machine->no_caller_saved_registers)
5852 {
5853 /* Don't allow MPX, SSE, MMX nor x87 instructions since they
5854 may change processor state. */
5855 const char *isa;
5856 if (TARGET_MPX)
5857 isa = "MPX";
5858 else if (TARGET_SSE)
5859 isa = "SSE";
5860 else if (TARGET_MMX)
5861 isa = "MMX/3Dnow";
5862 else if (TARGET_80387)
5863 isa = "80387";
5864 else
5865 isa = NULL;
5866 if (isa != NULL)
5867 {
5868 if (cfun->machine->func_type != TYPE_NORMAL)
5869 sorry ("%s instructions aren't allowed in %s service routine",
5870 isa, (cfun->machine->func_type == TYPE_EXCEPTION
5871 ? "exception" : "interrupt"));
5872 else
5873 sorry ("%s instructions aren't allowed in function with "
5874 "no_caller_saved_registers attribute", isa);
5875 /* Don't issue the same error twice. */
5876 cfun->machine->func_type = TYPE_NORMAL;
5877 cfun->machine->no_caller_saved_registers = false;
5878 }
5879 }
5880
5881 prev_no_caller_saved_registers
5882 = cfun->machine->no_caller_saved_registers;
5883 }
5884
5885 \f
5886 /* Return true if this goes in large data/bss. */
5887
5888 static bool
5889 ix86_in_large_data_p (tree exp)
5890 {
5891 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
5892 return false;
5893
5894 if (exp == NULL_TREE)
5895 return false;
5896
5897 /* Functions are never large data. */
5898 if (TREE_CODE (exp) == FUNCTION_DECL)
5899 return false;
5900
5901 /* Automatic variables are never large data. */
5902 if (VAR_P (exp) && !is_global_var (exp))
5903 return false;
5904
5905 if (VAR_P (exp) && DECL_SECTION_NAME (exp))
5906 {
5907 const char *section = DECL_SECTION_NAME (exp);
5908 if (strcmp (section, ".ldata") == 0
5909 || strcmp (section, ".lbss") == 0)
5910 return true;
5911 return false;
5912 }
5913 else
5914 {
5915 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
5916
5917 /* If this is an incomplete type with size 0, then we can't put it
5918 in data because it might be too big when completed. Also,
5919 int_size_in_bytes returns -1 if size can vary or is larger than
5920 an integer in which case also it is safer to assume that it goes in
5921 large data. */
5922 if (size <= 0 || size > ix86_section_threshold)
5923 return true;
5924 }
5925
5926 return false;
5927 }
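
/* Illustrative example: with -mcmodel=medium and the default
   -mlarge-data-threshold=65536, a definition such as

     static char big_buffer[1 << 20];   /* 1 MiB, above the threshold */

   is considered large data here and ends up in .lbss/.ldata rather than
   the ordinary .bss/.data sections.  */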
5928
5929 /* i386-specific section flag to mark large sections. */
5930 #define SECTION_LARGE SECTION_MACH_DEP
5931
5932 /* Switch to the appropriate section for output of DECL.
5933 DECL is either a `VAR_DECL' node or a constant of some sort.
5934 RELOC indicates whether forming the initial value of DECL requires
5935 link-time relocations. */
5936
5937 ATTRIBUTE_UNUSED static section *
5938 x86_64_elf_select_section (tree decl, int reloc,
5939 unsigned HOST_WIDE_INT align)
5940 {
5941 if (ix86_in_large_data_p (decl))
5942 {
5943 const char *sname = NULL;
5944 unsigned int flags = SECTION_WRITE | SECTION_LARGE;
5945 switch (categorize_decl_for_section (decl, reloc))
5946 {
5947 case SECCAT_DATA:
5948 sname = ".ldata";
5949 break;
5950 case SECCAT_DATA_REL:
5951 sname = ".ldata.rel";
5952 break;
5953 case SECCAT_DATA_REL_LOCAL:
5954 sname = ".ldata.rel.local";
5955 break;
5956 case SECCAT_DATA_REL_RO:
5957 sname = ".ldata.rel.ro";
5958 break;
5959 case SECCAT_DATA_REL_RO_LOCAL:
5960 sname = ".ldata.rel.ro.local";
5961 break;
5962 case SECCAT_BSS:
5963 sname = ".lbss";
5964 flags |= SECTION_BSS;
5965 break;
5966 case SECCAT_RODATA:
5967 case SECCAT_RODATA_MERGE_STR:
5968 case SECCAT_RODATA_MERGE_STR_INIT:
5969 case SECCAT_RODATA_MERGE_CONST:
5970 sname = ".lrodata";
5971 flags &= ~SECTION_WRITE;
5972 break;
5973 case SECCAT_SRODATA:
5974 case SECCAT_SDATA:
5975 case SECCAT_SBSS:
5976 gcc_unreachable ();
5977 case SECCAT_TEXT:
5978 case SECCAT_TDATA:
5979 case SECCAT_TBSS:
5980 /* We don't split these for the medium model. Place them into
5981 default sections and hope for the best. */
5982 break;
5983 }
5984 if (sname)
5985 {
5986 /* We might get called with string constants, but get_named_section
5987 doesn't like them as they are not DECLs. Also, we need to set
5988 flags in that case. */
5989 if (!DECL_P (decl))
5990 return get_section (sname, flags, NULL);
5991 return get_named_section (decl, sname, reloc);
5992 }
5993 }
5994 return default_elf_select_section (decl, reloc, align);
5995 }
5996
5997 /* Select a set of attributes for section NAME based on the properties
5998 of DECL and whether or not RELOC indicates that DECL's initializer
5999 might contain runtime relocations. */
6000
6001 static unsigned int ATTRIBUTE_UNUSED
6002 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
6003 {
6004 unsigned int flags = default_section_type_flags (decl, name, reloc);
6005
6006 if (ix86_in_large_data_p (decl))
6007 flags |= SECTION_LARGE;
6008
6009 if (decl == NULL_TREE
6010 && (strcmp (name, ".ldata.rel.ro") == 0
6011 || strcmp (name, ".ldata.rel.ro.local") == 0))
6012 flags |= SECTION_RELRO;
6013
6014 if (strcmp (name, ".lbss") == 0
6015 || strncmp (name, ".lbss.", 6) == 0
6016 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
6017 flags |= SECTION_BSS;
6018
6019 return flags;
6020 }
6021
6022 /* Build up a unique section name, expressed as a
6023 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
6024 RELOC indicates whether the initial value of EXP requires
6025 link-time relocations. */
6026
6027 static void ATTRIBUTE_UNUSED
6028 x86_64_elf_unique_section (tree decl, int reloc)
6029 {
6030 if (ix86_in_large_data_p (decl))
6031 {
6032 const char *prefix = NULL;
6033 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
6034 bool one_only = DECL_COMDAT_GROUP (decl) && !HAVE_COMDAT_GROUP;
6035
6036 switch (categorize_decl_for_section (decl, reloc))
6037 {
6038 case SECCAT_DATA:
6039 case SECCAT_DATA_REL:
6040 case SECCAT_DATA_REL_LOCAL:
6041 case SECCAT_DATA_REL_RO:
6042 case SECCAT_DATA_REL_RO_LOCAL:
6043 prefix = one_only ? ".ld" : ".ldata";
6044 break;
6045 case SECCAT_BSS:
6046 prefix = one_only ? ".lb" : ".lbss";
6047 break;
6048 case SECCAT_RODATA:
6049 case SECCAT_RODATA_MERGE_STR:
6050 case SECCAT_RODATA_MERGE_STR_INIT:
6051 case SECCAT_RODATA_MERGE_CONST:
6052 prefix = one_only ? ".lr" : ".lrodata";
6053 break;
6054 case SECCAT_SRODATA:
6055 case SECCAT_SDATA:
6056 case SECCAT_SBSS:
6057 gcc_unreachable ();
6058 case SECCAT_TEXT:
6059 case SECCAT_TDATA:
6060 case SECCAT_TBSS:
6061 /* We don't split these for the medium model. Place them into
6062 default sections and hope for the best. */
6063 break;
6064 }
6065 if (prefix)
6066 {
6067 const char *name, *linkonce;
6068 char *string;
6069
6070 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
6071 name = targetm.strip_name_encoding (name);
6072
6073 /* If we're using one_only, then there needs to be a .gnu.linkonce
6074 prefix to the section name. */
6075 linkonce = one_only ? ".gnu.linkonce" : "";
6076
6077 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
6078
6079 set_decl_section_name (decl, string);
6080 return;
6081 }
6082 }
6083 default_unique_section (decl, reloc);
6084 }
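
/* Sketch of the resulting section names (assuming -fdata-sections so
   unique sections are requested): a large uninitialized variable

     int big_table[100000];             /* categorized as SECCAT_BSS */

   is placed in a section named ".lbss.big_table", or
   ".gnu.linkonce.lb.big_table" when one-only semantics must be emulated
   without COMDAT groups.  */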
6085
6086 #ifdef COMMON_ASM_OP
6087
6088 #ifndef LARGECOMM_SECTION_ASM_OP
6089 #define LARGECOMM_SECTION_ASM_OP "\t.largecomm\t"
6090 #endif
6091
6092 /* This says how to output assembler code to declare an
6093 uninitialized external linkage data object.
6094
6095 For medium model x86-64 we need to use the LARGECOMM_SECTION_ASM_OP
6096 directive for large objects. */
6097 void
6098 x86_elf_aligned_decl_common (FILE *file, tree decl,
6099 const char *name, unsigned HOST_WIDE_INT size,
6100 int align)
6101 {
6102 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6103 && size > (unsigned int)ix86_section_threshold)
6104 {
6105 switch_to_section (get_named_section (decl, ".lbss", 0));
6106 fputs (LARGECOMM_SECTION_ASM_OP, file);
6107 }
6108 else
6109 fputs (COMMON_ASM_OP, file);
6110 assemble_name (file, name);
6111 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
6112 size, align / BITS_PER_UNIT);
6113 }
6114 #endif
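
/* Example of the emitted assembly (hypothetical symbol and sizes): for an
   uninitialized common symbol larger than ix86_section_threshold under
   -mcmodel=medium, the function above switches to .lbss and emits

       .largecomm   buf,1048576,32

   while smaller objects get the ordinary

       .comm        buf,64,8                                          */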
6115
6116 /* Utility function for targets to use in implementing
6117 ASM_OUTPUT_ALIGNED_BSS. */
6118
6119 void
6120 x86_output_aligned_bss (FILE *file, tree decl, const char *name,
6121 unsigned HOST_WIDE_INT size, int align)
6122 {
6123 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
6124 && size > (unsigned int)ix86_section_threshold)
6125 switch_to_section (get_named_section (decl, ".lbss", 0));
6126 else
6127 switch_to_section (bss_section);
6128 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
6129 #ifdef ASM_DECLARE_OBJECT_NAME
6130 last_assemble_variable_decl = decl;
6131 ASM_DECLARE_OBJECT_NAME (file, name, decl);
6132 #else
6133 /* Standard thing is just output label for the object. */
6134 ASM_OUTPUT_LABEL (file, name);
6135 #endif /* ASM_DECLARE_OBJECT_NAME */
6136 ASM_OUTPUT_SKIP (file, size ? size : 1);
6137 }
6138 \f
6139 /* Decide whether we must probe the stack before any space allocation
6140 on this target. It's essentially TARGET_STACK_PROBE except when
6141 -fstack-check causes the stack to be already probed differently. */
6142
6143 bool
6144 ix86_target_stack_probe (void)
6145 {
6146 /* Do not probe the stack twice if static stack checking is enabled. */
6147 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6148 return false;
6149
6150 return TARGET_STACK_PROBE;
6151 }
6152 \f
6153 /* Decide whether we can make a sibling call to a function. DECL is the
6154 declaration of the function being targeted by the call and EXP is the
6155 CALL_EXPR representing the call. */
6156
6157 static bool
6158 ix86_function_ok_for_sibcall (tree decl, tree exp)
6159 {
6160 tree type, decl_or_type;
6161 rtx a, b;
6162 bool bind_global = decl && !targetm.binds_local_p (decl);
6163
6164 if (ix86_function_naked (current_function_decl))
6165 return false;
6166
6167 /* Sibling call isn't OK if there are no caller-saved registers
6168 since all registers must be preserved before return. */
6169 if (cfun->machine->no_caller_saved_registers)
6170 return false;
6171
6172 /* If we are generating position-independent code, we cannot sibcall
6173 optimize direct calls to global functions, as the PLT requires
6174 %ebx be live. (Darwin does not have a PLT.) */
6175 if (!TARGET_MACHO
6176 && !TARGET_64BIT
6177 && flag_pic
6178 && flag_plt
6179 && bind_global)
6180 return false;
6181
6182 /* If we need to align the outgoing stack, then sibcalling would
6183 unalign the stack, which may break the called function. */
6184 if (ix86_minimum_incoming_stack_boundary (true)
6185 < PREFERRED_STACK_BOUNDARY)
6186 return false;
6187
6188 if (decl)
6189 {
6190 decl_or_type = decl;
6191 type = TREE_TYPE (decl);
6192 }
6193 else
6194 {
6195 /* We're looking at the CALL_EXPR, we need the type of the function. */
6196 type = CALL_EXPR_FN (exp); /* pointer expression */
6197 type = TREE_TYPE (type); /* pointer type */
6198 type = TREE_TYPE (type); /* function type */
6199 decl_or_type = type;
6200 }
6201
6202 /* Check that the return value locations are the same. For example,
6203 if we are returning floats on the 80387 register stack, we cannot
6204 make a sibcall from a function that doesn't return a float to a
6205 function that does or, conversely, from a function that does return
6206 a float to a function that doesn't; the necessary stack adjustment
6207 would not be executed. This is also the place we notice
6208 differences in the return value ABI. Note that it is ok for one
6209 of the functions to have void return type as long as the return
6210 value of the other is passed in a register. */
6211 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
6212 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
6213 cfun->decl, false);
6214 if (STACK_REG_P (a) || STACK_REG_P (b))
6215 {
6216 if (!rtx_equal_p (a, b))
6217 return false;
6218 }
6219 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
6220 ;
6221 else if (!rtx_equal_p (a, b))
6222 return false;
6223
6224 if (TARGET_64BIT)
6225 {
6226 /* The SYSV ABI has more call-clobbered registers;
6227 disallow sibcalls from MS to SYSV. */
6228 if (cfun->machine->call_abi == MS_ABI
6229 && ix86_function_type_abi (type) == SYSV_ABI)
6230 return false;
6231 }
6232 else
6233 {
6234 /* If this call is indirect, we'll need to be able to use a
6235 call-clobbered register for the address of the target function.
6236 Make sure that all such registers are not used for passing
6237 parameters. Note that DLLIMPORT functions and calls to global
6238 functions via the GOT slot are indirect.
6239 if (!decl
6240 || (bind_global && flag_pic && !flag_plt)
6241 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
6242 {
6243 /* Check if regparm >= 3 since arg_reg_available is set to
6244 false if regparm == 0. If regparm is 1 or 2, there is
6245 always a call-clobbered register available.
6246
6247 ??? The symbol indirect call doesn't need a call-clobbered
6248 register. But we don't know if this is a symbol indirect
6249 call or not here. */
6250 if (ix86_function_regparm (type, NULL) >= 3
6251 && !cfun->machine->arg_reg_available)
6252 return false;
6253 }
6254 }
6255
6256 /* Otherwise okay. That also includes certain types of indirect calls. */
6257 return true;
6258 }
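
/* Illustrative case of the PIC/PLT restriction above (32-bit only):

     extern int helper (int);          /* global, binds non-locally */
     int wrapper (int x) { return helper (x); }

   With -m32 -fpic -fplt this tail call is not turned into a sibcall,
   because the PLT entry requires %ebx to hold the GOT pointer.  */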
6259
6260 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
6261 and "sseregparm" calling convention attributes;
6262 arguments as in struct attribute_spec.handler. */
6263
6264 static tree
6265 ix86_handle_cconv_attribute (tree *node, tree name,
6266 tree args,
6267 int,
6268 bool *no_add_attrs)
6269 {
6270 if (TREE_CODE (*node) != FUNCTION_TYPE
6271 && TREE_CODE (*node) != METHOD_TYPE
6272 && TREE_CODE (*node) != FIELD_DECL
6273 && TREE_CODE (*node) != TYPE_DECL)
6274 {
6275 warning (OPT_Wattributes, "%qE attribute only applies to functions",
6276 name);
6277 *no_add_attrs = true;
6278 return NULL_TREE;
6279 }
6280
6281 /* Can combine regparm with all attributes but fastcall, and thiscall. */
6282 if (is_attribute_p ("regparm", name))
6283 {
6284 tree cst;
6285
6286 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6287 {
6288 error ("fastcall and regparm attributes are not compatible");
6289 }
6290
6291 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6292 {
6293 error ("regparam and thiscall attributes are not compatible");
6294 }
6295
6296 cst = TREE_VALUE (args);
6297 if (TREE_CODE (cst) != INTEGER_CST)
6298 {
6299 warning (OPT_Wattributes,
6300 "%qE attribute requires an integer constant argument",
6301 name);
6302 *no_add_attrs = true;
6303 }
6304 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
6305 {
6306 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
6307 name, REGPARM_MAX);
6308 *no_add_attrs = true;
6309 }
6310
6311 return NULL_TREE;
6312 }
6313
6314 if (TARGET_64BIT)
6315 {
6316 /* Do not warn when emulating the MS ABI. */
6317 if ((TREE_CODE (*node) != FUNCTION_TYPE
6318 && TREE_CODE (*node) != METHOD_TYPE)
6319 || ix86_function_type_abi (*node) != MS_ABI)
6320 warning (OPT_Wattributes, "%qE attribute ignored",
6321 name);
6322 *no_add_attrs = true;
6323 return NULL_TREE;
6324 }
6325
6326 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
6327 if (is_attribute_p ("fastcall", name))
6328 {
6329 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6330 {
6331 error ("fastcall and cdecl attributes are not compatible");
6332 }
6333 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6334 {
6335 error ("fastcall and stdcall attributes are not compatible");
6336 }
6337 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
6338 {
6339 error ("fastcall and regparm attributes are not compatible");
6340 }
6341 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6342 {
6343 error ("fastcall and thiscall attributes are not compatible");
6344 }
6345 }
6346
6347 /* Can combine stdcall with fastcall (redundant), regparm and
6348 sseregparm. */
6349 else if (is_attribute_p ("stdcall", name))
6350 {
6351 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6352 {
6353 error ("stdcall and cdecl attributes are not compatible");
6354 }
6355 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6356 {
6357 error ("stdcall and fastcall attributes are not compatible");
6358 }
6359 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6360 {
6361 error ("stdcall and thiscall attributes are not compatible");
6362 }
6363 }
6364
6365 /* Can combine cdecl with regparm and sseregparm. */
6366 else if (is_attribute_p ("cdecl", name))
6367 {
6368 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6369 {
6370 error ("stdcall and cdecl attributes are not compatible");
6371 }
6372 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6373 {
6374 error ("fastcall and cdecl attributes are not compatible");
6375 }
6376 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
6377 {
6378 error ("cdecl and thiscall attributes are not compatible");
6379 }
6380 }
6381 else if (is_attribute_p ("thiscall", name))
6382 {
6383 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
6384 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
6385 name);
6386 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
6387 {
6388 error ("stdcall and thiscall attributes are not compatible");
6389 }
6390 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
6391 {
6392 error ("fastcall and thiscall attributes are not compatible");
6393 }
6394 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
6395 {
6396 error ("cdecl and thiscall attributes are not compatible");
6397 }
6398 }
6399
6400 /* Can combine sseregparm with all attributes. */
6401
6402 return NULL_TREE;
6403 }
6404
6405 /* The transactional memory builtins are implicitly regparm or fastcall
6406 depending on the ABI. Override the generic do-nothing attribute that
6407 these builtins were declared with, and replace it with one of the two
6408 attributes that we expect elsewhere. */
6409
6410 static tree
6411 ix86_handle_tm_regparm_attribute (tree *node, tree, tree,
6412 int flags, bool *no_add_attrs)
6413 {
6414 tree alt;
6415
6416 /* In no case do we want to add the placeholder attribute. */
6417 *no_add_attrs = true;
6418
6419 /* The 64-bit ABI is unchanged for transactional memory. */
6420 if (TARGET_64BIT)
6421 return NULL_TREE;
6422
6423 /* ??? Is there a better way to validate 32-bit windows? We have
6424 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
6425 if (CHECK_STACK_LIMIT > 0)
6426 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
6427 else
6428 {
6429 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
6430 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
6431 }
6432 decl_attributes (node, alt, flags);
6433
6434 return NULL_TREE;
6435 }
6436
6437 /* This function determines from TYPE the calling-convention. */
6438
6439 unsigned int
6440 ix86_get_callcvt (const_tree type)
6441 {
6442 unsigned int ret = 0;
6443 bool is_stdarg;
6444 tree attrs;
6445
6446 if (TARGET_64BIT)
6447 return IX86_CALLCVT_CDECL;
6448
6449 attrs = TYPE_ATTRIBUTES (type);
6450 if (attrs != NULL_TREE)
6451 {
6452 if (lookup_attribute ("cdecl", attrs))
6453 ret |= IX86_CALLCVT_CDECL;
6454 else if (lookup_attribute ("stdcall", attrs))
6455 ret |= IX86_CALLCVT_STDCALL;
6456 else if (lookup_attribute ("fastcall", attrs))
6457 ret |= IX86_CALLCVT_FASTCALL;
6458 else if (lookup_attribute ("thiscall", attrs))
6459 ret |= IX86_CALLCVT_THISCALL;
6460
6461 /* Regparm isn't allowed for thiscall and fastcall. */
6462 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
6463 {
6464 if (lookup_attribute ("regparm", attrs))
6465 ret |= IX86_CALLCVT_REGPARM;
6466 if (lookup_attribute ("sseregparm", attrs))
6467 ret |= IX86_CALLCVT_SSEREGPARM;
6468 }
6469
6470 if (IX86_BASE_CALLCVT(ret) != 0)
6471 return ret;
6472 }
6473
6474 is_stdarg = stdarg_p (type);
6475 if (TARGET_RTD && !is_stdarg)
6476 return IX86_CALLCVT_STDCALL | ret;
6477
6478 if (ret != 0
6479 || is_stdarg
6480 || TREE_CODE (type) != METHOD_TYPE
6481 || ix86_function_type_abi (type) != MS_ABI)
6482 return IX86_CALLCVT_CDECL | ret;
6483
6484 return IX86_CALLCVT_THISCALL;
6485 }
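
/* For illustration, on 32-bit targets:

     __attribute__((fastcall)) int f (int a, int b, int c);
       /* IX86_CALLCVT_FASTCALL: a in %ecx, b in %edx, c on the stack */

     __attribute__((stdcall)) int g (int a, int b);
       /* IX86_CALLCVT_STDCALL: args on the stack, callee pops them */

   Plain functions default to IX86_CALLCVT_CDECL (caller pops), unless
   -mrtd makes stdcall the default for non-variadic functions.  */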
6486
6487 /* Return 0 if the attributes for two types are incompatible, 1 if they
6488 are compatible, and 2 if they are nearly compatible (which causes a
6489 warning to be generated). */
6490
6491 static int
6492 ix86_comp_type_attributes (const_tree type1, const_tree type2)
6493 {
6494 unsigned int ccvt1, ccvt2;
6495
6496 if (TREE_CODE (type1) != FUNCTION_TYPE
6497 && TREE_CODE (type1) != METHOD_TYPE)
6498 return 1;
6499
6500 ccvt1 = ix86_get_callcvt (type1);
6501 ccvt2 = ix86_get_callcvt (type2);
6502 if (ccvt1 != ccvt2)
6503 return 0;
6504 if (ix86_function_regparm (type1, NULL)
6505 != ix86_function_regparm (type2, NULL))
6506 return 0;
6507
6508 return 1;
6509 }
6510 \f
6511 /* Return the regparm value for a function with the indicated TYPE and DECL.
6512 DECL may be NULL when calling function indirectly
6513 or considering a libcall. */
6514
6515 static int
6516 ix86_function_regparm (const_tree type, const_tree decl)
6517 {
6518 tree attr;
6519 int regparm;
6520 unsigned int ccvt;
6521
6522 if (TARGET_64BIT)
6523 return (ix86_function_type_abi (type) == SYSV_ABI
6524 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
6525 ccvt = ix86_get_callcvt (type);
6526 regparm = ix86_regparm;
6527
6528 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
6529 {
6530 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
6531 if (attr)
6532 {
6533 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
6534 return regparm;
6535 }
6536 }
6537 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
6538 return 2;
6539 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
6540 return 1;
6541
6542 /* Use register calling convention for local functions when possible. */
6543 if (decl
6544 && TREE_CODE (decl) == FUNCTION_DECL)
6545 {
6546 cgraph_node *target = cgraph_node::get (decl);
6547 if (target)
6548 target = target->function_symbol ();
6549
6550 /* Caller and callee must agree on the calling convention, so
6551 checking just `optimize' here would mean that with
6552 __attribute__((optimize (...))) the caller could use the regparm convention
6553 and the callee not, or vice versa. Instead look at whether the callee
6554 itself is optimized or not. */
6555 if (target && opt_for_fn (target->decl, optimize)
6556 && !(profile_flag && !flag_fentry))
6557 {
6558 cgraph_local_info *i = &target->local;
6559 if (i && i->local && i->can_change_signature)
6560 {
6561 int local_regparm, globals = 0, regno;
6562
6563 /* Make sure no regparm register is taken by a
6564 fixed register variable. */
6565 for (local_regparm = 0; local_regparm < REGPARM_MAX;
6566 local_regparm++)
6567 if (fixed_regs[local_regparm])
6568 break;
6569
6570 /* We don't want to use regparm(3) for nested functions as
6571 these use a static chain pointer in the third argument. */
6572 if (local_regparm == 3 && DECL_STATIC_CHAIN (target->decl))
6573 local_regparm = 2;
6574
6575 /* Save a register for the split stack. */
6576 if (flag_split_stack)
6577 {
6578 if (local_regparm == 3)
6579 local_regparm = 2;
6580 else if (local_regparm == 2
6581 && DECL_STATIC_CHAIN (target->decl))
6582 local_regparm = 1;
6583 }
6584
6585 /* Each fixed register usage increases register pressure,
6586 so fewer registers should be used for argument passing.
6587 This functionality can be overridden by an explicit
6588 regparm value. */
6589 for (regno = AX_REG; regno <= DI_REG; regno++)
6590 if (fixed_regs[regno])
6591 globals++;
6592
6593 local_regparm
6594 = globals < local_regparm ? local_regparm - globals : 0;
6595
6596 if (local_regparm > regparm)
6597 regparm = local_regparm;
6598 }
6599 }
6600 }
6601
6602 return regparm;
6603 }
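
/* Example of the explicit attribute handled above (32-bit only):

     __attribute__((regparm (3))) int add3 (int a, int b, int c);

   passes a, b and c in %eax, %edx and %ecx instead of on the stack;
   suitable local functions may get the same treatment automatically
   through the logic above.  */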
6604
6605 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
6606 DFmode (2) arguments in SSE registers for a function with the
6607 indicated TYPE and DECL. DECL may be NULL when calling function
6608 indirectly or considering a libcall. Return -1 if any FP parameter
6609 should be rejected by error. This is used in situations where we imply the SSE
6610 calling convention but the function is called from another function with
6611 SSE disabled. Otherwise return 0. */
6612
6613 static int
6614 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
6615 {
6616 gcc_assert (!TARGET_64BIT);
6617
6618 /* Use SSE registers to pass SFmode and DFmode arguments if requested
6619 by the sseregparm attribute. */
6620 if (TARGET_SSEREGPARM
6621 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
6622 {
6623 if (!TARGET_SSE)
6624 {
6625 if (warn)
6626 {
6627 if (decl)
6628 error ("calling %qD with attribute sseregparm without "
6629 "SSE/SSE2 enabled", decl);
6630 else
6631 error ("calling %qT with attribute sseregparm without "
6632 "SSE/SSE2 enabled", type);
6633 }
6634 return 0;
6635 }
6636
6637 return 2;
6638 }
6639
6640 if (!decl)
6641 return 0;
6642
6643 cgraph_node *target = cgraph_node::get (decl);
6644 if (target)
6645 target = target->function_symbol ();
6646
6647 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
6648 (and DFmode for SSE2) arguments in SSE registers. */
6649 if (target
6650 /* TARGET_SSE_MATH */
6651 && (target_opts_for_fn (target->decl)->x_ix86_fpmath & FPMATH_SSE)
6652 && opt_for_fn (target->decl, optimize)
6653 && !(profile_flag && !flag_fentry))
6654 {
6655 cgraph_local_info *i = &target->local;
6656 if (i && i->local && i->can_change_signature)
6657 {
6658 /* Refuse to produce wrong code when a local function with SSE enabled
6659 is called from an SSE-disabled function.
6660 FIXME: We need a way to detect these cases cross-ltrans partition
6661 and avoid using SSE calling conventions on local functions called
6662 from function with SSE disabled. For now at least delay the
6663 warning until we know we are going to produce wrong code.
6664 See PR66047 */
6665 if (!TARGET_SSE && warn)
6666 return -1;
6667 return TARGET_SSE2_P (target_opts_for_fn (target->decl)
6668 ->x_ix86_isa_flags) ? 2 : 1;
6669 }
6670 }
6671
6672 return 0;
6673 }
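
/* Example of the attribute checked above (32-bit with SSE enabled):

     __attribute__((sseregparm)) float hypot2 (float x, float y);

   passes x and y in SSE registers (%xmm0, %xmm1) rather than on the
   stack; without SSE enabled the errors above are emitted instead.  */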
6674
6675 /* Return true if EAX is live at the start of the function. Used by
6676 ix86_expand_prologue to determine if we need special help before
6677 calling allocate_stack_worker. */
6678
6679 static bool
6680 ix86_eax_live_at_start_p (void)
6681 {
6682 /* Cheat. Don't bother working forward from ix86_function_regparm
6683 to the function type to whether an actual argument is located in
6684 eax. Instead just look at cfg info, which is still close enough
6685 to correct at this point. This gives false positives for broken
6686 functions that might use uninitialized data that happens to be
6687 allocated in eax, but who cares? */
6688 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun)), 0);
6689 }
6690
6691 static bool
6692 ix86_keep_aggregate_return_pointer (tree fntype)
6693 {
6694 tree attr;
6695
6696 if (!TARGET_64BIT)
6697 {
6698 attr = lookup_attribute ("callee_pop_aggregate_return",
6699 TYPE_ATTRIBUTES (fntype));
6700 if (attr)
6701 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
6702
6703 /* For 32-bit MS-ABI the default is to keep aggregate
6704 return pointer. */
6705 if (ix86_function_type_abi (fntype) == MS_ABI)
6706 return true;
6707 }
6708 return KEEP_AGGREGATE_RETURN_POINTER != 0;
6709 }
6710
6711 /* Value is the number of bytes of arguments automatically
6712 popped when returning from a subroutine call.
6713 FUNDECL is the declaration node of the function (as a tree),
6714 FUNTYPE is the data type of the function (as a tree),
6715 or for a library call it is an identifier node for the subroutine name.
6716 SIZE is the number of bytes of arguments passed on the stack.
6717
6718 On the 80386, the RTD insn may be used to pop them if the number
6719 of args is fixed, but if the number is variable then the caller
6720 must pop them all. RTD can't be used for library calls now
6721 because the library is compiled with the Unix compiler.
6722 Use of RTD is a selectable option, since it is incompatible with
6723 standard Unix calling sequences. If the option is not selected,
6724 the caller must always pop the args.
6725
6726 The attribute stdcall is equivalent to RTD on a per module basis. */
6727
6728 static int
6729 ix86_return_pops_args (tree fundecl, tree funtype, int size)
6730 {
6731 unsigned int ccvt;
6732
6733 /* None of the 64-bit ABIs pop arguments. */
6734 if (TARGET_64BIT)
6735 return 0;
6736
6737 ccvt = ix86_get_callcvt (funtype);
6738
6739 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
6740 | IX86_CALLCVT_THISCALL)) != 0
6741 && ! stdarg_p (funtype))
6742 return size;
6743
6744 /* Lose any fake structure return argument if it is passed on the stack. */
6745 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
6746 && !ix86_keep_aggregate_return_pointer (funtype))
6747 {
6748 int nregs = ix86_function_regparm (funtype, fundecl);
6749 if (nregs == 0)
6750 return GET_MODE_SIZE (Pmode);
6751 }
6752
6753 return 0;
6754 }
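
/* Worked example of the rule above: a 32-bit stdcall function

     __attribute__((stdcall)) int sum (int a, int b);   /* 8 bytes of args */

   pops its own arguments, so this hook returns 8 and the epilogue uses
   "ret $8"; a cdecl function returns 0 here and the caller adjusts
   %esp instead.  */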
6755
6756 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
6757
6758 static bool
6759 ix86_legitimate_combined_insn (rtx_insn *insn)
6760 {
6761 int i;
6762
6763 /* Check operand constraints in case hard registers were propagated
6764 into insn pattern. This check prevents combine pass from
6765 generating insn patterns with invalid hard register operands.
6766 These invalid insns can eventually confuse reload to error out
6767 with a spill failure. See also PRs 46829 and 46843. */
6768
6769 gcc_assert (INSN_CODE (insn) >= 0);
6770
6771 extract_insn (insn);
6772 preprocess_constraints (insn);
6773
6774 int n_operands = recog_data.n_operands;
6775 int n_alternatives = recog_data.n_alternatives;
6776 for (i = 0; i < n_operands; i++)
6777 {
6778 rtx op = recog_data.operand[i];
6779 machine_mode mode = GET_MODE (op);
6780 const operand_alternative *op_alt;
6781 int offset = 0;
6782 bool win;
6783 int j;
6784
6785 /* A unary operator may be accepted by the predicate, but it
6786 is irrelevant for matching constraints. */
6787 if (UNARY_P (op))
6788 op = XEXP (op, 0);
6789
6790 if (SUBREG_P (op))
6791 {
6792 if (REG_P (SUBREG_REG (op))
6793 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
6794 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
6795 GET_MODE (SUBREG_REG (op)),
6796 SUBREG_BYTE (op),
6797 GET_MODE (op));
6798 op = SUBREG_REG (op);
6799 }
6800
6801 if (!(REG_P (op) && HARD_REGISTER_P (op)))
6802 continue;
6803
6804 op_alt = recog_op_alt;
6805
6806 /* Operand has no constraints, anything is OK. */
6807 win = !n_alternatives;
6808
6809 alternative_mask preferred = get_preferred_alternatives (insn);
6810 for (j = 0; j < n_alternatives; j++, op_alt += n_operands)
6811 {
6812 if (!TEST_BIT (preferred, j))
6813 continue;
6814 if (op_alt[i].anything_ok
6815 || (op_alt[i].matches != -1
6816 && operands_match_p
6817 (recog_data.operand[i],
6818 recog_data.operand[op_alt[i].matches]))
6819 || reg_fits_class_p (op, op_alt[i].cl, offset, mode))
6820 {
6821 win = true;
6822 break;
6823 }
6824 }
6825
6826 if (!win)
6827 return false;
6828 }
6829
6830 return true;
6831 }
6832 \f
6833 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
6834
6835 static unsigned HOST_WIDE_INT
6836 ix86_asan_shadow_offset (void)
6837 {
6838 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
6839 : HOST_WIDE_INT_C (0x7fff8000))
6840 : (HOST_WIDE_INT_1 << 29);
6841 }
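
/* For reference, AddressSanitizer maps each address to its shadow byte as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ()

   so the value above (e.g. 0x7fff8000 for Linux x86-64 LP64) is the
   constant added after the 8-to-1 scaling.  */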
6842 \f
6843 /* Argument support functions. */
6844
6845 /* Return true when register may be used to pass function parameters. */
6846 bool
6847 ix86_function_arg_regno_p (int regno)
6848 {
6849 int i;
6850 enum calling_abi call_abi;
6851 const int *parm_regs;
6852
6853 if (TARGET_MPX && BND_REGNO_P (regno))
6854 return true;
6855
6856 if (!TARGET_64BIT)
6857 {
6858 if (TARGET_MACHO)
6859 return (regno < REGPARM_MAX
6860 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
6861 else
6862 return (regno < REGPARM_MAX
6863 || (TARGET_MMX && MMX_REGNO_P (regno)
6864 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
6865 || (TARGET_SSE && SSE_REGNO_P (regno)
6866 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
6867 }
6868
6869 if (TARGET_SSE && SSE_REGNO_P (regno)
6870 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
6871 return true;
6872
6873 /* TODO: The function should depend on current function ABI but
6874 builtins.c would need updating then. Therefore we use the
6875 default ABI. */
6876 call_abi = ix86_cfun_abi ();
6877
6878 /* RAX is used as hidden argument to va_arg functions. */
6879 if (call_abi == SYSV_ABI && regno == AX_REG)
6880 return true;
6881
6882 if (call_abi == MS_ABI)
6883 parm_regs = x86_64_ms_abi_int_parameter_registers;
6884 else
6885 parm_regs = x86_64_int_parameter_registers;
6886
6887 for (i = 0; i < (call_abi == MS_ABI
6888 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
6889 if (regno == parm_regs[i])
6890 return true;
6891 return false;
6892 }
6893
6894 /* Return if we do not know how to pass TYPE solely in registers. */
6895
6896 static bool
6897 ix86_must_pass_in_stack (machine_mode mode, const_tree type)
6898 {
6899 if (must_pass_in_stack_var_size_or_pad (mode, type))
6900 return true;
6901
6902 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
6903 The layout_type routine is crafty and tries to trick us into passing
6904 currently unsupported vector types on the stack by using TImode. */
6905 return (!TARGET_64BIT && mode == TImode
6906 && type && TREE_CODE (type) != VECTOR_TYPE);
6907 }
6908
6909 /* Return the size, in bytes, of the area reserved for arguments passed
6910 in registers for the function represented by FNDECL, depending on the
6911 ABI format used. */
6912 int
6913 ix86_reg_parm_stack_space (const_tree fndecl)
6914 {
6915 enum calling_abi call_abi = SYSV_ABI;
6916 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
6917 call_abi = ix86_function_abi (fndecl);
6918 else
6919 call_abi = ix86_function_type_abi (fndecl);
6920 if (TARGET_64BIT && call_abi == MS_ABI)
6921 return 32;
6922 return 0;
6923 }
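
/* For example, under the 64-bit MS ABI even a call to

     void f (int a, int b);

   with A passed in RCX and B in RDX still reserves 32 bytes of stack
   "home" space, one 8-byte slot for each of RCX, RDX, R8 and R9, so
   that the callee can spill its register parameters.  The SysV ABI
   reserves no such area, hence the 0 returned above.  */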
6924
6925 /* We add this as a workaround in order to use libc_has_function
6926 hook in i386.md. */
6927 bool
6928 ix86_libc_has_function (enum function_class fn_class)
6929 {
6930 return targetm.libc_has_function (fn_class);
6931 }
6932
6933 /* Return SYSV_ABI or MS_ABI, depending on FNTYPE, specifying the
6934 calling ABI used. */
6935 enum calling_abi
6936 ix86_function_type_abi (const_tree fntype)
6937 {
6938 enum calling_abi abi = ix86_abi;
6939
6940 if (fntype == NULL_TREE || TYPE_ATTRIBUTES (fntype) == NULL_TREE)
6941 return abi;
6942
6943 if (abi == SYSV_ABI
6944 && lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
6945 {
6946 static int warned;
6947 if (TARGET_X32 && !warned)
6948 {
6949 error ("X32 does not support ms_abi attribute");
6950 warned = 1;
6951 }
6952
6953 abi = MS_ABI;
6954 }
6955 else if (abi == MS_ABI
6956 && lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
6957 abi = SYSV_ABI;
6958
6959 return abi;
6960 }
6961
6962 static enum calling_abi
6963 ix86_function_abi (const_tree fndecl)
6964 {
6965 return fndecl ? ix86_function_type_abi (TREE_TYPE (fndecl)) : ix86_abi;
6966 }
6967
6968 /* Return SYSV_ABI or MS_ABI, depending on CFUN, specifying the
6969 calling ABI used. */
6970 enum calling_abi
6971 ix86_cfun_abi (void)
6972 {
6973 return cfun ? cfun->machine->call_abi : ix86_abi;
6974 }
6975
6976 static bool
6977 ix86_function_ms_hook_prologue (const_tree fn)
6978 {
6979 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
6980 {
6981 if (decl_function_context (fn) != NULL_TREE)
6982 error_at (DECL_SOURCE_LOCATION (fn),
6983 "ms_hook_prologue is not compatible with nested function");
6984 else
6985 return true;
6986 }
6987 return false;
6988 }
6989
6990 static bool
6991 ix86_function_naked (const_tree fn)
6992 {
6993 if (fn && lookup_attribute ("naked", DECL_ATTRIBUTES (fn)))
6994 return true;
6995
6996 return false;
6997 }
6998
6999 /* Write the extra assembler code needed to declare a function properly. */
7000
7001 void
7002 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
7003 tree decl)
7004 {
7005 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
7006
7007 if (is_ms_hook)
7008 {
7009 int i, filler_count = (TARGET_64BIT ? 32 : 16);
7010 unsigned int filler_cc = 0xcccccccc;
7011
7012 for (i = 0; i < filler_count; i += 4)
7013 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
7014 }
7015
7016 #ifdef SUBTARGET_ASM_UNWIND_INIT
7017 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
7018 #endif
7019
7020 ASM_OUTPUT_LABEL (asm_out_file, fname);
7021
7022 /* Output magic byte marker, if hot-patch attribute is set. */
7023 if (is_ms_hook)
7024 {
7025 if (TARGET_64BIT)
7026 {
7027 /* leaq [%rsp + 0], %rsp */
7028 fputs (ASM_BYTE "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n",
7029 asm_out_file);
7030 }
7031 else
7032 {
7033 /* movl.s %edi, %edi
7034 push %ebp
7035 movl.s %esp, %ebp */
7036 fputs (ASM_BYTE "0x8b, 0xff, 0x55, 0x8b, 0xec\n", asm_out_file);
7037 }
7038 }
7039 }
7040
7041 /* Implementation of the call ABI switching target hook. The call
7042 register sets specific to FNDECL are selected. See also
7043 ix86_conditional_register_usage for more details. */
7044 void
7045 ix86_call_abi_override (const_tree fndecl)
7046 {
7047 cfun->machine->call_abi = ix86_function_abi (fndecl);
7048 }
7049
7050 /* Return true if a pseudo register should be created and used to hold
7051 the GOT address for PIC code. */
7052 bool
7053 ix86_use_pseudo_pic_reg (void)
7054 {
7055 if ((TARGET_64BIT
7056 && (ix86_cmodel == CM_SMALL_PIC
7057 || TARGET_PECOFF))
7058 || !flag_pic)
7059 return false;
7060 return true;
7061 }
7062
7063 /* Initialize large model PIC register. */
7064
7065 static void
7066 ix86_init_large_pic_reg (unsigned int tmp_regno)
7067 {
7068 rtx_code_label *label;
7069 rtx tmp_reg;
7070
7071 gcc_assert (Pmode == DImode);
7072 label = gen_label_rtx ();
7073 emit_label (label);
7074 LABEL_PRESERVE_P (label) = 1;
7075 tmp_reg = gen_rtx_REG (Pmode, tmp_regno);
7076 gcc_assert (REGNO (pic_offset_table_rtx) != tmp_regno);
7077 emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
7078 label));
7079 emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
7080 emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
7081 pic_offset_table_rtx, tmp_reg));
7082 const char *name = LABEL_NAME (label);
7083 PUT_CODE (label, NOTE);
7084 NOTE_KIND (label) = NOTE_INSN_DELETED_LABEL;
7085 NOTE_DELETED_LABEL_NAME (label) = name;
7086 }
7087
7088 /* Create and initialize PIC register if required. */
7089 static void
7090 ix86_init_pic_reg (void)
7091 {
7092 edge entry_edge;
7093 rtx_insn *seq;
7094
7095 if (!ix86_use_pseudo_pic_reg ())
7096 return;
7097
7098 start_sequence ();
7099
7100 if (TARGET_64BIT)
7101 {
7102 if (ix86_cmodel == CM_LARGE_PIC)
7103 ix86_init_large_pic_reg (R11_REG);
7104 else
7105 emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
7106 }
7107 else
7108 {
7109 /* If there will be an mcount call in the function, it is more profitable
7110 to emit SET_GOT into the ABI-defined REAL_PIC_OFFSET_TABLE_REGNUM. */
7111 rtx reg = crtl->profile
7112 ? gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM)
7113 : pic_offset_table_rtx;
7114 rtx_insn *insn = emit_insn (gen_set_got (reg));
7115 RTX_FRAME_RELATED_P (insn) = 1;
7116 if (crtl->profile)
7117 emit_move_insn (pic_offset_table_rtx, reg);
7118 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
7119 }
7120
7121 seq = get_insns ();
7122 end_sequence ();
7123
7124 entry_edge = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7125 insert_insn_on_edge (seq, entry_edge);
7126 commit_one_edge_insertion (entry_edge);
7127 }
7128
7129 /* Initialize a variable CUM of type CUMULATIVE_ARGS
7130 for a call to a function whose data type is FNTYPE.
7131 For a library call, FNTYPE is 0. */
7132
7133 void
7134 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
7135 tree fntype, /* tree ptr for function decl */
7136 rtx libname, /* SYMBOL_REF of library name or 0 */
7137 tree fndecl,
7138 int caller)
7139 {
7140 struct cgraph_local_info *i = NULL;
7141 struct cgraph_node *target = NULL;
7142
7143 memset (cum, 0, sizeof (*cum));
7144
7145 if (fndecl)
7146 {
7147 target = cgraph_node::get (fndecl);
7148 if (target)
7149 {
7150 target = target->function_symbol ();
7151 i = cgraph_node::local_info (target->decl);
7152 cum->call_abi = ix86_function_abi (target->decl);
7153 }
7154 else
7155 cum->call_abi = ix86_function_abi (fndecl);
7156 }
7157 else
7158 cum->call_abi = ix86_function_type_abi (fntype);
7159
7160 cum->caller = caller;
7161
7162 /* Set up the number of registers to use for passing arguments. */
7163 cum->nregs = ix86_regparm;
7164 if (TARGET_64BIT)
7165 {
7166 cum->nregs = (cum->call_abi == SYSV_ABI
7167 ? X86_64_REGPARM_MAX
7168 : X86_64_MS_REGPARM_MAX);
7169 }
7170 if (TARGET_SSE)
7171 {
7172 cum->sse_nregs = SSE_REGPARM_MAX;
7173 if (TARGET_64BIT)
7174 {
7175 cum->sse_nregs = (cum->call_abi == SYSV_ABI
7176 ? X86_64_SSE_REGPARM_MAX
7177 : X86_64_MS_SSE_REGPARM_MAX);
7178 }
7179 }
7180 if (TARGET_MMX)
7181 cum->mmx_nregs = MMX_REGPARM_MAX;
7182 cum->warn_avx512f = true;
7183 cum->warn_avx = true;
7184 cum->warn_sse = true;
7185 cum->warn_mmx = true;
7186
7187 /* Because the type might mismatch between caller and callee, we need to
7188 use the actual type of the function for local calls.
7189 FIXME: cgraph_analyze can be told to actually record whether a function
7190 uses va_start, so for local functions maybe_vaarg can be made more
7191 aggressive, helping K&R code.
7192 FIXME: once the type system is fixed, we won't need this code anymore. */
7193 if (i && i->local && i->can_change_signature)
7194 fntype = TREE_TYPE (target->decl);
7195 cum->stdarg = stdarg_p (fntype);
7196 cum->maybe_vaarg = (fntype
7197 ? (!prototype_p (fntype) || stdarg_p (fntype))
7198 : !libname);
7199
7200 cum->bnd_regno = FIRST_BND_REG;
7201 cum->bnds_in_bt = 0;
7202 cum->force_bnd_pass = 0;
7203 cum->decl = fndecl;
7204
7205 cum->warn_empty = !warn_abi || cum->stdarg;
7206 if (!cum->warn_empty && fntype)
7207 {
7208 function_args_iterator iter;
7209 tree argtype;
7210 bool seen_empty_type = false;
7211 FOREACH_FUNCTION_ARGS (fntype, argtype, iter)
7212 {
7213 if (VOID_TYPE_P (argtype))
7214 break;
7215 if (TYPE_EMPTY_P (argtype))
7216 seen_empty_type = true;
7217 else if (seen_empty_type)
7218 {
7219 cum->warn_empty = true;
7220 break;
7221 }
7222 }
7223 }
7224
7225 if (!TARGET_64BIT)
7226 {
7227 /* If there are variable arguments, then we won't pass anything
7228 in registers in 32-bit mode. */
7229 if (stdarg_p (fntype))
7230 {
7231 cum->nregs = 0;
7232 /* Since in 32-bit mode variable arguments are always passed on
7233 the stack, there is a scratch register available for an
7234 indirect sibcall. */
7235 cfun->machine->arg_reg_available = true;
7236 cum->sse_nregs = 0;
7237 cum->mmx_nregs = 0;
7238 cum->warn_avx512f = false;
7239 cum->warn_avx = false;
7240 cum->warn_sse = false;
7241 cum->warn_mmx = false;
7242 return;
7243 }
7244
7245 /* Use ecx and edx registers if function has fastcall attribute,
7246 else look for regparm information. */
7247 if (fntype)
7248 {
7249 unsigned int ccvt = ix86_get_callcvt (fntype);
7250 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
7251 {
7252 cum->nregs = 1;
7253 cum->fastcall = 1; /* Same first register as in fastcall. */
7254 }
7255 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
7256 {
7257 cum->nregs = 2;
7258 cum->fastcall = 1;
7259 }
7260 else
7261 cum->nregs = ix86_function_regparm (fntype, fndecl);
7262 }
7263
7264 /* Set up the number of SSE registers used for passing SFmode
7265 and DFmode arguments. Warn for mismatching ABI. */
7266 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
7267 }
7268
7269 cfun->machine->arg_reg_available = (cum->nregs > 0);
7270 }
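
/* An illustrative 32-bit example of the regparm/fastcall handling above:

     void __attribute__ ((fastcall)) f (int a, int b, int c);

   sets cum->nregs to 2 and cum->fastcall, so A is passed in ECX, B in
   EDX and C on the stack.  */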
7271
7272 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
7273 But in the case of vector types, it is some vector mode.
7274
7275 When we have only some of our vector isa extensions enabled, then there
7276 are some modes for which vector_mode_supported_p is false. For these
7277 modes, the generic vector support in gcc will choose some non-vector mode
7278 in order to implement the type. By computing the natural mode, we'll
7279 select the proper ABI location for the operand and not depend on whatever
7280 the middle-end decides to do with these vector types.
7281
7282 The middle-end can't deal with vector types larger than 16 bytes. In
7283 this case, we return the original mode and warn about the ABI change
7284 if CUM isn't NULL.
7285 
7286 If IN_RETURN is true, warn about the ABI change if the vector mode
7287 isn't available for the function return value. */
7288
7289 static machine_mode
7290 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
7291 bool in_return)
7292 {
7293 machine_mode mode = TYPE_MODE (type);
7294
7295 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
7296 {
7297 HOST_WIDE_INT size = int_size_in_bytes (type);
7298 if ((size == 8 || size == 16 || size == 32 || size == 64)
7299 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
7300 && TYPE_VECTOR_SUBPARTS (type) > 1)
7301 {
7302 machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
7303
7304 /* There are no XFmode vector modes. */
7305 if (innermode == XFmode)
7306 return mode;
7307
7308 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
7309 mode = MIN_MODE_VECTOR_FLOAT;
7310 else
7311 mode = MIN_MODE_VECTOR_INT;
7312
7313 /* Get the mode which has this inner mode and number of units. */
7314 FOR_EACH_MODE_FROM (mode, mode)
7315 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
7316 && GET_MODE_INNER (mode) == innermode)
7317 {
7318 if (size == 64 && !TARGET_AVX512F && !TARGET_IAMCU)
7319 {
7320 static bool warnedavx512f;
7321 static bool warnedavx512f_ret;
7322
7323 if (cum && cum->warn_avx512f && !warnedavx512f)
7324 {
7325 if (warning (OPT_Wpsabi, "AVX512F vector argument "
7326 "without AVX512F enabled changes the ABI"))
7327 warnedavx512f = true;
7328 }
7329 else if (in_return && !warnedavx512f_ret)
7330 {
7331 if (warning (OPT_Wpsabi, "AVX512F vector return "
7332 "without AVX512F enabled changes the ABI"))
7333 warnedavx512f_ret = true;
7334 }
7335
7336 return TYPE_MODE (type);
7337 }
7338 else if (size == 32 && !TARGET_AVX && !TARGET_IAMCU)
7339 {
7340 static bool warnedavx;
7341 static bool warnedavx_ret;
7342
7343 if (cum && cum->warn_avx && !warnedavx)
7344 {
7345 if (warning (OPT_Wpsabi, "AVX vector argument "
7346 "without AVX enabled changes the ABI"))
7347 warnedavx = true;
7348 }
7349 else if (in_return && !warnedavx_ret)
7350 {
7351 if (warning (OPT_Wpsabi, "AVX vector return "
7352 "without AVX enabled changes the ABI"))
7353 warnedavx_ret = true;
7354 }
7355
7356 return TYPE_MODE (type);
7357 }
7358 else if (((size == 8 && TARGET_64BIT) || size == 16)
7359 && !TARGET_SSE
7360 && !TARGET_IAMCU)
7361 {
7362 static bool warnedsse;
7363 static bool warnedsse_ret;
7364
7365 if (cum && cum->warn_sse && !warnedsse)
7366 {
7367 if (warning (OPT_Wpsabi, "SSE vector argument "
7368 "without SSE enabled changes the ABI"))
7369 warnedsse = true;
7370 }
7371 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
7372 {
7373 if (warning (OPT_Wpsabi, "SSE vector return "
7374 "without SSE enabled changes the ABI"))
7375 warnedsse_ret = true;
7376 }
7377 }
7378 else if ((size == 8 && !TARGET_64BIT)
7379 && (!cfun
7380 || cfun->machine->func_type == TYPE_NORMAL)
7381 && !TARGET_MMX
7382 && !TARGET_IAMCU)
7383 {
7384 static bool warnedmmx;
7385 static bool warnedmmx_ret;
7386
7387 if (cum && cum->warn_mmx && !warnedmmx)
7388 {
7389 if (warning (OPT_Wpsabi, "MMX vector argument "
7390 "without MMX enabled changes the ABI"))
7391 warnedmmx = true;
7392 }
7393 else if (in_return && !warnedmmx_ret)
7394 {
7395 if (warning (OPT_Wpsabi, "MMX vector return "
7396 "without MMX enabled changes the ABI"))
7397 warnedmmx_ret = true;
7398 }
7399 }
7400 return mode;
7401 }
7402
7403 gcc_unreachable ();
7404 }
7405 }
7406
7407 return mode;
7408 }
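
/* As an illustration (a sketch; the exact outcome depends on the enabled
   ISA), for

     typedef float v8sf __attribute__ ((vector_size (32)));

   compiled without -mavx the middle end does not give the type a vector
   mode; this function still finds V8SFmode, but because the size is 32
   bytes and AVX is disabled it warns with -Wpsabi and returns
   TYPE_MODE (type), keeping the pre-AVX ABI location.  */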
7409
7410 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
7411 this may not agree with the mode that the type system has chosen for the
7412 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
7413 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
7414
7415 static rtx
7416 gen_reg_or_parallel (machine_mode mode, machine_mode orig_mode,
7417 unsigned int regno)
7418 {
7419 rtx tmp;
7420
7421 if (orig_mode != BLKmode)
7422 tmp = gen_rtx_REG (orig_mode, regno);
7423 else
7424 {
7425 tmp = gen_rtx_REG (mode, regno);
7426 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
7427 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
7428 }
7429
7430 return tmp;
7431 }
7432
7433 /* x86-64 register passing implementation. See the x86-64 ABI for details.
7434 The goal of this code is to classify each 8-byte chunk of the incoming
7435 argument by register class and assign registers accordingly. */
7436
7437 /* Return the union class of CLASS1 and CLASS2.
7438 See the x86-64 PS ABI for details. */
7439
7440 static enum x86_64_reg_class
7441 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
7442 {
7443 /* Rule #1: If both classes are equal, this is the resulting class. */
7444 if (class1 == class2)
7445 return class1;
7446
7447 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
7448 the other class. */
7449 if (class1 == X86_64_NO_CLASS)
7450 return class2;
7451 if (class2 == X86_64_NO_CLASS)
7452 return class1;
7453
7454 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
7455 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
7456 return X86_64_MEMORY_CLASS;
7457
7458 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
7459 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
7460 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
7461 return X86_64_INTEGERSI_CLASS;
7462 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
7463 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
7464 return X86_64_INTEGER_CLASS;
7465
7466 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
7467 MEMORY is used. */
7468 if (class1 == X86_64_X87_CLASS
7469 || class1 == X86_64_X87UP_CLASS
7470 || class1 == X86_64_COMPLEX_X87_CLASS
7471 || class2 == X86_64_X87_CLASS
7472 || class2 == X86_64_X87UP_CLASS
7473 || class2 == X86_64_COMPLEX_X87_CLASS)
7474 return X86_64_MEMORY_CLASS;
7475
7476 /* Rule #6: Otherwise class SSE is used. */
7477 return X86_64_SSE_CLASS;
7478 }
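
/* For example, for

     union u { float f; int i; };

   the float member classifies the eightbyte as X86_64_SSESF_CLASS and
   the int member as X86_64_INTEGERSI_CLASS; rule #4 above merges them
   into X86_64_INTEGERSI_CLASS, so the union is passed in an integer
   register.  */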
7479
7480 /* Classify the argument of type TYPE and mode MODE.
7481 CLASSES will be filled by the register class used to pass each word
7482 of the operand. The number of words is returned. In case the parameter
7483 should be passed in memory, 0 is returned. As a special case for zero
7484 sized containers, classes[0] will be NO_CLASS and 1 is returned.
7485
7486 BIT_OFFSET is used internally for handling records; it specifies the
7487 offset of the argument in bits modulo 512 to avoid overflow cases.
7488
7489 See the x86-64 PS ABI for details.
7490 */
7491
7492 static int
7493 classify_argument (machine_mode mode, const_tree type,
7494 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
7495 {
7496 HOST_WIDE_INT bytes =
7497 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7498 int words = CEIL (bytes + (bit_offset % 64) / 8, UNITS_PER_WORD);
7499
7500 /* Variable sized entities are always passed/returned in memory. */
7501 if (bytes < 0)
7502 return 0;
7503
7504 if (mode != VOIDmode
7505 && targetm.calls.must_pass_in_stack (mode, type))
7506 return 0;
7507
7508 if (type && AGGREGATE_TYPE_P (type))
7509 {
7510 int i;
7511 tree field;
7512 enum x86_64_reg_class subclasses[MAX_CLASSES];
7513
7514 /* On x86-64 we pass structures larger than 64 bytes on the stack. */
7515 if (bytes > 64)
7516 return 0;
7517
7518 for (i = 0; i < words; i++)
7519 classes[i] = X86_64_NO_CLASS;
7520
7521 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
7522 signal the memory class, so handle this as a special case. */
7523 if (!words)
7524 {
7525 classes[0] = X86_64_NO_CLASS;
7526 return 1;
7527 }
7528
7529 /* Classify each field of record and merge classes. */
7530 switch (TREE_CODE (type))
7531 {
7532 case RECORD_TYPE:
7533 /* And now merge the fields of structure. */
7534 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7535 {
7536 if (TREE_CODE (field) == FIELD_DECL)
7537 {
7538 int num;
7539
7540 if (TREE_TYPE (field) == error_mark_node)
7541 continue;
7542
7543 /* Bitfields are always classified as integer. Handle them
7544 early, since later code would consider them to be
7545 misaligned integers. */
7546 if (DECL_BIT_FIELD (field))
7547 {
7548 for (i = (int_bit_position (field)
7549 + (bit_offset % 64)) / 8 / 8;
7550 i < ((int_bit_position (field) + (bit_offset % 64))
7551 + tree_to_shwi (DECL_SIZE (field))
7552 + 63) / 8 / 8; i++)
7553 classes[i] =
7554 merge_classes (X86_64_INTEGER_CLASS,
7555 classes[i]);
7556 }
7557 else
7558 {
7559 int pos;
7560
7561 type = TREE_TYPE (field);
7562
7563 /* Flexible array member is ignored. */
7564 if (TYPE_MODE (type) == BLKmode
7565 && TREE_CODE (type) == ARRAY_TYPE
7566 && TYPE_SIZE (type) == NULL_TREE
7567 && TYPE_DOMAIN (type) != NULL_TREE
7568 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
7569 == NULL_TREE))
7570 {
7571 static bool warned;
7572
7573 if (!warned && warn_psabi)
7574 {
7575 warned = true;
7576 inform (input_location,
7577 "the ABI of passing struct with"
7578 " a flexible array member has"
7579 " changed in GCC 4.4");
7580 }
7581 continue;
7582 }
7583 num = classify_argument (TYPE_MODE (type), type,
7584 subclasses,
7585 (int_bit_position (field)
7586 + bit_offset) % 512);
7587 if (!num)
7588 return 0;
7589 pos = (int_bit_position (field)
7590 + (bit_offset % 64)) / 8 / 8;
7591 for (i = 0; i < num && (i + pos) < words; i++)
7592 classes[i + pos] =
7593 merge_classes (subclasses[i], classes[i + pos]);
7594 }
7595 }
7596 }
7597 break;
7598
7599 case ARRAY_TYPE:
7600 /* Arrays are handled as small records. */
7601 {
7602 int num;
7603 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
7604 TREE_TYPE (type), subclasses, bit_offset);
7605 if (!num)
7606 return 0;
7607
7608 /* The partial classes are now full classes. */
7609 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
7610 subclasses[0] = X86_64_SSE_CLASS;
7611 if (subclasses[0] == X86_64_INTEGERSI_CLASS
7612 && !((bit_offset % 64) == 0 && bytes == 4))
7613 subclasses[0] = X86_64_INTEGER_CLASS;
7614
7615 for (i = 0; i < words; i++)
7616 classes[i] = subclasses[i % num];
7617
7618 break;
7619 }
7620 case UNION_TYPE:
7621 case QUAL_UNION_TYPE:
7622 /* Unions are similar to RECORD_TYPE but the offset is always 0.
7623 */
7624 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7625 {
7626 if (TREE_CODE (field) == FIELD_DECL)
7627 {
7628 int num;
7629
7630 if (TREE_TYPE (field) == error_mark_node)
7631 continue;
7632
7633 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
7634 TREE_TYPE (field), subclasses,
7635 bit_offset);
7636 if (!num)
7637 return 0;
7638 for (i = 0; i < num && i < words; i++)
7639 classes[i] = merge_classes (subclasses[i], classes[i]);
7640 }
7641 }
7642 break;
7643
7644 default:
7645 gcc_unreachable ();
7646 }
7647
7648 if (words > 2)
7649 {
7650 /* When the size exceeds 16 bytes, if the first eightbyte isn't
7651 X86_64_SSE_CLASS or any of the remaining ones isn't
7652 X86_64_SSEUP_CLASS, everything should be passed in
7653 memory. */
7654 if (classes[0] != X86_64_SSE_CLASS)
7655 return 0;
7656
7657 for (i = 1; i < words; i++)
7658 if (classes[i] != X86_64_SSEUP_CLASS)
7659 return 0;
7660 }
7661
7662 /* Final merger cleanup. */
7663 for (i = 0; i < words; i++)
7664 {
7665 /* If one class is MEMORY, everything should be passed in
7666 memory. */
7667 if (classes[i] == X86_64_MEMORY_CLASS)
7668 return 0;
7669
7670 /* X86_64_SSEUP_CLASS should always be preceded by
7671 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
7672 if (classes[i] == X86_64_SSEUP_CLASS
7673 && classes[i - 1] != X86_64_SSE_CLASS
7674 && classes[i - 1] != X86_64_SSEUP_CLASS)
7675 {
7676 /* The first one should never be X86_64_SSEUP_CLASS. */
7677 gcc_assert (i != 0);
7678 classes[i] = X86_64_SSE_CLASS;
7679 }
7680
7681 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
7682 everything should be passed in memory. */
7683 if (classes[i] == X86_64_X87UP_CLASS
7684 && (classes[i - 1] != X86_64_X87_CLASS))
7685 {
7686 static bool warned;
7687
7688 /* The first one should never be X86_64_X87UP_CLASS. */
7689 gcc_assert (i != 0);
7690 if (!warned && warn_psabi)
7691 {
7692 warned = true;
7693 inform (input_location,
7694 "the ABI of passing union with long double"
7695 " has changed in GCC 4.4");
7696 }
7697 return 0;
7698 }
7699 }
7700 return words;
7701 }
7702
7703 /* Compute the alignment needed. We align all types to their natural
7704 boundaries, with the exception of XFmode, which is aligned to 64 bits. */
7705 if (mode != VOIDmode && mode != BLKmode)
7706 {
7707 int mode_alignment = GET_MODE_BITSIZE (mode);
7708
7709 if (mode == XFmode)
7710 mode_alignment = 128;
7711 else if (mode == XCmode)
7712 mode_alignment = 256;
7713 if (COMPLEX_MODE_P (mode))
7714 mode_alignment /= 2;
7715 /* Misaligned fields are always returned in memory. */
7716 if (bit_offset % mode_alignment)
7717 return 0;
7718 }
7719
7720 /* For V1xx modes, just use the base mode. */
7721 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
7722 && GET_MODE_UNIT_SIZE (mode) == bytes)
7723 mode = GET_MODE_INNER (mode);
7724
7725 /* Classification of atomic types. */
7726 switch (mode)
7727 {
7728 case E_SDmode:
7729 case E_DDmode:
7730 classes[0] = X86_64_SSE_CLASS;
7731 return 1;
7732 case E_TDmode:
7733 classes[0] = X86_64_SSE_CLASS;
7734 classes[1] = X86_64_SSEUP_CLASS;
7735 return 2;
7736 case E_DImode:
7737 case E_SImode:
7738 case E_HImode:
7739 case E_QImode:
7740 case E_CSImode:
7741 case E_CHImode:
7742 case E_CQImode:
7743 {
7744 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
7745
7746 /* Analyze last 128 bits only. */
7747 size = (size - 1) & 0x7f;
7748
7749 if (size < 32)
7750 {
7751 classes[0] = X86_64_INTEGERSI_CLASS;
7752 return 1;
7753 }
7754 else if (size < 64)
7755 {
7756 classes[0] = X86_64_INTEGER_CLASS;
7757 return 1;
7758 }
7759 else if (size < 64+32)
7760 {
7761 classes[0] = X86_64_INTEGER_CLASS;
7762 classes[1] = X86_64_INTEGERSI_CLASS;
7763 return 2;
7764 }
7765 else if (size < 64+64)
7766 {
7767 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7768 return 2;
7769 }
7770 else
7771 gcc_unreachable ();
7772 }
7773 case E_CDImode:
7774 case E_TImode:
7775 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
7776 return 2;
7777 case E_COImode:
7778 case E_OImode:
7779 /* OImode shouldn't be used directly. */
7780 gcc_unreachable ();
7781 case E_CTImode:
7782 return 0;
7783 case E_SFmode:
7784 if (!(bit_offset % 64))
7785 classes[0] = X86_64_SSESF_CLASS;
7786 else
7787 classes[0] = X86_64_SSE_CLASS;
7788 return 1;
7789 case E_DFmode:
7790 classes[0] = X86_64_SSEDF_CLASS;
7791 return 1;
7792 case E_XFmode:
7793 classes[0] = X86_64_X87_CLASS;
7794 classes[1] = X86_64_X87UP_CLASS;
7795 return 2;
7796 case E_TFmode:
7797 classes[0] = X86_64_SSE_CLASS;
7798 classes[1] = X86_64_SSEUP_CLASS;
7799 return 2;
7800 case E_SCmode:
7801 classes[0] = X86_64_SSE_CLASS;
7802 if (!(bit_offset % 64))
7803 return 1;
7804 else
7805 {
7806 static bool warned;
7807
7808 if (!warned && warn_psabi)
7809 {
7810 warned = true;
7811 inform (input_location,
7812 "the ABI of passing structure with complex float"
7813 " member has changed in GCC 4.4");
7814 }
7815 classes[1] = X86_64_SSESF_CLASS;
7816 return 2;
7817 }
7818 case E_DCmode:
7819 classes[0] = X86_64_SSEDF_CLASS;
7820 classes[1] = X86_64_SSEDF_CLASS;
7821 return 2;
7822 case E_XCmode:
7823 classes[0] = X86_64_COMPLEX_X87_CLASS;
7824 return 1;
7825 case E_TCmode:
7826 /* This mode is larger than 16 bytes. */
7827 return 0;
7828 case E_V8SFmode:
7829 case E_V8SImode:
7830 case E_V32QImode:
7831 case E_V16HImode:
7832 case E_V4DFmode:
7833 case E_V4DImode:
7834 classes[0] = X86_64_SSE_CLASS;
7835 classes[1] = X86_64_SSEUP_CLASS;
7836 classes[2] = X86_64_SSEUP_CLASS;
7837 classes[3] = X86_64_SSEUP_CLASS;
7838 return 4;
7839 case E_V8DFmode:
7840 case E_V16SFmode:
7841 case E_V8DImode:
7842 case E_V16SImode:
7843 case E_V32HImode:
7844 case E_V64QImode:
7845 classes[0] = X86_64_SSE_CLASS;
7846 classes[1] = X86_64_SSEUP_CLASS;
7847 classes[2] = X86_64_SSEUP_CLASS;
7848 classes[3] = X86_64_SSEUP_CLASS;
7849 classes[4] = X86_64_SSEUP_CLASS;
7850 classes[5] = X86_64_SSEUP_CLASS;
7851 classes[6] = X86_64_SSEUP_CLASS;
7852 classes[7] = X86_64_SSEUP_CLASS;
7853 return 8;
7854 case E_V4SFmode:
7855 case E_V4SImode:
7856 case E_V16QImode:
7857 case E_V8HImode:
7858 case E_V2DFmode:
7859 case E_V2DImode:
7860 classes[0] = X86_64_SSE_CLASS;
7861 classes[1] = X86_64_SSEUP_CLASS;
7862 return 2;
7863 case E_V1TImode:
7864 case E_V1DImode:
7865 case E_V2SFmode:
7866 case E_V2SImode:
7867 case E_V4HImode:
7868 case E_V8QImode:
7869 classes[0] = X86_64_SSE_CLASS;
7870 return 1;
7871 case E_BLKmode:
7872 case E_VOIDmode:
7873 return 0;
7874 default:
7875 gcc_assert (VECTOR_MODE_P (mode));
7876
7877 if (bytes > 16)
7878 return 0;
7879
7880 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
7881
7882 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
7883 classes[0] = X86_64_INTEGERSI_CLASS;
7884 else
7885 classes[0] = X86_64_INTEGER_CLASS;
7886 classes[1] = X86_64_INTEGER_CLASS;
7887 return 1 + (bytes > 8);
7888 }
7889 }
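
/* For example (a sketch, assuming the SysV x86-64 ABI), a 24-byte

     struct s { double a, b, c; };

   is classified to memory (classify_argument returns 0), because with
   more than two eightbytes the first class must be X86_64_SSE_CLASS and
   the rest X86_64_SSEUP_CLASS, which only single large vectors such as
   __m256 or __m512 satisfy.  */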
7890
7891 /* Examine the argument and set the number of registers required in each
7892 class. Return true iff the parameter should be passed in memory. */
7893
7894 static bool
7895 examine_argument (machine_mode mode, const_tree type, int in_return,
7896 int *int_nregs, int *sse_nregs)
7897 {
7898 enum x86_64_reg_class regclass[MAX_CLASSES];
7899 int n = classify_argument (mode, type, regclass, 0);
7900
7901 *int_nregs = 0;
7902 *sse_nregs = 0;
7903
7904 if (!n)
7905 return true;
7906 for (n--; n >= 0; n--)
7907 switch (regclass[n])
7908 {
7909 case X86_64_INTEGER_CLASS:
7910 case X86_64_INTEGERSI_CLASS:
7911 (*int_nregs)++;
7912 break;
7913 case X86_64_SSE_CLASS:
7914 case X86_64_SSESF_CLASS:
7915 case X86_64_SSEDF_CLASS:
7916 (*sse_nregs)++;
7917 break;
7918 case X86_64_NO_CLASS:
7919 case X86_64_SSEUP_CLASS:
7920 break;
7921 case X86_64_X87_CLASS:
7922 case X86_64_X87UP_CLASS:
7923 case X86_64_COMPLEX_X87_CLASS:
7924 if (!in_return)
7925 return true;
7926 break;
7927 case X86_64_MEMORY_CLASS:
7928 gcc_unreachable ();
7929 }
7930
7931 return false;
7932 }
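
/* For instance, for

     struct s { double d; long l; };

   classify_argument yields { X86_64_SSEDF_CLASS, X86_64_INTEGER_CLASS },
   so examine_argument reports *int_nregs == 1 and *sse_nregs == 1 and
   returns false: the first eightbyte travels in an SSE register, the
   second in a general-purpose register.  */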
7933
7934 /* Construct container for the argument used by GCC interface. See
7935 FUNCTION_ARG for the detailed description. */
7936
7937 static rtx
7938 construct_container (machine_mode mode, machine_mode orig_mode,
7939 const_tree type, int in_return, int nintregs, int nsseregs,
7940 const int *intreg, int sse_regno)
7941 {
7942 /* The following variables hold the static issued_error state. */
7943 static bool issued_sse_arg_error;
7944 static bool issued_sse_ret_error;
7945 static bool issued_x87_ret_error;
7946
7947 machine_mode tmpmode;
7948 int bytes =
7949 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
7950 enum x86_64_reg_class regclass[MAX_CLASSES];
7951 int n;
7952 int i;
7953 int nexps = 0;
7954 int needed_sseregs, needed_intregs;
7955 rtx exp[MAX_CLASSES];
7956 rtx ret;
7957
7958 n = classify_argument (mode, type, regclass, 0);
7959 if (!n)
7960 return NULL;
7961 if (examine_argument (mode, type, in_return, &needed_intregs,
7962 &needed_sseregs))
7963 return NULL;
7964 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
7965 return NULL;
7966
7967 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
7968 some less clueful developer tries to use floating-point anyway. */
7969 if (needed_sseregs && !TARGET_SSE)
7970 {
7971 if (in_return)
7972 {
7973 if (!issued_sse_ret_error)
7974 {
7975 error ("SSE register return with SSE disabled");
7976 issued_sse_ret_error = true;
7977 }
7978 }
7979 else if (!issued_sse_arg_error)
7980 {
7981 error ("SSE register argument with SSE disabled");
7982 issued_sse_arg_error = true;
7983 }
7984 return NULL;
7985 }
7986
7987 /* Likewise, error if the ABI requires us to return values in the
7988 x87 registers and the user specified -mno-80387. */
7989 if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return)
7990 for (i = 0; i < n; i++)
7991 if (regclass[i] == X86_64_X87_CLASS
7992 || regclass[i] == X86_64_X87UP_CLASS
7993 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
7994 {
7995 if (!issued_x87_ret_error)
7996 {
7997 error ("x87 register return with x87 disabled");
7998 issued_x87_ret_error = true;
7999 }
8000 return NULL;
8001 }
8002
8003 /* First construct the simple cases. Avoid SCmode, since we want to use
8004 a single register to pass this type. */
8005 if (n == 1 && mode != SCmode)
8006 switch (regclass[0])
8007 {
8008 case X86_64_INTEGER_CLASS:
8009 case X86_64_INTEGERSI_CLASS:
8010 return gen_rtx_REG (mode, intreg[0]);
8011 case X86_64_SSE_CLASS:
8012 case X86_64_SSESF_CLASS:
8013 case X86_64_SSEDF_CLASS:
8014 if (mode != BLKmode)
8015 return gen_reg_or_parallel (mode, orig_mode,
8016 SSE_REGNO (sse_regno));
8017 break;
8018 case X86_64_X87_CLASS:
8019 case X86_64_COMPLEX_X87_CLASS:
8020 return gen_rtx_REG (mode, FIRST_STACK_REG);
8021 case X86_64_NO_CLASS:
8022 /* Zero sized array, struct or class. */
8023 return NULL;
8024 default:
8025 gcc_unreachable ();
8026 }
8027 if (n == 2
8028 && regclass[0] == X86_64_SSE_CLASS
8029 && regclass[1] == X86_64_SSEUP_CLASS
8030 && mode != BLKmode)
8031 return gen_reg_or_parallel (mode, orig_mode,
8032 SSE_REGNO (sse_regno));
8033 if (n == 4
8034 && regclass[0] == X86_64_SSE_CLASS
8035 && regclass[1] == X86_64_SSEUP_CLASS
8036 && regclass[2] == X86_64_SSEUP_CLASS
8037 && regclass[3] == X86_64_SSEUP_CLASS
8038 && mode != BLKmode)
8039 return gen_reg_or_parallel (mode, orig_mode,
8040 SSE_REGNO (sse_regno));
8041 if (n == 8
8042 && regclass[0] == X86_64_SSE_CLASS
8043 && regclass[1] == X86_64_SSEUP_CLASS
8044 && regclass[2] == X86_64_SSEUP_CLASS
8045 && regclass[3] == X86_64_SSEUP_CLASS
8046 && regclass[4] == X86_64_SSEUP_CLASS
8047 && regclass[5] == X86_64_SSEUP_CLASS
8048 && regclass[6] == X86_64_SSEUP_CLASS
8049 && regclass[7] == X86_64_SSEUP_CLASS
8050 && mode != BLKmode)
8051 return gen_reg_or_parallel (mode, orig_mode,
8052 SSE_REGNO (sse_regno));
8053 if (n == 2
8054 && regclass[0] == X86_64_X87_CLASS
8055 && regclass[1] == X86_64_X87UP_CLASS)
8056 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
8057
8058 if (n == 2
8059 && regclass[0] == X86_64_INTEGER_CLASS
8060 && regclass[1] == X86_64_INTEGER_CLASS
8061 && (mode == CDImode || mode == TImode)
8062 && intreg[0] + 1 == intreg[1])
8063 return gen_rtx_REG (mode, intreg[0]);
8064
8065 /* Otherwise figure out the entries of the PARALLEL. */
8066 for (i = 0; i < n; i++)
8067 {
8068 int pos;
8069
8070 switch (regclass[i])
8071 {
8072 case X86_64_NO_CLASS:
8073 break;
8074 case X86_64_INTEGER_CLASS:
8075 case X86_64_INTEGERSI_CLASS:
8076 /* Merge TImodes on aligned occasions here too. */
8077 if (i * 8 + 8 > bytes)
8078 {
8079 unsigned int tmpbits = (bytes - i * 8) * BITS_PER_UNIT;
8080 if (!int_mode_for_size (tmpbits, 0).exists (&tmpmode))
8081 /* We've requested 24 bytes for which
8082 we don't have a mode. Use DImode. */
8083 tmpmode = DImode;
8084 }
8085 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
8086 tmpmode = SImode;
8087 else
8088 tmpmode = DImode;
8089 exp [nexps++]
8090 = gen_rtx_EXPR_LIST (VOIDmode,
8091 gen_rtx_REG (tmpmode, *intreg),
8092 GEN_INT (i*8));
8093 intreg++;
8094 break;
8095 case X86_64_SSESF_CLASS:
8096 exp [nexps++]
8097 = gen_rtx_EXPR_LIST (VOIDmode,
8098 gen_rtx_REG (SFmode,
8099 SSE_REGNO (sse_regno)),
8100 GEN_INT (i*8));
8101 sse_regno++;
8102 break;
8103 case X86_64_SSEDF_CLASS:
8104 exp [nexps++]
8105 = gen_rtx_EXPR_LIST (VOIDmode,
8106 gen_rtx_REG (DFmode,
8107 SSE_REGNO (sse_regno)),
8108 GEN_INT (i*8));
8109 sse_regno++;
8110 break;
8111 case X86_64_SSE_CLASS:
8112 pos = i;
8113 switch (n)
8114 {
8115 case 1:
8116 tmpmode = DImode;
8117 break;
8118 case 2:
8119 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
8120 {
8121 tmpmode = TImode;
8122 i++;
8123 }
8124 else
8125 tmpmode = DImode;
8126 break;
8127 case 4:
8128 gcc_assert (i == 0
8129 && regclass[1] == X86_64_SSEUP_CLASS
8130 && regclass[2] == X86_64_SSEUP_CLASS
8131 && regclass[3] == X86_64_SSEUP_CLASS);
8132 tmpmode = OImode;
8133 i += 3;
8134 break;
8135 case 8:
8136 gcc_assert (i == 0
8137 && regclass[1] == X86_64_SSEUP_CLASS
8138 && regclass[2] == X86_64_SSEUP_CLASS
8139 && regclass[3] == X86_64_SSEUP_CLASS
8140 && regclass[4] == X86_64_SSEUP_CLASS
8141 && regclass[5] == X86_64_SSEUP_CLASS
8142 && regclass[6] == X86_64_SSEUP_CLASS
8143 && regclass[7] == X86_64_SSEUP_CLASS);
8144 tmpmode = XImode;
8145 i += 7;
8146 break;
8147 default:
8148 gcc_unreachable ();
8149 }
8150 exp [nexps++]
8151 = gen_rtx_EXPR_LIST (VOIDmode,
8152 gen_rtx_REG (tmpmode,
8153 SSE_REGNO (sse_regno)),
8154 GEN_INT (pos*8));
8155 sse_regno++;
8156 break;
8157 default:
8158 gcc_unreachable ();
8159 }
8160 }
8161
8162 /* Empty aligned struct, union or class. */
8163 if (nexps == 0)
8164 return NULL;
8165
8166 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
8167 for (i = 0; i < nexps; i++)
8168 XVECEXP (ret, 0, i) = exp [i];
8169 return ret;
8170 }
8171
8172 /* Update the data in CUM to advance over an argument of mode MODE
8173 and data type TYPE. (TYPE is null for libcalls where that information
8174 may not be available.)
8175
8176 Return the number of integer registers advanced over. */
8177
8178 static int
8179 function_arg_advance_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8180 const_tree type, HOST_WIDE_INT bytes,
8181 HOST_WIDE_INT words)
8182 {
8183 int res = 0;
8184 bool error_p = false;
8185
8186 if (TARGET_IAMCU)
8187 {
8188 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8189 bytes in registers. */
8190 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8191 goto pass_in_reg;
8192 return res;
8193 }
8194
8195 switch (mode)
8196 {
8197 default:
8198 break;
8199
8200 case E_BLKmode:
8201 if (bytes < 0)
8202 break;
8203 /* FALLTHRU */
8204
8205 case E_DImode:
8206 case E_SImode:
8207 case E_HImode:
8208 case E_QImode:
8209 pass_in_reg:
8210 cum->words += words;
8211 cum->nregs -= words;
8212 cum->regno += words;
8213 if (cum->nregs >= 0)
8214 res = words;
8215 if (cum->nregs <= 0)
8216 {
8217 cum->nregs = 0;
8218 cfun->machine->arg_reg_available = false;
8219 cum->regno = 0;
8220 }
8221 break;
8222
8223 case E_OImode:
8224 /* OImode shouldn't be used directly. */
8225 gcc_unreachable ();
8226
8227 case E_DFmode:
8228 if (cum->float_in_sse == -1)
8229 error_p = true;
8230 if (cum->float_in_sse < 2)
8231 break;
8232 /* FALLTHRU */
8233 case E_SFmode:
8234 if (cum->float_in_sse == -1)
8235 error_p = true;
8236 if (cum->float_in_sse < 1)
8237 break;
8238 /* FALLTHRU */
8239
8240 case E_V8SFmode:
8241 case E_V8SImode:
8242 case E_V64QImode:
8243 case E_V32HImode:
8244 case E_V16SImode:
8245 case E_V8DImode:
8246 case E_V16SFmode:
8247 case E_V8DFmode:
8248 case E_V32QImode:
8249 case E_V16HImode:
8250 case E_V4DFmode:
8251 case E_V4DImode:
8252 case E_TImode:
8253 case E_V16QImode:
8254 case E_V8HImode:
8255 case E_V4SImode:
8256 case E_V2DImode:
8257 case E_V4SFmode:
8258 case E_V2DFmode:
8259 if (!type || !AGGREGATE_TYPE_P (type))
8260 {
8261 cum->sse_words += words;
8262 cum->sse_nregs -= 1;
8263 cum->sse_regno += 1;
8264 if (cum->sse_nregs <= 0)
8265 {
8266 cum->sse_nregs = 0;
8267 cum->sse_regno = 0;
8268 }
8269 }
8270 break;
8271
8272 case E_V8QImode:
8273 case E_V4HImode:
8274 case E_V2SImode:
8275 case E_V2SFmode:
8276 case E_V1TImode:
8277 case E_V1DImode:
8278 if (!type || !AGGREGATE_TYPE_P (type))
8279 {
8280 cum->mmx_words += words;
8281 cum->mmx_nregs -= 1;
8282 cum->mmx_regno += 1;
8283 if (cum->mmx_nregs <= 0)
8284 {
8285 cum->mmx_nregs = 0;
8286 cum->mmx_regno = 0;
8287 }
8288 }
8289 break;
8290 }
8291 if (error_p)
8292 {
8293 cum->float_in_sse = 0;
8294 error ("calling %qD with SSE calling convention without "
8295 "SSE/SSE2 enabled", cum->decl);
8296 sorry ("this is a GCC bug that can be worked around by adding "
8297 "attribute used to function called");
8298 }
8299
8300 return res;
8301 }
8302
8303 static int
8304 function_arg_advance_64 (CUMULATIVE_ARGS *cum, machine_mode mode,
8305 const_tree type, HOST_WIDE_INT words, bool named)
8306 {
8307 int int_nregs, sse_nregs;
8308
8309 /* Unnamed 512-bit and 256-bit vector mode parameters are passed on the stack. */
8310 if (!named && (VALID_AVX512F_REG_MODE (mode)
8311 || VALID_AVX256_REG_MODE (mode)))
8312 return 0;
8313
8314 if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
8315 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
8316 {
8317 cum->nregs -= int_nregs;
8318 cum->sse_nregs -= sse_nregs;
8319 cum->regno += int_nregs;
8320 cum->sse_regno += sse_nregs;
8321 return int_nregs;
8322 }
8323 else
8324 {
8325 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
8326 cum->words = ROUND_UP (cum->words, align);
8327 cum->words += words;
8328 return 0;
8329 }
8330 }
8331
8332 static int
8333 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
8334 HOST_WIDE_INT words)
8335 {
8336 /* Otherwise, this should be passed indirectly. */
8337 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
8338
8339 cum->words += words;
8340 if (cum->nregs > 0)
8341 {
8342 cum->nregs -= 1;
8343 cum->regno += 1;
8344 return 1;
8345 }
8346 return 0;
8347 }
8348
8349 /* Update the data in CUM to advance over an argument of mode MODE and
8350 data type TYPE. (TYPE is null for libcalls where that information
8351 may not be available.) */
8352
8353 static void
8354 ix86_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
8355 const_tree type, bool named)
8356 {
8357 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8358 HOST_WIDE_INT bytes, words;
8359 int nregs;
8360
8361 /* The argument of an interrupt handler is a special case and is
8362 handled in ix86_function_arg. */
8363 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8364 return;
8365
8366 if (mode == BLKmode)
8367 bytes = int_size_in_bytes (type);
8368 else
8369 bytes = GET_MODE_SIZE (mode);
8370 words = CEIL (bytes, UNITS_PER_WORD);
8371
8372 if (type)
8373 mode = type_natural_mode (type, NULL, false);
8374
8375 if ((type && POINTER_BOUNDS_TYPE_P (type))
8376 || POINTER_BOUNDS_MODE_P (mode))
8377 {
8378 /* If we pass bounds in the BT then just update the remaining bounds count. */
8379 if (cum->bnds_in_bt)
8380 {
8381 cum->bnds_in_bt--;
8382 return;
8383 }
8384
8385 /* Update the remaining number of bounds to force. */
8386 if (cum->force_bnd_pass)
8387 cum->force_bnd_pass--;
8388
8389 cum->bnd_regno++;
8390
8391 return;
8392 }
8393
8394 /* The first arg not going to Bounds Tables resets this counter. */
8395 cum->bnds_in_bt = 0;
8396 /* For unnamed args we always pass bounds to avoid a bounds mismatch when
8397 the passed and received types do not match. If bounds do not follow an
8398 unnamed arg, still pretend the required number of bounds were passed. */
8399 if (cum->force_bnd_pass)
8400 {
8401 cum->bnd_regno += cum->force_bnd_pass;
8402 cum->force_bnd_pass = 0;
8403 }
8404
8405 if (TARGET_64BIT)
8406 {
8407 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8408
8409 if (call_abi == MS_ABI)
8410 nregs = function_arg_advance_ms_64 (cum, bytes, words);
8411 else
8412 nregs = function_arg_advance_64 (cum, mode, type, words, named);
8413 }
8414 else
8415 nregs = function_arg_advance_32 (cum, mode, type, bytes, words);
8416
8417 /* For stdarg we expect bounds to be passed for each value passed
8418 in a register. */
8419 if (cum->stdarg)
8420 cum->force_bnd_pass = nregs;
8421 /* For pointers passed in memory we expect bounds to be passed in the
8422 Bounds Table. */
8423 if (!nregs)
8424 {
8425 /* Track if there are outgoing arguments on stack. */
8426 if (cum->caller)
8427 cfun->machine->outgoing_args_on_stack = true;
8428
8429 cum->bnds_in_bt = chkp_type_bounds_count (type);
8430 }
8431 }
8432
8433 /* Define where to put the arguments to a function.
8434 Value is zero to push the argument on the stack,
8435 or a hard register in which to store the argument.
8436
8437 MODE is the argument's machine mode.
8438 TYPE is the data type of the argument (as a tree).
8439 This is null for libcalls where that information may
8440 not be available.
8441 CUM is a variable of type CUMULATIVE_ARGS which gives info about
8442 the preceding args and about the function being called.
8443 NAMED is nonzero if this argument is a named parameter
8444 (otherwise it is an extra parameter matching an ellipsis). */
8445
8446 static rtx
8447 function_arg_32 (CUMULATIVE_ARGS *cum, machine_mode mode,
8448 machine_mode orig_mode, const_tree type,
8449 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
8450 {
8451 bool error_p = false;
8452
8453 /* Avoid the AL settings for the Unix64 ABI. */
8454 if (mode == VOIDmode)
8455 return constm1_rtx;
8456
8457 if (TARGET_IAMCU)
8458 {
8459 /* Intel MCU psABI passes scalars and aggregates no larger than 8
8460 bytes in registers. */
8461 if (!VECTOR_MODE_P (mode) && bytes <= 8)
8462 goto pass_in_reg;
8463 return NULL_RTX;
8464 }
8465
8466 switch (mode)
8467 {
8468 default:
8469 break;
8470
8471 case E_BLKmode:
8472 if (bytes < 0)
8473 break;
8474 /* FALLTHRU */
8475 case E_DImode:
8476 case E_SImode:
8477 case E_HImode:
8478 case E_QImode:
8479 pass_in_reg:
8480 if (words <= cum->nregs)
8481 {
8482 int regno = cum->regno;
8483
8484 /* Fastcall allocates the first two DWORD (SImode) or
8485 smaller arguments to ECX and EDX, provided the argument
8486 is not an aggregate type. */
8487 if (cum->fastcall)
8488 {
8489 if (mode == BLKmode
8490 || mode == DImode
8491 || (type && AGGREGATE_TYPE_P (type)))
8492 break;
8493
8494 /* ECX, not EAX, is the first allocated register. */
8495 if (regno == AX_REG)
8496 regno = CX_REG;
8497 }
8498 return gen_rtx_REG (mode, regno);
8499 }
8500 break;
8501
8502 case E_DFmode:
8503 if (cum->float_in_sse == -1)
8504 error_p = true;
8505 if (cum->float_in_sse < 2)
8506 break;
8507 /* FALLTHRU */
8508 case E_SFmode:
8509 if (cum->float_in_sse == -1)
8510 error_p = true;
8511 if (cum->float_in_sse < 1)
8512 break;
8513 /* FALLTHRU */
8514 case E_TImode:
8515 /* In 32bit, we pass TImode in xmm registers. */
8516 case E_V16QImode:
8517 case E_V8HImode:
8518 case E_V4SImode:
8519 case E_V2DImode:
8520 case E_V4SFmode:
8521 case E_V2DFmode:
8522 if (!type || !AGGREGATE_TYPE_P (type))
8523 {
8524 if (cum->sse_nregs)
8525 return gen_reg_or_parallel (mode, orig_mode,
8526 cum->sse_regno + FIRST_SSE_REG);
8527 }
8528 break;
8529
8530 case E_OImode:
8531 case E_XImode:
8532 /* OImode and XImode shouldn't be used directly. */
8533 gcc_unreachable ();
8534
8535 case E_V64QImode:
8536 case E_V32HImode:
8537 case E_V16SImode:
8538 case E_V8DImode:
8539 case E_V16SFmode:
8540 case E_V8DFmode:
8541 case E_V8SFmode:
8542 case E_V8SImode:
8543 case E_V32QImode:
8544 case E_V16HImode:
8545 case E_V4DFmode:
8546 case E_V4DImode:
8547 if (!type || !AGGREGATE_TYPE_P (type))
8548 {
8549 if (cum->sse_nregs)
8550 return gen_reg_or_parallel (mode, orig_mode,
8551 cum->sse_regno + FIRST_SSE_REG);
8552 }
8553 break;
8554
8555 case E_V8QImode:
8556 case E_V4HImode:
8557 case E_V2SImode:
8558 case E_V2SFmode:
8559 case E_V1TImode:
8560 case E_V1DImode:
8561 if (!type || !AGGREGATE_TYPE_P (type))
8562 {
8563 if (cum->mmx_nregs)
8564 return gen_reg_or_parallel (mode, orig_mode,
8565 cum->mmx_regno + FIRST_MMX_REG);
8566 }
8567 break;
8568 }
8569 if (error_p)
8570 {
8571 cum->float_in_sse = 0;
8572 error ("calling %qD with SSE calling convention without "
8573 "SSE/SSE2 enabled", cum->decl);
8574 sorry ("this is a GCC bug that can be worked around by adding "
8575 "attribute used to function called");
8576 }
8577
8578 return NULL_RTX;
8579 }
8580
8581 static rtx
8582 function_arg_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8583 machine_mode orig_mode, const_tree type, bool named)
8584 {
8585 /* Handle a hidden AL argument containing the number of SSE registers
8586 used, for varargs x86-64 functions. */
8587 if (mode == VOIDmode)
8588 return GEN_INT (cum->maybe_vaarg
8589 ? (cum->sse_nregs < 0
8590 ? X86_64_SSE_REGPARM_MAX
8591 : cum->sse_regno)
8592 : -1);
8593
8594 switch (mode)
8595 {
8596 default:
8597 break;
8598
8599 case E_V8SFmode:
8600 case E_V8SImode:
8601 case E_V32QImode:
8602 case E_V16HImode:
8603 case E_V4DFmode:
8604 case E_V4DImode:
8605 case E_V16SFmode:
8606 case E_V16SImode:
8607 case E_V64QImode:
8608 case E_V32HImode:
8609 case E_V8DFmode:
8610 case E_V8DImode:
8611 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
8612 if (!named)
8613 return NULL;
8614 break;
8615 }
8616
8617 return construct_container (mode, orig_mode, type, 0, cum->nregs,
8618 cum->sse_nregs,
8619 &x86_64_int_parameter_registers [cum->regno],
8620 cum->sse_regno);
8621 }
8622
8623 static rtx
8624 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, machine_mode mode,
8625 machine_mode orig_mode, bool named,
8626 HOST_WIDE_INT bytes)
8627 {
8628 unsigned int regno;
8629
8630 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
8631 We use the value -2 to specify that the current function call is MS ABI. */
8632 if (mode == VOIDmode)
8633 return GEN_INT (-2);
8634
8635 /* If we've run out of registers, it goes on the stack. */
8636 if (cum->nregs == 0)
8637 return NULL_RTX;
8638
8639 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
8640
8641 /* Only floating point modes are passed in anything but integer regs. */
8642 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
8643 {
8644 if (named)
8645 regno = cum->regno + FIRST_SSE_REG;
8646 else
8647 {
8648 rtx t1, t2;
8649
8650 /* Unnamed floating parameters are passed in both the
8651 SSE and integer registers. */
8652 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
8653 t2 = gen_rtx_REG (mode, regno);
8654 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
8655 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
8656 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
8657 }
8658 }
8659 /* Handle aggregate types passed in registers. */
8660 if (orig_mode == BLKmode)
8661 {
8662 if (bytes > 0 && bytes <= 8)
8663 mode = (bytes > 4 ? DImode : SImode);
8664 if (mode == BLKmode)
8665 mode = DImode;
8666 }
8667
8668 return gen_reg_or_parallel (mode, orig_mode, regno);
8669 }
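
/* A sketch of the unnamed-float case above: for a varargs call such as

     printf ("%f", 3.0);

   under the MS ABI the double is the second argument, so the PARALLEL
   places it in both XMM1 and RDX, allowing the callee to spill it from
   the integer register when walking the va_list.  */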
8670
8671 /* Return where to put the arguments to a function.
8672 Return zero to push the argument on the stack, or a hard register in which to store the argument.
8673
8674 MODE is the argument's machine mode. TYPE is the data type of the
8675 argument. It is null for libcalls where that information may not be
8676 available. CUM gives information about the preceding args and about
8677 the function being called. NAMED is nonzero if this argument is a
8678 named parameter (otherwise it is an extra parameter matching an
8679 ellipsis). */
8680
8681 static rtx
8682 ix86_function_arg (cumulative_args_t cum_v, machine_mode omode,
8683 const_tree type, bool named)
8684 {
8685 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8686 machine_mode mode = omode;
8687 HOST_WIDE_INT bytes, words;
8688 rtx arg;
8689
8690 if (!cum->caller && cfun->machine->func_type != TYPE_NORMAL)
8691 {
8692 gcc_assert (type != NULL_TREE);
8693 if (POINTER_TYPE_P (type))
8694 {
8695 /* This is the pointer argument. */
8696 gcc_assert (TYPE_MODE (type) == Pmode);
8697 /* It is at -WORD(AP) in the current frame in interrupt and
8698 exception handlers. */
8699 arg = plus_constant (Pmode, arg_pointer_rtx, -UNITS_PER_WORD);
8700 }
8701 else
8702 {
8703 gcc_assert (cfun->machine->func_type == TYPE_EXCEPTION
8704 && TREE_CODE (type) == INTEGER_TYPE
8705 && TYPE_MODE (type) == word_mode);
8706 /* The error code is the word-mode integer argument at
8707 -2 * WORD(AP) in the current frame of the exception
8708 handler. */
8709 arg = gen_rtx_MEM (word_mode,
8710 plus_constant (Pmode,
8711 arg_pointer_rtx,
8712 -2 * UNITS_PER_WORD));
8713 }
8714 return arg;
8715 }
8716
8717 /* All pointer bounds arguments are handled separately here. */
8718 if ((type && POINTER_BOUNDS_TYPE_P (type))
8719 || POINTER_BOUNDS_MODE_P (mode))
8720 {
8721 /* Return NULL if bounds are forced to go in Bounds Table. */
8722 if (cum->bnds_in_bt)
8723 arg = NULL;
8724 /* Return the next available bound reg if any. */
8725 else if (cum->bnd_regno <= LAST_BND_REG)
8726 arg = gen_rtx_REG (BNDmode, cum->bnd_regno);
8727 /* Return the next special slot number otherwise. */
8728 else
8729 arg = GEN_INT (cum->bnd_regno - LAST_BND_REG - 1);
8730
8731 return arg;
8732 }
8733
8734 if (mode == BLKmode)
8735 bytes = int_size_in_bytes (type);
8736 else
8737 bytes = GET_MODE_SIZE (mode);
8738 words = CEIL (bytes, UNITS_PER_WORD);
8739
8740 /* To simplify the code below, represent vector types with a vector mode
8741 even if MMX/SSE are not active. */
8742 if (type && TREE_CODE (type) == VECTOR_TYPE)
8743 mode = type_natural_mode (type, cum, false);
8744
8745 if (TARGET_64BIT)
8746 {
8747 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8748
8749 if (call_abi == MS_ABI)
8750 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
8751 else
8752 arg = function_arg_64 (cum, mode, omode, type, named);
8753 }
8754 else
8755 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
8756
8757 /* Track if there are outgoing arguments on stack. */
8758 if (arg == NULL_RTX && cum->caller)
8759 cfun->machine->outgoing_args_on_stack = true;
8760
8761 return arg;
8762 }
8763
8764 /* A C expression that indicates when an argument must be passed by
8765 reference. If nonzero for an argument, a copy of that argument is
8766 made in memory and a pointer to the argument is passed instead of
8767 the argument itself. The pointer is passed in whatever way is
8768 appropriate for passing a pointer to that type. */
8769
8770 static bool
8771 ix86_pass_by_reference (cumulative_args_t cum_v, machine_mode mode,
8772 const_tree type, bool)
8773 {
8774 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8775
8776 /* Bounds are never passed by reference. */
8777 if ((type && POINTER_BOUNDS_TYPE_P (type))
8778 || POINTER_BOUNDS_MODE_P (mode))
8779 return false;
8780
8781 if (TARGET_64BIT)
8782 {
8783 enum calling_abi call_abi = cum ? cum->call_abi : ix86_abi;
8784
8785 /* See Windows x64 Software Convention. */
8786 if (call_abi == MS_ABI)
8787 {
8788 HOST_WIDE_INT msize = GET_MODE_SIZE (mode);
8789
8790 if (type)
8791 {
8792 /* Arrays are passed by reference. */
8793 if (TREE_CODE (type) == ARRAY_TYPE)
8794 return true;
8795
8796 if (RECORD_OR_UNION_TYPE_P (type))
8797 {
8798 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
8799 are passed by reference. */
8800 msize = int_size_in_bytes (type);
8801 }
8802 }
8803
8804 /* __m128 is passed by reference. */
8805 return msize != 1 && msize != 2 && msize != 4 && msize != 8;
8806 }
8807 else if (type && int_size_in_bytes (type) == -1)
8808 return true;
8809 }
8810
8811 return false;
8812 }
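
/* Illustrative examples of the MS ABI rules above:

     struct s8  { int a, b; };        8 bytes:  passed by value
     struct s12 { int a, b, c; };     12 bytes: passed by reference
     __m128 v;                        16 bytes: passed by reference

   since only objects of exactly 1, 2, 4 or 8 bytes are passed
   directly.  */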
8813
8814 /* Return true when TYPE should be 128bit aligned for 32bit argument
8815 passing ABI. XXX: This function is obsolete and is only used for
8816 checking psABI compatibility with previous versions of GCC. */
8817
8818 static bool
8819 ix86_compat_aligned_value_p (const_tree type)
8820 {
8821 machine_mode mode = TYPE_MODE (type);
8822 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
8823 || mode == TDmode
8824 || mode == TFmode
8825 || mode == TCmode)
8826 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
8827 return true;
8828 if (TYPE_ALIGN (type) < 128)
8829 return false;
8830
8831 if (AGGREGATE_TYPE_P (type))
8832 {
8833 /* Walk the aggregates recursively. */
8834 switch (TREE_CODE (type))
8835 {
8836 case RECORD_TYPE:
8837 case UNION_TYPE:
8838 case QUAL_UNION_TYPE:
8839 {
8840 tree field;
8841
8842 /* Walk all the structure fields. */
8843 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
8844 {
8845 if (TREE_CODE (field) == FIELD_DECL
8846 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
8847 return true;
8848 }
8849 break;
8850 }
8851
8852 case ARRAY_TYPE:
8853 /* Just for use if some languages pass arrays by value. */
8854 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
8855 return true;
8856 break;
8857
8858 default:
8859 gcc_unreachable ();
8860 }
8861 }
8862 return false;
8863 }
8864
8865 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
8866 XXX: This function is obsolete and is only used for checking psABI
8867 compatibility with previous versions of GCC. */
8868
8869 static unsigned int
8870 ix86_compat_function_arg_boundary (machine_mode mode,
8871 const_tree type, unsigned int align)
8872 {
8873 /* In 32bit, only _Decimal128 and __float128 are aligned to their
8874 natural boundaries. */
8875 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
8876 {
8877 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
8878 make an exception for SSE modes since these require 128bit
8879 alignment.
8880
8881 The handling here differs from field_alignment. ICC aligns MMX
8882 arguments to 4 byte boundaries, while structure fields are aligned
8883 to 8 byte boundaries. */
8884 if (!type)
8885 {
8886 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
8887 align = PARM_BOUNDARY;
8888 }
8889 else
8890 {
8891 if (!ix86_compat_aligned_value_p (type))
8892 align = PARM_BOUNDARY;
8893 }
8894 }
8895 if (align > BIGGEST_ALIGNMENT)
8896 align = BIGGEST_ALIGNMENT;
8897 return align;
8898 }
8899
8900 /* Return true when TYPE should be 128bit aligned for 32bit argument
8901 passing ABI. */
8902
8903 static bool
8904 ix86_contains_aligned_value_p (const_tree type)
8905 {
8906 machine_mode mode = TYPE_MODE (type);
8907
8908 if (mode == XFmode || mode == XCmode)
8909 return false;
8910
8911 if (TYPE_ALIGN (type) < 128)
8912 return false;
8913
8914 if (AGGREGATE_TYPE_P (type))
8915 {
8916 /* Walk the aggregates recursively. */
8917 switch (TREE_CODE (type))
8918 {
8919 case RECORD_TYPE:
8920 case UNION_TYPE:
8921 case QUAL_UNION_TYPE:
8922 {
8923 tree field;
8924
8925 /* Walk all the structure fields. */
8926 for (field = TYPE_FIELDS (type);
8927 field;
8928 field = DECL_CHAIN (field))
8929 {
8930 if (TREE_CODE (field) == FIELD_DECL
8931 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
8932 return true;
8933 }
8934 break;
8935 }
8936
8937 case ARRAY_TYPE:
8938 /* Just for use if some languages passes arrays by value. */
8939 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
8940 return true;
8941 break;
8942
8943 default:
8944 gcc_unreachable ();
8945 }
8946 }
8947 else
8948 return TYPE_ALIGN (type) >= 128;
8949
8950 return false;
8951 }
8952
8953 /* Gives the alignment boundary, in bits, of an argument with the
8954 specified mode and type. */
8955
8956 static unsigned int
8957 ix86_function_arg_boundary (machine_mode mode, const_tree type)
8958 {
8959 unsigned int align;
8960 if (type)
8961 {
8962 /* Since the call uses the main variant of the type, convert TYPE
8963 to its main variant before looking at its alignment. */
8964 type = TYPE_MAIN_VARIANT (type);
8965 align = TYPE_ALIGN (type);
8966 }
8967 else
8968 align = GET_MODE_ALIGNMENT (mode);
8969 if (align < PARM_BOUNDARY)
8970 align = PARM_BOUNDARY;
8971 else
8972 {
8973 static bool warned;
8974 unsigned int saved_align = align;
8975
8976 if (!TARGET_64BIT)
8977 {
8978 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
8979 if (!type)
8980 {
8981 if (mode == XFmode || mode == XCmode)
8982 align = PARM_BOUNDARY;
8983 }
8984 else if (!ix86_contains_aligned_value_p (type))
8985 align = PARM_BOUNDARY;
8986
8987 if (align < 128)
8988 align = PARM_BOUNDARY;
8989 }
8990
8991 if (warn_psabi
8992 && !warned
8993 && align != ix86_compat_function_arg_boundary (mode, type,
8994 saved_align))
8995 {
8996 warned = true;
8997 inform (input_location,
8998 "The ABI for passing parameters with %d-byte"
8999 " alignment has changed in GCC 4.6",
9000 align / BITS_PER_UNIT);
9001 }
9002 }
9003
9004 return align;
9005 }
9006
9007 /* Return true if N is a possible register number of function value. */
9008
9009 static bool
9010 ix86_function_value_regno_p (const unsigned int regno)
9011 {
9012 switch (regno)
9013 {
9014 case AX_REG:
9015 return true;
9016 case DX_REG:
9017 return (!TARGET_64BIT || ix86_cfun_abi () != MS_ABI);
9018 case DI_REG:
9019 case SI_REG:
9020 return TARGET_64BIT && ix86_cfun_abi () != MS_ABI;
9021
9022 case BND0_REG:
9023 case BND1_REG:
9024 return chkp_function_instrumented_p (current_function_decl);
9025
9026 /* Complex values are returned in %st(0)/%st(1) pair. */
9027 case ST0_REG:
9028 case ST1_REG:
9029 /* TODO: The function should depend on current function ABI but
9030 builtins.c would need updating then. Therefore we use the
9031 default ABI. */
9032 if (TARGET_64BIT && ix86_cfun_abi () == MS_ABI)
9033 return false;
9034 return TARGET_FLOAT_RETURNS_IN_80387;
9035
9036 /* Complex values are returned in %xmm0/%xmm1 pair. */
9037 case XMM0_REG:
9038 case XMM1_REG:
9039 return TARGET_SSE;
9040
9041 case MM0_REG:
9042 if (TARGET_MACHO || TARGET_64BIT)
9043 return false;
9044 return TARGET_MMX;
9045 }
9046
9047 return false;
9048 }
9049
9050 /* Define how to find the value returned by a function.
9051 VALTYPE is the data type of the value (as a tree).
9052 If the precise function being called is known, FUNC is its FUNCTION_DECL;
9053 otherwise, FUNC is 0. */
9054
9055 static rtx
9056 function_value_32 (machine_mode orig_mode, machine_mode mode,
9057 const_tree fntype, const_tree fn)
9058 {
9059 unsigned int regno;
9060
9061 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
9062 we normally prevent this case when mmx is not available. However
9063 some ABIs may require the result to be returned like DImode. */
9064 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
9065 regno = FIRST_MMX_REG;
9066
9067 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
9068 we prevent this case when sse is not available. However some ABIs
9069 may require the result to be returned like integer TImode. */
9070 else if (mode == TImode
9071 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
9072 regno = FIRST_SSE_REG;
9073
9074 /* 32-byte vector modes in %ymm0. */
9075 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
9076 regno = FIRST_SSE_REG;
9077
9078 /* 64-byte vector modes in %zmm0. */
9079 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
9080 regno = FIRST_SSE_REG;
9081
9082 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
9083 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
9084 regno = FIRST_FLOAT_REG;
9085 else
9086 /* Most things go in %eax. */
9087 regno = AX_REG;
9088
9089 /* Override FP return register with %xmm0 for local functions when
9090 SSE math is enabled or for functions with sseregparm attribute. */
9091 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
9092 {
9093 int sse_level = ix86_function_sseregparm (fntype, fn, false);
9094 if (sse_level == -1)
9095 {
9096 error ("calling %qD with SSE calling convention without "
9097 "SSE/SSE2 enabled", fn);
9098 sorry ("this is a GCC bug that can be worked around by adding "
9099 "attribute used to function called");
9100 }
9101 else if ((sse_level >= 1 && mode == SFmode)
9102 || (sse_level == 2 && mode == DFmode))
9103 regno = FIRST_SSE_REG;
9104 }
9105
9106 /* OImode shouldn't be used directly. */
9107 gcc_assert (mode != OImode);
9108
9109 return gen_rtx_REG (orig_mode, regno);
9110 }
9111
9112 static rtx
9113 function_value_64 (machine_mode orig_mode, machine_mode mode,
9114 const_tree valtype)
9115 {
9116 rtx ret;
9117
9118 /* Handle libcalls, which don't provide a type node. */
9119 if (valtype == NULL)
9120 {
9121 unsigned int regno;
9122
9123 switch (mode)
9124 {
9125 case E_SFmode:
9126 case E_SCmode:
9127 case E_DFmode:
9128 case E_DCmode:
9129 case E_TFmode:
9130 case E_SDmode:
9131 case E_DDmode:
9132 case E_TDmode:
9133 regno = FIRST_SSE_REG;
9134 break;
9135 case E_XFmode:
9136 case E_XCmode:
9137 regno = FIRST_FLOAT_REG;
9138 break;
9139 case E_TCmode:
9140 return NULL;
9141 default:
9142 regno = AX_REG;
9143 }
9144
9145 return gen_rtx_REG (mode, regno);
9146 }
9147 else if (POINTER_TYPE_P (valtype))
9148 {
9149 /* Pointers are always returned in word_mode. */
9150 mode = word_mode;
9151 }
9152
9153 ret = construct_container (mode, orig_mode, valtype, 1,
9154 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
9155 x86_64_int_return_registers, 0);
9156
9157 /* For zero-sized structures, construct_container returns NULL, but we
9158 need to keep the rest of the compiler happy by returning a meaningful value. */
9159 if (!ret)
9160 ret = gen_rtx_REG (orig_mode, AX_REG);
9161
9162 return ret;
9163 }
9164
9165 static rtx
9166 function_value_ms_64 (machine_mode orig_mode, machine_mode mode,
9167 const_tree valtype)
9168 {
9169 unsigned int regno = AX_REG;
9170
9171 if (TARGET_SSE)
9172 {
9173 switch (GET_MODE_SIZE (mode))
9174 {
9175 case 16:
9176 if (valtype != NULL_TREE
9177 && !VECTOR_INTEGER_TYPE_P (valtype)
9179 && !INTEGRAL_TYPE_P (valtype)
9180 && !VECTOR_FLOAT_TYPE_P (valtype))
9181 break;
9182 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9183 && !COMPLEX_MODE_P (mode))
9184 regno = FIRST_SSE_REG;
9185 break;
9186 case 8:
9187 case 4:
9188 if (mode == SFmode || mode == DFmode)
9189 regno = FIRST_SSE_REG;
9190 break;
9191 default:
9192 break;
9193 }
9194 }
9195 return gen_rtx_REG (orig_mode, regno);
9196 }
9197
9198 static rtx
9199 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
9200 machine_mode orig_mode, machine_mode mode)
9201 {
9202 const_tree fn, fntype;
9203
9204 fn = NULL_TREE;
9205 if (fntype_or_decl && DECL_P (fntype_or_decl))
9206 fn = fntype_or_decl;
9207 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
9208
9209 if ((valtype && POINTER_BOUNDS_TYPE_P (valtype))
9210 || POINTER_BOUNDS_MODE_P (mode))
9211 return gen_rtx_REG (BNDmode, FIRST_BND_REG);
9212 else if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
9213 return function_value_ms_64 (orig_mode, mode, valtype);
9214 else if (TARGET_64BIT)
9215 return function_value_64 (orig_mode, mode, valtype);
9216 else
9217 return function_value_32 (orig_mode, mode, fntype, fn);
9218 }
9219
9220 static rtx
9221 ix86_function_value (const_tree valtype, const_tree fntype_or_decl, bool)
9222 {
9223 machine_mode mode, orig_mode;
9224
9225 orig_mode = TYPE_MODE (valtype);
9226 mode = type_natural_mode (valtype, NULL, true);
9227 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
9228 }
9229
9230 /* Return an RTX representing a place where a function returns
9231 or receives pointer bounds, or NULL if no bounds are returned.
9232
9233 VALTYPE is a data type of a value returned by the function.
9234
9235 FN_DECL_OR_TYPE is a tree node representing FUNCTION_DECL
9236 or FUNCTION_TYPE of the function.
9237
9238 If OUTGOING is false, return a place in which the caller will
9239 see the return value. Otherwise, return a place where a
9240 function returns a value. */
9241
9242 static rtx
9243 ix86_function_value_bounds (const_tree valtype,
9244 const_tree fntype_or_decl ATTRIBUTE_UNUSED,
9245 bool outgoing ATTRIBUTE_UNUSED)
9246 {
9247 rtx res = NULL_RTX;
9248
9249 if (BOUNDED_TYPE_P (valtype))
9250 res = gen_rtx_REG (BNDmode, FIRST_BND_REG);
9251 else if (chkp_type_has_pointer (valtype))
9252 {
9253 bitmap slots;
9254 rtx bounds[2];
9255 bitmap_iterator bi;
9256 unsigned i, bnd_no = 0;
9257
9258 bitmap_obstack_initialize (NULL);
9259 slots = BITMAP_ALLOC (NULL);
9260 chkp_find_bound_slots (valtype, slots);
9261
9262 EXECUTE_IF_SET_IN_BITMAP (slots, 0, i, bi)
9263 {
9264 rtx reg = gen_rtx_REG (BNDmode, FIRST_BND_REG + bnd_no);
9265 rtx offs = GEN_INT (i * POINTER_SIZE / BITS_PER_UNIT);
9266 gcc_assert (bnd_no < 2);
9267 bounds[bnd_no++] = gen_rtx_EXPR_LIST (VOIDmode, reg, offs);
9268 }
9269
9270 res = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (bnd_no, bounds));
9271
9272 BITMAP_FREE (slots);
9273 bitmap_obstack_release (NULL);
9274 }
9275 else
9276 res = NULL_RTX;
9277
9278 return res;
9279 }
9280
9281 /* Pointer function arguments and return values are promoted to
9282 word_mode for normal functions. */
9283
9284 static machine_mode
9285 ix86_promote_function_mode (const_tree type, machine_mode mode,
9286 int *punsignedp, const_tree fntype,
9287 int for_return)
9288 {
9289 if (cfun->machine->func_type == TYPE_NORMAL
9290 && type != NULL_TREE
9291 && POINTER_TYPE_P (type))
9292 {
9293 *punsignedp = POINTERS_EXTEND_UNSIGNED;
9294 return word_mode;
9295 }
9296 return default_promote_function_mode (type, mode, punsignedp, fntype,
9297 for_return);
9298 }
9299
9300 /* Return true if a structure, union or array with MODE containing FIELD
9301 should be accessed using BLKmode. */
9302
9303 static bool
9304 ix86_member_type_forces_blk (const_tree field, machine_mode mode)
9305 {
9306 /* Union with XFmode must be in BLKmode. */
9307 return (mode == XFmode
9308 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
9309 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
9310 }
9311
9312 rtx
9313 ix86_libcall_value (machine_mode mode)
9314 {
9315 return ix86_function_value_1 (NULL, NULL, mode, mode);
9316 }
9317
9318 /* Return true iff type is returned in memory. */
9319
9320 static bool
9321 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
9322 {
9323 #ifdef SUBTARGET_RETURN_IN_MEMORY
9324 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
9325 #else
9326 const machine_mode mode = type_natural_mode (type, NULL, true);
9327 HOST_WIDE_INT size;
9328
9329 if (POINTER_BOUNDS_TYPE_P (type))
9330 return false;
9331
9332 if (TARGET_64BIT)
9333 {
9334 if (ix86_function_type_abi (fntype) == MS_ABI)
9335 {
9336 size = int_size_in_bytes (type);
9337
9338 /* __m128 is returned in xmm0. */
9339 if ((!type || VECTOR_INTEGER_TYPE_P (type)
9340 || INTEGRAL_TYPE_P (type)
9341 || VECTOR_FLOAT_TYPE_P (type))
9342 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
9343 && !COMPLEX_MODE_P (mode)
9344 && (GET_MODE_SIZE (mode) == 16 || size == 16))
9345 return false;
9346
9347 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes. */
9348 return size != 1 && size != 2 && size != 4 && size != 8;
9349 }
9350 else
9351 {
9352 int needed_intregs, needed_sseregs;
9353
9354 return examine_argument (mode, type, 1,
9355 &needed_intregs, &needed_sseregs);
9356 }
9357 }
9358 else
9359 {
9360 size = int_size_in_bytes (type);
9361
9362 /* Intel MCU psABI returns scalars and aggregates no larger than 8
9363 bytes in registers. */
9364 if (TARGET_IAMCU)
9365 return VECTOR_MODE_P (mode) || size < 0 || size > 8;
9366
9367 if (mode == BLKmode)
9368 return true;
9369
9370 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
9371 return false;
9372
9373 if (VECTOR_MODE_P (mode) || mode == TImode)
9374 {
9375 /* User-created vectors small enough to fit in EAX. */
9376 if (size < 8)
9377 return false;
9378
9379 /* Unless the ABI prescribes otherwise,
9380 MMX/3dNow values are returned in MM0 if available. */
9381
9382 if (size == 8)
9383 return TARGET_VECT8_RETURNS || !TARGET_MMX;
9384
9385 /* SSE values are returned in XMM0 if available. */
9386 if (size == 16)
9387 return !TARGET_SSE;
9388
9389 /* AVX values are returned in YMM0 if available. */
9390 if (size == 32)
9391 return !TARGET_AVX;
9392
9393 /* AVX512F values are returned in ZMM0 if available. */
9394 if (size == 64)
9395 return !TARGET_AVX512F;
9396 }
9397
9398 if (mode == XFmode)
9399 return false;
9400
9401 if (size > 12)
9402 return true;
9403
9404 /* OImode shouldn't be used directly. */
9405 gcc_assert (mode != OImode);
9406
9407 return false;
9408 }
9409 #endif
9410 }
9411
9412 \f
9413 /* Create the va_list data type. */
9414
9415 static tree
9416 ix86_build_builtin_va_list_64 (void)
9417 {
9418 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
9419
9420 record = lang_hooks.types.make_type (RECORD_TYPE);
9421 type_decl = build_decl (BUILTINS_LOCATION,
9422 TYPE_DECL, get_identifier ("__va_list_tag"), record);
9423
9424 f_gpr = build_decl (BUILTINS_LOCATION,
9425 FIELD_DECL, get_identifier ("gp_offset"),
9426 unsigned_type_node);
9427 f_fpr = build_decl (BUILTINS_LOCATION,
9428 FIELD_DECL, get_identifier ("fp_offset"),
9429 unsigned_type_node);
9430 f_ovf = build_decl (BUILTINS_LOCATION,
9431 FIELD_DECL, get_identifier ("overflow_arg_area"),
9432 ptr_type_node);
9433 f_sav = build_decl (BUILTINS_LOCATION,
9434 FIELD_DECL, get_identifier ("reg_save_area"),
9435 ptr_type_node);
9436
9437 va_list_gpr_counter_field = f_gpr;
9438 va_list_fpr_counter_field = f_fpr;
9439
9440 DECL_FIELD_CONTEXT (f_gpr) = record;
9441 DECL_FIELD_CONTEXT (f_fpr) = record;
9442 DECL_FIELD_CONTEXT (f_ovf) = record;
9443 DECL_FIELD_CONTEXT (f_sav) = record;
9444
9445 TYPE_STUB_DECL (record) = type_decl;
9446 TYPE_NAME (record) = type_decl;
9447 TYPE_FIELDS (record) = f_gpr;
9448 DECL_CHAIN (f_gpr) = f_fpr;
9449 DECL_CHAIN (f_fpr) = f_ovf;
9450 DECL_CHAIN (f_ovf) = f_sav;
9451
9452 layout_type (record);
9453
9454 TYPE_ATTRIBUTES (record) = tree_cons (get_identifier ("sysv_abi va_list"),
9455 NULL_TREE, TYPE_ATTRIBUTES (record));
9456
9457 /* The correct type is an array type of one element. */
9458 return build_array_type (record, build_index_type (size_zero_node));
9459 }
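/* For reference, the record built above matches the va_list type the
   SysV x86-64 psABI exposes to user code (a sketch using the field
   names from the builders above):

     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;

     typedef __va_list_tag va_list[1];
*/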
9460
9461 /* Setup the builtin va_list data type and for 64-bit the additional
9462 calling convention specific va_list data types. */
9463
9464 static tree
9465 ix86_build_builtin_va_list (void)
9466 {
9467 if (TARGET_64BIT)
9468 {
9469 /* Initialize ABI specific va_list builtin types.
9470
9471 In lto1, we can encounter two va_list types:
9472 - one as a result of the type-merge across TUs, and
9473 - the one constructed here.
9474 These two types will not have the same TYPE_MAIN_VARIANT, and therefore
9475 a type identity check in canonical_va_list_type based on
9476 TYPE_MAIN_VARIANT (which we used to have) will not work.
9477 Instead, we tag each va_list_type_node with its unique attribute, and
9478 look for the attribute in the type identity check in
9479 canonical_va_list_type.
9480
9481 Tagging sysv_va_list_type_node directly with the attribute is
9482 problematic since it's an array of one record, which will degrade into a
9483 pointer to record when used as parameter (see build_va_arg comments for
9484 an example), dropping the attribute in the process. So we tag the
9485 record instead. */
9486
9487 /* For SYSV_ABI we use an array of one record. */
9488 sysv_va_list_type_node = ix86_build_builtin_va_list_64 ();
9489
9490 /* For MS_ABI we use plain pointer to argument area. */
9491 tree char_ptr_type = build_pointer_type (char_type_node);
9492 tree attr = tree_cons (get_identifier ("ms_abi va_list"), NULL_TREE,
9493 TYPE_ATTRIBUTES (char_ptr_type));
9494 ms_va_list_type_node = build_type_attribute_variant (char_ptr_type, attr);
9495
9496 return ((ix86_abi == MS_ABI)
9497 ? ms_va_list_type_node
9498 : sysv_va_list_type_node);
9499 }
9500 else
9501 {
9502 /* For i386 we use plain pointer to argument area. */
9503 return build_pointer_type (char_type_node);
9504 }
9505 }
9506
9507 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
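/* setup_incoming_varargs_64 below materializes the register save area
   that reg_save_area points to: the (up to) six integer argument
   registers in 8-byte slots followed by the (up to) eight SSE argument
   registers in 16-byte slots; gp_offset and fp_offset in the va_list
   index into this block.  A sketch of the layout, assuming both parts
   are needed:

     reg_save_area +  0: rdi rsi rdx rcx r8 r9   (6 * 8  = 48 bytes)
     reg_save_area + 48: xmm0 ... xmm7           (8 * 16 = 128 bytes)
*/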
9508
9509 static void
9510 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
9511 {
9512 rtx save_area, mem;
9513 alias_set_type set;
9514 int i, max;
9515
9516 /* GPR size of varargs save area. */
9517 if (cfun->va_list_gpr_size)
9518 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
9519 else
9520 ix86_varargs_gpr_size = 0;
9521
9522 /* FPR size of varargs save area. We don't need it if we don't pass
9523 anything in SSE registers. */
9524 if (TARGET_SSE && cfun->va_list_fpr_size)
9525 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
9526 else
9527 ix86_varargs_fpr_size = 0;
9528
9529 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
9530 return;
9531
9532 save_area = frame_pointer_rtx;
9533 set = get_varargs_alias_set ();
9534
9535 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9536 if (max > X86_64_REGPARM_MAX)
9537 max = X86_64_REGPARM_MAX;
9538
9539 for (i = cum->regno; i < max; i++)
9540 {
9541 mem = gen_rtx_MEM (word_mode,
9542 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
9543 MEM_NOTRAP_P (mem) = 1;
9544 set_mem_alias_set (mem, set);
9545 emit_move_insn (mem,
9546 gen_rtx_REG (word_mode,
9547 x86_64_int_parameter_registers[i]));
9548 }
9549
9550 if (ix86_varargs_fpr_size)
9551 {
9552 machine_mode smode;
9553 rtx_code_label *label;
9554 rtx test;
9555
9556 /* Now emit code to save SSE registers. The AX parameter contains the number
9557 of SSE parameter registers used to call this function, though all we
9558 actually check here is the zero/non-zero status. */
9559
9560 label = gen_label_rtx ();
9561 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
9562 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
9563 label));
9564
9565 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
9566 we used movdqa (i.e. TImode) instead? Perhaps even better would
9567 be if we could determine the real mode of the data, via a hook
9568 into pass_stdarg. Ignore all that for now. */
9569 smode = V4SFmode;
9570 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
9571 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
9572
9573 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
9574 if (max > X86_64_SSE_REGPARM_MAX)
9575 max = X86_64_SSE_REGPARM_MAX;
9576
9577 for (i = cum->sse_regno; i < max; ++i)
9578 {
9579 mem = plus_constant (Pmode, save_area,
9580 i * 16 + ix86_varargs_gpr_size);
9581 mem = gen_rtx_MEM (smode, mem);
9582 MEM_NOTRAP_P (mem) = 1;
9583 set_mem_alias_set (mem, set);
9584 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
9585
9586 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
9587 }
9588
9589 emit_label (label);
9590 }
9591 }
9592
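/* Likewise for the Microsoft x64 ABI.  The caller always allocates home
   slots for the (up to) four register parameters directly above the
   return address, so all that is needed here is to store the remaining
   unnamed integer argument registers back into their home slots;
   va_arg can then walk the argument area linearly.  (A descriptive
   note; the slot addresses come from virtual_incoming_args_rtx
   below.)  */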
9593 static void
9594 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
9595 {
9596 alias_set_type set = get_varargs_alias_set ();
9597 int i;
9598
9599 /* Reset to zero, as there might have been a sysv va_arg used
9600 before. */
9601 ix86_varargs_gpr_size = 0;
9602 ix86_varargs_fpr_size = 0;
9603
9604 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
9605 {
9606 rtx reg, mem;
9607
9608 mem = gen_rtx_MEM (Pmode,
9609 plus_constant (Pmode, virtual_incoming_args_rtx,
9610 i * UNITS_PER_WORD));
9611 MEM_NOTRAP_P (mem) = 1;
9612 set_mem_alias_set (mem, set);
9613
9614 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
9615 emit_move_insn (mem, reg);
9616 }
9617 }
9618
9619 static void
9620 ix86_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9621 tree type, int *, int no_rtl)
9622 {
9623 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9624 CUMULATIVE_ARGS next_cum;
9625 tree fntype;
9626
9627 /* This argument doesn't appear to be used anymore, which is good,
9628 because the old code here didn't suppress rtl generation. */
9629 gcc_assert (!no_rtl);
9630
9631 if (!TARGET_64BIT)
9632 return;
9633
9634 fntype = TREE_TYPE (current_function_decl);
9635
9636 /* For varargs, we do not want to skip the dummy va_dcl argument.
9637 For stdargs, we do want to skip the last named argument. */
9638 next_cum = *cum;
9639 if (stdarg_p (fntype))
9640 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9641 true);
9642
9643 if (cum->call_abi == MS_ABI)
9644 setup_incoming_varargs_ms_64 (&next_cum);
9645 else
9646 setup_incoming_varargs_64 (&next_cum);
9647 }
9648
9649 static void
9650 ix86_setup_incoming_vararg_bounds (cumulative_args_t cum_v,
9651 machine_mode mode,
9652 tree type,
9653 int *pretend_size ATTRIBUTE_UNUSED,
9654 int no_rtl)
9655 {
9656 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9657 CUMULATIVE_ARGS next_cum;
9658 tree fntype;
9659 rtx save_area;
9660 int bnd_reg, i, max;
9661
9662 gcc_assert (!no_rtl);
9663
9664 /* Do nothing if we use plain pointer to argument area. */
9665 if (!TARGET_64BIT || cum->call_abi == MS_ABI)
9666 return;
9667
9668 fntype = TREE_TYPE (current_function_decl);
9669
9670 /* For varargs, we do not want to skip the dummy va_dcl argument.
9671 For stdargs, we do want to skip the last named argument. */
9672 next_cum = *cum;
9673 if (stdarg_p (fntype))
9674 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
9675 true);
9676 save_area = frame_pointer_rtx;
9677
9678 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
9679 if (max > X86_64_REGPARM_MAX)
9680 max = X86_64_REGPARM_MAX;
9681
9682 bnd_reg = cum->bnd_regno + cum->force_bnd_pass;
9683 if (chkp_function_instrumented_p (current_function_decl))
9684 for (i = cum->regno; i < max; i++)
9685 {
9686 rtx addr = plus_constant (Pmode, save_area, i * UNITS_PER_WORD);
9687 rtx ptr = gen_rtx_REG (Pmode,
9688 x86_64_int_parameter_registers[i]);
9689 rtx bounds;
9690
9691 if (bnd_reg <= LAST_BND_REG)
9692 bounds = gen_rtx_REG (BNDmode, bnd_reg);
9693 else
9694 {
9695 rtx ldx_addr =
9696 plus_constant (Pmode, arg_pointer_rtx,
9697 (LAST_BND_REG - bnd_reg) * GET_MODE_SIZE (Pmode));
9698 bounds = gen_reg_rtx (BNDmode);
9699 emit_insn (BNDmode == BND64mode
9700 ? gen_bnd64_ldx (bounds, ldx_addr, ptr)
9701 : gen_bnd32_ldx (bounds, ldx_addr, ptr));
9702 }
9703
9704 emit_insn (BNDmode == BND64mode
9705 ? gen_bnd64_stx (addr, ptr, bounds)
9706 : gen_bnd32_stx (addr, ptr, bounds));
9707
9708 bnd_reg++;
9709 }
9710 }
9711
9712
9713 /* Checks if TYPE is of kind va_list char *. */
9714
9715 static bool
9716 is_va_list_char_pointer (tree type)
9717 {
9718 tree canonic;
9719
9720 /* For 32-bit it is always true. */
9721 if (!TARGET_64BIT)
9722 return true;
9723 canonic = ix86_canonical_va_list_type (type);
9724 return (canonic == ms_va_list_type_node
9725 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
9726 }
9727
9728 /* Implement va_start. */
9729
9730 static void
9731 ix86_va_start (tree valist, rtx nextarg)
9732 {
9733 HOST_WIDE_INT words, n_gpr, n_fpr;
9734 tree f_gpr, f_fpr, f_ovf, f_sav;
9735 tree gpr, fpr, ovf, sav, t;
9736 tree type;
9737 rtx ovf_rtx;
9738
9739 if (flag_split_stack
9740 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9741 {
9742 unsigned int scratch_regno;
9743
9744 /* When we are splitting the stack, we can't refer to the stack
9745 arguments using internal_arg_pointer, because they may be on
9746 the old stack. The split stack prologue will arrange to
9747 leave a pointer to the old stack arguments in a scratch
9748 register, which we here copy to a pseudo-register. The split
9749 stack prologue can't set the pseudo-register directly because
9750 it (the prologue) runs before any registers have been saved. */
9751
9752 scratch_regno = split_stack_prologue_scratch_regno ();
9753 if (scratch_regno != INVALID_REGNUM)
9754 {
9755 rtx reg;
9756 rtx_insn *seq;
9757
9758 reg = gen_reg_rtx (Pmode);
9759 cfun->machine->split_stack_varargs_pointer = reg;
9760
9761 start_sequence ();
9762 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
9763 seq = get_insns ();
9764 end_sequence ();
9765
9766 push_topmost_sequence ();
9767 emit_insn_after (seq, entry_of_function ());
9768 pop_topmost_sequence ();
9769 }
9770 }
9771
9772 /* Only 64bit target needs something special. */
9773 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9774 {
9775 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9776 std_expand_builtin_va_start (valist, nextarg);
9777 else
9778 {
9779 rtx va_r, next;
9780
9781 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
9782 next = expand_binop (ptr_mode, add_optab,
9783 cfun->machine->split_stack_varargs_pointer,
9784 crtl->args.arg_offset_rtx,
9785 NULL_RTX, 0, OPTAB_LIB_WIDEN);
9786 convert_move (va_r, next, 0);
9787
9788 /* Store zero bounds for va_list. */
9789 if (chkp_function_instrumented_p (current_function_decl))
9790 chkp_expand_bounds_reset_for_mem (valist,
9791 make_tree (TREE_TYPE (valist),
9792 next));
9793
9794 }
9795 return;
9796 }
9797
9798 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9799 f_fpr = DECL_CHAIN (f_gpr);
9800 f_ovf = DECL_CHAIN (f_fpr);
9801 f_sav = DECL_CHAIN (f_ovf);
9802
9803 valist = build_simple_mem_ref (valist);
9804 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
9805 /* The following should be folded into the MEM_REF offset. */
9806 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
9807 f_gpr, NULL_TREE);
9808 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
9809 f_fpr, NULL_TREE);
9810 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
9811 f_ovf, NULL_TREE);
9812 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
9813 f_sav, NULL_TREE);
9814
9815 /* Count number of gp and fp argument registers used. */
9816 words = crtl->args.info.words;
9817 n_gpr = crtl->args.info.regno;
9818 n_fpr = crtl->args.info.sse_regno;
9819
9820 if (cfun->va_list_gpr_size)
9821 {
9822 type = TREE_TYPE (gpr);
9823 t = build2 (MODIFY_EXPR, type,
9824 gpr, build_int_cst (type, n_gpr * 8));
9825 TREE_SIDE_EFFECTS (t) = 1;
9826 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9827 }
9828
9829 if (TARGET_SSE && cfun->va_list_fpr_size)
9830 {
9831 type = TREE_TYPE (fpr);
9832 t = build2 (MODIFY_EXPR, type, fpr,
9833 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
9834 TREE_SIDE_EFFECTS (t) = 1;
9835 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9836 }
9837
9838 /* Find the overflow area. */
9839 type = TREE_TYPE (ovf);
9840 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
9841 ovf_rtx = crtl->args.internal_arg_pointer;
9842 else
9843 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
9844 t = make_tree (type, ovf_rtx);
9845 if (words != 0)
9846 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
9847
9848 /* Store zero bounds for overflow area pointer. */
9849 if (chkp_function_instrumented_p (current_function_decl))
9850 chkp_expand_bounds_reset_for_mem (ovf, t);
9851
9852 t = build2 (MODIFY_EXPR, type, ovf, t);
9853 TREE_SIDE_EFFECTS (t) = 1;
9854 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9855
9856 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
9857 {
9858 /* Find the register save area.
9859 The prologue of the function saves it right above the stack frame. */
9860 type = TREE_TYPE (sav);
9861 t = make_tree (type, frame_pointer_rtx);
9862 if (!ix86_varargs_gpr_size)
9863 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
9864
9865 /* Store zero bounds for save area pointer. */
9866 if (chkp_function_instrumented_p (current_function_decl))
9867 chkp_expand_bounds_reset_for_mem (sav, t);
9868
9869 t = build2 (MODIFY_EXPR, type, sav, t);
9870 TREE_SIDE_EFFECTS (t) = 1;
9871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9872 }
9873 }
9874
9875 /* Implement va_arg. */
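/* A simplified sketch of the sequence built below for an argument that
   may live in integer registers (the real code additionally handles
   SSE registers, stack alignment, by-reference passing and
   multi-register pieces):

     if (gp_offset > 48 - needed_intregs * 8)
       goto overflow;
     addr = reg_save_area + gp_offset;
     gp_offset += needed_intregs * 8;
     goto done;
   overflow:
     addr = align (overflow_arg_area);
     overflow_arg_area = addr + rounded_size;
   done:
     result = *(type *) addr;
*/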
9876
9877 static tree
9878 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
9879 gimple_seq *post_p)
9880 {
9881 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
9882 tree f_gpr, f_fpr, f_ovf, f_sav;
9883 tree gpr, fpr, ovf, sav, t;
9884 int size, rsize;
9885 tree lab_false, lab_over = NULL_TREE;
9886 tree addr, t2;
9887 rtx container;
9888 int indirect_p = 0;
9889 tree ptrtype;
9890 machine_mode nat_mode;
9891 unsigned int arg_boundary;
9892
9893 /* Only 64bit target needs something special. */
9894 if (is_va_list_char_pointer (TREE_TYPE (valist)))
9895 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
9896
9897 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
9898 f_fpr = DECL_CHAIN (f_gpr);
9899 f_ovf = DECL_CHAIN (f_fpr);
9900 f_sav = DECL_CHAIN (f_ovf);
9901
9902 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
9903 valist, f_gpr, NULL_TREE);
9904
9905 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
9906 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
9907 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
9908
9909 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9910 if (indirect_p)
9911 type = build_pointer_type (type);
9912 size = arg_int_size_in_bytes (type);
9913 rsize = CEIL (size, UNITS_PER_WORD);
9914
9915 nat_mode = type_natural_mode (type, NULL, false);
9916 switch (nat_mode)
9917 {
9918 case E_V8SFmode:
9919 case E_V8SImode:
9920 case E_V32QImode:
9921 case E_V16HImode:
9922 case E_V4DFmode:
9923 case E_V4DImode:
9924 case E_V16SFmode:
9925 case E_V16SImode:
9926 case E_V64QImode:
9927 case E_V32HImode:
9928 case E_V8DFmode:
9929 case E_V8DImode:
9930 /* Unnamed 256-bit and 512-bit vector mode parameters are passed on the stack. */
9931 if (!TARGET_64BIT_MS_ABI)
9932 {
9933 container = NULL;
9934 break;
9935 }
9936 /* FALLTHRU */
9937
9938 default:
9939 container = construct_container (nat_mode, TYPE_MODE (type),
9940 type, 0, X86_64_REGPARM_MAX,
9941 X86_64_SSE_REGPARM_MAX, intreg,
9942 0);
9943 break;
9944 }
9945
9946 /* Pull the value out of the saved registers. */
9947
9948 addr = create_tmp_var (ptr_type_node, "addr");
9949
9950 if (container)
9951 {
9952 int needed_intregs, needed_sseregs;
9953 bool need_temp;
9954 tree int_addr, sse_addr;
9955
9956 lab_false = create_artificial_label (UNKNOWN_LOCATION);
9957 lab_over = create_artificial_label (UNKNOWN_LOCATION);
9958
9959 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
9960
9961 need_temp = (!REG_P (container)
9962 && ((needed_intregs && TYPE_ALIGN (type) > 64)
9963 || TYPE_ALIGN (type) > 128));
9964
9965 /* If we are passing a structure, verify that it is a consecutive block
9966 in the register save area. If not, we need to do moves. */
9967 if (!need_temp && !REG_P (container))
9968 {
9969 /* Verify that all registers are strictly consecutive. */
9970 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
9971 {
9972 int i;
9973
9974 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9975 {
9976 rtx slot = XVECEXP (container, 0, i);
9977 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
9978 || INTVAL (XEXP (slot, 1)) != i * 16)
9979 need_temp = true;
9980 }
9981 }
9982 else
9983 {
9984 int i;
9985
9986 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
9987 {
9988 rtx slot = XVECEXP (container, 0, i);
9989 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
9990 || INTVAL (XEXP (slot, 1)) != i * 8)
9991 need_temp = true;
9992 }
9993 }
9994 }
9995 if (!need_temp)
9996 {
9997 int_addr = addr;
9998 sse_addr = addr;
9999 }
10000 else
10001 {
10002 int_addr = create_tmp_var (ptr_type_node, "int_addr");
10003 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
10004 }
10005
10006 /* First ensure that we fit completely in registers. */
10007 if (needed_intregs)
10008 {
10009 t = build_int_cst (TREE_TYPE (gpr),
10010 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
10011 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
10012 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10013 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10014 gimplify_and_add (t, pre_p);
10015 }
10016 if (needed_sseregs)
10017 {
10018 t = build_int_cst (TREE_TYPE (fpr),
10019 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
10020 + X86_64_REGPARM_MAX * 8);
10021 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
10022 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
10023 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
10024 gimplify_and_add (t, pre_p);
10025 }
10026
10027 /* Compute index to start of area used for integer regs. */
10028 if (needed_intregs)
10029 {
10030 /* int_addr = gpr + sav; */
10031 t = fold_build_pointer_plus (sav, gpr);
10032 gimplify_assign (int_addr, t, pre_p);
10033 }
10034 if (needed_sseregs)
10035 {
10036 /* sse_addr = fpr + sav; */
10037 t = fold_build_pointer_plus (sav, fpr);
10038 gimplify_assign (sse_addr, t, pre_p);
10039 }
10040 if (need_temp)
10041 {
10042 int i, prev_size = 0;
10043 tree temp = create_tmp_var (type, "va_arg_tmp");
10044
10045 /* addr = &temp; */
10046 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
10047 gimplify_assign (addr, t, pre_p);
10048
10049 for (i = 0; i < XVECLEN (container, 0); i++)
10050 {
10051 rtx slot = XVECEXP (container, 0, i);
10052 rtx reg = XEXP (slot, 0);
10053 machine_mode mode = GET_MODE (reg);
10054 tree piece_type;
10055 tree addr_type;
10056 tree daddr_type;
10057 tree src_addr, src;
10058 int src_offset;
10059 tree dest_addr, dest;
10060 int cur_size = GET_MODE_SIZE (mode);
10061
10062 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
10063 prev_size = INTVAL (XEXP (slot, 1));
10064 if (prev_size + cur_size > size)
10065 {
10066 cur_size = size - prev_size;
10067 unsigned int nbits = cur_size * BITS_PER_UNIT;
10068 if (!int_mode_for_size (nbits, 1).exists (&mode))
10069 mode = QImode;
10070 }
10071 piece_type = lang_hooks.types.type_for_mode (mode, 1);
10072 if (mode == GET_MODE (reg))
10073 addr_type = build_pointer_type (piece_type);
10074 else
10075 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10076 true);
10077 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
10078 true);
10079
10080 if (SSE_REGNO_P (REGNO (reg)))
10081 {
10082 src_addr = sse_addr;
10083 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
10084 }
10085 else
10086 {
10087 src_addr = int_addr;
10088 src_offset = REGNO (reg) * 8;
10089 }
10090 src_addr = fold_convert (addr_type, src_addr);
10091 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
10092
10093 dest_addr = fold_convert (daddr_type, addr);
10094 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
10095 if (cur_size == GET_MODE_SIZE (mode))
10096 {
10097 src = build_va_arg_indirect_ref (src_addr);
10098 dest = build_va_arg_indirect_ref (dest_addr);
10099
10100 gimplify_assign (dest, src, pre_p);
10101 }
10102 else
10103 {
10104 tree copy
10105 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
10106 3, dest_addr, src_addr,
10107 size_int (cur_size));
10108 gimplify_and_add (copy, pre_p);
10109 }
10110 prev_size += cur_size;
10111 }
10112 }
10113
10114 if (needed_intregs)
10115 {
10116 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
10117 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
10118 gimplify_assign (gpr, t, pre_p);
10119 }
10120
10121 if (needed_sseregs)
10122 {
10123 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
10124 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
10125 gimplify_assign (unshare_expr (fpr), t, pre_p);
10126 }
10127
10128 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
10129
10130 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
10131 }
10132
10133 /* ... otherwise out of the overflow area. */
10134
10135 /* When the caller aligns a parameter on the stack, an alignment
10136 beyond MAX_SUPPORTED_STACK_ALIGNMENT is clamped to
10137 MAX_SUPPORTED_STACK_ALIGNMENT.  Match the callee here with the
10138 caller. */
10139 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
10140 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
10141 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
10142
10143 /* Care for on-stack alignment if needed. */
10144 if (arg_boundary <= 64 || size == 0)
10145 t = ovf;
10146 else
10147 {
10148 HOST_WIDE_INT align = arg_boundary / 8;
10149 t = fold_build_pointer_plus_hwi (ovf, align - 1);
10150 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10151 build_int_cst (TREE_TYPE (t), -align));
10152 }
10153
10154 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
10155 gimplify_assign (addr, t, pre_p);
10156
10157 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
10158 gimplify_assign (unshare_expr (ovf), t, pre_p);
10159
10160 if (container)
10161 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
10162
10163 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
10164 addr = fold_convert (ptrtype, addr);
10165
10166 if (indirect_p)
10167 addr = build_va_arg_indirect_ref (addr);
10168 return build_va_arg_indirect_ref (addr);
10169 }
10170 \f
10171 /* Return true if OPNUM's MEM should be matched
10172 in movabs* patterns. */
10173
10174 bool
10175 ix86_check_movabs (rtx insn, int opnum)
10176 {
10177 rtx set, mem;
10178
10179 set = PATTERN (insn);
10180 if (GET_CODE (set) == PARALLEL)
10181 set = XVECEXP (set, 0, 0);
10182 gcc_assert (GET_CODE (set) == SET);
10183 mem = XEXP (set, opnum);
10184 while (SUBREG_P (mem))
10185 mem = SUBREG_REG (mem);
10186 gcc_assert (MEM_P (mem));
10187 return volatile_ok || !MEM_VOLATILE_P (mem);
10188 }
10189
10190 /* Return false if INSN contains a MEM with a non-default address space. */
10191 bool
10192 ix86_check_no_addr_space (rtx insn)
10193 {
10194 subrtx_var_iterator::array_type array;
10195 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (insn), ALL)
10196 {
10197 rtx x = *iter;
10198 if (MEM_P (x) && !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (x)))
10199 return false;
10200 }
10201 return true;
10202 }
10203 \f
10204 /* Initialize the table of extra 80387 mathematical constants. */
10205
10206 static void
10207 init_ext_80387_constants (void)
10208 {
10209 static const char * cst[5] =
10210 {
10211 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
10212 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
10213 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
10214 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
10215 "3.1415926535897932385128089594061862044", /* 4: fldpi */
10216 };
10217 int i;
10218
10219 for (i = 0; i < 5; i++)
10220 {
10221 real_from_string (&ext_80387_constants_table[i], cst[i]);
10222 /* Ensure each constant is rounded to XFmode precision. */
10223 real_convert (&ext_80387_constants_table[i],
10224 XFmode, &ext_80387_constants_table[i]);
10225 }
10226
10227 ext_80387_constants_init = 1;
10228 }
10229
10230 /* Return non-zero if the constant is something that
10231 can be loaded with a special instruction. */
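/* More precisely (derived from the cases handled below): the return
   value is -1 if X is not an x87 constant at all, 0 if no special
   instruction applies, 1 for fldz, 2 for fld1, 3 through 7 for the
   fldlg2/fldln2/fldl2e/fldl2t/fldpi table entries, and 8 or 9 for
   -0.0 and -1.0, which are loaded as fldz;fchs and fld1;fchs.  */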
10232
10233 int
10234 standard_80387_constant_p (rtx x)
10235 {
10236 machine_mode mode = GET_MODE (x);
10237
10238 const REAL_VALUE_TYPE *r;
10239
10240 if (!(CONST_DOUBLE_P (x) && X87_FLOAT_MODE_P (mode)))
10241 return -1;
10242
10243 if (x == CONST0_RTX (mode))
10244 return 1;
10245 if (x == CONST1_RTX (mode))
10246 return 2;
10247
10248 r = CONST_DOUBLE_REAL_VALUE (x);
10249
10250 /* For XFmode constants, try to find a special 80387 instruction when
10251 optimizing for size or on those CPUs that benefit from them. */
10252 if (mode == XFmode
10253 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
10254 {
10255 int i;
10256
10257 if (! ext_80387_constants_init)
10258 init_ext_80387_constants ();
10259
10260 for (i = 0; i < 5; i++)
10261 if (real_identical (r, &ext_80387_constants_table[i]))
10262 return i + 3;
10263 }
10264
10265 /* A load of the constant -0.0 or -1.0 will be split into an
10266 fldz;fchs or fld1;fchs sequence. */
10267 if (real_isnegzero (r))
10268 return 8;
10269 if (real_identical (r, &dconstm1))
10270 return 9;
10271
10272 return 0;
10273 }
10274
10275 /* Return the opcode of the special instruction to be used to load
10276 the constant X. */
10277
10278 const char *
10279 standard_80387_constant_opcode (rtx x)
10280 {
10281 switch (standard_80387_constant_p (x))
10282 {
10283 case 1:
10284 return "fldz";
10285 case 2:
10286 return "fld1";
10287 case 3:
10288 return "fldlg2";
10289 case 4:
10290 return "fldln2";
10291 case 5:
10292 return "fldl2e";
10293 case 6:
10294 return "fldl2t";
10295 case 7:
10296 return "fldpi";
10297 case 8:
10298 case 9:
10299 return "#";
10300 default:
10301 gcc_unreachable ();
10302 }
10303 }
10304
10305 /* Return the CONST_DOUBLE representing the 80387 constant that is
10306 loaded by the specified special instruction. The argument IDX
10307 matches the return value from standard_80387_constant_p. */
10308
10309 rtx
10310 standard_80387_constant_rtx (int idx)
10311 {
10312 int i;
10313
10314 if (! ext_80387_constants_init)
10315 init_ext_80387_constants ();
10316
10317 switch (idx)
10318 {
10319 case 3:
10320 case 4:
10321 case 5:
10322 case 6:
10323 case 7:
10324 i = idx - 3;
10325 break;
10326
10327 default:
10328 gcc_unreachable ();
10329 }
10330
10331 return const_double_from_real_value (ext_80387_constants_table[i],
10332 XFmode);
10333 }
10334
10335 /* Return 1 if X is all zero bits and 2 if X is all one bits in a
10336 supported SSE/AVX vector mode; return 0 otherwise. */
10337
10338 int
10339 standard_sse_constant_p (rtx x, machine_mode pred_mode)
10340 {
10341 machine_mode mode;
10342
10343 if (!TARGET_SSE)
10344 return 0;
10345
10346 mode = GET_MODE (x);
10347
10348 if (x == const0_rtx || const0_operand (x, mode))
10349 return 1;
10350
10351 if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10352 {
10353 /* VOIDmode integer constant, get mode from the predicate. */
10354 if (mode == VOIDmode)
10355 mode = pred_mode;
10356
10357 switch (GET_MODE_SIZE (mode))
10358 {
10359 case 64:
10360 if (TARGET_AVX512F)
10361 return 2;
10362 break;
10363 case 32:
10364 if (TARGET_AVX2)
10365 return 2;
10366 break;
10367 case 16:
10368 if (TARGET_SSE2)
10369 return 2;
10370 break;
10371 case 0:
10372 /* VOIDmode */
10373 gcc_unreachable ();
10374 default:
10375 break;
10376 }
10377 }
10378
10379 return 0;
10380 }
10381
10382 /* Return the opcode of the special instruction to be used to load
10383 the constant X. */
10384
10385 const char *
10386 standard_sse_constant_opcode (rtx_insn *insn, rtx x)
10387 {
10388 machine_mode mode;
10389
10390 gcc_assert (TARGET_SSE);
10391
10392 mode = GET_MODE (x);
10393
10394 if (x == const0_rtx || const0_operand (x, mode))
10395 {
10396 switch (get_attr_mode (insn))
10397 {
10398 case MODE_XI:
10399 return "vpxord\t%g0, %g0, %g0";
10400 case MODE_OI:
10401 return (TARGET_AVX512VL
10402 ? "vpxord\t%x0, %x0, %x0"
10403 : "vpxor\t%x0, %x0, %x0");
10404 case MODE_TI:
10405 return (TARGET_AVX512VL
10406 ? "vpxord\t%t0, %t0, %t0"
10407 : "%vpxor\t%0, %d0");
10408
10409 case MODE_V8DF:
10410 return (TARGET_AVX512DQ
10411 ? "vxorpd\t%g0, %g0, %g0"
10412 : "vpxorq\t%g0, %g0, %g0");
10413 case MODE_V4DF:
10414 return "vxorpd\t%x0, %x0, %x0";
10415 case MODE_V2DF:
10416 return "%vxorpd\t%0, %d0";
10417
10418 case MODE_V16SF:
10419 return (TARGET_AVX512DQ
10420 ? "vxorps\t%g0, %g0, %g0"
10421 : "vpxord\t%g0, %g0, %g0");
10422 case MODE_V8SF:
10423 return "vxorps\t%x0, %x0, %x0";
10424 case MODE_V4SF:
10425 return "%vxorps\t%0, %d0";
10426
10427 default:
10428 gcc_unreachable ();
10429 }
10430 }
10431 else if (x == constm1_rtx || vector_all_ones_operand (x, mode))
10432 {
10433 enum attr_mode insn_mode = get_attr_mode (insn);
10434
10435 switch (insn_mode)
10436 {
10437 case MODE_XI:
10438 case MODE_V8DF:
10439 case MODE_V16SF:
10440 gcc_assert (TARGET_AVX512F);
10441 return "vpternlogd\t{$0xFF, %g0, %g0, %g0|%g0, %g0, %g0, 0xFF}";
10442
10443 case MODE_OI:
10444 case MODE_V4DF:
10445 case MODE_V8SF:
10446 gcc_assert (TARGET_AVX2);
10447 /* FALLTHRU */
10448 case MODE_TI:
10449 case MODE_V2DF:
10450 case MODE_V4SF:
10451 gcc_assert (TARGET_SSE2);
10452 return (TARGET_AVX
10453 ? "vpcmpeqd\t%0, %0, %0"
10454 : "pcmpeqd\t%0, %0");
10455
10456 default:
10457 gcc_unreachable ();
10458 }
10459 }
10460
10461 gcc_unreachable ();
10462 }
10463
10464 /* Returns true if INSN can be transformed from a memory load
10465 to a supported FP constant load. */
10466
10467 bool
10468 ix86_standard_x87sse_constant_load_p (const rtx_insn *insn, rtx dst)
10469 {
10470 rtx src = find_constant_src (insn);
10471
10472 gcc_assert (REG_P (dst));
10473
10474 if (src == NULL
10475 || (SSE_REGNO_P (REGNO (dst))
10476 && standard_sse_constant_p (src, GET_MODE (dst)) != 1)
10477 || (STACK_REGNO_P (REGNO (dst))
10478 && standard_80387_constant_p (src) < 1))
10479 return false;
10480
10481 return true;
10482 }
10483
10484 /* Returns true if OP contains a symbol reference. */
10485
10486 bool
10487 symbolic_reference_mentioned_p (rtx op)
10488 {
10489 const char *fmt;
10490 int i;
10491
10492 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
10493 return true;
10494
10495 fmt = GET_RTX_FORMAT (GET_CODE (op));
10496 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
10497 {
10498 if (fmt[i] == 'E')
10499 {
10500 int j;
10501
10502 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
10503 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
10504 return true;
10505 }
10506
10507 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
10508 return true;
10509 }
10510
10511 return false;
10512 }
10513
10514 /* Return true if it is appropriate to emit `ret' instructions in the
10515 body of a function. Do this only if the epilogue is simple, needing a
10516 couple of insns. Prior to reloading, we can't tell how many registers
10517 must be saved, so return false then. Return false if there is no frame
10518 marker to de-allocate. */
10519
10520 bool
10521 ix86_can_use_return_insn_p (void)
10522 {
10523 if (ix86_function_naked (current_function_decl))
10524 return false;
10525
10526 /* Don't use `ret' instruction in interrupt handler. */
10527 if (! reload_completed
10528 || frame_pointer_needed
10529 || cfun->machine->func_type != TYPE_NORMAL)
10530 return 0;
10531
10532 /* Don't allow more than 32k pop, since that's all we can do
10533 with one instruction. */
10534 if (crtl->args.pops_args && crtl->args.size >= 32768)
10535 return 0;
10536
10537 struct ix86_frame &frame = cfun->machine->frame;
10538 return (frame.stack_pointer_offset == UNITS_PER_WORD
10539 && (frame.nregs + frame.nsseregs) == 0);
10540 }
10541 \f
10542 /* Value should be nonzero if functions must have frame pointers.
10543 Zero means the frame pointer need not be set up (and parms may
10544 be accessed via the stack pointer) in functions that seem suitable. */
10545
10546 static bool
10547 ix86_frame_pointer_required (void)
10548 {
10549 /* If we accessed previous frames, then the generated code expects
10550 to be able to access the saved ebp value in our frame. */
10551 if (cfun->machine->accesses_prev_frame)
10552 return true;
10553
10554 /* Several x86 OSes need a frame pointer for other reasons,
10555 usually pertaining to setjmp. */
10556 if (SUBTARGET_FRAME_POINTER_REQUIRED)
10557 return true;
10558
10559 /* For older 32-bit runtimes, setjmp requires a valid frame pointer. */
10560 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
10561 return true;
10562
10563 /* With Win64 SEH, very large frames need a frame pointer, since the
10564 maximum stack allocation is 4GB. */
10565 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
10566 return true;
10567
10568 /* SSE saves require a frame pointer when the stack is misaligned. */
10569 if (TARGET_64BIT_MS_ABI && ix86_incoming_stack_boundary < 128)
10570 return true;
10571
10572 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
10573 turns off the frame pointer by default. Turn it back on now if
10574 we've not got a leaf function. */
10575 if (TARGET_OMIT_LEAF_FRAME_POINTER
10576 && (!crtl->is_leaf
10577 || ix86_current_function_calls_tls_descriptor))
10578 return true;
10579
10580 if (crtl->profile && !flag_fentry)
10581 return true;
10582
10583 return false;
10584 }
10585
10586 /* Record that the current function accesses previous call frames. */
10587
10588 void
10589 ix86_setup_frame_addresses (void)
10590 {
10591 cfun->machine->accesses_prev_frame = 1;
10592 }
10593 \f
10594 #ifndef USE_HIDDEN_LINKONCE
10595 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
10596 # define USE_HIDDEN_LINKONCE 1
10597 # else
10598 # define USE_HIDDEN_LINKONCE 0
10599 # endif
10600 #endif
10601
10602 static int pic_labels_used;
10603
10604 /* Fills in the label name that should be used for a pc thunk for
10605 the given register. */
10606
10607 static void
10608 get_pc_thunk_name (char name[32], unsigned int regno)
10609 {
10610 gcc_assert (!TARGET_64BIT);
10611
10612 if (USE_HIDDEN_LINKONCE)
10613 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
10614 else
10615 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
10616 }
10617
10618
10619 /* This function generates code for -fpic that loads %ebx with
10620 the return address of the caller and then returns. */
10621
10622 static void
10623 ix86_code_end (void)
10624 {
10625 rtx xops[2];
10626 int regno;
10627
10628 for (regno = FIRST_INT_REG; regno <= LAST_INT_REG; regno++)
10629 {
10630 char name[32];
10631 tree decl;
10632
10633 if (!(pic_labels_used & (1 << regno)))
10634 continue;
10635
10636 get_pc_thunk_name (name, regno);
10637
10638 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
10639 get_identifier (name),
10640 build_function_type_list (void_type_node, NULL_TREE));
10641 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
10642 NULL_TREE, void_type_node);
10643 TREE_PUBLIC (decl) = 1;
10644 TREE_STATIC (decl) = 1;
10645 DECL_IGNORED_P (decl) = 1;
10646
10647 #if TARGET_MACHO
10648 if (TARGET_MACHO)
10649 {
10650 switch_to_section (darwin_sections[picbase_thunk_section]);
10651 fputs ("\t.weak_definition\t", asm_out_file);
10652 assemble_name (asm_out_file, name);
10653 fputs ("\n\t.private_extern\t", asm_out_file);
10654 assemble_name (asm_out_file, name);
10655 putc ('\n', asm_out_file);
10656 ASM_OUTPUT_LABEL (asm_out_file, name);
10657 DECL_WEAK (decl) = 1;
10658 }
10659 else
10660 #endif
10661 if (USE_HIDDEN_LINKONCE)
10662 {
10663 cgraph_node::create (decl)->set_comdat_group (DECL_ASSEMBLER_NAME (decl));
10664
10665 targetm.asm_out.unique_section (decl, 0);
10666 switch_to_section (get_named_section (decl, NULL, 0));
10667
10668 targetm.asm_out.globalize_label (asm_out_file, name);
10669 fputs ("\t.hidden\t", asm_out_file);
10670 assemble_name (asm_out_file, name);
10671 putc ('\n', asm_out_file);
10672 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
10673 }
10674 else
10675 {
10676 switch_to_section (text_section);
10677 ASM_OUTPUT_LABEL (asm_out_file, name);
10678 }
10679
10680 DECL_INITIAL (decl) = make_node (BLOCK);
10681 current_function_decl = decl;
10682 allocate_struct_function (decl, false);
10683 init_function_start (decl);
10684 /* We're about to hide the function body from callees of final_* by
10685 emitting it directly; tell them we're a thunk, if they care. */
10686 cfun->is_thunk = true;
10687 first_function_block_is_cold = false;
10688 /* Make sure unwind info is emitted for the thunk if needed. */
10689 final_start_function (emit_barrier (), asm_out_file, 1);
10690
10691 /* Pad stack IP move with 4 instructions (two NOPs count
10692 as one instruction). */
10693 if (TARGET_PAD_SHORT_FUNCTION)
10694 {
10695 int i = 8;
10696
10697 while (i--)
10698 fputs ("\tnop\n", asm_out_file);
10699 }
10700
10701 xops[0] = gen_rtx_REG (Pmode, regno);
10702 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
10703 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
10704 output_asm_insn ("%!ret", NULL);
10705 final_end_function ();
10706 init_insn_lengths ();
10707 free_after_compilation (cfun);
10708 set_cfun (NULL);
10709 current_function_decl = NULL;
10710 }
10711
10712 if (flag_split_stack)
10713 file_end_indicate_split_stack ();
10714 }
10715
10716 /* Emit code for the SET_GOT patterns. */
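/* For the common -fpic case on ia32 the expansion below boils down to
   roughly (illustrative; the VxWorks RTP and Mach-O paths differ):

     call  __x86.get_pc_thunk.bx
     addl  $_GLOBAL_OFFSET_TABLE_, %ebx
*/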
10717
10718 const char *
10719 output_set_got (rtx dest, rtx label)
10720 {
10721 rtx xops[3];
10722
10723 xops[0] = dest;
10724
10725 if (TARGET_VXWORKS_RTP && flag_pic)
10726 {
10727 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
10728 xops[2] = gen_rtx_MEM (Pmode,
10729 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
10730 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
10731
10732 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
10733 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
10734 an unadorned address. */
10735 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
10736 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
10737 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
10738 return "";
10739 }
10740
10741 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
10742
10743 if (flag_pic)
10744 {
10745 char name[32];
10746 get_pc_thunk_name (name, REGNO (dest));
10747 pic_labels_used |= 1 << REGNO (dest);
10748
10749 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
10750 xops[2] = gen_rtx_MEM (QImode, xops[2]);
10751 output_asm_insn ("%!call\t%X2", xops);
10752
10753 #if TARGET_MACHO
10754 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
10755 This is what will be referenced by the Mach-O PIC subsystem. */
10756 if (machopic_should_output_picbase_label () || !label)
10757 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
10758
10759 /* When we are restoring the pic base at the site of a nonlocal label,
10760 and we decided to emit the pic base above, we will still output a
10761 local label used for calculating the correction offset (even though
10762 the offset will be 0 in that case). */
10763 if (label)
10764 targetm.asm_out.internal_label (asm_out_file, "L",
10765 CODE_LABEL_NUMBER (label));
10766 #endif
10767 }
10768 else
10769 {
10770 if (TARGET_MACHO)
10771 /* We don't need a pic base, we're not producing pic. */
10772 gcc_unreachable ();
10773
10774 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
10775 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
10776 targetm.asm_out.internal_label (asm_out_file, "L",
10777 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
10778 }
10779
10780 if (!TARGET_MACHO)
10781 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
10782
10783 return "";
10784 }
10785
10786 /* Generate a "push" pattern for input ARG. */
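/* Schematically, the RTL built by gen_push is

     (set (mem:word_mode (pre_dec (reg sp))) arg)

   and gen_pop below builds the matching post_inc load.  Both widen ARG
   to word_mode first if it is a narrower register.  */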
10787
10788 static rtx
10789 gen_push (rtx arg)
10790 {
10791 struct machine_function *m = cfun->machine;
10792
10793 if (m->fs.cfa_reg == stack_pointer_rtx)
10794 m->fs.cfa_offset += UNITS_PER_WORD;
10795 m->fs.sp_offset += UNITS_PER_WORD;
10796
10797 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10798 arg = gen_rtx_REG (word_mode, REGNO (arg));
10799
10800 return gen_rtx_SET (gen_rtx_MEM (word_mode,
10801 gen_rtx_PRE_DEC (Pmode,
10802 stack_pointer_rtx)),
10803 arg);
10804 }
10805
10806 /* Generate a "pop" pattern for input ARG. */
10807
10808 static rtx
10809 gen_pop (rtx arg)
10810 {
10811 if (REG_P (arg) && GET_MODE (arg) != word_mode)
10812 arg = gen_rtx_REG (word_mode, REGNO (arg));
10813
10814 return gen_rtx_SET (arg,
10815 gen_rtx_MEM (word_mode,
10816 gen_rtx_POST_INC (Pmode,
10817 stack_pointer_rtx)));
10818 }
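/* Note, for reference: unlike gen_push above, gen_pop does not touch the
   machine_function frame-state bookkeeping; callers such as
   ix86_emit_restore_reg_using_pop (or release_scratch_register_on_entry
   below) are expected to adjust m->fs.sp_offset and the CFA information
   themselves after emitting the pop.  */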
10819
10820 /* Return >= 0 if there is an unused call-clobbered register available
10821 for the entire function. */
10822
10823 static unsigned int
10824 ix86_select_alt_pic_regnum (void)
10825 {
10826 if (ix86_use_pseudo_pic_reg ())
10827 return INVALID_REGNUM;
10828
10829 if (crtl->is_leaf
10830 && !crtl->profile
10831 && !ix86_current_function_calls_tls_descriptor)
10832 {
10833 int i, drap;
10834 /* Can't use the same register for both PIC and DRAP. */
10835 if (crtl->drap_reg)
10836 drap = REGNO (crtl->drap_reg);
10837 else
10838 drap = -1;
10839 for (i = 2; i >= 0; --i)
10840 if (i != drap && !df_regs_ever_live_p (i))
10841 return i;
10842 }
10843
10844 return INVALID_REGNUM;
10845 }
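/* For reference: on this port hard registers 0, 1 and 2 are %eax, %edx and
   %ecx, so the loop above effectively offers %ecx, then %edx, then %eax as
   an alternative PIC register in a leaf function that never uses them and
   does not need one of them as the DRAP register.  */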
10846
10847 /* Return true if REGNO is used by the epilogue. */
10848
10849 bool
10850 ix86_epilogue_uses (int regno)
10851 {
10852 /* If there are no caller-saved registers, we preserve all registers,
10853 except for MMX and x87 registers which aren't supported when saving
10854 and restoring registers. Don't explicitly save SP register since
10855 it is always preserved. */
10856 return (epilogue_completed
10857 && cfun->machine->no_caller_saved_registers
10858 && !fixed_regs[regno]
10859 && !STACK_REGNO_P (regno)
10860 && !MMX_REGNO_P (regno));
10861 }
10862
10863 /* Return nonzero if register REGNO can be used as a scratch register
10864 in peephole2. */
10865
10866 static bool
10867 ix86_hard_regno_scratch_ok (unsigned int regno)
10868 {
10869 /* If there are no caller-saved registers, we can't use any register
10870 as a scratch register after epilogue and use REGNO as scratch
10871 register only if it has been used before to avoid saving and
10872 restoring it. */
10873 return (!cfun->machine->no_caller_saved_registers
10874 || (!epilogue_completed
10875 && df_regs_ever_live_p (regno)));
10876 }
10877
10878 /* Return true if register class CL should be an additional allocno
10879 class. */
10880
10881 static bool
10882 ix86_additional_allocno_class_p (reg_class_t cl)
10883 {
10884 return cl == MOD4_SSE_REGS;
10885 }
10886
10887 /* Return TRUE if we need to save REGNO. */
10888
10889 static bool
10890 ix86_save_reg (unsigned int regno, bool maybe_eh_return, bool ignore_outlined)
10891 {
10892 /* If there are no caller-saved registers, we preserve all registers,
10893 except for MMX and x87 registers which aren't supported when saving
10894 and restoring registers. Don't explicitly save SP register since
10895 it is always preserved. */
10896 if (cfun->machine->no_caller_saved_registers)
10897 {
10898 /* Don't preserve registers used for function return value. */
10899 rtx reg = crtl->return_rtx;
10900 if (reg)
10901 {
10902 unsigned int i = REGNO (reg);
10903 unsigned int nregs = REG_NREGS (reg);
10904 while (nregs-- > 0)
10905 if ((i + nregs) == regno)
10906 return false;
10907
10908 reg = crtl->return_bnd;
10909 if (reg)
10910 {
10911 i = REGNO (reg);
10912 nregs = REG_NREGS (reg);
10913 while (nregs-- > 0)
10914 if ((i + nregs) == regno)
10915 return false;
10916 }
10917 }
10918
10919 return (df_regs_ever_live_p (regno)
10920 && !fixed_regs[regno]
10921 && !STACK_REGNO_P (regno)
10922 && !MMX_REGNO_P (regno)
10923 && (regno != HARD_FRAME_POINTER_REGNUM
10924 || !frame_pointer_needed));
10925 }
10926
10927 if (regno == REAL_PIC_OFFSET_TABLE_REGNUM
10928 && pic_offset_table_rtx)
10929 {
10930 if (ix86_use_pseudo_pic_reg ())
10931 {
10932 /* REAL_PIC_OFFSET_TABLE_REGNUM used by call to
10933 _mcount in prologue. */
10934 if (!TARGET_64BIT && flag_pic && crtl->profile)
10935 return true;
10936 }
10937 else if (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10938 || crtl->profile
10939 || crtl->calls_eh_return
10940 || crtl->uses_const_pool
10941 || cfun->has_nonlocal_label)
10942 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
10943 }
10944
10945 if (crtl->calls_eh_return && maybe_eh_return)
10946 {
10947 unsigned i;
10948 for (i = 0; ; i++)
10949 {
10950 unsigned test = EH_RETURN_DATA_REGNO (i);
10951 if (test == INVALID_REGNUM)
10952 break;
10953 if (test == regno)
10954 return true;
10955 }
10956 }
10957
10958 if (ignore_outlined && cfun->machine->call_ms2sysv)
10959 {
10960 unsigned count = cfun->machine->call_ms2sysv_extra_regs
10961 + xlogue_layout::MIN_REGS;
10962 if (xlogue_layout::is_stub_managed_reg (regno, count))
10963 return false;
10964 }
10965
10966 if (crtl->drap_reg
10967 && regno == REGNO (crtl->drap_reg)
10968 && !cfun->machine->no_drap_save_restore)
10969 return true;
10970
10971 return (df_regs_ever_live_p (regno)
10972 && !call_used_regs[regno]
10973 && !fixed_regs[regno]
10974 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
10975 }
10976
10977 /* Return the number of saved general purpose registers. */
10978
10979 static int
10980 ix86_nsaved_regs (void)
10981 {
10982 int nregs = 0;
10983 int regno;
10984
10985 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10986 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
10987 nregs ++;
10988 return nregs;
10989 }
10990
10991 /* Return number of saved SSE registers. */
10992
10993 static int
10994 ix86_nsaved_sseregs (void)
10995 {
10996 int nregs = 0;
10997 int regno;
10998
10999 if (!TARGET_64BIT_MS_ABI)
11000 return 0;
11001 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11002 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11003 nregs ++;
11004 return nregs;
11005 }
11006
11007 /* Given FROM and TO register numbers, say whether this elimination is
11008 allowed. If stack alignment is needed, we can only replace argument
11009 pointer with hard frame pointer, or replace frame pointer with stack
11010 pointer. Otherwise, frame pointer elimination is automatically
11011 handled and all other eliminations are valid. */
11012
11013 static bool
11014 ix86_can_eliminate (const int from, const int to)
11015 {
11016 if (stack_realign_fp)
11017 return ((from == ARG_POINTER_REGNUM
11018 && to == HARD_FRAME_POINTER_REGNUM)
11019 || (from == FRAME_POINTER_REGNUM
11020 && to == STACK_POINTER_REGNUM));
11021 else
11022 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
11023 }
11024
11025 /* Return the offset between two registers, one to be eliminated, and the other
11026 its replacement, at the start of a routine. */
11027
11028 HOST_WIDE_INT
11029 ix86_initial_elimination_offset (int from, int to)
11030 {
11031 struct ix86_frame &frame = cfun->machine->frame;
11032
11033 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
11034 return frame.hard_frame_pointer_offset;
11035 else if (from == FRAME_POINTER_REGNUM
11036 && to == HARD_FRAME_POINTER_REGNUM)
11037 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
11038 else
11039 {
11040 gcc_assert (to == STACK_POINTER_REGNUM);
11041
11042 if (from == ARG_POINTER_REGNUM)
11043 return frame.stack_pointer_offset;
11044
11045 gcc_assert (from == FRAME_POINTER_REGNUM);
11046 return frame.stack_pointer_offset - frame.frame_pointer_offset;
11047 }
11048 }
11049
11050 /* In a dynamically-aligned function, we can't know the offset from
11051 stack pointer to frame pointer, so we must ensure that setjmp
11052 eliminates fp against the hard fp (%ebp) rather than trying to
11053 index from %esp up to the top of the frame across a gap that is
11054 of unknown (at compile-time) size. */
11055 static rtx
11056 ix86_builtin_setjmp_frame_value (void)
11057 {
11058 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
11059 }
11060
11061 /* Emits a warning for unsupported msabi to sysv pro/epilogues. */
11062 static void warn_once_call_ms2sysv_xlogues (const char *feature)
11063 {
11064 static bool warned_once = false;
11065 if (!warned_once)
11066 {
11067 warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
11068 feature);
11069 warned_once = true;
11070 }
11071 }
11072
11073 /* When using -fsplit-stack, the allocation routines set a field in
11074 the TCB to the bottom of the stack plus this much space, measured
11075 in bytes. */
11076
11077 #define SPLIT_STACK_AVAILABLE 256
11078
11079 /* Fill structure ix86_frame about frame of currently computed function. */
11080
11081 static void
11082 ix86_compute_frame_layout (void)
11083 {
11084 struct ix86_frame *frame = &cfun->machine->frame;
11085 struct machine_function *m = cfun->machine;
11086 unsigned HOST_WIDE_INT stack_alignment_needed;
11087 HOST_WIDE_INT offset;
11088 unsigned HOST_WIDE_INT preferred_alignment;
11089 HOST_WIDE_INT size = get_frame_size ();
11090 HOST_WIDE_INT to_allocate;
11091
11092 /* m->call_ms2sysv is initially enabled in ix86_expand_call for all 64-bit
11093 * ms_abi functions that call a sysv function. We now need to prune away
11094 * cases where it should be disabled. */
11095 if (TARGET_64BIT && m->call_ms2sysv)
11096 {
11097 gcc_assert (TARGET_64BIT_MS_ABI);
11098 gcc_assert (TARGET_CALL_MS2SYSV_XLOGUES);
11099 gcc_assert (!TARGET_SEH);
11100 gcc_assert (TARGET_SSE);
11101 gcc_assert (!ix86_using_red_zone ());
11102
11103 if (crtl->calls_eh_return)
11104 {
11105 gcc_assert (!reload_completed);
11106 m->call_ms2sysv = false;
11107 warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
11108 }
11109
11110 else if (ix86_static_chain_on_stack)
11111 {
11112 gcc_assert (!reload_completed);
11113 m->call_ms2sysv = false;
11114 warn_once_call_ms2sysv_xlogues ("static call chains");
11115 }
11116
11117 /* Finally, compute which registers the stub will manage. */
11118 else
11119 {
11120 unsigned count = xlogue_layout::count_stub_managed_regs ();
11121 m->call_ms2sysv_extra_regs = count - xlogue_layout::MIN_REGS;
11122 m->call_ms2sysv_pad_in = 0;
11123 }
11124 }
11125
11126 frame->nregs = ix86_nsaved_regs ();
11127 frame->nsseregs = ix86_nsaved_sseregs ();
11128
11129 /* The 64-bit MS ABI seems to require stack alignment to always be 16,
11130 except for function prologues, leaf functions and when the default
11131 incoming stack boundary is overridden at the command line or via the
11132 force_align_arg_pointer attribute. */
11133 if ((TARGET_64BIT_MS_ABI && crtl->preferred_stack_boundary < 128)
11134 && (!crtl->is_leaf || cfun->calls_alloca != 0
11135 || ix86_current_function_calls_tls_descriptor
11136 || ix86_incoming_stack_boundary < 128))
11137 {
11138 crtl->preferred_stack_boundary = 128;
11139 crtl->stack_alignment_needed = 128;
11140 }
11141
11142 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
11143 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
11144
11145 gcc_assert (!size || stack_alignment_needed);
11146 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
11147 gcc_assert (preferred_alignment <= stack_alignment_needed);
11148
11149 /* The only ABI saving SSE regs should be 64-bit ms_abi. */
11150 gcc_assert (TARGET_64BIT || !frame->nsseregs);
11151 if (TARGET_64BIT && m->call_ms2sysv)
11152 {
11153 gcc_assert (stack_alignment_needed >= 16);
11154 gcc_assert (!frame->nsseregs);
11155 }
11156
11157 /* For SEH we have to limit the amount of code movement into the prologue.
11158 At present we do this via a BLOCKAGE, at which point there's very little
11159 scheduling that can be done, which means that there's very little point
11160 in doing anything except PUSHs. */
11161 if (TARGET_SEH)
11162 m->use_fast_prologue_epilogue = false;
11163 else if (!optimize_bb_for_size_p (ENTRY_BLOCK_PTR_FOR_FN (cfun)))
11164 {
11165 int count = frame->nregs;
11166 struct cgraph_node *node = cgraph_node::get (current_function_decl);
11167
11168 /* The fast prologue uses move instead of push to save registers. This
11169 is significantly longer, but also executes faster as modern hardware
11170 can execute the moves in parallel, but can't do that for push/pop.
11171
11172 Be careful about choosing which prologue to emit: when the function
11173 takes many instructions to execute, we may use the slow version, and
11174 likewise when the function is known to be outside a hot spot (this is
11175 known with feedback only). Weight the size of the function by the
11176 number of registers to save, as it is cheap to use one or two push
11177 instructions but very slow to use many of them. */
11178 if (count)
11179 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
11180 if (node->frequency < NODE_FREQUENCY_NORMAL
11181 || (flag_branch_probabilities
11182 && node->frequency < NODE_FREQUENCY_HOT))
11183 m->use_fast_prologue_epilogue = false;
11184 else
11185 m->use_fast_prologue_epilogue
11186 = !expensive_function_p (count);
11187 }
11188
11189 frame->save_regs_using_mov
11190 = (TARGET_PROLOGUE_USING_MOVE && m->use_fast_prologue_epilogue
11191 /* If static stack checking is enabled and done with probes,
11192 the registers need to be saved before allocating the frame. */
11193 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
11194
11195 /* Skip return address and error code in exception handler. */
11196 offset = INCOMING_FRAME_SP_OFFSET;
11197
11198 /* Skip pushed static chain. */
11199 if (ix86_static_chain_on_stack)
11200 offset += UNITS_PER_WORD;
11201
11202 /* Skip saved base pointer. */
11203 if (frame_pointer_needed)
11204 offset += UNITS_PER_WORD;
11205 frame->hfp_save_offset = offset;
11206
11207 /* The traditional frame pointer location is at the top of the frame. */
11208 frame->hard_frame_pointer_offset = offset;
11209
11210 /* Register save area */
11211 offset += frame->nregs * UNITS_PER_WORD;
11212 frame->reg_save_offset = offset;
11213
11214 /* On SEH target, registers are pushed just before the frame pointer
11215 location. */
11216 if (TARGET_SEH)
11217 frame->hard_frame_pointer_offset = offset;
11218
11219 /* Calculate the size of the va-arg area (not including padding, if any). */
11220 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
11221
11222 if (stack_realign_fp)
11223 {
11224 /* We may need a 16-byte aligned stack for the remainder of the
11225 register save area, but the stack frame for the local function
11226 may require a greater alignment if using AVX/2/512. In order
11227 to avoid wasting space, we first calculate the space needed for
11228 the rest of the register saves, add that to the stack pointer,
11229 and then realign the stack to the boundary of the start of the
11230 frame for the local function. */
11231 HOST_WIDE_INT space_needed = 0;
11232 HOST_WIDE_INT sse_reg_space_needed = 0;
11233
11234 if (TARGET_64BIT)
11235 {
11236 if (m->call_ms2sysv)
11237 {
11238 m->call_ms2sysv_pad_in = 0;
11239 space_needed = xlogue_layout::get_instance ().get_stack_space_used ();
11240 }
11241
11242 else if (frame->nsseregs)
11243 /* The only ABI that has saved SSE registers (Win64) also has a
11244 16-byte aligned default stack. However, many programs violate
11245 the ABI, and Wine64 forces stack realignment to compensate. */
11246 space_needed = frame->nsseregs * 16;
11247
11248 sse_reg_space_needed = space_needed = ROUND_UP (space_needed, 16);
11249
11250 /* 64-bit frame->va_arg_size should always be a multiple of 16, but
11251 rounding to be pedantic. */
11252 space_needed = ROUND_UP (space_needed + frame->va_arg_size, 16);
11253 }
11254 else
11255 space_needed = frame->va_arg_size;
11256
11257 /* Record the allocation size required prior to the realignment AND. */
11258 frame->stack_realign_allocate = space_needed;
11259
11260 /* The re-aligned stack starts at frame->stack_realign_offset. Values
11261 before this point are not directly comparable with values below
11262 this point. Use sp_valid_at to determine if the stack pointer is
11263 valid for a given offset, fp_valid_at for the frame pointer, or
11264 choose_baseaddr to have a base register chosen for you.
11265
11266 Note that the result of (frame->stack_realign_offset
11267 & (stack_alignment_needed - 1)) may not equal zero. */
11268 offset = ROUND_UP (offset + space_needed, stack_alignment_needed);
11269 frame->stack_realign_offset = offset - space_needed;
11270 frame->sse_reg_save_offset = frame->stack_realign_offset
11271 + sse_reg_space_needed;
11272 }
11273 else
11274 {
11275 frame->stack_realign_offset = offset;
11276
11277 if (TARGET_64BIT && m->call_ms2sysv)
11278 {
11279 m->call_ms2sysv_pad_in = !!(offset & UNITS_PER_WORD);
11280 offset += xlogue_layout::get_instance ().get_stack_space_used ();
11281 }
11282
11283 /* Align and set SSE register save area. */
11284 else if (frame->nsseregs)
11285 {
11286 /* If the incoming stack boundary is at least 16 bytes, or DRAP is
11287 required and the DRAP re-alignment boundary is at least 16 bytes,
11288 then we want the SSE register save area properly aligned. */
11289 if (ix86_incoming_stack_boundary >= 128
11290 || (stack_realign_drap && stack_alignment_needed >= 16))
11291 offset = ROUND_UP (offset, 16);
11292 offset += frame->nsseregs * 16;
11293 }
11294 frame->sse_reg_save_offset = offset;
11295 offset += frame->va_arg_size;
11296 }
11297
11298 /* Align start of frame for local function. */
11299 if (m->call_ms2sysv
11300 || frame->va_arg_size != 0
11301 || size != 0
11302 || !crtl->is_leaf
11303 || cfun->calls_alloca
11304 || ix86_current_function_calls_tls_descriptor)
11305 offset = ROUND_UP (offset, stack_alignment_needed);
11306
11307 /* Frame pointer points here. */
11308 frame->frame_pointer_offset = offset;
11309
11310 offset += size;
11311
11312 /* Add outgoing arguments area. Can be skipped if we eliminated
11313 all the function calls as dead code.
11314 Skipping is however impossible when the function calls alloca. The
11315 alloca expander assumes that the last crtl->outgoing_args_size bytes
11316 of the stack frame are unused. */
11317 if (ACCUMULATE_OUTGOING_ARGS
11318 && (!crtl->is_leaf || cfun->calls_alloca
11319 || ix86_current_function_calls_tls_descriptor))
11320 {
11321 offset += crtl->outgoing_args_size;
11322 frame->outgoing_arguments_size = crtl->outgoing_args_size;
11323 }
11324 else
11325 frame->outgoing_arguments_size = 0;
11326
11327 /* Align stack boundary. Only needed if we're calling another function
11328 or using alloca. */
11329 if (!crtl->is_leaf || cfun->calls_alloca
11330 || ix86_current_function_calls_tls_descriptor)
11331 offset = ROUND_UP (offset, preferred_alignment);
11332
11333 /* We've reached end of stack frame. */
11334 frame->stack_pointer_offset = offset;
11335
11336 /* Size prologue needs to allocate. */
11337 to_allocate = offset - frame->sse_reg_save_offset;
11338
11339 if ((!to_allocate && frame->nregs <= 1)
11340 || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000)))
11341 frame->save_regs_using_mov = false;
11342
11343 if (ix86_using_red_zone ()
11344 && crtl->sp_is_unchanging
11345 && crtl->is_leaf
11346 && !ix86_pc_thunk_call_expanded
11347 && !ix86_current_function_calls_tls_descriptor)
11348 {
11349 frame->red_zone_size = to_allocate;
11350 if (frame->save_regs_using_mov)
11351 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
11352 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
11353 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
11354 }
11355 else
11356 frame->red_zone_size = 0;
11357 frame->stack_pointer_offset -= frame->red_zone_size;
11358
11359 /* The SEH frame pointer location is near the bottom of the frame.
11360 This is enforced by the fact that the difference between the
11361 stack pointer and the frame pointer is limited to 240 bytes in
11362 the unwind data structure. */
11363 if (TARGET_SEH)
11364 {
11365 HOST_WIDE_INT diff;
11366
11367 /* If we can leave the frame pointer where it is, do so. Also, returns
11368 the establisher frame for __builtin_frame_address (0). */
11369 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
11370 if (diff <= SEH_MAX_FRAME_SIZE
11371 && (diff > 240 || (diff & 15) != 0)
11372 && !crtl->accesses_prior_frames)
11373 {
11374 /* Ideally we'd determine what portion of the local stack frame
11375 (within the constraint of the lowest 240) is most heavily used.
11376 But without that complication, simply bias the frame pointer
11377 by 128 bytes so as to maximize the amount of the local stack
11378 frame that is addressable with 8-bit offsets. */
11379 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
11380 }
11381 }
11382 }
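/* A rough worked example of the layout computed above, assuming the usual
   64-bit System V defaults (8-byte return address, 16-byte stack alignment,
   no accumulation of outgoing arguments): for a non-leaf function that
   needs a frame pointer, saves two call-saved GPRs and has 40 bytes of
   locals, the offsets come out roughly as

	return address			offset  8
	saved %rbp (hard FP)		offset 16   hard_frame_pointer_offset
	two pushed GPRs			offset 32   reg_save_offset
	(soft) frame pointer		offset 32   frame_pointer_offset
	40 bytes of locals		offset 72
	aligned end of frame		offset 80   stack_pointer_offset

   so the prologue pushes three registers (24 bytes) and then allocates
   to_allocate = 80 - 32 = 48 bytes with a single stack adjustment.  */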
11383
11384 /* This is semi-inlined memory_address_length, but simplified
11385 since we know that we're always dealing with reg+offset, and
11386 to avoid having to create and discard all that rtl. */
11387
11388 static inline int
11389 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
11390 {
11391 int len = 4;
11392
11393 if (offset == 0)
11394 {
11395 /* EBP and R13 cannot be encoded without an offset. */
11396 len = (regno == BP_REG || regno == R13_REG);
11397 }
11398 else if (IN_RANGE (offset, -128, 127))
11399 len = 1;
11400
11401 /* ESP and R12 must be encoded with a SIB byte. */
11402 if (regno == SP_REG || regno == R12_REG)
11403 len++;
11404
11405 return len;
11406 }
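/* A few worked examples of the length computed above, for illustration:
   (%ebp) or (%r13) need a disp8 even for offset 0, so the result is 1;
   4(%esp) needs a disp8 plus the mandatory SIB byte, so 1 + 1 = 2;
   512(%ebx) needs a full disp32, so 4.  */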
11407
11408 /* Determine if the stack pointer is valid for accessing the CFA_OFFSET in
11409 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11410
11411 static bool
11412 sp_valid_at (HOST_WIDE_INT cfa_offset)
11413 {
11414 const struct machine_frame_state &fs = cfun->machine->fs;
11415 if (fs.sp_realigned && cfa_offset <= fs.sp_realigned_offset)
11416 {
11417 /* Validate that the cfa_offset isn't in a "no-man's land". */
11418 gcc_assert (cfa_offset <= fs.sp_realigned_fp_last);
11419 return false;
11420 }
11421 return fs.sp_valid;
11422 }
11423
11424 /* Determine if the frame pointer is valid for accessing the CFA_OFFSET in
11425 the frame save area. The register is saved at CFA - CFA_OFFSET. */
11426
11427 static inline bool
11428 fp_valid_at (HOST_WIDE_INT cfa_offset)
11429 {
11430 const struct machine_frame_state &fs = cfun->machine->fs;
11431 if (fs.sp_realigned && cfa_offset > fs.sp_realigned_fp_last)
11432 {
11433 /* Validate that the cfa_offset isn't in a "no-man's land". */
11434 gcc_assert (cfa_offset >= fs.sp_realigned_offset);
11435 return false;
11436 }
11437 return fs.fp_valid;
11438 }
11439
11440 /* Choose a base register based upon alignment requested, speed and/or
11441 size. */
11442
11443 static void
11444 choose_basereg (HOST_WIDE_INT cfa_offset, rtx &base_reg,
11445 HOST_WIDE_INT &base_offset,
11446 unsigned int align_requested, unsigned int *align)
11447 {
11448 const struct machine_function *m = cfun->machine;
11449 unsigned int hfp_align;
11450 unsigned int drap_align;
11451 unsigned int sp_align;
11452 bool hfp_ok = fp_valid_at (cfa_offset);
11453 bool drap_ok = m->fs.drap_valid;
11454 bool sp_ok = sp_valid_at (cfa_offset);
11455
11456 hfp_align = drap_align = sp_align = INCOMING_STACK_BOUNDARY;
11457
11458 /* Filter out any registers that don't meet the requested alignment
11459 criteria. */
11460 if (align_requested)
11461 {
11462 if (m->fs.realigned)
11463 hfp_align = drap_align = sp_align = crtl->stack_alignment_needed;
11464 /* SEH unwind code does not currently support REG_CFA_EXPRESSION
11465 notes (which we would need in order to use a realigned stack pointer),
11466 so disable this on SEH targets. */
11467 else if (m->fs.sp_realigned)
11468 sp_align = crtl->stack_alignment_needed;
11469
11470 hfp_ok = hfp_ok && hfp_align >= align_requested;
11471 drap_ok = drap_ok && drap_align >= align_requested;
11472 sp_ok = sp_ok && sp_align >= align_requested;
11473 }
11474
11475 if (m->use_fast_prologue_epilogue)
11476 {
11477 /* Choose the base register most likely to allow the most scheduling
11478 opportunities. Generally FP is valid throughout the function,
11479 while DRAP must be reloaded within the epilogue. But choose either
11480 over the SP due to increased encoding size. */
11481
11482 if (hfp_ok)
11483 {
11484 base_reg = hard_frame_pointer_rtx;
11485 base_offset = m->fs.fp_offset - cfa_offset;
11486 }
11487 else if (drap_ok)
11488 {
11489 base_reg = crtl->drap_reg;
11490 base_offset = 0 - cfa_offset;
11491 }
11492 else if (sp_ok)
11493 {
11494 base_reg = stack_pointer_rtx;
11495 base_offset = m->fs.sp_offset - cfa_offset;
11496 }
11497 }
11498 else
11499 {
11500 HOST_WIDE_INT toffset;
11501 int len = 16, tlen;
11502
11503 /* Choose the base register with the smallest address encoding.
11504 With a tie, choose FP > DRAP > SP. */
11505 if (sp_ok)
11506 {
11507 base_reg = stack_pointer_rtx;
11508 base_offset = m->fs.sp_offset - cfa_offset;
11509 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
11510 }
11511 if (drap_ok)
11512 {
11513 toffset = 0 - cfa_offset;
11514 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
11515 if (tlen <= len)
11516 {
11517 base_reg = crtl->drap_reg;
11518 base_offset = toffset;
11519 len = tlen;
11520 }
11521 }
11522 if (hfp_ok)
11523 {
11524 toffset = m->fs.fp_offset - cfa_offset;
11525 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
11526 if (tlen <= len)
11527 {
11528 base_reg = hard_frame_pointer_rtx;
11529 base_offset = toffset;
11530 len = tlen;
11531 }
11532 }
11533 }
11534
11535 /* Set the align return value. */
11536 if (align)
11537 {
11538 if (base_reg == stack_pointer_rtx)
11539 *align = sp_align;
11540 else if (base_reg == crtl->drap_reg)
11541 *align = drap_align;
11542 else if (base_reg == hard_frame_pointer_rtx)
11543 *align = hfp_align;
11544 }
11545 }
11546
11547 /* Return an RTX that points to CFA_OFFSET within the stack frame and
11548 the alignment of address. If ALIGN is non-null, it should point to
11549 an alignment value (in bits) that is preferred or zero and will
11550 receive the alignment of the base register that was selected,
11551 irrespective of whether or not CFA_OFFSET is a multiple of that
11552 alignment value. If it is possible for the base register offset to be
11553 non-immediate then SCRATCH_REGNO should specify a scratch register to
11554 use.
11555
11556 The valid base registers are taken from CFUN->MACHINE->FS. */
11557
11558 static rtx
11559 choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align,
11560 unsigned int scratch_regno = INVALID_REGNUM)
11561 {
11562 rtx base_reg = NULL;
11563 HOST_WIDE_INT base_offset = 0;
11564
11565 /* If a specific alignment is requested, try to get a base register
11566 with that alignment first. */
11567 if (align && *align)
11568 choose_basereg (cfa_offset, base_reg, base_offset, *align, align);
11569
11570 if (!base_reg)
11571 choose_basereg (cfa_offset, base_reg, base_offset, 0, align);
11572
11573 gcc_assert (base_reg != NULL);
11574
11575 rtx base_offset_rtx = GEN_INT (base_offset);
11576
11577 if (!x86_64_immediate_operand (base_offset_rtx, Pmode))
11578 {
11579 gcc_assert (scratch_regno != INVALID_REGNUM);
11580
11581 rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11582 emit_move_insn (scratch_reg, base_offset_rtx);
11583
11584 return gen_rtx_PLUS (Pmode, base_reg, scratch_reg);
11585 }
11586
11587 return plus_constant (Pmode, base_reg, base_offset);
11588 }
11589
11590 /* Emit code to save registers in the prologue. */
11591
11592 static void
11593 ix86_emit_save_regs (void)
11594 {
11595 unsigned int regno;
11596 rtx_insn *insn;
11597
11598 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
11599 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11600 {
11601 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
11602 RTX_FRAME_RELATED_P (insn) = 1;
11603 }
11604 }
11605
11606 /* Emit a single register save at CFA - CFA_OFFSET. */
11607
11608 static void
11609 ix86_emit_save_reg_using_mov (machine_mode mode, unsigned int regno,
11610 HOST_WIDE_INT cfa_offset)
11611 {
11612 struct machine_function *m = cfun->machine;
11613 rtx reg = gen_rtx_REG (mode, regno);
11614 rtx mem, addr, base, insn;
11615 unsigned int align = GET_MODE_ALIGNMENT (mode);
11616
11617 addr = choose_baseaddr (cfa_offset, &align);
11618 mem = gen_frame_mem (mode, addr);
11619
11620 /* The location alignment depends upon the base register. */
11621 align = MIN (GET_MODE_ALIGNMENT (mode), align);
11622 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
11623 set_mem_align (mem, align);
11624
11625 insn = emit_insn (gen_rtx_SET (mem, reg));
11626 RTX_FRAME_RELATED_P (insn) = 1;
11627
11628 base = addr;
11629 if (GET_CODE (base) == PLUS)
11630 base = XEXP (base, 0);
11631 gcc_checking_assert (REG_P (base));
11632
11633 /* When saving registers into a re-aligned local stack frame, avoid
11634 any tricky guessing by dwarf2out. */
11635 if (m->fs.realigned)
11636 {
11637 gcc_checking_assert (stack_realign_drap);
11638
11639 if (regno == REGNO (crtl->drap_reg))
11640 {
11641 /* A bit of a hack. We force the DRAP register to be saved in
11642 the re-aligned stack frame, which provides us with a copy
11643 of the CFA that will last past the prologue. Install it. */
11644 gcc_checking_assert (cfun->machine->fs.fp_valid);
11645 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11646 cfun->machine->fs.fp_offset - cfa_offset);
11647 mem = gen_rtx_MEM (mode, addr);
11648 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
11649 }
11650 else
11651 {
11652 /* The frame pointer is a stable reference within the
11653 aligned frame. Use it. */
11654 gcc_checking_assert (cfun->machine->fs.fp_valid);
11655 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
11656 cfun->machine->fs.fp_offset - cfa_offset);
11657 mem = gen_rtx_MEM (mode, addr);
11658 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11659 }
11660 }
11661
11662 else if (base == stack_pointer_rtx && m->fs.sp_realigned
11663 && cfa_offset >= m->fs.sp_realigned_offset)
11664 {
11665 gcc_checking_assert (stack_realign_fp);
11666 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
11667 }
11668
11669 /* The memory may not be relative to the current CFA register,
11670 which means that we may need to generate a new pattern for
11671 use by the unwind info. */
11672 else if (base != m->fs.cfa_reg)
11673 {
11674 addr = plus_constant (Pmode, m->fs.cfa_reg,
11675 m->fs.cfa_offset - cfa_offset);
11676 mem = gen_rtx_MEM (mode, addr);
11677 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (mem, reg));
11678 }
11679 }
11680
11681 /* Emit code to save registers using MOV insns.
11682 First register is stored at CFA - CFA_OFFSET. */
11683 static void
11684 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
11685 {
11686 unsigned int regno;
11687
11688 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11689 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11690 {
11691 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
11692 cfa_offset -= UNITS_PER_WORD;
11693 }
11694 }
11695
11696 /* Emit code to save SSE registers using MOV insns.
11697 First register is stored at CFA - CFA_OFFSET. */
11698 static void
11699 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
11700 {
11701 unsigned int regno;
11702
11703 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
11704 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true, true))
11705 {
11706 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
11707 cfa_offset -= GET_MODE_SIZE (V4SFmode);
11708 }
11709 }
11710
11711 static GTY(()) rtx queued_cfa_restores;
11712
11713 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
11714 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
11715 Don't add the note if the previously saved value will be left untouched
11716 within stack red-zone till return, as unwinders can find the same value
11717 in the register and on the stack. */
11718
11719 static void
11720 ix86_add_cfa_restore_note (rtx_insn *insn, rtx reg, HOST_WIDE_INT cfa_offset)
11721 {
11722 if (!crtl->shrink_wrapped
11723 && cfa_offset <= cfun->machine->fs.red_zone_offset)
11724 return;
11725
11726 if (insn)
11727 {
11728 add_reg_note (insn, REG_CFA_RESTORE, reg);
11729 RTX_FRAME_RELATED_P (insn) = 1;
11730 }
11731 else
11732 queued_cfa_restores
11733 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
11734 }
11735
11736 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
11737
11738 static void
11739 ix86_add_queued_cfa_restore_notes (rtx insn)
11740 {
11741 rtx last;
11742 if (!queued_cfa_restores)
11743 return;
11744 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
11745 ;
11746 XEXP (last, 1) = REG_NOTES (insn);
11747 REG_NOTES (insn) = queued_cfa_restores;
11748 queued_cfa_restores = NULL_RTX;
11749 RTX_FRAME_RELATED_P (insn) = 1;
11750 }
11751
11752 /* Expand prologue or epilogue stack adjustment.
11753 The pattern exists to put a dependency on all ebp-based memory accesses.
11754 STYLE should be negative if instructions should be marked as frame related,
11755 zero if %r11 register is live and cannot be freely used and positive
11756 otherwise. */
11757
11758 static rtx
11759 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
11760 int style, bool set_cfa)
11761 {
11762 struct machine_function *m = cfun->machine;
11763 rtx insn;
11764 bool add_frame_related_expr = false;
11765
11766 if (Pmode == SImode)
11767 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
11768 else if (x86_64_immediate_operand (offset, DImode))
11769 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
11770 else
11771 {
11772 rtx tmp;
11773 /* r11 is used by indirect sibcall return as well, set before the
11774 epilogue and used after the epilogue. */
11775 if (style)
11776 tmp = gen_rtx_REG (DImode, R11_REG);
11777 else
11778 {
11779 gcc_assert (src != hard_frame_pointer_rtx
11780 && dest != hard_frame_pointer_rtx);
11781 tmp = hard_frame_pointer_rtx;
11782 }
11783 insn = emit_insn (gen_rtx_SET (tmp, offset));
11784 if (style < 0)
11785 add_frame_related_expr = true;
11786
11787 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
11788 }
11789
11790 insn = emit_insn (insn);
11791 if (style >= 0)
11792 ix86_add_queued_cfa_restore_notes (insn);
11793
11794 if (set_cfa)
11795 {
11796 rtx r;
11797
11798 gcc_assert (m->fs.cfa_reg == src);
11799 m->fs.cfa_offset += INTVAL (offset);
11800 m->fs.cfa_reg = dest;
11801
11802 r = gen_rtx_PLUS (Pmode, src, offset);
11803 r = gen_rtx_SET (dest, r);
11804 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
11805 RTX_FRAME_RELATED_P (insn) = 1;
11806 }
11807 else if (style < 0)
11808 {
11809 RTX_FRAME_RELATED_P (insn) = 1;
11810 if (add_frame_related_expr)
11811 {
11812 rtx r = gen_rtx_PLUS (Pmode, src, offset);
11813 r = gen_rtx_SET (dest, r);
11814 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
11815 }
11816 }
11817
11818 if (dest == stack_pointer_rtx)
11819 {
11820 HOST_WIDE_INT ooffset = m->fs.sp_offset;
11821 bool valid = m->fs.sp_valid;
11822 bool realigned = m->fs.sp_realigned;
11823
11824 if (src == hard_frame_pointer_rtx)
11825 {
11826 valid = m->fs.fp_valid;
11827 realigned = false;
11828 ooffset = m->fs.fp_offset;
11829 }
11830 else if (src == crtl->drap_reg)
11831 {
11832 valid = m->fs.drap_valid;
11833 realigned = false;
11834 ooffset = 0;
11835 }
11836 else
11837 {
11838 /* Else there are two possibilities: SP itself, which we set
11839 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
11840 taken care of by hand along the eh_return path. */
11841 gcc_checking_assert (src == stack_pointer_rtx
11842 || offset == const0_rtx);
11843 }
11844
11845 m->fs.sp_offset = ooffset - INTVAL (offset);
11846 m->fs.sp_valid = valid;
11847 m->fs.sp_realigned = realigned;
11848 }
11849 return insn;
11850 }
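/* A typical prologue use of the routine above (as seen later in this file
   for stack-clash probing) is

	pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
				   GEN_INT (-size), -1,
				   m->fs.cfa_reg == stack_pointer_rtx);

   i.e. style -1 so the adjustment is marked frame-related, and SET_CFA
   true exactly when the stack pointer is currently the CFA register, so
   the CFA tracking in m->fs stays in sync with the emitted code.  */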
11851
11852 /* Find an available register to be used as dynamic realign argument
11853 pointer register. Such a register will be written in the prologue and
11854 used at the beginning of the body, so it must not be
11855 1. parameter passing register.
11856 2. GOT pointer.
11857 We reuse static-chain register if it is available. Otherwise, we
11858 use DI for i386 and R13 for x86-64. We chose R13 since it has
11859 shorter encoding.
11860
11861 Return: the regno of chosen register. */
11862
11863 static unsigned int
11864 find_drap_reg (void)
11865 {
11866 tree decl = cfun->decl;
11867
11868 /* Always use callee-saved register if there are no caller-saved
11869 registers. */
11870 if (TARGET_64BIT)
11871 {
11872 /* Use R13 for a nested function or a function that needs a static
11873 chain. Since a function with a tail call may use any caller-saved
11874 register in the epilogue, DRAP must not use a caller-saved
11875 register in such a case. */
11876 if (DECL_STATIC_CHAIN (decl)
11877 || cfun->machine->no_caller_saved_registers
11878 || crtl->tail_call_emit)
11879 return R13_REG;
11880
11881 return R10_REG;
11882 }
11883 else
11884 {
11885 /* Use DI for a nested function or a function that needs a static
11886 chain. Since a function with a tail call may use any caller-saved
11887 register in the epilogue, DRAP must not use a caller-saved
11888 register in such a case. */
11889 if (DECL_STATIC_CHAIN (decl)
11890 || cfun->machine->no_caller_saved_registers
11891 || crtl->tail_call_emit)
11892 return DI_REG;
11893
11894 /* Reuse static chain register if it isn't used for parameter
11895 passing. */
11896 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
11897 {
11898 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
11899 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
11900 return CX_REG;
11901 }
11902 return DI_REG;
11903 }
11904 }
11905
11906 /* Handle a "force_align_arg_pointer" attribute. */
11907
11908 static tree
11909 ix86_handle_force_align_arg_pointer_attribute (tree *node, tree name,
11910 tree, int, bool *no_add_attrs)
11911 {
11912 if (TREE_CODE (*node) != FUNCTION_TYPE
11913 && TREE_CODE (*node) != METHOD_TYPE
11914 && TREE_CODE (*node) != FIELD_DECL
11915 && TREE_CODE (*node) != TYPE_DECL)
11916 {
11917 warning (OPT_Wattributes, "%qE attribute only applies to functions",
11918 name);
11919 *no_add_attrs = true;
11920 }
11921
11922 return NULL_TREE;
11923 }
11924
11925 /* Return minimum incoming stack alignment. */
11926
11927 static unsigned int
11928 ix86_minimum_incoming_stack_boundary (bool sibcall)
11929 {
11930 unsigned int incoming_stack_boundary;
11931
11932 /* Stack of interrupt handler is aligned to 128 bits in 64bit mode. */
11933 if (cfun->machine->func_type != TYPE_NORMAL)
11934 incoming_stack_boundary = TARGET_64BIT ? 128 : MIN_STACK_BOUNDARY;
11935 /* Prefer the one specified at command line. */
11936 else if (ix86_user_incoming_stack_boundary)
11937 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
11938 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack
11939 boundary if -mstackrealign is used, this isn't a sibcall check and
11940 the estimated stack alignment is 128 bits. */
11941 else if (!sibcall
11942 && ix86_force_align_arg_pointer
11943 && crtl->stack_alignment_estimated == 128)
11944 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11945 else
11946 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
11947
11948 /* Incoming stack alignment can be changed on individual functions
11949 via force_align_arg_pointer attribute. We use the smallest
11950 incoming stack boundary. */
11951 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
11952 && lookup_attribute (ix86_force_align_arg_pointer_string,
11953 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
11954 incoming_stack_boundary = MIN_STACK_BOUNDARY;
11955
11956 /* The incoming stack frame has to be aligned at least at
11957 parm_stack_boundary. */
11958 if (incoming_stack_boundary < crtl->parm_stack_boundary)
11959 incoming_stack_boundary = crtl->parm_stack_boundary;
11960
11961 /* Stack at entrance of main is aligned by runtime. We use the
11962 smallest incoming stack boundary. */
11963 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
11964 && DECL_NAME (current_function_decl)
11965 && MAIN_NAME_P (DECL_NAME (current_function_decl))
11966 && DECL_FILE_SCOPE_P (current_function_decl))
11967 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
11968
11969 return incoming_stack_boundary;
11970 }
11971
11972 /* Update incoming stack boundary and estimated stack alignment. */
11973
11974 static void
11975 ix86_update_stack_boundary (void)
11976 {
11977 ix86_incoming_stack_boundary
11978 = ix86_minimum_incoming_stack_boundary (false);
11979
11980 /* x86_64 vararg needs 16byte stack alignment for register save
11981 area. */
11982 if (TARGET_64BIT
11983 && cfun->stdarg
11984 && crtl->stack_alignment_estimated < 128)
11985 crtl->stack_alignment_estimated = 128;
11986
11987 /* __tls_get_addr needs to be called with 16-byte aligned stack. */
11988 if (ix86_tls_descriptor_calls_expanded_in_cfun
11989 && crtl->preferred_stack_boundary < 128)
11990 crtl->preferred_stack_boundary = 128;
11991 }
11992
11993 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
11994 needed or an rtx for DRAP otherwise. */
11995
11996 static rtx
11997 ix86_get_drap_rtx (void)
11998 {
11999 /* We must use DRAP if there are outgoing arguments on stack and
12000 ACCUMULATE_OUTGOING_ARGS is false. */
12001 if (ix86_force_drap
12002 || (cfun->machine->outgoing_args_on_stack
12003 && !ACCUMULATE_OUTGOING_ARGS))
12004 crtl->need_drap = true;
12005
12006 if (stack_realign_drap)
12007 {
12008 /* Assign DRAP to vDRAP and return vDRAP. */
12009 unsigned int regno = find_drap_reg ();
12010 rtx drap_vreg;
12011 rtx arg_ptr;
12012 rtx_insn *seq, *insn;
12013
12014 arg_ptr = gen_rtx_REG (Pmode, regno);
12015 crtl->drap_reg = arg_ptr;
12016
12017 start_sequence ();
12018 drap_vreg = copy_to_reg (arg_ptr);
12019 seq = get_insns ();
12020 end_sequence ();
12021
12022 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
12023 if (!optimize)
12024 {
12025 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
12026 RTX_FRAME_RELATED_P (insn) = 1;
12027 }
12028 return drap_vreg;
12029 }
12030 else
12031 return NULL;
12032 }
12033
12034 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
12035
12036 static rtx
12037 ix86_internal_arg_pointer (void)
12038 {
12039 return virtual_incoming_args_rtx;
12040 }
12041
12042 struct scratch_reg {
12043 rtx reg;
12044 bool saved;
12045 };
12046
12047 /* Return a short-lived scratch register for use on function entry.
12048 In 32-bit mode, it is valid only after the registers are saved
12049 in the prologue. This register must be released by means of
12050 release_scratch_register_on_entry once it is dead. */
12051
12052 static void
12053 get_scratch_register_on_entry (struct scratch_reg *sr)
12054 {
12055 int regno;
12056
12057 sr->saved = false;
12058
12059 if (TARGET_64BIT)
12060 {
12061 /* We always use R11 in 64-bit mode. */
12062 regno = R11_REG;
12063 }
12064 else
12065 {
12066 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
12067 bool fastcall_p
12068 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12069 bool thiscall_p
12070 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
12071 bool static_chain_p = DECL_STATIC_CHAIN (decl);
12072 int regparm = ix86_function_regparm (fntype, decl);
12073 int drap_regno
12074 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
12075
12076 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
12077 for the static chain register. */
12078 if ((regparm < 1 || (fastcall_p && !static_chain_p))
12079 && drap_regno != AX_REG)
12080 regno = AX_REG;
12081 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
12082 for the static chain register. */
12083 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
12084 regno = AX_REG;
12085 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
12086 regno = DX_REG;
12087 /* ecx is the static chain register. */
12088 else if (regparm < 3 && !fastcall_p && !thiscall_p
12089 && !static_chain_p
12090 && drap_regno != CX_REG)
12091 regno = CX_REG;
12092 else if (ix86_save_reg (BX_REG, true, false))
12093 regno = BX_REG;
12094 /* esi is the static chain register. */
12095 else if (!(regparm == 3 && static_chain_p)
12096 && ix86_save_reg (SI_REG, true, false))
12097 regno = SI_REG;
12098 else if (ix86_save_reg (DI_REG, true, false))
12099 regno = DI_REG;
12100 else
12101 {
12102 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
12103 sr->saved = true;
12104 }
12105 }
12106
12107 sr->reg = gen_rtx_REG (Pmode, regno);
12108 if (sr->saved)
12109 {
12110 rtx_insn *insn = emit_insn (gen_push (sr->reg));
12111 RTX_FRAME_RELATED_P (insn) = 1;
12112 }
12113 }
12114
12115 /* Release a scratch register obtained from the preceding function. */
12116
12117 static void
12118 release_scratch_register_on_entry (struct scratch_reg *sr)
12119 {
12120 if (sr->saved)
12121 {
12122 struct machine_function *m = cfun->machine;
12123 rtx x, insn = emit_insn (gen_pop (sr->reg));
12124
12125 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
12126 RTX_FRAME_RELATED_P (insn) = 1;
12127 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
12128 x = gen_rtx_SET (stack_pointer_rtx, x);
12129 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
12130 m->fs.sp_offset -= UNITS_PER_WORD;
12131 }
12132 }
12133
12134 /* Return the probing interval for -fstack-clash-protection. */
12135
12136 static HOST_WIDE_INT
12137 get_probe_interval (void)
12138 {
12139 if (flag_stack_clash_protection)
12140 return (HOST_WIDE_INT_1U
12141 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL));
12142 else
12143 return (HOST_WIDE_INT_1U << STACK_CHECK_PROBE_INTERVAL_EXP);
12144 }
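/* For reference: with the default parameter values (both
   STACK_CHECK_PROBE_INTERVAL_EXP and the
   stack-clash-protection-probe-interval --param default to 12 as of this
   writing), either branch above yields 1 << 12 = 4096 bytes, i.e. probes
   are emitted at page granularity.  */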
12145
12146 /* Emit code to adjust the stack pointer by SIZE bytes while probing it.
12147
12148 This differs from the next routine in that it tries hard to prevent
12149 attacks that jump the stack guard. Thus it is never allowed to allocate
12150 more than PROBE_INTERVAL bytes of stack space without a suitable
12151 probe. */
12152
12153 static void
12154 ix86_adjust_stack_and_probe_stack_clash (const HOST_WIDE_INT size)
12155 {
12156 struct machine_function *m = cfun->machine;
12157
12158 /* If this function does not statically allocate stack space, then
12159 no probes are needed. */
12160 if (!size)
12161 {
12162 /* However, the allocation of space via pushes for register
12163 saves could be viewed as allocating space, but without the
12164 need to probe. */
12165 if (m->frame.nregs || m->frame.nsseregs || frame_pointer_needed)
12166 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12167 else
12168 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
12169 return;
12170 }
12171
12172 /* If we are a noreturn function, then we have to consider the
12173 possibility that we're called via a jump rather than a call.
12174
12175 Thus we don't have the implicit probe generated by saving the
12176 return address into the stack at the call. Thus, the stack
12177 pointer could be anywhere in the guard page. The safe thing
12178 to do is emit a probe now.
12179
12180 ?!? This should be revamped to work like aarch64 and s390 where
12181 we track the offset from the most recent probe. Normally that
12182 offset would be zero. For a noreturn function we would reset
12183 it to PROBE_INTERVAL - (STACK_BOUNDARY / BITS_PER_UNIT). Then
12184 we just probe when we cross PROBE_INTERVAL. */
12185 if (TREE_THIS_VOLATILE (cfun->decl))
12186 {
12187 /* We can safely use any register here since we're just going to push
12188 its value and immediately pop it back. But we do try and avoid
12189 argument passing registers so as not to introduce dependencies in
12190 the pipeline. For 32 bit we use %esi and for 64 bit we use %rax. */
12191 rtx dummy_reg = gen_rtx_REG (word_mode, TARGET_64BIT ? AX_REG : SI_REG);
12192 rtx_insn *insn = emit_insn (gen_push (dummy_reg));
12193 RTX_FRAME_RELATED_P (insn) = 1;
12194 ix86_emit_restore_reg_using_pop (dummy_reg);
12195 emit_insn (gen_blockage ());
12196 }
12197
12198 /* If we allocate less than the size of the guard statically,
12199 then no probing is necessary, but we do need to allocate
12200 the stack. */
12201 if (size < (1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE)))
12202 {
12203 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12204 GEN_INT (-size), -1,
12205 m->fs.cfa_reg == stack_pointer_rtx);
12206 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
12207 return;
12208 }
12209
12210 /* We're allocating a large enough stack frame that we need to
12211 emit probes. Either emit them inline or in a loop depending
12212 on the size. */
12213 HOST_WIDE_INT probe_interval = get_probe_interval ();
12214 if (size <= 4 * probe_interval)
12215 {
12216 HOST_WIDE_INT i;
12217 for (i = probe_interval; i <= size; i += probe_interval)
12218 {
12219 /* Allocate PROBE_INTERVAL bytes. */
12220 rtx insn
12221 = pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12222 GEN_INT (-probe_interval), -1,
12223 m->fs.cfa_reg == stack_pointer_rtx);
12224 add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
12225
12226 /* And probe at *sp. */
12227 emit_stack_probe (stack_pointer_rtx);
12228 emit_insn (gen_blockage ());
12229 }
12230
12231 /* We need to allocate space for the residual, but we do not need
12232 to probe the residual. */
12233 HOST_WIDE_INT residual = (i - probe_interval - size);
12234 if (residual)
12235 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12236 GEN_INT (residual), -1,
12237 m->fs.cfa_reg == stack_pointer_rtx);
12238 dump_stack_clash_frame_info (PROBE_INLINE, residual != 0);
12239 }
12240 else
12241 {
12242 struct scratch_reg sr;
12243 get_scratch_register_on_entry (&sr);
12244
12245 /* Step 1: round SIZE down to a multiple of the interval. */
12246 HOST_WIDE_INT rounded_size = size & -probe_interval;
12247
12248 /* Step 2: compute final value of the loop counter. Use lea if
12249 possible. */
12250 rtx addr = plus_constant (Pmode, stack_pointer_rtx, -rounded_size);
12251 rtx insn;
12252 if (address_no_seg_operand (addr, Pmode))
12253 insn = emit_insn (gen_rtx_SET (sr.reg, addr));
12254 else
12255 {
12256 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12257 insn = emit_insn (gen_rtx_SET (sr.reg,
12258 gen_rtx_PLUS (Pmode, sr.reg,
12259 stack_pointer_rtx)));
12260 }
12261 if (m->fs.cfa_reg == stack_pointer_rtx)
12262 {
12263 add_reg_note (insn, REG_CFA_DEF_CFA,
12264 plus_constant (Pmode, sr.reg,
12265 m->fs.cfa_offset + rounded_size));
12266 RTX_FRAME_RELATED_P (insn) = 1;
12267 }
12268
12269 /* Step 3: the loop. */
12270 rtx size_rtx = GEN_INT (rounded_size);
12271 insn = emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg,
12272 size_rtx));
12273 if (m->fs.cfa_reg == stack_pointer_rtx)
12274 {
12275 m->fs.cfa_offset += rounded_size;
12276 add_reg_note (insn, REG_CFA_DEF_CFA,
12277 plus_constant (Pmode, stack_pointer_rtx,
12278 m->fs.cfa_offset));
12279 RTX_FRAME_RELATED_P (insn) = 1;
12280 }
12281 m->fs.sp_offset += rounded_size;
12282 emit_insn (gen_blockage ());
12283
12284 /* Step 4: adjust SP if we cannot assert at compile-time that SIZE
12285 is equal to ROUNDED_SIZE. */
12286
12287 if (size != rounded_size)
12288 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
12289 GEN_INT (rounded_size - size), -1,
12290 m->fs.cfa_reg == stack_pointer_rtx);
12291 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
12292
12293 release_scratch_register_on_entry (&sr);
12294 }
12295
12296 /* Make sure nothing is scheduled before we are done. */
12297 emit_insn (gen_blockage ());
12298 }
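/* A rough worked example, assuming the default 4096-byte probe interval:
   for SIZE = 18000, 18000 > 4 * 4096, so the loop form is used;
   rounded_size = 16384, the loop allocates and probes four 4096-byte
   chunks, and the remaining 18000 - 16384 = 1616 bytes are then allocated
   by the final adjustment without a probe.  */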
12299
12300 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
12301
12302 static void
12303 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
12304 {
12305 /* We skip the probe for the first interval + a small dope of 4 words and
12306 probe that many bytes past the specified size to maintain a protection
12307 area at the bottom of the stack. */
12308 const int dope = 4 * UNITS_PER_WORD;
12309 rtx size_rtx = GEN_INT (size), last;
12310
12311 /* See if we have a constant small number of probes to generate. If so,
12312 that's the easy case. The run-time loop is made up of 9 insns in the
12313 generic case while the compile-time loop is made up of 3+2*(n-1) insns
12314 for n # of intervals. */
12315 if (size <= 4 * get_probe_interval ())
12316 {
12317 HOST_WIDE_INT i, adjust;
12318 bool first_probe = true;
12319
12320 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
12321 values of N from 1 until it exceeds SIZE. If only one probe is
12322 needed, this will not generate any code. Then adjust and probe
12323 to PROBE_INTERVAL + SIZE. */
12324 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12325 {
12326 if (first_probe)
12327 {
12328 adjust = 2 * get_probe_interval () + dope;
12329 first_probe = false;
12330 }
12331 else
12332 adjust = get_probe_interval ();
12333
12334 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12335 plus_constant (Pmode, stack_pointer_rtx,
12336 -adjust)));
12337 emit_stack_probe (stack_pointer_rtx);
12338 }
12339
12340 if (first_probe)
12341 adjust = size + get_probe_interval () + dope;
12342 else
12343 adjust = size + get_probe_interval () - i;
12344
12345 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12346 plus_constant (Pmode, stack_pointer_rtx,
12347 -adjust)));
12348 emit_stack_probe (stack_pointer_rtx);
12349
12350 /* Adjust back to account for the additional first interval. */
12351 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12352 plus_constant (Pmode, stack_pointer_rtx,
12353 (get_probe_interval ()
12354 + dope))));
12355 }
12356
12357 /* Otherwise, do the same as above, but in a loop. Note that we must be
12358 extra careful with variables wrapping around because we might be at
12359 the very top (or the very bottom) of the address space and we have
12360 to be able to handle this case properly; in particular, we use an
12361 equality test for the loop condition. */
12362 else
12363 {
12364 HOST_WIDE_INT rounded_size;
12365 struct scratch_reg sr;
12366
12367 get_scratch_register_on_entry (&sr);
12368
12369
12370 /* Step 1: round SIZE to the previous multiple of the interval. */
12371
12372 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12373
12374
12375 /* Step 2: compute initial and final value of the loop counter. */
12376
12377 /* SP = SP_0 + PROBE_INTERVAL. */
12378 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12379 plus_constant (Pmode, stack_pointer_rtx,
12380 - (get_probe_interval () + dope))));
12381
12382 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
12383 if (rounded_size <= (HOST_WIDE_INT_1 << 31))
12384 emit_insn (gen_rtx_SET (sr.reg,
12385 plus_constant (Pmode, stack_pointer_rtx,
12386 -rounded_size)));
12387 else
12388 {
12389 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
12390 emit_insn (gen_rtx_SET (sr.reg,
12391 gen_rtx_PLUS (Pmode, sr.reg,
12392 stack_pointer_rtx)));
12393 }
12394
12395
12396 /* Step 3: the loop
12397
12398 do
12399 {
12400 SP = SP + PROBE_INTERVAL
12401 probe at SP
12402 }
12403 while (SP != LAST_ADDR)
12404
12405 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
12406 values of N from 1 until it is equal to ROUNDED_SIZE. */
12407
12408 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
12409
12410
12411 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
12412 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
12413
12414 if (size != rounded_size)
12415 {
12416 emit_insn (gen_rtx_SET (stack_pointer_rtx,
12417 plus_constant (Pmode, stack_pointer_rtx,
12418 rounded_size - size)));
12419 emit_stack_probe (stack_pointer_rtx);
12420 }
12421
12422 /* Adjust back to account for the additional first interval. */
12423 last = emit_insn (gen_rtx_SET (stack_pointer_rtx,
12424 plus_constant (Pmode, stack_pointer_rtx,
12425 (get_probe_interval ()
12426 + dope))));
12427
12428 release_scratch_register_on_entry (&sr);
12429 }
12430
12431 /* Even if the stack pointer isn't the CFA register, we need to correctly
12432 describe the adjustments made to it, in particular differentiate the
12433 frame-related ones from the frame-unrelated ones. */
12434 if (size > 0)
12435 {
12436 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
12437 XVECEXP (expr, 0, 0)
12438 = gen_rtx_SET (stack_pointer_rtx,
12439 plus_constant (Pmode, stack_pointer_rtx, -size));
12440 XVECEXP (expr, 0, 1)
12441 = gen_rtx_SET (stack_pointer_rtx,
12442 plus_constant (Pmode, stack_pointer_rtx,
12443 get_probe_interval () + dope + size));
12444 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
12445 RTX_FRAME_RELATED_P (last) = 1;
12446
12447 cfun->machine->fs.sp_offset += size;
12448 }
12449
12450 /* Make sure nothing is scheduled before we are done. */
12451 emit_insn (gen_blockage ());
12452 }
12453
12454 /* Adjust the stack pointer up to REG while probing it. */
12455
12456 const char *
12457 output_adjust_stack_and_probe (rtx reg)
12458 {
12459 static int labelno = 0;
12460 char loop_lab[32];
12461 rtx xops[2];
12462
12463 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12464
12465 /* Loop. */
12466 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12467
12468 /* SP = SP + PROBE_INTERVAL. */
12469 xops[0] = stack_pointer_rtx;
12470 xops[1] = GEN_INT (get_probe_interval ());
12471 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12472
12473 /* Probe at SP. */
12474 xops[1] = const0_rtx;
12475 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
12476
12477 /* Test if SP == LAST_ADDR. */
12478 xops[0] = stack_pointer_rtx;
12479 xops[1] = reg;
12480 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12481
12482 /* Branch. */
12483 fputs ("\tjne\t", asm_out_file);
12484 assemble_name_raw (asm_out_file, loop_lab);
12485 fputc ('\n', asm_out_file);
12486
12487 return "";
12488 }
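/* Editor's illustration (not part of the original sources): assuming a
   4 KiB probe interval, a 64-bit target and a scratch register that
   happens to be %r11, the loop printed by the routine above comes out
   roughly as

	.LPSRL0:
	subq	$4096, %rsp
	orq	$0, (%rsp)
	cmpq	%r11, %rsp
	jne	.LPSRL0

   i.e. each iteration moves the stack pointer down by one interval and
   touches the freshly exposed page before testing against LAST_ADDR.  */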
12489
12490 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
12491 inclusive. These are offsets from the current stack pointer. */
12492
12493 static void
12494 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
12495 {
12496 /* See if we have a constant small number of probes to generate. If so,
12497 that's the easy case. The run-time loop is made up of 6 insns in the
12498 generic case while the compile-time loop is made up of n insns for n
12499 intervals. */
12500 if (size <= 6 * get_probe_interval ())
12501 {
12502 HOST_WIDE_INT i;
12503
12504 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
12505 it exceeds SIZE. If only one probe is needed, this will not
12506 generate any code. Then probe at FIRST + SIZE. */
12507 for (i = get_probe_interval (); i < size; i += get_probe_interval ())
12508 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12509 -(first + i)));
12510
12511 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
12512 -(first + size)));
12513 }
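/* Illustrative note, added for clarity (not part of the original sources):
   with FIRST == 0, SIZE == 12288 and a 4 KiB interval, the unrolled case
   above emits probes at sp - 4096 and sp - 8192 from the loop, and the
   final emit_stack_probe covers sp - 12288.  */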
12514
12515 /* Otherwise, do the same as above, but in a loop. Note that we must be
12516 extra careful with variables wrapping around because we might be at
12517 the very top (or the very bottom) of the address space and we have
12518 to be able to handle this case properly; in particular, we use an
12519 equality test for the loop condition. */
12520 else
12521 {
12522 HOST_WIDE_INT rounded_size, last;
12523 struct scratch_reg sr;
12524
12525 get_scratch_register_on_entry (&sr);
12526
12527
12528 /* Step 1: round SIZE to the previous multiple of the interval. */
12529
12530 rounded_size = ROUND_DOWN (size, get_probe_interval ());
12531
12532
12533 /* Step 2: compute initial and final value of the loop counter. */
12534
12535 /* TEST_OFFSET = FIRST. */
12536 emit_move_insn (sr.reg, GEN_INT (-first));
12537
12538 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
12539 last = first + rounded_size;
12540
12541
12542 /* Step 3: the loop
12543
12544 do
12545 {
12546 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
12547 probe at TEST_ADDR
12548 }
12549 while (TEST_ADDR != LAST_ADDR)
12550
12551 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
12552 until N * PROBE_INTERVAL is equal to ROUNDED_SIZE. */
12553
12554 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
12555
12556
12557 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
12558 that SIZE is equal to ROUNDED_SIZE. */
12559
12560 if (size != rounded_size)
12561 emit_stack_probe (plus_constant (Pmode,
12562 gen_rtx_PLUS (Pmode,
12563 stack_pointer_rtx,
12564 sr.reg),
12565 rounded_size - size));
12566
12567 release_scratch_register_on_entry (&sr);
12568 }
12569
12570 /* Make sure nothing is scheduled before we are done. */
12571 emit_insn (gen_blockage ());
12572 }
12573
12574 /* Probe a range of stack addresses from REG to END, inclusive. These are
12575 offsets from the current stack pointer. */
12576
12577 const char *
12578 output_probe_stack_range (rtx reg, rtx end)
12579 {
12580 static int labelno = 0;
12581 char loop_lab[32];
12582 rtx xops[3];
12583
12584 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
12585
12586 /* Loop. */
12587 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
12588
12589 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
12590 xops[0] = reg;
12591 xops[1] = GEN_INT (get_probe_interval ());
12592 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
12593
12594 /* Probe at TEST_ADDR. */
12595 xops[0] = stack_pointer_rtx;
12596 xops[1] = reg;
12597 xops[2] = const0_rtx;
12598 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
12599
12600 /* Test if TEST_ADDR == LAST_ADDR. */
12601 xops[0] = reg;
12602 xops[1] = end;
12603 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
12604
12605 /* Branch. */
12606 fputs ("\tjne\t", asm_out_file);
12607 assemble_name_raw (asm_out_file, loop_lab);
12608 fputc ('\n', asm_out_file);
12609
12610 return "";
12611 }
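/* Editor's illustration (not part of the original sources): assuming a
   4 KiB interval on a 64-bit target, with the probe register in %r11 and
   the bound in %r10, the loop printed above looks roughly like

	.LPSRL1:
	subq	$4096, %r11
	orq	$0, (%rsp,%r11)
	cmpq	%r10, %r11
	jne	.LPSRL1

   The register holds negative offsets from the stack pointer, which is
   why the interval is subtracted even though the comments speak of
   adding it.  */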
12612
12613 /* Finalize stack_realign_needed and frame_pointer_needed flags, which
12614 will guide prologue/epilogue to be generated in correct form. */
12615
12616 static void
12617 ix86_finalize_stack_frame_flags (void)
12618 {
12619 /* Check if stack realignment is really needed after reload, and
12620 store the result in cfun. */
12621 unsigned int incoming_stack_boundary
12622 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
12623 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
12624 unsigned int stack_alignment
12625 = (crtl->is_leaf && !ix86_current_function_calls_tls_descriptor
12626 ? crtl->max_used_stack_slot_alignment
12627 : crtl->stack_alignment_needed);
12628 unsigned int stack_realign
12629 = (incoming_stack_boundary < stack_alignment);
12630 bool recompute_frame_layout_p = false;
12631
12632 if (crtl->stack_realign_finalized)
12633 {
12634 /* After stack_realign_needed is finalized, we can no longer
12635 change it. */
12636 gcc_assert (crtl->stack_realign_needed == stack_realign);
12637 return;
12638 }
12639
12640 /* If the only reason for frame_pointer_needed is that we conservatively
12641 assumed stack realignment might be needed or -fno-omit-frame-pointer
12642 is used, but in the end nothing that needed the stack alignment was
12643 spilled and no stack access is required, clear frame_pointer_needed
12644 and say we don't need stack realignment. */
12645 if ((stack_realign || !flag_omit_frame_pointer)
12646 && frame_pointer_needed
12647 && crtl->is_leaf
12648 && crtl->sp_is_unchanging
12649 && !ix86_current_function_calls_tls_descriptor
12650 && !crtl->accesses_prior_frames
12651 && !cfun->calls_alloca
12652 && !crtl->calls_eh_return
12653 /* See ira_setup_eliminable_regset for the rationale. */
12654 && !(STACK_CHECK_MOVING_SP
12655 && flag_stack_check
12656 && flag_exceptions
12657 && cfun->can_throw_non_call_exceptions)
12658 && !ix86_frame_pointer_required ()
12659 && get_frame_size () == 0
12660 && ix86_nsaved_sseregs () == 0
12661 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
12662 {
12663 HARD_REG_SET set_up_by_prologue, prologue_used;
12664 basic_block bb;
12665
12666 CLEAR_HARD_REG_SET (prologue_used);
12667 CLEAR_HARD_REG_SET (set_up_by_prologue);
12668 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
12669 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
12670 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
12671 HARD_FRAME_POINTER_REGNUM);
12672
12673 /* The preferred stack alignment is the minimum stack alignment. */
12674 if (stack_alignment > crtl->preferred_stack_boundary)
12675 stack_alignment = crtl->preferred_stack_boundary;
12676
12677 bool require_stack_frame = false;
12678
12679 FOR_EACH_BB_FN (bb, cfun)
12680 {
12681 rtx_insn *insn;
12682 FOR_BB_INSNS (bb, insn)
12683 if (NONDEBUG_INSN_P (insn)
12684 && requires_stack_frame_p (insn, prologue_used,
12685 set_up_by_prologue))
12686 {
12687 require_stack_frame = true;
12688
12689 if (stack_realign)
12690 {
12691 /* Find the maximum stack alignment. */
12692 subrtx_iterator::array_type array;
12693 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
12694 if (MEM_P (*iter)
12695 && (reg_mentioned_p (stack_pointer_rtx,
12696 *iter)
12697 || reg_mentioned_p (frame_pointer_rtx,
12698 *iter)))
12699 {
12700 unsigned int alignment = MEM_ALIGN (*iter);
12701 if (alignment > stack_alignment)
12702 stack_alignment = alignment;
12703 }
12704 }
12705 }
12706 }
12707
12708 if (require_stack_frame)
12709 {
12710 /* A stack frame is required. If the stack alignment needed does
12711 not exceed the incoming stack boundary, don't realign the stack. */
12712 stack_realign = incoming_stack_boundary < stack_alignment;
12713 if (!stack_realign)
12714 {
12715 crtl->max_used_stack_slot_alignment
12716 = incoming_stack_boundary;
12717 crtl->stack_alignment_needed
12718 = incoming_stack_boundary;
12719 /* Also update preferred_stack_boundary for leaf
12720 functions. */
12721 crtl->preferred_stack_boundary
12722 = incoming_stack_boundary;
12723 }
12724 }
12725 else
12726 {
12727 /* If drap has been set, but it actually isn't live at the
12728 start of the function, there is no reason to set it up. */
12729 if (crtl->drap_reg)
12730 {
12731 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12732 if (! REGNO_REG_SET_P (DF_LR_IN (bb),
12733 REGNO (crtl->drap_reg)))
12734 {
12735 crtl->drap_reg = NULL_RTX;
12736 crtl->need_drap = false;
12737 }
12738 }
12739 else
12740 cfun->machine->no_drap_save_restore = true;
12741
12742 frame_pointer_needed = false;
12743 stack_realign = false;
12744 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
12745 crtl->stack_alignment_needed = incoming_stack_boundary;
12746 crtl->stack_alignment_estimated = incoming_stack_boundary;
12747 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
12748 crtl->preferred_stack_boundary = incoming_stack_boundary;
12749 df_finish_pass (true);
12750 df_scan_alloc (NULL);
12751 df_scan_blocks ();
12752 df_compute_regs_ever_live (true);
12753 df_analyze ();
12754
12755 if (flag_var_tracking)
12756 {
12757 /* Since frame pointer is no longer available, replace it with
12758 stack pointer - UNITS_PER_WORD in debug insns. */
12759 df_ref ref, next;
12760 for (ref = DF_REG_USE_CHAIN (HARD_FRAME_POINTER_REGNUM);
12761 ref; ref = next)
12762 {
12763 next = DF_REF_NEXT_REG (ref);
12764 if (!DF_REF_INSN_INFO (ref))
12765 continue;
12766
12767 /* Make sure the next ref is for a different instruction,
12768 so that we're not affected by the rescan. */
12769 rtx_insn *insn = DF_REF_INSN (ref);
12770 while (next && DF_REF_INSN (next) == insn)
12771 next = DF_REF_NEXT_REG (next);
12772
12773 if (DEBUG_INSN_P (insn))
12774 {
12775 bool changed = false;
12776 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
12777 {
12778 rtx *loc = DF_REF_LOC (ref);
12779 if (*loc == hard_frame_pointer_rtx)
12780 {
12781 *loc = plus_constant (Pmode,
12782 stack_pointer_rtx,
12783 -UNITS_PER_WORD);
12784 changed = true;
12785 }
12786 }
12787 if (changed)
12788 df_insn_rescan (insn);
12789 }
12790 }
12791 }
12792
12793 recompute_frame_layout_p = true;
12794 }
12795 }
12796
12797 if (crtl->stack_realign_needed != stack_realign)
12798 recompute_frame_layout_p = true;
12799 crtl->stack_realign_needed = stack_realign;
12800 crtl->stack_realign_finalized = true;
12801 if (recompute_frame_layout_p)
12802 ix86_compute_frame_layout ();
12803 }
12804
12805 /* Delete SET_GOT right after entry block if it is allocated to reg. */
12806
12807 static void
12808 ix86_elim_entry_set_got (rtx reg)
12809 {
12810 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
12811 rtx_insn *c_insn = BB_HEAD (bb);
12812 if (!NONDEBUG_INSN_P (c_insn))
12813 c_insn = next_nonnote_nondebug_insn (c_insn);
12814 if (c_insn && NONJUMP_INSN_P (c_insn))
12815 {
12816 rtx pat = PATTERN (c_insn);
12817 if (GET_CODE (pat) == PARALLEL)
12818 {
12819 rtx vec = XVECEXP (pat, 0, 0);
12820 if (GET_CODE (vec) == SET
12821 && XINT (XEXP (vec, 1), 1) == UNSPEC_SET_GOT
12822 && REGNO (XEXP (vec, 0)) == REGNO (reg))
12823 delete_insn (c_insn);
12824 }
12825 }
12826 }
12827
12828 static rtx
12829 gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store)
12830 {
12831 rtx addr, mem;
12832
12833 if (offset)
12834 addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset));
12835 mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg);
12836 return gen_rtx_SET (store ? mem : reg, store ? reg : mem);
12837 }
12838
12839 static inline rtx
12840 gen_frame_load (rtx reg, rtx frame_reg, int offset)
12841 {
12842 return gen_frame_set (reg, frame_reg, offset, false);
12843 }
12844
12845 static inline rtx
12846 gen_frame_store (rtx reg, rtx frame_reg, int offset)
12847 {
12848 return gen_frame_set (reg, frame_reg, offset, true);
12849 }
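/* Illustrative note, added for clarity (not part of the original sources):
   on a 64-bit target, gen_frame_store (reg, rax, -48) builds roughly
   (set (mem (plus (reg rax) (const_int -48))) (reg)), i.e. a store of REG
   at offset -48 from the base register; gen_frame_load is the mirror
   image with source and destination swapped.  */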
12850
12851 static void
12852 ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame)
12853 {
12854 struct machine_function *m = cfun->machine;
12855 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
12856 + m->call_ms2sysv_extra_regs;
12857 rtvec v = rtvec_alloc (ncregs + 1);
12858 unsigned int align, i, vi = 0;
12859 rtx_insn *insn;
12860 rtx sym, addr;
12861 rtx rax = gen_rtx_REG (word_mode, AX_REG);
12862 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
12863
12864 /* AL should only be live with sysv_abi. */
12865 gcc_assert (!ix86_eax_live_at_start_p ());
12866 gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset);
12867
12868 /* Set up RAX as the stub's base pointer. We use stack_realign_offset
12869 whether or not we've actually realigned the stack. */
12870 align = GET_MODE_ALIGNMENT (V4SFmode);
12871 addr = choose_baseaddr (frame.stack_realign_offset
12872 + xlogue.get_stub_ptr_offset (), &align, AX_REG);
12873 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
12874
12875 emit_insn (gen_rtx_SET (rax, addr));
12876
12877 /* Get the stub symbol. */
12878 sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP
12879 : XLOGUE_STUB_SAVE);
12880 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
12881
12882 for (i = 0; i < ncregs; ++i)
12883 {
12884 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
12885 rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode),
12886 r.regno);
12887 RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);
12888 }
12889
12890 gcc_assert (vi == (unsigned)GET_NUM_ELEM (v));
12891
12892 insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v));
12893 RTX_FRAME_RELATED_P (insn) = true;
12894 }
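/* Illustrative note, added for clarity (not part of the original sources):
   the insn emitted above is a single PARALLEL whose first element is a
   (use (symbol_ref ...)) naming the out-of-line save stub and whose
   remaining elements are one frame store per clobbered register, all
   addressed relative to RAX as set up earlier.  */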
12895
12896 /* Expand the prologue into a bunch of separate insns. */
12897
12898 void
12899 ix86_expand_prologue (void)
12900 {
12901 struct machine_function *m = cfun->machine;
12902 rtx insn, t;
12903 struct ix86_frame frame;
12904 HOST_WIDE_INT allocate;
12905 bool int_registers_saved;
12906 bool sse_registers_saved;
12907 bool save_stub_call_needed;
12908 rtx static_chain = NULL_RTX;
12909
12910 if (ix86_function_naked (current_function_decl))
12911 return;
12912
12913 ix86_finalize_stack_frame_flags ();
12914
12915 /* DRAP should not coexist with stack_realign_fp */
12916 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
12917
12918 memset (&m->fs, 0, sizeof (m->fs));
12919
12920 /* Initialize CFA state for before the prologue. */
12921 m->fs.cfa_reg = stack_pointer_rtx;
12922 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
12923
12924 /* Track SP offset to the CFA. We continue tracking this after we've
12925 swapped the CFA register away from SP. In the case of re-alignment
12926 this is fudged; we're interested in offsets within the local frame. */
12927 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
12928 m->fs.sp_valid = true;
12929 m->fs.sp_realigned = false;
12930
12931 frame = m->frame;
12932
12933 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
12934 {
12935 /* We should have already generated an error for any use of
12936 ms_hook on a nested function. */
12937 gcc_checking_assert (!ix86_static_chain_on_stack);
12938
12939 /* Check if profiling is active and we shall use profiling before
12940 prologue variant. If so sorry. */
12941 if (crtl->profile && flag_fentry != 0)
12942 sorry ("ms_hook_prologue attribute isn%'t compatible "
12943 "with -mfentry for 32-bit");
12944
12945 /* In ix86_asm_output_function_label we emitted:
12946 8b ff movl.s %edi,%edi
12947 55 push %ebp
12948 8b ec movl.s %esp,%ebp
12949
12950 This matches the hookable function prologue in Win32 API
12951 functions in Microsoft Windows XP Service Pack 2 and newer.
12952 Wine uses this to enable Windows apps to hook the Win32 API
12953 functions provided by Wine.
12954
12955 What that means is that we've already set up the frame pointer. */
12956
12957 if (frame_pointer_needed
12958 && !(crtl->drap_reg && crtl->stack_realign_needed))
12959 {
12960 rtx push, mov;
12961
12962 /* We've decided to use the frame pointer already set up.
12963 Describe this to the unwinder by pretending that both
12964 push and mov insns happen right here.
12965
12966 Putting the unwind info here at the end of the ms_hook
12967 is done so that we can make absolutely certain we get
12968 the required byte sequence at the start of the function,
12969 rather than relying on an assembler that can produce
12970 the exact encoding required.
12971
12972 However, it does mean (in the unpatched case) that we have
12973 a 1 insn window where the asynchronous unwind info is
12974 incorrect. On the other hand, if we placed the unwind info at
12975 its correct location we would have incorrect unwind info
12976 in the patched case. That is probably all moot since
12977 I don't expect Wine to generate dwarf2 unwind info for the
12978 system libraries that use this feature. */
12979
12980 insn = emit_insn (gen_blockage ());
12981
12982 push = gen_push (hard_frame_pointer_rtx);
12983 mov = gen_rtx_SET (hard_frame_pointer_rtx,
12984 stack_pointer_rtx);
12985 RTX_FRAME_RELATED_P (push) = 1;
12986 RTX_FRAME_RELATED_P (mov) = 1;
12987
12988 RTX_FRAME_RELATED_P (insn) = 1;
12989 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
12990 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
12991
12992 /* Note that gen_push incremented m->fs.cfa_offset, even
12993 though we didn't emit the push insn here. */
12994 m->fs.cfa_reg = hard_frame_pointer_rtx;
12995 m->fs.fp_offset = m->fs.cfa_offset;
12996 m->fs.fp_valid = true;
12997 }
12998 else
12999 {
13000 /* The frame pointer is not needed so pop %ebp again.
13001 This leaves us with a pristine state. */
13002 emit_insn (gen_pop (hard_frame_pointer_rtx));
13003 }
13004 }
13005
13006 /* The first insn of a function that accepts its static chain on the
13007 stack is to push the register that would be filled in by a direct
13008 call. This insn will be skipped by the trampoline. */
13009 else if (ix86_static_chain_on_stack)
13010 {
13011 static_chain = ix86_static_chain (cfun->decl, false);
13012 insn = emit_insn (gen_push (static_chain));
13013 emit_insn (gen_blockage ());
13014
13015 /* We don't want to interpret this push insn as a register save,
13016 only as a stack adjustment. The real copy of the register as
13017 a save will be done later, if needed. */
13018 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
13019 t = gen_rtx_SET (stack_pointer_rtx, t);
13020 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
13021 RTX_FRAME_RELATED_P (insn) = 1;
13022 }
13023
13024 /* Emit prologue code to adjust stack alignment and set up DRAP, in case
13025 DRAP is needed and stack realignment is really needed after reload. */
13026 if (stack_realign_drap)
13027 {
13028 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13029
13030 /* Can't use DRAP in interrupt function. */
13031 if (cfun->machine->func_type != TYPE_NORMAL)
13032 sorry ("Dynamic Realign Argument Pointer (DRAP) not supported "
13033 "in interrupt service routine. This may be worked "
13034 "around by avoiding functions with aggregate return.");
13035
13036 /* Only need to push parameter pointer reg if it is caller saved. */
13037 if (!call_used_regs[REGNO (crtl->drap_reg)])
13038 {
13039 /* Push arg pointer reg */
13040 insn = emit_insn (gen_push (crtl->drap_reg));
13041 RTX_FRAME_RELATED_P (insn) = 1;
13042 }
13043
13044 /* Grab the argument pointer. */
13045 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
13046 insn = emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13047 RTX_FRAME_RELATED_P (insn) = 1;
13048 m->fs.cfa_reg = crtl->drap_reg;
13049 m->fs.cfa_offset = 0;
13050
13051 /* Align the stack. */
13052 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13053 stack_pointer_rtx,
13054 GEN_INT (-align_bytes)));
13055 RTX_FRAME_RELATED_P (insn) = 1;
13056
13057 /* Replicate the return address on the stack so that the return
13058 address can be reached via the (argp - 1) slot. This is needed
13059 to implement macro RETURN_ADDR_RTX and intrinsic function
13060 expand_builtin_return_addr etc. */
13061 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
13062 t = gen_frame_mem (word_mode, t);
13063 insn = emit_insn (gen_push (t));
13064 RTX_FRAME_RELATED_P (insn) = 1;
13065
13066 /* For the purposes of frame and register save area addressing,
13067 we've started over with a new frame. */
13068 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
13069 m->fs.realigned = true;
13070
13071 if (static_chain)
13072 {
13073 /* Replicate static chain on the stack so that static chain
13074 can be reached via (argp - 2) slot. This is needed for
13075 nested function with stack realignment. */
13076 insn = emit_insn (gen_push (static_chain));
13077 RTX_FRAME_RELATED_P (insn) = 1;
13078 }
13079 }
13080
13081 int_registers_saved = (frame.nregs == 0);
13082 sse_registers_saved = (frame.nsseregs == 0);
13083 save_stub_call_needed = (m->call_ms2sysv);
13084 gcc_assert (sse_registers_saved || !save_stub_call_needed);
13085
13086 if (frame_pointer_needed && !m->fs.fp_valid)
13087 {
13088 /* Note: AT&T enter does NOT have reversed args. Enter is probably
13089 slower on all targets. Also sdb didn't like it. */
13090 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
13091 RTX_FRAME_RELATED_P (insn) = 1;
13092
13093 /* Push registers now, before setting the frame pointer
13094 on SEH target. */
13095 if (!int_registers_saved
13096 && TARGET_SEH
13097 && !frame.save_regs_using_mov)
13098 {
13099 ix86_emit_save_regs ();
13100 int_registers_saved = true;
13101 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13102 }
13103
13104 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
13105 {
13106 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
13107 RTX_FRAME_RELATED_P (insn) = 1;
13108
13109 if (m->fs.cfa_reg == stack_pointer_rtx)
13110 m->fs.cfa_reg = hard_frame_pointer_rtx;
13111 m->fs.fp_offset = m->fs.sp_offset;
13112 m->fs.fp_valid = true;
13113 }
13114 }
13115
13116 if (!int_registers_saved)
13117 {
13118 /* If saving registers via PUSH, do so now. */
13119 if (!frame.save_regs_using_mov)
13120 {
13121 ix86_emit_save_regs ();
13122 int_registers_saved = true;
13123 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
13124 }
13125
13126 /* When using the red zone we may start register saving before allocating
13127 the stack frame, saving one cycle of the prologue. However, avoid
13128 doing this if we have to probe the stack; at least on x86_64 the
13129 stack probe can turn into a call that clobbers a red zone location. */
13130 else if (ix86_using_red_zone ()
13131 && (! TARGET_STACK_PROBE
13132 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
13133 {
13134 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13135 int_registers_saved = true;
13136 }
13137 }
13138
13139 if (stack_realign_fp)
13140 {
13141 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
13142 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
13143
13144 /* Record last valid frame pointer offset. */
13145 m->fs.sp_realigned_fp_last = frame.reg_save_offset;
13146
13147 /* The computation of the size of the re-aligned stack frame means
13148 that we must allocate the size of the register save area before
13149 performing the actual alignment. Otherwise we cannot guarantee
13150 that there's enough storage above the realignment point. */
13151 allocate = frame.reg_save_offset - m->fs.sp_offset
13152 + frame.stack_realign_allocate;
13153 if (allocate)
13154 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13155 GEN_INT (-allocate), -1, false);
13156
13157 /* Align the stack. */
13158 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
13159 stack_pointer_rtx,
13160 GEN_INT (-align_bytes)));
13161 m->fs.sp_offset = ROUND_UP (m->fs.sp_offset, align_bytes);
13162 m->fs.sp_realigned_offset = m->fs.sp_offset
13163 - frame.stack_realign_allocate;
13164 /* The stack pointer may no longer be equal to CFA - m->fs.sp_offset.
13165 Beyond this point, stack access should be done via choose_baseaddr or
13166 by using sp_valid_at and fp_valid_at to determine the correct base
13167 register. Henceforth, any CFA offset should be thought of as logical
13168 and not physical. */
13169 gcc_assert (m->fs.sp_realigned_offset >= m->fs.sp_realigned_fp_last);
13170 gcc_assert (m->fs.sp_realigned_offset == frame.stack_realign_offset);
13171 m->fs.sp_realigned = true;
13172
13173 /* SEH unwind emit doesn't currently support REG_CFA_EXPRESSION, which
13174 is needed to describe where a register is saved using a realigned
13175 stack pointer, so we need to invalidate the stack pointer for that
13176 target. */
13177 if (TARGET_SEH)
13178 m->fs.sp_valid = false;
13179
13180 /* If SP offset is non-immediate after allocation of the stack frame,
13181 then emit SSE saves or stub call prior to allocating the rest of the
13182 stack frame. This is less efficient for the out-of-line stub because
13183 we can't combine allocations across the call barrier, but it's better
13184 than using a scratch register. */
13185 else if (!x86_64_immediate_operand (GEN_INT (frame.stack_pointer_offset
13186 - m->fs.sp_realigned_offset),
13187 Pmode))
13188 {
13189 if (!sse_registers_saved)
13190 {
13191 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13192 sse_registers_saved = true;
13193 }
13194 else if (save_stub_call_needed)
13195 {
13196 ix86_emit_outlined_ms2sysv_save (frame);
13197 save_stub_call_needed = false;
13198 }
13199 }
13200 }
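/* Illustrative note, added for clarity (not part of the original sources):
   if crtl->stack_alignment_needed is 256 bits, align_bytes is 32 and the
   realignment insn emitted in the block above is roughly
   "andq $-32, %rsp" (AT&T syntax) on a 64-bit target, i.e. the stack
   pointer is rounded down to the requested boundary only after the
   register save area has been allocated.  */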
13201
13202 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
13203
13204 if (flag_stack_usage_info)
13205 {
13206 /* We start to count from ARG_POINTER. */
13207 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
13208
13209 /* If it was realigned, take into account the fake frame. */
13210 if (stack_realign_drap)
13211 {
13212 if (ix86_static_chain_on_stack)
13213 stack_size += UNITS_PER_WORD;
13214
13215 if (!call_used_regs[REGNO (crtl->drap_reg)])
13216 stack_size += UNITS_PER_WORD;
13217
13218 /* This over-estimates by 1 minimal-stack-alignment-unit but
13219 mitigates that by counting in the new return address slot. */
13220 current_function_dynamic_stack_size
13221 += crtl->stack_alignment_needed / BITS_PER_UNIT;
13222 }
13223
13224 current_function_static_stack_size = stack_size;
13225 }
13226
13227 /* On SEH target with very large frame size, allocate an area to save
13228 SSE registers (as the very large allocation won't be described). */
13229 if (TARGET_SEH
13230 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
13231 && !sse_registers_saved)
13232 {
13233 HOST_WIDE_INT sse_size =
13234 frame.sse_reg_save_offset - frame.reg_save_offset;
13235
13236 gcc_assert (int_registers_saved);
13237
13238 /* No need to do stack checking as the area will be immediately
13239 written. */
13240 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13241 GEN_INT (-sse_size), -1,
13242 m->fs.cfa_reg == stack_pointer_rtx);
13243 allocate -= sse_size;
13244 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13245 sse_registers_saved = true;
13246 }
13247
13248 /* The stack has already been decremented by the instruction calling us
13249 so probe if the size is non-negative to preserve the protection area. */
13250 if (allocate >= 0
13251 && (flag_stack_check == STATIC_BUILTIN_STACK_CHECK
13252 || flag_stack_clash_protection))
13253 {
13254 /* This assert wants to verify that integer registers were saved
13255 prior to probing. This is necessary when probing may be implemented
13256 as a function call (Windows). It is not necessary for stack clash
13257 protection probing. */
13258 if (!flag_stack_clash_protection)
13259 gcc_assert (int_registers_saved);
13260
13261 if (flag_stack_clash_protection)
13262 {
13263 ix86_adjust_stack_and_probe_stack_clash (allocate);
13264 allocate = 0;
13265 }
13266 else if (STACK_CHECK_MOVING_SP)
13267 {
13268 if (!(crtl->is_leaf && !cfun->calls_alloca
13269 && allocate <= get_probe_interval ()))
13270 {
13271 ix86_adjust_stack_and_probe (allocate);
13272 allocate = 0;
13273 }
13274 }
13275 else
13276 {
13277 HOST_WIDE_INT size = allocate;
13278
13279 if (TARGET_64BIT && size >= HOST_WIDE_INT_C (0x80000000))
13280 size = 0x80000000 - get_stack_check_protect () - 1;
13281
13282 if (TARGET_STACK_PROBE)
13283 {
13284 if (crtl->is_leaf && !cfun->calls_alloca)
13285 {
13286 if (size > get_probe_interval ())
13287 ix86_emit_probe_stack_range (0, size);
13288 }
13289 else
13290 ix86_emit_probe_stack_range (0,
13291 size + get_stack_check_protect ());
13292 }
13293 else
13294 {
13295 if (crtl->is_leaf && !cfun->calls_alloca)
13296 {
13297 if (size > get_probe_interval ()
13298 && size > get_stack_check_protect ())
13299 ix86_emit_probe_stack_range (get_stack_check_protect (),
13300 size - get_stack_check_protect ());
13301 }
13302 else
13303 ix86_emit_probe_stack_range (get_stack_check_protect (), size);
13304 }
13305 }
13306 }
13307
13308 if (allocate == 0)
13309 ;
13310 else if (!ix86_target_stack_probe ()
13311 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
13312 {
13313 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13314 GEN_INT (-allocate), -1,
13315 m->fs.cfa_reg == stack_pointer_rtx);
13316 }
13317 else
13318 {
13319 rtx eax = gen_rtx_REG (Pmode, AX_REG);
13320 rtx r10 = NULL;
13321 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
13322 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
13323 bool eax_live = ix86_eax_live_at_start_p ();
13324 bool r10_live = false;
13325
13326 if (TARGET_64BIT)
13327 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
13328
13329 if (eax_live)
13330 {
13331 insn = emit_insn (gen_push (eax));
13332 allocate -= UNITS_PER_WORD;
13333 /* Note that SEH directives need to continue tracking the stack
13334 pointer even after the frame pointer has been set up. */
13335 if (sp_is_cfa_reg || TARGET_SEH)
13336 {
13337 if (sp_is_cfa_reg)
13338 m->fs.cfa_offset += UNITS_PER_WORD;
13339 RTX_FRAME_RELATED_P (insn) = 1;
13340 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13341 gen_rtx_SET (stack_pointer_rtx,
13342 plus_constant (Pmode, stack_pointer_rtx,
13343 -UNITS_PER_WORD)));
13344 }
13345 }
13346
13347 if (r10_live)
13348 {
13349 r10 = gen_rtx_REG (Pmode, R10_REG);
13350 insn = emit_insn (gen_push (r10));
13351 allocate -= UNITS_PER_WORD;
13352 if (sp_is_cfa_reg || TARGET_SEH)
13353 {
13354 if (sp_is_cfa_reg)
13355 m->fs.cfa_offset += UNITS_PER_WORD;
13356 RTX_FRAME_RELATED_P (insn) = 1;
13357 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13358 gen_rtx_SET (stack_pointer_rtx,
13359 plus_constant (Pmode, stack_pointer_rtx,
13360 -UNITS_PER_WORD)));
13361 }
13362 }
13363
13364 emit_move_insn (eax, GEN_INT (allocate));
13365 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
13366
13367 /* Use the fact that AX still contains ALLOCATE. */
13368 adjust_stack_insn = (Pmode == DImode
13369 ? gen_pro_epilogue_adjust_stack_di_sub
13370 : gen_pro_epilogue_adjust_stack_si_sub);
13371
13372 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
13373 stack_pointer_rtx, eax));
13374
13375 if (sp_is_cfa_reg || TARGET_SEH)
13376 {
13377 if (sp_is_cfa_reg)
13378 m->fs.cfa_offset += allocate;
13379 RTX_FRAME_RELATED_P (insn) = 1;
13380 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
13381 gen_rtx_SET (stack_pointer_rtx,
13382 plus_constant (Pmode, stack_pointer_rtx,
13383 -allocate)));
13384 }
13385 m->fs.sp_offset += allocate;
13386
13387 /* Use stack_pointer_rtx for relative addressing so that code
13388 works for realigned stack, too. */
13389 if (r10_live && eax_live)
13390 {
13391 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13392 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
13393 gen_frame_mem (word_mode, t));
13394 t = plus_constant (Pmode, t, UNITS_PER_WORD);
13395 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
13396 gen_frame_mem (word_mode, t));
13397 }
13398 else if (eax_live || r10_live)
13399 {
13400 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
13401 emit_move_insn (gen_rtx_REG (word_mode,
13402 (eax_live ? AX_REG : R10_REG)),
13403 gen_frame_mem (word_mode, t));
13404 }
13405 }
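/* Illustrative note, added for clarity (not part of the original sources):
   after the allocation worker returns, AX still holds ALLOCATE (see the
   comment above), so any EAX/R10 values pushed beforehand sit just above
   the newly allocated area and can be reloaded at sp + AX (plus one word
   for the second slot) without needing another scratch register.  */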
13406 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
13407
13408 /* If we haven't already set up the frame pointer, do so now. */
13409 if (frame_pointer_needed && !m->fs.fp_valid)
13410 {
13411 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
13412 GEN_INT (frame.stack_pointer_offset
13413 - frame.hard_frame_pointer_offset));
13414 insn = emit_insn (insn);
13415 RTX_FRAME_RELATED_P (insn) = 1;
13416 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
13417
13418 if (m->fs.cfa_reg == stack_pointer_rtx)
13419 m->fs.cfa_reg = hard_frame_pointer_rtx;
13420 m->fs.fp_offset = frame.hard_frame_pointer_offset;
13421 m->fs.fp_valid = true;
13422 }
13423
13424 if (!int_registers_saved)
13425 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
13426 if (!sse_registers_saved)
13427 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
13428 else if (save_stub_call_needed)
13429 ix86_emit_outlined_ms2sysv_save (frame);
13430
13431 /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT
13432 in PROLOGUE. */
13433 if (!TARGET_64BIT && pic_offset_table_rtx && crtl->profile && !flag_fentry)
13434 {
13435 rtx pic = gen_rtx_REG (Pmode, REAL_PIC_OFFSET_TABLE_REGNUM);
13436 insn = emit_insn (gen_set_got (pic));
13437 RTX_FRAME_RELATED_P (insn) = 1;
13438 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
13439 emit_insn (gen_prologue_use (pic));
13440 /* Delete an already emitted SET_GOT if it exists and is allocated to
13441 REAL_PIC_OFFSET_TABLE_REGNUM. */
13442 ix86_elim_entry_set_got (pic);
13443 }
13444
13445 if (crtl->drap_reg && !crtl->stack_realign_needed)
13446 {
13447 /* vDRAP is set up, but after reload it turns out stack realignment
13448 isn't necessary; here we emit prologue code to set up DRAP
13449 without the stack realignment adjustment. */
13450 t = choose_baseaddr (0, NULL);
13451 emit_insn (gen_rtx_SET (crtl->drap_reg, t));
13452 }
13453
13454 /* Prevent instructions from being scheduled into register save push
13455 sequence when access to the redzone area is done through frame pointer.
13456 The offset between the frame pointer and the stack pointer is calculated
13457 relative to the value of the stack pointer at the end of the function
13458 prologue, and moving instructions that access redzone area via frame
13459 pointer inside push sequence violates this assumption. */
13460 if (frame_pointer_needed && frame.red_zone_size)
13461 emit_insn (gen_memory_blockage ());
13462
13463 /* SEH requires that the prologue end within 256 bytes of the start of
13464 the function. Prevent instruction schedules that would extend that.
13465 Further, prevent alloca modifications to the stack pointer from being
13466 combined with prologue modifications. */
13467 if (TARGET_SEH)
13468 emit_insn (gen_prologue_use (stack_pointer_rtx));
13469 }
13470
13471 /* Emit code to restore REG using a POP insn. */
13472
13473 static void
13474 ix86_emit_restore_reg_using_pop (rtx reg)
13475 {
13476 struct machine_function *m = cfun->machine;
13477 rtx_insn *insn = emit_insn (gen_pop (reg));
13478
13479 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
13480 m->fs.sp_offset -= UNITS_PER_WORD;
13481
13482 if (m->fs.cfa_reg == crtl->drap_reg
13483 && REGNO (reg) == REGNO (crtl->drap_reg))
13484 {
13485 /* Previously we'd represented the CFA as an expression
13486 like *(%ebp - 8). We've just popped that value from
13487 the stack, which means we need to reset the CFA to
13488 the drap register. This will remain until we restore
13489 the stack pointer. */
13490 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13491 RTX_FRAME_RELATED_P (insn) = 1;
13492
13493 /* This means that the DRAP register is valid for addressing too. */
13494 m->fs.drap_valid = true;
13495 return;
13496 }
13497
13498 if (m->fs.cfa_reg == stack_pointer_rtx)
13499 {
13500 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
13501 x = gen_rtx_SET (stack_pointer_rtx, x);
13502 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
13503 RTX_FRAME_RELATED_P (insn) = 1;
13504
13505 m->fs.cfa_offset -= UNITS_PER_WORD;
13506 }
13507
13508 /* When the frame pointer is the CFA, and we pop it, we are
13509 swapping back to the stack pointer as the CFA. This happens
13510 for stack frames that don't allocate other data, so we assume
13511 the stack pointer is now pointing at the return address, i.e.
13512 the function entry state, which makes the offset be 1 word. */
13513 if (reg == hard_frame_pointer_rtx)
13514 {
13515 m->fs.fp_valid = false;
13516 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13517 {
13518 m->fs.cfa_reg = stack_pointer_rtx;
13519 m->fs.cfa_offset -= UNITS_PER_WORD;
13520
13521 add_reg_note (insn, REG_CFA_DEF_CFA,
13522 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
13523 GEN_INT (m->fs.cfa_offset)));
13524 RTX_FRAME_RELATED_P (insn) = 1;
13525 }
13526 }
13527 }
13528
13529 /* Emit code to restore saved registers using POP insns. */
13530
13531 static void
13532 ix86_emit_restore_regs_using_pop (void)
13533 {
13534 unsigned int regno;
13535
13536 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13537 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true))
13538 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
13539 }
13540
13541 /* Emit code and notes for the LEAVE instruction. If insn is non-null,
13542 omit the emit and only attach the notes. */
13543
13544 static void
13545 ix86_emit_leave (rtx_insn *insn)
13546 {
13547 struct machine_function *m = cfun->machine;
13548 if (!insn)
13549 insn = emit_insn (ix86_gen_leave ());
13550
13551 ix86_add_queued_cfa_restore_notes (insn);
13552
13553 gcc_assert (m->fs.fp_valid);
13554 m->fs.sp_valid = true;
13555 m->fs.sp_realigned = false;
13556 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
13557 m->fs.fp_valid = false;
13558
13559 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
13560 {
13561 m->fs.cfa_reg = stack_pointer_rtx;
13562 m->fs.cfa_offset = m->fs.sp_offset;
13563
13564 add_reg_note (insn, REG_CFA_DEF_CFA,
13565 plus_constant (Pmode, stack_pointer_rtx,
13566 m->fs.sp_offset));
13567 RTX_FRAME_RELATED_P (insn) = 1;
13568 }
13569 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
13570 m->fs.fp_offset);
13571 }
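/* Illustrative note, added for clarity (not part of the original sources):
   leave is equivalent to "movq %rbp, %rsp" followed by "popq %rbp" in
   64-bit mode, so afterwards the stack pointer sits one word above the
   slot where the frame pointer was saved; that is why sp_offset becomes
   fp_offset - UNITS_PER_WORD above and fp_valid is cleared.  */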
13572
13573 /* Emit code to restore saved registers using MOV insns.
13574 First register is restored from CFA - CFA_OFFSET. */
13575 static void
13576 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
13577 bool maybe_eh_return)
13578 {
13579 struct machine_function *m = cfun->machine;
13580 unsigned int regno;
13581
13582 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13583 if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13584 {
13585 rtx reg = gen_rtx_REG (word_mode, regno);
13586 rtx mem;
13587 rtx_insn *insn;
13588
13589 mem = choose_baseaddr (cfa_offset, NULL);
13590 mem = gen_frame_mem (word_mode, mem);
13591 insn = emit_move_insn (reg, mem);
13592
13593 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
13594 {
13595 /* Previously we'd represented the CFA as an expression
13596 like *(%ebp - 8). We've just popped that value from
13597 the stack, which means we need to reset the CFA to
13598 the drap register. This will remain until we restore
13599 the stack pointer. */
13600 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
13601 RTX_FRAME_RELATED_P (insn) = 1;
13602
13603 /* This means that the DRAP register is valid for addressing. */
13604 m->fs.drap_valid = true;
13605 }
13606 else
13607 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13608
13609 cfa_offset -= UNITS_PER_WORD;
13610 }
13611 }
13612
13613 /* Emit code to restore saved registers using MOV insns.
13614 First register is restored from CFA - CFA_OFFSET. */
13615 static void
13616 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
13617 bool maybe_eh_return)
13618 {
13619 unsigned int regno;
13620
13621 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
13622 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return, true))
13623 {
13624 rtx reg = gen_rtx_REG (V4SFmode, regno);
13625 rtx mem;
13626 unsigned int align = GET_MODE_ALIGNMENT (V4SFmode);
13627
13628 mem = choose_baseaddr (cfa_offset, &align);
13629 mem = gen_rtx_MEM (V4SFmode, mem);
13630
13631 /* The location alignment depends upon the base register. */
13632 align = MIN (GET_MODE_ALIGNMENT (V4SFmode), align);
13633 gcc_assert (! (cfa_offset & (align / BITS_PER_UNIT - 1)));
13634 set_mem_align (mem, align);
13635 emit_insn (gen_rtx_SET (reg, mem));
13636
13637 ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
13638
13639 cfa_offset -= GET_MODE_SIZE (V4SFmode);
13640 }
13641 }
13642
13643 static void
13644 ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame,
13645 bool use_call, int style)
13646 {
13647 struct machine_function *m = cfun->machine;
13648 const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS
13649 + m->call_ms2sysv_extra_regs;
13650 rtvec v;
13651 unsigned int elems_needed, align, i, vi = 0;
13652 rtx_insn *insn;
13653 rtx sym, tmp;
13654 rtx rsi = gen_rtx_REG (word_mode, SI_REG);
13655 rtx r10 = NULL_RTX;
13656 const struct xlogue_layout &xlogue = xlogue_layout::get_instance ();
13657 HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset ();
13658 HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset;
13659 rtx rsi_frame_load = NULL_RTX;
13660 HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1;
13661 enum xlogue_stub stub;
13662
13663 gcc_assert (!m->fs.fp_valid || frame_pointer_needed);
13664
13665 /* If using a realigned stack, we should never start with padding. */
13666 gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ());
13667
13668 /* Setup RSI as the stub's base pointer. */
13669 align = GET_MODE_ALIGNMENT (V4SFmode);
13670 tmp = choose_baseaddr (rsi_offset, &align, SI_REG);
13671 gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode));
13672
13673 emit_insn (gen_rtx_SET (rsi, tmp));
13674
13675 /* Get a symbol for the stub. */
13676 if (frame_pointer_needed)
13677 stub = use_call ? XLOGUE_STUB_RESTORE_HFP
13678 : XLOGUE_STUB_RESTORE_HFP_TAIL;
13679 else
13680 stub = use_call ? XLOGUE_STUB_RESTORE
13681 : XLOGUE_STUB_RESTORE_TAIL;
13682 sym = xlogue.get_stub_rtx (stub);
13683
13684 elems_needed = ncregs;
13685 if (use_call)
13686 elems_needed += 1;
13687 else
13688 elems_needed += frame_pointer_needed ? 5 : 3;
13689 v = rtvec_alloc (elems_needed);
13690
13691 /* We call the epilogue stub when we need to pop incoming args or when a
13692 sibling call will be the tail call. Otherwise, we emit a jmp to the
13693 epilogue stub and the stub itself is the tail call. */
13694 if (use_call)
13695 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13696 else
13697 {
13698 RTVEC_ELT (v, vi++) = ret_rtx;
13699 RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym);
13700 if (frame_pointer_needed)
13701 {
13702 rtx rbp = gen_rtx_REG (DImode, BP_REG);
13703 gcc_assert (m->fs.fp_valid);
13704 gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx);
13705
13706 tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8));
13707 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp);
13708 RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp));
13709 tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
13710 RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp);
13711 }
13712 else
13713 {
13714 /* If no hard frame pointer, we set R10 to the SP restore value. */
13715 gcc_assert (!m->fs.fp_valid);
13716 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
13717 gcc_assert (m->fs.sp_valid);
13718
13719 r10 = gen_rtx_REG (DImode, R10_REG);
13720 tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset));
13721 emit_insn (gen_rtx_SET (r10, tmp));
13722
13723 RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10);
13724 }
13725 }
13726
13727 /* Generate frame load insns and restore notes. */
13728 for (i = 0; i < ncregs; ++i)
13729 {
13730 const xlogue_layout::reginfo &r = xlogue.get_reginfo (i);
13731 machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode;
13732 rtx reg, frame_load;
13733
13734 reg = gen_rtx_REG (mode, r.regno);
13735 frame_load = gen_frame_load (reg, rsi, r.offset);
13736
13737 /* Save RSI frame load insn & note to add last. */
13738 if (r.regno == SI_REG)
13739 {
13740 gcc_assert (!rsi_frame_load);
13741 rsi_frame_load = frame_load;
13742 rsi_restore_offset = r.offset;
13743 }
13744 else
13745 {
13746 RTVEC_ELT (v, vi++) = frame_load;
13747 ix86_add_cfa_restore_note (NULL, reg, r.offset);
13748 }
13749 }
13750
13751 /* Add RSI frame load & restore note at the end. */
13752 gcc_assert (rsi_frame_load);
13753 gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1);
13754 RTVEC_ELT (v, vi++) = rsi_frame_load;
13755 ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG),
13756 rsi_restore_offset);
13757
13758 /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */
13759 if (!use_call && !frame_pointer_needed)
13760 {
13761 gcc_assert (m->fs.sp_valid);
13762 gcc_assert (!m->fs.sp_realigned);
13763
13764 /* At this point, R10 should point to frame.stack_realign_offset. */
13765 if (m->fs.cfa_reg == stack_pointer_rtx)
13766 m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset;
13767 m->fs.sp_offset = frame.stack_realign_offset;
13768 }
13769
13770 gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v));
13771 tmp = gen_rtx_PARALLEL (VOIDmode, v);
13772 if (use_call)
13773 insn = emit_insn (tmp);
13774 else
13775 {
13776 insn = emit_jump_insn (tmp);
13777 JUMP_LABEL (insn) = ret_rtx;
13778
13779 if (frame_pointer_needed)
13780 ix86_emit_leave (insn);
13781 else
13782 {
13783 /* Need CFA adjust note. */
13784 tmp = gen_rtx_SET (stack_pointer_rtx, r10);
13785 add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp);
13786 }
13787 }
13788
13789 RTX_FRAME_RELATED_P (insn) = true;
13790 ix86_add_queued_cfa_restore_notes (insn);
13791
13792 /* If we're not doing a tail-call, we need to adjust the stack. */
13793 if (use_call && m->fs.sp_valid)
13794 {
13795 HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset;
13796 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13797 GEN_INT (dealloc), style,
13798 m->fs.cfa_reg == stack_pointer_rtx);
13799 }
13800 }
13801
13802 /* Restore function stack, frame, and registers. */
13803
13804 void
13805 ix86_expand_epilogue (int style)
13806 {
13807 struct machine_function *m = cfun->machine;
13808 struct machine_frame_state frame_state_save = m->fs;
13809 struct ix86_frame frame;
13810 bool restore_regs_via_mov;
13811 bool using_drap;
13812 bool restore_stub_is_tail = false;
13813
13814 if (ix86_function_naked (current_function_decl))
13815 {
13816 /* The program should not reach this point. */
13817 emit_insn (gen_ud2 ());
13818 return;
13819 }
13820
13821 ix86_finalize_stack_frame_flags ();
13822 frame = m->frame;
13823
13824 m->fs.sp_realigned = stack_realign_fp;
13825 m->fs.sp_valid = stack_realign_fp
13826 || !frame_pointer_needed
13827 || crtl->sp_is_unchanging;
13828 gcc_assert (!m->fs.sp_valid
13829 || m->fs.sp_offset == frame.stack_pointer_offset);
13830
13831 /* The FP must be valid if the frame pointer is present. */
13832 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
13833 gcc_assert (!m->fs.fp_valid
13834 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
13835
13836 /* We must have *some* valid pointer to the stack frame. */
13837 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
13838
13839 /* The DRAP is never valid at this point. */
13840 gcc_assert (!m->fs.drap_valid);
13841
13842 /* See the comment about red zone and frame
13843 pointer usage in ix86_expand_prologue. */
13844 if (frame_pointer_needed && frame.red_zone_size)
13845 emit_insn (gen_memory_blockage ());
13846
13847 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
13848 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
13849
13850 /* Determine the CFA offset of the end of the red-zone. */
13851 m->fs.red_zone_offset = 0;
13852 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
13853 {
13854 /* The red-zone begins below the return address (and below the
13855 error code in an exception handler). */
13856 m->fs.red_zone_offset = RED_ZONE_SIZE + INCOMING_FRAME_SP_OFFSET;
13857
13858 /* When the register save area is in the aligned portion of
13859 the stack, determine the maximum runtime displacement that
13860 matches up with the aligned frame. */
13861 if (stack_realign_drap)
13862 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
13863 + UNITS_PER_WORD);
13864 }
13865
13866 /* Special care must be taken for the normal return case of a function
13867 using eh_return: the eax and edx registers are marked as saved, but
13868 not restored along this path. Adjust the save location to match. */
13869 if (crtl->calls_eh_return && style != 2)
13870 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
13871
13872 /* EH_RETURN requires the use of moves to function properly. */
13873 if (crtl->calls_eh_return)
13874 restore_regs_via_mov = true;
13875 /* SEH requires the use of pops to identify the epilogue. */
13876 else if (TARGET_SEH)
13877 restore_regs_via_mov = false;
13878 /* If we're only restoring one register and sp cannot be used, then
13879 use a move instruction to restore the register, since it's
13880 less work than reloading sp and popping the register. */
13881 else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
13882 restore_regs_via_mov = true;
13883 else if (TARGET_EPILOGUE_USING_MOVE
13884 && cfun->machine->use_fast_prologue_epilogue
13885 && (frame.nregs > 1
13886 || m->fs.sp_offset != frame.reg_save_offset))
13887 restore_regs_via_mov = true;
13888 else if (frame_pointer_needed
13889 && !frame.nregs
13890 && m->fs.sp_offset != frame.reg_save_offset)
13891 restore_regs_via_mov = true;
13892 else if (frame_pointer_needed
13893 && TARGET_USE_LEAVE
13894 && cfun->machine->use_fast_prologue_epilogue
13895 && frame.nregs == 1)
13896 restore_regs_via_mov = true;
13897 else
13898 restore_regs_via_mov = false;
13899
13900 if (restore_regs_via_mov || frame.nsseregs)
13901 {
13902 /* Ensure that the entire register save area is addressable via
13903 the stack pointer, if we will restore SSE regs via sp. */
13904 if (TARGET_64BIT
13905 && m->fs.sp_offset > 0x7fffffff
13906 && sp_valid_at (frame.stack_realign_offset + 1)
13907 && (frame.nsseregs + frame.nregs) != 0)
13908 {
13909 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
13910 GEN_INT (m->fs.sp_offset
13911 - frame.sse_reg_save_offset),
13912 style,
13913 m->fs.cfa_reg == stack_pointer_rtx);
13914 }
13915 }
13916
13917 /* If there are any SSE registers to restore, then we have to do it
13918 via moves, since there's obviously no pop for SSE regs. */
13919 if (frame.nsseregs)
13920 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
13921 style == 2);
13922
13923 if (m->call_ms2sysv)
13924 {
13925 int pop_incoming_args = crtl->args.pops_args && crtl->args.size;
13926
13927 /* We cannot use a tail-call for the stub if:
13928 1. We have to pop incoming args,
13929 2. We have additional int regs to restore, or
13930 3. A sibling call will be the tail-call, or
13931 4. We are emitting an eh_return_internal epilogue.
13932
13933 TODO: Item 4 has not yet been tested!
13934
13935 If any of the above are true, we will call the stub rather than
13936 jump to it. */
13937 restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1);
13938 ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style);
13939 }
13940
13941 /* If using an out-of-line stub that is a tail call, then... */
13942 if (m->call_ms2sysv && restore_stub_is_tail)
13943 {
13944 /* TODO: paranoid tests. (remove eventually) */
13945 gcc_assert (m->fs.sp_valid);
13946 gcc_assert (!m->fs.sp_realigned);
13947 gcc_assert (!m->fs.fp_valid);
13948 gcc_assert (!m->fs.realigned);
13949 gcc_assert (m->fs.sp_offset == UNITS_PER_WORD);
13950 gcc_assert (!crtl->drap_reg);
13951 gcc_assert (!frame.nregs);
13952 }
13953 else if (restore_regs_via_mov)
13954 {
13955 rtx t;
13956
13957 if (frame.nregs)
13958 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
13959
13960 /* eh_return epilogues need %ecx added to the stack pointer. */
13961 if (style == 2)
13962 {
13963 rtx sa = EH_RETURN_STACKADJ_RTX;
13964 rtx_insn *insn;
13965
13966 /* %ecx can't be used for both DRAP register and eh_return. */
13967 if (crtl->drap_reg)
13968 gcc_assert (REGNO (crtl->drap_reg) != CX_REG);
13969
13970 /* regparm nested functions don't work with eh_return. */
13971 gcc_assert (!ix86_static_chain_on_stack);
13972
13973 if (frame_pointer_needed)
13974 {
13975 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
13976 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
13977 emit_insn (gen_rtx_SET (sa, t));
13978
13979 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
13980 insn = emit_move_insn (hard_frame_pointer_rtx, t);
13981
13982 /* Note that we use SA as a temporary CFA, as the return
13983 address is at the proper place relative to it. We
13984 pretend this happens at the FP restore insn because
13985 prior to this insn the FP would be stored at the wrong
13986 offset relative to SA, and after this insn we have no
13987 other reasonable register to use for the CFA. We don't
13988 bother resetting the CFA to the SP for the duration of
13989 the return insn, unless the control flow instrumentation
13990 is done. In this case the SP is used later and we have
13991 to reset CFA to SP. */
13992 add_reg_note (insn, REG_CFA_DEF_CFA,
13993 plus_constant (Pmode, sa, UNITS_PER_WORD));
13994 ix86_add_queued_cfa_restore_notes (insn);
13995 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
13996 RTX_FRAME_RELATED_P (insn) = 1;
13997
13998 m->fs.cfa_reg = sa;
13999 m->fs.cfa_offset = UNITS_PER_WORD;
14000 m->fs.fp_valid = false;
14001
14002 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
14003 const0_rtx, style,
14004 flag_cf_protection);
14005 }
14006 else
14007 {
14008 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
14009 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
14010 insn = emit_insn (gen_rtx_SET (stack_pointer_rtx, t));
14011 ix86_add_queued_cfa_restore_notes (insn);
14012
14013 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
14014 if (m->fs.cfa_offset != UNITS_PER_WORD)
14015 {
14016 m->fs.cfa_offset = UNITS_PER_WORD;
14017 add_reg_note (insn, REG_CFA_DEF_CFA,
14018 plus_constant (Pmode, stack_pointer_rtx,
14019 UNITS_PER_WORD));
14020 RTX_FRAME_RELATED_P (insn) = 1;
14021 }
14022 }
14023 m->fs.sp_offset = UNITS_PER_WORD;
14024 m->fs.sp_valid = true;
14025 m->fs.sp_realigned = false;
14026 }
14027 }
14028 else
14029 {
14030 /* SEH requires that the function end with (1) a stack adjustment
14031 if necessary, (2) a sequence of pops, and (3) a return or
14032 jump instruction. Prevent insns from the function body from
14033 being scheduled into this sequence. */
14034 if (TARGET_SEH)
14035 {
14036 /* Prevent a catch region from being adjacent to the standard
14037 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
14038 several other flags that would be interesting to test are
14039 set up yet. */
14040 if (flag_non_call_exceptions)
14041 emit_insn (gen_nops (const1_rtx));
14042 else
14043 emit_insn (gen_blockage ());
14044 }
14045
14046 /* First step is to deallocate the stack frame so that we can
14047 pop the registers. If the stack pointer was realigned, it needs
14048 to be restored now. Also do it on SEH target for very large
14049 frame as the emitted instructions aren't allowed by the ABI
14050 in epilogues. */
14051 if (!m->fs.sp_valid || m->fs.sp_realigned
14052 || (TARGET_SEH
14053 && (m->fs.sp_offset - frame.reg_save_offset
14054 >= SEH_MAX_FRAME_SIZE)))
14055 {
14056 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
14057 GEN_INT (m->fs.fp_offset
14058 - frame.reg_save_offset),
14059 style, false);
14060 }
14061 else if (m->fs.sp_offset != frame.reg_save_offset)
14062 {
14063 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14064 GEN_INT (m->fs.sp_offset
14065 - frame.reg_save_offset),
14066 style,
14067 m->fs.cfa_reg == stack_pointer_rtx);
14068 }
14069
14070 ix86_emit_restore_regs_using_pop ();
14071 }
14072
14073 /* If we used a frame pointer and haven't already got rid of it,
14074 then do so now. */
14075 if (m->fs.fp_valid)
14076 {
14077 /* If the stack pointer is valid and pointing at the frame
14078 pointer store address, then we only need a pop. */
14079 if (sp_valid_at (frame.hfp_save_offset)
14080 && m->fs.sp_offset == frame.hfp_save_offset)
14081 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14082 /* Leave results in shorter dependency chains on CPUs that are
14083 able to grok it fast. */
14084 else if (TARGET_USE_LEAVE
14085 || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun))
14086 || !cfun->machine->use_fast_prologue_epilogue)
14087 ix86_emit_leave (NULL);
14088 else
14089 {
14090 pro_epilogue_adjust_stack (stack_pointer_rtx,
14091 hard_frame_pointer_rtx,
14092 const0_rtx, style, !using_drap);
14093 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
14094 }
14095 }
14096
14097 if (using_drap)
14098 {
14099 int param_ptr_offset = UNITS_PER_WORD;
14100 rtx_insn *insn;
14101
14102 gcc_assert (stack_realign_drap);
14103
14104 if (ix86_static_chain_on_stack)
14105 param_ptr_offset += UNITS_PER_WORD;
14106 if (!call_used_regs[REGNO (crtl->drap_reg)])
14107 param_ptr_offset += UNITS_PER_WORD;
14108
14109 insn = emit_insn (gen_rtx_SET
14110 (stack_pointer_rtx,
14111 gen_rtx_PLUS (Pmode,
14112 crtl->drap_reg,
14113 GEN_INT (-param_ptr_offset))));
14114 m->fs.cfa_reg = stack_pointer_rtx;
14115 m->fs.cfa_offset = param_ptr_offset;
14116 m->fs.sp_offset = param_ptr_offset;
14117 m->fs.realigned = false;
14118
14119 add_reg_note (insn, REG_CFA_DEF_CFA,
14120 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14121 GEN_INT (param_ptr_offset)));
14122 RTX_FRAME_RELATED_P (insn) = 1;
14123
14124 if (!call_used_regs[REGNO (crtl->drap_reg)])
14125 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
14126 }
14127
14128 /* At this point the stack pointer must be valid, and we must have
14129 restored all of the registers. We may not have deallocated the
14130 entire stack frame. We've delayed this until now because it may
14131 be possible to merge the local stack deallocation with the
14132 deallocation forced by ix86_static_chain_on_stack. */
14133 gcc_assert (m->fs.sp_valid);
14134 gcc_assert (!m->fs.sp_realigned);
14135 gcc_assert (!m->fs.fp_valid);
14136 gcc_assert (!m->fs.realigned);
14137 if (m->fs.sp_offset != UNITS_PER_WORD)
14138 {
14139 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14140 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
14141 style, true);
14142 }
14143 else
14144 ix86_add_queued_cfa_restore_notes (get_last_insn ());
14145
14146 /* Sibcall epilogues don't want a return instruction. */
14147 if (style == 0)
14148 {
14149 m->fs = frame_state_save;
14150 return;
14151 }
14152
14153 if (cfun->machine->func_type != TYPE_NORMAL)
14154 emit_jump_insn (gen_interrupt_return ());
14155 else if (crtl->args.pops_args && crtl->args.size)
14156 {
14157 rtx popc = GEN_INT (crtl->args.pops_args);
14158
14159 /* i386 can only pop 64K bytes. If asked to pop more, pop return
14160 address, do explicit add, and jump indirectly to the caller. */
14161
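/* A sketch of the emitted sequence, assuming N == crtl->args.pops_args
   (illustrative, derived from the expanders used below):
   popl %ecx ; addl $N, %esp ; jmp *%ecx  */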
14162 if (crtl->args.pops_args >= 65536)
14163 {
14164 rtx ecx = gen_rtx_REG (SImode, CX_REG);
14165 rtx_insn *insn;
14166
14167 /* There is no "pascal" calling convention in any 64bit ABI. */
14168 gcc_assert (!TARGET_64BIT);
14169
14170 insn = emit_insn (gen_pop (ecx));
14171 m->fs.cfa_offset -= UNITS_PER_WORD;
14172 m->fs.sp_offset -= UNITS_PER_WORD;
14173
14174 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14175 x = gen_rtx_SET (stack_pointer_rtx, x);
14176 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14177 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14178 RTX_FRAME_RELATED_P (insn) = 1;
14179
14180 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
14181 popc, -1, true);
14182 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14183 }
14184 else
14185 emit_jump_insn (gen_simple_return_pop_internal (popc));
14186 }
14187 else if (!m->call_ms2sysv || !restore_stub_is_tail)
14188 {
14189 /* In the case of a return from EH a simple return cannot be used,
14190 as the return address will be compared with a shadow stack
14191 return address.  Use an indirect jump instead.  */
14192 if (style == 2 && flag_cf_protection)
14193 {
14194 /* Register used in indirect jump must be in word_mode. But
14195 Pmode may not be the same as word_mode for x32. */
14196 rtx ecx = gen_rtx_REG (word_mode, CX_REG);
14197 rtx_insn *insn;
14198
14199 insn = emit_insn (gen_pop (ecx));
14200 m->fs.cfa_offset -= UNITS_PER_WORD;
14201 m->fs.sp_offset -= UNITS_PER_WORD;
14202
14203 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
14204 x = gen_rtx_SET (stack_pointer_rtx, x);
14205 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
14206 add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
14207 RTX_FRAME_RELATED_P (insn) = 1;
14208
14209 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
14210 }
14211 else
14212 emit_jump_insn (gen_simple_return_internal ());
14213 }
14214
14215 /* Restore the state back to the state from the prologue,
14216 so that it's correct for the next epilogue. */
14217 m->fs = frame_state_save;
14218 }
14219
14220 /* Reset state from the function's potential modifications.  */
14221
14222 static void
14223 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED)
14224 {
14225 if (pic_offset_table_rtx
14226 && !ix86_use_pseudo_pic_reg ())
14227 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
14228
14229 if (TARGET_MACHO)
14230 {
14231 rtx_insn *insn = get_last_insn ();
14232 rtx_insn *deleted_debug_label = NULL;
14233
14234 /* Mach-O doesn't support labels at the end of objects, so if
14235 it looks like we might want one, take special action.
14236 First, collect any sequence of deleted debug labels. */
14237 while (insn
14238 && NOTE_P (insn)
14239 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
14240 {
14241 /* For NOTE_INSN_DELETED_DEBUG_LABEL notes only, don't insert
14242 a nop; instead set their CODE_LABEL_NUMBER to -1, as
14243 otherwise there would be code generation differences
14244 between -g and -g0.  */
14245 if (NOTE_P (insn) && NOTE_KIND (insn)
14246 == NOTE_INSN_DELETED_DEBUG_LABEL)
14247 deleted_debug_label = insn;
14248 insn = PREV_INSN (insn);
14249 }
14250
14251 /* If we have:
14252 label:
14253 barrier
14254 then this needs to be detected, so skip past the barrier. */
14255
14256 if (insn && BARRIER_P (insn))
14257 insn = PREV_INSN (insn);
14258
14259 /* Up to now we've only seen notes or barriers. */
14260 if (insn)
14261 {
14262 if (LABEL_P (insn)
14263 || (NOTE_P (insn)
14264 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL))
14265 /* Trailing label. */
14266 fputs ("\tnop\n", file);
14267 else if (cfun && ! cfun->is_thunk)
14268 {
14269 /* See if we have a completely empty function body, skipping
14270 the special case of the picbase thunk emitted as asm. */
14271 while (insn && ! INSN_P (insn))
14272 insn = PREV_INSN (insn);
14273 /* If we don't find any insns, we've got an empty function body;
14274 i.e. completely empty - without a return or branch.  This is
14275 taken as the case where a function body has been removed
14276 because it contains an inline __builtin_unreachable(). GCC
14277 declares that reaching __builtin_unreachable() means UB so
14278 we're not obliged to do anything special; however, we want
14279 non-zero-sized function bodies. To meet this, and help the
14280 user out, let's trap the case. */
14281 if (insn == NULL)
14282 fputs ("\tud2\n", file);
14283 }
14284 }
14285 else if (deleted_debug_label)
14286 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
14287 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
14288 CODE_LABEL_NUMBER (insn) = -1;
14289 }
14290 }
14291
14292 /* Return a scratch register to use in the split stack prologue. The
14293 split stack prologue is used for -fsplit-stack.  It consists of the first
14294 instructions in the function, emitted even before the regular prologue.
14295 The scratch register can be any caller-saved register which is not
14296 used for parameters or for the static chain. */
14297
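/* Illustrative summary of the cases handled below (not from the original
   sources): 64-bit targets always get %r11; a 32-bit fastcall function gets
   %eax; a thiscall function gets %edx, or %eax when it has a static chain;
   a function with fewer than three register parameters gets %ecx, or %edx
   when it has a static chain; the remaining combinations are rejected with
   sorry ().  */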
14298 static unsigned int
14299 split_stack_prologue_scratch_regno (void)
14300 {
14301 if (TARGET_64BIT)
14302 return R11_REG;
14303 else
14304 {
14305 bool is_fastcall, is_thiscall;
14306 int regparm;
14307
14308 is_fastcall = (lookup_attribute ("fastcall",
14309 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14310 != NULL);
14311 is_thiscall = (lookup_attribute ("thiscall",
14312 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
14313 != NULL);
14314 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
14315
14316 if (is_fastcall)
14317 {
14318 if (DECL_STATIC_CHAIN (cfun->decl))
14319 {
14320 sorry ("-fsplit-stack does not support fastcall with "
14321 "nested function");
14322 return INVALID_REGNUM;
14323 }
14324 return AX_REG;
14325 }
14326 else if (is_thiscall)
14327 {
14328 if (!DECL_STATIC_CHAIN (cfun->decl))
14329 return DX_REG;
14330 return AX_REG;
14331 }
14332 else if (regparm < 3)
14333 {
14334 if (!DECL_STATIC_CHAIN (cfun->decl))
14335 return CX_REG;
14336 else
14337 {
14338 if (regparm >= 2)
14339 {
14340 sorry ("-fsplit-stack does not support 2 register "
14341 "parameters for a nested function");
14342 return INVALID_REGNUM;
14343 }
14344 return DX_REG;
14345 }
14346 }
14347 else
14348 {
14349 /* FIXME: We could make this work by pushing a register
14350 around the addition and comparison. */
14351 sorry ("-fsplit-stack does not support 3 register parameters");
14352 return INVALID_REGNUM;
14353 }
14354 }
14355 }
14356
14357 /* A SYMBOL_REF for the function which allocates new stack space for
14358 -fsplit-stack. */
14359
14360 static GTY(()) rtx split_stack_fn;
14361
14362 /* A SYMBOL_REF for the more-stack function when using the large
14363 model.  */
14364
14365 static GTY(()) rtx split_stack_fn_large;
14366
14367 /* Return location of the stack guard value in the TLS block. */
14368
14369 rtx
14370 ix86_split_stack_guard (void)
14371 {
14372 int offset;
14373 addr_space_t as = DEFAULT_TLS_SEG_REG;
14374 rtx r;
14375
14376 gcc_assert (flag_split_stack);
14377
14378 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14379 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14380 #else
14381 gcc_unreachable ();
14382 #endif
14383
14384 r = GEN_INT (offset);
14385 r = gen_const_mem (Pmode, r);
14386 set_mem_addr_space (r, as);
14387
14388 return r;
14389 }
14390
14391 /* Handle -fsplit-stack. These are the first instructions in the
14392 function, even before the regular prologue. */
14393
14394 void
14395 ix86_expand_split_stack_prologue (void)
14396 {
14397 HOST_WIDE_INT allocate;
14398 unsigned HOST_WIDE_INT args_size;
14399 rtx_code_label *label;
14400 rtx limit, current, allocate_rtx, call_insn, call_fusage;
14401 rtx scratch_reg = NULL_RTX;
14402 rtx_code_label *varargs_label = NULL;
14403 rtx fn;
14404
14405 gcc_assert (flag_split_stack && reload_completed);
14406
14407 ix86_finalize_stack_frame_flags ();
14408 struct ix86_frame &frame = cfun->machine->frame;
14409 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
14410
14411 /* This is the label we will branch to if we have enough stack
14412 space. We expect the basic block reordering pass to reverse this
14413 branch if optimizing, so that we branch in the unlikely case. */
14414 label = gen_label_rtx ();
14415
14416 /* We need to compare the stack pointer minus the frame size with
14417 the stack boundary in the TCB. The stack boundary always gives
14418 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
14419 can compare directly. Otherwise we need to do an addition. */
14420
14421 limit = ix86_split_stack_guard ();
14422
14423 if (allocate < SPLIT_STACK_AVAILABLE)
14424 current = stack_pointer_rtx;
14425 else
14426 {
14427 unsigned int scratch_regno;
14428 rtx offset;
14429
14430 /* We need a scratch register to hold the stack pointer minus
14431 the required frame size. Since this is the very start of the
14432 function, the scratch register can be any caller-saved
14433 register which is not used for parameters. */
14434 offset = GEN_INT (- allocate);
14435 scratch_regno = split_stack_prologue_scratch_regno ();
14436 if (scratch_regno == INVALID_REGNUM)
14437 return;
14438 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14439 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
14440 {
14441 /* We don't use ix86_gen_add3 in this case because it will
14442 want to split to lea, but when not optimizing the insn
14443 will not be split after this point. */
14444 emit_insn (gen_rtx_SET (scratch_reg,
14445 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14446 offset)));
14447 }
14448 else
14449 {
14450 emit_move_insn (scratch_reg, offset);
14451 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
14452 stack_pointer_rtx));
14453 }
14454 current = scratch_reg;
14455 }
14456
14457 ix86_expand_branch (GEU, current, limit, label);
14458 rtx_insn *jump_insn = get_last_insn ();
14459 JUMP_LABEL (jump_insn) = label;
14460
14461 /* Mark the jump as very likely to be taken. */
14462 add_reg_br_prob_note (jump_insn, profile_probability::very_likely ());
14463
14464 if (split_stack_fn == NULL_RTX)
14465 {
14466 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
14467 SYMBOL_REF_FLAGS (split_stack_fn) |= SYMBOL_FLAG_LOCAL;
14468 }
14469 fn = split_stack_fn;
14470
14471 /* Get more stack space. We pass in the desired stack space and the
14472 size of the arguments to copy to the new stack. In 32-bit mode
14473 we push the parameters; __morestack will return on a new stack
14474 anyhow. In 64-bit mode we pass the parameters in r10 and
14475 r11. */
14476 allocate_rtx = GEN_INT (allocate);
14477 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
14478 call_fusage = NULL_RTX;
14479 rtx pop = NULL_RTX;
14480 if (TARGET_64BIT)
14481 {
14482 rtx reg10, reg11;
14483
14484 reg10 = gen_rtx_REG (Pmode, R10_REG);
14485 reg11 = gen_rtx_REG (Pmode, R11_REG);
14486
14487 /* If this function uses a static chain, it will be in %r10.
14488 Preserve it across the call to __morestack. */
14489 if (DECL_STATIC_CHAIN (cfun->decl))
14490 {
14491 rtx rax;
14492
14493 rax = gen_rtx_REG (word_mode, AX_REG);
14494 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
14495 use_reg (&call_fusage, rax);
14496 }
14497
14498 if ((ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
14499 && !TARGET_PECOFF)
14500 {
14501 HOST_WIDE_INT argval;
14502
14503 gcc_assert (Pmode == DImode);
14504 /* When using the large model we need to load the address
14505 into a register, and we've run out of registers. So we
14506 switch to a different calling convention, and we call a
14507 different function: __morestack_large_model.  We pass the
14508 argument size in the upper 32 bits of r10 and pass the
14509 frame size in the lower 32 bits. */
14510 gcc_assert ((allocate & HOST_WIDE_INT_C (0xffffffff)) == allocate);
14511 gcc_assert ((args_size & 0xffffffff) == args_size);
14512
14513 if (split_stack_fn_large == NULL_RTX)
14514 {
14515 split_stack_fn_large =
14516 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
14517 SYMBOL_REF_FLAGS (split_stack_fn_large) |= SYMBOL_FLAG_LOCAL;
14518 }
14519 if (ix86_cmodel == CM_LARGE_PIC)
14520 {
14521 rtx_code_label *label;
14522 rtx x;
14523
14524 label = gen_label_rtx ();
14525 emit_label (label);
14526 LABEL_PRESERVE_P (label) = 1;
14527 emit_insn (gen_set_rip_rex64 (reg10, label));
14528 emit_insn (gen_set_got_offset_rex64 (reg11, label));
14529 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
14530 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
14531 UNSPEC_GOT);
14532 x = gen_rtx_CONST (Pmode, x);
14533 emit_move_insn (reg11, x);
14534 x = gen_rtx_PLUS (Pmode, reg10, reg11);
14535 x = gen_const_mem (Pmode, x);
14536 emit_move_insn (reg11, x);
14537 }
14538 else
14539 emit_move_insn (reg11, split_stack_fn_large);
14540
14541 fn = reg11;
14542
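/* Pack the two 32-bit values into r10; e.g. (illustrative) args_size == 0x20
   and allocate == 0x1000 yield argval == 0x0000002000001000.  */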
14543 argval = ((args_size << 16) << 16) + allocate;
14544 emit_move_insn (reg10, GEN_INT (argval));
14545 }
14546 else
14547 {
14548 emit_move_insn (reg10, allocate_rtx);
14549 emit_move_insn (reg11, GEN_INT (args_size));
14550 use_reg (&call_fusage, reg11);
14551 }
14552
14553 use_reg (&call_fusage, reg10);
14554 }
14555 else
14556 {
14557 rtx_insn *insn = emit_insn (gen_push (GEN_INT (args_size)));
14558 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (UNITS_PER_WORD));
14559 insn = emit_insn (gen_push (allocate_rtx));
14560 add_reg_note (insn, REG_ARGS_SIZE, GEN_INT (2 * UNITS_PER_WORD));
14561 pop = GEN_INT (2 * UNITS_PER_WORD);
14562 }
14563 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
14564 GEN_INT (UNITS_PER_WORD), constm1_rtx,
14565 pop, false);
14566 add_function_usage_to (call_insn, call_fusage);
14567 if (!TARGET_64BIT)
14568 add_reg_note (call_insn, REG_ARGS_SIZE, GEN_INT (0));
14569 /* Indicate that this function can't jump to non-local gotos. */
14570 make_reg_eh_region_note_nothrow_nononlocal (as_a <rtx_insn *> (call_insn));
14571
14572 /* In order to make call/return prediction work right, we now need
14573 to execute a return instruction. See
14574 libgcc/config/i386/morestack.S for the details on how this works.
14575
14576 For flow purposes gcc must not see this as a return
14577 instruction--we need control flow to continue at the subsequent
14578 label. Therefore, we use an unspec. */
14579 gcc_assert (crtl->args.pops_args < 65536);
14580 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
14581
14582 /* If we are in 64-bit mode and this function uses a static chain,
14583 we saved %r10 in %rax before calling __morestack.  */
14584 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
14585 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
14586 gen_rtx_REG (word_mode, AX_REG));
14587
14588 /* If this function calls va_start, we need to store a pointer to
14589 the arguments on the old stack, because they may not have been
14590 all copied to the new stack. At this point the old stack can be
14591 found at the frame pointer value used by __morestack, because
14592 __morestack has set that up before calling back to us. Here we
14593 store that pointer in a scratch register, and in
14594 ix86_expand_prologue we store the scratch register in a stack
14595 slot. */
14596 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14597 {
14598 unsigned int scratch_regno;
14599 rtx frame_reg;
14600 int words;
14601
14602 scratch_regno = split_stack_prologue_scratch_regno ();
14603 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
14604 frame_reg = gen_rtx_REG (Pmode, BP_REG);
14605
14606 /* 64-bit:
14607 fp -> old fp value
14608 return address within this function
14609 return address of caller of this function
14610 stack arguments
14611 So we add three words to get to the stack arguments.
14612
14613 32-bit:
14614 fp -> old fp value
14615 return address within this function
14616 first argument to __morestack
14617 second argument to __morestack
14618 return address of caller of this function
14619 stack arguments
14620 So we add five words to get to the stack arguments.
14621 */
14622 words = TARGET_64BIT ? 3 : 5;
14623 emit_insn (gen_rtx_SET (scratch_reg,
14624 gen_rtx_PLUS (Pmode, frame_reg,
14625 GEN_INT (words * UNITS_PER_WORD))));
14626
14627 varargs_label = gen_label_rtx ();
14628 emit_jump_insn (gen_jump (varargs_label));
14629 JUMP_LABEL (get_last_insn ()) = varargs_label;
14630
14631 emit_barrier ();
14632 }
14633
14634 emit_label (label);
14635 LABEL_NUSES (label) = 1;
14636
14637 /* If this function calls va_start, we now have to set the scratch
14638 register for the case where we do not call __morestack. In this
14639 case we need to set it based on the stack pointer. */
14640 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14641 {
14642 emit_insn (gen_rtx_SET (scratch_reg,
14643 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
14644 GEN_INT (UNITS_PER_WORD))));
14645
14646 emit_label (varargs_label);
14647 LABEL_NUSES (varargs_label) = 1;
14648 }
14649 }
14650
14651 /* We may have to tell the dataflow pass that the split stack prologue
14652 is initializing a scratch register. */
14653
14654 static void
14655 ix86_live_on_entry (bitmap regs)
14656 {
14657 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
14658 {
14659 gcc_assert (flag_split_stack);
14660 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
14661 }
14662 }
14663 \f
14664 /* Extract the parts of an RTL expression that is a valid memory address
14665 for an instruction. Return 0 if the structure of the address is
14666 grossly off. Return -1 if the address contains ASHIFT, so it is not
14667 strictly valid, but still used for computing the length of the lea instruction.  */
14668
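/* For example (illustrative, not from the original sources):
   (plus:DI (plus:DI (mult:DI (reg:DI %rcx) (const_int 4)) (reg:DI %rbx))
            (const_int 12))
   decomposes into base = %rbx, index = %rcx, scale = 4, disp = 12 and
   seg = ADDR_SPACE_GENERIC, returning 1, while a bare
   (ashift:DI (reg:DI %rcx) (const_int 2)) yields index = %rcx, scale = 4
   and returns -1.  */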
14669 int
14670 ix86_decompose_address (rtx addr, struct ix86_address *out)
14671 {
14672 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
14673 rtx base_reg, index_reg;
14674 HOST_WIDE_INT scale = 1;
14675 rtx scale_rtx = NULL_RTX;
14676 rtx tmp;
14677 int retval = 1;
14678 addr_space_t seg = ADDR_SPACE_GENERIC;
14679
14680 /* Allow zero-extended SImode addresses,
14681 they will be emitted with addr32 prefix. */
14682 if (TARGET_64BIT && GET_MODE (addr) == DImode)
14683 {
14684 if (GET_CODE (addr) == ZERO_EXTEND
14685 && GET_MODE (XEXP (addr, 0)) == SImode)
14686 {
14687 addr = XEXP (addr, 0);
14688 if (CONST_INT_P (addr))
14689 return 0;
14690 }
14691 else if (GET_CODE (addr) == AND
14692 && const_32bit_mask (XEXP (addr, 1), DImode))
14693 {
14694 addr = lowpart_subreg (SImode, XEXP (addr, 0), DImode);
14695 if (addr == NULL_RTX)
14696 return 0;
14697
14698 if (CONST_INT_P (addr))
14699 return 0;
14700 }
14701 }
14702
14703 /* Allow SImode subregs of DImode addresses,
14704 they will be emitted with addr32 prefix. */
14705 if (TARGET_64BIT && GET_MODE (addr) == SImode)
14706 {
14707 if (SUBREG_P (addr)
14708 && GET_MODE (SUBREG_REG (addr)) == DImode)
14709 {
14710 addr = SUBREG_REG (addr);
14711 if (CONST_INT_P (addr))
14712 return 0;
14713 }
14714 }
14715
14716 if (REG_P (addr))
14717 base = addr;
14718 else if (SUBREG_P (addr))
14719 {
14720 if (REG_P (SUBREG_REG (addr)))
14721 base = addr;
14722 else
14723 return 0;
14724 }
14725 else if (GET_CODE (addr) == PLUS)
14726 {
14727 rtx addends[4], op;
14728 int n = 0, i;
14729
14730 op = addr;
14731 do
14732 {
14733 if (n >= 4)
14734 return 0;
14735 addends[n++] = XEXP (op, 1);
14736 op = XEXP (op, 0);
14737 }
14738 while (GET_CODE (op) == PLUS);
14739 if (n >= 4)
14740 return 0;
14741 addends[n] = op;
14742
14743 for (i = n; i >= 0; --i)
14744 {
14745 op = addends[i];
14746 switch (GET_CODE (op))
14747 {
14748 case MULT:
14749 if (index)
14750 return 0;
14751 index = XEXP (op, 0);
14752 scale_rtx = XEXP (op, 1);
14753 break;
14754
14755 case ASHIFT:
14756 if (index)
14757 return 0;
14758 index = XEXP (op, 0);
14759 tmp = XEXP (op, 1);
14760 if (!CONST_INT_P (tmp))
14761 return 0;
14762 scale = INTVAL (tmp);
14763 if ((unsigned HOST_WIDE_INT) scale > 3)
14764 return 0;
14765 scale = 1 << scale;
14766 break;
14767
14768 case ZERO_EXTEND:
14769 op = XEXP (op, 0);
14770 if (GET_CODE (op) != UNSPEC)
14771 return 0;
14772 /* FALLTHRU */
14773
14774 case UNSPEC:
14775 if (XINT (op, 1) == UNSPEC_TP
14776 && TARGET_TLS_DIRECT_SEG_REFS
14777 && seg == ADDR_SPACE_GENERIC)
14778 seg = DEFAULT_TLS_SEG_REG;
14779 else
14780 return 0;
14781 break;
14782
14783 case SUBREG:
14784 if (!REG_P (SUBREG_REG (op)))
14785 return 0;
14786 /* FALLTHRU */
14787
14788 case REG:
14789 if (!base)
14790 base = op;
14791 else if (!index)
14792 index = op;
14793 else
14794 return 0;
14795 break;
14796
14797 case CONST:
14798 case CONST_INT:
14799 case SYMBOL_REF:
14800 case LABEL_REF:
14801 if (disp)
14802 return 0;
14803 disp = op;
14804 break;
14805
14806 default:
14807 return 0;
14808 }
14809 }
14810 }
14811 else if (GET_CODE (addr) == MULT)
14812 {
14813 index = XEXP (addr, 0); /* index*scale */
14814 scale_rtx = XEXP (addr, 1);
14815 }
14816 else if (GET_CODE (addr) == ASHIFT)
14817 {
14818 /* We're called for lea too, which implements ashift on occasion. */
14819 index = XEXP (addr, 0);
14820 tmp = XEXP (addr, 1);
14821 if (!CONST_INT_P (tmp))
14822 return 0;
14823 scale = INTVAL (tmp);
14824 if ((unsigned HOST_WIDE_INT) scale > 3)
14825 return 0;
14826 scale = 1 << scale;
14827 retval = -1;
14828 }
14829 else
14830 disp = addr; /* displacement */
14831
14832 if (index)
14833 {
14834 if (REG_P (index))
14835 ;
14836 else if (SUBREG_P (index)
14837 && REG_P (SUBREG_REG (index)))
14838 ;
14839 else
14840 return 0;
14841 }
14842
14843 /* Extract the integral value of scale. */
14844 if (scale_rtx)
14845 {
14846 if (!CONST_INT_P (scale_rtx))
14847 return 0;
14848 scale = INTVAL (scale_rtx);
14849 }
14850
14851 base_reg = base && SUBREG_P (base) ? SUBREG_REG (base) : base;
14852 index_reg = index && SUBREG_P (index) ? SUBREG_REG (index) : index;
14853
14854 /* Avoid useless 0 displacement. */
14855 if (disp == const0_rtx && (base || index))
14856 disp = NULL_RTX;
14857
14858 /* Allow arg pointer and stack pointer as index if there is no scaling.  */
14859 if (base_reg && index_reg && scale == 1
14860 && (REGNO (index_reg) == ARG_POINTER_REGNUM
14861 || REGNO (index_reg) == FRAME_POINTER_REGNUM
14862 || REGNO (index_reg) == SP_REG))
14863 {
14864 std::swap (base, index);
14865 std::swap (base_reg, index_reg);
14866 }
14867
14868 /* Special case: %ebp cannot be encoded as a base without a displacement.
14869 Similarly %r13. */
14870 if (!disp && base_reg
14871 && (REGNO (base_reg) == ARG_POINTER_REGNUM
14872 || REGNO (base_reg) == FRAME_POINTER_REGNUM
14873 || REGNO (base_reg) == BP_REG
14874 || REGNO (base_reg) == R13_REG))
14875 disp = const0_rtx;
14876
14877 /* Special case: on K6, [%esi] makes the instruction vector decoded.
14878 Avoid this by transforming to [%esi+0].
14879 Reload calls address legitimization without cfun defined, so we need
14880 to test cfun for being non-NULL. */
14881 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
14882 && base_reg && !index_reg && !disp
14883 && REGNO (base_reg) == SI_REG)
14884 disp = const0_rtx;
14885
14886 /* Special case: encode reg+reg instead of reg*2. */
14887 if (!base && index && scale == 2)
14888 base = index, base_reg = index_reg, scale = 1;
14889
14890 /* Special case: scaling cannot be encoded without base or displacement. */
14891 if (!base && !disp && index && scale != 1)
14892 disp = const0_rtx;
14893
14894 out->base = base;
14895 out->index = index;
14896 out->disp = disp;
14897 out->scale = scale;
14898 out->seg = seg;
14899
14900 return retval;
14901 }
14902 \f
14903 /* Return cost of the memory address x.
14904 For i386, it is better to use a complex address than let gcc copy
14905 the address into a reg and make a new pseudo. But not if the address
14906 requires two regs - that would mean more pseudos with longer
14907 lifetimes. */
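/* As a rough illustration (an assumption, not from the original sources):
   before reload, (plus (reg pseudo1) (reg pseudo2)) costs 3 here while
   (plus (reg pseudo1) (const_int 4)) costs 2, before any K6-specific
   penalty below is added.  */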
14908 static int
14909 ix86_address_cost (rtx x, machine_mode, addr_space_t, bool)
14910 {
14911 struct ix86_address parts;
14912 int cost = 1;
14913 int ok = ix86_decompose_address (x, &parts);
14914
14915 gcc_assert (ok);
14916
14917 if (parts.base && SUBREG_P (parts.base))
14918 parts.base = SUBREG_REG (parts.base);
14919 if (parts.index && SUBREG_P (parts.index))
14920 parts.index = SUBREG_REG (parts.index);
14921
14922 /* Attempt to minimize number of registers in the address by increasing
14923 address cost for each used register. We don't increase address cost
14924 for "pic_offset_table_rtx".  When a memop using "pic_offset_table_rtx"
14925 is not invariant itself, it most likely means that the base or index is not
14926 invariant.  Therefore only "pic_offset_table_rtx" could be hoisted out,
14927 which is not profitable for x86. */
14928 if (parts.base
14929 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
14930 && (current_pass->type == GIMPLE_PASS
14931 || !pic_offset_table_rtx
14932 || !REG_P (parts.base)
14933 || REGNO (pic_offset_table_rtx) != REGNO (parts.base)))
14934 cost++;
14935
14936 if (parts.index
14937 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
14938 && (current_pass->type == GIMPLE_PASS
14939 || !pic_offset_table_rtx
14940 || !REG_P (parts.index)
14941 || REGNO (pic_offset_table_rtx) != REGNO (parts.index)))
14942 cost++;
14943
14944 /* The AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
14945 since its predecode logic can't detect the length of instructions
14946 and they degenerate to vector decoded.  Increase the cost of such
14947 addresses here.  The penalty is at least 2 cycles.  It may be worthwhile
14948 to split such addresses or even refuse such addresses at all.
14949
14950 Following addressing modes are affected:
14951 [base+scale*index]
14952 [scale*index+disp]
14953 [base+index]
14954
14955 The first and last case may be avoidable by explicitly coding the zero in
14956 the memory address, but I don't have an AMD-K6 machine handy to check this
14957 theory.  */
14958
14959 if (TARGET_K6
14960 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
14961 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
14962 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
14963 cost += 10;
14964
14965 return cost;
14966 }
14967 \f
14968 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
14969 this is used to form addresses to local data when -fPIC is in
14970 use. */
14971
14972 static bool
14973 darwin_local_data_pic (rtx disp)
14974 {
14975 return (GET_CODE (disp) == UNSPEC
14976 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
14977 }
14978
14979 /* True if operand X should be loaded from GOT. */
14980
14981 bool
14982 ix86_force_load_from_GOT_p (rtx x)
14983 {
14984 return ((TARGET_64BIT || HAVE_AS_IX86_GOT32X)
14985 && !TARGET_PECOFF && !TARGET_MACHO
14986 && !flag_plt && !flag_pic
14987 && ix86_cmodel != CM_LARGE
14988 && GET_CODE (x) == SYMBOL_REF
14989 && SYMBOL_REF_FUNCTION_P (x)
14990 && !SYMBOL_REF_LOCAL_P (x));
14991 }
14992
14993 /* Determine if a given RTX is a valid constant. We already know this
14994 satisfies CONSTANT_P. */
14995
14996 static bool
14997 ix86_legitimate_constant_p (machine_mode mode, rtx x)
14998 {
14999 /* Pointer bounds constants are not valid. */
15000 if (POINTER_BOUNDS_MODE_P (GET_MODE (x)))
15001 return false;
15002
15003 switch (GET_CODE (x))
15004 {
15005 case CONST:
15006 x = XEXP (x, 0);
15007
15008 if (GET_CODE (x) == PLUS)
15009 {
15010 if (!CONST_INT_P (XEXP (x, 1)))
15011 return false;
15012 x = XEXP (x, 0);
15013 }
15014
15015 if (TARGET_MACHO && darwin_local_data_pic (x))
15016 return true;
15017
15018 /* Only some unspecs are valid as "constants". */
15019 if (GET_CODE (x) == UNSPEC)
15020 switch (XINT (x, 1))
15021 {
15022 case UNSPEC_GOT:
15023 case UNSPEC_GOTOFF:
15024 case UNSPEC_PLTOFF:
15025 return TARGET_64BIT;
15026 case UNSPEC_TPOFF:
15027 case UNSPEC_NTPOFF:
15028 x = XVECEXP (x, 0, 0);
15029 return (GET_CODE (x) == SYMBOL_REF
15030 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15031 case UNSPEC_DTPOFF:
15032 x = XVECEXP (x, 0, 0);
15033 return (GET_CODE (x) == SYMBOL_REF
15034 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
15035 default:
15036 return false;
15037 }
15038
15039 /* We must have drilled down to a symbol. */
15040 if (GET_CODE (x) == LABEL_REF)
15041 return true;
15042 if (GET_CODE (x) != SYMBOL_REF)
15043 return false;
15044 /* FALLTHRU */
15045
15046 case SYMBOL_REF:
15047 /* TLS symbols are never valid. */
15048 if (SYMBOL_REF_TLS_MODEL (x))
15049 return false;
15050
15051 /* DLLIMPORT symbols are never valid. */
15052 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15053 && SYMBOL_REF_DLLIMPORT_P (x))
15054 return false;
15055
15056 #if TARGET_MACHO
15057 /* mdynamic-no-pic */
15058 if (MACHO_DYNAMIC_NO_PIC_P)
15059 return machopic_symbol_defined_p (x);
15060 #endif
15061
15062 /* External function address should be loaded
15063 via the GOT slot to avoid PLT. */
15064 if (ix86_force_load_from_GOT_p (x))
15065 return false;
15066
15067 break;
15068
15069 CASE_CONST_SCALAR_INT:
15070 switch (mode)
15071 {
15072 case E_TImode:
15073 if (TARGET_64BIT)
15074 return true;
15075 /* FALLTHRU */
15076 case E_OImode:
15077 case E_XImode:
15078 if (!standard_sse_constant_p (x, mode))
15079 return false;
15080 default:
15081 break;
15082 }
15083 break;
15084
15085 case CONST_VECTOR:
15086 if (!standard_sse_constant_p (x, mode))
15087 return false;
15088
15089 default:
15090 break;
15091 }
15092
15093 /* Otherwise we handle everything else in the move patterns. */
15094 return true;
15095 }
15096
15097 /* Determine if it's legal to put X into the constant pool. This
15098 is not possible for the address of thread-local symbols, which
15099 is checked above. */
15100
15101 static bool
15102 ix86_cannot_force_const_mem (machine_mode mode, rtx x)
15103 {
15104 /* We can put any immediate constant in memory. */
15105 switch (GET_CODE (x))
15106 {
15107 CASE_CONST_ANY:
15108 return false;
15109
15110 default:
15111 break;
15112 }
15113
15114 return !ix86_legitimate_constant_p (mode, x);
15115 }
15116
15117 /* True if the symbol is marked as dllimport or as a stub-variable,
15118 otherwise false.  */
15119
15120 static bool
15121 is_imported_p (rtx x)
15122 {
15123 if (!TARGET_DLLIMPORT_DECL_ATTRIBUTES
15124 || GET_CODE (x) != SYMBOL_REF)
15125 return false;
15126
15127 return SYMBOL_REF_DLLIMPORT_P (x) || SYMBOL_REF_STUBVAR_P (x);
15128 }
15129
15130
15131 /* Nonzero if the constant value X is a legitimate general operand
15132 when generating PIC code. It is given that flag_pic is on and
15133 that X satisfies CONSTANT_P. */
15134
15135 bool
15136 legitimate_pic_operand_p (rtx x)
15137 {
15138 rtx inner;
15139
15140 switch (GET_CODE (x))
15141 {
15142 case CONST:
15143 inner = XEXP (x, 0);
15144 if (GET_CODE (inner) == PLUS
15145 && CONST_INT_P (XEXP (inner, 1)))
15146 inner = XEXP (inner, 0);
15147
15148 /* Only some unspecs are valid as "constants". */
15149 if (GET_CODE (inner) == UNSPEC)
15150 switch (XINT (inner, 1))
15151 {
15152 case UNSPEC_GOT:
15153 case UNSPEC_GOTOFF:
15154 case UNSPEC_PLTOFF:
15155 return TARGET_64BIT;
15156 case UNSPEC_TPOFF:
15157 x = XVECEXP (inner, 0, 0);
15158 return (GET_CODE (x) == SYMBOL_REF
15159 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
15160 case UNSPEC_MACHOPIC_OFFSET:
15161 return legitimate_pic_address_disp_p (x);
15162 default:
15163 return false;
15164 }
15165 /* FALLTHRU */
15166
15167 case SYMBOL_REF:
15168 case LABEL_REF:
15169 return legitimate_pic_address_disp_p (x);
15170
15171 default:
15172 return true;
15173 }
15174 }
15175
15176 /* Determine if a given CONST RTX is a valid memory displacement
15177 in PIC mode. */
15178
15179 bool
15180 legitimate_pic_address_disp_p (rtx disp)
15181 {
15182 bool saw_plus;
15183
15184 /* In 64bit mode we can allow direct addresses of symbols and labels
15185 when they are not dynamic symbols. */
15186 if (TARGET_64BIT)
15187 {
15188 rtx op0 = disp, op1;
15189
15190 switch (GET_CODE (disp))
15191 {
15192 case LABEL_REF:
15193 return true;
15194
15195 case CONST:
15196 if (GET_CODE (XEXP (disp, 0)) != PLUS)
15197 break;
15198 op0 = XEXP (XEXP (disp, 0), 0);
15199 op1 = XEXP (XEXP (disp, 0), 1);
15200 if (!CONST_INT_P (op1))
15201 break;
15202 if (GET_CODE (op0) == UNSPEC
15203 && (XINT (op0, 1) == UNSPEC_DTPOFF
15204 || XINT (op0, 1) == UNSPEC_NTPOFF)
15205 && trunc_int_for_mode (INTVAL (op1), SImode) == INTVAL (op1))
15206 return true;
15207 if (INTVAL (op1) >= 16*1024*1024
15208 || INTVAL (op1) < -16*1024*1024)
15209 break;
15210 if (GET_CODE (op0) == LABEL_REF)
15211 return true;
15212 if (GET_CODE (op0) == CONST
15213 && GET_CODE (XEXP (op0, 0)) == UNSPEC
15214 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
15215 return true;
15216 if (GET_CODE (op0) == UNSPEC
15217 && XINT (op0, 1) == UNSPEC_PCREL)
15218 return true;
15219 if (GET_CODE (op0) != SYMBOL_REF)
15220 break;
15221 /* FALLTHRU */
15222
15223 case SYMBOL_REF:
15224 /* TLS references should always be enclosed in UNSPEC.
15225 A dllimported symbol always needs to be resolved.  */
15226 if (SYMBOL_REF_TLS_MODEL (op0)
15227 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && SYMBOL_REF_DLLIMPORT_P (op0)))
15228 return false;
15229
15230 if (TARGET_PECOFF)
15231 {
15232 if (is_imported_p (op0))
15233 return true;
15234
15235 if (SYMBOL_REF_FAR_ADDR_P (op0)
15236 || !SYMBOL_REF_LOCAL_P (op0))
15237 break;
15238
15239 /* Function symbols need to be resolved only for
15240 the large model.
15241 For the small model we don't need to resolve anything
15242 here.  */
15243 if ((ix86_cmodel != CM_LARGE_PIC
15244 && SYMBOL_REF_FUNCTION_P (op0))
15245 || ix86_cmodel == CM_SMALL_PIC)
15246 return true;
15247 /* Non-external symbols don't need to be resolved for
15248 the large and medium models.  */
15249 if ((ix86_cmodel == CM_LARGE_PIC
15250 || ix86_cmodel == CM_MEDIUM_PIC)
15251 && !SYMBOL_REF_EXTERNAL_P (op0))
15252 return true;
15253 }
15254 else if (!SYMBOL_REF_FAR_ADDR_P (op0)
15255 && (SYMBOL_REF_LOCAL_P (op0)
15256 || (HAVE_LD_PIE_COPYRELOC
15257 && flag_pie
15258 && !SYMBOL_REF_WEAK (op0)
15259 && !SYMBOL_REF_FUNCTION_P (op0)))
15260 && ix86_cmodel != CM_LARGE_PIC)
15261 return true;
15262 break;
15263
15264 default:
15265 break;
15266 }
15267 }
15268 if (GET_CODE (disp) != CONST)
15269 return false;
15270 disp = XEXP (disp, 0);
15271
15272 if (TARGET_64BIT)
15273 {
15274 /* It is unsafe to allow PLUS expressions.  This limits the allowed distance
15275 of GOT tables.  We should not need these anyway.  */
15276 if (GET_CODE (disp) != UNSPEC
15277 || (XINT (disp, 1) != UNSPEC_GOTPCREL
15278 && XINT (disp, 1) != UNSPEC_GOTOFF
15279 && XINT (disp, 1) != UNSPEC_PCREL
15280 && XINT (disp, 1) != UNSPEC_PLTOFF))
15281 return false;
15282
15283 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
15284 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
15285 return false;
15286 return true;
15287 }
15288
15289 saw_plus = false;
15290 if (GET_CODE (disp) == PLUS)
15291 {
15292 if (!CONST_INT_P (XEXP (disp, 1)))
15293 return false;
15294 disp = XEXP (disp, 0);
15295 saw_plus = true;
15296 }
15297
15298 if (TARGET_MACHO && darwin_local_data_pic (disp))
15299 return true;
15300
15301 if (GET_CODE (disp) != UNSPEC)
15302 return false;
15303
15304 switch (XINT (disp, 1))
15305 {
15306 case UNSPEC_GOT:
15307 if (saw_plus)
15308 return false;
15309 /* We need to check for both symbols and labels because VxWorks loads
15310 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
15311 details. */
15312 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15313 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
15314 case UNSPEC_GOTOFF:
15315 /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
15316 While the ABI also specifies a 32bit relocation, we don't produce it in the
15317 small PIC model at all.  */
15318 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
15319 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
15320 && !TARGET_64BIT)
15321 return !TARGET_PECOFF && gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
15322 return false;
15323 case UNSPEC_GOTTPOFF:
15324 case UNSPEC_GOTNTPOFF:
15325 case UNSPEC_INDNTPOFF:
15326 if (saw_plus)
15327 return false;
15328 disp = XVECEXP (disp, 0, 0);
15329 return (GET_CODE (disp) == SYMBOL_REF
15330 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
15331 case UNSPEC_NTPOFF:
15332 disp = XVECEXP (disp, 0, 0);
15333 return (GET_CODE (disp) == SYMBOL_REF
15334 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
15335 case UNSPEC_DTPOFF:
15336 disp = XVECEXP (disp, 0, 0);
15337 return (GET_CODE (disp) == SYMBOL_REF
15338 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
15339 }
15340
15341 return false;
15342 }
15343
15344 /* Determine if op is a suitable RTX for an address register.
15345 Return naked register if a register or a register subreg is
15346 found, otherwise return NULL_RTX. */
15347
15348 static rtx
15349 ix86_validate_address_register (rtx op)
15350 {
15351 machine_mode mode = GET_MODE (op);
15352
15353 /* Only SImode or DImode registers can form the address. */
15354 if (mode != SImode && mode != DImode)
15355 return NULL_RTX;
15356
15357 if (REG_P (op))
15358 return op;
15359 else if (SUBREG_P (op))
15360 {
15361 rtx reg = SUBREG_REG (op);
15362
15363 if (!REG_P (reg))
15364 return NULL_RTX;
15365
15366 mode = GET_MODE (reg);
15367
15368 /* Don't allow SUBREGs that span more than a word. It can
15369 lead to spill failures when the register is one word out
15370 of a two word structure. */
15371 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
15372 return NULL_RTX;
15373
15374 /* Allow only SUBREGs of non-eliminable hard registers. */
15375 if (register_no_elim_operand (reg, mode))
15376 return reg;
15377 }
15378
15379 /* Op is not a register. */
15380 return NULL_RTX;
15381 }
15382
15383 /* Recognizes RTL expressions that are valid memory addresses for an
15384 instruction. The MODE argument is the machine mode for the MEM
15385 expression that wants to use this address.
15386
15387 It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
15388 convert common non-canonical forms to canonical form so that they will
15389 be recognized. */
15390
15391 static bool
15392 ix86_legitimate_address_p (machine_mode, rtx addr, bool strict)
15393 {
15394 struct ix86_address parts;
15395 rtx base, index, disp;
15396 HOST_WIDE_INT scale;
15397 addr_space_t seg;
15398
15399 if (ix86_decompose_address (addr, &parts) <= 0)
15400 /* Decomposition failed. */
15401 return false;
15402
15403 base = parts.base;
15404 index = parts.index;
15405 disp = parts.disp;
15406 scale = parts.scale;
15407 seg = parts.seg;
15408
15409 /* Validate base register. */
15410 if (base)
15411 {
15412 rtx reg = ix86_validate_address_register (base);
15413
15414 if (reg == NULL_RTX)
15415 return false;
15416
15417 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
15418 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
15419 /* Base is not valid. */
15420 return false;
15421 }
15422
15423 /* Validate index register. */
15424 if (index)
15425 {
15426 rtx reg = ix86_validate_address_register (index);
15427
15428 if (reg == NULL_RTX)
15429 return false;
15430
15431 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
15432 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
15433 /* Index is not valid. */
15434 return false;
15435 }
15436
15437 /* Index and base should have the same mode. */
15438 if (base && index
15439 && GET_MODE (base) != GET_MODE (index))
15440 return false;
15441
15442 /* Address override works only on the (%reg) part of %fs:(%reg). */
15443 if (seg != ADDR_SPACE_GENERIC
15444 && ((base && GET_MODE (base) != word_mode)
15445 || (index && GET_MODE (index) != word_mode)))
15446 return false;
15447
15448 /* Validate scale factor. */
15449 if (scale != 1)
15450 {
15451 if (!index)
15452 /* Scale without index. */
15453 return false;
15454
15455 if (scale != 2 && scale != 4 && scale != 8)
15456 /* Scale is not a valid multiplier. */
15457 return false;
15458 }
15459
15460 /* Validate displacement. */
15461 if (disp)
15462 {
15463 if (GET_CODE (disp) == CONST
15464 && GET_CODE (XEXP (disp, 0)) == UNSPEC
15465 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
15466 switch (XINT (XEXP (disp, 0), 1))
15467 {
15468 /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit
15469 when used.  While the ABI also specifies 32bit relocations, we
15470 don't produce them at all and use IP-relative addressing instead.
15471 Allow GOT in 32bit mode for both PIC and non-PIC if the symbol
15472 should be loaded via the GOT.  */
15473 case UNSPEC_GOT:
15474 if (!TARGET_64BIT
15475 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15476 goto is_legitimate_pic;
15477 /* FALLTHRU */
15478 case UNSPEC_GOTOFF:
15479 gcc_assert (flag_pic);
15480 if (!TARGET_64BIT)
15481 goto is_legitimate_pic;
15482
15483 /* 64bit address unspec. */
15484 return false;
15485
15486 case UNSPEC_GOTPCREL:
15487 if (ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
15488 goto is_legitimate_pic;
15489 /* FALLTHRU */
15490 case UNSPEC_PCREL:
15491 gcc_assert (flag_pic);
15492 goto is_legitimate_pic;
15493
15494 case UNSPEC_GOTTPOFF:
15495 case UNSPEC_GOTNTPOFF:
15496 case UNSPEC_INDNTPOFF:
15497 case UNSPEC_NTPOFF:
15498 case UNSPEC_DTPOFF:
15499 break;
15500
15501 default:
15502 /* Invalid address unspec. */
15503 return false;
15504 }
15505
15506 else if (SYMBOLIC_CONST (disp)
15507 && (flag_pic
15508 || (TARGET_MACHO
15509 #if TARGET_MACHO
15510 && MACHOPIC_INDIRECT
15511 && !machopic_operand_p (disp)
15512 #endif
15513 )))
15514 {
15515
15516 is_legitimate_pic:
15517 if (TARGET_64BIT && (index || base))
15518 {
15519 /* foo@dtpoff(%rX) is ok. */
15520 if (GET_CODE (disp) != CONST
15521 || GET_CODE (XEXP (disp, 0)) != PLUS
15522 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
15523 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
15524 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
15525 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
15526 /* Non-constant pic memory reference. */
15527 return false;
15528 }
15529 else if ((!TARGET_MACHO || flag_pic)
15530 && ! legitimate_pic_address_disp_p (disp))
15531 /* Displacement is an invalid pic construct. */
15532 return false;
15533 #if TARGET_MACHO
15534 else if (MACHO_DYNAMIC_NO_PIC_P
15535 && !ix86_legitimate_constant_p (Pmode, disp))
15536 /* displacement must be referenced via non_lazy_pointer */
15537 return false;
15538 #endif
15539
15540 /* This code used to verify that a symbolic pic displacement
15541 includes the pic_offset_table_rtx register.
15542
15543 While this is a good idea, unfortunately these constructs may
15544 be created by the "adds using lea" optimization for incorrect
15545 code like:
15546
15547 int a;
15548 int foo(int i)
15549 {
15550 return *(&a+i);
15551 }
15552
15553 This code is nonsensical, but results in addressing the
15554 GOT table with a pic_offset_table_rtx base.  We can't
15555 just refuse it easily, since it gets matched by the
15556 "addsi3" pattern, which later gets split to lea in the
15557 case the output register differs from the input.  While this
15558 could be handled by a separate addsi pattern for this case
15559 that never results in lea, disabling this test seems to be the
15560 easier and correct fix for the crash.  */
15561 }
15562 else if (GET_CODE (disp) != LABEL_REF
15563 && !CONST_INT_P (disp)
15564 && (GET_CODE (disp) != CONST
15565 || !ix86_legitimate_constant_p (Pmode, disp))
15566 && (GET_CODE (disp) != SYMBOL_REF
15567 || !ix86_legitimate_constant_p (Pmode, disp)))
15568 /* Displacement is not constant. */
15569 return false;
15570 else if (TARGET_64BIT
15571 && !x86_64_immediate_operand (disp, VOIDmode))
15572 /* Displacement is out of range. */
15573 return false;
15574 /* In x32 mode, constant addresses are sign extended to 64bit, so
15575 we have to prevent addresses from 0x80000000 to 0xffffffff. */
15576 else if (TARGET_X32 && !(index || base)
15577 && CONST_INT_P (disp)
15578 && val_signbit_known_set_p (SImode, INTVAL (disp)))
15579 return false;
15580 }
15581
15582 /* Everything looks valid. */
15583 return true;
15584 }
15585
15586 /* Determine if a given RTX is a valid constant address. */
15587
15588 bool
15589 constant_address_p (rtx x)
15590 {
15591 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
15592 }
15593 \f
15594 /* Return a unique alias set for the GOT. */
15595
15596 static alias_set_type
15597 ix86_GOT_alias_set (void)
15598 {
15599 static alias_set_type set = -1;
15600 if (set == -1)
15601 set = new_alias_set ();
15602 return set;
15603 }
15604
15605 /* Return a legitimate reference for ORIG (an address) using the
15606 register REG. If REG is 0, a new pseudo is generated.
15607
15608 There are two types of references that must be handled:
15609
15610 1. Global data references must load the address from the GOT, via
15611 the PIC reg. An insn is emitted to do this load, and the reg is
15612 returned.
15613
15614 2. Static data references, constant pool addresses, and code labels
15615 compute the address as an offset from the GOT, whose base is in
15616 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
15617 differentiate them from global data objects. The returned
15618 address is the PIC reg + an unspec constant.
15619
15620 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
15621 reg also appears in the address. */
15622
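/* Rough sketch of the forms built below (illustrative): case 1 on 32-bit
   targets yields something like
   (mem (plus (reg pic_offset_table) (const (unspec [sym] UNSPEC_GOT))))
   with the GOT alias set, while case 2 yields
   (plus (reg pic_offset_table) (const (unspec [sym] UNSPEC_GOTOFF))).  */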
15623 static rtx
15624 legitimize_pic_address (rtx orig, rtx reg)
15625 {
15626 rtx addr = orig;
15627 rtx new_rtx = orig;
15628
15629 #if TARGET_MACHO
15630 if (TARGET_MACHO && !TARGET_64BIT)
15631 {
15632 if (reg == 0)
15633 reg = gen_reg_rtx (Pmode);
15634 /* Use the generic Mach-O PIC machinery. */
15635 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
15636 }
15637 #endif
15638
15639 if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
15640 {
15641 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15642 if (tmp)
15643 return tmp;
15644 }
15645
15646 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
15647 new_rtx = addr;
15648 else if ((!TARGET_64BIT
15649 || /* TARGET_64BIT && */ ix86_cmodel != CM_SMALL_PIC)
15650 && !TARGET_PECOFF
15651 && gotoff_operand (addr, Pmode))
15652 {
15653 /* This symbol may be referenced via a displacement
15654 from the PIC base address (@GOTOFF). */
15655 if (GET_CODE (addr) == CONST)
15656 addr = XEXP (addr, 0);
15657
15658 if (GET_CODE (addr) == PLUS)
15659 {
15660 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
15661 UNSPEC_GOTOFF);
15662 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
15663 }
15664 else
15665 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
15666
15667 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15668
15669 if (TARGET_64BIT)
15670 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15671
15672 if (reg != 0)
15673 {
15674 gcc_assert (REG_P (reg));
15675 new_rtx = expand_simple_binop (Pmode, PLUS, pic_offset_table_rtx,
15676 new_rtx, reg, 1, OPTAB_DIRECT);
15677 }
15678 else
15679 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15680 }
15681 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
15682 /* We can't use @GOTOFF for text labels
15683 on VxWorks, see gotoff_operand. */
15684 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
15685 {
15686 rtx tmp = legitimize_pe_coff_symbol (addr, true);
15687 if (tmp)
15688 return tmp;
15689
15690 /* For x64 PE-COFF there is no GOT table,
15691 so we use address directly. */
15692 if (TARGET_64BIT && TARGET_PECOFF)
15693 {
15694 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
15695 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15696 }
15697 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
15698 {
15699 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
15700 UNSPEC_GOTPCREL);
15701 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15702 new_rtx = gen_const_mem (Pmode, new_rtx);
15703 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15704 }
15705 else
15706 {
15707 /* This symbol must be referenced via a load
15708 from the Global Offset Table (@GOT). */
15709 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
15710 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15711 if (TARGET_64BIT)
15712 new_rtx = force_reg (Pmode, new_rtx);
15713 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15714 new_rtx = gen_const_mem (Pmode, new_rtx);
15715 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
15716 }
15717
15718 new_rtx = copy_to_suggested_reg (new_rtx, reg, Pmode);
15719 }
15720 else
15721 {
15722 if (CONST_INT_P (addr)
15723 && !x86_64_immediate_operand (addr, VOIDmode))
15724 new_rtx = copy_to_suggested_reg (addr, reg, Pmode);
15725 else if (GET_CODE (addr) == CONST)
15726 {
15727 addr = XEXP (addr, 0);
15728
15729 /* We must match stuff we generate before. Assume the only
15730 unspecs that can get here are ours. Not that we could do
15731 anything with them anyway.... */
15732 if (GET_CODE (addr) == UNSPEC
15733 || (GET_CODE (addr) == PLUS
15734 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
15735 return orig;
15736 gcc_assert (GET_CODE (addr) == PLUS);
15737 }
15738
15739 if (GET_CODE (addr) == PLUS)
15740 {
15741 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
15742
15743 /* Check first to see if this is a constant
15744 offset from a @GOTOFF symbol reference. */
15745 if (!TARGET_PECOFF
15746 && gotoff_operand (op0, Pmode)
15747 && CONST_INT_P (op1))
15748 {
15749 if (!TARGET_64BIT)
15750 {
15751 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
15752 UNSPEC_GOTOFF);
15753 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
15754 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
15755
15756 if (reg != 0)
15757 {
15758 gcc_assert (REG_P (reg));
15759 new_rtx = expand_simple_binop (Pmode, PLUS,
15760 pic_offset_table_rtx,
15761 new_rtx, reg, 1,
15762 OPTAB_DIRECT);
15763 }
15764 else
15765 new_rtx
15766 = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
15767 }
15768 else
15769 {
15770 if (INTVAL (op1) < -16*1024*1024
15771 || INTVAL (op1) >= 16*1024*1024)
15772 {
15773 if (!x86_64_immediate_operand (op1, Pmode))
15774 op1 = force_reg (Pmode, op1);
15775
15776 new_rtx
15777 = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
15778 }
15779 }
15780 }
15781 else
15782 {
15783 rtx base = legitimize_pic_address (op0, reg);
15784 machine_mode mode = GET_MODE (base);
15785 new_rtx
15786 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
15787
15788 if (CONST_INT_P (new_rtx))
15789 {
15790 if (INTVAL (new_rtx) < -16*1024*1024
15791 || INTVAL (new_rtx) >= 16*1024*1024)
15792 {
15793 if (!x86_64_immediate_operand (new_rtx, mode))
15794 new_rtx = force_reg (mode, new_rtx);
15795
15796 new_rtx
15797 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
15798 }
15799 else
15800 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
15801 }
15802 else
15803 {
15804 /* For %rip addressing, we have to use
15805 just disp32, not base nor index. */
15806 if (TARGET_64BIT
15807 && (GET_CODE (base) == SYMBOL_REF
15808 || GET_CODE (base) == LABEL_REF))
15809 base = force_reg (mode, base);
15810 if (GET_CODE (new_rtx) == PLUS
15811 && CONSTANT_P (XEXP (new_rtx, 1)))
15812 {
15813 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
15814 new_rtx = XEXP (new_rtx, 1);
15815 }
15816 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
15817 }
15818 }
15819 }
15820 }
15821 return new_rtx;
15822 }
15823 \f
15824 /* Load the thread pointer. If TO_REG is true, force it into a register. */
15825
15826 static rtx
15827 get_thread_pointer (machine_mode tp_mode, bool to_reg)
15828 {
15829 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
15830
15831 if (GET_MODE (tp) != tp_mode)
15832 {
15833 gcc_assert (GET_MODE (tp) == SImode);
15834 gcc_assert (tp_mode == DImode);
15835
15836 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
15837 }
15838
15839 if (to_reg)
15840 tp = copy_to_mode_reg (tp_mode, tp);
15841
15842 return tp;
15843 }
15844
15845 /* Construct the SYMBOL_REF for the tls_get_addr function. */
15846
15847 static GTY(()) rtx ix86_tls_symbol;
15848
15849 static rtx
15850 ix86_tls_get_addr (void)
15851 {
15852 if (!ix86_tls_symbol)
15853 {
15854 const char *sym
15855 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
15856 ? "___tls_get_addr" : "__tls_get_addr");
15857
15858 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
15859 }
15860
15861 if (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF)
15862 {
15863 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, ix86_tls_symbol),
15864 UNSPEC_PLTOFF);
15865 return gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
15866 gen_rtx_CONST (Pmode, unspec));
15867 }
15868
15869 return ix86_tls_symbol;
15870 }
15871
15872 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
15873
15874 static GTY(()) rtx ix86_tls_module_base_symbol;
15875
15876 rtx
15877 ix86_tls_module_base (void)
15878 {
15879 if (!ix86_tls_module_base_symbol)
15880 {
15881 ix86_tls_module_base_symbol
15882 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
15883
15884 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15885 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15886 }
15887
15888 return ix86_tls_module_base_symbol;
15889 }
15890
15891 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
15892 false if we expect this to be used for a memory address and true if
15893 we expect to load the address into a register. */
15894
15895 static rtx
15896 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
15897 {
15898 rtx dest, base, off;
15899 rtx pic = NULL_RTX, tp = NULL_RTX;
15900 machine_mode tp_mode = Pmode;
15901 int type;
15902
15903 /* Fall back to global dynamic model if tool chain cannot support local
15904 dynamic. */
15905 if (TARGET_SUN_TLS && !TARGET_64BIT
15906 && !HAVE_AS_IX86_TLSLDMPLT && !HAVE_AS_IX86_TLSLDM
15907 && model == TLS_MODEL_LOCAL_DYNAMIC)
15908 model = TLS_MODEL_GLOBAL_DYNAMIC;
15909
15910 switch (model)
15911 {
15912 case TLS_MODEL_GLOBAL_DYNAMIC:
15913 dest = gen_reg_rtx (Pmode);
15914
15915 if (!TARGET_64BIT)
15916 {
15917 if (flag_pic && !TARGET_PECOFF)
15918 pic = pic_offset_table_rtx;
15919 else
15920 {
15921 pic = gen_reg_rtx (Pmode);
15922 emit_insn (gen_set_got (pic));
15923 }
15924 }
15925
15926 if (TARGET_GNU2_TLS)
15927 {
15928 if (TARGET_64BIT)
15929 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
15930 else
15931 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
15932
15933 tp = get_thread_pointer (Pmode, true);
15934 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
15935
15936 if (GET_MODE (x) != Pmode)
15937 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15938
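	  /* Record with a REG_EQUAL note that the combined result computes
	     the address of X, so later RTL passes can reuse the value of
	     this GNU2 TLS sequence.  */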
15939 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
15940 }
15941 else
15942 {
15943 rtx caddr = ix86_tls_get_addr ();
15944
15945 if (TARGET_64BIT)
15946 {
15947 rtx rax = gen_rtx_REG (Pmode, AX_REG);
15948 rtx_insn *insns;
15949
15950 start_sequence ();
15951 emit_call_insn
15952 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
15953 insns = get_insns ();
15954 end_sequence ();
15955
15956 if (GET_MODE (x) != Pmode)
15957 x = gen_rtx_ZERO_EXTEND (Pmode, x);
15958
15959 RTL_CONST_CALL_P (insns) = 1;
15960 emit_libcall_block (insns, dest, rax, x);
15961 }
15962 else
15963 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
15964 }
15965 break;
15966
15967 case TLS_MODEL_LOCAL_DYNAMIC:
15968 base = gen_reg_rtx (Pmode);
15969
15970 if (!TARGET_64BIT)
15971 {
15972 if (flag_pic)
15973 pic = pic_offset_table_rtx;
15974 else
15975 {
15976 pic = gen_reg_rtx (Pmode);
15977 emit_insn (gen_set_got (pic));
15978 }
15979 }
15980
15981 if (TARGET_GNU2_TLS)
15982 {
15983 rtx tmp = ix86_tls_module_base ();
15984
15985 if (TARGET_64BIT)
15986 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
15987 else
15988 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
15989
15990 tp = get_thread_pointer (Pmode, true);
15991 set_unique_reg_note (get_last_insn (), REG_EQUAL,
15992 gen_rtx_MINUS (Pmode, tmp, tp));
15993 }
15994 else
15995 {
15996 rtx caddr = ix86_tls_get_addr ();
15997
15998 if (TARGET_64BIT)
15999 {
16000 rtx rax = gen_rtx_REG (Pmode, AX_REG);
16001 rtx_insn *insns;
16002 rtx eqv;
16003
16004 start_sequence ();
16005 emit_call_insn
16006 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
16007 insns = get_insns ();
16008 end_sequence ();
16009
16010 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
16011 share the LD_BASE result with other LD model accesses. */
16012 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
16013 UNSPEC_TLS_LD_BASE);
16014
16015 RTL_CONST_CALL_P (insns) = 1;
16016 emit_libcall_block (insns, base, rax, eqv);
16017 }
16018 else
16019 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
16020 }
16021
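      /* Add the DTP-relative (@dtpoff) offset of X to the module base
	 computed above.  */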
16022 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
16023 off = gen_rtx_CONST (Pmode, off);
16024
16025 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
16026
16027 if (TARGET_GNU2_TLS)
16028 {
16029 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
16030
16031 if (GET_MODE (x) != Pmode)
16032 x = gen_rtx_ZERO_EXTEND (Pmode, x);
16033
16034 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
16035 }
16036 break;
16037
16038 case TLS_MODEL_INITIAL_EXEC:
16039 if (TARGET_64BIT)
16040 {
16041 if (TARGET_SUN_TLS && !TARGET_X32)
16042 {
16043 /* The Sun linker took the AMD64 TLS spec literally
16044 and can only handle %rax as destination of the
16045 initial executable code sequence. */
16046
16047 dest = gen_reg_rtx (DImode);
16048 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
16049 return dest;
16050 }
16051
16052 /* Generate DImode references to avoid %fs:(%reg32)
16053 problems and linker IE->LE relaxation bug. */
16054 tp_mode = DImode;
16055 pic = NULL;
16056 type = UNSPEC_GOTNTPOFF;
16057 }
16058 else if (flag_pic)
16059 {
16060 pic = pic_offset_table_rtx;
16061 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
16062 }
16063 else if (!TARGET_ANY_GNU_TLS)
16064 {
16065 pic = gen_reg_rtx (Pmode);
16066 emit_insn (gen_set_got (pic));
16067 type = UNSPEC_GOTTPOFF;
16068 }
16069 else
16070 {
16071 pic = NULL;
16072 type = UNSPEC_INDNTPOFF;
16073 }
16074
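      /* The TP-relative offset of X is loaded from memory: either from a
	 GOT slot (@gottpoff / @gotntpoff) or from an @indntpoff reference.  */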
16075 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
16076 off = gen_rtx_CONST (tp_mode, off);
16077 if (pic)
16078 off = gen_rtx_PLUS (tp_mode, pic, off);
16079 off = gen_const_mem (tp_mode, off);
16080 set_mem_alias_set (off, ix86_GOT_alias_set ());
16081
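      /* For 64-bit and GNU TLS the loaded offset is added to the thread
	 pointer; otherwise it is subtracted from it.  */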
16082 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16083 {
16084 base = get_thread_pointer (tp_mode,
16085 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16086 off = force_reg (tp_mode, off);
16087 dest = gen_rtx_PLUS (tp_mode, base, off);
16088 if (tp_mode != Pmode)
16089 dest = convert_to_mode (Pmode, dest, 1);
16090 }
16091 else
16092 {
16093 base = get_thread_pointer (Pmode, true);
16094 dest = gen_reg_rtx (Pmode);
16095 emit_insn (ix86_gen_sub3 (dest, base, off));
16096 }
16097 break;
16098
16099 case TLS_MODEL_LOCAL_EXEC:
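      /* For local exec the TP-relative offset is a link-time constant
	 (@ntpoff / @tpoff), so no GOT access is needed.  */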
16100 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
16101 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16102 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
16103 off = gen_rtx_CONST (Pmode, off);
16104
16105 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
16106 {
16107 base = get_thread_pointer (Pmode,
16108 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
16109 return gen_rtx_PLUS (Pmode, base, off);
16110 }
16111 else
16112 {
16113 base = get_thread_pointer (Pmode, true);
16114 dest = gen_reg_rtx (Pmode);
16115 emit_insn (ix86_gen_sub3 (dest, base, off));
16116 }
16117 break;
16118
16119 default:
16120 gcc_unreachable ();
16121 }
16122
16123 return dest;
16124 }
16125
16126 /* Return true if OP refers to a TLS address. */
16127 bool
16128 ix86_tls_address_pattern_p (rtx op)
16129 {
16130 subrtx_var_iterator::array_type array;
16131 FOR_EACH_SUBRTX_VAR (iter, array, op, ALL)
16132 {
16133 rtx op = *iter;
16134 if (MEM_P (op))
16135 {
16136 rtx *x = &XEXP (op, 0);
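	  /* Walk down the chain of PLUS terms looking for a (possibly
	     zero-extended) UNSPEC_TP thread pointer reference.  */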
16137 while (GET_CODE (*x) == PLUS)
16138 {
16139 int i;
16140 for (i = 0; i < 2; i++)
16141 {
16142 rtx u = XEXP (*x, i);
16143 if (GET_CODE (u) == ZERO_EXTEND)
16144 u = XEXP (u, 0);
16145 if (GET_CODE (u) == UNSPEC
16146 && XINT (u, 1) == UNSPEC_TP)
16147 return true;
16148 }
16149 x = &XEXP (*x, 0);
16150 }
16151
16152 iter.skip_subrtxes ();
16153 }
16154 }
16155
16156 return false;
16157 }
16158
16159 /* Rewrite *LOC so that it refers to the default TLS address space. */
16160 void
16161 ix86_rewrite_tls_address_1 (rtx *loc)
16162 {
16163 subrtx_ptr_iterator::array_type array;
16164 FOR_EACH_SUBRTX_PTR (iter, array, loc, ALL)
16165 {
16166 rtx *loc = *iter;
16167 if (MEM_P (*loc))
16168 {
16169 rtx addr = XEXP (*loc, 0);
16170 rtx *x = &addr;
16171 while (GET_CODE (*x) == PLUS)
16172 {
16173 int i;
16174 for (i = 0; i < 2; i++)
16175 {
16176 rtx u = XEXP (*x, i);
16177 if (GET_CODE (u) == ZERO_EXTEND)
16178 u = XEXP (u, 0);
16179 if (GET_CODE (u) == UNSPEC
16180 && XINT (u, 1) == UNSPEC_TP)
16181 {
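		    /* Drop the thread pointer term from the address and
		       express it through the TLS segment (%fs or %gs)
		       address space instead.  */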
16182 addr_space_t as = DEFAULT_TLS_SEG_REG;
16183
16184 *x = XEXP (*x, 1 - i);
16185
16186 *loc = replace_equiv_address_nv (*loc, addr, true);
16187 set_mem_addr_space (*loc, as);
16188 return;
16189 }
16190 }
16191 x = &XEXP (*x, 0);
16192 }
16193
16194 iter.skip_subrtxes ();
16195 }
16196 }
16197 }
16198
16199 /* Rewrite an instruction pattern involving a TLS address
16200    so that it refers to the default TLS address space. */
16201 rtx
16202 ix86_rewrite_tls_address (rtx pattern)
16203 {
16204 pattern = copy_insn (pattern);
16205 ix86_rewrite_tls_address_1 (&pattern);
16206 return pattern;
16207 }
16208
16209 /* Create or return the unique __imp_DECL dllimport symbol corresponding
16210 to symbol DECL if BEIMPORT is true. Otherwise create or return the
16211 unique refptr-DECL symbol corresponding to symbol DECL. */
16212
16213 struct dllimport_hasher : ggc_cache_ptr_hash<tree_map>
16214 {
16215 static inline hashval_t hash (tree_map *m) { return m->hash; }
16216 static inline bool
16217 equal (tree_map *a, tree_map *b)
16218 {
16219 return a->base.from == b->base.from;
16220 }
16221
16222 static int
16223 keep_cache_entry (tree_map *&m)
16224 {
16225 return ggc_marked_p (m->base.from);
16226 }
16227 };
16228
16229 static GTY((cache)) hash_table<dllimport_hasher> *dllimport_map;
16230
16231 static tree
16232 get_dllimport_decl (tree decl, bool beimport)
16233 {
16234 struct tree_map *h, in;
16235 const char *name;
16236 const char *prefix;
16237 size_t namelen, prefixlen;
16238 char *imp_name;
16239 tree to;
16240 rtx rtl;
16241
16242 if (!dllimport_map)
16243 dllimport_map = hash_table<dllimport_hasher>::create_ggc (512);
16244
16245 in.hash = htab_hash_pointer (decl);
16246 in.base.from = decl;
16247 tree_map **loc = dllimport_map->find_slot_with_hash (&in, in.hash, INSERT);
16248 h = *loc;
16249 if (h)
16250 return h->to;
16251
16252 *loc = h = ggc_alloc<tree_map> ();
16253 h->hash = in.hash;
16254 h->base.from = decl;
16255 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
16256 VAR_DECL, NULL, ptr_type_node);
16257 DECL_ARTIFICIAL (to) = 1;
16258 DECL_IGNORED_P (to) = 1;
16259 DECL_EXTERNAL (to) = 1;
16260 TREE_READONLY (to) = 1;
16261
16262 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
16263 name = targetm.strip_name_encoding (name);
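  /* Pick the assembler stub prefix: "__imp_" for dllimport references,
     "refptr." otherwise; the exact spelling depends on whether user
     labels themselves carry a prefix.  */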
16264 if (beimport)
16265 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
16266 ? "*__imp_" : "*__imp__";
16267 else
16268 prefix = user_label_prefix[0] == 0 ? "*.refptr." : "*refptr.";
16269 namelen = strlen (name);
16270 prefixlen = strlen (prefix);
16271 imp_name = (char *) alloca (namelen + prefixlen + 1);
16272 memcpy (imp_name, prefix, prefixlen);
16273 memcpy (imp_name + prefixlen, name, namelen + 1);
16274
16275 name = ggc_alloc_string (imp_name, namelen + prefixlen);
16276 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
16277 SET_SYMBOL_REF_DECL (rtl, to);
16278 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL | SYMBOL_FLAG_STUBVAR;
16279 if (!beimport)
16280 {
16281 SYMBOL_REF_FLAGS (rtl) |= SYMBOL_FLAG_EXTERNAL;
16282 #ifdef SUB_TARGET_RECORD_STUB
16283 SUB_TARGET_RECORD_STUB (name);
16284 #endif
16285 }
16286
16287 rtl = gen_const_mem (Pmode, rtl);
16288 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
16289
16290 SET_DECL_RTL (to, rtl);
16291 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
16292
16293 return to;
16294 }
16295
16296 /* Expand SYMBOL into its corresponding far-address symbol.
16297 WANT_REG is true if we require the result be a register. */
16298
16299 static rtx
16300 legitimize_pe_coff_extern_decl (rtx symbol, bool want_reg)
16301 {
16302 tree imp_decl;
16303 rtx x;
16304
16305 gcc_assert (SYMBOL_REF_DECL (symbol));
16306 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), false);
16307
16308 x = DECL_RTL (imp_decl);
16309 if (want_reg)
16310 x = force_reg (Pmode, x);
16311 return x;
16312 }
16313
16314 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
16315 true if we require the result be a register. */
16316
16317 static rtx
16318 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
16319 {
16320 tree imp_decl;
16321 rtx x;
16322
16323 gcc_assert (SYMBOL_REF_DECL (symbol));
16324 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol), true);
16325
16326 x = DECL_RTL (imp_decl);
16327 if (want_reg)
16328 x = force_reg (Pmode, x);
16329 return x;
16330 }
16331
16332 /* Expand ADDR into its corresponding dllimport or refptr symbol. INREG
16333 is true if we require the result to be a register. */
16334
16335 static rtx
16336 legitimize_pe_coff_symbol (rtx addr, bool inreg)
16337 {
16338 if (!TARGET_PECOFF)
16339 return NULL_RTX;
16340
16341 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16342 {
16343 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
16344 return legitimize_dllimport_symbol (addr, inreg);
16345 if (GET_CODE (addr) == CONST
16346 && GET_CODE (XEXP (addr, 0)) == PLUS
16347 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16348 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
16349 {
16350 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), inreg);
16351 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16352 }
16353 }
16354
16355 if (ix86_cmodel != CM_LARGE_PIC && ix86_cmodel != CM_MEDIUM_PIC)
16356 return NULL_RTX;
16357 if (GET_CODE (addr) == SYMBOL_REF
16358 && !is_imported_p (addr)
16359 && SYMBOL_REF_EXTERNAL_P (addr)
16360 && SYMBOL_REF_DECL (addr))
16361 return legitimize_pe_coff_extern_decl (addr, inreg);
16362
16363 if (GET_CODE (addr) == CONST
16364 && GET_CODE (XEXP (addr, 0)) == PLUS
16365 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
16366 && !is_imported_p (XEXP (XEXP (addr, 0), 0))
16367 && SYMBOL_REF_EXTERNAL_P (XEXP (XEXP (addr, 0), 0))
16368 && SYMBOL_REF_DECL (XEXP (XEXP (addr, 0), 0)))
16369 {
16370 rtx t = legitimize_pe_coff_extern_decl (XEXP (XEXP (addr, 0), 0), inreg);
16371 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
16372 }
16373 return NULL_RTX;
16374 }
16375
16376 /* Try machine-dependent ways of modifying an illegitimate address
16377 to be legitimate. If we find one, return the new, valid address.
16378 This macro is used in only one place: `memory_address' in explow.c.
16379
16380 OLDX is the address as it was before break_out_memory_refs was called.
16381 In some cases it is useful to look at this to decide what needs to be done.
16382
16383 It is always safe for this macro to do nothing. It exists to recognize
16384 opportunities to optimize the output.
16385
16386 For the 80386, we handle X+REG by loading X into a register R and
16387 using R+REG. R will go in a general reg and indexing will be used.
16388 However, if REG is a broken-out memory address or multiplication,
16389 nothing needs to be done because REG can certainly go in a general reg.
16390
16391 When -fpic is used, special handling is needed for symbolic references.
16392 See comments by legitimize_pic_address in i386.c for details. */
16393
16394 static rtx
16395 ix86_legitimize_address (rtx x, rtx, machine_mode mode)
16396 {
16397 bool changed = false;
16398 unsigned log;
16399
16400 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
16401 if (log)
16402 return legitimize_tls_address (x, (enum tls_model) log, false);
16403 if (GET_CODE (x) == CONST
16404 && GET_CODE (XEXP (x, 0)) == PLUS
16405 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
16406 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
16407 {
16408 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
16409 (enum tls_model) log, false);
16410 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
16411 }
16412
16413 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
16414 {
16415 rtx tmp = legitimize_pe_coff_symbol (x, true);
16416 if (tmp)
16417 return tmp;
16418 }
16419
16420 if (flag_pic && SYMBOLIC_CONST (x))
16421 return legitimize_pic_address (x, 0);
16422
16423 #if TARGET_MACHO
16424 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
16425 return machopic_indirect_data_reference (x, 0);
16426 #endif
16427
16428 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16429 if (GET_CODE (x) == ASHIFT
16430 && CONST_INT_P (XEXP (x, 1))
16431 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
16432 {
16433 changed = true;
16434 log = INTVAL (XEXP (x, 1));
16435 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
16436 GEN_INT (1 << log));
16437 }
16438
16439 if (GET_CODE (x) == PLUS)
16440 {
16441 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
16442
16443 if (GET_CODE (XEXP (x, 0)) == ASHIFT
16444 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16445 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
16446 {
16447 changed = true;
16448 log = INTVAL (XEXP (XEXP (x, 0), 1));
16449 XEXP (x, 0) = gen_rtx_MULT (Pmode,
16450 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
16451 GEN_INT (1 << log));
16452 }
16453
16454 if (GET_CODE (XEXP (x, 1)) == ASHIFT
16455 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
16456 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
16457 {
16458 changed = true;
16459 log = INTVAL (XEXP (XEXP (x, 1), 1));
16460 XEXP (x, 1) = gen_rtx_MULT (Pmode,
16461 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
16462 GEN_INT (1 << log));
16463 }
16464
16465 /* Put multiply first if it isn't already. */
16466 if (GET_CODE (XEXP (x, 1)) == MULT)
16467 {
16468 std::swap (XEXP (x, 0), XEXP (x, 1));
16469 changed = true;
16470 }
16471
16472 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
16473 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
16474 created by virtual register instantiation, register elimination, and
16475 similar optimizations. */
16476 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
16477 {
16478 changed = true;
16479 x = gen_rtx_PLUS (Pmode,
16480 gen_rtx_PLUS (Pmode, XEXP (x, 0),
16481 XEXP (XEXP (x, 1), 0)),
16482 XEXP (XEXP (x, 1), 1));
16483 }
16484
16485 /* Canonicalize
16486 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
16487 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
16488 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
16489 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
16490 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
16491 && CONSTANT_P (XEXP (x, 1)))
16492 {
16493 rtx constant;
16494 rtx other = NULL_RTX;
16495
16496 if (CONST_INT_P (XEXP (x, 1)))
16497 {
16498 constant = XEXP (x, 1);
16499 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
16500 }
16501 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
16502 {
16503 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
16504 other = XEXP (x, 1);
16505 }
16506 else
16507 constant = 0;
16508
16509 if (constant)
16510 {
16511 changed = true;
16512 x = gen_rtx_PLUS (Pmode,
16513 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
16514 XEXP (XEXP (XEXP (x, 0), 1), 0)),
16515 plus_constant (Pmode, other,
16516 INTVAL (constant)));
16517 }
16518 }
16519
16520 if (changed && ix86_legitimate_address_p (mode, x, false))
16521 return x;
16522
16523 if (GET_CODE (XEXP (x, 0)) == MULT)
16524 {
16525 changed = true;
16526 XEXP (x, 0) = copy_addr_to_reg (XEXP (x, 0));
16527 }
16528
16529 if (GET_CODE (XEXP (x, 1)) == MULT)
16530 {
16531 changed = true;
16532 XEXP (x, 1) = copy_addr_to_reg (XEXP (x, 1));
16533 }
16534
16535 if (changed
16536 && REG_P (XEXP (x, 1))
16537 && REG_P (XEXP (x, 0)))
16538 return x;
16539
16540 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
16541 {
16542 changed = true;
16543 x = legitimize_pic_address (x, 0);
16544 }
16545
16546 if (changed && ix86_legitimate_address_p (mode, x, false))
16547 return x;
16548
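      /* If one operand is already a register, force the other operand into
	 a register too so the PLUS forms a valid base + index address.  */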
16549 if (REG_P (XEXP (x, 0)))
16550 {
16551 rtx temp = gen_reg_rtx (Pmode);
16552 rtx val = force_operand (XEXP (x, 1), temp);
16553 if (val != temp)
16554 {
16555 val = convert_to_mode (Pmode, val, 1);
16556 emit_move_insn (temp, val);
16557 }
16558
16559 XEXP (x, 1) = temp;
16560 return x;
16561 }
16562
16563 else if (REG_P (XEXP (x, 1)))
16564 {
16565 rtx temp = gen_reg_rtx (Pmode);
16566 rtx val = force_operand (XEXP (x, 0), temp);
16567 if (val != temp)
16568 {
16569 val = convert_to_mode (Pmode, val, 1);
16570 emit_move_insn (temp, val);
16571 }
16572
16573 XEXP (x, 0) = temp;
16574 return x;
16575 }
16576 }
16577
16578 return x;
16579 }
16580 \f
16581 /* Print an integer constant expression in assembler syntax. Addition
16582 and subtraction are the only arithmetic that may appear in these
16583 expressions. FILE is the stdio stream to write to, X is the rtx, and
16584 CODE is the operand print code from the output string. */
16585
16586 static void
16587 output_pic_addr_const (FILE *file, rtx x, int code)
16588 {
16589 char buf[256];
16590
16591 switch (GET_CODE (x))
16592 {
16593 case PC:
16594 gcc_assert (flag_pic);
16595 putc ('.', file);
16596 break;
16597
16598 case SYMBOL_REF:
16599 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
16600 output_addr_const (file, x);
16601 else
16602 {
16603 const char *name = XSTR (x, 0);
16604
16605 /* Mark the decl as referenced so that cgraph will
16606 output the function. */
16607 if (SYMBOL_REF_DECL (x))
16608 mark_decl_referenced (SYMBOL_REF_DECL (x));
16609
16610 #if TARGET_MACHO
16611 if (MACHOPIC_INDIRECT
16612 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
16613 name = machopic_indirection_name (x, /*stub_p=*/true);
16614 #endif
16615 assemble_name (file, name);
16616 }
16617 if (!TARGET_MACHO && !(TARGET_64BIT && TARGET_PECOFF)
16618 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
16619 fputs ("@PLT", file);
16620 break;
16621
16622 case LABEL_REF:
16623 x = XEXP (x, 0);
16624 /* FALLTHRU */
16625 case CODE_LABEL:
16626 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
16627 assemble_name (asm_out_file, buf);
16628 break;
16629
16630 case CONST_INT:
16631 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
16632 break;
16633
16634 case CONST:
16635 /* This used to output parentheses around the expression,
16636 but that does not work on the 386 (either ATT or BSD assembler). */
16637 output_pic_addr_const (file, XEXP (x, 0), code);
16638 break;
16639
16640 case CONST_DOUBLE:
16641 /* We can't handle floating point constants;
16642 TARGET_PRINT_OPERAND must handle them. */
16643 output_operand_lossage ("floating constant misused");
16644 break;
16645
16646 case PLUS:
16647 /* Some assemblers need integer constants to appear first. */
16648 if (CONST_INT_P (XEXP (x, 0)))
16649 {
16650 output_pic_addr_const (file, XEXP (x, 0), code);
16651 putc ('+', file);
16652 output_pic_addr_const (file, XEXP (x, 1), code);
16653 }
16654 else
16655 {
16656 gcc_assert (CONST_INT_P (XEXP (x, 1)));
16657 output_pic_addr_const (file, XEXP (x, 1), code);
16658 putc ('+', file);
16659 output_pic_addr_const (file, XEXP (x, 0), code);
16660 }
16661 break;
16662
16663 case MINUS:
16664 if (!TARGET_MACHO)
16665 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
16666 output_pic_addr_const (file, XEXP (x, 0), code);
16667 putc ('-', file);
16668 output_pic_addr_const (file, XEXP (x, 1), code);
16669 if (!TARGET_MACHO)
16670 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
16671 break;
16672
16673 case UNSPEC:
16674 gcc_assert (XVECLEN (x, 0) == 1);
16675 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
16676 switch (XINT (x, 1))
16677 {
16678 case UNSPEC_GOT:
16679 fputs ("@GOT", file);
16680 break;
16681 case UNSPEC_GOTOFF:
16682 fputs ("@GOTOFF", file);
16683 break;
16684 case UNSPEC_PLTOFF:
16685 fputs ("@PLTOFF", file);
16686 break;
16687 case UNSPEC_PCREL:
16688 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16689 "(%rip)" : "[rip]", file);
16690 break;
16691 case UNSPEC_GOTPCREL:
16692 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16693 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
16694 break;
16695 case UNSPEC_GOTTPOFF:
16696 /* FIXME: This might be @TPOFF in Sun ld too. */
16697 fputs ("@gottpoff", file);
16698 break;
16699 case UNSPEC_TPOFF:
16700 fputs ("@tpoff", file);
16701 break;
16702 case UNSPEC_NTPOFF:
16703 if (TARGET_64BIT)
16704 fputs ("@tpoff", file);
16705 else
16706 fputs ("@ntpoff", file);
16707 break;
16708 case UNSPEC_DTPOFF:
16709 fputs ("@dtpoff", file);
16710 break;
16711 case UNSPEC_GOTNTPOFF:
16712 if (TARGET_64BIT)
16713 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
16714 "@gottpoff(%rip)": "@gottpoff[rip]", file);
16715 else
16716 fputs ("@gotntpoff", file);
16717 break;
16718 case UNSPEC_INDNTPOFF:
16719 fputs ("@indntpoff", file);
16720 break;
16721 #if TARGET_MACHO
16722 case UNSPEC_MACHOPIC_OFFSET:
16723 putc ('-', file);
16724 machopic_output_function_base_name (file);
16725 break;
16726 #endif
16727 default:
16728 output_operand_lossage ("invalid UNSPEC as operand");
16729 break;
16730 }
16731 break;
16732
16733 default:
16734 output_operand_lossage ("invalid expression as operand");
16735 }
16736 }
16737
16738 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
16739 We need to emit DTP-relative relocations. */
16740
16741 static void ATTRIBUTE_UNUSED
16742 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
16743 {
16744 fputs (ASM_LONG, file);
16745 output_addr_const (file, x);
16746 fputs ("@dtpoff", file);
16747 switch (size)
16748 {
16749 case 4:
16750 break;
16751 case 8:
16752 fputs (", 0", file);
16753 break;
16754 default:
16755 gcc_unreachable ();
16756 }
16757 }
16758
16759 /* Return true if X is a representation of the PIC register. This copes
16760 with calls from ix86_find_base_term, where the register might have
16761 been replaced by a cselib value. */
16762
16763 static bool
16764 ix86_pic_register_p (rtx x)
16765 {
16766 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
16767 return (pic_offset_table_rtx
16768 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
16769 else if (!REG_P (x))
16770 return false;
16771 else if (pic_offset_table_rtx)
16772 {
16773 if (REGNO (x) == REGNO (pic_offset_table_rtx))
16774 return true;
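      /* After reload the pseudo PIC register may have been replaced by a
	 hard register; match it through ORIGINAL_REGNO.  */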
16775 if (HARD_REGISTER_P (x)
16776 && !HARD_REGISTER_P (pic_offset_table_rtx)
16777 && ORIGINAL_REGNO (x) == REGNO (pic_offset_table_rtx))
16778 return true;
16779 return false;
16780 }
16781 else
16782 return REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
16783 }
16784
16785 /* Helper function for ix86_delegitimize_address.
16786 Attempt to delegitimize TLS local-exec accesses. */
16787
16788 static rtx
16789 ix86_delegitimize_tls_address (rtx orig_x)
16790 {
16791 rtx x = orig_x, unspec;
16792 struct ix86_address addr;
16793
16794 if (!TARGET_TLS_DIRECT_SEG_REFS)
16795 return orig_x;
16796 if (MEM_P (x))
16797 x = XEXP (x, 0);
16798 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
16799 return orig_x;
16800 if (ix86_decompose_address (x, &addr) == 0
16801 || addr.seg != DEFAULT_TLS_SEG_REG
16802 || addr.disp == NULL_RTX
16803 || GET_CODE (addr.disp) != CONST)
16804 return orig_x;
16805 unspec = XEXP (addr.disp, 0);
16806 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
16807 unspec = XEXP (unspec, 0);
16808 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
16809 return orig_x;
16810 x = XVECEXP (unspec, 0, 0);
16811 gcc_assert (GET_CODE (x) == SYMBOL_REF);
16812 if (unspec != XEXP (addr.disp, 0))
16813 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
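  /* Rebuild the rest of the address (index, scale and base) around the
     recovered symbol.  */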
16814 if (addr.index)
16815 {
16816 rtx idx = addr.index;
16817 if (addr.scale != 1)
16818 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
16819 x = gen_rtx_PLUS (Pmode, idx, x);
16820 }
16821 if (addr.base)
16822 x = gen_rtx_PLUS (Pmode, addr.base, x);
16823 if (MEM_P (orig_x))
16824 x = replace_equiv_address_nv (orig_x, x);
16825 return x;
16826 }
16827
16828 /* In the name of slightly smaller debug output, and to cater to
16829 general assembler lossage, recognize PIC+GOTOFF and turn it back
16830 into a direct symbol reference.
16831
16832 On Darwin, this is necessary to avoid a crash, because Darwin
16833 has a different PIC label for each routine but the DWARF debugging
16834 information is not associated with any particular routine, so it's
16835 necessary to remove references to the PIC label from RTL stored by
16836 the DWARF output code.
16837
16838 This helper is used in the normal ix86_delegitimize_address
16839 entrypoint (e.g. used in the target delegitimization hook) and
16840 in ix86_find_base_term. As compile time memory optimization, we
16841 avoid allocating rtxes that will not change anything on the outcome
16842 of the callers (find_base_value and find_base_term). */
16843
16844 static inline rtx
16845 ix86_delegitimize_address_1 (rtx x, bool base_term_p)
16846 {
16847 rtx orig_x = delegitimize_mem_from_attrs (x);
16848 /* addend is NULL or some rtx if x is something+GOTOFF where
16849 something doesn't include the PIC register. */
16850 rtx addend = NULL_RTX;
16851 /* reg_addend is NULL or a multiple of some register. */
16852 rtx reg_addend = NULL_RTX;
16853 /* const_addend is NULL or a const_int. */
16854 rtx const_addend = NULL_RTX;
16855 /* This is the result, or NULL. */
16856 rtx result = NULL_RTX;
16857
16858 x = orig_x;
16859
16860 if (MEM_P (x))
16861 x = XEXP (x, 0);
16862
16863 if (TARGET_64BIT)
16864 {
16865 if (GET_CODE (x) == CONST
16866 && GET_CODE (XEXP (x, 0)) == PLUS
16867 && GET_MODE (XEXP (x, 0)) == Pmode
16868 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
16869 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
16870 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
16871 {
16872 /* find_base_{value,term} only care about MEMs with arg_pointer_rtx
16873 base. A CONST can't be arg_pointer_rtx based. */
16874 if (base_term_p && MEM_P (orig_x))
16875 return orig_x;
16876 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
16877 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
16878 if (MEM_P (orig_x))
16879 x = replace_equiv_address_nv (orig_x, x);
16880 return x;
16881 }
16882
16883 if (GET_CODE (x) == CONST
16884 && GET_CODE (XEXP (x, 0)) == UNSPEC
16885 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
16886 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
16887 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
16888 {
16889 x = XVECEXP (XEXP (x, 0), 0, 0);
16890 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
16891 {
16892 x = lowpart_subreg (GET_MODE (orig_x), x, GET_MODE (x));
16893 if (x == NULL_RTX)
16894 return orig_x;
16895 }
16896 return x;
16897 }
16898
16899 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
16900 return ix86_delegitimize_tls_address (orig_x);
16901
16902 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
16903 and -mcmodel=medium -fpic. */
16904 }
16905
16906 if (GET_CODE (x) != PLUS
16907 || GET_CODE (XEXP (x, 1)) != CONST)
16908 return ix86_delegitimize_tls_address (orig_x);
16909
16910 if (ix86_pic_register_p (XEXP (x, 0)))
16911 /* %ebx + GOT/GOTOFF */
16912 ;
16913 else if (GET_CODE (XEXP (x, 0)) == PLUS)
16914 {
16915 /* %ebx + %reg * scale + GOT/GOTOFF */
16916 reg_addend = XEXP (x, 0);
16917 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
16918 reg_addend = XEXP (reg_addend, 1);
16919 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
16920 reg_addend = XEXP (reg_addend, 0);
16921 else
16922 {
16923 reg_addend = NULL_RTX;
16924 addend = XEXP (x, 0);
16925 }
16926 }
16927 else
16928 addend = XEXP (x, 0);
16929
16930 x = XEXP (XEXP (x, 1), 0);
16931 if (GET_CODE (x) == PLUS
16932 && CONST_INT_P (XEXP (x, 1)))
16933 {
16934 const_addend = XEXP (x, 1);
16935 x = XEXP (x, 0);
16936 }
16937
16938 if (GET_CODE (x) == UNSPEC
16939 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
16940 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
16941 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
16942 && !MEM_P (orig_x) && !addend)))
16943 result = XVECEXP (x, 0, 0);
16944
16945 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
16946 && !MEM_P (orig_x))
16947 result = XVECEXP (x, 0, 0);
16948
16949 if (! result)
16950 return ix86_delegitimize_tls_address (orig_x);
16951
16952 /* For (PLUS something CONST_INT) both find_base_{value,term} just
16953 recurse on the first operand. */
16954 if (const_addend && !base_term_p)
16955 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
16956 if (reg_addend)
16957 result = gen_rtx_PLUS (Pmode, reg_addend, result);
16958 if (addend)
16959 {
16960 /* If the rest of original X doesn't involve the PIC register, add
16961 addend and subtract pic_offset_table_rtx. This can happen e.g.
16962 for code like:
16963 leal (%ebx, %ecx, 4), %ecx
16964 ...
16965 movl foo@GOTOFF(%ecx), %edx
16966 in which case we return (%ecx - %ebx) + foo
16967 or (%ecx - _GLOBAL_OFFSET_TABLE_) + foo if pseudo_pic_reg
16968 and reload has completed. Don't do the latter for debug,
16969 as _GLOBAL_OFFSET_TABLE_ can't be expressed in the assembly. */
16970 if (pic_offset_table_rtx
16971 && (!reload_completed || !ix86_use_pseudo_pic_reg ()))
16972 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
16973 pic_offset_table_rtx),
16974 result);
16975 else if (base_term_p
16976 && pic_offset_table_rtx
16977 && !TARGET_MACHO
16978 && !TARGET_VXWORKS_RTP)
16979 {
16980 rtx tmp = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
16981 tmp = gen_rtx_MINUS (Pmode, copy_rtx (addend), tmp);
16982 result = gen_rtx_PLUS (Pmode, tmp, result);
16983 }
16984 else
16985 return orig_x;
16986 }
16987 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
16988 {
16989 result = lowpart_subreg (GET_MODE (orig_x), result, Pmode);
16990 if (result == NULL_RTX)
16991 return orig_x;
16992 }
16993 return result;
16994 }
16995
16996 /* The normal instantiation of the above template. */
16997
16998 static rtx
16999 ix86_delegitimize_address (rtx x)
17000 {
17001 return ix86_delegitimize_address_1 (x, false);
17002 }
17003
17004 /* If X is a machine specific address (i.e. a symbol or label being
17005 referenced as a displacement from the GOT implemented using an
17006 UNSPEC), then return the base term. Otherwise return X. */
17007
17008 rtx
17009 ix86_find_base_term (rtx x)
17010 {
17011 rtx term;
17012
17013 if (TARGET_64BIT)
17014 {
17015 if (GET_CODE (x) != CONST)
17016 return x;
17017 term = XEXP (x, 0);
17018 if (GET_CODE (term) == PLUS
17019 && CONST_INT_P (XEXP (term, 1)))
17020 term = XEXP (term, 0);
17021 if (GET_CODE (term) != UNSPEC
17022 || (XINT (term, 1) != UNSPEC_GOTPCREL
17023 && XINT (term, 1) != UNSPEC_PCREL))
17024 return x;
17025
17026 return XVECEXP (term, 0, 0);
17027 }
17028
17029 return ix86_delegitimize_address_1 (x, true);
17030 }
17031
17032 /* Return true if X shouldn't be emitted into the debug info.
17033 Disallow UNSPECs other than @gotoff - we can't easily emit the
17034 _GLOBAL_OFFSET_TABLE_ symbol into the .debug_info section, so instead
17035 of delegitimizing we assemble such references as @gotoff.
17036 Disallow _GLOBAL_OFFSET_TABLE_ SYMBOL_REF - the assembler magically
17037 assembles that as _GLOBAL_OFFSET_TABLE_-. expression. */
17038
17039 static bool
17040 ix86_const_not_ok_for_debug_p (rtx x)
17041 {
17042 if (GET_CODE (x) == UNSPEC && XINT (x, 1) != UNSPEC_GOTOFF)
17043 return true;
17044
17045 if (SYMBOL_REF_P (x) && strcmp (XSTR (x, 0), GOT_SYMBOL_NAME) == 0)
17046 return true;
17047
17048 return false;
17049 }
17050 \f
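/* Print to FILE the condition code suffix corresponding to CODE in mode
   MODE.  If REVERSE, print the suffix for the reversed condition.  FP
   selects the spelling used after floating-point comparisons.  */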
17051 static void
17052 put_condition_code (enum rtx_code code, machine_mode mode, bool reverse,
17053 bool fp, FILE *file)
17054 {
17055 const char *suffix;
17056
17057 if (mode == CCFPmode)
17058 {
17059 code = ix86_fp_compare_code_to_integer (code);
17060 mode = CCmode;
17061 }
17062 if (reverse)
17063 code = reverse_condition (code);
17064
17065 switch (code)
17066 {
17067 case EQ:
17068 gcc_assert (mode != CCGZmode);
17069 switch (mode)
17070 {
17071 case E_CCAmode:
17072 suffix = "a";
17073 break;
17074 case E_CCCmode:
17075 suffix = "c";
17076 break;
17077 case E_CCOmode:
17078 suffix = "o";
17079 break;
17080 case E_CCPmode:
17081 suffix = "p";
17082 break;
17083 case E_CCSmode:
17084 suffix = "s";
17085 break;
17086 default:
17087 suffix = "e";
17088 break;
17089 }
17090 break;
17091 case NE:
17092 gcc_assert (mode != CCGZmode);
17093 switch (mode)
17094 {
17095 case E_CCAmode:
17096 suffix = "na";
17097 break;
17098 case E_CCCmode:
17099 suffix = "nc";
17100 break;
17101 case E_CCOmode:
17102 suffix = "no";
17103 break;
17104 case E_CCPmode:
17105 suffix = "np";
17106 break;
17107 case E_CCSmode:
17108 suffix = "ns";
17109 break;
17110 default:
17111 suffix = "ne";
17112 break;
17113 }
17114 break;
17115 case GT:
17116 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
17117 suffix = "g";
17118 break;
17119 case GTU:
17120 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
17121 Those same assemblers have the same but opposite lossage on cmov. */
17122 if (mode == CCmode)
17123 suffix = fp ? "nbe" : "a";
17124 else
17125 gcc_unreachable ();
17126 break;
17127 case LT:
17128 switch (mode)
17129 {
17130 case E_CCNOmode:
17131 case E_CCGOCmode:
17132 suffix = "s";
17133 break;
17134
17135 case E_CCmode:
17136 case E_CCGCmode:
17137 case E_CCGZmode:
17138 suffix = "l";
17139 break;
17140
17141 default:
17142 gcc_unreachable ();
17143 }
17144 break;
17145 case LTU:
17146 if (mode == CCmode || mode == CCGZmode)
17147 suffix = "b";
17148 else if (mode == CCCmode)
17149 suffix = fp ? "b" : "c";
17150 else
17151 gcc_unreachable ();
17152 break;
17153 case GE:
17154 switch (mode)
17155 {
17156 case E_CCNOmode:
17157 case E_CCGOCmode:
17158 suffix = "ns";
17159 break;
17160
17161 case E_CCmode:
17162 case E_CCGCmode:
17163 case E_CCGZmode:
17164 suffix = "ge";
17165 break;
17166
17167 default:
17168 gcc_unreachable ();
17169 }
17170 break;
17171 case GEU:
17172 if (mode == CCmode || mode == CCGZmode)
17173 suffix = "nb";
17174 else if (mode == CCCmode)
17175 suffix = fp ? "nb" : "nc";
17176 else
17177 gcc_unreachable ();
17178 break;
17179 case LE:
17180 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
17181 suffix = "le";
17182 break;
17183 case LEU:
17184 if (mode == CCmode)
17185 suffix = "be";
17186 else
17187 gcc_unreachable ();
17188 break;
17189 case UNORDERED:
17190 suffix = fp ? "u" : "p";
17191 break;
17192 case ORDERED:
17193 suffix = fp ? "nu" : "np";
17194 break;
17195 default:
17196 gcc_unreachable ();
17197 }
17198 fputs (suffix, file);
17199 }
17200
17201 /* Print the name of register X to FILE based on its machine mode and number.
17202 If CODE is 'w', pretend the mode is HImode.
17203 If CODE is 'b', pretend the mode is QImode.
17204 If CODE is 'k', pretend the mode is SImode.
17205 If CODE is 'q', pretend the mode is DImode.
17206 If CODE is 'x', pretend the mode is V4SFmode.
17207 If CODE is 't', pretend the mode is V8SFmode.
17208 If CODE is 'g', pretend the mode is V16SFmode.
17209 If CODE is 'h', pretend the reg is the 'high' byte register.
17210 If CODE is 'y', print "st(0)" instead of "st" if the reg is a stack op.
17211 If CODE is 'd', duplicate the operand for an AVX instruction.
17212 */
17213
17214 void
17215 print_reg (rtx x, int code, FILE *file)
17216 {
17217 const char *reg;
17218 int msize;
17219 unsigned int regno;
17220 bool duplicated;
17221
17222 if (ASSEMBLER_DIALECT == ASM_ATT)
17223 putc ('%', file);
17224
17225 if (x == pc_rtx)
17226 {
17227 gcc_assert (TARGET_64BIT);
17228 fputs ("rip", file);
17229 return;
17230 }
17231
17232 if (code == 'y' && STACK_TOP_P (x))
17233 {
17234 fputs ("st(0)", file);
17235 return;
17236 }
17237
17238 if (code == 'w')
17239 msize = 2;
17240 else if (code == 'b')
17241 msize = 1;
17242 else if (code == 'k')
17243 msize = 4;
17244 else if (code == 'q')
17245 msize = 8;
17246 else if (code == 'h')
17247 msize = 0;
17248 else if (code == 'x')
17249 msize = 16;
17250 else if (code == 't')
17251 msize = 32;
17252 else if (code == 'g')
17253 msize = 64;
17254 else
17255 msize = GET_MODE_SIZE (GET_MODE (x));
17256
17257 regno = REGNO (x);
17258
17259 if (regno == ARG_POINTER_REGNUM
17260 || regno == FRAME_POINTER_REGNUM
17261 || regno == FPSR_REG
17262 || regno == FPCR_REG)
17263 {
17264 output_operand_lossage
17265 ("invalid use of register '%s'", reg_names[regno]);
17266 return;
17267 }
17268 else if (regno == FLAGS_REG)
17269 {
17270 output_operand_lossage ("invalid use of asm flag output");
17271 return;
17272 }
17273
17274 duplicated = code == 'd' && TARGET_AVX;
17275
17276 switch (msize)
17277 {
17278 case 16:
17279 case 12:
17280 case 8:
17281 if (GENERAL_REGNO_P (regno) && msize > GET_MODE_SIZE (word_mode))
17282 warning (0, "unsupported size for integer register");
17283 /* FALLTHRU */
17284 case 4:
17285 if (LEGACY_INT_REGNO_P (regno))
17286 putc (msize > 4 && TARGET_64BIT ? 'r' : 'e', file);
17287 /* FALLTHRU */
17288 case 2:
17289 normal:
17290 reg = hi_reg_name[regno];
17291 break;
17292 case 1:
17293 if (regno >= ARRAY_SIZE (qi_reg_name))
17294 goto normal;
17295 if (!ANY_QI_REGNO_P (regno))
17296 error ("unsupported size for integer register");
17297 reg = qi_reg_name[regno];
17298 break;
17299 case 0:
17300 if (regno >= ARRAY_SIZE (qi_high_reg_name))
17301 goto normal;
17302 reg = qi_high_reg_name[regno];
17303 break;
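    /* ymm / zmm registers: print the size prefix and reuse the xmm name
       with its leading 'x' skipped.  */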
17304 case 32:
17305 case 64:
17306 if (SSE_REGNO_P (regno))
17307 {
17308 gcc_assert (!duplicated);
17309 putc (msize == 32 ? 'y' : 'z', file);
17310 reg = hi_reg_name[regno] + 1;
17311 break;
17312 }
17313 goto normal;
17314 default:
17315 gcc_unreachable ();
17316 }
17317
17318 fputs (reg, file);
17319
17320 /* Irritatingly, AMD extended registers use
17321 a different naming convention: "r%d[bwd]". */
17322 if (REX_INT_REGNO_P (regno))
17323 {
17324 gcc_assert (TARGET_64BIT);
17325 switch (msize)
17326 {
17327 case 0:
17328 error ("extended registers have no high halves");
17329 break;
17330 case 1:
17331 putc ('b', file);
17332 break;
17333 case 2:
17334 putc ('w', file);
17335 break;
17336 case 4:
17337 putc ('d', file);
17338 break;
17339 case 8:
17340 /* no suffix */
17341 break;
17342 default:
17343 error ("unsupported operand size for extended register");
17344 break;
17345 }
17346 return;
17347 }
17348
17349 if (duplicated)
17350 {
17351 if (ASSEMBLER_DIALECT == ASM_ATT)
17352 fprintf (file, ", %%%s", reg);
17353 else
17354 fprintf (file, ", %s", reg);
17355 }
17356 }
17357
17358 /* Meaning of CODE:
17359 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
17360 C -- print opcode suffix for set/cmov insn.
17361 c -- like C, but print reversed condition
17362 F,f -- likewise, but for floating-point.
17363 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
17364 otherwise nothing
17365 R -- print embedded rounding and sae.
17366 r -- print only sae.
17367 z -- print the opcode suffix for the size of the current operand.
17368 Z -- likewise, with special suffixes for x87 instructions.
17369 * -- print a star (in certain assembler syntax)
17370 A -- print an absolute memory reference.
17371 E -- print address with DImode register names if TARGET_64BIT.
17372 w -- print the operand as if it's a "word" (HImode) even if it isn't.
17373 s -- print a shift double count, followed by the assembler's argument
17374 delimiter.
17375 b -- print the QImode name of the register for the indicated operand.
17376 %b0 would print %al if operands[0] is reg 0.
17377 w -- likewise, print the HImode name of the register.
17378 k -- likewise, print the SImode name of the register.
17379 q -- likewise, print the DImode name of the register.
17380 x -- likewise, print the V4SFmode name of the register.
17381 t -- likewise, print the V8SFmode name of the register.
17382 g -- likewise, print the V16SFmode name of the register.
17383 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
17384 y -- print "st(0)" instead of "st" as a register.
17385 d -- print duplicated register operand for AVX instruction.
17386 D -- print condition for SSE cmp instruction.
17387 P -- if PIC, print an @PLT suffix.
17388 p -- print raw symbol name.
17389 X -- don't print any sort of PIC '@' suffix for a symbol.
17390 & -- print some in-use local-dynamic symbol name.
17391 H -- print a memory address offset by 8; used for sse high-parts
17392 Y -- print condition for XOP pcom* instruction.
17393 + -- print a branch hint as 'cs' or 'ds' prefix
17394 ; -- print a semicolon (after prefixes due to bug in older gas).
17395 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
17396 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
17397 ! -- print MPX prefix for jxx/call/ret instructions if required.
17398 */
17399
17400 void
17401 ix86_print_operand (FILE *file, rtx x, int code)
17402 {
17403 if (code)
17404 {
17405 switch (code)
17406 {
17407 case 'A':
17408 switch (ASSEMBLER_DIALECT)
17409 {
17410 case ASM_ATT:
17411 putc ('*', file);
17412 break;
17413
17414 case ASM_INTEL:
17415 /* Intel syntax. For absolute addresses, registers should not
17416 be surrounded by brackets. */
17417 if (!REG_P (x))
17418 {
17419 putc ('[', file);
17420 ix86_print_operand (file, x, 0);
17421 putc (']', file);
17422 return;
17423 }
17424 break;
17425
17426 default:
17427 gcc_unreachable ();
17428 }
17429
17430 ix86_print_operand (file, x, 0);
17431 return;
17432
17433 case 'E':
17434 /* Wrap address in an UNSPEC to declare special handling. */
17435 if (TARGET_64BIT)
17436 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
17437
17438 output_address (VOIDmode, x);
17439 return;
17440
17441 case 'L':
17442 if (ASSEMBLER_DIALECT == ASM_ATT)
17443 putc ('l', file);
17444 return;
17445
17446 case 'W':
17447 if (ASSEMBLER_DIALECT == ASM_ATT)
17448 putc ('w', file);
17449 return;
17450
17451 case 'B':
17452 if (ASSEMBLER_DIALECT == ASM_ATT)
17453 putc ('b', file);
17454 return;
17455
17456 case 'Q':
17457 if (ASSEMBLER_DIALECT == ASM_ATT)
17458 putc ('l', file);
17459 return;
17460
17461 case 'S':
17462 if (ASSEMBLER_DIALECT == ASM_ATT)
17463 putc ('s', file);
17464 return;
17465
17466 case 'T':
17467 if (ASSEMBLER_DIALECT == ASM_ATT)
17468 putc ('t', file);
17469 return;
17470
17471 case 'O':
17472 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17473 if (ASSEMBLER_DIALECT != ASM_ATT)
17474 return;
17475
17476 switch (GET_MODE_SIZE (GET_MODE (x)))
17477 {
17478 case 2:
17479 putc ('w', file);
17480 break;
17481
17482 case 4:
17483 putc ('l', file);
17484 break;
17485
17486 case 8:
17487 putc ('q', file);
17488 break;
17489
17490 default:
17491 output_operand_lossage ("invalid operand size for operand "
17492 "code 'O'");
17493 return;
17494 }
17495
17496 putc ('.', file);
17497 #endif
17498 return;
17499
17500 case 'z':
17501 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17502 {
17503 /* Opcodes don't get size suffixes if using Intel opcodes. */
17504 if (ASSEMBLER_DIALECT == ASM_INTEL)
17505 return;
17506
17507 switch (GET_MODE_SIZE (GET_MODE (x)))
17508 {
17509 case 1:
17510 putc ('b', file);
17511 return;
17512
17513 case 2:
17514 putc ('w', file);
17515 return;
17516
17517 case 4:
17518 putc ('l', file);
17519 return;
17520
17521 case 8:
17522 putc ('q', file);
17523 return;
17524
17525 default:
17526 output_operand_lossage ("invalid operand size for operand "
17527 "code 'z'");
17528 return;
17529 }
17530 }
17531
17532 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17533 warning (0, "non-integer operand used with operand code 'z'");
17534 /* FALLTHRU */
17535
17536 case 'Z':
17537 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
17538 if (ASSEMBLER_DIALECT == ASM_INTEL)
17539 return;
17540
17541 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
17542 {
17543 switch (GET_MODE_SIZE (GET_MODE (x)))
17544 {
17545 case 2:
17546 #ifdef HAVE_AS_IX86_FILDS
17547 putc ('s', file);
17548 #endif
17549 return;
17550
17551 case 4:
17552 putc ('l', file);
17553 return;
17554
17555 case 8:
17556 #ifdef HAVE_AS_IX86_FILDQ
17557 putc ('q', file);
17558 #else
17559 fputs ("ll", file);
17560 #endif
17561 return;
17562
17563 default:
17564 break;
17565 }
17566 }
17567 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
17568 {
17569 /* 387 opcodes don't get size suffixes
17570 if the operands are registers. */
17571 if (STACK_REG_P (x))
17572 return;
17573
17574 switch (GET_MODE_SIZE (GET_MODE (x)))
17575 {
17576 case 4:
17577 putc ('s', file);
17578 return;
17579
17580 case 8:
17581 putc ('l', file);
17582 return;
17583
17584 case 12:
17585 case 16:
17586 putc ('t', file);
17587 return;
17588
17589 default:
17590 break;
17591 }
17592 }
17593 else
17594 {
17595 output_operand_lossage ("invalid operand type used with "
17596 "operand code 'Z'");
17597 return;
17598 }
17599
17600 output_operand_lossage ("invalid operand size for operand code 'Z'");
17601 return;
17602
17603 case 'd':
17604 case 'b':
17605 case 'w':
17606 case 'k':
17607 case 'q':
17608 case 'h':
17609 case 't':
17610 case 'g':
17611 case 'y':
17612 case 'x':
17613 case 'X':
17614 case 'P':
17615 case 'p':
17616 break;
17617
17618 case 's':
17619 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
17620 {
17621 ix86_print_operand (file, x, 0);
17622 fputs (", ", file);
17623 }
17624 return;
17625
17626 case 'Y':
17627 switch (GET_CODE (x))
17628 {
17629 case NE:
17630 fputs ("neq", file);
17631 break;
17632 case EQ:
17633 fputs ("eq", file);
17634 break;
17635 case GE:
17636 case GEU:
17637 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
17638 break;
17639 case GT:
17640 case GTU:
17641 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
17642 break;
17643 case LE:
17644 case LEU:
17645 fputs ("le", file);
17646 break;
17647 case LT:
17648 case LTU:
17649 fputs ("lt", file);
17650 break;
17651 case UNORDERED:
17652 fputs ("unord", file);
17653 break;
17654 case ORDERED:
17655 fputs ("ord", file);
17656 break;
17657 case UNEQ:
17658 fputs ("ueq", file);
17659 break;
17660 case UNGE:
17661 fputs ("nlt", file);
17662 break;
17663 case UNGT:
17664 fputs ("nle", file);
17665 break;
17666 case UNLE:
17667 fputs ("ule", file);
17668 break;
17669 case UNLT:
17670 fputs ("ult", file);
17671 break;
17672 case LTGT:
17673 fputs ("une", file);
17674 break;
17675 default:
17676 output_operand_lossage ("operand is not a condition code, "
17677 "invalid operand code 'Y'");
17678 return;
17679 }
17680 return;
17681
17682 case 'D':
17683 /* A little bit of braindamage here.  The SSE compare instructions
17684 use completely different names for the comparisons than the
17685 fp conditional moves do. */
17686 switch (GET_CODE (x))
17687 {
17688 case UNEQ:
17689 if (TARGET_AVX)
17690 {
17691 fputs ("eq_us", file);
17692 break;
17693 }
17694 /* FALLTHRU */
17695 case EQ:
17696 fputs ("eq", file);
17697 break;
17698 case UNLT:
17699 if (TARGET_AVX)
17700 {
17701 fputs ("nge", file);
17702 break;
17703 }
17704 /* FALLTHRU */
17705 case LT:
17706 fputs ("lt", file);
17707 break;
17708 case UNLE:
17709 if (TARGET_AVX)
17710 {
17711 fputs ("ngt", file);
17712 break;
17713 }
17714 /* FALLTHRU */
17715 case LE:
17716 fputs ("le", file);
17717 break;
17718 case UNORDERED:
17719 fputs ("unord", file);
17720 break;
17721 case LTGT:
17722 if (TARGET_AVX)
17723 {
17724 fputs ("neq_oq", file);
17725 break;
17726 }
17727 /* FALLTHRU */
17728 case NE:
17729 fputs ("neq", file);
17730 break;
17731 case GE:
17732 if (TARGET_AVX)
17733 {
17734 fputs ("ge", file);
17735 break;
17736 }
17737 /* FALLTHRU */
17738 case UNGE:
17739 fputs ("nlt", file);
17740 break;
17741 case GT:
17742 if (TARGET_AVX)
17743 {
17744 fputs ("gt", file);
17745 break;
17746 }
17747 /* FALLTHRU */
17748 case UNGT:
17749 fputs ("nle", file);
17750 break;
17751 case ORDERED:
17752 fputs ("ord", file);
17753 break;
17754 default:
17755 output_operand_lossage ("operand is not a condition code, "
17756 "invalid operand code 'D'");
17757 return;
17758 }
17759 return;
17760
17761 case 'F':
17762 case 'f':
17763 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
17764 if (ASSEMBLER_DIALECT == ASM_ATT)
17765 putc ('.', file);
17766 gcc_fallthrough ();
17767 #endif
17768
17769 case 'C':
17770 case 'c':
17771 if (!COMPARISON_P (x))
17772 {
17773 output_operand_lossage ("operand is not a condition code, "
17774 "invalid operand code '%c'", code);
17775 return;
17776 }
17777 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
17778 code == 'c' || code == 'f',
17779 code == 'F' || code == 'f',
17780 file);
17781 return;
17782
17783 case 'H':
17784 if (!offsettable_memref_p (x))
17785 {
17786 output_operand_lossage ("operand is not an offsettable memory "
17787 "reference, invalid operand code 'H'");
17788 return;
17789 }
17790 /* It doesn't actually matter what mode we use here, as we're
17791 only going to use this for printing. */
17792 x = adjust_address_nv (x, DImode, 8);
17793 /* Output 'qword ptr' for intel assembler dialect. */
17794 if (ASSEMBLER_DIALECT == ASM_INTEL)
17795 code = 'q';
17796 break;
17797
17798 case 'K':
17799 if (!CONST_INT_P (x))
17800 {
17801 output_operand_lossage ("operand is not an integer, invalid "
17802 "operand code 'K'");
17803 return;
17804 }
17805
17806 if (INTVAL (x) & IX86_HLE_ACQUIRE)
17807 #ifdef HAVE_AS_IX86_HLE
17808 fputs ("xacquire ", file);
17809 #else
17810 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
17811 #endif
17812 else if (INTVAL (x) & IX86_HLE_RELEASE)
17813 #ifdef HAVE_AS_IX86_HLE
17814 fputs ("xrelease ", file);
17815 #else
17816 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
17817 #endif
17818 /* We do not want to print the value of the operand. */
17819 return;
17820
17821 case 'N':
17822 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
17823 fputs ("{z}", file);
17824 return;
17825
17826 case 'r':
17827 if (!CONST_INT_P (x) || INTVAL (x) != ROUND_SAE)
17828 {
17829 output_operand_lossage ("operand is not a specific integer, "
17830 "invalid operand code 'r'");
17831 return;
17832 }
17833
17834 if (ASSEMBLER_DIALECT == ASM_INTEL)
17835 fputs (", ", file);
17836
17837 fputs ("{sae}", file);
17838
17839 if (ASSEMBLER_DIALECT == ASM_ATT)
17840 fputs (", ", file);
17841
17842 return;
17843
17844 case 'R':
17845 if (!CONST_INT_P (x))
17846 {
17847 output_operand_lossage ("operand is not an integer, invalid "
17848 "operand code 'R'");
17849 return;
17850 }
17851
17852 if (ASSEMBLER_DIALECT == ASM_INTEL)
17853 fputs (", ", file);
17854
17855 switch (INTVAL (x))
17856 {
17857 case ROUND_NEAREST_INT | ROUND_SAE:
17858 fputs ("{rn-sae}", file);
17859 break;
17860 case ROUND_NEG_INF | ROUND_SAE:
17861 fputs ("{rd-sae}", file);
17862 break;
17863 case ROUND_POS_INF | ROUND_SAE:
17864 fputs ("{ru-sae}", file);
17865 break;
17866 case ROUND_ZERO | ROUND_SAE:
17867 fputs ("{rz-sae}", file);
17868 break;
17869 default:
17870 output_operand_lossage ("operand is not a specific integer, "
17871 "invalid operand code 'R'");
17872 }
17873
17874 if (ASSEMBLER_DIALECT == ASM_ATT)
17875 fputs (", ", file);
17876
17877 return;
17878
17879 case '*':
17880 if (ASSEMBLER_DIALECT == ASM_ATT)
17881 putc ('*', file);
17882 return;
17883
17884 case '&':
17885 {
17886 const char *name = get_some_local_dynamic_name ();
17887 if (name == NULL)
17888 output_operand_lossage ("'%%&' used without any "
17889 "local dynamic TLS references");
17890 else
17891 assemble_name (file, name);
17892 return;
17893 }
17894
17895 case '+':
17896 {
17897 rtx x;
17898
17899 if (!optimize
17900 || optimize_function_for_size_p (cfun)
17901 || !TARGET_BRANCH_PREDICTION_HINTS)
17902 return;
17903
17904 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
17905 if (x)
17906 {
17907 int pred_val = profile_probability::from_reg_br_prob_note
17908 (XINT (x, 0)).to_reg_br_prob_base ();
17909
17910 if (pred_val < REG_BR_PROB_BASE * 45 / 100
17911 || pred_val > REG_BR_PROB_BASE * 55 / 100)
17912 {
17913 bool taken = pred_val > REG_BR_PROB_BASE / 2;
17914 bool cputaken
17915 = final_forward_branch_p (current_output_insn) == 0;
17916
17917 /* Emit hints only when the default branch prediction
17918 heuristics would fail. */
17919 if (taken != cputaken)
17920 {
17921 /* We use 3e (DS) prefix for taken branches and
17922 2e (CS) prefix for not taken branches. */
17923 if (taken)
17924 fputs ("ds ; ", file);
17925 else
17926 fputs ("cs ; ", file);
17927 }
17928 }
17929 }
17930 return;
17931 }
17932
17933 case ';':
17934 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
17935 putc (';', file);
17936 #endif
17937 return;
17938
17939 case '~':
17940 putc (TARGET_AVX2 ? 'i' : 'f', file);
17941 return;
17942
17943 case '^':
17944 if (TARGET_64BIT && Pmode != word_mode)
17945 fputs ("addr32 ", file);
17946 return;
17947
17948 case '!':
17949 if (ix86_bnd_prefixed_insn_p (current_output_insn))
17950 fputs ("bnd ", file);
17951 if (ix86_notrack_prefixed_insn_p (current_output_insn))
17952 fputs ("notrack ", file);
17953 return;
17954
17955 default:
17956 output_operand_lossage ("invalid operand code '%c'", code);
17957 }
17958 }
17959
17960 if (REG_P (x))
17961 print_reg (x, code, file);
17962
17963 else if (MEM_P (x))
17964 {
17965 rtx addr = XEXP (x, 0);
17966
17967 /* No `byte ptr' prefix for call instructions ... */
17968 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
17969 {
17970 machine_mode mode = GET_MODE (x);
17971 const char *size;
17972
17973 /* Check for explicit size override codes. */
17974 if (code == 'b')
17975 size = "BYTE";
17976 else if (code == 'w')
17977 size = "WORD";
17978 else if (code == 'k')
17979 size = "DWORD";
17980 else if (code == 'q')
17981 size = "QWORD";
17982 else if (code == 'x')
17983 size = "XMMWORD";
17984 else if (code == 't')
17985 size = "YMMWORD";
17986 else if (code == 'g')
17987 size = "ZMMWORD";
17988 else if (mode == BLKmode)
17989 /* ... or BLKmode operands, when not overridden. */
17990 size = NULL;
17991 else
17992 switch (GET_MODE_SIZE (mode))
17993 {
17994 case 1: size = "BYTE"; break;
17995 case 2: size = "WORD"; break;
17996 case 4: size = "DWORD"; break;
17997 case 8: size = "QWORD"; break;
17998 case 12: size = "TBYTE"; break;
17999 case 16:
18000 if (mode == XFmode)
18001 size = "TBYTE";
18002 else
18003 size = "XMMWORD";
18004 break;
18005 case 32: size = "YMMWORD"; break;
18006 case 64: size = "ZMMWORD"; break;
18007 default:
18008 gcc_unreachable ();
18009 }
18010 if (size)
18011 {
18012 fputs (size, file);
18013 fputs (" PTR ", file);
18014 }
18015 }
18016
18017 if (this_is_asm_operands && ! address_operand (addr, VOIDmode))
18018 output_operand_lossage ("invalid constraints for operand");
18019 else
18020 ix86_print_operand_address_as
18021 (file, addr, MEM_ADDR_SPACE (x), code == 'p' || code == 'P');
18022 }
18023
18024 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == SFmode)
18025 {
18026 long l;
18027
18028 REAL_VALUE_TO_TARGET_SINGLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18029
18030 if (ASSEMBLER_DIALECT == ASM_ATT)
18031 putc ('$', file);
18032 /* Sign extend 32bit SFmode immediate to 8 bytes. */
18033 if (code == 'q')
18034 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
18035 (unsigned long long) (int) l);
18036 else
18037 fprintf (file, "0x%08x", (unsigned int) l);
18038 }
18039
18040 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == DFmode)
18041 {
18042 long l[2];
18043
18044 REAL_VALUE_TO_TARGET_DOUBLE (*CONST_DOUBLE_REAL_VALUE (x), l);
18045
18046 if (ASSEMBLER_DIALECT == ASM_ATT)
18047 putc ('$', file);
18048 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
18049 }
18050
18051 /* These float cases don't actually occur as immediate operands. */
18052 else if (CONST_DOUBLE_P (x) && GET_MODE (x) == XFmode)
18053 {
18054 char dstr[30];
18055
18056 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
18057 fputs (dstr, file);
18058 }
18059
18060 else
18061 {
18062 /* We have patterns that allow zero sets of memory, for instance.
18063 In 64-bit mode, we should probably support all 8-byte vectors,
18064 since we can in fact encode that into an immediate. */
18065 if (GET_CODE (x) == CONST_VECTOR)
18066 {
18067 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
18068 x = const0_rtx;
18069 }
18070
18071 if (code != 'P' && code != 'p')
18072 {
18073 if (CONST_INT_P (x))
18074 {
18075 if (ASSEMBLER_DIALECT == ASM_ATT)
18076 putc ('$', file);
18077 }
18078 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
18079 || GET_CODE (x) == LABEL_REF)
18080 {
18081 if (ASSEMBLER_DIALECT == ASM_ATT)
18082 putc ('$', file);
18083 else
18084 fputs ("OFFSET FLAT:", file);
18085 }
18086 }
18087 if (CONST_INT_P (x))
18088 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
18089 else if (flag_pic || MACHOPIC_INDIRECT)
18090 output_pic_addr_const (file, x, code);
18091 else
18092 output_addr_const (file, x);
18093 }
18094 }
18095
18096 static bool
18097 ix86_print_operand_punct_valid_p (unsigned char code)
18098 {
18099 return (code == '*' || code == '+' || code == '&' || code == ';'
18100 || code == '~' || code == '^' || code == '!');
18101 }
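/* Summary of the punctuation codes accepted above (illustrative note,
   derived from the cases in ix86_print_operand): '*' prints a literal
   '*' in AT&T syntax, '&' prints the local dynamic TLS base name, '+'
   emits a ds/cs branch hint prefix, ';' emits ';' when the assembler
   cannot parse rep/lock prefixes, '~' prints 'i' or 'f' depending on
   TARGET_AVX2, '^' emits "addr32 " for 64-bit code with Pmode !=
   word_mode, and '!' emits "bnd "/"notrack " prefixes.  */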
18102 \f
18103 /* Print a memory operand whose address is ADDR. */
18104
18105 static void
18106 ix86_print_operand_address_as (FILE *file, rtx addr,
18107 addr_space_t as, bool no_rip)
18108 {
18109 struct ix86_address parts;
18110 rtx base, index, disp;
18111 int scale;
18112 int ok;
18113 bool vsib = false;
18114 int code = 0;
18115
18116 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
18117 {
18118 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18119 gcc_assert (parts.index == NULL_RTX);
18120 parts.index = XVECEXP (addr, 0, 1);
18121 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
18122 addr = XVECEXP (addr, 0, 0);
18123 vsib = true;
18124 }
18125 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
18126 {
18127 gcc_assert (TARGET_64BIT);
18128 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18129 code = 'q';
18130 }
18131 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDMK_ADDR)
18132 {
18133 ok = ix86_decompose_address (XVECEXP (addr, 0, 1), &parts);
18134 gcc_assert (parts.base == NULL_RTX || parts.index == NULL_RTX);
18135 if (parts.base != NULL_RTX)
18136 {
18137 parts.index = parts.base;
18138 parts.scale = 1;
18139 }
18140 parts.base = XVECEXP (addr, 0, 0);
18141 addr = XVECEXP (addr, 0, 0);
18142 }
18143 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_BNDLDX_ADDR)
18144 {
18145 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
18146 gcc_assert (parts.index == NULL_RTX);
18147 parts.index = XVECEXP (addr, 0, 1);
18148 addr = XVECEXP (addr, 0, 0);
18149 }
18150 else
18151 ok = ix86_decompose_address (addr, &parts);
18152
18153 gcc_assert (ok);
18154
18155 base = parts.base;
18156 index = parts.index;
18157 disp = parts.disp;
18158 scale = parts.scale;
18159
18160 if (ADDR_SPACE_GENERIC_P (as))
18161 as = parts.seg;
18162 else
18163 gcc_assert (ADDR_SPACE_GENERIC_P (parts.seg));
18164
18165 if (!ADDR_SPACE_GENERIC_P (as))
18166 {
18167 const char *string;
18168
18169 if (as == ADDR_SPACE_SEG_FS)
18170 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%fs:" : "fs:");
18171 else if (as == ADDR_SPACE_SEG_GS)
18172 string = (ASSEMBLER_DIALECT == ASM_ATT ? "%gs:" : "gs:");
18173 else
18174 gcc_unreachable ();
18175 fputs (string, file);
18176 }
18177
18178 /* Use one byte shorter RIP relative addressing for 64bit mode. */
18179 if (TARGET_64BIT && !base && !index && !no_rip)
18180 {
18181 rtx symbol = disp;
18182
18183 if (GET_CODE (disp) == CONST
18184 && GET_CODE (XEXP (disp, 0)) == PLUS
18185 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18186 symbol = XEXP (XEXP (disp, 0), 0);
18187
18188 if (GET_CODE (symbol) == LABEL_REF
18189 || (GET_CODE (symbol) == SYMBOL_REF
18190 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
18191 base = pc_rtx;
18192 }
18193
18194 if (!base && !index)
18195 {
18196 /* A displacement-only address requires special attention. */
18197 if (CONST_INT_P (disp))
18198 {
18199 if (ASSEMBLER_DIALECT == ASM_INTEL && ADDR_SPACE_GENERIC_P (as))
18200 fputs ("ds:", file);
18201 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
18202 }
18203 /* Load the external function address via the GOT slot to avoid PLT. */
18204 else if (GET_CODE (disp) == CONST
18205 && GET_CODE (XEXP (disp, 0)) == UNSPEC
18206 && (XINT (XEXP (disp, 0), 1) == UNSPEC_GOTPCREL
18207 || XINT (XEXP (disp, 0), 1) == UNSPEC_GOT)
18208 && ix86_force_load_from_GOT_p (XVECEXP (XEXP (disp, 0), 0, 0)))
18209 output_pic_addr_const (file, disp, 0);
18210 else if (flag_pic)
18211 output_pic_addr_const (file, disp, 0);
18212 else
18213 output_addr_const (file, disp);
18214 }
18215 else
18216 {
18217 /* Print SImode register names to force addr32 prefix. */
18218 if (SImode_address_operand (addr, VOIDmode))
18219 {
18220 if (flag_checking)
18221 {
18222 gcc_assert (TARGET_64BIT);
18223 switch (GET_CODE (addr))
18224 {
18225 case SUBREG:
18226 gcc_assert (GET_MODE (addr) == SImode);
18227 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
18228 break;
18229 case ZERO_EXTEND:
18230 case AND:
18231 gcc_assert (GET_MODE (addr) == DImode);
18232 break;
18233 default:
18234 gcc_unreachable ();
18235 }
18236 }
18237 gcc_assert (!code);
18238 code = 'k';
18239 }
18240 else if (code == 0
18241 && TARGET_X32
18242 && disp
18243 && CONST_INT_P (disp)
18244 && INTVAL (disp) < -16*1024*1024)
18245 {
18246 /* X32 runs in 64-bit mode, where displacement, DISP, in
18247 address DISP(%r64), is encoded as 32-bit immediate sign-
18248 extended from 32-bit to 64-bit. For -0x40000300(%r64),
18249 address is %r64 + 0xffffffffbffffd00. When %r64 <
18250 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
18251 which is invalid for x32. The correct address is %r64
18252 - 0x40000300 == 0xf7ffdd64. To properly encode
18253 -0x40000300(%r64) for x32, we zero-extend negative
18254 displacement by forcing addr32 prefix which truncates
18255 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
18256 zero-extend all negative displacements, including -1(%rsp).
18257 However, for small negative displacements, sign-extension
18258 won't cause overflow. We only zero-extend negative
18259 displacements if they are < -16*1024*1024, which is also the bound
18260 used to check legitimate address displacements for PIC. */
18261 code = 'k';
18262 }
18263
18264 /* Since the upper 32 bits of RSP are always zero for x32,
18265 we can encode %esp as %rsp to avoid 0x67 prefix if
18266 there is no index register. */
18267 if (TARGET_X32 && Pmode == SImode
18268 && !index && base && REG_P (base) && REGNO (base) == SP_REG)
18269 code = 'q';
18270
18271 if (ASSEMBLER_DIALECT == ASM_ATT)
18272 {
18273 if (disp)
18274 {
18275 if (flag_pic)
18276 output_pic_addr_const (file, disp, 0);
18277 else if (GET_CODE (disp) == LABEL_REF)
18278 output_asm_label (disp);
18279 else
18280 output_addr_const (file, disp);
18281 }
18282
18283 putc ('(', file);
18284 if (base)
18285 print_reg (base, code, file);
18286 if (index)
18287 {
18288 putc (',', file);
18289 print_reg (index, vsib ? 0 : code, file);
18290 if (scale != 1 || vsib)
18291 fprintf (file, ",%d", scale);
18292 }
18293 putc (')', file);
18294 }
18295 else
18296 {
18297 rtx offset = NULL_RTX;
18298
18299 if (disp)
18300 {
18301 /* Pull out the offset of a symbol; print any symbol itself. */
18302 if (GET_CODE (disp) == CONST
18303 && GET_CODE (XEXP (disp, 0)) == PLUS
18304 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
18305 {
18306 offset = XEXP (XEXP (disp, 0), 1);
18307 disp = gen_rtx_CONST (VOIDmode,
18308 XEXP (XEXP (disp, 0), 0));
18309 }
18310
18311 if (flag_pic)
18312 output_pic_addr_const (file, disp, 0);
18313 else if (GET_CODE (disp) == LABEL_REF)
18314 output_asm_label (disp);
18315 else if (CONST_INT_P (disp))
18316 offset = disp;
18317 else
18318 output_addr_const (file, disp);
18319 }
18320
18321 putc ('[', file);
18322 if (base)
18323 {
18324 print_reg (base, code, file);
18325 if (offset)
18326 {
18327 if (INTVAL (offset) >= 0)
18328 putc ('+', file);
18329 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18330 }
18331 }
18332 else if (offset)
18333 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
18334 else
18335 putc ('0', file);
18336
18337 if (index)
18338 {
18339 putc ('+', file);
18340 print_reg (index, vsib ? 0 : code, file);
18341 if (scale != 1 || vsib)
18342 fprintf (file, "*%d", scale);
18343 }
18344 putc (']', file);
18345 }
18346 }
18347 }
18348
18349 static void
18350 ix86_print_operand_address (FILE *file, machine_mode /*mode*/, rtx addr)
18351 {
18352 ix86_print_operand_address_as (file, addr, ADDR_SPACE_GENERIC, false);
18353 }
18354
18355 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
18356
18357 static bool
18358 i386_asm_output_addr_const_extra (FILE *file, rtx x)
18359 {
18360 rtx op;
18361
18362 if (GET_CODE (x) != UNSPEC)
18363 return false;
18364
18365 op = XVECEXP (x, 0, 0);
18366 switch (XINT (x, 1))
18367 {
18368 case UNSPEC_GOTOFF:
18369 output_addr_const (file, op);
18370 fputs ("@gotoff", file);
18371 break;
18372 case UNSPEC_GOTTPOFF:
18373 output_addr_const (file, op);
18374 /* FIXME: This might be @TPOFF in Sun ld. */
18375 fputs ("@gottpoff", file);
18376 break;
18377 case UNSPEC_TPOFF:
18378 output_addr_const (file, op);
18379 fputs ("@tpoff", file);
18380 break;
18381 case UNSPEC_NTPOFF:
18382 output_addr_const (file, op);
18383 if (TARGET_64BIT)
18384 fputs ("@tpoff", file);
18385 else
18386 fputs ("@ntpoff", file);
18387 break;
18388 case UNSPEC_DTPOFF:
18389 output_addr_const (file, op);
18390 fputs ("@dtpoff", file);
18391 break;
18392 case UNSPEC_GOTNTPOFF:
18393 output_addr_const (file, op);
18394 if (TARGET_64BIT)
18395 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
18396 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
18397 else
18398 fputs ("@gotntpoff", file);
18399 break;
18400 case UNSPEC_INDNTPOFF:
18401 output_addr_const (file, op);
18402 fputs ("@indntpoff", file);
18403 break;
18404 #if TARGET_MACHO
18405 case UNSPEC_MACHOPIC_OFFSET:
18406 output_addr_const (file, op);
18407 putc ('-', file);
18408 machopic_output_function_base_name (file);
18409 break;
18410 #endif
18411
18412 default:
18413 return false;
18414 }
18415
18416 return true;
18417 }
18418 \f
18419 /* Split one or more double-mode RTL references into pairs of half-mode
18420 references. The RTL can be REG, offsettable MEM, integer constant, or
18421 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
18422 split and "num" is its length. lo_half and hi_half are output arrays
18423 that parallel "operands". */
18424
18425 void
18426 split_double_mode (machine_mode mode, rtx operands[],
18427 int num, rtx lo_half[], rtx hi_half[])
18428 {
18429 machine_mode half_mode;
18430 unsigned int byte;
18431
18432 switch (mode)
18433 {
18434 case E_TImode:
18435 half_mode = DImode;
18436 break;
18437 case E_DImode:
18438 half_mode = SImode;
18439 break;
18440 default:
18441 gcc_unreachable ();
18442 }
18443
18444 byte = GET_MODE_SIZE (half_mode);
18445
18446 while (num--)
18447 {
18448 rtx op = operands[num];
18449
18450 /* simplify_subreg refuses to split volatile memory addresses,
18451 but we still have to handle them. */
18452 if (MEM_P (op))
18453 {
18454 lo_half[num] = adjust_address (op, half_mode, 0);
18455 hi_half[num] = adjust_address (op, half_mode, byte);
18456 }
18457 else
18458 {
18459 lo_half[num] = simplify_gen_subreg (half_mode, op,
18460 GET_MODE (op) == VOIDmode
18461 ? mode : GET_MODE (op), 0);
18462 hi_half[num] = simplify_gen_subreg (half_mode, op,
18463 GET_MODE (op) == VOIDmode
18464 ? mode : GET_MODE (op), byte);
18465 }
18466 }
18467 }
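/* Illustrative example (not part of the original sources): for a
   DImode MEM operand the loop above yields lo_half = the SImode MEM
   at offset 0 and hi_half = the SImode MEM at offset 4; a TImode
   operand is split into DImode halves at offsets 0 and 8.  */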
18468 \f
18469 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
18470 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
18471 is the expression of the binary operation. The output may either be
18472 emitted here, or returned to the caller, like all output_* functions.
18473
18474 There is no guarantee that the operands are the same mode, as they
18475 might be within FLOAT or FLOAT_EXTEND expressions. */
18476
18477 #ifndef SYSV386_COMPAT
18478 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
18479 wants to fix the assemblers because that causes incompatibility
18480 with gcc. No-one wants to fix gcc because that causes
18481 incompatibility with assemblers... You can use the option of
18482 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
18483 #define SYSV386_COMPAT 1
18484 #endif
18485
18486 const char *
18487 output_387_binary_op (rtx_insn *insn, rtx *operands)
18488 {
18489 static char buf[40];
18490 const char *p;
18491 bool is_sse
18492 = (SSE_REG_P (operands[0])
18493 || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]));
18494
18495 if (is_sse)
18496 p = "%v";
18497 else if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
18498 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
18499 p = "fi";
18500 else
18501 p = "f";
18502
18503 strcpy (buf, p);
18504
18505 switch (GET_CODE (operands[3]))
18506 {
18507 case PLUS:
18508 p = "add"; break;
18509 case MINUS:
18510 p = "sub"; break;
18511 case MULT:
18512 p = "mul"; break;
18513 case DIV:
18514 p = "div"; break;
18515 default:
18516 gcc_unreachable ();
18517 }
18518
18519 strcat (buf, p);
18520
18521 if (is_sse)
18522 {
18523 p = (GET_MODE (operands[0]) == SFmode) ? "ss" : "sd";
18524 strcat (buf, p);
18525
18526 if (TARGET_AVX)
18527 p = "\t{%2, %1, %0|%0, %1, %2}";
18528 else
18529 p = "\t{%2, %0|%0, %2}";
18530
18531 strcat (buf, p);
18532 return buf;
18533 }
18534
18535 /* Even if we do not want to check the inputs, this documents the
18536 input constraints, which helps in understanding the following code. */
18537 if (flag_checking)
18538 {
18539 if (STACK_REG_P (operands[0])
18540 && ((REG_P (operands[1])
18541 && REGNO (operands[0]) == REGNO (operands[1])
18542 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
18543 || (REG_P (operands[2])
18544 && REGNO (operands[0]) == REGNO (operands[2])
18545 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
18546 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
18547 ; /* ok */
18548 else
18549 gcc_unreachable ();
18550 }
18551
18552 switch (GET_CODE (operands[3]))
18553 {
18554 case MULT:
18555 case PLUS:
18556 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
18557 std::swap (operands[1], operands[2]);
18558
18559 /* We now know operands[0] == operands[1]. */
18560
18561 if (MEM_P (operands[2]))
18562 {
18563 p = "%Z2\t%2";
18564 break;
18565 }
18566
18567 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18568 {
18569 if (STACK_TOP_P (operands[0]))
18570 /* How is it that we are storing to a dead operand[2]?
18571 Well, presumably operands[1] is dead too. We can't
18572 store the result to st(0) as st(0) gets popped on this
18573 instruction. Instead store to operands[2] (which I
18574 think has to be st(1)). st(1) will be popped later.
18575 gcc <= 2.8.1 didn't have this check and generated
18576 assembly code that the Unixware assembler rejected. */
18577 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18578 else
18579 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18580 break;
18581 }
18582
18583 if (STACK_TOP_P (operands[0]))
18584 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18585 else
18586 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18587 break;
18588
18589 case MINUS:
18590 case DIV:
18591 if (MEM_P (operands[1]))
18592 {
18593 p = "r%Z1\t%1";
18594 break;
18595 }
18596
18597 if (MEM_P (operands[2]))
18598 {
18599 p = "%Z2\t%2";
18600 break;
18601 }
18602
18603 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
18604 {
18605 #if SYSV386_COMPAT
18606 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
18607 derived assemblers, confusingly reverse the direction of
18608 the operation for fsub{r} and fdiv{r} when the
18609 destination register is not st(0). The Intel assembler
18610 doesn't have this brain damage. Read !SYSV386_COMPAT to
18611 figure out what the hardware really does. */
18612 if (STACK_TOP_P (operands[0]))
18613 p = "{p\t%0, %2|rp\t%2, %0}";
18614 else
18615 p = "{rp\t%2, %0|p\t%0, %2}";
18616 #else
18617 if (STACK_TOP_P (operands[0]))
18618 /* As above for fmul/fadd, we can't store to st(0). */
18619 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
18620 else
18621 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
18622 #endif
18623 break;
18624 }
18625
18626 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
18627 {
18628 #if SYSV386_COMPAT
18629 if (STACK_TOP_P (operands[0]))
18630 p = "{rp\t%0, %1|p\t%1, %0}";
18631 else
18632 p = "{p\t%1, %0|rp\t%0, %1}";
18633 #else
18634 if (STACK_TOP_P (operands[0]))
18635 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
18636 else
18637 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
18638 #endif
18639 break;
18640 }
18641
18642 if (STACK_TOP_P (operands[0]))
18643 {
18644 if (STACK_TOP_P (operands[1]))
18645 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
18646 else
18647 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
18648 break;
18649 }
18650 else if (STACK_TOP_P (operands[1]))
18651 {
18652 #if SYSV386_COMPAT
18653 p = "{\t%1, %0|r\t%0, %1}";
18654 #else
18655 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
18656 #endif
18657 }
18658 else
18659 {
18660 #if SYSV386_COMPAT
18661 p = "{r\t%2, %0|\t%0, %2}";
18662 #else
18663 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
18664 #endif
18665 }
18666 break;
18667
18668 default:
18669 gcc_unreachable ();
18670 }
18671
18672 strcat (buf, p);
18673 return buf;
18674 }
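/* Worked example (illustrative): for an SFmode SSE addition with AVX
   enabled, the code above assembles the template
   "%vaddss\t{%2, %1, %0|%0, %1, %2}", which the output machinery
   expands to e.g. "vaddss %xmm2, %xmm1, %xmm0".  */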
18675
18676 /* Return needed mode for entity in optimize_mode_switching pass. */
18677
18678 static int
18679 ix86_dirflag_mode_needed (rtx_insn *insn)
18680 {
18681 if (CALL_P (insn))
18682 {
18683 if (cfun->machine->func_type == TYPE_NORMAL)
18684 return X86_DIRFLAG_ANY;
18685 else
18686 /* No need to emit CLD in interrupt handler for TARGET_CLD. */
18687 return TARGET_CLD ? X86_DIRFLAG_ANY : X86_DIRFLAG_RESET;
18688 }
18689
18690 if (recog_memoized (insn) < 0)
18691 return X86_DIRFLAG_ANY;
18692
18693 if (get_attr_type (insn) == TYPE_STR)
18694 {
18695 /* Emit a cld instruction if stringops are used in the function. */
18696 if (cfun->machine->func_type == TYPE_NORMAL)
18697 return TARGET_CLD ? X86_DIRFLAG_RESET : X86_DIRFLAG_ANY;
18698 else
18699 return X86_DIRFLAG_RESET;
18700 }
18701
18702 return X86_DIRFLAG_ANY;
18703 }
18704
18705 /* Check if a 256bit or 512bit AVX register is referenced inside of EXP. */
18706
18707 static bool
18708 ix86_check_avx_upper_register (const_rtx exp)
18709 {
18710 if (SUBREG_P (exp))
18711 exp = SUBREG_REG (exp);
18712
18713 return (REG_P (exp)
18714 && (VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp))
18715 || VALID_AVX512F_REG_OR_XI_MODE (GET_MODE (exp))));
18716 }
18717
18718 /* Return needed mode for entity in optimize_mode_switching pass. */
18719
18720 static int
18721 ix86_avx_u128_mode_needed (rtx_insn *insn)
18722 {
18723 if (CALL_P (insn))
18724 {
18725 rtx link;
18726
18727 /* Needed mode is set to AVX_U128_CLEAN if there are
18728 no 256bit or 512bit modes used in function arguments. */
18729 for (link = CALL_INSN_FUNCTION_USAGE (insn);
18730 link;
18731 link = XEXP (link, 1))
18732 {
18733 if (GET_CODE (XEXP (link, 0)) == USE)
18734 {
18735 rtx arg = XEXP (XEXP (link, 0), 0);
18736
18737 if (ix86_check_avx_upper_register (arg))
18738 return AVX_U128_DIRTY;
18739 }
18740 }
18741
18742 return AVX_U128_CLEAN;
18743 }
18744
18745 /* Require DIRTY mode if a 256bit or 512bit AVX register is referenced.
18746 Hardware changes state only when a 256bit register is written to,
18747 but we need to prevent the compiler from moving the optimal insertion
18748 point above an eventual read from a 256bit or 512bit register. */
18749 subrtx_iterator::array_type array;
18750 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
18751 if (ix86_check_avx_upper_register (*iter))
18752 return AVX_U128_DIRTY;
18753
18754 return AVX_U128_ANY;
18755 }
18756
18757 /* Return mode that i387 must be switched into
18758 prior to the execution of insn. */
18759
18760 static int
18761 ix86_i387_mode_needed (int entity, rtx_insn *insn)
18762 {
18763 enum attr_i387_cw mode;
18764
18765 /* The mode UNINITIALIZED is used to store the control word after a
18766 function call or ASM pattern. The mode ANY specifies that the function
18767 has no requirements on the control word and makes no changes in the
18768 bits we are interested in. */
18769
18770 if (CALL_P (insn)
18771 || (NONJUMP_INSN_P (insn)
18772 && (asm_noperands (PATTERN (insn)) >= 0
18773 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
18774 return I387_CW_UNINITIALIZED;
18775
18776 if (recog_memoized (insn) < 0)
18777 return I387_CW_ANY;
18778
18779 mode = get_attr_i387_cw (insn);
18780
18781 switch (entity)
18782 {
18783 case I387_TRUNC:
18784 if (mode == I387_CW_TRUNC)
18785 return mode;
18786 break;
18787
18788 case I387_FLOOR:
18789 if (mode == I387_CW_FLOOR)
18790 return mode;
18791 break;
18792
18793 case I387_CEIL:
18794 if (mode == I387_CW_CEIL)
18795 return mode;
18796 break;
18797
18798 case I387_MASK_PM:
18799 if (mode == I387_CW_MASK_PM)
18800 return mode;
18801 break;
18802
18803 default:
18804 gcc_unreachable ();
18805 }
18806
18807 return I387_CW_ANY;
18808 }
18809
18810 /* Return mode that entity must be switched into
18811 prior to the execution of insn. */
18812
18813 static int
18814 ix86_mode_needed (int entity, rtx_insn *insn)
18815 {
18816 switch (entity)
18817 {
18818 case X86_DIRFLAG:
18819 return ix86_dirflag_mode_needed (insn);
18820 case AVX_U128:
18821 return ix86_avx_u128_mode_needed (insn);
18822 case I387_TRUNC:
18823 case I387_FLOOR:
18824 case I387_CEIL:
18825 case I387_MASK_PM:
18826 return ix86_i387_mode_needed (entity, insn);
18827 default:
18828 gcc_unreachable ();
18829 }
18830 return 0;
18831 }
18832
18833 /* Check if a 256bit or 512bit AVX register is referenced in stores. */
18834
18835 static void
18836 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
18837 {
18838 if (ix86_check_avx_upper_register (dest))
18839 {
18840 bool *used = (bool *) data;
18841 *used = true;
18842 }
18843 }
18844
18845 /* Calculate mode of upper 128bit AVX registers after the insn. */
18846
18847 static int
18848 ix86_avx_u128_mode_after (int mode, rtx_insn *insn)
18849 {
18850 rtx pat = PATTERN (insn);
18851
18852 if (vzeroupper_operation (pat, VOIDmode)
18853 || vzeroall_operation (pat, VOIDmode))
18854 return AVX_U128_CLEAN;
18855
18856 /* We know that the state is clean after a CALL insn if no 256bit
18857 or 512bit registers are used for the function return value. */
18858 if (CALL_P (insn))
18859 {
18860 bool avx_upper_reg_found = false;
18861 note_stores (pat, ix86_check_avx_upper_stores, &avx_upper_reg_found);
18862
18863 return avx_upper_reg_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
18864 }
18865
18866 /* Otherwise, return current mode. Remember that if insn
18867 references AVX 256bit or 512bit registers, the mode was already
18868 changed to DIRTY from MODE_NEEDED. */
18869 return mode;
18870 }
18871
18872 /* Return the mode that an insn results in. */
18873
18874 static int
18875 ix86_mode_after (int entity, int mode, rtx_insn *insn)
18876 {
18877 switch (entity)
18878 {
18879 case X86_DIRFLAG:
18880 return mode;
18881 case AVX_U128:
18882 return ix86_avx_u128_mode_after (mode, insn);
18883 case I387_TRUNC:
18884 case I387_FLOOR:
18885 case I387_CEIL:
18886 case I387_MASK_PM:
18887 return mode;
18888 default:
18889 gcc_unreachable ();
18890 }
18891 }
18892
18893 static int
18894 ix86_dirflag_mode_entry (void)
18895 {
18896 /* For TARGET_CLD or in the interrupt handler we can't assume
18897 direction flag state at function entry. */
18898 if (TARGET_CLD
18899 || cfun->machine->func_type != TYPE_NORMAL)
18900 return X86_DIRFLAG_ANY;
18901
18902 return X86_DIRFLAG_RESET;
18903 }
18904
18905 static int
18906 ix86_avx_u128_mode_entry (void)
18907 {
18908 tree arg;
18909
18910 /* Entry mode is set to AVX_U128_DIRTY if there are
18911 256bit or 512bit modes used in function arguments. */
18912 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
18913 arg = TREE_CHAIN (arg))
18914 {
18915 rtx incoming = DECL_INCOMING_RTL (arg);
18916
18917 if (incoming && ix86_check_avx_upper_register (incoming))
18918 return AVX_U128_DIRTY;
18919 }
18920
18921 return AVX_U128_CLEAN;
18922 }
18923
18924 /* Return a mode that ENTITY is assumed to be
18925 switched to at function entry. */
18926
18927 static int
18928 ix86_mode_entry (int entity)
18929 {
18930 switch (entity)
18931 {
18932 case X86_DIRFLAG:
18933 return ix86_dirflag_mode_entry ();
18934 case AVX_U128:
18935 return ix86_avx_u128_mode_entry ();
18936 case I387_TRUNC:
18937 case I387_FLOOR:
18938 case I387_CEIL:
18939 case I387_MASK_PM:
18940 return I387_CW_ANY;
18941 default:
18942 gcc_unreachable ();
18943 }
18944 }
18945
18946 static int
18947 ix86_avx_u128_mode_exit (void)
18948 {
18949 rtx reg = crtl->return_rtx;
18950
18951 /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
18952 or 512bit modes used in the function return register. */
18953 if (reg && ix86_check_avx_upper_register (reg))
18954 return AVX_U128_DIRTY;
18955
18956 return AVX_U128_CLEAN;
18957 }
18958
18959 /* Return a mode that ENTITY is assumed to be
18960 switched to at function exit. */
18961
18962 static int
18963 ix86_mode_exit (int entity)
18964 {
18965 switch (entity)
18966 {
18967 case X86_DIRFLAG:
18968 return X86_DIRFLAG_ANY;
18969 case AVX_U128:
18970 return ix86_avx_u128_mode_exit ();
18971 case I387_TRUNC:
18972 case I387_FLOOR:
18973 case I387_CEIL:
18974 case I387_MASK_PM:
18975 return I387_CW_ANY;
18976 default:
18977 gcc_unreachable ();
18978 }
18979 }
18980
18981 static int
18982 ix86_mode_priority (int, int n)
18983 {
18984 return n;
18985 }
18986
18987 /* Output code to initialize control word copies used by trunc?f?i and
18988 rounding patterns. CURRENT_MODE is set to current control word,
18989 while NEW_MODE is set to new control word. */
18990
18991 static void
18992 emit_i387_cw_initialization (int mode)
18993 {
18994 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
18995 rtx new_mode;
18996
18997 enum ix86_stack_slot slot;
18998
18999 rtx reg = gen_reg_rtx (HImode);
19000
19001 emit_insn (gen_x86_fnstcw_1 (stored_mode));
19002 emit_move_insn (reg, copy_rtx (stored_mode));
19003
19004 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
19005 || optimize_insn_for_size_p ())
19006 {
19007 switch (mode)
19008 {
19009 case I387_CW_TRUNC:
19010 /* round toward zero (truncate) */
19011 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
19012 slot = SLOT_CW_TRUNC;
19013 break;
19014
19015 case I387_CW_FLOOR:
19016 /* round down toward -oo */
19017 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19018 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
19019 slot = SLOT_CW_FLOOR;
19020 break;
19021
19022 case I387_CW_CEIL:
19023 /* round up toward +oo */
19024 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
19025 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
19026 slot = SLOT_CW_CEIL;
19027 break;
19028
19029 case I387_CW_MASK_PM:
19030 /* mask precision exception for nearbyint() */
19031 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19032 slot = SLOT_CW_MASK_PM;
19033 break;
19034
19035 default:
19036 gcc_unreachable ();
19037 }
19038 }
19039 else
19040 {
19041 switch (mode)
19042 {
19043 case I387_CW_TRUNC:
19044 /* round toward zero (truncate) */
19045 emit_insn (gen_insvsi_1 (reg, GEN_INT (0xc)));
19046 slot = SLOT_CW_TRUNC;
19047 break;
19048
19049 case I387_CW_FLOOR:
19050 /* round down toward -oo */
19051 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x4)));
19052 slot = SLOT_CW_FLOOR;
19053 break;
19054
19055 case I387_CW_CEIL:
19056 /* round up toward +oo */
19057 emit_insn (gen_insvsi_1 (reg, GEN_INT (0x8)));
19058 slot = SLOT_CW_CEIL;
19059 break;
19060
19061 case I387_CW_MASK_PM:
19062 /* mask precision exception for nearbyint() */
19063 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
19064 slot = SLOT_CW_MASK_PM;
19065 break;
19066
19067 default:
19068 gcc_unreachable ();
19069 }
19070 }
19071
19072 gcc_assert (slot < MAX_386_STACK_LOCALS);
19073
19074 new_mode = assign_386_stack_local (HImode, slot);
19075 emit_move_insn (new_mode, reg);
19076 }
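/* Background note (illustrative, not from the original sources): bits
   10-11 of the x87 control word select the rounding mode: 00 = to
   nearest, 01 = down (floor), 10 = up (ceil), 11 = toward zero
   (truncate).  The constants 0x0400, 0x0800 and 0x0c00 used above set
   those patterns, while 0x0020 masks the precision exception for
   nearbyint.  */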
19077
19078 /* Emit vzeroupper. */
19079
19080 void
19081 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
19082 {
19083 int i;
19084
19085 /* Cancel automatic vzeroupper insertion if there are
19086 live call-saved SSE registers at the insertion point. */
19087
19088 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19089 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19090 return;
19091
19092 if (TARGET_64BIT)
19093 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19094 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
19095 return;
19096
19097 emit_insn (gen_avx_vzeroupper ());
19098 }
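/* Note (illustrative): together with the mode-switching hooks above,
   this is how "vzeroupper" gets inserted before calls and returns
   whenever the upper halves of the YMM/ZMM registers may be dirty,
   avoiding the SSE/AVX transition penalty.  */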
19099
19102 /* Generate one or more insns to set ENTITY to MODE. REGS_LIVE
19103 is the set of hard registers live at the point where the insn(s)
19104 are to be inserted. */
19105
19106 static void
19107 ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
19108 HARD_REG_SET regs_live)
19109 {
19110 switch (entity)
19111 {
19112 case X86_DIRFLAG:
19113 if (mode == X86_DIRFLAG_RESET)
19114 emit_insn (gen_cld ());
19115 break;
19116 case AVX_U128:
19117 if (mode == AVX_U128_CLEAN)
19118 ix86_avx_emit_vzeroupper (regs_live);
19119 break;
19120 case I387_TRUNC:
19121 case I387_FLOOR:
19122 case I387_CEIL:
19123 case I387_MASK_PM:
19124 if (mode != I387_CW_ANY
19125 && mode != I387_CW_UNINITIALIZED)
19126 emit_i387_cw_initialization (mode);
19127 break;
19128 default:
19129 gcc_unreachable ();
19130 }
19131 }
19132
19133 /* Output code for INSN to convert a float to a signed int. OPERANDS
19134 are the insn operands. The output may be [HSD]Imode and the input
19135 operand may be [SDX]Fmode. */
19136
19137 const char *
19138 output_fix_trunc (rtx_insn *insn, rtx *operands, bool fisttp)
19139 {
19140 bool stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19141 bool dimode_p = GET_MODE (operands[0]) == DImode;
19142 int round_mode = get_attr_i387_cw (insn);
19143
19144 static char buf[40];
19145 const char *p;
19146
19147 /* Jump through a hoop or two for DImode, since the hardware has no
19148 non-popping instruction. We used to do this a different way, but
19149 that was somewhat fragile and broke with post-reload splitters. */
19150 if ((dimode_p || fisttp) && !stack_top_dies)
19151 output_asm_insn ("fld\t%y1", operands);
19152
19153 gcc_assert (STACK_TOP_P (operands[1]));
19154 gcc_assert (MEM_P (operands[0]));
19155 gcc_assert (GET_MODE (operands[1]) != TFmode);
19156
19157 if (fisttp)
19158 return "fisttp%Z0\t%0";
19159
19160 strcpy (buf, "fist");
19161
19162 if (round_mode != I387_CW_ANY)
19163 output_asm_insn ("fldcw\t%3", operands);
19164
19165 p = "p%Z0\t%0";
19166 strcat (buf, p + !(stack_top_dies || dimode_p));
19167
19168 output_asm_insn (buf, operands);
19169
19170 if (round_mode != I387_CW_ANY)
19171 output_asm_insn ("fldcw\t%2", operands);
19172
19173 return "";
19174 }
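/* Illustrative emitted sequence (assuming a non-default rounding mode
   and a dying stack top): "fldcw %3" loads the new control word,
   "fistp%Z0 %0" converts, stores and pops, and "fldcw %2" restores
   the original control word.  With fisttp a single "fisttp%Z0 %0"
   suffices.  */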
19175
19176 /* Output code for x87 ffreep insn. The OPNO argument, which may only
19177 have the values zero or one, indicates the ffreep insn's operand
19178 from the OPERANDS array. */
19179
19180 static const char *
19181 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
19182 {
19183 if (TARGET_USE_FFREEP)
19184 #ifdef HAVE_AS_IX86_FFREEP
19185 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
19186 #else
19187 {
19188 static char retval[32];
19189 int regno = REGNO (operands[opno]);
19190
19191 gcc_assert (STACK_REGNO_P (regno));
19192
19193 regno -= FIRST_STACK_REG;
19194
19195 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
19196 return retval;
19197 }
19198 #endif
19199
19200 return opno ? "fstp\t%y1" : "fstp\t%y0";
19201 }
19202
19203
19204 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
19205 should be used. UNORDERED_P is true when fucom should be used. */
19206
19207 const char *
19208 output_fp_compare (rtx_insn *insn, rtx *operands,
19209 bool eflags_p, bool unordered_p)
19210 {
19211 rtx *xops = eflags_p ? &operands[0] : &operands[1];
19212 bool stack_top_dies;
19213
19214 static char buf[40];
19215 const char *p;
19216
19217 gcc_assert (STACK_TOP_P (xops[0]));
19218
19219 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG);
19220
19221 if (eflags_p)
19222 {
19223 p = unordered_p ? "fucomi" : "fcomi";
19224 strcpy (buf, p);
19225
19226 p = "p\t{%y1, %0|%0, %y1}";
19227 strcat (buf, p + !stack_top_dies);
19228
19229 return buf;
19230 }
19231
19232 if (STACK_REG_P (xops[1])
19233 && stack_top_dies
19234 && find_regno_note (insn, REG_DEAD, FIRST_STACK_REG + 1))
19235 {
19236 gcc_assert (REGNO (xops[1]) == FIRST_STACK_REG + 1);
19237
19238 /* If both the top of the 387 stack and the other operand (also
19239 a stack register) die, then this must be a `fcompp' float
19240 compare. */
19241 p = unordered_p ? "fucompp" : "fcompp";
19242 strcpy (buf, p);
19243 }
19244 else if (const0_operand (xops[1], VOIDmode))
19245 {
19246 gcc_assert (!unordered_p);
19247 strcpy (buf, "ftst");
19248 }
19249 else
19250 {
19251 if (GET_MODE_CLASS (GET_MODE (xops[1])) == MODE_INT)
19252 {
19253 gcc_assert (!unordered_p);
19254 p = "ficom";
19255 }
19256 else
19257 p = unordered_p ? "fucom" : "fcom";
19258
19259 strcpy (buf, p);
19260
19261 p = "p%Z2\t%y2";
19262 strcat (buf, p + !stack_top_dies);
19263 }
19264
19265 output_asm_insn (buf, operands);
19266 return "fnstsw\t%0";
19267 }
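/* Worked example (illustrative): for an fcomi-style compare where the
   stack top dies, the code above yields "fcomip\t{%y1, %0|%0, %y1}";
   when it does not die the leading 'p' of the suffix is skipped and
   plain "fcomi" is used.  */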
19268
19269 void
19270 ix86_output_addr_vec_elt (FILE *file, int value)
19271 {
19272 const char *directive = ASM_LONG;
19273
19274 #ifdef ASM_QUAD
19275 if (TARGET_LP64)
19276 directive = ASM_QUAD;
19277 #else
19278 gcc_assert (!TARGET_64BIT);
19279 #endif
19280
19281 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
19282 }
19283
19284 void
19285 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
19286 {
19287 const char *directive = ASM_LONG;
19288
19289 #ifdef ASM_QUAD
19290 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
19291 directive = ASM_QUAD;
19292 #else
19293 gcc_assert (!TARGET_64BIT);
19294 #endif
19295 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
19296 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
19297 fprintf (file, "%s%s%d-%s%d\n",
19298 directive, LPREFIX, value, LPREFIX, rel);
19299 else if (HAVE_AS_GOTOFF_IN_DATA)
19300 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
19301 #if TARGET_MACHO
19302 else if (TARGET_MACHO)
19303 {
19304 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
19305 machopic_output_function_base_name (file);
19306 putc ('\n', file);
19307 }
19308 #endif
19309 else
19310 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
19311 GOT_SYMBOL_NAME, LPREFIX, value);
19312 }
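/* Example output (illustrative; the .L labels are hypothetical): on
   x86-64 an entry is typically ".long .L5-.L2" (or ".quad" for a
   DImode case vector), while 32-bit PIC code with GOTOFF support in
   data emits ".long .L5@GOTOFF".  */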
19313 \f
19314 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
19315 for the target. */
19316
19317 void
19318 ix86_expand_clear (rtx dest)
19319 {
19320 rtx tmp;
19321
19322 /* We play register width games, which are only valid after reload. */
19323 gcc_assert (reload_completed);
19324
19325 /* Avoid HImode and its attendant prefix byte. */
19326 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
19327 dest = gen_rtx_REG (SImode, REGNO (dest));
19328 tmp = gen_rtx_SET (dest, const0_rtx);
19329
19330 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
19331 {
19332 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19333 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
19334 }
19335
19336 emit_insn (tmp);
19337 }
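/* Illustrative note: for (reg:SI ax) this expands to
   "xorl %eax, %eax" (with a FLAGS clobber) in the common case, or to
   "movl $0, %eax" when TARGET_USE_MOV0 is set and we are not
   optimizing for size.  */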
19338
19339 void
19340 ix86_expand_move (machine_mode mode, rtx operands[])
19341 {
19342 rtx op0, op1;
19343 rtx tmp, addend = NULL_RTX;
19344 enum tls_model model;
19345
19346 op0 = operands[0];
19347 op1 = operands[1];
19348
19349 switch (GET_CODE (op1))
19350 {
19351 case CONST:
19352 tmp = XEXP (op1, 0);
19353
19354 if (GET_CODE (tmp) != PLUS
19355 || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
19356 break;
19357
19358 op1 = XEXP (tmp, 0);
19359 addend = XEXP (tmp, 1);
19360 /* FALLTHRU */
19361
19362 case SYMBOL_REF:
19363 model = SYMBOL_REF_TLS_MODEL (op1);
19364
19365 if (model)
19366 op1 = legitimize_tls_address (op1, model, true);
19367 else if (ix86_force_load_from_GOT_p (op1))
19368 {
19369 /* Load the external function address via GOT slot to avoid PLT. */
19370 op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
19371 (TARGET_64BIT
19372 ? UNSPEC_GOTPCREL
19373 : UNSPEC_GOT));
19374 op1 = gen_rtx_CONST (Pmode, op1);
19375 op1 = gen_const_mem (Pmode, op1);
19376 set_mem_alias_set (op1, ix86_GOT_alias_set ());
19377 }
19378 else
19379 {
19380 tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
19381 if (tmp)
19382 {
19383 op1 = tmp;
19384 if (!addend)
19385 break;
19386 }
19387 else
19388 {
19389 op1 = operands[1];
19390 break;
19391 }
19392 }
19393
19394 if (addend)
19395 {
19396 op1 = force_operand (op1, NULL_RTX);
19397 op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
19398 op0, 1, OPTAB_DIRECT);
19399 }
19400 else
19401 op1 = force_operand (op1, op0);
19402
19403 if (op1 == op0)
19404 return;
19405
19406 op1 = convert_to_mode (mode, op1, 1);
19407
19408 default:
19409 break;
19410 }
19411
19412 if ((flag_pic || MACHOPIC_INDIRECT)
19413 && symbolic_operand (op1, mode))
19414 {
19415 if (TARGET_MACHO && !TARGET_64BIT)
19416 {
19417 #if TARGET_MACHO
19418 /* dynamic-no-pic */
19419 if (MACHOPIC_INDIRECT)
19420 {
19421 rtx temp = (op0 && REG_P (op0) && mode == Pmode)
19422 ? op0 : gen_reg_rtx (Pmode);
19423 op1 = machopic_indirect_data_reference (op1, temp);
19424 if (MACHOPIC_PURE)
19425 op1 = machopic_legitimize_pic_address (op1, mode,
19426 temp == op1 ? 0 : temp);
19427 }
19428 if (op0 != op1 && GET_CODE (op0) != MEM)
19429 {
19430 rtx insn = gen_rtx_SET (op0, op1);
19431 emit_insn (insn);
19432 return;
19433 }
19434 if (GET_CODE (op0) == MEM)
19435 op1 = force_reg (Pmode, op1);
19436 else
19437 {
19438 rtx temp = op0;
19439 if (GET_CODE (temp) != REG)
19440 temp = gen_reg_rtx (Pmode);
19441 temp = legitimize_pic_address (op1, temp);
19442 if (temp == op0)
19443 return;
19444 op1 = temp;
19445 }
19446 /* dynamic-no-pic */
19447 #endif
19448 }
19449 else
19450 {
19451 if (MEM_P (op0))
19452 op1 = force_reg (mode, op1);
19453 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
19454 {
19455 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
19456 op1 = legitimize_pic_address (op1, reg);
19457 if (op0 == op1)
19458 return;
19459 op1 = convert_to_mode (mode, op1, 1);
19460 }
19461 }
19462 }
19463 else
19464 {
19465 if (MEM_P (op0)
19466 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
19467 || !push_operand (op0, mode))
19468 && MEM_P (op1))
19469 op1 = force_reg (mode, op1);
19470
19471 if (push_operand (op0, mode)
19472 && ! general_no_elim_operand (op1, mode))
19473 op1 = copy_to_mode_reg (mode, op1);
19474
19475 /* Force large constants in 64bit compilation into a register
19476 to get them CSEed. */
19477 if (can_create_pseudo_p ()
19478 && (mode == DImode) && TARGET_64BIT
19479 && immediate_operand (op1, mode)
19480 && !x86_64_zext_immediate_operand (op1, VOIDmode)
19481 && !register_operand (op0, mode)
19482 && optimize)
19483 op1 = copy_to_mode_reg (mode, op1);
19484
19485 if (can_create_pseudo_p ()
19486 && CONST_DOUBLE_P (op1))
19487 {
19488 /* If we are loading a floating point constant to a register,
19489 force the value to memory now, since we'll get better code
19490 out of the back end. */
19491
19492 op1 = validize_mem (force_const_mem (mode, op1));
19493 if (!register_operand (op0, mode))
19494 {
19495 rtx temp = gen_reg_rtx (mode);
19496 emit_insn (gen_rtx_SET (temp, op1));
19497 emit_move_insn (op0, temp);
19498 return;
19499 }
19500 }
19501 }
19502
19503 emit_insn (gen_rtx_SET (op0, op1));
19504 }
19505
19506 void
19507 ix86_expand_vector_move (machine_mode mode, rtx operands[])
19508 {
19509 rtx op0 = operands[0], op1 = operands[1];
19510 /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for the IA MCU
19511 psABI since the biggest alignment is 4 bytes for the IA MCU psABI. */
19512 unsigned int align = (TARGET_IAMCU
19513 ? GET_MODE_BITSIZE (mode)
19514 : GET_MODE_ALIGNMENT (mode));
19515
19516 if (push_operand (op0, VOIDmode))
19517 op0 = emit_move_resolve_push (mode, op0);
19518
19519 /* Force constants other than zero into memory. We do not know how
19520 the instructions used to build constants modify the upper 64 bits
19521 of the register; once we have that information we may be able
19522 to handle some of them more efficiently. */
19523 if (can_create_pseudo_p ()
19524 && (CONSTANT_P (op1)
19525 || (SUBREG_P (op1)
19526 && CONSTANT_P (SUBREG_REG (op1))))
19527 && ((register_operand (op0, mode)
19528 && !standard_sse_constant_p (op1, mode))
19529 /* ix86_expand_vector_move_misalign() does not like constants. */
19530 || (SSE_REG_MODE_P (mode)
19531 && MEM_P (op0)
19532 && MEM_ALIGN (op0) < align)))
19533 {
19534 if (SUBREG_P (op1))
19535 {
19536 machine_mode imode = GET_MODE (SUBREG_REG (op1));
19537 rtx r = force_const_mem (imode, SUBREG_REG (op1));
19538 if (r)
19539 r = validize_mem (r);
19540 else
19541 r = force_reg (imode, SUBREG_REG (op1));
19542 op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
19543 }
19544 else
19545 op1 = validize_mem (force_const_mem (mode, op1));
19546 }
19547
19548 /* We need to check memory alignment for SSE mode since an attribute
19549 can make operands unaligned. */
19550 if (can_create_pseudo_p ()
19551 && SSE_REG_MODE_P (mode)
19552 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
19553 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
19554 {
19555 rtx tmp[2];
19556
19557 /* ix86_expand_vector_move_misalign() does not like both
19558 arguments in memory. */
19559 if (!register_operand (op0, mode)
19560 && !register_operand (op1, mode))
19561 op1 = force_reg (mode, op1);
19562
19563 tmp[0] = op0; tmp[1] = op1;
19564 ix86_expand_vector_move_misalign (mode, tmp);
19565 return;
19566 }
19567
19568 /* Make operand1 a register if it isn't already. */
19569 if (can_create_pseudo_p ()
19570 && !register_operand (op0, mode)
19571 && !register_operand (op1, mode))
19572 {
19573 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
19574 return;
19575 }
19576
19577 emit_insn (gen_rtx_SET (op0, op1));
19578 }
19579
19580 /* Split 32-byte AVX unaligned load and store if needed. */
19581
19582 static void
19583 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
19584 {
19585 rtx m;
19586 rtx (*extract) (rtx, rtx, rtx);
19587 machine_mode mode;
19588
19589 if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
19590 || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
19591 {
19592 emit_insn (gen_rtx_SET (op0, op1));
19593 return;
19594 }
19595
19596 rtx orig_op0 = NULL_RTX;
19597 mode = GET_MODE (op0);
19598 switch (GET_MODE_CLASS (mode))
19599 {
19600 case MODE_VECTOR_INT:
19601 case MODE_INT:
19602 if (mode != V32QImode)
19603 {
19604 if (!MEM_P (op0))
19605 {
19606 orig_op0 = op0;
19607 op0 = gen_reg_rtx (V32QImode);
19608 }
19609 else
19610 op0 = gen_lowpart (V32QImode, op0);
19611 op1 = gen_lowpart (V32QImode, op1);
19612 mode = V32QImode;
19613 }
19614 break;
19615 case MODE_VECTOR_FLOAT:
19616 break;
19617 default:
19618 gcc_unreachable ();
19619 }
19620
19621 switch (mode)
19622 {
19623 default:
19624 gcc_unreachable ();
19625 case E_V32QImode:
19626 extract = gen_avx_vextractf128v32qi;
19627 mode = V16QImode;
19628 break;
19629 case E_V8SFmode:
19630 extract = gen_avx_vextractf128v8sf;
19631 mode = V4SFmode;
19632 break;
19633 case E_V4DFmode:
19634 extract = gen_avx_vextractf128v4df;
19635 mode = V2DFmode;
19636 break;
19637 }
19638
19639 if (MEM_P (op1))
19640 {
19641 rtx r = gen_reg_rtx (mode);
19642 m = adjust_address (op1, mode, 0);
19643 emit_move_insn (r, m);
19644 m = adjust_address (op1, mode, 16);
19645 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
19646 emit_move_insn (op0, r);
19647 }
19648 else if (MEM_P (op0))
19649 {
19650 m = adjust_address (op0, mode, 0);
19651 emit_insn (extract (m, op1, const0_rtx));
19652 m = adjust_address (op0, mode, 16);
19653 emit_insn (extract (m, copy_rtx (op1), const1_rtx));
19654 }
19655 else
19656 gcc_unreachable ();
19657
19658 if (orig_op0)
19659 emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
19660 }
19661
19662 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
19663 straight to ix86_expand_vector_move. */
19664 /* Code generation for scalar reg-reg moves of single and double precision data:
19665 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
19666 movaps reg, reg
19667 else
19668 movss reg, reg
19669 if (x86_sse_partial_reg_dependency == true)
19670 movapd reg, reg
19671 else
19672 movsd reg, reg
19673
19674 Code generation for scalar loads of double precision data:
19675 if (x86_sse_split_regs == true)
19676 movlpd mem, reg (gas syntax)
19677 else
19678 movsd mem, reg
19679
19680 Code generation for unaligned packed loads of single precision data
19681 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
19682 if (x86_sse_unaligned_move_optimal)
19683 movups mem, reg
19684
19685 if (x86_sse_partial_reg_dependency == true)
19686 {
19687 xorps reg, reg
19688 movlps mem, reg
19689 movhps mem+8, reg
19690 }
19691 else
19692 {
19693 movlps mem, reg
19694 movhps mem+8, reg
19695 }
19696
19697 Code generation for unaligned packed loads of double precision data
19698 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
19699 if (x86_sse_unaligned_move_optimal)
19700 movupd mem, reg
19701
19702 if (x86_sse_split_regs == true)
19703 {
19704 movlpd mem, reg
19705 movhpd mem+8, reg
19706 }
19707 else
19708 {
19709 movsd mem, reg
19710 movhpd mem+8, reg
19711 }
19712 */
19713
19714 void
19715 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
19716 {
19717 rtx op0, op1, m;
19718
19719 op0 = operands[0];
19720 op1 = operands[1];
19721
19722 /* Use unaligned load/store for AVX512 or when optimizing for size. */
19723 if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
19724 {
19725 emit_insn (gen_rtx_SET (op0, op1));
19726 return;
19727 }
19728
19729 if (TARGET_AVX)
19730 {
19731 if (GET_MODE_SIZE (mode) == 32)
19732 ix86_avx256_split_vector_move_misalign (op0, op1);
19733 else
19734 /* Always use 128-bit mov<mode>_internal pattern for AVX. */
19735 emit_insn (gen_rtx_SET (op0, op1));
19736 return;
19737 }
19738
19739 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
19740 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
19741 {
19742 emit_insn (gen_rtx_SET (op0, op1));
19743 return;
19744 }
19745
19746 /* ??? If we have typed data, then it would appear that using
19747 movdqu is the only way to get unaligned data loaded with
19748 integer type. */
19749 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
19750 {
19751 emit_insn (gen_rtx_SET (op0, op1));
19752 return;
19753 }
19754
19755 if (MEM_P (op1))
19756 {
19757 if (TARGET_SSE2 && mode == V2DFmode)
19758 {
19759 rtx zero;
19760
19761 /* When SSE registers are split into halves, we can avoid
19762 writing to the top half twice. */
19763 if (TARGET_SSE_SPLIT_REGS)
19764 {
19765 emit_clobber (op0);
19766 zero = op0;
19767 }
19768 else
19769 {
19770 /* ??? Not sure about the best option for the Intel chips.
19771 The following would seem to satisfy; the register is
19772 entirely cleared, breaking the dependency chain. We
19773 then store to the upper half, with a dependency depth
19774 of one. A rumor has it that Intel recommends two movsd
19775 followed by an unpacklpd, but this is unconfirmed. And
19776 given that the dependency depth of the unpacklpd would
19777 still be one, I'm not sure why this would be better. */
19778 zero = CONST0_RTX (V2DFmode);
19779 }
19780
19781 m = adjust_address (op1, DFmode, 0);
19782 emit_insn (gen_sse2_loadlpd (op0, zero, m));
19783 m = adjust_address (op1, DFmode, 8);
19784 emit_insn (gen_sse2_loadhpd (op0, op0, m));
19785 }
19786 else
19787 {
19788 rtx t;
19789
19790 if (mode != V4SFmode)
19791 t = gen_reg_rtx (V4SFmode);
19792 else
19793 t = op0;
19794
19795 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
19796 emit_move_insn (t, CONST0_RTX (V4SFmode));
19797 else
19798 emit_clobber (t);
19799
19800 m = adjust_address (op1, V2SFmode, 0);
19801 emit_insn (gen_sse_loadlps (t, t, m));
19802 m = adjust_address (op1, V2SFmode, 8);
19803 emit_insn (gen_sse_loadhps (t, t, m));
19804 if (mode != V4SFmode)
19805 emit_move_insn (op0, gen_lowpart (mode, t));
19806 }
19807 }
19808 else if (MEM_P (op0))
19809 {
19810 if (TARGET_SSE2 && mode == V2DFmode)
19811 {
19812 m = adjust_address (op0, DFmode, 0);
19813 emit_insn (gen_sse2_storelpd (m, op1));
19814 m = adjust_address (op0, DFmode, 8);
19815 emit_insn (gen_sse2_storehpd (m, op1));
19816 }
19817 else
19818 {
19819 if (mode != V4SFmode)
19820 op1 = gen_lowpart (V4SFmode, op1);
19821
19822 m = adjust_address (op0, V2SFmode, 0);
19823 emit_insn (gen_sse_storelps (m, op1));
19824 m = adjust_address (op0, V2SFmode, 8);
19825 emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
19826 }
19827 }
19828 else
19829 gcc_unreachable ();
19830 }
19831
19832 /* Helper function of ix86_fixup_binary_operands to canonicalize
19833 operand order. Returns true if the operands should be swapped. */
19834
19835 static bool
19836 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
19837 rtx operands[])
19838 {
19839 rtx dst = operands[0];
19840 rtx src1 = operands[1];
19841 rtx src2 = operands[2];
19842
19843 /* If the operation is not commutative, we can't do anything. */
19844 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
19845 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
19846 return false;
19847
19848 /* Highest priority is that src1 should match dst. */
19849 if (rtx_equal_p (dst, src1))
19850 return false;
19851 if (rtx_equal_p (dst, src2))
19852 return true;
19853
19854 /* Next highest priority is that immediate constants come second. */
19855 if (immediate_operand (src2, mode))
19856 return false;
19857 if (immediate_operand (src1, mode))
19858 return true;
19859
19860 /* Lowest priority is that memory references should come second. */
19861 if (MEM_P (src2))
19862 return false;
19863 if (MEM_P (src1))
19864 return true;
19865
19866 return false;
19867 }
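/* Illustrative example: given "dst = src2 + dst" the check above
   returns true, so the caller swaps the sources and the addition
   becomes "dst = dst + src2", matching the two-address machine
   form.  */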
19868
19869
19870 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
19871 destination to use for the operation. If different from the true
19872 destination in operands[0], a copy operation will be required. */
19873
19874 rtx
19875 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
19876 rtx operands[])
19877 {
19878 rtx dst = operands[0];
19879 rtx src1 = operands[1];
19880 rtx src2 = operands[2];
19881
19882 /* Canonicalize operand order. */
19883 if (ix86_swap_binary_operands_p (code, mode, operands))
19884 {
19885 /* It is invalid to swap operands of different modes. */
19886 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
19887
19888 std::swap (src1, src2);
19889 }
19890
19891 /* Both source operands cannot be in memory. */
19892 if (MEM_P (src1) && MEM_P (src2))
19893 {
19894 /* Optimization: Only read from memory once. */
19895 if (rtx_equal_p (src1, src2))
19896 {
19897 src2 = force_reg (mode, src2);
19898 src1 = src2;
19899 }
19900 else if (rtx_equal_p (dst, src1))
19901 src2 = force_reg (mode, src2);
19902 else
19903 src1 = force_reg (mode, src1);
19904 }
19905
19906 /* If the destination is memory, and we do not have matching source
19907 operands, do things in registers. */
19908 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
19909 dst = gen_reg_rtx (mode);
19910
19911 /* Source 1 cannot be a constant. */
19912 if (CONSTANT_P (src1))
19913 src1 = force_reg (mode, src1);
19914
19915 /* Source 1 cannot be a non-matching memory. */
19916 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
19917 src1 = force_reg (mode, src1);
19918
19919 /* Improve address combine. */
19920 if (code == PLUS
19921 && GET_MODE_CLASS (mode) == MODE_INT
19922 && MEM_P (src2))
19923 src2 = force_reg (mode, src2);
19924
19925 operands[1] = src1;
19926 operands[2] = src2;
19927 return dst;
19928 }
19929
19930 /* Similarly, but assume that the destination has already been
19931 set up properly. */
19932
19933 void
19934 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
19935 machine_mode mode, rtx operands[])
19936 {
19937 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
19938 gcc_assert (dst == operands[0]);
19939 }
19940
19941 /* Attempt to expand a binary operator. Make the expansion closer to the
19942 actual machine than just general_operand, which would allow 3 separate
19943 memory references (one output, two inputs) in a single insn. */
19944
19945 void
19946 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
19947 rtx operands[])
19948 {
19949 rtx src1, src2, dst, op, clob;
19950
19951 dst = ix86_fixup_binary_operands (code, mode, operands);
19952 src1 = operands[1];
19953 src2 = operands[2];
19954
19955 /* Emit the instruction. */
19956
19957 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
19958
19959 if (reload_completed
19960 && code == PLUS
19961 && !rtx_equal_p (dst, src1))
19962 {
19963 /* This is going to be an LEA; avoid splitting it later. */
19964 emit_insn (op);
19965 }
19966 else
19967 {
19968 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
19969 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
19970 }
19971
19972 /* Fix up the destination if needed. */
19973 if (dst != operands[0])
19974 emit_move_insn (operands[0], dst);
19975 }
19976
19977 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
19978 the given OPERANDS. */
19979
19980 void
19981 ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
19982 rtx operands[])
19983 {
19984 rtx op1 = NULL_RTX, op2 = NULL_RTX;
19985 if (SUBREG_P (operands[1]))
19986 {
19987 op1 = operands[1];
19988 op2 = operands[2];
19989 }
19990 else if (SUBREG_P (operands[2]))
19991 {
19992 op1 = operands[2];
19993 op2 = operands[1];
19994 }
19995 /* Optimize (__m128i) d | (__m128i) e and similar code
19996 when d and e are float vectors into a float vector logical
19997 insn. In C/C++ without using intrinsics there is no other way
19998 to express a vector logical operation on float vectors than
19999 to cast them temporarily to integer vectors. */
20000 if (op1
20001 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
20002 && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
20003 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
20004 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
20005 && SUBREG_BYTE (op1) == 0
20006 && (GET_CODE (op2) == CONST_VECTOR
20007 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
20008 && SUBREG_BYTE (op2) == 0))
20009 && can_create_pseudo_p ())
20010 {
20011 rtx dst;
20012 switch (GET_MODE (SUBREG_REG (op1)))
20013 {
20014 case E_V4SFmode:
20015 case E_V8SFmode:
20016 case E_V16SFmode:
20017 case E_V2DFmode:
20018 case E_V4DFmode:
20019 case E_V8DFmode:
20020 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
20021 if (GET_CODE (op2) == CONST_VECTOR)
20022 {
20023 op2 = gen_lowpart (GET_MODE (dst), op2);
20024 op2 = force_reg (GET_MODE (dst), op2);
20025 }
20026 else
20027 {
20028 op1 = operands[1];
20029 op2 = SUBREG_REG (operands[2]);
20030 if (!vector_operand (op2, GET_MODE (dst)))
20031 op2 = force_reg (GET_MODE (dst), op2);
20032 }
20033 op1 = SUBREG_REG (op1);
20034 if (!vector_operand (op1, GET_MODE (dst)))
20035 op1 = force_reg (GET_MODE (dst), op1);
20036 emit_insn (gen_rtx_SET (dst,
20037 gen_rtx_fmt_ee (code, GET_MODE (dst),
20038 op1, op2)));
20039 emit_move_insn (operands[0], gen_lowpart (mode, dst));
20040 return;
20041 default:
20042 break;
20043 }
20044 }
20045 if (!vector_operand (operands[1], mode))
20046 operands[1] = force_reg (mode, operands[1]);
20047 if (!vector_operand (operands[2], mode))
20048 operands[2] = force_reg (mode, operands[2]);
20049 ix86_fixup_binary_operands_no_copy (code, mode, operands);
20050 emit_insn (gen_rtx_SET (operands[0],
20051 gen_rtx_fmt_ee (code, mode, operands[1],
20052 operands[2])));
20053 }
20054
20055 /* Return TRUE or FALSE depending on whether the binary operator meets the
20056 appropriate constraints. */
20057
20058 bool
20059 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
20060 rtx operands[3])
20061 {
20062 rtx dst = operands[0];
20063 rtx src1 = operands[1];
20064 rtx src2 = operands[2];
20065
20066 /* Both source operands cannot be in memory. */
20067 if (MEM_P (src1) && MEM_P (src2))
20068 return false;
20069
20070 /* Canonicalize operand order for commutative operators. */
20071 if (ix86_swap_binary_operands_p (code, mode, operands))
20072 std::swap (src1, src2);
20073
20074 /* If the destination is memory, we must have a matching source operand. */
20075 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
20076 return false;
20077
20078 /* Source 1 cannot be a constant. */
20079 if (CONSTANT_P (src1))
20080 return false;
20081
20082 /* Source 1 cannot be a non-matching memory. */
20083 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
20084 /* Support "andhi/andsi/anddi" as a zero-extending move. */
20085 return (code == AND
20086 && (mode == HImode
20087 || mode == SImode
20088 || (TARGET_64BIT && mode == DImode))
20089 && satisfies_constraint_L (src2));
20090
20091 return true;
20092 }
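/* For example (illustrative only, not part of the original sources):
   "addl %eax, (%rdi)" satisfies these constraints because the memory
   destination matches the first source, whereas an RTL add whose two
   sources are both memory, or whose memory destination matches neither
   source, has no single-insn x86 encoding and is rejected here.  */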
20093
20094 /* Attempt to expand a unary operator. Make the expansion closer to the
20095 actual machine than just general_operand, which will allow 2 separate
20096 memory references (one output, one input) in a single insn. */
20097
20098 void
20099 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
20100 rtx operands[])
20101 {
20102 bool matching_memory = false;
20103 rtx src, dst, op, clob;
20104
20105 dst = operands[0];
20106 src = operands[1];
20107
20108 /* If the destination is memory, and we do not have matching source
20109 operands, do things in registers. */
20110 if (MEM_P (dst))
20111 {
20112 if (rtx_equal_p (dst, src))
20113 matching_memory = true;
20114 else
20115 dst = gen_reg_rtx (mode);
20116 }
20117
20118 /* When source operand is memory, destination must match. */
20119 if (MEM_P (src) && !matching_memory)
20120 src = force_reg (mode, src);
20121
20122 /* Emit the instruction. */
20123
20124 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
20125
20126 if (code == NOT)
20127 emit_insn (op);
20128 else
20129 {
20130 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20131 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20132 }
20133
20134 /* Fix up the destination if needed. */
20135 if (dst != operands[0])
20136 emit_move_insn (operands[0], dst);
20137 }
20138
20139 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
20140 divisor are within the range [0-255]. */
20141
20142 void
20143 ix86_split_idivmod (machine_mode mode, rtx operands[],
20144 bool signed_p)
20145 {
20146 rtx_code_label *end_label, *qimode_label;
20147 rtx div, mod;
20148 rtx_insn *insn;
20149 rtx scratch, tmp0, tmp1, tmp2;
20150 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
20151 rtx (*gen_zero_extend) (rtx, rtx);
20152 rtx (*gen_test_ccno_1) (rtx, rtx);
20153
20154 switch (mode)
20155 {
20156 case E_SImode:
20157 if (GET_MODE (operands[0]) == SImode)
20158 {
20159 if (GET_MODE (operands[1]) == SImode)
20160 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
20161 else
20162 gen_divmod4_1
20163 = signed_p ? gen_divmodsi4_zext_2 : gen_udivmodsi4_zext_2;
20164 gen_zero_extend = gen_zero_extendqisi2;
20165 }
20166 else
20167 {
20168 gen_divmod4_1
20169 = signed_p ? gen_divmodsi4_zext_1 : gen_udivmodsi4_zext_1;
20170 gen_zero_extend = gen_zero_extendqidi2;
20171 }
20172 gen_test_ccno_1 = gen_testsi_ccno_1;
20173 break;
20174 case E_DImode:
20175 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
20176 gen_test_ccno_1 = gen_testdi_ccno_1;
20177 gen_zero_extend = gen_zero_extendqidi2;
20178 break;
20179 default:
20180 gcc_unreachable ();
20181 }
20182
20183 end_label = gen_label_rtx ();
20184 qimode_label = gen_label_rtx ();
20185
20186 scratch = gen_reg_rtx (mode);
20187
20188 /* Use 8bit unsigned divmod if dividend and divisor are within
20189 the range [0-255]. */
20190 emit_move_insn (scratch, operands[2]);
20191 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
20192 scratch, 1, OPTAB_DIRECT);
20193 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
20194 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
20195 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
20196 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
20197 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
20198 pc_rtx);
20199 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
20200 predict_jump (REG_BR_PROB_BASE * 50 / 100);
20201 JUMP_LABEL (insn) = qimode_label;
20202
20203 /* Generate original signed/unsigned divmod. */
20204 div = gen_divmod4_1 (operands[0], operands[1],
20205 operands[2], operands[3]);
20206 emit_insn (div);
20207
20208 /* Branch to the end. */
20209 emit_jump_insn (gen_jump (end_label));
20210 emit_barrier ();
20211
20212 /* Generate 8bit unsigned divide. */
20213 emit_label (qimode_label);
20214 /* Don't use operands[0] for result of 8bit divide since not all
20215 registers support QImode ZERO_EXTRACT. */
20216 tmp0 = lowpart_subreg (HImode, scratch, mode);
20217 tmp1 = lowpart_subreg (HImode, operands[2], mode);
20218 tmp2 = lowpart_subreg (QImode, operands[3], mode);
20219 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
20220
20221 if (signed_p)
20222 {
20223 div = gen_rtx_DIV (mode, operands[2], operands[3]);
20224 mod = gen_rtx_MOD (mode, operands[2], operands[3]);
20225 }
20226 else
20227 {
20228 div = gen_rtx_UDIV (mode, operands[2], operands[3]);
20229 mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
20230 }
20231 if (mode == SImode)
20232 {
20233 if (GET_MODE (operands[0]) != SImode)
20234 div = gen_rtx_ZERO_EXTEND (DImode, div);
20235 if (GET_MODE (operands[1]) != SImode)
20236 mod = gen_rtx_ZERO_EXTEND (DImode, mod);
20237 }
20238
20239 /* Extract remainder from AH. */
20240 tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]),
20241 tmp0, GEN_INT (8), GEN_INT (8));
20242 if (REG_P (operands[1]))
20243 insn = emit_move_insn (operands[1], tmp1);
20244 else
20245 {
20246 /* Need a new scratch register since the old one has result
20247 of 8bit divide. */
20248 scratch = gen_reg_rtx (GET_MODE (operands[1]));
20249 emit_move_insn (scratch, tmp1);
20250 insn = emit_move_insn (operands[1], scratch);
20251 }
20252 set_unique_reg_note (insn, REG_EQUAL, mod);
20253
20254 /* Zero extend quotient from AL. */
20255 tmp1 = gen_lowpart (QImode, tmp0);
20256 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
20257 set_unique_reg_note (insn, REG_EQUAL, div);
20258
20259 emit_label (end_label);
20260 }
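/* Illustrative sketch (not part of the original sources): for unsigned
   SImode operands the split emitted above behaves roughly like the
   following C, assuming 32-bit unsigned A and B:

     unsigned int q, r;
     if (((A | B) & ~0xffU) == 0)
       {
         q = (unsigned char) A / (unsigned char) B;   8-bit divide: AL/AH
         r = (unsigned char) A % (unsigned char) B;
       }
     else
       {
         q = A / B;                                   full-width divide
         r = A % B;
       }

   For the signed case the else branch uses the signed divide instead.  */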
20261
20262 #define LEA_MAX_STALL (3)
20263 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
20264
20265 /* Increase given DISTANCE in half-cycles according to
20266 dependencies between PREV and NEXT instructions.
20267 Add 1 half-cycle if there is no dependency and
20268 go to the next cycle if there is some dependency. */
20269
20270 static unsigned int
20271 increase_distance (rtx_insn *prev, rtx_insn *next, unsigned int distance)
20272 {
20273 df_ref def, use;
20274
20275 if (!prev || !next)
20276 return distance + (distance & 1) + 2;
20277
20278 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
20279 return distance + 1;
20280
20281 FOR_EACH_INSN_USE (use, next)
20282 FOR_EACH_INSN_DEF (def, prev)
20283 if (!DF_REF_IS_ARTIFICIAL (def)
20284 && DF_REF_REGNO (use) == DF_REF_REGNO (def))
20285 return distance + (distance & 1) + 2;
20286
20287 return distance + 1;
20288 }
20289
20290 /* Function checks if instruction INSN defines register number
20291 REGNO1 or REGNO2. */
20292
20293 static bool
20294 insn_defines_reg (unsigned int regno1, unsigned int regno2,
20295 rtx_insn *insn)
20296 {
20297 df_ref def;
20298
20299 FOR_EACH_INSN_DEF (def, insn)
20300 if (DF_REF_REG_DEF_P (def)
20301 && !DF_REF_IS_ARTIFICIAL (def)
20302 && (regno1 == DF_REF_REGNO (def)
20303 || regno2 == DF_REF_REGNO (def)))
20304 return true;
20305
20306 return false;
20307 }
20308
20309 /* Function checks if instruction INSN uses register number
20310 REGNO as a part of address expression. */
20311
20312 static bool
20313 insn_uses_reg_mem (unsigned int regno, rtx insn)
20314 {
20315 df_ref use;
20316
20317 FOR_EACH_INSN_USE (use, insn)
20318 if (DF_REF_REG_MEM_P (use) && regno == DF_REF_REGNO (use))
20319 return true;
20320
20321 return false;
20322 }
20323
20324 /* Search backward for non-agu definition of register number REGNO1
20325 or register number REGNO2 in basic block starting from instruction
20326 START up to head of basic block or instruction INSN.
20327
20328 Function puts true value into *FOUND var if definition was found
20329 and false otherwise.
20330
20331 Distance in half-cycles between START and found instruction or head
20332 of BB is added to DISTANCE and returned. */
20333
20334 static int
20335 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
20336 rtx_insn *insn, int distance,
20337 rtx_insn *start, bool *found)
20338 {
20339 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
20340 rtx_insn *prev = start;
20341 rtx_insn *next = NULL;
20342
20343 *found = false;
20344
20345 while (prev
20346 && prev != insn
20347 && distance < LEA_SEARCH_THRESHOLD)
20348 {
20349 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
20350 {
20351 distance = increase_distance (prev, next, distance);
20352 if (insn_defines_reg (regno1, regno2, prev))
20353 {
20354 if (recog_memoized (prev) < 0
20355 || get_attr_type (prev) != TYPE_LEA)
20356 {
20357 *found = true;
20358 return distance;
20359 }
20360 }
20361
20362 next = prev;
20363 }
20364 if (prev == BB_HEAD (bb))
20365 break;
20366
20367 prev = PREV_INSN (prev);
20368 }
20369
20370 return distance;
20371 }
20372
20373 /* Search backward for non-agu definition of register number REGNO1
20374 or register number REGNO2 in INSN's basic block until
20375 1. Pass LEA_SEARCH_THRESHOLD instructions, or
20376 2. Reach neighbor BBs boundary, or
20377 3. Reach agu definition.
20378 Returns the distance between the non-agu definition point and INSN.
20379 If no definition point, returns -1. */
20380
20381 static int
20382 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
20383 rtx_insn *insn)
20384 {
20385 basic_block bb = BLOCK_FOR_INSN (insn);
20386 int distance = 0;
20387 bool found = false;
20388
20389 if (insn != BB_HEAD (bb))
20390 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
20391 distance, PREV_INSN (insn),
20392 &found);
20393
20394 if (!found && distance < LEA_SEARCH_THRESHOLD)
20395 {
20396 edge e;
20397 edge_iterator ei;
20398 bool simple_loop = false;
20399
20400 FOR_EACH_EDGE (e, ei, bb->preds)
20401 if (e->src == bb)
20402 {
20403 simple_loop = true;
20404 break;
20405 }
20406
20407 if (simple_loop)
20408 distance = distance_non_agu_define_in_bb (regno1, regno2,
20409 insn, distance,
20410 BB_END (bb), &found);
20411 else
20412 {
20413 int shortest_dist = -1;
20414 bool found_in_bb = false;
20415
20416 FOR_EACH_EDGE (e, ei, bb->preds)
20417 {
20418 int bb_dist
20419 = distance_non_agu_define_in_bb (regno1, regno2,
20420 insn, distance,
20421 BB_END (e->src),
20422 &found_in_bb);
20423 if (found_in_bb)
20424 {
20425 if (shortest_dist < 0)
20426 shortest_dist = bb_dist;
20427 else if (bb_dist > 0)
20428 shortest_dist = MIN (bb_dist, shortest_dist);
20429
20430 found = true;
20431 }
20432 }
20433
20434 distance = shortest_dist;
20435 }
20436 }
20437
20438 /* get_attr_type may modify recog data. We want to make sure
20439 that recog data is valid for instruction INSN, on which
20440 distance_non_agu_define is called. INSN is unchanged here. */
20441 extract_insn_cached (insn);
20442
20443 if (!found)
20444 return -1;
20445
20446 return distance >> 1;
20447 }
20448
20449 /* Return the distance in half-cycles between INSN and the next
20450 insn that uses register number REGNO in a memory address, added
20451 to DISTANCE. Return -1 if REGNO is set.
20452
20453 Put true value into *FOUND if register usage was found and
20454 false otherwise.
20455 Put true value into *REDEFINED if register redefinition was
20456 found and false otherwise. */
20457
20458 static int
20459 distance_agu_use_in_bb (unsigned int regno,
20460 rtx_insn *insn, int distance, rtx_insn *start,
20461 bool *found, bool *redefined)
20462 {
20463 basic_block bb = NULL;
20464 rtx_insn *next = start;
20465 rtx_insn *prev = NULL;
20466
20467 *found = false;
20468 *redefined = false;
20469
20470 if (start != NULL_RTX)
20471 {
20472 bb = BLOCK_FOR_INSN (start);
20473 if (start != BB_HEAD (bb))
20474 /* If insn and start belong to the same bb, set prev to insn,
20475 so the call to increase_distance will increase the distance
20476 between insns by 1. */
20477 prev = insn;
20478 }
20479
20480 while (next
20481 && next != insn
20482 && distance < LEA_SEARCH_THRESHOLD)
20483 {
20484 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
20485 {
20486 distance = increase_distance (prev, next, distance);
20487 if (insn_uses_reg_mem (regno, next))
20488 {
20489 /* Return DISTANCE if OP0 is used in memory
20490 address in NEXT. */
20491 *found = true;
20492 return distance;
20493 }
20494
20495 if (insn_defines_reg (regno, INVALID_REGNUM, next))
20496 {
20497 /* Return -1 if OP0 is set in NEXT. */
20498 *redefined = true;
20499 return -1;
20500 }
20501
20502 prev = next;
20503 }
20504
20505 if (next == BB_END (bb))
20506 break;
20507
20508 next = NEXT_INSN (next);
20509 }
20510
20511 return distance;
20512 }
20513
20514 /* Return the distance between INSN and the next insn that uses
20515 register number REGNO0 in a memory address. Return -1 if no such
20516 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
20517
20518 static int
20519 distance_agu_use (unsigned int regno0, rtx_insn *insn)
20520 {
20521 basic_block bb = BLOCK_FOR_INSN (insn);
20522 int distance = 0;
20523 bool found = false;
20524 bool redefined = false;
20525
20526 if (insn != BB_END (bb))
20527 distance = distance_agu_use_in_bb (regno0, insn, distance,
20528 NEXT_INSN (insn),
20529 &found, &redefined);
20530
20531 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
20532 {
20533 edge e;
20534 edge_iterator ei;
20535 bool simple_loop = false;
20536
20537 FOR_EACH_EDGE (e, ei, bb->succs)
20538 if (e->dest == bb)
20539 {
20540 simple_loop = true;
20541 break;
20542 }
20543
20544 if (simple_loop)
20545 distance = distance_agu_use_in_bb (regno0, insn,
20546 distance, BB_HEAD (bb),
20547 &found, &redefined);
20548 else
20549 {
20550 int shortest_dist = -1;
20551 bool found_in_bb = false;
20552 bool redefined_in_bb = false;
20553
20554 FOR_EACH_EDGE (e, ei, bb->succs)
20555 {
20556 int bb_dist
20557 = distance_agu_use_in_bb (regno0, insn,
20558 distance, BB_HEAD (e->dest),
20559 &found_in_bb, &redefined_in_bb);
20560 if (found_in_bb)
20561 {
20562 if (shortest_dist < 0)
20563 shortest_dist = bb_dist;
20564 else if (bb_dist > 0)
20565 shortest_dist = MIN (bb_dist, shortest_dist);
20566
20567 found = true;
20568 }
20569 }
20570
20571 distance = shortest_dist;
20572 }
20573 }
20574
20575 if (!found || redefined)
20576 return -1;
20577
20578 return distance >> 1;
20579 }
20580
20581 /* Define this macro to tune LEA priority vs ADD; it takes effect when
20582 there is a dilemma of choosing LEA or ADD.
20583 Negative value: ADD is preferred over LEA
20584 Zero: Neutral
20585 Positive value: LEA is preferred over ADD. */
20586 #define IX86_LEA_PRIORITY 0
20587
20588 /* Return true if using lea INSN has a performance advantage
20589 over a sequence of instructions. The instruction sequence has
20590 SPLIT_COST cycles higher latency than the lea. */
20591
20592 static bool
20593 ix86_lea_outperforms (rtx_insn *insn, unsigned int regno0, unsigned int regno1,
20594 unsigned int regno2, int split_cost, bool has_scale)
20595 {
20596 int dist_define, dist_use;
20597
20598 /* For Silvermont if using a 2-source or 3-source LEA for
20599 non-destructive destination purposes, or due to wanting
20600 ability to use SCALE, the use of LEA is justified. */
20601 if (TARGET_SILVERMONT || TARGET_INTEL)
20602 {
20603 if (has_scale)
20604 return true;
20605 if (split_cost < 1)
20606 return false;
20607 if (regno0 == regno1 || regno0 == regno2)
20608 return false;
20609 return true;
20610 }
20611
20612 dist_define = distance_non_agu_define (regno1, regno2, insn);
20613 dist_use = distance_agu_use (regno0, insn);
20614
20615 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
20616 {
20617 /* If there is no non-AGU operand definition, no AGU
20618 operand usage and the split cost is 0, then both the lea
20619 and the non-lea variants have the same priority. Currently
20620 we prefer lea for 64-bit code and non-lea for 32-bit
20621 code. */
20622 if (dist_use < 0 && split_cost == 0)
20623 return TARGET_64BIT || IX86_LEA_PRIORITY;
20624 else
20625 return true;
20626 }
20627
20628 /* With a longer definition distance, lea is preferable.
20629 Here we change it to take into account splitting cost and
20630 lea priority. */
20631 dist_define += split_cost + IX86_LEA_PRIORITY;
20632
20633 /* If there is no use in a memory address then we just check
20634 that split cost exceeds AGU stall. */
20635 if (dist_use < 0)
20636 return dist_define > LEA_MAX_STALL;
20637
20638 /* If this insn has both backward non-agu dependence and forward
20639 agu dependence, the one with short distance takes effect. */
20640 return dist_define >= dist_use;
20641 }
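/* Worked example (illustrative only): with IX86_LEA_PRIORITY == 0 and
   SPLIT_COST == 1, a lea whose sources were defined at distance 1
   (dist_define == 1) and whose result feeds an address at distance 4
   (dist_use == 4) gives 1 + 1 + 0 == 2, which is less than 4, so the
   function returns false and the split sequence is preferred.  */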
20642
20643 /* Return true if it is legal to clobber flags by INSN and
20644 false otherwise. */
20645
20646 static bool
20647 ix86_ok_to_clobber_flags (rtx_insn *insn)
20648 {
20649 basic_block bb = BLOCK_FOR_INSN (insn);
20650 df_ref use;
20651 bitmap live;
20652
20653 while (insn)
20654 {
20655 if (NONDEBUG_INSN_P (insn))
20656 {
20657 FOR_EACH_INSN_USE (use, insn)
20658 if (DF_REF_REG_USE_P (use) && DF_REF_REGNO (use) == FLAGS_REG)
20659 return false;
20660
20661 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
20662 return true;
20663 }
20664
20665 if (insn == BB_END (bb))
20666 break;
20667
20668 insn = NEXT_INSN (insn);
20669 }
20670
20671 live = df_get_live_out (bb);
20672 return !REGNO_REG_SET_P (live, FLAGS_REG);
20673 }
20674
20675 /* Return true if we need to split op0 = op1 + op2 into a sequence of
20676 move and add to avoid AGU stalls. */
20677
20678 bool
20679 ix86_avoid_lea_for_add (rtx_insn *insn, rtx operands[])
20680 {
20681 unsigned int regno0, regno1, regno2;
20682
20683 /* Check if we need to optimize. */
20684 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20685 return false;
20686
20687 /* Check it is correct to split here. */
20688 if (!ix86_ok_to_clobber_flags (insn))
20689 return false;
20690
20691 regno0 = true_regnum (operands[0]);
20692 regno1 = true_regnum (operands[1]);
20693 regno2 = true_regnum (operands[2]);
20694
20695 /* We only need to split adds with a non-destructive
20696 destination operand. */
20697 if (regno0 == regno1 || regno0 == regno2)
20698 return false;
20699 else
20700 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1, false);
20701 }
20702
20703 /* Return true if we should emit lea instruction instead of mov
20704 instruction. */
20705
20706 bool
20707 ix86_use_lea_for_mov (rtx_insn *insn, rtx operands[])
20708 {
20709 unsigned int regno0, regno1;
20710
20711 /* Check if we need to optimize. */
20712 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20713 return false;
20714
20715 /* Use lea for reg to reg moves only. */
20716 if (!REG_P (operands[0]) || !REG_P (operands[1]))
20717 return false;
20718
20719 regno0 = true_regnum (operands[0]);
20720 regno1 = true_regnum (operands[1]);
20721
20722 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0, false);
20723 }
20724
20725 /* Return true if we need to split lea into a sequence of
20726 instructions to avoid AGU stalls. */
20727
20728 bool
20729 ix86_avoid_lea_for_addr (rtx_insn *insn, rtx operands[])
20730 {
20731 unsigned int regno0, regno1, regno2;
20732 int split_cost;
20733 struct ix86_address parts;
20734 int ok;
20735
20736 /* Check we need to optimize. */
20737 if (!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun))
20738 return false;
20739
20740 /* The "at least two components" test below might not catch simple
20741 move or zero extension insns if parts.base is non-NULL and parts.disp
20742 is const0_rtx as the only components in the address, e.g. if the
20743 register is %rbp or %r13. As this test is much cheaper and moves or
20744 zero extensions are the common case, do this check first. */
20745 if (REG_P (operands[1])
20746 || (SImode_address_operand (operands[1], VOIDmode)
20747 && REG_P (XEXP (operands[1], 0))))
20748 return false;
20749
20750 /* Check if it is OK to split here. */
20751 if (!ix86_ok_to_clobber_flags (insn))
20752 return false;
20753
20754 ok = ix86_decompose_address (operands[1], &parts);
20755 gcc_assert (ok);
20756
20757 /* There should be at least two components in the address. */
20758 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
20759 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
20760 return false;
20761
20762 /* We should not split into add if a non-legitimate PIC
20763 operand is used as the displacement. */
20764 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
20765 return false;
20766
20767 regno0 = true_regnum (operands[0]);
20768 regno1 = INVALID_REGNUM;
20769 regno2 = INVALID_REGNUM;
20770
20771 if (parts.base)
20772 regno1 = true_regnum (parts.base);
20773 if (parts.index)
20774 regno2 = true_regnum (parts.index);
20775
20776 split_cost = 0;
20777
20778 /* Compute how many cycles we will add to the execution time
20779 if we split the lea into a sequence of instructions. */
20780 if (parts.base || parts.index)
20781 {
20782 /* Have to use a mov instruction if the non-destructive
20783 destination form is used. */
20784 if (regno1 != regno0 && regno2 != regno0)
20785 split_cost += 1;
20786
20787 /* Have to add index to base if both exist. */
20788 if (parts.base && parts.index)
20789 split_cost += 1;
20790
20791 /* Have to use shift and adds if scale is 2 or greater. */
20792 if (parts.scale > 1)
20793 {
20794 if (regno0 != regno1)
20795 split_cost += 1;
20796 else if (regno2 == regno0)
20797 split_cost += 4;
20798 else
20799 split_cost += parts.scale;
20800 }
20801
20802 /* Have to use add instruction with immediate if
20803 disp is non zero. */
20804 if (parts.disp && parts.disp != const0_rtx)
20805 split_cost += 1;
20806
20807 /* Subtract the price of lea. */
20808 split_cost -= 1;
20809 }
20810
20811 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost,
20812 parts.scale > 1);
20813 }
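/* Worked example (illustrative only): splitting "lea 8(%rdi,%rsi,2), %rax"
   needs a mov (the destination matches neither source, +1), an add of the
   base to the index (+1), a shift for the scale (+1) and an add of the
   displacement (+1), minus the lea itself (-1), giving SPLIT_COST == 3.  */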
20814
20815 /* Emit x86 binary operand CODE in mode MODE, where the first operand
20816 matches destination. RTX includes clobber of FLAGS_REG. */
20817
20818 static void
20819 ix86_emit_binop (enum rtx_code code, machine_mode mode,
20820 rtx dst, rtx src)
20821 {
20822 rtx op, clob;
20823
20824 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
20825 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
20826
20827 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
20828 }
20829
20830 /* Return true if regno1 def is nearest to the insn. */
20831
20832 static bool
20833 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
20834 {
20835 rtx_insn *prev = insn;
20836 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
20837
20838 if (insn == start)
20839 return false;
20840 while (prev && prev != start)
20841 {
20842 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
20843 {
20844 prev = PREV_INSN (prev);
20845 continue;
20846 }
20847 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
20848 return true;
20849 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
20850 return false;
20851 prev = PREV_INSN (prev);
20852 }
20853
20854 /* None of the regs is defined in the bb. */
20855 return false;
20856 }
20857
20858 /* Split lea instructions into a sequence of instructions
20859 which are executed on the ALU to avoid AGU stalls.
20860 It is assumed that it is allowed to clobber the flags register
20861 at the lea position. */
20862
20863 void
20864 ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
20865 {
20866 unsigned int regno0, regno1, regno2;
20867 struct ix86_address parts;
20868 rtx target, tmp;
20869 int ok, adds;
20870
20871 ok = ix86_decompose_address (operands[1], &parts);
20872 gcc_assert (ok);
20873
20874 target = gen_lowpart (mode, operands[0]);
20875
20876 regno0 = true_regnum (target);
20877 regno1 = INVALID_REGNUM;
20878 regno2 = INVALID_REGNUM;
20879
20880 if (parts.base)
20881 {
20882 parts.base = gen_lowpart (mode, parts.base);
20883 regno1 = true_regnum (parts.base);
20884 }
20885
20886 if (parts.index)
20887 {
20888 parts.index = gen_lowpart (mode, parts.index);
20889 regno2 = true_regnum (parts.index);
20890 }
20891
20892 if (parts.disp)
20893 parts.disp = gen_lowpart (mode, parts.disp);
20894
20895 if (parts.scale > 1)
20896 {
20897 /* Case r1 = r1 + ... */
20898 if (regno1 == regno0)
20899 {
20900 /* If we have a case r1 = r1 + C * r2 then we
20901 would have to use multiplication, which is very
20902 expensive. Assume the cost model is wrong if we
20903 get such a case here. */
20904 gcc_assert (regno2 != regno0);
20905
20906 for (adds = parts.scale; adds > 0; adds--)
20907 ix86_emit_binop (PLUS, mode, target, parts.index);
20908 }
20909 else
20910 {
20911 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
20912 if (regno0 != regno2)
20913 emit_insn (gen_rtx_SET (target, parts.index));
20914
20915 /* Use shift for scaling. */
20916 ix86_emit_binop (ASHIFT, mode, target,
20917 GEN_INT (exact_log2 (parts.scale)));
20918
20919 if (parts.base)
20920 ix86_emit_binop (PLUS, mode, target, parts.base);
20921
20922 if (parts.disp && parts.disp != const0_rtx)
20923 ix86_emit_binop (PLUS, mode, target, parts.disp);
20924 }
20925 }
20926 else if (!parts.base && !parts.index)
20927 {
20928 gcc_assert (parts.disp);
20929 emit_insn (gen_rtx_SET (target, parts.disp));
20930 }
20931 else
20932 {
20933 if (!parts.base)
20934 {
20935 if (regno0 != regno2)
20936 emit_insn (gen_rtx_SET (target, parts.index));
20937 }
20938 else if (!parts.index)
20939 {
20940 if (regno0 != regno1)
20941 emit_insn (gen_rtx_SET (target, parts.base));
20942 }
20943 else
20944 {
20945 if (regno0 == regno1)
20946 tmp = parts.index;
20947 else if (regno0 == regno2)
20948 tmp = parts.base;
20949 else
20950 {
20951 rtx tmp1;
20952
20953 /* Find better operand for SET instruction, depending
20954 on which definition is farther from the insn. */
20955 if (find_nearest_reg_def (insn, regno1, regno2))
20956 tmp = parts.index, tmp1 = parts.base;
20957 else
20958 tmp = parts.base, tmp1 = parts.index;
20959
20960 emit_insn (gen_rtx_SET (target, tmp));
20961
20962 if (parts.disp && parts.disp != const0_rtx)
20963 ix86_emit_binop (PLUS, mode, target, parts.disp);
20964
20965 ix86_emit_binop (PLUS, mode, target, tmp1);
20966 return;
20967 }
20968
20969 ix86_emit_binop (PLUS, mode, target, tmp);
20970 }
20971
20972 if (parts.disp && parts.disp != const0_rtx)
20973 ix86_emit_binop (PLUS, mode, target, parts.disp);
20974 }
20975 }
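/* Illustrative sketch (not part of the original sources): for an address
   such as 8(%rdi,%rsi,2) with destination %rax, the lea is rewritten into
   the ALU sequence

     target = index;        mov  %rsi, %rax
     target <<= 1;          shl  $1, %rax
     target += base;        add  %rdi, %rax
     target += disp;        add  $8, %rax

   which avoids the AGU but clobbers the flags register.  */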
20976
20977 /* Return true if it is ok to optimize an ADD operation to a LEA
20978 operation to avoid flag register consumption. For most processors,
20979 ADD is faster than LEA. For processors like BONNELL, if the
20980 destination register of the LEA holds an actual address which will be
20981 used soon, LEA is better; otherwise ADD is better. */
20982
20983 bool
20984 ix86_lea_for_add_ok (rtx_insn *insn, rtx operands[])
20985 {
20986 unsigned int regno0 = true_regnum (operands[0]);
20987 unsigned int regno1 = true_regnum (operands[1]);
20988 unsigned int regno2 = true_regnum (operands[2]);
20989
20990 /* If a = b + c, (a!=b && a!=c), must use lea form. */
20991 if (regno0 != regno1 && regno0 != regno2)
20992 return true;
20993
20994 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
20995 return false;
20996
20997 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0, false);
20998 }
20999
21000 /* Return true if destination reg of SET_BODY is shift count of
21001 USE_BODY. */
21002
21003 static bool
21004 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
21005 {
21006 rtx set_dest;
21007 rtx shift_rtx;
21008 int i;
21009
21010 /* Retrieve destination of SET_BODY. */
21011 switch (GET_CODE (set_body))
21012 {
21013 case SET:
21014 set_dest = SET_DEST (set_body);
21015 if (!set_dest || !REG_P (set_dest))
21016 return false;
21017 break;
21018 case PARALLEL:
21019 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
21020 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
21021 use_body))
21022 return true;
21023 /* FALLTHROUGH */
21024 default:
21025 return false;
21026 }
21027
21028 /* Retrieve shift count of USE_BODY. */
21029 switch (GET_CODE (use_body))
21030 {
21031 case SET:
21032 shift_rtx = XEXP (use_body, 1);
21033 break;
21034 case PARALLEL:
21035 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
21036 if (ix86_dep_by_shift_count_body (set_body,
21037 XVECEXP (use_body, 0, i)))
21038 return true;
21039 /* FALLTHROUGH */
21040 default:
21041 return false;
21042 }
21043
21044 if (shift_rtx
21045 && (GET_CODE (shift_rtx) == ASHIFT
21046 || GET_CODE (shift_rtx) == LSHIFTRT
21047 || GET_CODE (shift_rtx) == ASHIFTRT
21048 || GET_CODE (shift_rtx) == ROTATE
21049 || GET_CODE (shift_rtx) == ROTATERT))
21050 {
21051 rtx shift_count = XEXP (shift_rtx, 1);
21052
21053 /* Return true if shift count is dest of SET_BODY. */
21054 if (REG_P (shift_count))
21055 {
21056 /* Add check since it can be invoked before register
21057 allocation in pre-reload schedule. */
21058 if (reload_completed
21059 && true_regnum (set_dest) == true_regnum (shift_count))
21060 return true;
21061 else if (REGNO (set_dest) == REGNO (shift_count))
21062 return true;
21063 }
21064 }
21065
21066 return false;
21067 }
21068
21069 /* Return true if destination reg of SET_INSN is shift count of
21070 USE_INSN. */
21071
21072 bool
21073 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
21074 {
21075 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
21076 PATTERN (use_insn));
21077 }
21078
21079 /* Return TRUE or FALSE depending on whether the unary operator meets the
21080 appropriate constraints. */
21081
21082 bool
21083 ix86_unary_operator_ok (enum rtx_code,
21084 machine_mode,
21085 rtx operands[2])
21086 {
21087 /* If one of operands is memory, source and destination must match. */
21088 if ((MEM_P (operands[0])
21089 || MEM_P (operands[1]))
21090 && ! rtx_equal_p (operands[0], operands[1]))
21091 return false;
21092 return true;
21093 }
21094
21095 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
21096 are ok, keeping in mind the possible movddup alternative. */
21097
21098 bool
21099 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
21100 {
21101 if (MEM_P (operands[0]))
21102 return rtx_equal_p (operands[0], operands[1 + high]);
21103 if (MEM_P (operands[1]) && MEM_P (operands[2]))
21104 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
21105 return true;
21106 }
21107
21108 /* Post-reload splitter for converting an SF or DFmode value in an
21109 SSE register into an unsigned SImode. */
21110
21111 void
21112 ix86_split_convert_uns_si_sse (rtx operands[])
21113 {
21114 machine_mode vecmode;
21115 rtx value, large, zero_or_two31, input, two31, x;
21116
21117 large = operands[1];
21118 zero_or_two31 = operands[2];
21119 input = operands[3];
21120 two31 = operands[4];
21121 vecmode = GET_MODE (large);
21122 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
21123
21124 /* Load up the value into the low element. We must ensure that the other
21125 elements are valid floats -- zero is the easiest such value. */
21126 if (MEM_P (input))
21127 {
21128 if (vecmode == V4SFmode)
21129 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
21130 else
21131 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
21132 }
21133 else
21134 {
21135 input = gen_rtx_REG (vecmode, REGNO (input));
21136 emit_move_insn (value, CONST0_RTX (vecmode));
21137 if (vecmode == V4SFmode)
21138 emit_insn (gen_sse_movss (value, value, input));
21139 else
21140 emit_insn (gen_sse2_movsd (value, value, input));
21141 }
21142
21143 emit_move_insn (large, two31);
21144 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
21145
21146 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
21147 emit_insn (gen_rtx_SET (large, x));
21148
21149 x = gen_rtx_AND (vecmode, zero_or_two31, large);
21150 emit_insn (gen_rtx_SET (zero_or_two31, x));
21151
21152 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
21153 emit_insn (gen_rtx_SET (value, x));
21154
21155 large = gen_rtx_REG (V4SImode, REGNO (large));
21156 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
21157
21158 x = gen_rtx_REG (V4SImode, REGNO (value));
21159 if (vecmode == V4SFmode)
21160 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
21161 else
21162 emit_insn (gen_sse2_cvttpd2dq (x, value));
21163 value = x;
21164
21165 emit_insn (gen_xorv4si3 (value, value, large));
21166 }
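/* Illustrative scalar sketch (not part of the original sources) of the
   conversion implemented above, for a single double D:

     unsigned int
     df_to_uns_si (double d)
     {
       if (d >= 2147483648.0)
         return (unsigned int) (int) (d - 2147483648.0) ^ 0x80000000u;
       return (unsigned int) (int) d;
     }

   Values of 0x1p31 or more are first reduced into signed range and the
   sign bit is added back with the final xor.  */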
21167
21168 /* Convert an unsigned DImode value into a DFmode, using only SSE.
21169 Expects the 64-bit DImode to be supplied in a pair of integral
21170 registers. Requires SSE2; will use SSE3 if available. For x86_32,
21171 -mfpmath=sse, !optimize_size only. */
21172
21173 void
21174 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
21175 {
21176 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
21177 rtx int_xmm, fp_xmm;
21178 rtx biases, exponents;
21179 rtx x;
21180
21181 int_xmm = gen_reg_rtx (V4SImode);
21182 if (TARGET_INTER_UNIT_MOVES_TO_VEC)
21183 emit_insn (gen_movdi_to_sse (int_xmm, input));
21184 else if (TARGET_SSE_SPLIT_REGS)
21185 {
21186 emit_clobber (int_xmm);
21187 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
21188 }
21189 else
21190 {
21191 x = gen_reg_rtx (V2DImode);
21192 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
21193 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
21194 }
21195
21196 x = gen_rtx_CONST_VECTOR (V4SImode,
21197 gen_rtvec (4, GEN_INT (0x43300000UL),
21198 GEN_INT (0x45300000UL),
21199 const0_rtx, const0_rtx));
21200 exponents = validize_mem (force_const_mem (V4SImode, x));
21201
21202 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
21203 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
21204
21205 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
21206 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
21207 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
21208 (0x1.0p84 + double(fp_value_hi_xmm)).
21209 Note these exponents differ by 32. */
21210
21211 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
21212
21213 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
21214 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
21215 real_ldexp (&bias_lo_rvt, &dconst1, 52);
21216 real_ldexp (&bias_hi_rvt, &dconst1, 84);
21217 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
21218 x = const_double_from_real_value (bias_hi_rvt, DFmode);
21219 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
21220 biases = validize_mem (force_const_mem (V2DFmode, biases));
21221 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
21222
21223 /* Add the upper and lower DFmode values together. */
21224 if (TARGET_SSE3)
21225 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
21226 else
21227 {
21228 x = copy_to_mode_reg (V2DFmode, fp_xmm);
21229 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
21230 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
21231 }
21232
21233 ix86_expand_vector_extract (false, target, fp_xmm, 0);
21234 }
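/* Illustrative scalar sketch (not part of the original sources) of the
   bit-juxtaposition trick used above, assuming <stdint.h> types:

     double
     u64_to_double (uint64_t x)
     {
       union { uint64_t i; double d; } lo, hi;
       lo.i = 0x4330000000000000ULL | (x & 0xffffffffULL);
       hi.i = 0x4530000000000000ULL | (x >> 32);
       return (hi.d - 0x1.0p84) + (lo.d - 0x1.0p52);
     }

   lo.d equals 0x1.0p52 plus the low 32 bits of x, hi.d equals 0x1.0p84
   plus the high 32 bits scaled by 2^32; subtracting the biases and
   adding the two halves gives the correctly rounded result.  */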
21235
21236 /* Not used, but eases macroization of patterns. */
21237 void
21238 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
21239 {
21240 gcc_unreachable ();
21241 }
21242
21243 /* Convert an unsigned SImode value into a DFmode. Only currently used
21244 for SSE, but applicable anywhere. */
21245
21246 void
21247 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
21248 {
21249 REAL_VALUE_TYPE TWO31r;
21250 rtx x, fp;
21251
21252 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
21253 NULL, 1, OPTAB_DIRECT);
21254
21255 fp = gen_reg_rtx (DFmode);
21256 emit_insn (gen_floatsidf2 (fp, x));
21257
21258 real_ldexp (&TWO31r, &dconst1, 31);
21259 x = const_double_from_real_value (TWO31r, DFmode);
21260
21261 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
21262 if (x != target)
21263 emit_move_insn (target, x);
21264 }
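/* Illustrative scalar sketch (not part of the original sources): the
   expansion above corresponds to

     double
     uns_si_to_df (unsigned int x)
     {
       int biased = (int) (x ^ 0x80000000u);
       return (double) biased + 2147483648.0;
     }

   i.e. flip the sign bit (the same as adding -2^31 modulo 2^32), do a
   signed conversion, then add 2^31 back in floating point.  */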
21265
21266 /* Convert a signed DImode value into a DFmode. Only used for SSE in
21267 32-bit mode; otherwise we have a direct convert instruction. */
21268
21269 void
21270 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
21271 {
21272 REAL_VALUE_TYPE TWO32r;
21273 rtx fp_lo, fp_hi, x;
21274
21275 fp_lo = gen_reg_rtx (DFmode);
21276 fp_hi = gen_reg_rtx (DFmode);
21277
21278 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
21279
21280 real_ldexp (&TWO32r, &dconst1, 32);
21281 x = const_double_from_real_value (TWO32r, DFmode);
21282 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
21283
21284 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
21285
21286 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
21287 0, OPTAB_DIRECT);
21288 if (x != target)
21289 emit_move_insn (target, x);
21290 }
21291
21292 /* Convert an unsigned SImode value into a SFmode, using only SSE.
21293 For x86_32, -mfpmath=sse, !optimize_size only. */
21294 void
21295 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
21296 {
21297 REAL_VALUE_TYPE ONE16r;
21298 rtx fp_hi, fp_lo, int_hi, int_lo, x;
21299
21300 real_ldexp (&ONE16r, &dconst1, 16);
21301 x = const_double_from_real_value (ONE16r, SFmode);
21302 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT (0xffff),
21303 NULL, 0, OPTAB_DIRECT);
21304 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT (16),
21305 NULL, 0, OPTAB_DIRECT);
21306 fp_hi = gen_reg_rtx (SFmode);
21307 fp_lo = gen_reg_rtx (SFmode);
21308 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
21309 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
21310 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
21311 0, OPTAB_DIRECT);
21312 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
21313 0, OPTAB_DIRECT);
21314 if (!rtx_equal_p (target, fp_hi))
21315 emit_move_insn (target, fp_hi);
21316 }
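/* Illustrative scalar sketch (not part of the original sources): the
   expansion above corresponds to

     float
     uns_si_to_sf (unsigned int x)
     {
       float hi = (float) (int) (x >> 16);
       float lo = (float) (int) (x & 0xffff);
       return hi * 65536.0f + lo;
     }

   Both halves convert exactly, hi * 65536.0f is exact, and the single
   final addition performs the only rounding step.  */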
21317
21318 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
21319 a vector of unsigned ints VAL to vector of floats TARGET. */
21320
21321 void
21322 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
21323 {
21324 rtx tmp[8];
21325 REAL_VALUE_TYPE TWO16r;
21326 machine_mode intmode = GET_MODE (val);
21327 machine_mode fltmode = GET_MODE (target);
21328 rtx (*cvt) (rtx, rtx);
21329
21330 if (intmode == V4SImode)
21331 cvt = gen_floatv4siv4sf2;
21332 else
21333 cvt = gen_floatv8siv8sf2;
21334 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
21335 tmp[0] = force_reg (intmode, tmp[0]);
21336 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
21337 OPTAB_DIRECT);
21338 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
21339 NULL_RTX, 1, OPTAB_DIRECT);
21340 tmp[3] = gen_reg_rtx (fltmode);
21341 emit_insn (cvt (tmp[3], tmp[1]));
21342 tmp[4] = gen_reg_rtx (fltmode);
21343 emit_insn (cvt (tmp[4], tmp[2]));
21344 real_ldexp (&TWO16r, &dconst1, 16);
21345 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
21346 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
21347 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
21348 OPTAB_DIRECT);
21349 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
21350 OPTAB_DIRECT);
21351 if (tmp[7] != target)
21352 emit_move_insn (target, tmp[7]);
21353 }
21354
21355 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
21356 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
21357 This is done by doing just signed conversion if < 0x1p31, and otherwise by
21358 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
21359
21360 rtx
21361 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
21362 {
21363 REAL_VALUE_TYPE TWO31r;
21364 rtx two31r, tmp[4];
21365 machine_mode mode = GET_MODE (val);
21366 machine_mode scalarmode = GET_MODE_INNER (mode);
21367 machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
21368 rtx (*cmp) (rtx, rtx, rtx, rtx);
21369 int i;
21370
21371 for (i = 0; i < 3; i++)
21372 tmp[i] = gen_reg_rtx (mode);
21373 real_ldexp (&TWO31r, &dconst1, 31);
21374 two31r = const_double_from_real_value (TWO31r, scalarmode);
21375 two31r = ix86_build_const_vector (mode, 1, two31r);
21376 two31r = force_reg (mode, two31r);
21377 switch (mode)
21378 {
21379 case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
21380 case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
21381 case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
21382 case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
21383 default: gcc_unreachable ();
21384 }
21385 tmp[3] = gen_rtx_LE (mode, two31r, val);
21386 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
21387 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
21388 0, OPTAB_DIRECT);
21389 if (intmode == V4SImode || TARGET_AVX2)
21390 *xorp = expand_simple_binop (intmode, ASHIFT,
21391 gen_lowpart (intmode, tmp[0]),
21392 GEN_INT (31), NULL_RTX, 0,
21393 OPTAB_DIRECT);
21394 else
21395 {
21396 rtx two31 = GEN_INT (HOST_WIDE_INT_1U << 31);
21397 two31 = ix86_build_const_vector (intmode, 1, two31);
21398 *xorp = expand_simple_binop (intmode, AND,
21399 gen_lowpart (intmode, tmp[0]),
21400 two31, NULL_RTX, 0,
21401 OPTAB_DIRECT);
21402 }
21403 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
21404 0, OPTAB_DIRECT);
21405 }
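/* Illustrative per-element sketch (not part of the original sources):
   for each lane the adjustment plus the later xor behaves like

     unsigned int
     fixuns_via_fix (double d)
     {
       unsigned int xor_mask = 0;
       if (d >= 2147483648.0)
         {
           d -= 2147483648.0;
           xor_mask = 0x80000000u;
         }
       return (unsigned int) (int) d ^ xor_mask;
     }

   where the compare/and/minus above compute the adjusted value and
   *XORP holds the per-lane xor_mask.  */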
21406
21407 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
21408 then replicate the value for all elements of the vector
21409 register. */
21410
21411 rtx
21412 ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
21413 {
21414 int i, n_elt;
21415 rtvec v;
21416 machine_mode scalar_mode;
21417
21418 switch (mode)
21419 {
21420 case E_V64QImode:
21421 case E_V32QImode:
21422 case E_V16QImode:
21423 case E_V32HImode:
21424 case E_V16HImode:
21425 case E_V8HImode:
21426 case E_V16SImode:
21427 case E_V8SImode:
21428 case E_V4SImode:
21429 case E_V8DImode:
21430 case E_V4DImode:
21431 case E_V2DImode:
21432 gcc_assert (vect);
21433 /* FALLTHRU */
21434 case E_V16SFmode:
21435 case E_V8SFmode:
21436 case E_V4SFmode:
21437 case E_V8DFmode:
21438 case E_V4DFmode:
21439 case E_V2DFmode:
21440 n_elt = GET_MODE_NUNITS (mode);
21441 v = rtvec_alloc (n_elt);
21442 scalar_mode = GET_MODE_INNER (mode);
21443
21444 RTVEC_ELT (v, 0) = value;
21445
21446 for (i = 1; i < n_elt; ++i)
21447 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
21448
21449 return gen_rtx_CONST_VECTOR (mode, v);
21450
21451 default:
21452 gcc_unreachable ();
21453 }
21454 }
21455
21456 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
21457 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
21458 for an SSE register. If VECT is true, then replicate the mask for
21459 all elements of the vector register. If INVERT is true, then create
21460 a mask excluding the sign bit. */
21461
21462 rtx
21463 ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
21464 {
21465 machine_mode vec_mode, imode;
21466 wide_int w;
21467 rtx mask, v;
21468
21469 switch (mode)
21470 {
21471 case E_V16SImode:
21472 case E_V16SFmode:
21473 case E_V8SImode:
21474 case E_V4SImode:
21475 case E_V8SFmode:
21476 case E_V4SFmode:
21477 vec_mode = mode;
21478 imode = SImode;
21479 break;
21480
21481 case E_V8DImode:
21482 case E_V4DImode:
21483 case E_V2DImode:
21484 case E_V8DFmode:
21485 case E_V4DFmode:
21486 case E_V2DFmode:
21487 vec_mode = mode;
21488 imode = DImode;
21489 break;
21490
21491 case E_TImode:
21492 case E_TFmode:
21493 vec_mode = VOIDmode;
21494 imode = TImode;
21495 break;
21496
21497 default:
21498 gcc_unreachable ();
21499 }
21500
21501 machine_mode inner_mode = GET_MODE_INNER (mode);
21502 w = wi::set_bit_in_zero (GET_MODE_BITSIZE (inner_mode) - 1,
21503 GET_MODE_BITSIZE (inner_mode));
21504 if (invert)
21505 w = wi::bit_not (w);
21506
21507 /* Force this value into the low part of a fp vector constant. */
21508 mask = immed_wide_int_const (w, imode);
21509 mask = gen_lowpart (inner_mode, mask);
21510
21511 if (vec_mode == VOIDmode)
21512 return force_reg (inner_mode, mask);
21513
21514 v = ix86_build_const_vector (vec_mode, vect, mask);
21515 return force_reg (vec_mode, v);
21516 }
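/* For example (illustrative only): ix86_build_signbit_mask (V4SFmode,
   true, false) yields { -0.0f, -0.0f, -0.0f, -0.0f }, i.e. 0x80000000
   replicated into every lane, while passing INVERT yields 0x7fffffff
   replicated, the mask used to implement ABS.  */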
21517
21518 /* Generate code for floating point ABS or NEG. */
21519
21520 void
21521 ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
21522 rtx operands[])
21523 {
21524 rtx mask, set, dst, src;
21525 bool use_sse = false;
21526 bool vector_mode = VECTOR_MODE_P (mode);
21527 machine_mode vmode = mode;
21528
21529 if (vector_mode)
21530 use_sse = true;
21531 else if (mode == TFmode)
21532 use_sse = true;
21533 else if (TARGET_SSE_MATH)
21534 {
21535 use_sse = SSE_FLOAT_MODE_P (mode);
21536 if (mode == SFmode)
21537 vmode = V4SFmode;
21538 else if (mode == DFmode)
21539 vmode = V2DFmode;
21540 }
21541
21542 /* NEG and ABS performed with SSE use bitwise mask operations.
21543 Create the appropriate mask now. */
21544 if (use_sse)
21545 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
21546 else
21547 mask = NULL_RTX;
21548
21549 dst = operands[0];
21550 src = operands[1];
21551
21552 set = gen_rtx_fmt_e (code, mode, src);
21553 set = gen_rtx_SET (dst, set);
21554
21555 if (mask)
21556 {
21557 rtx use, clob;
21558 rtvec par;
21559
21560 use = gen_rtx_USE (VOIDmode, mask);
21561 if (vector_mode)
21562 par = gen_rtvec (2, set, use);
21563 else
21564 {
21565 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
21566 par = gen_rtvec (3, set, use, clob);
21567 }
21568 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
21569 }
21570 else
21571 emit_insn (set);
21572 }
21573
21574 /* Expand a copysign operation. Special case operand 0 being a constant. */
21575
21576 void
21577 ix86_expand_copysign (rtx operands[])
21578 {
21579 machine_mode mode, vmode;
21580 rtx dest, op0, op1, mask, nmask;
21581
21582 dest = operands[0];
21583 op0 = operands[1];
21584 op1 = operands[2];
21585
21586 mode = GET_MODE (dest);
21587
21588 if (mode == SFmode)
21589 vmode = V4SFmode;
21590 else if (mode == DFmode)
21591 vmode = V2DFmode;
21592 else
21593 vmode = mode;
21594
21595 if (CONST_DOUBLE_P (op0))
21596 {
21597 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
21598
21599 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
21600 op0 = simplify_unary_operation (ABS, mode, op0, mode);
21601
21602 if (mode == SFmode || mode == DFmode)
21603 {
21604 if (op0 == CONST0_RTX (mode))
21605 op0 = CONST0_RTX (vmode);
21606 else
21607 {
21608 rtx v = ix86_build_const_vector (vmode, false, op0);
21609
21610 op0 = force_reg (vmode, v);
21611 }
21612 }
21613 else if (op0 != CONST0_RTX (mode))
21614 op0 = force_reg (mode, op0);
21615
21616 mask = ix86_build_signbit_mask (vmode, 0, 0);
21617
21618 if (mode == SFmode)
21619 copysign_insn = gen_copysignsf3_const;
21620 else if (mode == DFmode)
21621 copysign_insn = gen_copysigndf3_const;
21622 else
21623 copysign_insn = gen_copysigntf3_const;
21624
21625 emit_insn (copysign_insn (dest, op0, op1, mask));
21626 }
21627 else
21628 {
21629 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
21630
21631 nmask = ix86_build_signbit_mask (vmode, 0, 1);
21632 mask = ix86_build_signbit_mask (vmode, 0, 0);
21633
21634 if (mode == SFmode)
21635 copysign_insn = gen_copysignsf3_var;
21636 else if (mode == DFmode)
21637 copysign_insn = gen_copysigndf3_var;
21638 else
21639 copysign_insn = gen_copysigntf3_var;
21640
21641 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
21642 }
21643 }
21644
21645 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
21646 be a constant, and so has already been expanded into a vector constant. */
21647
21648 void
21649 ix86_split_copysign_const (rtx operands[])
21650 {
21651 machine_mode mode, vmode;
21652 rtx dest, op0, mask, x;
21653
21654 dest = operands[0];
21655 op0 = operands[1];
21656 mask = operands[3];
21657
21658 mode = GET_MODE (dest);
21659 vmode = GET_MODE (mask);
21660
21661 dest = lowpart_subreg (vmode, dest, mode);
21662 x = gen_rtx_AND (vmode, dest, mask);
21663 emit_insn (gen_rtx_SET (dest, x));
21664
21665 if (op0 != CONST0_RTX (vmode))
21666 {
21667 x = gen_rtx_IOR (vmode, dest, op0);
21668 emit_insn (gen_rtx_SET (dest, x));
21669 }
21670 }
21671
21672 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
21673 so we have to do two masks. */
21674
21675 void
21676 ix86_split_copysign_var (rtx operands[])
21677 {
21678 machine_mode mode, vmode;
21679 rtx dest, scratch, op0, op1, mask, nmask, x;
21680
21681 dest = operands[0];
21682 scratch = operands[1];
21683 op0 = operands[2];
21684 op1 = operands[3];
21685 nmask = operands[4];
21686 mask = operands[5];
21687
21688 mode = GET_MODE (dest);
21689 vmode = GET_MODE (mask);
21690
21691 if (rtx_equal_p (op0, op1))
21692 {
21693 /* Shouldn't happen often (it's useless, obviously), but when it does
21694 we'd generate incorrect code if we continue below. */
21695 emit_move_insn (dest, op0);
21696 return;
21697 }
21698
21699 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
21700 {
21701 gcc_assert (REGNO (op1) == REGNO (scratch));
21702
21703 x = gen_rtx_AND (vmode, scratch, mask);
21704 emit_insn (gen_rtx_SET (scratch, x));
21705
21706 dest = mask;
21707 op0 = lowpart_subreg (vmode, op0, mode);
21708 x = gen_rtx_NOT (vmode, dest);
21709 x = gen_rtx_AND (vmode, x, op0);
21710 emit_insn (gen_rtx_SET (dest, x));
21711 }
21712 else
21713 {
21714 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
21715 {
21716 x = gen_rtx_AND (vmode, scratch, mask);
21717 }
21718 else /* alternative 2,4 */
21719 {
21720 gcc_assert (REGNO (mask) == REGNO (scratch));
21721 op1 = lowpart_subreg (vmode, op1, mode);
21722 x = gen_rtx_AND (vmode, scratch, op1);
21723 }
21724 emit_insn (gen_rtx_SET (scratch, x));
21725
21726 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
21727 {
21728 dest = lowpart_subreg (vmode, op0, mode);
21729 x = gen_rtx_AND (vmode, dest, nmask);
21730 }
21731 else /* alternative 3,4 */
21732 {
21733 gcc_assert (REGNO (nmask) == REGNO (dest));
21734 dest = nmask;
21735 op0 = lowpart_subreg (vmode, op0, mode);
21736 x = gen_rtx_AND (vmode, dest, op0);
21737 }
21738 emit_insn (gen_rtx_SET (dest, x));
21739 }
21740
21741 x = gen_rtx_IOR (vmode, dest, scratch);
21742 emit_insn (gen_rtx_SET (dest, x));
21743 }
21744
21745 /* Return TRUE or FALSE depending on whether the first SET in INSN
21746 has source and destination with matching CC modes, and that the
21747 CC mode is at least as constrained as REQ_MODE. */
21748
21749 bool
21750 ix86_match_ccmode (rtx insn, machine_mode req_mode)
21751 {
21752 rtx set;
21753 machine_mode set_mode;
21754
21755 set = PATTERN (insn);
21756 if (GET_CODE (set) == PARALLEL)
21757 set = XVECEXP (set, 0, 0);
21758 gcc_assert (GET_CODE (set) == SET);
21759 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
21760
21761 set_mode = GET_MODE (SET_DEST (set));
21762 switch (set_mode)
21763 {
21764 case E_CCNOmode:
21765 if (req_mode != CCNOmode
21766 && (req_mode != CCmode
21767 || XEXP (SET_SRC (set), 1) != const0_rtx))
21768 return false;
21769 break;
21770 case E_CCmode:
21771 if (req_mode == CCGCmode)
21772 return false;
21773 /* FALLTHRU */
21774 case E_CCGCmode:
21775 if (req_mode == CCGOCmode || req_mode == CCNOmode)
21776 return false;
21777 /* FALLTHRU */
21778 case E_CCGOCmode:
21779 if (req_mode == CCZmode)
21780 return false;
21781 /* FALLTHRU */
21782 case E_CCZmode:
21783 break;
21784
21785 case E_CCGZmode:
21786
21787 case E_CCAmode:
21788 case E_CCCmode:
21789 case E_CCOmode:
21790 case E_CCPmode:
21791 case E_CCSmode:
21792 if (set_mode != req_mode)
21793 return false;
21794 break;
21795
21796 default:
21797 gcc_unreachable ();
21798 }
21799
21800 return GET_MODE (SET_SRC (set)) == set_mode;
21801 }
21802
21803 /* Generate insn patterns to do an integer compare of OPERANDS. */
21804
21805 static rtx
21806 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
21807 {
21808 machine_mode cmpmode;
21809 rtx tmp, flags;
21810
21811 cmpmode = SELECT_CC_MODE (code, op0, op1);
21812 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
21813
21814 /* This is very simple, but making the interface the same as in the
21815 FP case makes the rest of the code easier. */
21816 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
21817 emit_insn (gen_rtx_SET (flags, tmp));
21818
21819 /* Return the test that should be put into the flags user, i.e.
21820 the bcc, scc, or cmov instruction. */
21821 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21822 }
21823
21824 /* Figure out whether to use unordered fp comparisons. */
21825
21826 static bool
21827 ix86_unordered_fp_compare (enum rtx_code code)
21828 {
21829 if (!TARGET_IEEE_FP)
21830 return false;
21831
21832 switch (code)
21833 {
21834 case GT:
21835 case GE:
21836 case LT:
21837 case LE:
21838 return false;
21839
21840 case EQ:
21841 case NE:
21842
21843 case LTGT:
21844 case UNORDERED:
21845 case ORDERED:
21846 case UNLT:
21847 case UNLE:
21848 case UNGT:
21849 case UNGE:
21850 case UNEQ:
21851 return true;
21852
21853 default:
21854 gcc_unreachable ();
21855 }
21856 }
21857
21858 machine_mode
21859 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
21860 {
21861 machine_mode mode = GET_MODE (op0);
21862
21863 if (SCALAR_FLOAT_MODE_P (mode))
21864 {
21865 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
21866 return CCFPmode;
21867 }
21868
21869 switch (code)
21870 {
21871 /* Only zero flag is needed. */
21872 case EQ: /* ZF=0 */
21873 case NE: /* ZF!=0 */
21874 return CCZmode;
21875 /* Codes needing carry flag. */
21876 case GEU: /* CF=0 */
21877 case LTU: /* CF=1 */
21878 /* Detect overflow checks. They need just the carry flag. */
21879 if (GET_CODE (op0) == PLUS
21880 && (rtx_equal_p (op1, XEXP (op0, 0))
21881 || rtx_equal_p (op1, XEXP (op0, 1))))
21882 return CCCmode;
21883 else
21884 return CCmode;
21885 case GTU: /* CF=0 & ZF=0 */
21886 case LEU: /* CF=1 | ZF=1 */
21887 return CCmode;
21888 /* Codes possibly doable only with sign flag when
21889 comparing against zero. */
21890 case GE: /* SF=OF or SF=0 */
21891 case LT: /* SF<>OF or SF=1 */
21892 if (op1 == const0_rtx)
21893 return CCGOCmode;
21894 else
21895 /* For other cases the carry flag is not required. */
21896 return CCGCmode;
21897 /* Codes doable only with the sign flag when comparing
21898 against zero, but we miss the jump instruction for it,
21899 so we need to use relational tests against the overflow
21900 flag, which thus needs to be zero. */
21901 case GT: /* ZF=0 & SF=OF */
21902 case LE: /* ZF=1 | SF<>OF */
21903 if (op1 == const0_rtx)
21904 return CCNOmode;
21905 else
21906 return CCGCmode;
21907 /* The strcmp pattern does (use flags) and combine may ask us for the
21908 proper mode. */
21909 case USE:
21910 return CCmode;
21911 default:
21912 gcc_unreachable ();
21913 }
21914 }
21915
21916 /* Return the fixed registers used for condition codes. */
21917
21918 static bool
21919 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
21920 {
21921 *p1 = FLAGS_REG;
21922 *p2 = FPSR_REG;
21923 return true;
21924 }
21925
21926 /* If two condition code modes are compatible, return a condition code
21927 mode which is compatible with both. Otherwise, return
21928 VOIDmode. */
21929
21930 static machine_mode
21931 ix86_cc_modes_compatible (machine_mode m1, machine_mode m2)
21932 {
21933 if (m1 == m2)
21934 return m1;
21935
21936 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
21937 return VOIDmode;
21938
21939 if ((m1 == CCGCmode && m2 == CCGOCmode)
21940 || (m1 == CCGOCmode && m2 == CCGCmode))
21941 return CCGCmode;
21942
21943 if ((m1 == CCNOmode && m2 == CCGOCmode)
21944 || (m1 == CCGOCmode && m2 == CCNOmode))
21945 return CCNOmode;
21946
21947 if (m1 == CCZmode
21948 && (m2 == CCGCmode || m2 == CCGOCmode || m2 == CCNOmode))
21949 return m2;
21950 else if (m2 == CCZmode
21951 && (m1 == CCGCmode || m1 == CCGOCmode || m1 == CCNOmode))
21952 return m1;
21953
21954 switch (m1)
21955 {
21956 default:
21957 gcc_unreachable ();
21958
21959 case E_CCmode:
21960 case E_CCGCmode:
21961 case E_CCGOCmode:
21962 case E_CCNOmode:
21963 case E_CCAmode:
21964 case E_CCCmode:
21965 case E_CCOmode:
21966 case E_CCPmode:
21967 case E_CCSmode:
21968 case E_CCZmode:
21969 switch (m2)
21970 {
21971 default:
21972 return VOIDmode;
21973
21974 case E_CCmode:
21975 case E_CCGCmode:
21976 case E_CCGOCmode:
21977 case E_CCNOmode:
21978 case E_CCAmode:
21979 case E_CCCmode:
21980 case E_CCOmode:
21981 case E_CCPmode:
21982 case E_CCSmode:
21983 case E_CCZmode:
21984 return CCmode;
21985 }
21986
21987 case E_CCFPmode:
21988 /* These are only compatible with themselves, which we already
21989 checked above. */
21990 return VOIDmode;
21991 }
21992 }
21993
21994
21995 /* Return a comparison we can do that is equivalent to
21996 swap_condition (code), apart possibly from orderedness.
21997 But never change orderedness if TARGET_IEEE_FP, returning
21998 UNKNOWN in that case if necessary. */
21999
22000 static enum rtx_code
22001 ix86_fp_swap_condition (enum rtx_code code)
22002 {
22003 switch (code)
22004 {
22005 case GT: /* GTU - CF=0 & ZF=0 */
22006 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
22007 case GE: /* GEU - CF=0 */
22008 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
22009 case UNLT: /* LTU - CF=1 */
22010 return TARGET_IEEE_FP ? UNKNOWN : GT;
22011 case UNLE: /* LEU - CF=1 | ZF=1 */
22012 return TARGET_IEEE_FP ? UNKNOWN : GE;
22013 default:
22014 return swap_condition (code);
22015 }
22016 }
22017
22018 /* Return the cost of comparison CODE using the best strategy for performance.
22019 All of the following functions use the number of instructions as a cost metric.
22020 In the future this should be tweaked to compute bytes for optimize_size and
22021 take into account the performance of various instructions on various CPUs. */
22022
22023 static int
22024 ix86_fp_comparison_cost (enum rtx_code code)
22025 {
22026 int arith_cost;
22027
22028 /* The cost of code using bit-twiddling on %ah. */
22029 switch (code)
22030 {
22031 case UNLE:
22032 case UNLT:
22033 case LTGT:
22034 case GT:
22035 case GE:
22036 case UNORDERED:
22037 case ORDERED:
22038 case UNEQ:
22039 arith_cost = 4;
22040 break;
22041 case LT:
22042 case NE:
22043 case EQ:
22044 case UNGE:
22045 arith_cost = TARGET_IEEE_FP ? 5 : 4;
22046 break;
22047 case LE:
22048 case UNGT:
22049 arith_cost = TARGET_IEEE_FP ? 6 : 4;
22050 break;
22051 default:
22052 gcc_unreachable ();
22053 }
22054
22055 switch (ix86_fp_comparison_strategy (code))
22056 {
22057 case IX86_FPCMP_COMI:
22058 return arith_cost > 4 ? 3 : 2;
22059 case IX86_FPCMP_SAHF:
22060 return arith_cost > 4 ? 4 : 3;
22061 default:
22062 return arith_cost;
22063 }
22064 }
22065
22066 /* Return the strategy to use for floating-point comparisons. We assume that
22067 fcomi is always preferable where available, since that is also true when
22068 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
22069
22070 enum ix86_fpcmp_strategy
22071 ix86_fp_comparison_strategy (enum rtx_code)
22072 {
22073 /* Do fcomi/sahf based test when profitable. */
22074
22075 if (TARGET_CMOVE)
22076 return IX86_FPCMP_COMI;
22077
22078 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
22079 return IX86_FPCMP_SAHF;
22080
22081 return IX86_FPCMP_ARITH;
22082 }
22083
22084 /* Swap, force into registers, or otherwise massage the two operands
22085 to an fp comparison. The operands are updated in place; the new
22086 comparison code is returned. */
22087
22088 static enum rtx_code
22089 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
22090 {
22091 bool unordered_compare = ix86_unordered_fp_compare (code);
22092 rtx op0 = *pop0, op1 = *pop1;
22093 machine_mode op_mode = GET_MODE (op0);
22094 bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
22095
22096 /* All of the unordered compare instructions only work on registers.
22097 The same is true of the fcomi compare instructions. The XFmode
22098 compare instructions require registers except when comparing
22099 against zero or when converting operand 1 from fixed point to
22100 floating point. */
22101
22102 if (!is_sse
22103 && (unordered_compare
22104 || (op_mode == XFmode
22105 && ! (standard_80387_constant_p (op0) == 1
22106 || standard_80387_constant_p (op1) == 1)
22107 && GET_CODE (op1) != FLOAT)
22108 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
22109 {
22110 op0 = force_reg (op_mode, op0);
22111 op1 = force_reg (op_mode, op1);
22112 }
22113 else
22114 {
22115 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
22116 things around if they appear profitable, otherwise force op0
22117 into a register. */
22118
22119 if (standard_80387_constant_p (op0) == 0
22120 || (MEM_P (op0)
22121 && ! (standard_80387_constant_p (op1) == 0
22122 || MEM_P (op1))))
22123 {
22124 enum rtx_code new_code = ix86_fp_swap_condition (code);
22125 if (new_code != UNKNOWN)
22126 {
22127 std::swap (op0, op1);
22128 code = new_code;
22129 }
22130 }
22131
22132 if (!REG_P (op0))
22133 op0 = force_reg (op_mode, op0);
22134
22135 if (CONSTANT_P (op1))
22136 {
22137 int tmp = standard_80387_constant_p (op1);
22138 if (tmp == 0)
22139 op1 = validize_mem (force_const_mem (op_mode, op1));
22140 else if (tmp == 1)
22141 {
22142 if (TARGET_CMOVE)
22143 op1 = force_reg (op_mode, op1);
22144 }
22145 else
22146 op1 = force_reg (op_mode, op1);
22147 }
22148 }
22149
22150 /* Try to rearrange the comparison to make it cheaper. */
22151 if (ix86_fp_comparison_cost (code)
22152 > ix86_fp_comparison_cost (swap_condition (code))
22153 && (REG_P (op1) || can_create_pseudo_p ()))
22154 {
22155 std::swap (op0, op1);
22156 code = swap_condition (code);
22157 if (!REG_P (op0))
22158 op0 = force_reg (op_mode, op0);
22159 }
22160
22161 *pop0 = op0;
22162 *pop1 = op1;
22163 return code;
22164 }
22165
22166 /* Convert the comparison codes we use to represent an FP comparison to the
22167 integer code that will result in a proper branch. Return UNKNOWN if no
22168 such code is available. */
22169
22170 enum rtx_code
22171 ix86_fp_compare_code_to_integer (enum rtx_code code)
22172 {
22173 switch (code)
22174 {
22175 case GT:
22176 return GTU;
22177 case GE:
22178 return GEU;
22179 case ORDERED:
22180 case UNORDERED:
22181 return code;
22182 case UNEQ:
22183 return EQ;
22184 case UNLT:
22185 return LTU;
22186 case UNLE:
22187 return LEU;
22188 case LTGT:
22189 return NE;
22190 default:
22191 return UNKNOWN;
22192 }
22193 }
22194
22195 /* Generate insn patterns to do a floating point compare of OPERANDS. */
22196
22197 static rtx
22198 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
22199 {
22200 bool unordered_compare = ix86_unordered_fp_compare (code);
22201 machine_mode intcmp_mode;
22202 rtx tmp, tmp2;
22203
22204 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
22205
22206 /* Do fcomi/sahf based test when profitable. */
22207 switch (ix86_fp_comparison_strategy (code))
22208 {
22209 case IX86_FPCMP_COMI:
22210 intcmp_mode = CCFPmode;
22211 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22212 if (unordered_compare)
22213 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22214 emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
22215 break;
22216
22217 case IX86_FPCMP_SAHF:
22218 intcmp_mode = CCFPmode;
22219 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22220 if (unordered_compare)
22221 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22222 tmp = gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp);
22223 if (!scratch)
22224 scratch = gen_reg_rtx (HImode);
22225 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
22226 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
22227 break;
22228
22229 case IX86_FPCMP_ARITH:
22230 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
22231 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
22232 if (unordered_compare)
22233 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
22234 tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
22235 if (!scratch)
22236 scratch = gen_reg_rtx (HImode);
22237 emit_insn (gen_rtx_SET (scratch, tmp));
22238
22239 /* In the unordered case, we have to check C2 for NaNs, which
22240 doesn't happen to work out to anything nice combination-wise.
22241 So do some bit twiddling on the value we've got in AH to come
22242 up with an appropriate set of condition codes. */
22243
22244 intcmp_mode = CCNOmode;
22245 switch (code)
22246 {
22247 case GT:
22248 case UNGT:
22249 if (code == GT || !TARGET_IEEE_FP)
22250 {
22251 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22252 code = EQ;
22253 }
22254 else
22255 {
22256 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22257 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22258 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
22259 intcmp_mode = CCmode;
22260 code = GEU;
22261 }
22262 break;
22263 case LT:
22264 case UNLT:
22265 if (code == LT && TARGET_IEEE_FP)
22266 {
22267 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22268 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
22269 intcmp_mode = CCmode;
22270 code = EQ;
22271 }
22272 else
22273 {
22274 emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
22275 code = NE;
22276 }
22277 break;
22278 case GE:
22279 case UNGE:
22280 if (code == GE || !TARGET_IEEE_FP)
22281 {
22282 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
22283 code = EQ;
22284 }
22285 else
22286 {
22287 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22288 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
22289 code = NE;
22290 }
22291 break;
22292 case LE:
22293 case UNLE:
22294 if (code == LE && TARGET_IEEE_FP)
22295 {
22296 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22297 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
22298 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22299 intcmp_mode = CCmode;
22300 code = LTU;
22301 }
22302 else
22303 {
22304 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
22305 code = NE;
22306 }
22307 break;
22308 case EQ:
22309 case UNEQ:
22310 if (code == EQ && TARGET_IEEE_FP)
22311 {
22312 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22313 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
22314 intcmp_mode = CCmode;
22315 code = EQ;
22316 }
22317 else
22318 {
22319 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22320 code = NE;
22321 }
22322 break;
22323 case NE:
22324 case LTGT:
22325 if (code == NE && TARGET_IEEE_FP)
22326 {
22327 emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
22328 emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
22329 GEN_INT (0x40)));
22330 code = NE;
22331 }
22332 else
22333 {
22334 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
22335 code = EQ;
22336 }
22337 break;
22338
22339 case UNORDERED:
22340 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22341 code = NE;
22342 break;
22343 case ORDERED:
22344 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
22345 code = EQ;
22346 break;
22347
22348 default:
22349 gcc_unreachable ();
22350 }
22351 break;
22352
22353 default:
22354 gcc_unreachable ();
22355 }
22356
22357 /* Return the test that should be put into the flags user, i.e.
22358 the bcc, scc, or cmov instruction. */
22359 return gen_rtx_fmt_ee (code, VOIDmode,
22360 gen_rtx_REG (intcmp_mode, FLAGS_REG),
22361 const0_rtx);
22362 }
22363
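/* Expand a comparison of OP0 and OP1 using CODE.  Dispatch to the flags,
   floating-point or integer expander as appropriate and return the RTX for
   the test that the flags user (bcc, scc or cmov) should consume.  */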
22364 static rtx
22365 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
22366 {
22367 rtx ret;
22368
22369 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
22370 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
22371
22372 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
22373 {
22374 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
22375 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22376 }
22377 else
22378 ret = ix86_expand_int_compare (code, op0, op1);
22379
22380 return ret;
22381 }
22382
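/* Expand a conditional branch to LABEL that is taken when OP0 CODE OP1 holds,
   splitting double-word comparisons into multiple compare-and-branch
   sequences where necessary.  */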
22383 void
22384 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
22385 {
22386 machine_mode mode = GET_MODE (op0);
22387 rtx tmp;
22388
22389 /* Handle the special case of a vector comparison with a boolean result;
22390 transform it using the ptest instruction. */
22391 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
22392 {
22393 rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
22394 machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;
22395
22396 gcc_assert (code == EQ || code == NE);
22397 /* Generate XOR since we can't check that one operand is a zero vector. */
22398 tmp = gen_reg_rtx (mode);
22399 emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
22400 tmp = gen_lowpart (p_mode, tmp);
22401 emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
22402 gen_rtx_UNSPEC (CCmode,
22403 gen_rtvec (2, tmp, tmp),
22404 UNSPEC_PTEST)));
22405 tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
22406 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22407 gen_rtx_LABEL_REF (VOIDmode, label),
22408 pc_rtx);
22409 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22410 return;
22411 }
22412
22413 switch (mode)
22414 {
22415 case E_SFmode:
22416 case E_DFmode:
22417 case E_XFmode:
22418 case E_QImode:
22419 case E_HImode:
22420 case E_SImode:
22421 simple:
22422 tmp = ix86_expand_compare (code, op0, op1);
22423 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
22424 gen_rtx_LABEL_REF (VOIDmode, label),
22425 pc_rtx);
22426 emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
22427 return;
22428
22429 case E_DImode:
22430 if (TARGET_64BIT)
22431 goto simple;
22432 /* For a 32-bit target, DImode comparisons may be performed on
22433 SSE registers. To allow this we should avoid splitting
22434 to SImode, which is achieved by doing the xor in DImode
22435 and then comparing with zero (which is recognized by the
22436 STV pass). We don't compare using xor when optimizing
22437 for size. */
22438 if (!optimize_insn_for_size_p ()
22439 && TARGET_STV
22440 && (code == EQ || code == NE))
22441 {
22442 op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
22443 op1 = const0_rtx;
22444 }
22445 /* FALLTHRU */
22446 case E_TImode:
22447 /* Expand a double-word branch into multiple compare+branch sequences. */
22448 {
22449 rtx lo[2], hi[2];
22450 rtx_code_label *label2;
22451 enum rtx_code code1, code2, code3;
22452 machine_mode submode;
22453
22454 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
22455 {
22456 std::swap (op0, op1);
22457 code = swap_condition (code);
22458 }
22459
22460 split_double_mode (mode, &op0, 1, lo+0, hi+0);
22461 split_double_mode (mode, &op1, 1, lo+1, hi+1);
22462
22463 submode = mode == DImode ? SImode : DImode;
22464
22465 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
22466 avoid two branches. This costs one extra insn, so disable when
22467 optimizing for size. */
22468
22469 if ((code == EQ || code == NE)
22470 && (!optimize_insn_for_size_p ()
22471 || hi[1] == const0_rtx || lo[1] == const0_rtx))
22472 {
22473 rtx xor0, xor1;
22474
22475 xor1 = hi[0];
22476 if (hi[1] != const0_rtx)
22477 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
22478 NULL_RTX, 0, OPTAB_WIDEN);
22479
22480 xor0 = lo[0];
22481 if (lo[1] != const0_rtx)
22482 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
22483 NULL_RTX, 0, OPTAB_WIDEN);
22484
22485 tmp = expand_binop (submode, ior_optab, xor1, xor0,
22486 NULL_RTX, 0, OPTAB_WIDEN);
22487
22488 ix86_expand_branch (code, tmp, const0_rtx, label);
22489 return;
22490 }
22491
22492 /* Otherwise, if we are doing a less-than or greater-or-equal-than
22493 comparison, op1 is a constant and the low word is zero, then we can
22494 just examine the high word. Similarly for a low word of -1 and
22495 less-or-equal-than or greater-than. */
22496
22497 if (CONST_INT_P (hi[1]))
22498 switch (code)
22499 {
22500 case LT: case LTU: case GE: case GEU:
22501 if (lo[1] == const0_rtx)
22502 {
22503 ix86_expand_branch (code, hi[0], hi[1], label);
22504 return;
22505 }
22506 break;
22507 case LE: case LEU: case GT: case GTU:
22508 if (lo[1] == constm1_rtx)
22509 {
22510 ix86_expand_branch (code, hi[0], hi[1], label);
22511 return;
22512 }
22513 break;
22514 default:
22515 break;
22516 }
22517
22518 /* Emulate comparisons that do not depend on the Zero flag with
22519 a double-word subtraction. Note that only the Overflow, Sign
22520 and Carry flags are valid, so swap the arguments and condition
22521 of comparisons that would otherwise test the Zero flag. */
22522
22523 switch (code)
22524 {
22525 case LE: case LEU: case GT: case GTU:
22526 std::swap (lo[0], lo[1]);
22527 std::swap (hi[0], hi[1]);
22528 code = swap_condition (code);
22529 /* FALLTHRU */
22530
22531 case LT: case LTU: case GE: case GEU:
22532 {
22533 rtx (*cmp_insn) (rtx, rtx);
22534 rtx (*sbb_insn) (rtx, rtx, rtx);
22535 bool uns = (code == LTU || code == GEU);
22536
22537 if (TARGET_64BIT)
22538 {
22539 cmp_insn = gen_cmpdi_1;
22540 sbb_insn
22541 = uns ? gen_subdi3_carry_ccc : gen_subdi3_carry_ccgz;
22542 }
22543 else
22544 {
22545 cmp_insn = gen_cmpsi_1;
22546 sbb_insn
22547 = uns ? gen_subsi3_carry_ccc : gen_subsi3_carry_ccgz;
22548 }
22549
22550 if (!nonimmediate_operand (lo[0], submode))
22551 lo[0] = force_reg (submode, lo[0]);
22552 if (!x86_64_general_operand (lo[1], submode))
22553 lo[1] = force_reg (submode, lo[1]);
22554
22555 if (!register_operand (hi[0], submode))
22556 hi[0] = force_reg (submode, hi[0]);
22557 if ((uns && !nonimmediate_operand (hi[1], submode))
22558 || (!uns && !x86_64_general_operand (hi[1], submode)))
22559 hi[1] = force_reg (submode, hi[1]);
22560
22561 emit_insn (cmp_insn (lo[0], lo[1]));
22562 emit_insn (sbb_insn (gen_rtx_SCRATCH (submode), hi[0], hi[1]));
22563
22564 tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
22565
22566 ix86_expand_branch (code, tmp, const0_rtx, label);
22567 return;
22568 }
22569
22570 default:
22571 break;
22572 }
22573
22574 /* Otherwise, we need two or three jumps. */
22575
22576 label2 = gen_label_rtx ();
22577
22578 code1 = code;
22579 code2 = swap_condition (code);
22580 code3 = unsigned_condition (code);
22581
22582 switch (code)
22583 {
22584 case LT: case GT: case LTU: case GTU:
22585 break;
22586
22587 case LE: code1 = LT; code2 = GT; break;
22588 case GE: code1 = GT; code2 = LT; break;
22589 case LEU: code1 = LTU; code2 = GTU; break;
22590 case GEU: code1 = GTU; code2 = LTU; break;
22591
22592 case EQ: code1 = UNKNOWN; code2 = NE; break;
22593 case NE: code2 = UNKNOWN; break;
22594
22595 default:
22596 gcc_unreachable ();
22597 }
22598
22599 /*
22600 * a < b =>
22601 * if (hi(a) < hi(b)) goto true;
22602 * if (hi(a) > hi(b)) goto false;
22603 * if (lo(a) < lo(b)) goto true;
22604 * false:
22605 */
22606
22607 if (code1 != UNKNOWN)
22608 ix86_expand_branch (code1, hi[0], hi[1], label);
22609 if (code2 != UNKNOWN)
22610 ix86_expand_branch (code2, hi[0], hi[1], label2);
22611
22612 ix86_expand_branch (code3, lo[0], lo[1], label);
22613
22614 if (code2 != UNKNOWN)
22615 emit_label (label2);
22616 return;
22617 }
22618
22619 default:
22620 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
22621 goto simple;
22622 }
22623 }
22624
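/* Expand a store-flag operation: set the QImode register DEST to the result
   of comparing OP0 and OP1 with CODE.  */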
22625 void
22626 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
22627 {
22628 rtx ret;
22629
22630 gcc_assert (GET_MODE (dest) == QImode);
22631
22632 ret = ix86_expand_compare (code, op0, op1);
22633 PUT_MODE (ret, QImode);
22634 emit_insn (gen_rtx_SET (dest, ret));
22635 }
22636
22637 /* Expand a comparison setting or clearing the carry flag. Return true when
22638 successful and set *POP for the operation. */
22639 static bool
22640 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
22641 {
22642 machine_mode mode =
22643 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
22644
22645 /* Do not handle double-mode compares that go through the special path. */
22646 if (mode == (TARGET_64BIT ? TImode : DImode))
22647 return false;
22648
22649 if (SCALAR_FLOAT_MODE_P (mode))
22650 {
22651 rtx compare_op;
22652 rtx_insn *compare_seq;
22653
22654 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
22655
22656 /* Shortcut: the following common codes never translate
22657 into carry flag compares. */
22658 if (code == EQ || code == NE || code == UNEQ || code == LTGT
22659 || code == ORDERED || code == UNORDERED)
22660 return false;
22661
22662 /* These comparisons require the zero flag; swap the operands so they don't. */
22663 if ((code == GT || code == UNLE || code == LE || code == UNGT)
22664 && !TARGET_IEEE_FP)
22665 {
22666 std::swap (op0, op1);
22667 code = swap_condition (code);
22668 }
22669
22670 /* Try to expand the comparison and verify that we end up with
22671 a carry flag based comparison. This fails to be true only when
22672 we decide to expand the comparison using arithmetic, which is
22673 not a common scenario. */
22674 start_sequence ();
22675 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
22676 compare_seq = get_insns ();
22677 end_sequence ();
22678
22679 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
22680 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
22681 else
22682 code = GET_CODE (compare_op);
22683
22684 if (code != LTU && code != GEU)
22685 return false;
22686
22687 emit_insn (compare_seq);
22688 *pop = compare_op;
22689 return true;
22690 }
22691
22692 if (!INTEGRAL_MODE_P (mode))
22693 return false;
22694
22695 switch (code)
22696 {
22697 case LTU:
22698 case GEU:
22699 break;
22700
22701 /* Convert a==0 into (unsigned)a<1. */
22702 case EQ:
22703 case NE:
22704 if (op1 != const0_rtx)
22705 return false;
22706 op1 = const1_rtx;
22707 code = (code == EQ ? LTU : GEU);
22708 break;
22709
22710 /* Convert a>b into b<a or a>=b+1. */
22711 case GTU:
22712 case LEU:
22713 if (CONST_INT_P (op1))
22714 {
22715 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
22716 /* Bail out on overflow. We could still swap the operands, but that
22717 would force loading of the constant into a register. */
22718 if (op1 == const0_rtx
22719 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
22720 return false;
22721 code = (code == GTU ? GEU : LTU);
22722 }
22723 else
22724 {
22725 std::swap (op0, op1);
22726 code = (code == GTU ? LTU : GEU);
22727 }
22728 break;
22729
22730 /* Convert a>=0 into (unsigned)a<0x80000000. */
22731 case LT:
22732 case GE:
22733 if (mode == DImode || op1 != const0_rtx)
22734 return false;
22735 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22736 code = (code == LT ? GEU : LTU);
22737 break;
22738 case LE:
22739 case GT:
22740 if (mode == DImode || op1 != constm1_rtx)
22741 return false;
22742 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
22743 code = (code == LE ? GEU : LTU);
22744 break;
22745
22746 default:
22747 return false;
22748 }
22749 /* Swapping operands may cause a constant to appear as the first operand. */
22750 if (!nonimmediate_operand (op0, VOIDmode))
22751 {
22752 if (!can_create_pseudo_p ())
22753 return false;
22754 op0 = force_reg (mode, op0);
22755 }
22756 *pop = ix86_expand_compare (code, op0, op1);
22757 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
22758 return true;
22759 }
22760
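/* Expand an integer conditional move.  operands[0] is the destination,
   operands[1] the comparison, and operands[2]/operands[3] the values to
   select between.  Depending on the constants involved this uses sbb,
   setcc/lea or cmov sequences.  Return true if the expansion succeeded.  */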
22761 bool
22762 ix86_expand_int_movcc (rtx operands[])
22763 {
22764 enum rtx_code code = GET_CODE (operands[1]), compare_code;
22765 rtx_insn *compare_seq;
22766 rtx compare_op;
22767 machine_mode mode = GET_MODE (operands[0]);
22768 bool sign_bit_compare_p = false;
22769 rtx op0 = XEXP (operands[1], 0);
22770 rtx op1 = XEXP (operands[1], 1);
22771
22772 if (GET_MODE (op0) == TImode
22773 || (GET_MODE (op0) == DImode
22774 && !TARGET_64BIT))
22775 return false;
22776
22777 start_sequence ();
22778 compare_op = ix86_expand_compare (code, op0, op1);
22779 compare_seq = get_insns ();
22780 end_sequence ();
22781
22782 compare_code = GET_CODE (compare_op);
22783
22784 if ((op1 == const0_rtx && (code == GE || code == LT))
22785 || (op1 == constm1_rtx && (code == GT || code == LE)))
22786 sign_bit_compare_p = true;
22787
22788 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
22789 HImode insns, we'd be swallowed in word prefix ops. */
22790
22791 if ((mode != HImode || TARGET_FAST_PREFIX)
22792 && (mode != (TARGET_64BIT ? TImode : DImode))
22793 && CONST_INT_P (operands[2])
22794 && CONST_INT_P (operands[3]))
22795 {
22796 rtx out = operands[0];
22797 HOST_WIDE_INT ct = INTVAL (operands[2]);
22798 HOST_WIDE_INT cf = INTVAL (operands[3]);
22799 HOST_WIDE_INT diff;
22800
22801 diff = ct - cf;
22802 /* Sign bit compares are better done using shifts than by using
22803 sbb. */
22804 if (sign_bit_compare_p
22805 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
22806 {
22807 /* Detect overlap between destination and compare sources. */
22808 rtx tmp = out;
22809
22810 if (!sign_bit_compare_p)
22811 {
22812 rtx flags;
22813 bool fpcmp = false;
22814
22815 compare_code = GET_CODE (compare_op);
22816
22817 flags = XEXP (compare_op, 0);
22818
22819 if (GET_MODE (flags) == CCFPmode)
22820 {
22821 fpcmp = true;
22822 compare_code
22823 = ix86_fp_compare_code_to_integer (compare_code);
22824 }
22825
22826 /* To simplify rest of code, restrict to the GEU case. */
22827 if (compare_code == LTU)
22828 {
22829 std::swap (ct, cf);
22830 compare_code = reverse_condition (compare_code);
22831 code = reverse_condition (code);
22832 }
22833 else
22834 {
22835 if (fpcmp)
22836 PUT_CODE (compare_op,
22837 reverse_condition_maybe_unordered
22838 (GET_CODE (compare_op)));
22839 else
22840 PUT_CODE (compare_op,
22841 reverse_condition (GET_CODE (compare_op)));
22842 }
22843 diff = ct - cf;
22844
22845 if (reg_overlap_mentioned_p (out, op0)
22846 || reg_overlap_mentioned_p (out, op1))
22847 tmp = gen_reg_rtx (mode);
22848
22849 if (mode == DImode)
22850 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
22851 else
22852 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
22853 flags, compare_op));
22854 }
22855 else
22856 {
22857 if (code == GT || code == GE)
22858 code = reverse_condition (code);
22859 else
22860 {
22861 std::swap (ct, cf);
22862 diff = ct - cf;
22863 }
22864 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
22865 }
22866
22867 if (diff == 1)
22868 {
22869 /*
22870 * cmpl op0,op1
22871 * sbbl dest,dest
22872 * [addl dest, ct]
22873 *
22874 * Size 5 - 8.
22875 */
22876 if (ct)
22877 tmp = expand_simple_binop (mode, PLUS,
22878 tmp, GEN_INT (ct),
22879 copy_rtx (tmp), 1, OPTAB_DIRECT);
22880 }
22881 else if (cf == -1)
22882 {
22883 /*
22884 * cmpl op0,op1
22885 * sbbl dest,dest
22886 * orl $ct, dest
22887 *
22888 * Size 8.
22889 */
22890 tmp = expand_simple_binop (mode, IOR,
22891 tmp, GEN_INT (ct),
22892 copy_rtx (tmp), 1, OPTAB_DIRECT);
22893 }
22894 else if (diff == -1 && ct)
22895 {
22896 /*
22897 * cmpl op0,op1
22898 * sbbl dest,dest
22899 * notl dest
22900 * [addl dest, cf]
22901 *
22902 * Size 8 - 11.
22903 */
22904 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22905 if (cf)
22906 tmp = expand_simple_binop (mode, PLUS,
22907 copy_rtx (tmp), GEN_INT (cf),
22908 copy_rtx (tmp), 1, OPTAB_DIRECT);
22909 }
22910 else
22911 {
22912 /*
22913 * cmpl op0,op1
22914 * sbbl dest,dest
22915 * [notl dest]
22916 * andl cf - ct, dest
22917 * [addl dest, ct]
22918 *
22919 * Size 8 - 11.
22920 */
22921
22922 if (cf == 0)
22923 {
22924 cf = ct;
22925 ct = 0;
22926 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
22927 }
22928
22929 tmp = expand_simple_binop (mode, AND,
22930 copy_rtx (tmp),
22931 gen_int_mode (cf - ct, mode),
22932 copy_rtx (tmp), 1, OPTAB_DIRECT);
22933 if (ct)
22934 tmp = expand_simple_binop (mode, PLUS,
22935 copy_rtx (tmp), GEN_INT (ct),
22936 copy_rtx (tmp), 1, OPTAB_DIRECT);
22937 }
22938
22939 if (!rtx_equal_p (tmp, out))
22940 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
22941
22942 return true;
22943 }
22944
22945 if (diff < 0)
22946 {
22947 machine_mode cmp_mode = GET_MODE (op0);
22948 enum rtx_code new_code;
22949
22950 if (SCALAR_FLOAT_MODE_P (cmp_mode))
22951 {
22952 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
22953
22954 /* We may be reversing an unordered compare to a normal compare, which
22955 is not valid in general (we may convert a non-trapping condition
22956 to a trapping one); however, on i386 we currently emit all
22957 comparisons unordered. */
22958 new_code = reverse_condition_maybe_unordered (code);
22959 }
22960 else
22961 new_code = ix86_reverse_condition (code, cmp_mode);
22962 if (new_code != UNKNOWN)
22963 {
22964 std::swap (ct, cf);
22965 diff = -diff;
22966 code = new_code;
22967 }
22968 }
22969
22970 compare_code = UNKNOWN;
22971 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
22972 && CONST_INT_P (op1))
22973 {
22974 if (op1 == const0_rtx
22975 && (code == LT || code == GE))
22976 compare_code = code;
22977 else if (op1 == constm1_rtx)
22978 {
22979 if (code == LE)
22980 compare_code = LT;
22981 else if (code == GT)
22982 compare_code = GE;
22983 }
22984 }
22985
22986 /* Optimize dest = (op0 < 0) ? -1 : cf. */
22987 if (compare_code != UNKNOWN
22988 && GET_MODE (op0) == GET_MODE (out)
22989 && (cf == -1 || ct == -1))
22990 {
22991 /* If lea code below could be used, only optimize
22992 if it results in a 2 insn sequence. */
22993
22994 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
22995 || diff == 3 || diff == 5 || diff == 9)
22996 || (compare_code == LT && ct == -1)
22997 || (compare_code == GE && cf == -1))
22998 {
22999 /*
23000 * notl op1 (if necessary)
23001 * sarl $31, op1
23002 * orl cf, op1
23003 */
23004 if (ct != -1)
23005 {
23006 cf = ct;
23007 ct = -1;
23008 code = reverse_condition (code);
23009 }
23010
23011 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23012
23013 out = expand_simple_binop (mode, IOR,
23014 out, GEN_INT (cf),
23015 out, 1, OPTAB_DIRECT);
23016 if (out != operands[0])
23017 emit_move_insn (operands[0], out);
23018
23019 return true;
23020 }
23021 }
23022
23023
23024 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
23025 || diff == 3 || diff == 5 || diff == 9)
23026 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
23027 && (mode != DImode
23028 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
23029 {
23030 /*
23031 * xorl dest,dest
23032 * cmpl op1,op2
23033 * setcc dest
23034 * lea cf(dest*(ct-cf)),dest
23035 *
23036 * Size 14.
23037 *
23038 * This also catches the degenerate setcc-only case.
23039 */
23040
23041 rtx tmp;
23042 int nops;
23043
23044 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23045
23046 nops = 0;
23047 /* On x86_64 the lea instruction operates on Pmode, so we need
23048 to get the arithmetic done in the proper mode to match. */
23049 if (diff == 1)
23050 tmp = copy_rtx (out);
23051 else
23052 {
23053 rtx out1;
23054 out1 = copy_rtx (out);
23055 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
23056 nops++;
23057 if (diff & 1)
23058 {
23059 tmp = gen_rtx_PLUS (mode, tmp, out1);
23060 nops++;
23061 }
23062 }
23063 if (cf != 0)
23064 {
23065 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
23066 nops++;
23067 }
23068 if (!rtx_equal_p (tmp, out))
23069 {
23070 if (nops == 1)
23071 out = force_operand (tmp, copy_rtx (out));
23072 else
23073 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
23074 }
23075 if (!rtx_equal_p (out, operands[0]))
23076 emit_move_insn (operands[0], copy_rtx (out));
23077
23078 return true;
23079 }
23080
23081 /*
23082 * General case: Jumpful:
23083 * xorl dest,dest cmpl op1, op2
23084 * cmpl op1, op2 movl ct, dest
23085 * setcc dest jcc 1f
23086 * decl dest movl cf, dest
23087 * andl (cf-ct),dest 1:
23088 * addl ct,dest
23089 *
23090 * Size 20. Size 14.
23091 *
23092 * This is reasonably steep, but branch mispredict costs are
23093 * high on modern cpus, so consider failing only if optimizing
23094 * for space.
23095 */
23096
23097 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23098 && BRANCH_COST (optimize_insn_for_speed_p (),
23099 false) >= 2)
23100 {
23101 if (cf == 0)
23102 {
23103 machine_mode cmp_mode = GET_MODE (op0);
23104 enum rtx_code new_code;
23105
23106 if (SCALAR_FLOAT_MODE_P (cmp_mode))
23107 {
23108 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
23109
23110 /* We may be reversing an unordered compare to a normal compare,
23111 which is not valid in general (we may convert a non-trapping
23112 condition to a trapping one); however, on i386 we currently
23113 emit all comparisons unordered. */
23114 new_code = reverse_condition_maybe_unordered (code);
23115 }
23116 else
23117 {
23118 new_code = ix86_reverse_condition (code, cmp_mode);
23119 if (compare_code != UNKNOWN && new_code != UNKNOWN)
23120 compare_code = reverse_condition (compare_code);
23121 }
23122
23123 if (new_code != UNKNOWN)
23124 {
23125 cf = ct;
23126 ct = 0;
23127 code = new_code;
23128 }
23129 }
23130
23131 if (compare_code != UNKNOWN)
23132 {
23133 /* notl op1 (if needed)
23134 sarl $31, op1
23135 andl (cf-ct), op1
23136 addl ct, op1
23137
23138 For x < 0 (resp. x <= -1) there will be no notl,
23139 so if possible swap the constants to get rid of the
23140 complement.
23141 True/false will be -1/0 while code below (store flag
23142 followed by decrement) is 0/-1, so the constants need
23143 to be exchanged once more. */
23144
23145 if (compare_code == GE || !cf)
23146 {
23147 code = reverse_condition (code);
23148 compare_code = LT;
23149 }
23150 else
23151 std::swap (ct, cf);
23152
23153 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
23154 }
23155 else
23156 {
23157 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
23158
23159 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
23160 constm1_rtx,
23161 copy_rtx (out), 1, OPTAB_DIRECT);
23162 }
23163
23164 out = expand_simple_binop (mode, AND, copy_rtx (out),
23165 gen_int_mode (cf - ct, mode),
23166 copy_rtx (out), 1, OPTAB_DIRECT);
23167 if (ct)
23168 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
23169 copy_rtx (out), 1, OPTAB_DIRECT);
23170 if (!rtx_equal_p (out, operands[0]))
23171 emit_move_insn (operands[0], copy_rtx (out));
23172
23173 return true;
23174 }
23175 }
23176
23177 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
23178 {
23179 /* Try a few more things with specific constants and a variable. */
23180
23181 optab op;
23182 rtx var, orig_out, out, tmp;
23183
23184 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
23185 return false;
23186
23187 /* If one of the two operands is an interesting constant, load a
23188 constant with the above and mask it in with a logical operation. */
23189
23190 if (CONST_INT_P (operands[2]))
23191 {
23192 var = operands[3];
23193 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
23194 operands[3] = constm1_rtx, op = and_optab;
23195 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
23196 operands[3] = const0_rtx, op = ior_optab;
23197 else
23198 return false;
23199 }
23200 else if (CONST_INT_P (operands[3]))
23201 {
23202 var = operands[2];
23203 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
23204 operands[2] = constm1_rtx, op = and_optab;
23205 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
23206 operands[2] = const0_rtx, op = ior_optab;
23207 else
23208 return false;
23209 }
23210 else
23211 return false;
23212
23213 orig_out = operands[0];
23214 tmp = gen_reg_rtx (mode);
23215 operands[0] = tmp;
23216
23217 /* Recurse to get the constant loaded. */
23218 if (!ix86_expand_int_movcc (operands))
23219 return false;
23220
23221 /* Mask in the interesting variable. */
23222 out = expand_binop (mode, op, var, tmp, orig_out, 0,
23223 OPTAB_WIDEN);
23224 if (!rtx_equal_p (out, orig_out))
23225 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
23226
23227 return true;
23228 }
23229
23230 /*
23231 * For comparison with above,
23232 *
23233 * movl cf,dest
23234 * movl ct,tmp
23235 * cmpl op1,op2
23236 * cmovcc tmp,dest
23237 *
23238 * Size 15.
23239 */
23240
23241 if (! nonimmediate_operand (operands[2], mode))
23242 operands[2] = force_reg (mode, operands[2]);
23243 if (! nonimmediate_operand (operands[3], mode))
23244 operands[3] = force_reg (mode, operands[3]);
23245
23246 if (! register_operand (operands[2], VOIDmode)
23247 && (mode == QImode
23248 || ! register_operand (operands[3], VOIDmode)))
23249 operands[2] = force_reg (mode, operands[2]);
23250
23251 if (mode == QImode
23252 && ! register_operand (operands[3], VOIDmode))
23253 operands[3] = force_reg (mode, operands[3]);
23254
23255 emit_insn (compare_seq);
23256 emit_insn (gen_rtx_SET (operands[0],
23257 gen_rtx_IF_THEN_ELSE (mode,
23258 compare_op, operands[2],
23259 operands[3])));
23260 return true;
23261 }
23262
23263 /* Swap, force into registers, or otherwise massage the two operands
23264 to an sse comparison with a mask result. Thus we differ a bit from
23265 ix86_prepare_fp_compare_args which expects to produce a flags result.
23266
23267 The DEST operand exists to help determine whether to commute commutative
23268 operators. The POP0/POP1 operands are updated in place. The new
23269 comparison code is returned, or UNKNOWN if not implementable. */
23270
23271 static enum rtx_code
23272 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
23273 rtx *pop0, rtx *pop1)
23274 {
23275 switch (code)
23276 {
23277 case LTGT:
23278 case UNEQ:
23279 /* AVX supports all the needed comparisons. */
23280 if (TARGET_AVX)
23281 break;
23282 /* We have no LTGT as an operator. We could implement it with
23283 NE & ORDERED, but this requires an extra temporary. It's
23284 not clear that it's worth it. */
23285 return UNKNOWN;
23286
23287 case LT:
23288 case LE:
23289 case UNGT:
23290 case UNGE:
23291 /* These are supported directly. */
23292 break;
23293
23294 case EQ:
23295 case NE:
23296 case UNORDERED:
23297 case ORDERED:
23298 /* AVX has 3 operand comparisons, no need to swap anything. */
23299 if (TARGET_AVX)
23300 break;
23301 /* For commutative operators, try to canonicalize the destination
23302 operand to be first in the comparison - this helps reload to
23303 avoid extra moves. */
23304 if (!dest || !rtx_equal_p (dest, *pop1))
23305 break;
23306 /* FALLTHRU */
23307
23308 case GE:
23309 case GT:
23310 case UNLE:
23311 case UNLT:
23312 /* These are not supported directly before AVX, and furthermore
23313 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
23314 comparison operands to transform into something that is
23315 supported. */
23316 std::swap (*pop0, *pop1);
23317 code = swap_condition (code);
23318 break;
23319
23320 default:
23321 gcc_unreachable ();
23322 }
23323
23324 return code;
23325 }
23326
23327 /* Detect conditional moves that exactly match min/max operational
23328 semantics. Note that this is IEEE safe, as long as we don't
23329 interchange the operands.
23330
23331 Returns FALSE if this conditional move doesn't match a MIN/MAX,
23332 and TRUE if the operation is successful and instructions are emitted. */
23333
23334 static bool
23335 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
23336 rtx cmp_op1, rtx if_true, rtx if_false)
23337 {
23338 machine_mode mode;
23339 bool is_min;
23340 rtx tmp;
23341
23342 if (code == LT)
23343 ;
23344 else if (code == UNGE)
23345 std::swap (if_true, if_false);
23346 else
23347 return false;
23348
23349 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
23350 is_min = true;
23351 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
23352 is_min = false;
23353 else
23354 return false;
23355
23356 mode = GET_MODE (dest);
23357
23358 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
23359 but MODE may be a vector mode and thus not appropriate. */
23360 if (!flag_finite_math_only || flag_signed_zeros)
23361 {
23362 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
23363 rtvec v;
23364
23365 if_true = force_reg (mode, if_true);
23366 v = gen_rtvec (2, if_true, if_false);
23367 tmp = gen_rtx_UNSPEC (mode, v, u);
23368 }
23369 else
23370 {
23371 code = is_min ? SMIN : SMAX;
23372 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
23373 }
23374
23375 emit_insn (gen_rtx_SET (dest, tmp));
23376 return true;
23377 }
23378
23379 /* Expand an sse vector comparison. Return the register with the result. */
23380
23381 static rtx
23382 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
23383 rtx op_true, rtx op_false)
23384 {
23385 machine_mode mode = GET_MODE (dest);
23386 machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
23387
23388 /* In the general case the result of a comparison can differ from the operands' type. */
23389 machine_mode cmp_mode;
23390
23391 /* In AVX512F the result of comparison is an integer mask. */
23392 bool maskcmp = false;
23393 rtx x;
23394
23395 if (GET_MODE_SIZE (cmp_ops_mode) == 64)
23396 {
23397 unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
23398 cmp_mode = int_mode_for_size (nbits, 0).require ();
23399 maskcmp = true;
23400 }
23401 else
23402 cmp_mode = cmp_ops_mode;
23403
23404
23405 cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
23406 if (!nonimmediate_operand (cmp_op1, cmp_ops_mode))
23407 cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
23408
23409 if (optimize
23410 || (maskcmp && cmp_mode != mode)
23411 || (op_true && reg_overlap_mentioned_p (dest, op_true))
23412 || (op_false && reg_overlap_mentioned_p (dest, op_false)))
23413 dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
23414
23415 /* Compare patterns for int modes are unspec in AVX512F only. */
23416 if (maskcmp && (code == GT || code == EQ))
23417 {
23418 rtx (*gen)(rtx, rtx, rtx);
23419
23420 switch (cmp_ops_mode)
23421 {
23422 case E_V64QImode:
23423 gcc_assert (TARGET_AVX512BW);
23424 gen = code == GT ? gen_avx512bw_gtv64qi3 : gen_avx512bw_eqv64qi3_1;
23425 break;
23426 case E_V32HImode:
23427 gcc_assert (TARGET_AVX512BW);
23428 gen = code == GT ? gen_avx512bw_gtv32hi3 : gen_avx512bw_eqv32hi3_1;
23429 break;
23430 case E_V16SImode:
23431 gen = code == GT ? gen_avx512f_gtv16si3 : gen_avx512f_eqv16si3_1;
23432 break;
23433 case E_V8DImode:
23434 gen = code == GT ? gen_avx512f_gtv8di3 : gen_avx512f_eqv8di3_1;
23435 break;
23436 default:
23437 gen = NULL;
23438 }
23439
23440 if (gen)
23441 {
23442 emit_insn (gen (dest, cmp_op0, cmp_op1));
23443 return dest;
23444 }
23445 }
23446 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
23447
23448 if (cmp_mode != mode && !maskcmp)
23449 {
23450 x = force_reg (cmp_ops_mode, x);
23451 convert_move (dest, x, false);
23452 }
23453 else
23454 emit_insn (gen_rtx_SET (dest, x));
23455
23456 return dest;
23457 }
23458
23459 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
23460 operations. This is used for both scalar and vector conditional moves. */
23461
23462 void
23463 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
23464 {
23465 machine_mode mode = GET_MODE (dest);
23466 machine_mode cmpmode = GET_MODE (cmp);
23467
23468 /* In AVX512F the result of comparison is an integer mask. */
23469 bool maskcmp = (mode != cmpmode && TARGET_AVX512F);
23470
23471 rtx t2, t3, x;
23472
23473 /* If we have an integer mask and an FP value then we need
23474 to cast the mask to the FP mode. */
23475 if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
23476 {
23477 cmp = force_reg (cmpmode, cmp);
23478 cmp = gen_rtx_SUBREG (mode, cmp, 0);
23479 }
23480
23481 if (vector_all_ones_operand (op_true, mode)
23482 && rtx_equal_p (op_false, CONST0_RTX (mode))
23483 && !maskcmp)
23484 {
23485 emit_insn (gen_rtx_SET (dest, cmp));
23486 }
23487 else if (op_false == CONST0_RTX (mode)
23488 && !maskcmp)
23489 {
23490 op_true = force_reg (mode, op_true);
23491 x = gen_rtx_AND (mode, cmp, op_true);
23492 emit_insn (gen_rtx_SET (dest, x));
23493 }
23494 else if (op_true == CONST0_RTX (mode)
23495 && !maskcmp)
23496 {
23497 op_false = force_reg (mode, op_false);
23498 x = gen_rtx_NOT (mode, cmp);
23499 x = gen_rtx_AND (mode, x, op_false);
23500 emit_insn (gen_rtx_SET (dest, x));
23501 }
23502 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode)
23503 && !maskcmp)
23504 {
23505 op_false = force_reg (mode, op_false);
23506 x = gen_rtx_IOR (mode, cmp, op_false);
23507 emit_insn (gen_rtx_SET (dest, x));
23508 }
23509 else if (TARGET_XOP
23510 && !maskcmp)
23511 {
23512 op_true = force_reg (mode, op_true);
23513
23514 if (!nonimmediate_operand (op_false, mode))
23515 op_false = force_reg (mode, op_false);
23516
23517 emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
23518 op_true,
23519 op_false)));
23520 }
23521 else
23522 {
23523 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
23524 rtx d = dest;
23525
23526 if (!nonimmediate_operand (op_true, mode))
23527 op_true = force_reg (mode, op_true);
23528
23529 op_false = force_reg (mode, op_false);
23530
23531 switch (mode)
23532 {
23533 case E_V4SFmode:
23534 if (TARGET_SSE4_1)
23535 gen = gen_sse4_1_blendvps;
23536 break;
23537 case E_V2DFmode:
23538 if (TARGET_SSE4_1)
23539 gen = gen_sse4_1_blendvpd;
23540 break;
23541 case E_V16QImode:
23542 case E_V8HImode:
23543 case E_V4SImode:
23544 case E_V2DImode:
23545 if (TARGET_SSE4_1)
23546 {
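/* pblendvb operates on byte elements, so view the operands and the
   condition as V16QImode.  */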
23547 gen = gen_sse4_1_pblendvb;
23548 if (mode != V16QImode)
23549 d = gen_reg_rtx (V16QImode);
23550 op_false = gen_lowpart (V16QImode, op_false);
23551 op_true = gen_lowpart (V16QImode, op_true);
23552 cmp = gen_lowpart (V16QImode, cmp);
23553 }
23554 break;
23555 case E_V8SFmode:
23556 if (TARGET_AVX)
23557 gen = gen_avx_blendvps256;
23558 break;
23559 case E_V4DFmode:
23560 if (TARGET_AVX)
23561 gen = gen_avx_blendvpd256;
23562 break;
23563 case E_V32QImode:
23564 case E_V16HImode:
23565 case E_V8SImode:
23566 case E_V4DImode:
23567 if (TARGET_AVX2)
23568 {
23569 gen = gen_avx2_pblendvb;
23570 if (mode != V32QImode)
23571 d = gen_reg_rtx (V32QImode);
23572 op_false = gen_lowpart (V32QImode, op_false);
23573 op_true = gen_lowpart (V32QImode, op_true);
23574 cmp = gen_lowpart (V32QImode, cmp);
23575 }
23576 break;
23577
23578 case E_V64QImode:
23579 gen = gen_avx512bw_blendmv64qi;
23580 break;
23581 case E_V32HImode:
23582 gen = gen_avx512bw_blendmv32hi;
23583 break;
23584 case E_V16SImode:
23585 gen = gen_avx512f_blendmv16si;
23586 break;
23587 case E_V8DImode:
23588 gen = gen_avx512f_blendmv8di;
23589 break;
23590 case E_V8DFmode:
23591 gen = gen_avx512f_blendmv8df;
23592 break;
23593 case E_V16SFmode:
23594 gen = gen_avx512f_blendmv16sf;
23595 break;
23596
23597 default:
23598 break;
23599 }
23600
23601 if (gen != NULL)
23602 {
23603 emit_insn (gen (d, op_false, op_true, cmp));
23604 if (d != dest)
23605 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
23606 }
23607 else
23608 {
23609 op_true = force_reg (mode, op_true);
23610
23611 t2 = gen_reg_rtx (mode);
23612 if (optimize)
23613 t3 = gen_reg_rtx (mode);
23614 else
23615 t3 = dest;
23616
23617 x = gen_rtx_AND (mode, op_true, cmp);
23618 emit_insn (gen_rtx_SET (t2, x));
23619
23620 x = gen_rtx_NOT (mode, cmp);
23621 x = gen_rtx_AND (mode, x, op_false);
23622 emit_insn (gen_rtx_SET (t3, x));
23623
23624 x = gen_rtx_IOR (mode, t3, t2);
23625 emit_insn (gen_rtx_SET (dest, x));
23626 }
23627 }
23628 }
23629
23630 /* Expand a floating-point conditional move. Return true if successful. */
23631
23632 bool
23633 ix86_expand_fp_movcc (rtx operands[])
23634 {
23635 machine_mode mode = GET_MODE (operands[0]);
23636 enum rtx_code code = GET_CODE (operands[1]);
23637 rtx tmp, compare_op;
23638 rtx op0 = XEXP (operands[1], 0);
23639 rtx op1 = XEXP (operands[1], 1);
23640
23641 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
23642 {
23643 machine_mode cmode;
23644
23645 /* Since we have no cmove for SSE registers, don't force bad register
23646 allocation just to gain access to it. Deny movcc when the
23647 comparison mode doesn't match the move mode. */
23648 cmode = GET_MODE (op0);
23649 if (cmode == VOIDmode)
23650 cmode = GET_MODE (op1);
23651 if (cmode != mode)
23652 return false;
23653
23654 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
23655 if (code == UNKNOWN)
23656 return false;
23657
23658 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
23659 operands[2], operands[3]))
23660 return true;
23661
23662 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
23663 operands[2], operands[3]);
23664 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
23665 return true;
23666 }
23667
23668 if (GET_MODE (op0) == TImode
23669 || (GET_MODE (op0) == DImode
23670 && !TARGET_64BIT))
23671 return false;
23672
23673 /* The floating point conditional move instructions don't directly
23674 support conditions resulting from a signed integer comparison. */
23675
23676 compare_op = ix86_expand_compare (code, op0, op1);
23677 if (!fcmov_comparison_operator (compare_op, VOIDmode))
23678 {
23679 tmp = gen_reg_rtx (QImode);
23680 ix86_expand_setcc (tmp, code, op0, op1);
23681
23682 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
23683 }
23684
23685 emit_insn (gen_rtx_SET (operands[0],
23686 gen_rtx_IF_THEN_ELSE (mode, compare_op,
23687 operands[2], operands[3])));
23688
23689 return true;
23690 }
23691
23692 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
23693
23694 static int
23695 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
23696 {
23697 switch (code)
23698 {
23699 case EQ:
23700 return 0;
23701 case LT:
23702 case LTU:
23703 return 1;
23704 case LE:
23705 case LEU:
23706 return 2;
23707 case NE:
23708 return 4;
23709 case GE:
23710 case GEU:
23711 return 5;
23712 case GT:
23713 case GTU:
23714 return 6;
23715 default:
23716 gcc_unreachable ();
23717 }
23718 }
23719
23720 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
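/* The values returned below follow the AVX/AVX-512 vcmpps/vcmppd predicate
   encoding; for example 0x00 is EQ_OQ, 0x01 is LT_OS and 0x03 is UNORD_Q.  */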
23721
23722 static int
23723 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
23724 {
23725 switch (code)
23726 {
23727 case EQ:
23728 return 0x00;
23729 case NE:
23730 return 0x04;
23731 case GT:
23732 return 0x0e;
23733 case LE:
23734 return 0x02;
23735 case GE:
23736 return 0x0d;
23737 case LT:
23738 return 0x01;
23739 case UNLE:
23740 return 0x0a;
23741 case UNLT:
23742 return 0x09;
23743 case UNGE:
23744 return 0x05;
23745 case UNGT:
23746 return 0x06;
23747 case UNEQ:
23748 return 0x18;
23749 case LTGT:
23750 return 0x0c;
23751 case ORDERED:
23752 return 0x07;
23753 case UNORDERED:
23754 return 0x03;
23755 default:
23756 gcc_unreachable ();
23757 }
23758 }
23759
23760 /* Return immediate value to be used in UNSPEC_PCMP
23761 for comparison CODE in MODE. */
23762
23763 static int
23764 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
23765 {
23766 if (FLOAT_MODE_P (mode))
23767 return ix86_fp_cmp_code_to_pcmp_immediate (code);
23768 return ix86_int_cmp_code_to_pcmp_immediate (code);
23769 }
23770
23771 /* Expand AVX-512 vector comparison. */
23772
23773 bool
23774 ix86_expand_mask_vec_cmp (rtx operands[])
23775 {
23776 machine_mode mask_mode = GET_MODE (operands[0]);
23777 machine_mode cmp_mode = GET_MODE (operands[2]);
23778 enum rtx_code code = GET_CODE (operands[1]);
23779 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
23780 int unspec_code;
23781 rtx unspec;
23782
23783 switch (code)
23784 {
23785 case LEU:
23786 case GTU:
23787 case GEU:
23788 case LTU:
23789 unspec_code = UNSPEC_UNSIGNED_PCMP;
23790 break;
23791
23792 default:
23793 unspec_code = UNSPEC_PCMP;
23794 }
23795
23796 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, operands[2],
23797 operands[3], imm),
23798 unspec_code);
23799 emit_insn (gen_rtx_SET (operands[0], unspec));
23800
23801 return true;
23802 }
23803
23804 /* Expand fp vector comparison. */
23805
23806 bool
23807 ix86_expand_fp_vec_cmp (rtx operands[])
23808 {
23809 enum rtx_code code = GET_CODE (operands[1]);
23810 rtx cmp;
23811
23812 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
23813 &operands[2], &operands[3]);
23814 if (code == UNKNOWN)
23815 {
23816 rtx temp;
23817 switch (GET_CODE (operands[1]))
23818 {
23819 case LTGT:
23820 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
23821 operands[3], NULL, NULL);
23822 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
23823 operands[3], NULL, NULL);
23824 code = AND;
23825 break;
23826 case UNEQ:
23827 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
23828 operands[3], NULL, NULL);
23829 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
23830 operands[3], NULL, NULL);
23831 code = IOR;
23832 break;
23833 default:
23834 gcc_unreachable ();
23835 }
23836 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
23837 OPTAB_DIRECT);
23838 }
23839 else
23840 cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
23841 operands[1], operands[2]);
23842
23843 if (operands[0] != cmp)
23844 emit_move_insn (operands[0], cmp);
23845
23846 return true;
23847 }
23848
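/* Emit a comparison of the integer vectors COP0 and COP1 with CODE, producing
   a vector (or AVX-512 mask) result suitable for DEST.  OP_TRUE and OP_FALSE
   are the values a subsequent conditional move would select between.  Set
   *NEGATE when the caller must invert the result because the hardware only
   supports the reversed condition.  */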
23849 static rtx
23850 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
23851 rtx op_true, rtx op_false, bool *negate)
23852 {
23853 machine_mode data_mode = GET_MODE (dest);
23854 machine_mode mode = GET_MODE (cop0);
23855 rtx x;
23856
23857 *negate = false;
23858
23859 /* XOP supports all of the comparisons on all 128-bit vector int types. */
23860 if (TARGET_XOP
23861 && (mode == V16QImode || mode == V8HImode
23862 || mode == V4SImode || mode == V2DImode))
23863 ;
23864 else
23865 {
23866 /* Canonicalize the comparison to EQ, GT, GTU. */
23867 switch (code)
23868 {
23869 case EQ:
23870 case GT:
23871 case GTU:
23872 break;
23873
23874 case NE:
23875 case LE:
23876 case LEU:
23877 code = reverse_condition (code);
23878 *negate = true;
23879 break;
23880
23881 case GE:
23882 case GEU:
23883 code = reverse_condition (code);
23884 *negate = true;
23885 /* FALLTHRU */
23886
23887 case LT:
23888 case LTU:
23889 std::swap (cop0, cop1);
23890 code = swap_condition (code);
23891 break;
23892
23893 default:
23894 gcc_unreachable ();
23895 }
23896
23897 /* Only SSE4.1/SSE4.2 supports V2DImode. */
23898 if (mode == V2DImode)
23899 {
23900 switch (code)
23901 {
23902 case EQ:
23903 /* SSE4.1 supports EQ. */
23904 if (!TARGET_SSE4_1)
23905 return NULL;
23906 break;
23907
23908 case GT:
23909 case GTU:
23910 /* SSE4.2 supports GT/GTU. */
23911 if (!TARGET_SSE4_2)
23912 return NULL;
23913 break;
23914
23915 default:
23916 gcc_unreachable ();
23917 }
23918 }
23919
23920 /* Unsigned parallel compare is not supported by the hardware.
23921 Play some tricks to turn this into a signed comparison
23922 against 0. */
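/* For example, with 32-bit elements x >u y can be computed as
   (x - 0x80000000) >s (y - 0x80000000): subtracting the sign-bit bias
   maps unsigned order onto signed order.  */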
23923 if (code == GTU)
23924 {
23925 cop0 = force_reg (mode, cop0);
23926
23927 switch (mode)
23928 {
23929 case E_V16SImode:
23930 case E_V8DImode:
23931 case E_V8SImode:
23932 case E_V4DImode:
23933 case E_V4SImode:
23934 case E_V2DImode:
23935 {
23936 rtx t1, t2, mask;
23937 rtx (*gen_sub3) (rtx, rtx, rtx);
23938
23939 switch (mode)
23940 {
23941 case E_V16SImode: gen_sub3 = gen_subv16si3; break;
23942 case E_V8DImode: gen_sub3 = gen_subv8di3; break;
23943 case E_V8SImode: gen_sub3 = gen_subv8si3; break;
23944 case E_V4DImode: gen_sub3 = gen_subv4di3; break;
23945 case E_V4SImode: gen_sub3 = gen_subv4si3; break;
23946 case E_V2DImode: gen_sub3 = gen_subv2di3; break;
23947 default:
23948 gcc_unreachable ();
23949 }
23950 /* Subtract (-(INT MAX) - 1) from both operands to make
23951 them signed. */
23952 mask = ix86_build_signbit_mask (mode, true, false);
23953 t1 = gen_reg_rtx (mode);
23954 emit_insn (gen_sub3 (t1, cop0, mask));
23955
23956 t2 = gen_reg_rtx (mode);
23957 emit_insn (gen_sub3 (t2, cop1, mask));
23958
23959 cop0 = t1;
23960 cop1 = t2;
23961 code = GT;
23962 }
23963 break;
23964
23965 case E_V64QImode:
23966 case E_V32HImode:
23967 case E_V32QImode:
23968 case E_V16HImode:
23969 case E_V16QImode:
23970 case E_V8HImode:
23971 /* Perform a parallel unsigned saturating subtraction. */
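/* Illustrative note: for unsigned elements, a > b exactly when the
   saturating difference (a -sat b) is nonzero, so the GTU test becomes
   an EQ-with-zero test whose result is then negated via *NEGATE.  */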
23972 x = gen_reg_rtx (mode);
23973 emit_insn (gen_rtx_SET (x, gen_rtx_US_MINUS (mode, cop0,
23974 cop1)));
23975
23976 cop0 = x;
23977 cop1 = CONST0_RTX (mode);
23978 code = EQ;
23979 *negate = !*negate;
23980 break;
23981
23982 default:
23983 gcc_unreachable ();
23984 }
23985 }
23986 }
23987
23988 if (*negate)
23989 std::swap (op_true, op_false);
23990
23991 /* Allow the comparison to be done in one mode, but the movcc to
23992 happen in another mode. */
23993 if (data_mode == mode)
23994 {
23995 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
23996 op_true, op_false);
23997 }
23998 else
23999 {
24000 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
24001 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
24002 op_true, op_false);
24003 if (GET_MODE (x) == mode)
24004 x = gen_lowpart (data_mode, x);
24005 }
24006
24007 return x;
24008 }
24009
24010 /* Expand integer vector comparison. */
24011
24012 bool
24013 ix86_expand_int_vec_cmp (rtx operands[])
24014 {
24015 rtx_code code = GET_CODE (operands[1]);
24016 bool negate = false;
24017 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
24018 operands[3], NULL, NULL, &negate);
24019
24020 if (!cmp)
24021 return false;
24022
24023 if (negate)
24024 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
24025 CONST0_RTX (GET_MODE (cmp)),
24026 NULL, NULL, &negate);
24027
24028 gcc_assert (!negate);
24029
24030 if (operands[0] != cmp)
24031 emit_move_insn (operands[0], cmp);
24032
24033 return true;
24034 }
24035
24036 /* Expand a floating-point vector conditional move; a vcond operation
24037 rather than a movcc operation. */
24038
24039 bool
24040 ix86_expand_fp_vcond (rtx operands[])
24041 {
24042 enum rtx_code code = GET_CODE (operands[3]);
24043 rtx cmp;
24044
24045 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
24046 &operands[4], &operands[5]);
24047 if (code == UNKNOWN)
24048 {
24049 rtx temp;
24050 switch (GET_CODE (operands[3]))
24051 {
24052 case LTGT:
24053 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
24054 operands[5], operands[0], operands[0]);
24055 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
24056 operands[5], operands[1], operands[2]);
24057 code = AND;
24058 break;
24059 case UNEQ:
24060 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
24061 operands[5], operands[0], operands[0]);
24062 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
24063 operands[5], operands[1], operands[2]);
24064 code = IOR;
24065 break;
24066 default:
24067 gcc_unreachable ();
24068 }
24069 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
24070 OPTAB_DIRECT);
24071 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24072 return true;
24073 }
24074
24075 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
24076 operands[5], operands[1], operands[2]))
24077 return true;
24078
24079 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
24080 operands[1], operands[2]);
24081 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
24082 return true;
24083 }
24084
24085 /* Expand a signed/unsigned integral vector conditional move. */
24086
24087 bool
24088 ix86_expand_int_vcond (rtx operands[])
24089 {
24090 machine_mode data_mode = GET_MODE (operands[0]);
24091 machine_mode mode = GET_MODE (operands[4]);
24092 enum rtx_code code = GET_CODE (operands[3]);
24093 bool negate = false;
24094 rtx x, cop0, cop1;
24095
24096 cop0 = operands[4];
24097 cop1 = operands[5];
24098
24099 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
24100 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
24101 if ((code == LT || code == GE)
24102 && data_mode == mode
24103 && cop1 == CONST0_RTX (mode)
24104 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
24105 && GET_MODE_UNIT_SIZE (data_mode) > 1
24106 && GET_MODE_UNIT_SIZE (data_mode) <= 8
24107 && (GET_MODE_SIZE (data_mode) == 16
24108 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
24109 {
24110 rtx negop = operands[2 - (code == LT)];
24111 int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
24112 if (negop == CONST1_RTX (data_mode))
24113 {
24114 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
24115 operands[0], 1, OPTAB_DIRECT);
24116 if (res != operands[0])
24117 emit_move_insn (operands[0], res);
24118 return true;
24119 }
24120 else if (GET_MODE_INNER (data_mode) != DImode
24121 && vector_all_ones_operand (negop, data_mode))
24122 {
24123 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
24124 operands[0], 0, OPTAB_DIRECT);
24125 if (res != operands[0])
24126 emit_move_insn (operands[0], res);
24127 return true;
24128 }
24129 }
24130
24131 if (!nonimmediate_operand (cop1, mode))
24132 cop1 = force_reg (mode, cop1);
24133 if (!general_operand (operands[1], data_mode))
24134 operands[1] = force_reg (data_mode, operands[1]);
24135 if (!general_operand (operands[2], data_mode))
24136 operands[2] = force_reg (data_mode, operands[2]);
24137
24138 x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
24139 operands[1], operands[2], &negate);
24140
24141 if (!x)
24142 return false;
24143
24144 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
24145 operands[2-negate]);
24146 return true;
24147 }
24148
24149 /* AVX512F does support 64-byte integer vector operations,
24150 thus the longest vector we are faced with is V64QImode. */
24151 #define MAX_VECT_LEN 64
24152
24153 struct expand_vec_perm_d
24154 {
24155 rtx target, op0, op1;
24156 unsigned char perm[MAX_VECT_LEN];
24157 machine_mode vmode;
24158 unsigned char nelt;
24159 bool one_operand_p;
24160 bool testing_p;
24161 };
24162
24163 static bool
24164 ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
24165 struct expand_vec_perm_d *d)
24166 {
24167 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24168 expander, so args are either in d, or in op0, op1 etc. */
24169 machine_mode mode = GET_MODE (d ? d->op0 : op0);
24170 machine_mode maskmode = mode;
24171 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
24172
24173 switch (mode)
24174 {
24175 case E_V8HImode:
24176 if (TARGET_AVX512VL && TARGET_AVX512BW)
24177 gen = gen_avx512vl_vpermt2varv8hi3;
24178 break;
24179 case E_V16HImode:
24180 if (TARGET_AVX512VL && TARGET_AVX512BW)
24181 gen = gen_avx512vl_vpermt2varv16hi3;
24182 break;
24183 case E_V64QImode:
24184 if (TARGET_AVX512VBMI)
24185 gen = gen_avx512bw_vpermt2varv64qi3;
24186 break;
24187 case E_V32HImode:
24188 if (TARGET_AVX512BW)
24189 gen = gen_avx512bw_vpermt2varv32hi3;
24190 break;
24191 case E_V4SImode:
24192 if (TARGET_AVX512VL)
24193 gen = gen_avx512vl_vpermt2varv4si3;
24194 break;
24195 case E_V8SImode:
24196 if (TARGET_AVX512VL)
24197 gen = gen_avx512vl_vpermt2varv8si3;
24198 break;
24199 case E_V16SImode:
24200 if (TARGET_AVX512F)
24201 gen = gen_avx512f_vpermt2varv16si3;
24202 break;
24203 case E_V4SFmode:
24204 if (TARGET_AVX512VL)
24205 {
24206 gen = gen_avx512vl_vpermt2varv4sf3;
24207 maskmode = V4SImode;
24208 }
24209 break;
24210 case E_V8SFmode:
24211 if (TARGET_AVX512VL)
24212 {
24213 gen = gen_avx512vl_vpermt2varv8sf3;
24214 maskmode = V8SImode;
24215 }
24216 break;
24217 case E_V16SFmode:
24218 if (TARGET_AVX512F)
24219 {
24220 gen = gen_avx512f_vpermt2varv16sf3;
24221 maskmode = V16SImode;
24222 }
24223 break;
24224 case E_V2DImode:
24225 if (TARGET_AVX512VL)
24226 gen = gen_avx512vl_vpermt2varv2di3;
24227 break;
24228 case E_V4DImode:
24229 if (TARGET_AVX512VL)
24230 gen = gen_avx512vl_vpermt2varv4di3;
24231 break;
24232 case E_V8DImode:
24233 if (TARGET_AVX512F)
24234 gen = gen_avx512f_vpermt2varv8di3;
24235 break;
24236 case E_V2DFmode:
24237 if (TARGET_AVX512VL)
24238 {
24239 gen = gen_avx512vl_vpermt2varv2df3;
24240 maskmode = V2DImode;
24241 }
24242 break;
24243 case E_V4DFmode:
24244 if (TARGET_AVX512VL)
24245 {
24246 gen = gen_avx512vl_vpermt2varv4df3;
24247 maskmode = V4DImode;
24248 }
24249 break;
24250 case E_V8DFmode:
24251 if (TARGET_AVX512F)
24252 {
24253 gen = gen_avx512f_vpermt2varv8df3;
24254 maskmode = V8DImode;
24255 }
24256 break;
24257 default:
24258 break;
24259 }
24260
24261 if (gen == NULL)
24262 return false;
24263
24264 /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
24265 expander, so args are either in d, or in op0, op1 etc. */
24266 if (d)
24267 {
24268 rtx vec[64];
24269 target = d->target;
24270 op0 = d->op0;
24271 op1 = d->op1;
24272 for (int i = 0; i < d->nelt; ++i)
24273 vec[i] = GEN_INT (d->perm[i]);
24274 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
24275 }
24276
24277 emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
24278 return true;
24279 }
24280
24281 /* Expand a variable vector permutation. */
24282
24283 void
24284 ix86_expand_vec_perm (rtx operands[])
24285 {
24286 rtx target = operands[0];
24287 rtx op0 = operands[1];
24288 rtx op1 = operands[2];
24289 rtx mask = operands[3];
24290 rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
24291 machine_mode mode = GET_MODE (op0);
24292 machine_mode maskmode = GET_MODE (mask);
24293 int w, e, i;
24294 bool one_operand_shuffle = rtx_equal_p (op0, op1);
24295
24296 /* Number of elements in the vector. */
24297 w = GET_MODE_NUNITS (mode);
24298 e = GET_MODE_UNIT_SIZE (mode);
24299 gcc_assert (w <= 64);
24300
24301 if (TARGET_AVX512F && one_operand_shuffle)
24302 {
24303 rtx (*gen) (rtx, rtx, rtx) = NULL;
24304 switch (mode)
24305 {
24306 case E_V16SImode:
24307 gen = gen_avx512f_permvarv16si;
24308 break;
24309 case E_V16SFmode:
24310 gen = gen_avx512f_permvarv16sf;
24311 break;
24312 case E_V8DImode:
24313 gen = gen_avx512f_permvarv8di;
24314 break;
24315 case E_V8DFmode:
24316 gen = gen_avx512f_permvarv8df;
24317 break;
24318 default:
24319 break;
24320 }
24321 if (gen != NULL)
24322 {
24323 emit_insn (gen (target, op0, mask));
24324 return;
24325 }
24326 }
24327
24328 if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
24329 return;
24330
24331 if (TARGET_AVX2)
24332 {
24333 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
24334 {
24335 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
24336 a constant shuffle operand. With a tiny bit of effort we can
24337 use VPERMD instead. A re-interpretation stall for V4DFmode is
24338 unfortunate but there's no avoiding it.
24339 Similarly for V16HImode we don't have instructions for variable
24340 shuffling, while for V32QImode we can, after preparing suitable
24341 masks, use vpshufb; vpshufb; vpermq; vpor. */
24342
24343 if (mode == V16HImode)
24344 {
24345 maskmode = mode = V32QImode;
24346 w = 32;
24347 e = 1;
24348 }
24349 else
24350 {
24351 maskmode = mode = V8SImode;
24352 w = 8;
24353 e = 4;
24354 }
24355 t1 = gen_reg_rtx (maskmode);
24356
24357 /* Replicate the low bits of the V4DImode mask into V8SImode:
24358 mask = { A B C D }
24359 t1 = { A A B B C C D D }. */
24360 for (i = 0; i < w / 2; ++i)
24361 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
24362 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24363 vt = force_reg (maskmode, vt);
24364 mask = gen_lowpart (maskmode, mask);
24365 if (maskmode == V8SImode)
24366 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
24367 else
24368 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
24369
24370 /* Multiply the shuffle indices by two. */
24371 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
24372 OPTAB_DIRECT);
24373
24374 /* Add one to the odd shuffle indices:
24375 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
24376 for (i = 0; i < w / 2; ++i)
24377 {
24378 vec[i * 2] = const0_rtx;
24379 vec[i * 2 + 1] = const1_rtx;
24380 }
24381 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
24382 vt = validize_mem (force_const_mem (maskmode, vt));
24383 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
24384 OPTAB_DIRECT);
24385
24386 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
24387 operands[3] = mask = t1;
24388 target = gen_reg_rtx (mode);
24389 op0 = gen_lowpart (mode, op0);
24390 op1 = gen_lowpart (mode, op1);
24391 }
24392
24393 switch (mode)
24394 {
24395 case E_V8SImode:
24396 /* The VPERMD and VPERMPS instructions already properly ignore
24397 the high bits of the shuffle elements. No need for us to
24398 perform an AND ourselves. */
24399 if (one_operand_shuffle)
24400 {
24401 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
24402 if (target != operands[0])
24403 emit_move_insn (operands[0],
24404 gen_lowpart (GET_MODE (operands[0]), target));
24405 }
24406 else
24407 {
24408 t1 = gen_reg_rtx (V8SImode);
24409 t2 = gen_reg_rtx (V8SImode);
24410 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
24411 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
24412 goto merge_two;
24413 }
24414 return;
24415
24416 case E_V8SFmode:
24417 mask = gen_lowpart (V8SImode, mask);
24418 if (one_operand_shuffle)
24419 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
24420 else
24421 {
24422 t1 = gen_reg_rtx (V8SFmode);
24423 t2 = gen_reg_rtx (V8SFmode);
24424 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
24425 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
24426 goto merge_two;
24427 }
24428 return;
24429
24430 case E_V4SImode:
24431 /* By combining the two 128-bit input vectors into one 256-bit
24432 input vector, we can use VPERMD and VPERMPS for the full
24433 two-operand shuffle. */
24434 t1 = gen_reg_rtx (V8SImode);
24435 t2 = gen_reg_rtx (V8SImode);
24436 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
24437 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24438 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
24439 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
24440 return;
24441
24442 case E_V4SFmode:
24443 t1 = gen_reg_rtx (V8SFmode);
24444 t2 = gen_reg_rtx (V8SImode);
24445 mask = gen_lowpart (V4SImode, mask);
24446 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
24447 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
24448 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
24449 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
24450 return;
24451
24452 case E_V32QImode:
24453 t1 = gen_reg_rtx (V32QImode);
24454 t2 = gen_reg_rtx (V32QImode);
24455 t3 = gen_reg_rtx (V32QImode);
24456 vt2 = GEN_INT (-128);
24457 vt = gen_const_vec_duplicate (V32QImode, vt2);
24458 vt = force_reg (V32QImode, vt);
24459 for (i = 0; i < 32; i++)
24460 vec[i] = i < 16 ? vt2 : const0_rtx;
24461 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
24462 vt2 = force_reg (V32QImode, vt2);
24463 /* From mask create two adjusted masks, which contain the same
24464 bits as mask in the low 7 bits of each vector element.
24465 The first mask will have the most significant bit clear
24466 if it requests element from the same 128-bit lane
24467 and MSB set if it requests element from the other 128-bit lane.
24468 The second mask will have the opposite values of the MSB,
24469 and additionally will have its 128-bit lanes swapped.
24470 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
24471 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
24472 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
24473 stands for the other 12 bytes. */
24474 /* The bit that tells whether an element is from the same lane or the
24475 other lane is bit 4, so shift it up by 3 to the MSB position. */
24476 t5 = gen_reg_rtx (V4DImode);
24477 emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
24478 GEN_INT (3)));
24479 /* Clear MSB bits from the mask just in case it had them set. */
24480 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
24481 /* After this t1 will have MSB set for elements from other lane. */
24482 emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
24483 /* Clear bits other than MSB. */
24484 emit_insn (gen_andv32qi3 (t1, t1, vt));
24485 /* Or in the lower bits from mask into t3. */
24486 emit_insn (gen_iorv32qi3 (t3, t1, t2));
24487 /* And invert MSB bits in t1, so MSB is set for elements from the same
24488 lane. */
24489 emit_insn (gen_xorv32qi3 (t1, t1, vt));
24490 /* Swap 128-bit lanes in t3. */
24491 t6 = gen_reg_rtx (V4DImode);
24492 emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
24493 const2_rtx, GEN_INT (3),
24494 const0_rtx, const1_rtx));
24495 /* And or in the lower bits from mask into t1. */
24496 emit_insn (gen_iorv32qi3 (t1, t1, t2));
24497 if (one_operand_shuffle)
24498 {
24499 /* Each of these shuffles will put 0s in places where an
24500 element from the other 128-bit lane is needed; otherwise
24501 it will shuffle in the requested value. */
24502 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
24503 gen_lowpart (V32QImode, t6)));
24504 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
24505 /* For t3 the 128-bit lanes are swapped again. */
24506 t7 = gen_reg_rtx (V4DImode);
24507 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
24508 const2_rtx, GEN_INT (3),
24509 const0_rtx, const1_rtx));
24510 /* And oring both together leads to the result. */
24511 emit_insn (gen_iorv32qi3 (target, t1,
24512 gen_lowpart (V32QImode, t7)));
24513 if (target != operands[0])
24514 emit_move_insn (operands[0],
24515 gen_lowpart (GET_MODE (operands[0]), target));
24516 return;
24517 }
24518
24519 t4 = gen_reg_rtx (V32QImode);
24520 /* Similar to the one_operand_shuffle code above, just
24521 repeated once for each of the two operands. The merge_two:
24522 code will merge the two results together. */
24523 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
24524 gen_lowpart (V32QImode, t6)));
24525 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
24526 gen_lowpart (V32QImode, t6)));
24527 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
24528 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
24529 t7 = gen_reg_rtx (V4DImode);
24530 emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
24531 const2_rtx, GEN_INT (3),
24532 const0_rtx, const1_rtx));
24533 t8 = gen_reg_rtx (V4DImode);
24534 emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
24535 const2_rtx, GEN_INT (3),
24536 const0_rtx, const1_rtx));
24537 emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
24538 emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
24539 t1 = t4;
24540 t2 = t3;
24541 goto merge_two;
24542
24543 default:
24544 gcc_assert (GET_MODE_SIZE (mode) <= 16);
24545 break;
24546 }
24547 }
24548
24549 if (TARGET_XOP)
24550 {
24551 /* The XOP VPPERM insn supports three inputs. By ignoring the
24552 one_operand_shuffle special case, we avoid creating another
24553 set of constant vectors in memory. */
24554 one_operand_shuffle = false;
24555
24556 /* mask = mask & {2*w-1, ...} */
24557 vt = GEN_INT (2*w - 1);
24558 }
24559 else
24560 {
24561 /* mask = mask & {w-1, ...} */
24562 vt = GEN_INT (w - 1);
24563 }
24564
24565 vt = gen_const_vec_duplicate (maskmode, vt);
24566 mask = expand_simple_binop (maskmode, AND, mask, vt,
24567 NULL_RTX, 0, OPTAB_DIRECT);
24568
24569 /* For non-QImode operations, convert the word permutation control
24570 into a byte permutation control. */
24571 if (mode != V16QImode)
24572 {
24573 mask = expand_simple_binop (maskmode, ASHIFT, mask,
24574 GEN_INT (exact_log2 (e)),
24575 NULL_RTX, 0, OPTAB_DIRECT);
24576
24577 /* Convert mask to vector of chars. */
24578 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
24579
24580 /* Replicate each of the input bytes into byte positions:
24581 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
24582 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
24583 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
24584 for (i = 0; i < 16; ++i)
24585 vec[i] = GEN_INT (i/e * e);
24586 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24587 vt = validize_mem (force_const_mem (V16QImode, vt));
24588 if (TARGET_XOP)
24589 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
24590 else
24591 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
24592
24593 /* Convert it into the byte positions by doing
24594 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
24595 for (i = 0; i < 16; ++i)
24596 vec[i] = GEN_INT (i % e);
24597 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
24598 vt = validize_mem (force_const_mem (V16QImode, vt));
24599 emit_insn (gen_addv16qi3 (mask, mask, vt));
24600 }
24601
24602 /* The actual shuffle operations all operate on V16QImode. */
24603 op0 = gen_lowpart (V16QImode, op0);
24604 op1 = gen_lowpart (V16QImode, op1);
24605
24606 if (TARGET_XOP)
24607 {
24608 if (GET_MODE (target) != V16QImode)
24609 target = gen_reg_rtx (V16QImode);
24610 emit_insn (gen_xop_pperm (target, op0, op1, mask));
24611 if (target != operands[0])
24612 emit_move_insn (operands[0],
24613 gen_lowpart (GET_MODE (operands[0]), target));
24614 }
24615 else if (one_operand_shuffle)
24616 {
24617 if (GET_MODE (target) != V16QImode)
24618 target = gen_reg_rtx (V16QImode);
24619 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
24620 if (target != operands[0])
24621 emit_move_insn (operands[0],
24622 gen_lowpart (GET_MODE (operands[0]), target));
24623 }
24624 else
24625 {
24626 rtx xops[6];
24627 bool ok;
24628
24629 /* Shuffle the two input vectors independently. */
24630 t1 = gen_reg_rtx (V16QImode);
24631 t2 = gen_reg_rtx (V16QImode);
24632 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
24633 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
24634
24635 merge_two:
24636 /* Then merge them together. The key is whether any given control
24637 element contained a bit set that indicates the second word. */
24638 mask = operands[3];
24639 vt = GEN_INT (w);
24640 if (maskmode == V2DImode && !TARGET_SSE4_1)
24641 {
24642 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
24643 more shuffle to convert the V2DI input mask into a V4SI
24644 input mask. At that point the masking done by expand_int_vcond
24645 will work as desired. */
24646 rtx t3 = gen_reg_rtx (V4SImode);
24647 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
24648 const0_rtx, const0_rtx,
24649 const2_rtx, const2_rtx));
24650 mask = t3;
24651 maskmode = V4SImode;
24652 e = w = 4;
24653 }
24654
24655 vt = gen_const_vec_duplicate (maskmode, vt);
24656 vt = force_reg (maskmode, vt);
24657 mask = expand_simple_binop (maskmode, AND, mask, vt,
24658 NULL_RTX, 0, OPTAB_DIRECT);
24659
24660 if (GET_MODE (target) != mode)
24661 target = gen_reg_rtx (mode);
24662 xops[0] = target;
24663 xops[1] = gen_lowpart (mode, t2);
24664 xops[2] = gen_lowpart (mode, t1);
24665 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
24666 xops[4] = mask;
24667 xops[5] = vt;
24668 ok = ix86_expand_int_vcond (xops);
24669 gcc_assert (ok);
24670 if (target != operands[0])
24671 emit_move_insn (operands[0],
24672 gen_lowpart (GET_MODE (operands[0]), target));
24673 }
24674 }
24675
24676 /* Unpack SRC into DEST, widening to the next wider integer vector type.
24677 UNSIGNED_P is true if we should do zero extension, else sign extension.
24678 HIGH_P is true if we want the N/2 high elements, else the low elements. */
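/* For example, unpacking the low half of a V16QImode SRC produces a
   V8HImode DEST; with TARGET_SSE4_1 this uses pmovsxbw/pmovzxbw, and
   otherwise an interleave of SRC with a zero vector (unsigned) or with
   its sign mask (signed).  */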
24679
24680 void
24681 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
24682 {
24683 machine_mode imode = GET_MODE (src);
24684 rtx tmp;
24685
24686 if (TARGET_SSE4_1)
24687 {
24688 rtx (*unpack)(rtx, rtx);
24689 rtx (*extract)(rtx, rtx) = NULL;
24690 machine_mode halfmode = BLKmode;
24691
24692 switch (imode)
24693 {
24694 case E_V64QImode:
24695 if (unsigned_p)
24696 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
24697 else
24698 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
24699 halfmode = V32QImode;
24700 extract
24701 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
24702 break;
24703 case E_V32QImode:
24704 if (unsigned_p)
24705 unpack = gen_avx2_zero_extendv16qiv16hi2;
24706 else
24707 unpack = gen_avx2_sign_extendv16qiv16hi2;
24708 halfmode = V16QImode;
24709 extract
24710 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
24711 break;
24712 case E_V32HImode:
24713 if (unsigned_p)
24714 unpack = gen_avx512f_zero_extendv16hiv16si2;
24715 else
24716 unpack = gen_avx512f_sign_extendv16hiv16si2;
24717 halfmode = V16HImode;
24718 extract
24719 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
24720 break;
24721 case E_V16HImode:
24722 if (unsigned_p)
24723 unpack = gen_avx2_zero_extendv8hiv8si2;
24724 else
24725 unpack = gen_avx2_sign_extendv8hiv8si2;
24726 halfmode = V8HImode;
24727 extract
24728 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
24729 break;
24730 case E_V16SImode:
24731 if (unsigned_p)
24732 unpack = gen_avx512f_zero_extendv8siv8di2;
24733 else
24734 unpack = gen_avx512f_sign_extendv8siv8di2;
24735 halfmode = V8SImode;
24736 extract
24737 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
24738 break;
24739 case E_V8SImode:
24740 if (unsigned_p)
24741 unpack = gen_avx2_zero_extendv4siv4di2;
24742 else
24743 unpack = gen_avx2_sign_extendv4siv4di2;
24744 halfmode = V4SImode;
24745 extract
24746 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
24747 break;
24748 case E_V16QImode:
24749 if (unsigned_p)
24750 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
24751 else
24752 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
24753 break;
24754 case E_V8HImode:
24755 if (unsigned_p)
24756 unpack = gen_sse4_1_zero_extendv4hiv4si2;
24757 else
24758 unpack = gen_sse4_1_sign_extendv4hiv4si2;
24759 break;
24760 case E_V4SImode:
24761 if (unsigned_p)
24762 unpack = gen_sse4_1_zero_extendv2siv2di2;
24763 else
24764 unpack = gen_sse4_1_sign_extendv2siv2di2;
24765 break;
24766 default:
24767 gcc_unreachable ();
24768 }
24769
24770 if (GET_MODE_SIZE (imode) >= 32)
24771 {
24772 tmp = gen_reg_rtx (halfmode);
24773 emit_insn (extract (tmp, src));
24774 }
24775 else if (high_p)
24776 {
24777 /* Shift higher 8 bytes to lower 8 bytes. */
24778 tmp = gen_reg_rtx (V1TImode);
24779 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
24780 GEN_INT (64)));
24781 tmp = gen_lowpart (imode, tmp);
24782 }
24783 else
24784 tmp = src;
24785
24786 emit_insn (unpack (dest, tmp));
24787 }
24788 else
24789 {
24790 rtx (*unpack)(rtx, rtx, rtx);
24791
24792 switch (imode)
24793 {
24794 case E_V16QImode:
24795 if (high_p)
24796 unpack = gen_vec_interleave_highv16qi;
24797 else
24798 unpack = gen_vec_interleave_lowv16qi;
24799 break;
24800 case E_V8HImode:
24801 if (high_p)
24802 unpack = gen_vec_interleave_highv8hi;
24803 else
24804 unpack = gen_vec_interleave_lowv8hi;
24805 break;
24806 case E_V4SImode:
24807 if (high_p)
24808 unpack = gen_vec_interleave_highv4si;
24809 else
24810 unpack = gen_vec_interleave_lowv4si;
24811 break;
24812 default:
24813 gcc_unreachable ();
24814 }
24815
24816 if (unsigned_p)
24817 tmp = force_reg (imode, CONST0_RTX (imode));
24818 else
24819 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
24820 src, pc_rtx, pc_rtx);
24821
24822 rtx tmp2 = gen_reg_rtx (imode);
24823 emit_insn (unpack (tmp2, src, tmp));
24824 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
24825 }
24826 }
24827
24828 /* Expand conditional increment or decrement using adc/sbb instructions.
24829 The default case using setcc followed by the conditional move can be
24830 done by generic code. */
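/* Illustration (not from the original sources): for unsigned A and B the
   statement  x += (a < b)  can be emitted as
       cmp  a, b        ; CF = (a < b)
       adc  x, 0        ; x += CF
   and  x -= (a < b)  correspondingly uses sbb.  This function builds the
   adc/sbb form when the comparison can be expressed via the carry flag.  */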
24831 bool
24832 ix86_expand_int_addcc (rtx operands[])
24833 {
24834 enum rtx_code code = GET_CODE (operands[1]);
24835 rtx flags;
24836 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
24837 rtx compare_op;
24838 rtx val = const0_rtx;
24839 bool fpcmp = false;
24840 machine_mode mode;
24841 rtx op0 = XEXP (operands[1], 0);
24842 rtx op1 = XEXP (operands[1], 1);
24843
24844 if (operands[3] != const1_rtx
24845 && operands[3] != constm1_rtx)
24846 return false;
24847 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
24848 return false;
24849 code = GET_CODE (compare_op);
24850
24851 flags = XEXP (compare_op, 0);
24852
24853 if (GET_MODE (flags) == CCFPmode)
24854 {
24855 fpcmp = true;
24856 code = ix86_fp_compare_code_to_integer (code);
24857 }
24858
24859 if (code != LTU)
24860 {
24861 val = constm1_rtx;
24862 if (fpcmp)
24863 PUT_CODE (compare_op,
24864 reverse_condition_maybe_unordered
24865 (GET_CODE (compare_op)));
24866 else
24867 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
24868 }
24869
24870 mode = GET_MODE (operands[0]);
24871
24872 /* Construct either adc or sbb insn. */
24873 if ((code == LTU) == (operands[3] == constm1_rtx))
24874 {
24875 switch (mode)
24876 {
24877 case E_QImode:
24878 insn = gen_subqi3_carry;
24879 break;
24880 case E_HImode:
24881 insn = gen_subhi3_carry;
24882 break;
24883 case E_SImode:
24884 insn = gen_subsi3_carry;
24885 break;
24886 case E_DImode:
24887 insn = gen_subdi3_carry;
24888 break;
24889 default:
24890 gcc_unreachable ();
24891 }
24892 }
24893 else
24894 {
24895 switch (mode)
24896 {
24897 case E_QImode:
24898 insn = gen_addqi3_carry;
24899 break;
24900 case E_HImode:
24901 insn = gen_addhi3_carry;
24902 break;
24903 case E_SImode:
24904 insn = gen_addsi3_carry;
24905 break;
24906 case E_DImode:
24907 insn = gen_adddi3_carry;
24908 break;
24909 default:
24910 gcc_unreachable ();
24911 }
24912 }
24913 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
24914
24915 return true;
24916 }
24917
24918
24919 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
24920 but works for floating-point parameters and non-offsettable memories.
24921 For pushes, it returns just stack offsets; the values will be saved
24922 in the right order. At most four parts are generated. */
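/* For example (illustration), on a 32-bit target a DFmode operand is split
   into two SImode parts, an XFmode operand into three, and a TFmode operand
   into four.  */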
24923
24924 static int
24925 ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
24926 {
24927 int size;
24928
24929 if (!TARGET_64BIT)
24930 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
24931 else
24932 size = (GET_MODE_SIZE (mode) + 4) / 8;
24933
24934 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
24935 gcc_assert (size >= 2 && size <= 4);
24936
24937 /* Optimize constant pool reference to immediates. This is used by fp
24938 moves, that force all constants to memory to allow combining. */
24939 if (MEM_P (operand) && MEM_READONLY_P (operand))
24940 operand = avoid_constant_pool_reference (operand);
24941
24942 if (MEM_P (operand) && !offsettable_memref_p (operand))
24943 {
24944 /* The only non-offsettable memories we handle are pushes. */
24945 int ok = push_operand (operand, VOIDmode);
24946
24947 gcc_assert (ok);
24948
24949 operand = copy_rtx (operand);
24950 PUT_MODE (operand, word_mode);
24951 parts[0] = parts[1] = parts[2] = parts[3] = operand;
24952 return size;
24953 }
24954
24955 if (GET_CODE (operand) == CONST_VECTOR)
24956 {
24957 scalar_int_mode imode = int_mode_for_mode (mode).require ();
24958 /* Caution: if we looked through a constant pool memory above,
24959 the operand may actually have a different mode now. That's
24960 ok, since we want to pun this all the way back to an integer. */
24961 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
24962 gcc_assert (operand != NULL);
24963 mode = imode;
24964 }
24965
24966 if (!TARGET_64BIT)
24967 {
24968 if (mode == DImode)
24969 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
24970 else
24971 {
24972 int i;
24973
24974 if (REG_P (operand))
24975 {
24976 gcc_assert (reload_completed);
24977 for (i = 0; i < size; i++)
24978 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
24979 }
24980 else if (offsettable_memref_p (operand))
24981 {
24982 operand = adjust_address (operand, SImode, 0);
24983 parts[0] = operand;
24984 for (i = 1; i < size; i++)
24985 parts[i] = adjust_address (operand, SImode, 4 * i);
24986 }
24987 else if (CONST_DOUBLE_P (operand))
24988 {
24989 const REAL_VALUE_TYPE *r;
24990 long l[4];
24991
24992 r = CONST_DOUBLE_REAL_VALUE (operand);
24993 switch (mode)
24994 {
24995 case E_TFmode:
24996 real_to_target (l, r, mode);
24997 parts[3] = gen_int_mode (l[3], SImode);
24998 parts[2] = gen_int_mode (l[2], SImode);
24999 break;
25000 case E_XFmode:
25001 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
25002 long double may not be 80-bit. */
25003 real_to_target (l, r, mode);
25004 parts[2] = gen_int_mode (l[2], SImode);
25005 break;
25006 case E_DFmode:
25007 REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
25008 break;
25009 default:
25010 gcc_unreachable ();
25011 }
25012 parts[1] = gen_int_mode (l[1], SImode);
25013 parts[0] = gen_int_mode (l[0], SImode);
25014 }
25015 else
25016 gcc_unreachable ();
25017 }
25018 }
25019 else
25020 {
25021 if (mode == TImode)
25022 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
25023 if (mode == XFmode || mode == TFmode)
25024 {
25025 machine_mode upper_mode = mode==XFmode ? SImode : DImode;
25026 if (REG_P (operand))
25027 {
25028 gcc_assert (reload_completed);
25029 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
25030 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
25031 }
25032 else if (offsettable_memref_p (operand))
25033 {
25034 operand = adjust_address (operand, DImode, 0);
25035 parts[0] = operand;
25036 parts[1] = adjust_address (operand, upper_mode, 8);
25037 }
25038 else if (CONST_DOUBLE_P (operand))
25039 {
25040 long l[4];
25041
25042 real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
25043
25044 /* real_to_target puts 32-bit pieces in each long. */
25045 parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
25046 | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
25047 << 32), DImode);
25048
25049 if (upper_mode == SImode)
25050 parts[1] = gen_int_mode (l[2], SImode);
25051 else
25052 parts[1]
25053 = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
25054 | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
25055 << 32), DImode);
25056 }
25057 else
25058 gcc_unreachable ();
25059 }
25060 }
25061
25062 return size;
25063 }
25064
25065 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
25066 The destination is split into parts placed in operands 2-5 and the
25067 source into parts placed in operands 6-9, in the correct order;
25068 all required moves are emitted here. */
25069
25070 void
25071 ix86_split_long_move (rtx operands[])
25072 {
25073 rtx part[2][4];
25074 int nparts, i, j;
25075 int push = 0;
25076 int collisions = 0;
25077 machine_mode mode = GET_MODE (operands[0]);
25078 bool collisionparts[4];
25079
25080 /* The DFmode expanders may ask us to move a double.
25081 For a 64-bit target this is a single move. By hiding that fact
25082 here we simplify the i386.md splitters. */
25083 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
25084 {
25085 /* Optimize constant pool reference to immediates. This is used by
25086 fp moves, that force all constants to memory to allow combining. */
25087
25088 if (MEM_P (operands[1])
25089 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
25090 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
25091 operands[1] = get_pool_constant (XEXP (operands[1], 0));
25092 if (push_operand (operands[0], VOIDmode))
25093 {
25094 operands[0] = copy_rtx (operands[0]);
25095 PUT_MODE (operands[0], word_mode);
25096 }
25097 else
25098 operands[0] = gen_lowpart (DImode, operands[0]);
25099 operands[1] = gen_lowpart (DImode, operands[1]);
25100 emit_move_insn (operands[0], operands[1]);
25101 return;
25102 }
25103
25104 /* The only non-offsettable memory we handle is a push. */
25105 if (push_operand (operands[0], VOIDmode))
25106 push = 1;
25107 else
25108 gcc_assert (!MEM_P (operands[0])
25109 || offsettable_memref_p (operands[0]));
25110
25111 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
25112 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
25113
25114 /* When emitting a push, take care of source operands on the stack. */
25115 if (push && MEM_P (operands[1])
25116 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
25117 {
25118 rtx src_base = XEXP (part[1][nparts - 1], 0);
25119
25120 /* Compensate for the stack decrement by 4. */
25121 if (!TARGET_64BIT && nparts == 3
25122 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
25123 src_base = plus_constant (Pmode, src_base, 4);
25124
25125 /* src_base refers to the stack pointer and is
25126 automatically decreased by the emitted pushes. */
25127 for (i = 0; i < nparts; i++)
25128 part[1][i] = change_address (part[1][i],
25129 GET_MODE (part[1][i]), src_base);
25130 }
25131
25132 /* We need to do the copy in the right order in case an address register
25133 of the source overlaps the destination. */
25134 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
25135 {
25136 rtx tmp;
25137
25138 for (i = 0; i < nparts; i++)
25139 {
25140 collisionparts[i]
25141 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
25142 if (collisionparts[i])
25143 collisions++;
25144 }
25145
25146 /* Collision in the middle part can be handled by reordering. */
25147 if (collisions == 1 && nparts == 3 && collisionparts [1])
25148 {
25149 std::swap (part[0][1], part[0][2]);
25150 std::swap (part[1][1], part[1][2]);
25151 }
25152 else if (collisions == 1
25153 && nparts == 4
25154 && (collisionparts [1] || collisionparts [2]))
25155 {
25156 if (collisionparts [1])
25157 {
25158 std::swap (part[0][1], part[0][2]);
25159 std::swap (part[1][1], part[1][2]);
25160 }
25161 else
25162 {
25163 std::swap (part[0][2], part[0][3]);
25164 std::swap (part[1][2], part[1][3]);
25165 }
25166 }
25167
25168 /* If there are more collisions, we can't handle it by reordering.
25169 Do an lea to the last part and use only one colliding move. */
25170 else if (collisions > 1)
25171 {
25172 rtx base, addr;
25173
25174 collisions = 1;
25175
25176 base = part[0][nparts - 1];
25177
25178 /* Handle the case when the last part isn't valid for lea.
25179 Happens in 64-bit mode storing the 12-byte XFmode. */
25180 if (GET_MODE (base) != Pmode)
25181 base = gen_rtx_REG (Pmode, REGNO (base));
25182
25183 addr = XEXP (part[1][0], 0);
25184 if (TARGET_TLS_DIRECT_SEG_REFS)
25185 {
25186 struct ix86_address parts;
25187 int ok = ix86_decompose_address (addr, &parts);
25188 gcc_assert (ok);
25189 /* It is not valid to use %gs: or %fs: in lea. */
25190 gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
25191 }
25192 emit_insn (gen_rtx_SET (base, addr));
25193 part[1][0] = replace_equiv_address (part[1][0], base);
25194 for (i = 1; i < nparts; i++)
25195 {
25196 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
25197 part[1][i] = replace_equiv_address (part[1][i], tmp);
25198 }
25199 }
25200 }
25201
25202 if (push)
25203 {
25204 if (!TARGET_64BIT)
25205 {
25206 if (nparts == 3)
25207 {
25208 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
25209 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
25210 stack_pointer_rtx, GEN_INT (-4)));
25211 emit_move_insn (part[0][2], part[1][2]);
25212 }
25213 else if (nparts == 4)
25214 {
25215 emit_move_insn (part[0][3], part[1][3]);
25216 emit_move_insn (part[0][2], part[1][2]);
25217 }
25218 }
25219 else
25220 {
25221 /* In 64-bit mode we don't have a 32-bit push available. In case this
25222 is a register, that is OK - we will just use the larger counterpart.
25223 We also retype memory - this comes from an attempt to avoid a REX
25224 prefix when moving the second half of a TFmode value. */
25225 if (GET_MODE (part[1][1]) == SImode)
25226 {
25227 switch (GET_CODE (part[1][1]))
25228 {
25229 case MEM:
25230 part[1][1] = adjust_address (part[1][1], DImode, 0);
25231 break;
25232
25233 case REG:
25234 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
25235 break;
25236
25237 default:
25238 gcc_unreachable ();
25239 }
25240
25241 if (GET_MODE (part[1][0]) == SImode)
25242 part[1][0] = part[1][1];
25243 }
25244 }
25245 emit_move_insn (part[0][1], part[1][1]);
25246 emit_move_insn (part[0][0], part[1][0]);
25247 return;
25248 }
25249
25250 /* Choose the correct order so as not to overwrite the source before it is copied. */
25251 if ((REG_P (part[0][0])
25252 && REG_P (part[1][1])
25253 && (REGNO (part[0][0]) == REGNO (part[1][1])
25254 || (nparts == 3
25255 && REGNO (part[0][0]) == REGNO (part[1][2]))
25256 || (nparts == 4
25257 && REGNO (part[0][0]) == REGNO (part[1][3]))))
25258 || (collisions > 0
25259 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
25260 {
25261 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
25262 {
25263 operands[2 + i] = part[0][j];
25264 operands[6 + i] = part[1][j];
25265 }
25266 }
25267 else
25268 {
25269 for (i = 0; i < nparts; i++)
25270 {
25271 operands[2 + i] = part[0][i];
25272 operands[6 + i] = part[1][i];
25273 }
25274 }
25275
25276 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
25277 if (optimize_insn_for_size_p ())
25278 {
25279 for (j = 0; j < nparts - 1; j++)
25280 if (CONST_INT_P (operands[6 + j])
25281 && operands[6 + j] != const0_rtx
25282 && REG_P (operands[2 + j]))
25283 for (i = j; i < nparts - 1; i++)
25284 if (CONST_INT_P (operands[7 + i])
25285 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
25286 operands[7 + i] = operands[2 + j];
25287 }
25288
25289 for (i = 0; i < nparts; i++)
25290 emit_move_insn (operands[2 + i], operands[6 + i]);
25291
25292 return;
25293 }
25294
25295 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
25296 left shift by a constant, either using a single shift or
25297 a sequence of add instructions. */
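/* Illustration: when the count is 1, or when COUNT repeated adds cost no
   more than one constant shift (and we are not optimizing for size), the
   shift is emitted as COUNT "add reg, reg" instructions (reg += reg doubles
   the value); otherwise a single half-word shift instruction is used.  */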
25298
25299 static void
25300 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
25301 {
25302 rtx (*insn)(rtx, rtx, rtx);
25303
25304 if (count == 1
25305 || (count * ix86_cost->add <= ix86_cost->shift_const
25306 && !optimize_insn_for_size_p ()))
25307 {
25308 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
25309 while (count-- > 0)
25310 emit_insn (insn (operand, operand, operand));
25311 }
25312 else
25313 {
25314 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25315 emit_insn (insn (operand, operand, GEN_INT (count)));
25316 }
25317 }
25318
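/* Split a double-word left shift: MODE is the full double-word mode (DImode
   on 32-bit targets, TImode on 64-bit ones), operands[0]/operands[1] are the
   destination and source, operands[2] is the shift count, and SCRATCH is an
   optional scratch register used on TARGET_CMOVE targets for variable
   counts.  */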
25319 void
25320 ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
25321 {
25322 rtx (*gen_ashl3)(rtx, rtx, rtx);
25323 rtx (*gen_shld)(rtx, rtx, rtx);
25324 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25325
25326 rtx low[2], high[2];
25327 int count;
25328
25329 if (CONST_INT_P (operands[2]))
25330 {
25331 split_double_mode (mode, operands, 2, low, high);
25332 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25333
25334 if (count >= half_width)
25335 {
25336 emit_move_insn (high[0], low[1]);
25337 emit_move_insn (low[0], const0_rtx);
25338
25339 if (count > half_width)
25340 ix86_expand_ashl_const (high[0], count - half_width, mode);
25341 }
25342 else
25343 {
25344 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25345
25346 if (!rtx_equal_p (operands[0], operands[1]))
25347 emit_move_insn (operands[0], operands[1]);
25348
25349 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
25350 ix86_expand_ashl_const (low[0], count, mode);
25351 }
25352 return;
25353 }
25354
25355 split_double_mode (mode, operands, 1, low, high);
25356
25357 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
25358
25359 if (operands[1] == const1_rtx)
25360 {
25361 /* Assuming we've chosen QImode-capable registers, 1 << N
25362 can be done with two 32/64-bit shifts, no branches, no cmoves. */
25363 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
25364 {
25365 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
25366
25367 ix86_expand_clear (low[0]);
25368 ix86_expand_clear (high[0]);
25369 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
25370
25371 d = gen_lowpart (QImode, low[0]);
25372 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25373 s = gen_rtx_EQ (QImode, flags, const0_rtx);
25374 emit_insn (gen_rtx_SET (d, s));
25375
25376 d = gen_lowpart (QImode, high[0]);
25377 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
25378 s = gen_rtx_NE (QImode, flags, const0_rtx);
25379 emit_insn (gen_rtx_SET (d, s));
25380 }
25381
25382 /* Otherwise, we can get the same results by manually performing
25383 a bit extract operation on bit 5/6, and then performing the two
25384 shifts. The two methods of getting 0/1 into low/high are exactly
25385 the same size. Avoiding the shift in the bit extract case helps
25386 pentium4 a bit; no one else seems to care much either way. */
25387 else
25388 {
25389 machine_mode half_mode;
25390 rtx (*gen_lshr3)(rtx, rtx, rtx);
25391 rtx (*gen_and3)(rtx, rtx, rtx);
25392 rtx (*gen_xor3)(rtx, rtx, rtx);
25393 HOST_WIDE_INT bits;
25394 rtx x;
25395
25396 if (mode == DImode)
25397 {
25398 half_mode = SImode;
25399 gen_lshr3 = gen_lshrsi3;
25400 gen_and3 = gen_andsi3;
25401 gen_xor3 = gen_xorsi3;
25402 bits = 5;
25403 }
25404 else
25405 {
25406 half_mode = DImode;
25407 gen_lshr3 = gen_lshrdi3;
25408 gen_and3 = gen_anddi3;
25409 gen_xor3 = gen_xordi3;
25410 bits = 6;
25411 }
25412
25413 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
25414 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
25415 else
25416 x = gen_lowpart (half_mode, operands[2]);
25417 emit_insn (gen_rtx_SET (high[0], x));
25418
25419 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
25420 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
25421 emit_move_insn (low[0], high[0]);
25422 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
25423 }
25424
25425 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25426 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
25427 return;
25428 }
25429
25430 if (operands[1] == constm1_rtx)
25431 {
25432 /* For -1 << N, we can avoid the shld instruction, because we
25433 know that we're shifting 0...31/63 ones into a -1. */
25434 emit_move_insn (low[0], constm1_rtx);
25435 if (optimize_insn_for_size_p ())
25436 emit_move_insn (high[0], low[0]);
25437 else
25438 emit_move_insn (high[0], constm1_rtx);
25439 }
25440 else
25441 {
25442 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
25443
25444 if (!rtx_equal_p (operands[0], operands[1]))
25445 emit_move_insn (operands[0], operands[1]);
25446
25447 split_double_mode (mode, operands, 1, low, high);
25448 emit_insn (gen_shld (high[0], low[0], operands[2]));
25449 }
25450
25451 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
25452
25453 if (TARGET_CMOVE && scratch)
25454 {
25455 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25456 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25457
25458 ix86_expand_clear (scratch);
25459 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
25460 }
25461 else
25462 {
25463 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25464 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25465
25466 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
25467 }
25468 }
25469
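/* Split a double-word arithmetic right shift; operand layout as in
   ix86_split_ashl above.  */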
25470 void
25471 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
25472 {
25473 rtx (*gen_ashr3)(rtx, rtx, rtx)
25474 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
25475 rtx (*gen_shrd)(rtx, rtx, rtx);
25476 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25477
25478 rtx low[2], high[2];
25479 int count;
25480
25481 if (CONST_INT_P (operands[2]))
25482 {
25483 split_double_mode (mode, operands, 2, low, high);
25484 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25485
25486 if (count == GET_MODE_BITSIZE (mode) - 1)
25487 {
25488 emit_move_insn (high[0], high[1]);
25489 emit_insn (gen_ashr3 (high[0], high[0],
25490 GEN_INT (half_width - 1)));
25491 emit_move_insn (low[0], high[0]);
25492
25493 }
25494 else if (count >= half_width)
25495 {
25496 emit_move_insn (low[0], high[1]);
25497 emit_move_insn (high[0], low[0]);
25498 emit_insn (gen_ashr3 (high[0], high[0],
25499 GEN_INT (half_width - 1)));
25500
25501 if (count > half_width)
25502 emit_insn (gen_ashr3 (low[0], low[0],
25503 GEN_INT (count - half_width)));
25504 }
25505 else
25506 {
25507 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25508
25509 if (!rtx_equal_p (operands[0], operands[1]))
25510 emit_move_insn (operands[0], operands[1]);
25511
25512 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25513 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
25514 }
25515 }
25516 else
25517 {
25518 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25519
25520 if (!rtx_equal_p (operands[0], operands[1]))
25521 emit_move_insn (operands[0], operands[1]);
25522
25523 split_double_mode (mode, operands, 1, low, high);
25524
25525 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25526 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
25527
25528 if (TARGET_CMOVE && scratch)
25529 {
25530 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25531 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25532
25533 emit_move_insn (scratch, high[0]);
25534 emit_insn (gen_ashr3 (scratch, scratch,
25535 GEN_INT (half_width - 1)));
25536 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25537 scratch));
25538 }
25539 else
25540 {
25541 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
25542 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
25543
25544 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
25545 }
25546 }
25547 }
25548
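/* Split a double-word logical right shift; operand layout as in
   ix86_split_ashl above.  */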
25549 void
25550 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
25551 {
25552 rtx (*gen_lshr3)(rtx, rtx, rtx)
25553 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
25554 rtx (*gen_shrd)(rtx, rtx, rtx);
25555 int half_width = GET_MODE_BITSIZE (mode) >> 1;
25556
25557 rtx low[2], high[2];
25558 int count;
25559
25560 if (CONST_INT_P (operands[2]))
25561 {
25562 split_double_mode (mode, operands, 2, low, high);
25563 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
25564
25565 if (count >= half_width)
25566 {
25567 emit_move_insn (low[0], high[1]);
25568 ix86_expand_clear (high[0]);
25569
25570 if (count > half_width)
25571 emit_insn (gen_lshr3 (low[0], low[0],
25572 GEN_INT (count - half_width)));
25573 }
25574 else
25575 {
25576 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25577
25578 if (!rtx_equal_p (operands[0], operands[1]))
25579 emit_move_insn (operands[0], operands[1]);
25580
25581 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
25582 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
25583 }
25584 }
25585 else
25586 {
25587 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
25588
25589 if (!rtx_equal_p (operands[0], operands[1]))
25590 emit_move_insn (operands[0], operands[1]);
25591
25592 split_double_mode (mode, operands, 1, low, high);
25593
25594 emit_insn (gen_shrd (low[0], high[0], operands[2]));
25595 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
25596
25597 if (TARGET_CMOVE && scratch)
25598 {
25599 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
25600 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
25601
25602 ix86_expand_clear (scratch);
25603 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
25604 scratch));
25605 }
25606 else
25607 {
25608 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
25609 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
25610
25611 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
25612 }
25613 }
25614 }
25615
25616 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
25617 static void
25618 predict_jump (int prob)
25619 {
25620 rtx_insn *insn = get_last_insn ();
25621 gcc_assert (JUMP_P (insn));
25622 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
25623 }
25624
25625 /* Helper function for the string operations below. Test VARIABLE whether
25626 it is aligned to VALUE bytes. If true, jump to the label. */
25627 static rtx_code_label *
25628 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
25629 {
25630 rtx_code_label *label = gen_label_rtx ();
25631 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
25632 if (GET_MODE (variable) == DImode)
25633 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
25634 else
25635 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
25636 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
25637 1, label);
25638 if (epilogue)
25639 predict_jump (REG_BR_PROB_BASE * 50 / 100);
25640 else
25641 predict_jump (REG_BR_PROB_BASE * 90 / 100);
25642 return label;
25643 }
25644
25645 /* Adjust COUNTER by the VALUE. */
25646 static void
25647 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
25648 {
25649 rtx (*gen_add)(rtx, rtx, rtx)
25650 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
25651
25652 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
25653 }
25654
25655 /* Zero extend the possibly-SImode EXP to a Pmode register. */
25656 rtx
25657 ix86_zero_extend_to_Pmode (rtx exp)
25658 {
25659 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
25660 }
25661
25662 /* Divide COUNTREG by SCALE. */
25663 static rtx
25664 scale_counter (rtx countreg, int scale)
25665 {
25666 rtx sc;
25667
25668 if (scale == 1)
25669 return countreg;
25670 if (CONST_INT_P (countreg))
25671 return GEN_INT (INTVAL (countreg) / scale);
25672 gcc_assert (REG_P (countreg));
25673
25674 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
25675 GEN_INT (exact_log2 (scale)),
25676 NULL, 1, OPTAB_DIRECT);
25677 return sc;
25678 }
25679
25680 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
25681 DImode for constant loop counts. */
25682
25683 static machine_mode
25684 counter_mode (rtx count_exp)
25685 {
25686 if (GET_MODE (count_exp) != VOIDmode)
25687 return GET_MODE (count_exp);
25688 if (!CONST_INT_P (count_exp))
25689 return Pmode;
25690 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
25691 return DImode;
25692 return SImode;
25693 }
25694
25695 /* Copy the address to a Pmode register. This is used for x32 to
25696 truncate DImode TLS address to a SImode register. */
25697
25698 static rtx
25699 ix86_copy_addr_to_reg (rtx addr)
25700 {
25701 rtx reg;
25702 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
25703 {
25704 reg = copy_addr_to_reg (addr);
25705 REG_POINTER (reg) = 1;
25706 return reg;
25707 }
25708 else
25709 {
25710 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
25711 reg = copy_to_mode_reg (DImode, addr);
25712 REG_POINTER (reg) = 1;
25713 return gen_rtx_SUBREG (SImode, reg, 0);
25714 }
25715 }
25716
25717 /* When ISSETMEM is FALSE, output a simple loop to move the memory pointed to
25718 by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times; the overall
25719 size is COUNT, specified in bytes. When ISSETMEM is TRUE, output the
25720 equivalent loop to set memory to VALUE (supposed to be in MODE).
25721
25722 The size is rounded down to a whole number of chunks moved at once.
25723 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
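/* A rough sketch of the emitted code (illustration only):
     size = count & -(GET_MODE_SIZE (mode) * unroll);
     iter = 0;
   top:
     copy or store UNROLL chunks of MODE at dest + iter (and src + iter);
     iter += GET_MODE_SIZE (mode) * unroll;
     if (iter < size) goto top;
     dest += iter;  if (!issetmem) src += iter;  */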
25724
25725
25726 static void
25727 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
25728 rtx destptr, rtx srcptr, rtx value,
25729 rtx count, machine_mode mode, int unroll,
25730 int expected_size, bool issetmem)
25731 {
25732 rtx_code_label *out_label, *top_label;
25733 rtx iter, tmp;
25734 machine_mode iter_mode = counter_mode (count);
25735 int piece_size_n = GET_MODE_SIZE (mode) * unroll;
25736 rtx piece_size = GEN_INT (piece_size_n);
25737 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
25738 rtx size;
25739 int i;
25740
25741 top_label = gen_label_rtx ();
25742 out_label = gen_label_rtx ();
25743 iter = gen_reg_rtx (iter_mode);
25744
25745 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
25746 NULL, 1, OPTAB_DIRECT);
25747 /* Those two should combine. */
25748 if (piece_size == const1_rtx)
25749 {
25750 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
25751 true, out_label);
25752 predict_jump (REG_BR_PROB_BASE * 10 / 100);
25753 }
25754 emit_move_insn (iter, const0_rtx);
25755
25756 emit_label (top_label);
25757
25758 tmp = convert_modes (Pmode, iter_mode, iter, true);
25759
25760 /* This assert could be relaxed - in that case we'll need to compute
25761 the smallest power of two containing PIECE_SIZE_N and pass it to
25762 offset_address. */
25763 gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
25764 destmem = offset_address (destmem, tmp, piece_size_n);
25765 destmem = adjust_address (destmem, mode, 0);
25766
25767 if (!issetmem)
25768 {
25769 srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
25770 srcmem = adjust_address (srcmem, mode, 0);
25771
25772 /* When unrolling for chips that reorder memory reads and writes,
25773 we can save registers by using a single temporary.
25774 Also, using 4 temporaries is overkill in 32-bit mode. */
25775 if (!TARGET_64BIT && 0)
25776 {
25777 for (i = 0; i < unroll; i++)
25778 {
25779 if (i)
25780 {
25781 destmem =
25782 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25783 srcmem =
25784 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25785 }
25786 emit_move_insn (destmem, srcmem);
25787 }
25788 }
25789 else
25790 {
25791 rtx tmpreg[4];
25792 gcc_assert (unroll <= 4);
25793 for (i = 0; i < unroll; i++)
25794 {
25795 tmpreg[i] = gen_reg_rtx (mode);
25796 if (i)
25797 {
25798 srcmem =
25799 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
25800 }
25801 emit_move_insn (tmpreg[i], srcmem);
25802 }
25803 for (i = 0; i < unroll; i++)
25804 {
25805 if (i)
25806 {
25807 destmem =
25808 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25809 }
25810 emit_move_insn (destmem, tmpreg[i]);
25811 }
25812 }
25813 }
25814 else
25815 for (i = 0; i < unroll; i++)
25816 {
25817 if (i)
25818 destmem =
25819 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
25820 emit_move_insn (destmem, value);
25821 }
25822
25823 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
25824 true, OPTAB_LIB_WIDEN);
25825 if (tmp != iter)
25826 emit_move_insn (iter, tmp);
25827
25828 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
25829 true, top_label);
25830 if (expected_size != -1)
25831 {
25832 expected_size /= GET_MODE_SIZE (mode) * unroll;
25833 if (expected_size == 0)
25834 predict_jump (0);
25835 else if (expected_size > REG_BR_PROB_BASE)
25836 predict_jump (REG_BR_PROB_BASE - 1);
25837 else
25838 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
25839 }
25840 else
25841 predict_jump (REG_BR_PROB_BASE * 80 / 100);
25842 iter = ix86_zero_extend_to_Pmode (iter);
25843 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
25844 true, OPTAB_LIB_WIDEN);
25845 if (tmp != destptr)
25846 emit_move_insn (destptr, tmp);
25847 if (!issetmem)
25848 {
25849 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
25850 true, OPTAB_LIB_WIDEN);
25851 if (tmp != srcptr)
25852 emit_move_insn (srcptr, tmp);
25853 }
25854 emit_label (out_label);
25855 }
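
/* A rough sketch of the code emitted above for the memcpy case with
   MODE == SImode and UNROLL == 4 (one 16-byte chunk per iteration):

     size = count & ~15;
     iter = 0;
   top:
     tmp0 = *(int *) (src + iter);      tmp1 = *(int *) (src + iter + 4);
     tmp2 = *(int *) (src + iter + 8);  tmp3 = *(int *) (src + iter + 12);
     *(int *) (dst + iter) = tmp0;      *(int *) (dst + iter + 4) = tmp1;
     *(int *) (dst + iter + 8) = tmp2;  *(int *) (dst + iter + 12) = tmp3;
     iter += 16;
     if (iter < size) goto top;
     dst += iter;  src += iter;

   A "size == 0" guard before the loop is emitted only for one-byte chunks;
   for wider chunks the callers are expected to guard against entering the
   loop with a zero rounded size, and the epilogue handles the remaining
   count & 15 bytes.  */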
25856
25857 /* Output a "rep; mov" or "rep; stos" instruction depending on the ISSETMEM argument.
25858 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
25859 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
25860 For the setmem case, VALUE is ORIG_VALUE promoted to a wider size.
25861 ORIG_VALUE is the original value passed to memset to fill the memory with.
25862 Other arguments have the same meaning as for the previous function. */
25863
25864 static void
25865 expand_set_or_movmem_via_rep (rtx destmem, rtx srcmem,
25866 rtx destptr, rtx srcptr, rtx value, rtx orig_value,
25867 rtx count,
25868 machine_mode mode, bool issetmem)
25869 {
25870 rtx destexp;
25871 rtx srcexp;
25872 rtx countreg;
25873 HOST_WIDE_INT rounded_count;
25874
25875 /* If possible, it is shorter to use rep movs.
25876 TODO: Maybe it is better to move this logic to decide_alg. */
25877 if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
25878 && (!issetmem || orig_value == const0_rtx))
25879 mode = SImode;
25880
25881 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
25882 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
25883
25884 countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
25885 GET_MODE_SIZE (mode)));
25886 if (mode != QImode)
25887 {
25888 destexp = gen_rtx_ASHIFT (Pmode, countreg,
25889 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25890 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
25891 }
25892 else
25893 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
25894 if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
25895 {
25896 rounded_count
25897 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25898 destmem = shallow_copy_rtx (destmem);
25899 set_mem_size (destmem, rounded_count);
25900 }
25901 else if (MEM_SIZE_KNOWN_P (destmem))
25902 clear_mem_size (destmem);
25903
25904 if (issetmem)
25905 {
25906 value = force_reg (mode, gen_lowpart (mode, value));
25907 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
25908 }
25909 else
25910 {
25911 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
25912 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
25913 if (mode != QImode)
25914 {
25915 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
25916 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
25917 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
25918 }
25919 else
25920 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
25921 if (CONST_INT_P (count))
25922 {
25923 rounded_count
25924 = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
25925 srcmem = shallow_copy_rtx (srcmem);
25926 set_mem_size (srcmem, rounded_count);
25927 }
25928 else
25929 {
25930 if (MEM_SIZE_KNOWN_P (srcmem))
25931 clear_mem_size (srcmem);
25932 }
25933 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
25934 destexp, srcexp));
25935 }
25936 }
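
/* As a rough sketch, for a memcpy with MODE == SImode the sequence above is
   equivalent to:

     ecx = count >> 2;                  /* scale_counter  */
     rep movsd                          /* esi/edi advance by 4 * ecx  */

   DESTEXP/SRCEXP describe the final pointer values, destptr + (countreg << 2)
   and srcptr + (countreg << 2), so the rep_mov/rep_stos patterns expose the
   side effects on the pointer registers to the rest of the compiler.  */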
25937
25938 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
25939 DESTMEM.
25940 SRCMEM is passed by pointer so it can be updated on return.
25941 Return value is the updated DESTMEM. */
25942 static rtx
25943 emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
25944 HOST_WIDE_INT size_to_move)
25945 {
25946 rtx dst = destmem, src = *srcmem, adjust, tempreg;
25947 enum insn_code code;
25948 machine_mode move_mode;
25949 int piece_size, i;
25950
25951 /* Find the widest mode in which we could perform moves.
25952 Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
25953 it until a move of that size is supported. */
25954 piece_size = 1 << floor_log2 (size_to_move);
25955 while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
25956 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25957 {
25958 gcc_assert (piece_size > 1);
25959 piece_size >>= 1;
25960 }
25961
25962 /* Find the corresponding vector mode with the same size as MOVE_MODE.
25963 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
25964 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
25965 {
25966 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
25967 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
25968 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
25969 {
25970 move_mode = word_mode;
25971 piece_size = GET_MODE_SIZE (move_mode);
25972 code = optab_handler (mov_optab, move_mode);
25973 }
25974 }
25975 gcc_assert (code != CODE_FOR_nothing);
25976
25977 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
25978 src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
25979
25980 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
25981 gcc_assert (size_to_move % piece_size == 0);
25982 adjust = GEN_INT (piece_size);
25983 for (i = 0; i < size_to_move; i += piece_size)
25984 {
25985 /* We move from memory to memory, so we'll need to do it via
25986 a temporary register. */
25987 tempreg = gen_reg_rtx (move_mode);
25988 emit_insn (GEN_FCN (code) (tempreg, src));
25989 emit_insn (GEN_FCN (code) (dst, tempreg));
25990
25991 emit_move_insn (destptr,
25992 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
25993 emit_move_insn (srcptr,
25994 gen_rtx_PLUS (Pmode, copy_rtx (srcptr), adjust));
25995
25996 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
25997 piece_size);
25998 src = adjust_automodify_address_nv (src, move_mode, srcptr,
25999 piece_size);
26000 }
26001
26002 /* Update DST and SRC rtx. */
26003 *srcmem = src;
26004 return dst;
26005 }
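
/* A worked example of the mode selection above: for SIZE_TO_MOVE == 16 with
   SSE enabled, PIECE_SIZE starts at 16 and TImode is chosen; since that is
   wider than word_mode it is replaced by the matching vector mode (V2DImode
   on 64-bit targets, V4SImode on 32-bit ones), and the loop then emits a
   single 16-byte load into a temporary register followed by a 16-byte store,
   advancing both pointers by 16.  */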
26006
26007 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
26008 static void
26009 expand_movmem_epilogue (rtx destmem, rtx srcmem,
26010 rtx destptr, rtx srcptr, rtx count, int max_size)
26011 {
26012 rtx src, dest;
26013 if (CONST_INT_P (count))
26014 {
26015 HOST_WIDE_INT countval = INTVAL (count);
26016 HOST_WIDE_INT epilogue_size = countval % max_size;
26017 int i;
26018
26019 /* For now MAX_SIZE should be a power of 2. This assert could be
26020 relaxed, but it'll require a bit more complicated epilogue
26021 expanding. */
26022 gcc_assert ((max_size & (max_size - 1)) == 0);
26023 for (i = max_size; i >= 1; i >>= 1)
26024 {
26025 if (epilogue_size & i)
26026 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26027 }
26028 return;
26029 }
26030 if (max_size > 8)
26031 {
26032 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
26033 count, 1, OPTAB_DIRECT);
26034 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
26035 count, QImode, 1, 4, false);
26036 return;
26037 }
26038
26039 /* When single stringop instructions are available, we can cheaply increase
26040 the dest and src pointers. Otherwise we save code size by maintaining an
26041 offset (zero is readily available from the preceding rep operation) and
26042 using x86 addressing modes. */
26043 if (TARGET_SINGLE_STRINGOP)
26044 {
26045 if (max_size > 4)
26046 {
26047 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26048 src = change_address (srcmem, SImode, srcptr);
26049 dest = change_address (destmem, SImode, destptr);
26050 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26051 emit_label (label);
26052 LABEL_NUSES (label) = 1;
26053 }
26054 if (max_size > 2)
26055 {
26056 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26057 src = change_address (srcmem, HImode, srcptr);
26058 dest = change_address (destmem, HImode, destptr);
26059 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26060 emit_label (label);
26061 LABEL_NUSES (label) = 1;
26062 }
26063 if (max_size > 1)
26064 {
26065 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26066 src = change_address (srcmem, QImode, srcptr);
26067 dest = change_address (destmem, QImode, destptr);
26068 emit_insn (gen_strmov (destptr, dest, srcptr, src));
26069 emit_label (label);
26070 LABEL_NUSES (label) = 1;
26071 }
26072 }
26073 else
26074 {
26075 rtx offset = force_reg (Pmode, const0_rtx);
26076 rtx tmp;
26077
26078 if (max_size > 4)
26079 {
26080 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26081 src = change_address (srcmem, SImode, srcptr);
26082 dest = change_address (destmem, SImode, destptr);
26083 emit_move_insn (dest, src);
26084 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
26085 true, OPTAB_LIB_WIDEN);
26086 if (tmp != offset)
26087 emit_move_insn (offset, tmp);
26088 emit_label (label);
26089 LABEL_NUSES (label) = 1;
26090 }
26091 if (max_size > 2)
26092 {
26093 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26094 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26095 src = change_address (srcmem, HImode, tmp);
26096 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26097 dest = change_address (destmem, HImode, tmp);
26098 emit_move_insn (dest, src);
26099 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
26100 true, OPTAB_LIB_WIDEN);
26101 if (tmp != offset)
26102 emit_move_insn (offset, tmp);
26103 emit_label (label);
26104 LABEL_NUSES (label) = 1;
26105 }
26106 if (max_size > 1)
26107 {
26108 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26109 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
26110 src = change_address (srcmem, QImode, tmp);
26111 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
26112 dest = change_address (destmem, QImode, tmp);
26113 emit_move_insn (dest, src);
26114 emit_label (label);
26115 LABEL_NUSES (label) = 1;
26116 }
26117 }
26118 }
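
/* For example, with a constant COUNT of 23 and MAX_SIZE of 16 the epilogue
   size is 23 % 16 = 7, so the constant path above emits one 4-byte, one
   2-byte and one 1-byte move (the 16 and 8 bit-tests fail).  For a
   non-constant COUNT the code falls back to a byte copy loop when
   MAX_SIZE > 8, or to the runtime bit-test ladder of 4/2/1-byte moves
   otherwise.  */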
26119
26120 /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
26121 with value PROMOTED_VAL.
26122 Unlike emit_memmov above, there is no source operand to update.
26123 Return value is the updated DESTMEM. */
26124 static rtx
26125 emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
26126 HOST_WIDE_INT size_to_move)
26127 {
26128 rtx dst = destmem, adjust;
26129 enum insn_code code;
26130 machine_mode move_mode;
26131 int piece_size, i;
26132
26133 /* Find the widest mode in which we could perform moves.
26134 Start with the biggest power of 2 less than SIZE_TO_MOVE and half
26135 it until move of such size is supported. */
26136 move_mode = GET_MODE (promoted_val);
26137 if (move_mode == VOIDmode)
26138 move_mode = QImode;
26139 if (size_to_move < GET_MODE_SIZE (move_mode))
26140 {
26141 unsigned int move_bits = size_to_move * BITS_PER_UNIT;
26142 move_mode = int_mode_for_size (move_bits, 0).require ();
26143 promoted_val = gen_lowpart (move_mode, promoted_val);
26144 }
26145 piece_size = GET_MODE_SIZE (move_mode);
26146 code = optab_handler (mov_optab, move_mode);
26147 gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
26148
26149 dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
26150
26151 /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZE moves. */
26152 gcc_assert (size_to_move % piece_size == 0);
26153 adjust = GEN_INT (piece_size);
26154 for (i = 0; i < size_to_move; i += piece_size)
26155 {
26156 if (piece_size <= GET_MODE_SIZE (word_mode))
26157 {
26158 emit_insn (gen_strset (destptr, dst, promoted_val));
26159 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26160 piece_size);
26161 continue;
26162 }
26163
26164 emit_insn (GEN_FCN (code) (dst, promoted_val));
26165
26166 emit_move_insn (destptr,
26167 gen_rtx_PLUS (Pmode, copy_rtx (destptr), adjust));
26168
26169 dst = adjust_automodify_address_nv (dst, move_mode, destptr,
26170 piece_size);
26171 }
26172
26173 /* Update DST rtx. */
26174 return dst;
26175 }
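
/* A small worked example of the path above: with a DImode PROMOTED_VAL and
   SIZE_TO_MOVE == 4, the value is narrowed to SImode and a single 4-byte
   strset store is emitted; with a V2DImode PROMOTED_VAL and
   SIZE_TO_MOVE == 16, one 16-byte vector store plus an explicit pointer
   increment is emitted instead, since the piece is wider than word_mode.  */
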
26176 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26177 static void
26178 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
26179 rtx count, int max_size)
26180 {
26181 count =
26182 expand_simple_binop (counter_mode (count), AND, count,
26183 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
26184 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
26185 gen_lowpart (QImode, value), count, QImode,
26186 1, max_size / 2, true);
26187 }
26188
26189 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
26190 static void
26191 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
26192 rtx count, int max_size)
26193 {
26194 rtx dest;
26195
26196 if (CONST_INT_P (count))
26197 {
26198 HOST_WIDE_INT countval = INTVAL (count);
26199 HOST_WIDE_INT epilogue_size = countval % max_size;
26200 int i;
26201
26202 /* For now MAX_SIZE should be a power of 2. This assert could be
26203 relaxed, but it'll require a bit more complicated epilogue
26204 expanding. */
26205 gcc_assert ((max_size & (max_size - 1)) == 0);
26206 for (i = max_size; i >= 1; i >>= 1)
26207 {
26208 if (epilogue_size & i)
26209 {
26210 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26211 destmem = emit_memset (destmem, destptr, vec_value, i);
26212 else
26213 destmem = emit_memset (destmem, destptr, value, i);
26214 }
26215 }
26216 return;
26217 }
26218 if (max_size > 32)
26219 {
26220 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
26221 return;
26222 }
26223 if (max_size > 16)
26224 {
26225 rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
26226 if (TARGET_64BIT)
26227 {
26228 dest = change_address (destmem, DImode, destptr);
26229 emit_insn (gen_strset (destptr, dest, value));
26230 dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
26231 emit_insn (gen_strset (destptr, dest, value));
26232 }
26233 else
26234 {
26235 dest = change_address (destmem, SImode, destptr);
26236 emit_insn (gen_strset (destptr, dest, value));
26237 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26238 emit_insn (gen_strset (destptr, dest, value));
26239 dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
26240 emit_insn (gen_strset (destptr, dest, value));
26241 dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
26242 emit_insn (gen_strset (destptr, dest, value));
26243 }
26244 emit_label (label);
26245 LABEL_NUSES (label) = 1;
26246 }
26247 if (max_size > 8)
26248 {
26249 rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
26250 if (TARGET_64BIT)
26251 {
26252 dest = change_address (destmem, DImode, destptr);
26253 emit_insn (gen_strset (destptr, dest, value));
26254 }
26255 else
26256 {
26257 dest = change_address (destmem, SImode, destptr);
26258 emit_insn (gen_strset (destptr, dest, value));
26259 dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
26260 emit_insn (gen_strset (destptr, dest, value));
26261 }
26262 emit_label (label);
26263 LABEL_NUSES (label) = 1;
26264 }
26265 if (max_size > 4)
26266 {
26267 rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
26268 dest = change_address (destmem, SImode, destptr);
26269 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
26270 emit_label (label);
26271 LABEL_NUSES (label) = 1;
26272 }
26273 if (max_size > 2)
26274 {
26275 rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
26276 dest = change_address (destmem, HImode, destptr);
26277 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
26278 emit_label (label);
26279 LABEL_NUSES (label) = 1;
26280 }
26281 if (max_size > 1)
26282 {
26283 rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
26284 dest = change_address (destmem, QImode, destptr);
26285 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
26286 emit_label (label);
26287 LABEL_NUSES (label) = 1;
26288 }
26289 }
26290
26291 /* Depending on ISSETMEM, either copy enough bytes from SRCMEM to DESTMEM, or set
26292 enough bytes of DESTMEM, to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
26293 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
26294 ignored.
26295 Return value is the updated DESTMEM. */
26296 static rtx
26297 expand_set_or_movmem_prologue (rtx destmem, rtx srcmem,
26298 rtx destptr, rtx srcptr, rtx value,
26299 rtx vec_value, rtx count, int align,
26300 int desired_alignment, bool issetmem)
26301 {
26302 int i;
26303 for (i = 1; i < desired_alignment; i <<= 1)
26304 {
26305 if (align <= i)
26306 {
26307 rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
26308 if (issetmem)
26309 {
26310 if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
26311 destmem = emit_memset (destmem, destptr, vec_value, i);
26312 else
26313 destmem = emit_memset (destmem, destptr, value, i);
26314 }
26315 else
26316 destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
26317 ix86_adjust_counter (count, i);
26318 emit_label (label);
26319 LABEL_NUSES (label) = 1;
26320 set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
26321 }
26322 }
26323 return destmem;
26324 }
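
/* For illustration, with ALIGN == 1 and DESIRED_ALIGNMENT == 8 the loop above
   emits three runtime tests on the low bits of DESTPTR: if bit 0 is set,
   copy/set 1 byte; if bit 1 is set, copy/set 2 bytes; if bit 2 is set,
   copy/set 4 bytes; COUNT is decremented accordingly, so DESTPTR ends up
   8-byte aligned before the main loop.  */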
26325
26326 /* Test if COUNT&SIZE is nonzero and if so, expand a movmem
26327 or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
26328 and jump to DONE_LABEL. */
26329 static void
26330 expand_small_movmem_or_setmem (rtx destmem, rtx srcmem,
26331 rtx destptr, rtx srcptr,
26332 rtx value, rtx vec_value,
26333 rtx count, int size,
26334 rtx done_label, bool issetmem)
26335 {
26336 rtx_code_label *label = ix86_expand_aligntest (count, size, false);
26337 machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
26338 rtx modesize;
26339 int n;
26340
26341 /* If we do not have a vector value to copy, we must reduce the size. */
26342 if (issetmem)
26343 {
26344 if (!vec_value)
26345 {
26346 if (GET_MODE (value) == VOIDmode && size > 8)
26347 mode = Pmode;
26348 else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
26349 mode = GET_MODE (value);
26350 }
26351 else
26352 mode = GET_MODE (vec_value), value = vec_value;
26353 }
26354 else
26355 {
26356 /* Choose appropriate vector mode. */
26357 if (size >= 32)
26358 mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
26359 else if (size >= 16)
26360 mode = TARGET_SSE ? V16QImode : DImode;
26361 srcmem = change_address (srcmem, mode, srcptr);
26362 }
26363 destmem = change_address (destmem, mode, destptr);
26364 modesize = GEN_INT (GET_MODE_SIZE (mode));
26365 gcc_assert (GET_MODE_SIZE (mode) <= size);
26366 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26367 {
26368 if (issetmem)
26369 emit_move_insn (destmem, gen_lowpart (mode, value));
26370 else
26371 {
26372 emit_move_insn (destmem, srcmem);
26373 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26374 }
26375 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26376 }
26377
26378 destmem = offset_address (destmem, count, 1);
26379 destmem = offset_address (destmem, GEN_INT (-2 * size),
26380 GET_MODE_SIZE (mode));
26381 if (!issetmem)
26382 {
26383 srcmem = offset_address (srcmem, count, 1);
26384 srcmem = offset_address (srcmem, GEN_INT (-2 * size),
26385 GET_MODE_SIZE (mode));
26386 }
26387 for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
26388 {
26389 if (issetmem)
26390 emit_move_insn (destmem, gen_lowpart (mode, value));
26391 else
26392 {
26393 emit_move_insn (destmem, srcmem);
26394 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26395 }
26396 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26397 }
26398 emit_jump_insn (gen_jump (done_label));
26399 emit_barrier ();
26400
26401 emit_label (label);
26402 LABEL_NUSES (label) = 1;
26403 }
26404
26405 /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
26406 and get ready for the main memcpy loop by copying the initial DESIRED_ALIGN-ALIGN
26407 bytes and the last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way that we can
26408 proceed with a loop copying SIZE bytes at once. Do moves in MODE.
26409 DONE_LABEL is a label after the whole copying sequence. The label is created
26410 on demand if *DONE_LABEL is NULL.
26411 MIN_SIZE is the minimal size of the copied block. This value gets adjusted for
26412 new bounds after the initial copies.
26413
26414 DESTMEM/SRCMEM are memory expressions pointing to the copied block,
26415 DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
26416 we will dispatch to a library call for large blocks.
26417
26418 In pseudocode we do:
26419
26420 if (COUNT < SIZE)
26421 {
26422 Assume that SIZE is 4. Bigger sizes are handled analogously
26423 if (COUNT & 4)
26424 {
26425 copy 4 bytes from SRCPTR to DESTPTR
26426 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
26427 goto done_label
26428 }
26429 if (!COUNT)
26430 goto done_label;
26431 copy 1 byte from SRCPTR to DESTPTR
26432 if (COUNT & 2)
26433 {
26434 copy 2 bytes from SRCPTR to DESTPTR
26435 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
26436 }
26437 }
26438 else
26439 {
26440 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
26441 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
26442
26443 OLD_DESTPTR = DESTPTR;
26444 Align DESTPTR up to DESIRED_ALIGN
26445 SRCPTR += DESTPTR - OLD_DESTPTR
26446 COUNT -= DESTPTR - OLD_DESTPTR
26447 if (DYNAMIC_CHECK)
26448 Round COUNT down to multiple of SIZE
26449 << optional caller supplied zero size guard is here >>
26450 << optional caller supplied dynamic check is here >>
26451 << caller supplied main copy loop is here >>
26452 }
26453 done_label:
26454 */
26455 static void
26456 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
26457 rtx *destptr, rtx *srcptr,
26458 machine_mode mode,
26459 rtx value, rtx vec_value,
26460 rtx *count,
26461 rtx_code_label **done_label,
26462 int size,
26463 int desired_align,
26464 int align,
26465 unsigned HOST_WIDE_INT *min_size,
26466 bool dynamic_check,
26467 bool issetmem)
26468 {
26469 rtx_code_label *loop_label = NULL, *label;
26470 int n;
26471 rtx modesize;
26472 int prolog_size = 0;
26473 rtx mode_value;
26474
26475 /* Choose the proper value to copy. */
26476 if (issetmem && VECTOR_MODE_P (mode))
26477 mode_value = vec_value;
26478 else
26479 mode_value = value;
26480 gcc_assert (GET_MODE_SIZE (mode) <= size);
26481
26482 /* See if block is big or small, handle small blocks. */
26483 if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
26484 {
26485 int size2 = size;
26486 loop_label = gen_label_rtx ();
26487
26488 if (!*done_label)
26489 *done_label = gen_label_rtx ();
26490
26491 emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
26492 1, loop_label);
26493 size2 >>= 1;
26494
26495 /* Handle sizes > 3. */
26496 for (;size2 > 2; size2 >>= 1)
26497 expand_small_movmem_or_setmem (destmem, srcmem,
26498 *destptr, *srcptr,
26499 value, vec_value,
26500 *count,
26501 size2, *done_label, issetmem);
26502 /* Nothing to copy? Jump to DONE_LABEL if so. */
26503 emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
26504 1, *done_label);
26505
26506 /* Do a byte copy. */
26507 destmem = change_address (destmem, QImode, *destptr);
26508 if (issetmem)
26509 emit_move_insn (destmem, gen_lowpart (QImode, value));
26510 else
26511 {
26512 srcmem = change_address (srcmem, QImode, *srcptr);
26513 emit_move_insn (destmem, srcmem);
26514 }
26515
26516 /* Handle sizes 2 and 3. */
26517 label = ix86_expand_aligntest (*count, 2, false);
26518 destmem = change_address (destmem, HImode, *destptr);
26519 destmem = offset_address (destmem, *count, 1);
26520 destmem = offset_address (destmem, GEN_INT (-2), 2);
26521 if (issetmem)
26522 emit_move_insn (destmem, gen_lowpart (HImode, value));
26523 else
26524 {
26525 srcmem = change_address (srcmem, HImode, *srcptr);
26526 srcmem = offset_address (srcmem, *count, 1);
26527 srcmem = offset_address (srcmem, GEN_INT (-2), 2);
26528 emit_move_insn (destmem, srcmem);
26529 }
26530
26531 emit_label (label);
26532 LABEL_NUSES (label) = 1;
26533 emit_jump_insn (gen_jump (*done_label));
26534 emit_barrier ();
26535 }
26536 else
26537 gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
26538 || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
26539
26540 /* Start memcpy for COUNT >= SIZE. */
26541 if (loop_label)
26542 {
26543 emit_label (loop_label);
26544 LABEL_NUSES (loop_label) = 1;
26545 }
26546
26547 /* Copy first desired_align bytes. */
26548 if (!issetmem)
26549 srcmem = change_address (srcmem, mode, *srcptr);
26550 destmem = change_address (destmem, mode, *destptr);
26551 modesize = GEN_INT (GET_MODE_SIZE (mode));
26552 for (n = 0; prolog_size < desired_align - align; n++)
26553 {
26554 if (issetmem)
26555 emit_move_insn (destmem, mode_value);
26556 else
26557 {
26558 emit_move_insn (destmem, srcmem);
26559 srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
26560 }
26561 destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
26562 prolog_size += GET_MODE_SIZE (mode);
26563 }
26564
26565
26566 /* Copy last SIZE bytes. */
26567 destmem = offset_address (destmem, *count, 1);
26568 destmem = offset_address (destmem,
26569 GEN_INT (-size - prolog_size),
26570 1);
26571 if (issetmem)
26572 emit_move_insn (destmem, mode_value);
26573 else
26574 {
26575 srcmem = offset_address (srcmem, *count, 1);
26576 srcmem = offset_address (srcmem,
26577 GEN_INT (-size - prolog_size),
26578 1);
26579 emit_move_insn (destmem, srcmem);
26580 }
26581 for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
26582 {
26583 destmem = offset_address (destmem, modesize, 1);
26584 if (issetmem)
26585 emit_move_insn (destmem, mode_value);
26586 else
26587 {
26588 srcmem = offset_address (srcmem, modesize, 1);
26589 emit_move_insn (destmem, srcmem);
26590 }
26591 }
26592
26593 /* Align destination. */
26594 if (desired_align > 1 && desired_align > align)
26595 {
26596 rtx saveddest = *destptr;
26597
26598 gcc_assert (desired_align <= size);
26599 /* Align destptr up, place it in a new register. */
26600 *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
26601 GEN_INT (prolog_size),
26602 NULL_RTX, 1, OPTAB_DIRECT);
26603 if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
26604 REG_POINTER (*destptr) = 1;
26605 *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
26606 GEN_INT (-desired_align),
26607 *destptr, 1, OPTAB_DIRECT);
26608 /* See how many bytes we skipped. */
26609 saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
26610 *destptr,
26611 saveddest, 1, OPTAB_DIRECT);
26612 /* Adjust srcptr and count. */
26613 if (!issetmem)
26614 *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
26615 saveddest, *srcptr, 1, OPTAB_DIRECT);
26616 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26617 saveddest, *count, 1, OPTAB_DIRECT);
26618 /* We copied at most size + prolog_size. */
26619 if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
26620 *min_size
26621 = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
26622 else
26623 *min_size = 0;
26624
26625 /* Our loops always round down the block size, but for dispatch to
26626 the library we need the precise value. */
26627 if (dynamic_check)
26628 *count = expand_simple_binop (GET_MODE (*count), AND, *count,
26629 GEN_INT (-size), *count, 1, OPTAB_DIRECT);
26630 }
26631 else
26632 {
26633 gcc_assert (prolog_size == 0);
26634 /* Decrease count, so we won't end up copying last word twice. */
26635 if (!CONST_INT_P (*count))
26636 *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
26637 constm1_rtx, *count, 1, OPTAB_DIRECT);
26638 else
26639 *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
26640 (unsigned HOST_WIDE_INT)size));
26641 if (*min_size)
26642 *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
26643 }
26644 }
26645
26646
26647 /* This function is like the previous one, except here we know how many bytes
26648 need to be copied. That allows us to update alignment not only of DST, which
26649 is returned, but also of SRC, which is passed as a pointer for that
26650 reason. */
26651 static rtx
26652 expand_set_or_movmem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
26653 rtx srcreg, rtx value, rtx vec_value,
26654 int desired_align, int align_bytes,
26655 bool issetmem)
26656 {
26657 rtx src = NULL;
26658 rtx orig_dst = dst;
26659 rtx orig_src = NULL;
26660 int piece_size = 1;
26661 int copied_bytes = 0;
26662
26663 if (!issetmem)
26664 {
26665 gcc_assert (srcp != NULL);
26666 src = *srcp;
26667 orig_src = src;
26668 }
26669
26670 for (piece_size = 1;
26671 piece_size <= desired_align && copied_bytes < align_bytes;
26672 piece_size <<= 1)
26673 {
26674 if (align_bytes & piece_size)
26675 {
26676 if (issetmem)
26677 {
26678 if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
26679 dst = emit_memset (dst, destreg, vec_value, piece_size);
26680 else
26681 dst = emit_memset (dst, destreg, value, piece_size);
26682 }
26683 else
26684 dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
26685 copied_bytes += piece_size;
26686 }
26687 }
26688 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
26689 set_mem_align (dst, desired_align * BITS_PER_UNIT);
26690 if (MEM_SIZE_KNOWN_P (orig_dst))
26691 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
26692
26693 if (!issetmem)
26694 {
26695 int src_align_bytes = get_mem_align_offset (src, desired_align
26696 * BITS_PER_UNIT);
26697 if (src_align_bytes >= 0)
26698 src_align_bytes = desired_align - src_align_bytes;
26699 if (src_align_bytes >= 0)
26700 {
26701 unsigned int src_align;
26702 for (src_align = desired_align; src_align >= 2; src_align >>= 1)
26703 {
26704 if ((src_align_bytes & (src_align - 1))
26705 == (align_bytes & (src_align - 1)))
26706 break;
26707 }
26708 if (src_align > (unsigned int) desired_align)
26709 src_align = desired_align;
26710 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
26711 set_mem_align (src, src_align * BITS_PER_UNIT);
26712 }
26713 if (MEM_SIZE_KNOWN_P (orig_src))
26714 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
26715 *srcp = src;
26716 }
26717
26718 return dst;
26719 }
26720
26721 /* Return true if ALG can be used in current context.
26722 Assume we expand memset if MEMSET is true. */
26723 static bool
26724 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
26725 {
26726 if (alg == no_stringop)
26727 return false;
26728 if (alg == vector_loop)
26729 return TARGET_SSE || TARGET_AVX;
26730 /* Algorithms using the rep prefix want at least edi and ecx;
26731 additionally, memset wants eax and memcpy wants esi. Don't
26732 consider such algorithms if the user has appropriated those
26733 registers for their own purposes, or if we have a non-default
26734 address space, since some string insns cannot override the segment. */
26735 if (alg == rep_prefix_1_byte
26736 || alg == rep_prefix_4_byte
26737 || alg == rep_prefix_8_byte)
26738 {
26739 if (have_as)
26740 return false;
26741 if (fixed_regs[CX_REG]
26742 || fixed_regs[DI_REG]
26743 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
26744 return false;
26745 }
26746 return true;
26747 }
26748
26749 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
26750 static enum stringop_alg
26751 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
26752 unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
26753 bool memset, bool zero_memset, bool have_as,
26754 int *dynamic_check, bool *noalign, bool recur)
26755 {
26756 const struct stringop_algs *algs;
26757 bool optimize_for_speed;
26758 int max = 0;
26759 const struct processor_costs *cost;
26760 int i;
26761 bool any_alg_usable_p = false;
26762
26763 *noalign = false;
26764 *dynamic_check = -1;
26765
26766 /* Even if the string operation call is cold, we still might spend a lot
26767 of time processing large blocks. */
26768 if (optimize_function_for_size_p (cfun)
26769 || (optimize_insn_for_size_p ()
26770 && (max_size < 256
26771 || (expected_size != -1 && expected_size < 256))))
26772 optimize_for_speed = false;
26773 else
26774 optimize_for_speed = true;
26775
26776 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
26777 if (memset)
26778 algs = &cost->memset[TARGET_64BIT != 0];
26779 else
26780 algs = &cost->memcpy[TARGET_64BIT != 0];
26781
26782 /* See maximal size for user defined algorithm. */
26783 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26784 {
26785 enum stringop_alg candidate = algs->size[i].alg;
26786 bool usable = alg_usable_p (candidate, memset, have_as);
26787 any_alg_usable_p |= usable;
26788
26789 if (candidate != libcall && candidate && usable)
26790 max = algs->size[i].max;
26791 }
26792
26793 /* If the expected size is not known but the max size is small enough
26794 so that the inline version is a win, set the expected size into
26795 the range. */
26796 if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
26797 && expected_size == -1)
26798 expected_size = min_size / 2 + max_size / 2;
26799
26800 /* If user specified the algorithm, honor it if possible. */
26801 if (ix86_stringop_alg != no_stringop
26802 && alg_usable_p (ix86_stringop_alg, memset, have_as))
26803 return ix86_stringop_alg;
26804 /* rep; movq or rep; movl is the smallest variant. */
26805 else if (!optimize_for_speed)
26806 {
26807 *noalign = true;
26808 if (!count || (count & 3) || (memset && !zero_memset))
26809 return alg_usable_p (rep_prefix_1_byte, memset, have_as)
26810 ? rep_prefix_1_byte : loop_1_byte;
26811 else
26812 return alg_usable_p (rep_prefix_4_byte, memset, have_as)
26813 ? rep_prefix_4_byte : loop;
26814 }
26815 /* Very tiny blocks are best handled via the loop; REP is expensive to
26816 set up. */
26817 else if (expected_size != -1 && expected_size < 4)
26818 return loop_1_byte;
26819 else if (expected_size != -1)
26820 {
26821 enum stringop_alg alg = libcall;
26822 bool alg_noalign = false;
26823 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
26824 {
26825 /* We get here if the algorithms that were not libcall-based
26826 were rep-prefix based and we are unable to use rep prefixes
26827 based on global register usage. Break out of the loop and
26828 use the heuristic below. */
26829 if (algs->size[i].max == 0)
26830 break;
26831 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
26832 {
26833 enum stringop_alg candidate = algs->size[i].alg;
26834
26835 if (candidate != libcall
26836 && alg_usable_p (candidate, memset, have_as))
26837 {
26838 alg = candidate;
26839 alg_noalign = algs->size[i].noalign;
26840 }
26841 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
26842 last non-libcall inline algorithm. */
26843 if (TARGET_INLINE_ALL_STRINGOPS)
26844 {
26845 /* When the current size is best to be copied by a libcall,
26846 but we are still forced to inline, run the heuristic below
26847 that will pick code for medium sized blocks. */
26848 if (alg != libcall)
26849 {
26850 *noalign = alg_noalign;
26851 return alg;
26852 }
26853 else if (!any_alg_usable_p)
26854 break;
26855 }
26856 else if (alg_usable_p (candidate, memset, have_as))
26857 {
26858 *noalign = algs->size[i].noalign;
26859 return candidate;
26860 }
26861 }
26862 }
26863 }
26864 /* When asked to inline the call anyway, try to pick a meaningful choice.
26865 We look for the maximal size of a block that is faster to copy by hand and
26866 take blocks of at most that size, guessing that the average size will
26867 be roughly half of the block.
26868
26869 If this turns out to be bad, we might simply specify the preferred
26870 choice in ix86_costs. */
26871 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26872 && (algs->unknown_size == libcall
26873 || !alg_usable_p (algs->unknown_size, memset, have_as)))
26874 {
26875 enum stringop_alg alg;
26876 HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
26877
26878 /* If there aren't any usable algorithms or if recursing already,
26879 then recursing on smaller sizes or same size isn't going to
26880 find anything. Just return the simple byte-at-a-time copy loop. */
26881 if (!any_alg_usable_p || recur)
26882 {
26883 /* Pick something reasonable. */
26884 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
26885 *dynamic_check = 128;
26886 return loop_1_byte;
26887 }
26888 alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
26889 zero_memset, have_as, dynamic_check, noalign, true);
26890 gcc_assert (*dynamic_check == -1);
26891 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
26892 *dynamic_check = max;
26893 else
26894 gcc_assert (alg != libcall);
26895 return alg;
26896 }
26897 return (alg_usable_p (algs->unknown_size, memset, have_as)
26898 ? algs->unknown_size : libcall);
26899 }
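
/* As a hypothetical example of the table lookup above, assume a tuning entry
   of the form

     {libcall, {{24, loop, false}, {128, rep_prefix_4_byte, false},
                {-1, libcall, false}}}

   With a known expected size of 100 the second range matches and
   rep_prefix_4_byte is chosen (provided the rep registers are available);
   an expected size of 10 picks the plain loop instead, and sizes beyond 128
   hit the {-1, libcall} entry and end up as a library call.  */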
26900
26901 /* Decide on alignment. We know that the operand is already aligned to ALIGN
26902 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
26903 static int
26904 decide_alignment (int align,
26905 enum stringop_alg alg,
26906 int expected_size,
26907 machine_mode move_mode)
26908 {
26909 int desired_align = 0;
26910
26911 gcc_assert (alg != no_stringop);
26912
26913 if (alg == libcall)
26914 return 0;
26915 if (move_mode == VOIDmode)
26916 return 0;
26917
26918 desired_align = GET_MODE_SIZE (move_mode);
26919 /* PentiumPro has special logic triggering for 8-byte aligned blocks,
26920 copying a whole cache line at once. */
26921 if (TARGET_PENTIUMPRO
26922 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
26923 desired_align = 8;
26924
26925 if (optimize_size)
26926 desired_align = 1;
26927 if (desired_align < align)
26928 desired_align = align;
26929 if (expected_size != -1 && expected_size < 4)
26930 desired_align = align;
26931
26932 return desired_align;
26933 }
26934
26935
26936 /* Helper function for memset. For a QImode value 0xXY produce
26937 0xXY...XY of the width specified by MODE. This is essentially
26938 VAL * 0x0101...01 of the same width, but we can do slightly better than
26939 synth_mult by unwinding the sequence by hand on CPUs with
26940 a slow multiply. */
26941 static rtx
26942 promote_duplicated_reg (machine_mode mode, rtx val)
26943 {
26944 machine_mode valmode = GET_MODE (val);
26945 rtx tmp;
26946 int nops = mode == DImode ? 3 : 2;
26947
26948 gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
26949 if (val == const0_rtx)
26950 return copy_to_mode_reg (mode, CONST0_RTX (mode));
26951 if (CONST_INT_P (val))
26952 {
26953 HOST_WIDE_INT v = INTVAL (val) & 255;
26954
26955 v |= v << 8;
26956 v |= v << 16;
26957 if (mode == DImode)
26958 v |= (v << 16) << 16;
26959 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
26960 }
26961
26962 if (valmode == VOIDmode)
26963 valmode = QImode;
26964 if (valmode != QImode)
26965 val = gen_lowpart (QImode, val);
26966 if (mode == QImode)
26967 return val;
26968 if (!TARGET_PARTIAL_REG_STALL)
26969 nops--;
26970 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
26971 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
26972 <= (ix86_cost->shift_const + ix86_cost->add) * nops
26973 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
26974 {
26975 rtx reg = convert_modes (mode, QImode, val, true);
26976 tmp = promote_duplicated_reg (mode, const1_rtx);
26977 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
26978 OPTAB_DIRECT);
26979 }
26980 else
26981 {
26982 rtx reg = convert_modes (mode, QImode, val, true);
26983
26984 if (!TARGET_PARTIAL_REG_STALL)
26985 if (mode == SImode)
26986 emit_insn (gen_insvsi_1 (reg, reg));
26987 else
26988 emit_insn (gen_insvdi_1 (reg, reg));
26989 else
26990 {
26991 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
26992 NULL, 1, OPTAB_DIRECT);
26993 reg =
26994 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
26995 }
26996 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
26997 NULL, 1, OPTAB_DIRECT);
26998 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
26999 if (mode == SImode)
27000 return reg;
27001 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
27002 NULL, 1, OPTAB_DIRECT);
27003 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
27004 return reg;
27005 }
27006 }
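
/* A worked example of the constant path above: for VAL == 0xAB and
   MODE == DImode the value is built up as

     v = 0xAB;
     v |= v << 8;               /* 0xABAB  */
     v |= v << 16;              /* 0xABABABAB  */
     v |= (v << 16) << 16;      /* 0xABABABABABABABAB  */

   For a non-constant VAL the same broadcast is produced either by multiplying
   by a promoted 0x01...01 constant or by a shift/ior (or insv) sequence,
   whichever the cost model above prefers.  */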
27007
27008 /* Duplicate value VAL using promote_duplicated_reg into the maximal size that will
27009 be needed by the main loop copying SIZE_NEEDED chunks and by the prologue getting
27010 alignment from ALIGN to DESIRED_ALIGN. */
27011 static rtx
27012 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
27013 int align)
27014 {
27015 rtx promoted_val;
27016
27017 if (TARGET_64BIT
27018 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
27019 promoted_val = promote_duplicated_reg (DImode, val);
27020 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
27021 promoted_val = promote_duplicated_reg (SImode, val);
27022 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
27023 promoted_val = promote_duplicated_reg (HImode, val);
27024 else
27025 promoted_val = val;
27026
27027 return promoted_val;
27028 }
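
/* A sketch of the size selection above: on a 64-bit target with
   SIZE_NEEDED == 8 the value is broadcast to DImode; with SIZE_NEEDED == 4
   and no extra alignment work it is broadcast only to SImode; when a single
   byte suffices and no alignment prologue is needed, VAL is returned
   unchanged.  */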
27029
27030 /* Expand string move (memcpy) or store (memset) operation. Use i386 string
27031 operations when profitable. The code depends upon architecture, block size
27032 and alignment, but always has one of the following overall structures:
27033
27034 Aligned move sequence:
27035
27036 1) Prologue guard: Conditional that jumps up to the epilogue for small
27037 blocks that can be handled by the epilogue alone. This is faster
27038 but also needed for correctness, since the prologue assumes the block
27039 is larger than the desired alignment.
27040
27041 Optional dynamic check for size and libcall for large
27042 blocks is emitted here too, with -minline-stringops-dynamically.
27043
27044 2) Prologue: copy the first few bytes in order to get the destination
27045 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
27046 than DESIRED_ALIGN; up to DESIRED_ALIGN - ALIGN bytes can be
27047 copied. We emit either a jump tree of power-of-two sized
27048 blocks, or a byte loop.
27049
27050 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27051 with specified algorithm.
27052
27053 4) Epilogue: code copying the tail of the block that is too small to be
27054 handled by the main body (or up to the size guarded by the prologue guard).
27055
27056 Misaligned move sequence:
27057
27058 1) Misaligned move prologue/epilogue containing:
27059 a) Prologue handling small memory blocks and jumping to done_label
27060 (skipped if blocks are known to be large enough)
27061 b) Single move copying the first DESIRED_ALIGN-ALIGN bytes if alignment
27062 is needed, done by a single possibly misaligned move
27063 (skipped if alignment is not needed)
27064 c) Copy of the last SIZE_NEEDED bytes by possibly misaligned moves
27065
27066 2) Zero size guard dispatching to done_label, if needed
27067
27068 3) Dispatch to a library call, if needed
27069
27070 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
27071 with the specified algorithm. */
27072 bool
27073 ix86_expand_set_or_movmem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
27074 rtx align_exp, rtx expected_align_exp,
27075 rtx expected_size_exp, rtx min_size_exp,
27076 rtx max_size_exp, rtx probable_max_size_exp,
27077 bool issetmem)
27078 {
27079 rtx destreg;
27080 rtx srcreg = NULL;
27081 rtx_code_label *label = NULL;
27082 rtx tmp;
27083 rtx_code_label *jump_around_label = NULL;
27084 HOST_WIDE_INT align = 1;
27085 unsigned HOST_WIDE_INT count = 0;
27086 HOST_WIDE_INT expected_size = -1;
27087 int size_needed = 0, epilogue_size_needed;
27088 int desired_align = 0, align_bytes = 0;
27089 enum stringop_alg alg;
27090 rtx promoted_val = NULL;
27091 rtx vec_promoted_val = NULL;
27092 bool force_loopy_epilogue = false;
27093 int dynamic_check;
27094 bool need_zero_guard = false;
27095 bool noalign;
27096 machine_mode move_mode = VOIDmode;
27097 machine_mode wider_mode;
27098 int unroll_factor = 1;
27099 /* TODO: Once value ranges are available, fill in proper data. */
27100 unsigned HOST_WIDE_INT min_size = 0;
27101 unsigned HOST_WIDE_INT max_size = -1;
27102 unsigned HOST_WIDE_INT probable_max_size = -1;
27103 bool misaligned_prologue_used = false;
27104 bool have_as;
27105
27106 if (CONST_INT_P (align_exp))
27107 align = INTVAL (align_exp);
27108 /* i386 can do misaligned access at a reasonably increased cost. */
27109 if (CONST_INT_P (expected_align_exp)
27110 && INTVAL (expected_align_exp) > align)
27111 align = INTVAL (expected_align_exp);
27112 /* ALIGN is the minimum of destination and source alignment, but we care here
27113 just about destination alignment. */
27114 else if (!issetmem
27115 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
27116 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
27117
27118 if (CONST_INT_P (count_exp))
27119 {
27120 min_size = max_size = probable_max_size = count = expected_size
27121 = INTVAL (count_exp);
27122 /* When COUNT is 0, there is nothing to do. */
27123 if (!count)
27124 return true;
27125 }
27126 else
27127 {
27128 if (min_size_exp)
27129 min_size = INTVAL (min_size_exp);
27130 if (max_size_exp)
27131 max_size = INTVAL (max_size_exp);
27132 if (probable_max_size_exp)
27133 probable_max_size = INTVAL (probable_max_size_exp);
27134 if (CONST_INT_P (expected_size_exp))
27135 expected_size = INTVAL (expected_size_exp);
27136 }
27137
27138 /* Make sure we don't need to care about overflow later on. */
27139 if (count > (HOST_WIDE_INT_1U << 30))
27140 return false;
27141
27142 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
27143 if (!issetmem)
27144 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
27145
27146 /* Step 0: Decide on preferred algorithm, desired alignment and
27147 size of chunks to be copied by main loop. */
27148 alg = decide_alg (count, expected_size, min_size, probable_max_size,
27149 issetmem,
27150 issetmem && val_exp == const0_rtx, have_as,
27151 &dynamic_check, &noalign, false);
27152 if (alg == libcall)
27153 return false;
27154 gcc_assert (alg != no_stringop);
27155
27156 /* For now the vector version of memset is generated only for memory zeroing, as
27157 creating the promoted vector value is very cheap in this case. */
27158 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
27159 alg = unrolled_loop;
27160
27161 if (!count)
27162 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
27163 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
27164 if (!issetmem)
27165 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
27166
27167 unroll_factor = 1;
27168 move_mode = word_mode;
27169 switch (alg)
27170 {
27171 case libcall:
27172 case no_stringop:
27173 case last_alg:
27174 gcc_unreachable ();
27175 case loop_1_byte:
27176 need_zero_guard = true;
27177 move_mode = QImode;
27178 break;
27179 case loop:
27180 need_zero_guard = true;
27181 break;
27182 case unrolled_loop:
27183 need_zero_guard = true;
27184 unroll_factor = (TARGET_64BIT ? 4 : 2);
27185 break;
27186 case vector_loop:
27187 need_zero_guard = true;
27188 unroll_factor = 4;
27189 /* Find the widest supported mode. */
27190 move_mode = word_mode;
27191 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
27192 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
27193 move_mode = wider_mode;
27194
27195 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (move_mode) > 128)
27196 move_mode = TImode;
27197
27198 /* Find the corresponding vector mode with the same size as MOVE_MODE.
27199 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
27200 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
27201 {
27202 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
27203 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
27204 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
27205 move_mode = word_mode;
27206 }
27207 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
27208 break;
27209 case rep_prefix_8_byte:
27210 move_mode = DImode;
27211 break;
27212 case rep_prefix_4_byte:
27213 move_mode = SImode;
27214 break;
27215 case rep_prefix_1_byte:
27216 move_mode = QImode;
27217 break;
27218 }
27219 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
27220 epilogue_size_needed = size_needed;
27221
27222 /* If we are going to call any library calls conditionally, make sure any
27223 pending stack adjustment happen before the first conditional branch,
27224 otherwise they will be emitted before the library call only and won't
27225 happen from the other branches. */
27226 if (dynamic_check != -1)
27227 do_pending_stack_adjust ();
27228
27229 desired_align = decide_alignment (align, alg, expected_size, move_mode);
27230 if (!TARGET_ALIGN_STRINGOPS || noalign)
27231 align = desired_align;
27232
27233 /* Step 1: Prologue guard. */
27234
27235 /* Alignment code needs count to be in register. */
27236 if (CONST_INT_P (count_exp) && desired_align > align)
27237 {
27238 if (INTVAL (count_exp) > desired_align
27239 && INTVAL (count_exp) > size_needed)
27240 {
27241 align_bytes
27242 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
27243 if (align_bytes <= 0)
27244 align_bytes = 0;
27245 else
27246 align_bytes = desired_align - align_bytes;
27247 }
27248 if (align_bytes == 0)
27249 count_exp = force_reg (counter_mode (count_exp), count_exp);
27250 }
27251 gcc_assert (desired_align >= 1 && align >= 1);
27252
27253 /* Misaligned move sequences handle both prologue and epilogue at once.
27254 Default code generation results in smaller code for large alignments
27255 and also avoids redundant work when sizes are known precisely. */
27256 misaligned_prologue_used
27257 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
27258 && MAX (desired_align, epilogue_size_needed) <= 32
27259 && desired_align <= epilogue_size_needed
27260 && ((desired_align > align && !align_bytes)
27261 || (!count && epilogue_size_needed > 1)));
27262
27263 /* Do the cheap promotion to allow better CSE across the
27264 main loop and epilogue (i.e. one load of the big constant in
27265 front of all the code).
27266 For now the misaligned move sequences do not have a fast path
27267 without broadcasting. */
27268 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
27269 {
27270 if (alg == vector_loop)
27271 {
27272 gcc_assert (val_exp == const0_rtx);
27273 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
27274 promoted_val = promote_duplicated_reg_to_size (val_exp,
27275 GET_MODE_SIZE (word_mode),
27276 desired_align, align);
27277 }
27278 else
27279 {
27280 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27281 desired_align, align);
27282 }
27283 }
27284 /* Misaligned move sequences handle both prologues and epilogues at once.
27285 Default code generation results in smaller code for large alignments and
27286 also avoids redundant work when sizes are known precisely. */
27287 if (misaligned_prologue_used)
27288 {
27289 /* Misaligned move prologue handled small blocks by itself. */
27290 expand_set_or_movmem_prologue_epilogue_by_misaligned_moves
27291 (dst, src, &destreg, &srcreg,
27292 move_mode, promoted_val, vec_promoted_val,
27293 &count_exp,
27294 &jump_around_label,
27295 desired_align < align
27296 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
27297 desired_align, align, &min_size, dynamic_check, issetmem);
27298 if (!issetmem)
27299 src = change_address (src, BLKmode, srcreg);
27300 dst = change_address (dst, BLKmode, destreg);
27301 set_mem_align (dst, desired_align * BITS_PER_UNIT);
27302 epilogue_size_needed = 0;
27303 if (need_zero_guard
27304 && min_size < (unsigned HOST_WIDE_INT) size_needed)
27305 {
27306 /* It is possible that we copied enough so the main loop will not
27307 execute. */
27308 gcc_assert (size_needed > 1);
27309 if (jump_around_label == NULL_RTX)
27310 jump_around_label = gen_label_rtx ();
27311 emit_cmp_and_jump_insns (count_exp,
27312 GEN_INT (size_needed),
27313 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
27314 if (expected_size == -1
27315 || expected_size < (desired_align - align) / 2 + size_needed)
27316 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27317 else
27318 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27319 }
27320 }
27321 /* Ensure that alignment prologue won't copy past end of block. */
27322 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
27323 {
27324 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
27325 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
27326 Make sure it is power of 2. */
27327 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
27328
27329 /* To improve performance of small blocks, we jump around the VAL
27330 promoting code. This means that if the promoted VAL is not constant,
27331 we might not use it in the epilogue and have to use the byte
27332 loop variant. */
27333 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
27334 force_loopy_epilogue = true;
27335 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27336 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27337 {
27338 /* If main algorithm works on QImode, no epilogue is needed.
27339 For small sizes just don't align anything. */
27340 if (size_needed == 1)
27341 desired_align = align;
27342 else
27343 goto epilogue;
27344 }
27345 else if (!count
27346 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
27347 {
27348 label = gen_label_rtx ();
27349 emit_cmp_and_jump_insns (count_exp,
27350 GEN_INT (epilogue_size_needed),
27351 LTU, 0, counter_mode (count_exp), 1, label);
27352 if (expected_size == -1 || expected_size < epilogue_size_needed)
27353 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27354 else
27355 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27356 }
27357 }
27358
27359 /* Emit code to decide on runtime whether library call or inline should be
27360 used. */
27361 if (dynamic_check != -1)
27362 {
27363 if (!issetmem && CONST_INT_P (count_exp))
27364 {
27365 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
27366 {
27367 emit_block_copy_via_libcall (dst, src, count_exp);
27368 count_exp = const0_rtx;
27369 goto epilogue;
27370 }
27371 }
27372 else
27373 {
27374 rtx_code_label *hot_label = gen_label_rtx ();
27375 if (jump_around_label == NULL_RTX)
27376 jump_around_label = gen_label_rtx ();
27377 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
27378 LEU, 0, counter_mode (count_exp),
27379 1, hot_label);
27380 predict_jump (REG_BR_PROB_BASE * 90 / 100);
27381 if (issetmem)
27382 set_storage_via_libcall (dst, count_exp, val_exp);
27383 else
27384 emit_block_copy_via_libcall (dst, src, count_exp);
27385 emit_jump (jump_around_label);
27386 emit_label (hot_label);
27387 }
27388 }
27389
27390 /* Step 2: Alignment prologue. */
27391 /* Do the expensive promotion once we branched off the small blocks. */
27392 if (issetmem && !promoted_val)
27393 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
27394 desired_align, align);
27395
27396 if (desired_align > align && !misaligned_prologue_used)
27397 {
27398 if (align_bytes == 0)
27399 {
27400 /* Except for the first move in the prologue, we no longer know
27401 the constant offset in aliasing info. It doesn't seem worth
27402 the pain to maintain it for the first move, so throw away
27403 the info early. */
27404 dst = change_address (dst, BLKmode, destreg);
27405 if (!issetmem)
27406 src = change_address (src, BLKmode, srcreg);
27407 dst = expand_set_or_movmem_prologue (dst, src, destreg, srcreg,
27408 promoted_val, vec_promoted_val,
27409 count_exp, align, desired_align,
27410 issetmem);
27411 /* At most desired_align - align bytes are copied. */
27412 if (min_size < (unsigned)(desired_align - align))
27413 min_size = 0;
27414 else
27415 min_size -= desired_align - align;
27416 }
27417 else
27418 {
27419 /* If we know how many bytes need to be stored before dst is
27420 sufficiently aligned, maintain aliasing info accurately. */
27421 dst = expand_set_or_movmem_constant_prologue (dst, &src, destreg,
27422 srcreg,
27423 promoted_val,
27424 vec_promoted_val,
27425 desired_align,
27426 align_bytes,
27427 issetmem);
27428
27429 count_exp = plus_constant (counter_mode (count_exp),
27430 count_exp, -align_bytes);
27431 count -= align_bytes;
27432 min_size -= align_bytes;
27433 max_size -= align_bytes;
27434 }
27435 if (need_zero_guard
27436 && min_size < (unsigned HOST_WIDE_INT) size_needed
27437 && (count < (unsigned HOST_WIDE_INT) size_needed
27438 || (align_bytes == 0
27439 && count < ((unsigned HOST_WIDE_INT) size_needed
27440 + desired_align - align))))
27441 {
27442 /* It is possible that we copied enough so the main loop will not
27443 execute. */
27444 gcc_assert (size_needed > 1);
27445 if (label == NULL_RTX)
27446 label = gen_label_rtx ();
27447 emit_cmp_and_jump_insns (count_exp,
27448 GEN_INT (size_needed),
27449 LTU, 0, counter_mode (count_exp), 1, label);
27450 if (expected_size == -1
27451 || expected_size < (desired_align - align) / 2 + size_needed)
27452 predict_jump (REG_BR_PROB_BASE * 20 / 100);
27453 else
27454 predict_jump (REG_BR_PROB_BASE * 60 / 100);
27455 }
27456 }
27457 if (label && size_needed == 1)
27458 {
27459 emit_label (label);
27460 LABEL_NUSES (label) = 1;
27461 label = NULL;
27462 epilogue_size_needed = 1;
27463 if (issetmem)
27464 promoted_val = val_exp;
27465 }
27466 else if (label == NULL_RTX && !misaligned_prologue_used)
27467 epilogue_size_needed = size_needed;
27468
27469 /* Step 3: Main loop. */
27470
27471 switch (alg)
27472 {
27473 case libcall:
27474 case no_stringop:
27475 case last_alg:
27476 gcc_unreachable ();
27477 case loop_1_byte:
27478 case loop:
27479 case unrolled_loop:
27480 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, promoted_val,
27481 count_exp, move_mode, unroll_factor,
27482 expected_size, issetmem);
27483 break;
27484 case vector_loop:
27485 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg,
27486 vec_promoted_val, count_exp, move_mode,
27487 unroll_factor, expected_size, issetmem);
27488 break;
27489 case rep_prefix_8_byte:
27490 case rep_prefix_4_byte:
27491 case rep_prefix_1_byte:
27492 expand_set_or_movmem_via_rep (dst, src, destreg, srcreg, promoted_val,
27493 val_exp, count_exp, move_mode, issetmem);
27494 break;
27495 }
27496 /* Adjust properly the offset of src and dest memory for aliasing. */
27497 if (CONST_INT_P (count_exp))
27498 {
27499 if (!issetmem)
27500 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
27501 (count / size_needed) * size_needed);
27502 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
27503 (count / size_needed) * size_needed);
27504 }
27505 else
27506 {
27507 if (!issetmem)
27508 src = change_address (src, BLKmode, srcreg);
27509 dst = change_address (dst, BLKmode, destreg);
27510 }
27511
27512 /* Step 4: Epilogue to copy the remaining bytes. */
27513 epilogue:
27514 if (label)
27515 {
27516 /* When the main loop is done, COUNT_EXP might hold original count,
27517 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
27518 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
27519 bytes. Compensate if needed. */
27520
27521 if (size_needed < epilogue_size_needed)
27522 {
27523 tmp =
27524 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
27525 GEN_INT (size_needed - 1), count_exp, 1,
27526 OPTAB_DIRECT);
27527 if (tmp != count_exp)
27528 emit_move_insn (count_exp, tmp);
27529 }
27530 emit_label (label);
27531 LABEL_NUSES (label) = 1;
27532 }
27533
27534 if (count_exp != const0_rtx && epilogue_size_needed > 1)
27535 {
27536 if (force_loopy_epilogue)
27537 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
27538 epilogue_size_needed);
27539 else
27540 {
27541 if (issetmem)
27542 expand_setmem_epilogue (dst, destreg, promoted_val,
27543 vec_promoted_val, count_exp,
27544 epilogue_size_needed);
27545 else
27546 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
27547 epilogue_size_needed);
27548 }
27549 }
27550 if (jump_around_label)
27551 emit_label (jump_around_label);
27552 return true;
27553 }
27554
27555
27556 /* Expand the appropriate insns for doing strlen if not just doing
27557 repnz; scasb
27558
27559 out = result, initialized with the start address
27560 align_rtx = alignment of the address.
27561 scratch = scratch register, initialized with the start address when
27562 not aligned, otherwise undefined
27563
27564 This is just the body. It needs the initializations mentioned above and
27565 some address computing at the end. These things are done in i386.md. */
27566
27567 static void
27568 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
27569 {
27570 int align;
27571 rtx tmp;
27572 rtx_code_label *align_2_label = NULL;
27573 rtx_code_label *align_3_label = NULL;
27574 rtx_code_label *align_4_label = gen_label_rtx ();
27575 rtx_code_label *end_0_label = gen_label_rtx ();
27576 rtx mem;
27577 rtx tmpreg = gen_reg_rtx (SImode);
27578 rtx scratch = gen_reg_rtx (SImode);
27579 rtx cmp;
27580
27581 align = 0;
27582 if (CONST_INT_P (align_rtx))
27583 align = INTVAL (align_rtx);
27584
27585 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
27586
27587 /* Is there a known alignment and is it less than 4? */
27588 if (align < 4)
27589 {
27590 rtx scratch1 = gen_reg_rtx (Pmode);
27591 emit_move_insn (scratch1, out);
27592 /* Is there a known alignment and is it not 2? */
27593 if (align != 2)
27594 {
27595 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
27596 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
27597
27598 /* Leave just the 3 lower bits. */
27599 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
27600 NULL_RTX, 0, OPTAB_WIDEN);
27601
27602 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27603 Pmode, 1, align_4_label);
27604 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
27605 Pmode, 1, align_2_label);
27606 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
27607 Pmode, 1, align_3_label);
27608 }
27609 else
27610 {
27611 /* Since the alignment is 2, we have to check 2 or 0 bytes;
27612 check whether it is aligned to a 4-byte boundary. */
27613
27614 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
27615 NULL_RTX, 0, OPTAB_WIDEN);
27616
27617 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
27618 Pmode, 1, align_4_label);
27619 }
27620
27621 mem = change_address (src, QImode, out);
27622
27623 /* Now compare the bytes. */
27624
27625 /* Compare the first n unaligned bytes one byte at a time. */
27626 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
27627 QImode, 1, end_0_label);
27628
27629 /* Increment the address. */
27630 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27631
27632 /* Not needed with an alignment of 2 */
27633 if (align != 2)
27634 {
27635 emit_label (align_2_label);
27636
27637 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27638 end_0_label);
27639
27640 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27641
27642 emit_label (align_3_label);
27643 }
27644
27645 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
27646 end_0_label);
27647
27648 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
27649 }
27650
27651 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
27652 align this loop; doing so only bloats the code and does not help to
27653 speed it up. */
27654 emit_label (align_4_label);
27655
27656 mem = change_address (src, SImode, out);
27657 emit_move_insn (scratch, mem);
27658 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
27659
27660 /* This formula yields a nonzero result iff one of the bytes is zero.
27661 This saves three branches inside the loop and many cycles. */
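/* Concretely, the insns below compute
   (X - 0x01010101) & ~X & 0x80808080 for the loaded word X:
   a zero byte in X borrows in the subtraction while the matching
   byte of ~X keeps bit 7 set, so the result is nonzero exactly
   when X contains a zero byte. */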
27662
27663 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
27664 emit_insn (gen_one_cmplsi2 (scratch, scratch));
27665 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
27666 emit_insn (gen_andsi3 (tmpreg, tmpreg,
27667 gen_int_mode (0x80808080, SImode)));
27668 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
27669 align_4_label);
27670
27671 if (TARGET_CMOVE)
27672 {
27673 rtx reg = gen_reg_rtx (SImode);
27674 rtx reg2 = gen_reg_rtx (Pmode);
27675 emit_move_insn (reg, tmpreg);
27676 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
27677
27678 /* If zero is not in the first two bytes, move two bytes forward. */
27679 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27680 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27681 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27682 emit_insn (gen_rtx_SET (tmpreg,
27683 gen_rtx_IF_THEN_ELSE (SImode, tmp,
27684 reg,
27685 tmpreg)));
27686 /* Emit lea manually to avoid clobbering of flags. */
27687 emit_insn (gen_rtx_SET (reg2, gen_rtx_PLUS (Pmode, out, const2_rtx)));
27688
27689 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27690 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
27691 emit_insn (gen_rtx_SET (out,
27692 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
27693 reg2,
27694 out)));
27695 }
27696 else
27697 {
27698 rtx_code_label *end_2_label = gen_label_rtx ();
27699 /* Is zero in the first two bytes? */
27700
27701 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
27702 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
27703 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
27704 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
27705 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
27706 pc_rtx);
27707 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
27708 JUMP_LABEL (tmp) = end_2_label;
27709
27710 /* Not in the first two. Move two bytes forward. */
27711 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
27712 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
27713
27714 emit_label (end_2_label);
27715
27716 }
27717
27718 /* Avoid branch in fixing the byte. */
27719 tmpreg = gen_lowpart (QImode, tmpreg);
27720 emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
27721 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
27722 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
27723 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
27724
27725 emit_label (end_0_label);
27726 }
27727
27728 /* Expand strlen. */
27729
27730 bool
27731 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
27732 {
27733 rtx addr, scratch1, scratch2, scratch3, scratch4;
27734
27735 /* The generic case of the strlen expander is long. Avoid expanding
27736 it unless TARGET_INLINE_ALL_STRINGOPS. */
27737
27738 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27739 && !TARGET_INLINE_ALL_STRINGOPS
27740 && !optimize_insn_for_size_p ()
27741 && (!CONST_INT_P (align) || INTVAL (align) < 4))
27742 return false;
27743
27744 addr = force_reg (Pmode, XEXP (src, 0));
27745 scratch1 = gen_reg_rtx (Pmode);
27746
27747 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
27748 && !optimize_insn_for_size_p ())
27749 {
27750 /* It seems that some optimizer does not combine a call like
27751 foo(strlen(bar), strlen(bar));
27752 when the move and the subtraction are done here. It does calculate
27753 the length just once when these instructions are done inside
27754 output_strlen_unroll(). But since &bar[strlen(bar)] is often used
27755 and this uses one fewer register for the lifetime of
27756 output_strlen_unroll(), this is better. */
27757
27758 emit_move_insn (out, addr);
27759
27760 ix86_expand_strlensi_unroll_1 (out, src, align);
27761
27762 /* strlensi_unroll_1 returns the address of the zero at the end of
27763 the string, like memchr(), so compute the length by subtracting
27764 the start address. */
27765 emit_insn (ix86_gen_sub3 (out, out, addr));
27766 }
27767 else
27768 {
27769 rtx unspec;
27770
27771 /* Can't use this if the user has appropriated eax, ecx, or edi. */
27772 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
27773 return false;
27774 /* Can't use this for non-default address spaces. */
27775 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src)))
27776 return false;
27777
27778 scratch2 = gen_reg_rtx (Pmode);
27779 scratch3 = gen_reg_rtx (Pmode);
27780 scratch4 = force_reg (Pmode, constm1_rtx);
27781
27782 emit_move_insn (scratch3, addr);
27783 eoschar = force_reg (QImode, eoschar);
27784
27785 src = replace_equiv_address_nv (src, scratch3);
27786
27787 /* If .md starts supporting :P, this can be done in .md. */
27788 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
27789 scratch4), UNSPEC_SCAS);
27790 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
27791 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
27792 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
27793 }
27794 return true;
27795 }
27796
27797 /* For a given symbol (function), construct code to compute the address of
27798 its PLT entry in the large x86-64 PIC model. */
27799 static rtx
27800 construct_plt_address (rtx symbol)
27801 {
27802 rtx tmp, unspec;
27803
27804 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
27805 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
27806 gcc_assert (Pmode == DImode);
27807
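/* Emit: tmp = symbol@PLTOFF; tmp += PIC register.  In the large PIC
   model, symbol@PLTOFF (UNSPEC_PLTOFF below) is the 64-bit offset of
   the symbol's PLT entry from the GOT base, so adding the GOT base
   register yields the PLT entry address. */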
27808 tmp = gen_reg_rtx (Pmode);
27809 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
27810
27811 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
27812 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
27813 return tmp;
27814 }
27815
27816 rtx
27817 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
27818 rtx callarg2,
27819 rtx pop, bool sibcall)
27820 {
27821 rtx vec[3];
27822 rtx use = NULL, call;
27823 unsigned int vec_len = 0;
27824 tree fndecl;
27825
27826 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27827 {
27828 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
27829 if (fndecl
27830 && (lookup_attribute ("interrupt",
27831 TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
27832 error ("interrupt service routine can't be called directly");
27833 }
27834 else
27835 fndecl = NULL_TREE;
27836
27837 if (pop == const0_rtx)
27838 pop = NULL;
27839 gcc_assert (!TARGET_64BIT || !pop);
27840
27841 if (TARGET_MACHO && !TARGET_64BIT)
27842 {
27843 #if TARGET_MACHO
27844 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
27845 fnaddr = machopic_indirect_call_target (fnaddr);
27846 #endif
27847 }
27848 else
27849 {
27850 /* Static functions and indirect calls don't need the pic register. Also,
27851 check if the PLT was explicitly avoided via -fno-plt or the "noplt"
27852 attribute, making it an indirect call. */
27853 rtx addr = XEXP (fnaddr, 0);
27854 if (flag_pic
27855 && GET_CODE (addr) == SYMBOL_REF
27856 && !SYMBOL_REF_LOCAL_P (addr))
27857 {
27858 if (flag_plt
27859 && (SYMBOL_REF_DECL (addr) == NULL_TREE
27860 || !lookup_attribute ("noplt",
27861 DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
27862 {
27863 if (!TARGET_64BIT
27864 || (ix86_cmodel == CM_LARGE_PIC
27865 && DEFAULT_ABI != MS_ABI))
27866 {
27867 use_reg (&use, gen_rtx_REG (Pmode,
27868 REAL_PIC_OFFSET_TABLE_REGNUM));
27869 if (ix86_use_pseudo_pic_reg ())
27870 emit_move_insn (gen_rtx_REG (Pmode,
27871 REAL_PIC_OFFSET_TABLE_REGNUM),
27872 pic_offset_table_rtx);
27873 }
27874 }
27875 else if (!TARGET_PECOFF && !TARGET_MACHO)
27876 {
27877 if (TARGET_64BIT)
27878 {
27879 fnaddr = gen_rtx_UNSPEC (Pmode,
27880 gen_rtvec (1, addr),
27881 UNSPEC_GOTPCREL);
27882 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27883 }
27884 else
27885 {
27886 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
27887 UNSPEC_GOT);
27888 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
27889 fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
27890 fnaddr);
27891 }
27892 fnaddr = gen_const_mem (Pmode, fnaddr);
27893 /* Pmode may not be the same as word_mode for x32, which
27894 doesn't support indirect branch via 32-bit memory slot.
27895 Since x32 GOT slot is 64 bit with zero upper 32 bits,
27896 indirect branch via x32 GOT slot is OK. */
27897 if (GET_MODE (fnaddr) != word_mode)
27898 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
27899 fnaddr = gen_rtx_MEM (QImode, fnaddr);
27900 }
27901 }
27902 }
27903
27904 /* Skip setting up RAX register for -mskip-rax-setup when there are no
27905 parameters passed in vector registers. */
27906 if (TARGET_64BIT
27907 && (INTVAL (callarg2) > 0
27908 || (INTVAL (callarg2) == 0
27909 && (TARGET_SSE || !flag_skip_rax_setup))))
27910 {
27911 rtx al = gen_rtx_REG (QImode, AX_REG);
27912 emit_move_insn (al, callarg2);
27913 use_reg (&use, al);
27914 }
27915
27916 if (ix86_cmodel == CM_LARGE_PIC
27917 && !TARGET_PECOFF
27918 && MEM_P (fnaddr)
27919 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
27920 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
27921 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
27922 /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
27923 branch via x32 GOT slot is OK. */
27924 else if (!(TARGET_X32
27925 && MEM_P (fnaddr)
27926 && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
27927 && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
27928 && (sibcall
27929 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
27930 : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
27931 {
27932 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
27933 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
27934 }
27935
27936 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
27937
27938 if (retval)
27939 {
27940 /* Add the bound registers as destinations in case a pointer
27941 with bounds may be returned. */
27942 if (TARGET_MPX && SCALAR_INT_MODE_P (GET_MODE (retval)))
27943 {
27944 rtx b0 = gen_rtx_REG (BND64mode, FIRST_BND_REG);
27945 rtx b1 = gen_rtx_REG (BND64mode, FIRST_BND_REG + 1);
27946 if (GET_CODE (retval) == PARALLEL)
27947 {
27948 b0 = gen_rtx_EXPR_LIST (VOIDmode, b0, const0_rtx);
27949 b1 = gen_rtx_EXPR_LIST (VOIDmode, b1, const0_rtx);
27950 rtx par = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, b0, b1));
27951 retval = chkp_join_splitted_slot (retval, par);
27952 }
27953 else
27954 {
27955 retval = gen_rtx_PARALLEL (VOIDmode,
27956 gen_rtvec (3, retval, b0, b1));
27957 chkp_put_regs_to_expr_list (retval);
27958 }
27959 }
27960
27961 call = gen_rtx_SET (retval, call);
27962 }
27963 vec[vec_len++] = call;
27964
27965 if (pop)
27966 {
27967 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
27968 pop = gen_rtx_SET (stack_pointer_rtx, pop);
27969 vec[vec_len++] = pop;
27970 }
27971
27972 if (cfun->machine->no_caller_saved_registers
27973 && (!fndecl
27974 || (!TREE_THIS_VOLATILE (fndecl)
27975 && !lookup_attribute ("no_caller_saved_registers",
27976 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
27977 {
27978 static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
27979 bool is_64bit_ms_abi = (TARGET_64BIT
27980 && ix86_function_abi (fndecl) == MS_ABI);
27981 char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
27982
27983 /* If there are no caller-saved registers, add all registers
27984 that are clobbered by the call which returns. */
27985 for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
27986 if (!fixed_regs[i]
27987 && (ix86_call_used_regs[i] == 1
27988 || (ix86_call_used_regs[i] & c_mask))
27989 && !STACK_REGNO_P (i)
27990 && !MMX_REGNO_P (i))
27991 clobber_reg (&use,
27992 gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
27993 }
27994 else if (TARGET_64BIT_MS_ABI
27995 && (!callarg2 || INTVAL (callarg2) != -2))
27996 {
27997 unsigned i;
27998
27999 for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
28000 {
28001 int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
28002 machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
28003
28004 clobber_reg (&use, gen_rtx_REG (mode, regno));
28005 }
28006
28007 /* Set here, but it may get cleared later. */
28008 if (TARGET_CALL_MS2SYSV_XLOGUES)
28009 {
28010 if (!TARGET_SSE)
28011 ;
28012
28013 /* Don't break hot-patched functions. */
28014 else if (ix86_function_ms_hook_prologue (current_function_decl))
28015 ;
28016
28017 /* TODO: Cases not yet examined. */
28018 else if (flag_split_stack)
28019 warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
28020
28021 else
28022 {
28023 gcc_assert (!reload_completed);
28024 cfun->machine->call_ms2sysv = true;
28025 }
28026 }
28027 }
28028
28029 if (vec_len > 1)
28030 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
28031 call = emit_call_insn (call);
28032 if (use)
28033 CALL_INSN_FUNCTION_USAGE (call) = use;
28034
28035 return call;
28036 }
28037
28038 /* Return true if the function being called was marked with attribute
28039 "noplt" or using -fno-plt and we are compiling for non-PIC. We need
28040 to handle the non-PIC case in the backend because there is no easy
28041 interface for the front-end to force non-PLT calls to use the GOT.
28042 This is currently used only with 64-bit or 32-bit GOT32X ELF targets
28043 to call the function marked "noplt" indirectly. */
28044
28045 static bool
28046 ix86_nopic_noplt_attribute_p (rtx call_op)
28047 {
28048 if (flag_pic || ix86_cmodel == CM_LARGE
28049 || !(TARGET_64BIT || HAVE_AS_IX86_GOT32X)
28050 || TARGET_MACHO || TARGET_SEH || TARGET_PECOFF
28051 || SYMBOL_REF_LOCAL_P (call_op))
28052 return false;
28053
28054 tree symbol_decl = SYMBOL_REF_DECL (call_op);
28055
28056 if (!flag_plt
28057 || (symbol_decl != NULL_TREE
28058 && lookup_attribute ("noplt", DECL_ATTRIBUTES (symbol_decl))))
28059 return true;
28060
28061 return false;
28062 }
28063
28064 /* Output the assembly for a call instruction. */
28065
28066 const char *
28067 ix86_output_call_insn (rtx_insn *insn, rtx call_op)
28068 {
28069 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
28070 bool seh_nop_p = false;
28071 const char *xasm;
28072
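/* In the templates below, {att|intel} selects between the AT&T and
   Intel assembler dialects, and %! emits an instruction prefix
   (such as bnd or notrack) when one is required. */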
28073 if (SIBLING_CALL_P (insn))
28074 {
28075 if (direct_p)
28076 {
28077 if (ix86_nopic_noplt_attribute_p (call_op))
28078 {
28079 if (TARGET_64BIT)
28080 xasm = "%!jmp\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28081 else
28082 xasm = "%!jmp\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28083 }
28084 else
28085 xasm = "%!jmp\t%P0";
28086 }
28087 /* SEH epilogue detection requires the indirect branch case
28088 to include REX.W. */
28089 else if (TARGET_SEH)
28090 xasm = "%!rex.W jmp\t%A0";
28091 else
28092 xasm = "%!jmp\t%A0";
28093
28094 output_asm_insn (xasm, &call_op);
28095 return "";
28096 }
28097
28098 /* SEH unwinding can require an extra nop to be emitted in several
28099 circumstances. Determine if we have one of those. */
28100 if (TARGET_SEH)
28101 {
28102 rtx_insn *i;
28103
28104 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
28105 {
28106 /* If we get to another real insn, we don't need the nop. */
28107 if (INSN_P (i))
28108 break;
28109
28110 /* If we get to the epilogue note, prevent a catch region from
28111 being adjacent to the standard epilogue sequence. With non-call
28112 exceptions, we'll have done this during epilogue emission. */
28113 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
28114 && !flag_non_call_exceptions
28115 && !can_throw_internal (insn))
28116 {
28117 seh_nop_p = true;
28118 break;
28119 }
28120 }
28121
28122 /* If we didn't find a real insn following the call, prevent the
28123 unwinder from looking into the next function. */
28124 if (i == NULL)
28125 seh_nop_p = true;
28126 }
28127
28128 if (direct_p)
28129 {
28130 if (ix86_nopic_noplt_attribute_p (call_op))
28131 {
28132 if (TARGET_64BIT)
28133 xasm = "%!call\t{*%p0@GOTPCREL(%%rip)|[QWORD PTR %p0@GOTPCREL[rip]]}";
28134 else
28135 xasm = "%!call\t{*%p0@GOT|[DWORD PTR %p0@GOT]}";
28136 }
28137 else
28138 xasm = "%!call\t%P0";
28139 }
28140 else
28141 xasm = "%!call\t%A0";
28142
28143 output_asm_insn (xasm, &call_op);
28144
28145 if (seh_nop_p)
28146 return "nop";
28147
28148 return "";
28149 }
28150 \f
28151 /* Clear stack slot assignments remembered from previous functions.
28152 This is called from INIT_EXPANDERS once before RTL is emitted for each
28153 function. */
28154
28155 static struct machine_function *
28156 ix86_init_machine_status (void)
28157 {
28158 struct machine_function *f;
28159
28160 f = ggc_cleared_alloc<machine_function> ();
28161 f->call_abi = ix86_abi;
28162
28163 return f;
28164 }
28165
28166 /* Return a MEM corresponding to a stack slot with mode MODE.
28167 Allocate a new slot if necessary.
28168
28169 The RTL for a function can have several slots available: N is
28170 which slot to use. */
28171
28172 rtx
28173 assign_386_stack_local (machine_mode mode, enum ix86_stack_slot n)
28174 {
28175 struct stack_local_entry *s;
28176
28177 gcc_assert (n < MAX_386_STACK_LOCALS);
28178
28179 for (s = ix86_stack_locals; s; s = s->next)
28180 if (s->mode == mode && s->n == n)
28181 return validize_mem (copy_rtx (s->rtl));
28182
28183 s = ggc_alloc<stack_local_entry> ();
28184 s->n = n;
28185 s->mode = mode;
28186 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
28187
28188 s->next = ix86_stack_locals;
28189 ix86_stack_locals = s;
28190 return validize_mem (copy_rtx (s->rtl));
28191 }
28192
28193 static void
28194 ix86_instantiate_decls (void)
28195 {
28196 struct stack_local_entry *s;
28197
28198 for (s = ix86_stack_locals; s; s = s->next)
28199 if (s->rtl != NULL_RTX)
28200 instantiate_decl_rtl (s->rtl);
28201 }
28202 \f
28203 /* Return the number used for encoding REG, in the range 0..7. */
28204
28205 static int
28206 reg_encoded_number (rtx reg)
28207 {
28208 unsigned regno = REGNO (reg);
28209 switch (regno)
28210 {
28211 case AX_REG:
28212 return 0;
28213 case CX_REG:
28214 return 1;
28215 case DX_REG:
28216 return 2;
28217 case BX_REG:
28218 return 3;
28219 case SP_REG:
28220 return 4;
28221 case BP_REG:
28222 return 5;
28223 case SI_REG:
28224 return 6;
28225 case DI_REG:
28226 return 7;
28227 default:
28228 break;
28229 }
28230 if (IN_RANGE (regno, FIRST_STACK_REG, LAST_STACK_REG))
28231 return regno - FIRST_STACK_REG;
28232 if (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))
28233 return regno - FIRST_SSE_REG;
28234 if (IN_RANGE (regno, FIRST_MMX_REG, LAST_MMX_REG))
28235 return regno - FIRST_MMX_REG;
28236 if (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))
28237 return regno - FIRST_REX_SSE_REG;
28238 if (IN_RANGE (regno, FIRST_REX_INT_REG, LAST_REX_INT_REG))
28239 return regno - FIRST_REX_INT_REG;
28240 if (IN_RANGE (regno, FIRST_MASK_REG, LAST_MASK_REG))
28241 return regno - FIRST_MASK_REG;
28242 if (IN_RANGE (regno, FIRST_BND_REG, LAST_BND_REG))
28243 return regno - FIRST_BND_REG;
28244 return -1;
28245 }
28246
28247 /* Given an insn INSN with NOPERANDS OPERANDS, return the modr/m byte used
28248 in its encoding if it could be relevant for ROP mitigation, otherwise
28249 return -1. If POPNO0 and POPNO1 are nonnull, store the operand numbers
28250 used for calculating it into them. */
28251
28252 static int
28253 ix86_get_modrm_for_rop (rtx_insn *insn, rtx *operands, int noperands,
28254 int *popno0 = 0, int *popno1 = 0)
28255 {
28256 if (asm_noperands (PATTERN (insn)) >= 0)
28257 return -1;
28258 int has_modrm = get_attr_modrm (insn);
28259 if (!has_modrm)
28260 return -1;
28261 enum attr_modrm_class cls = get_attr_modrm_class (insn);
28262 rtx op0, op1;
28263 switch (cls)
28264 {
28265 case MODRM_CLASS_OP02:
28266 gcc_assert (noperands >= 3);
28267 if (popno0)
28268 {
28269 *popno0 = 0;
28270 *popno1 = 2;
28271 }
28272 op0 = operands[0];
28273 op1 = operands[2];
28274 break;
28275 case MODRM_CLASS_OP01:
28276 gcc_assert (noperands >= 2);
28277 if (popno0)
28278 {
28279 *popno0 = 0;
28280 *popno1 = 1;
28281 }
28282 op0 = operands[0];
28283 op1 = operands[1];
28284 break;
28285 default:
28286 return -1;
28287 }
28288 if (REG_P (op0) && REG_P (op1))
28289 {
28290 int enc0 = reg_encoded_number (op0);
28291 int enc1 = reg_encoded_number (op1);
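/* ModRM byte layout: mod in bits 7-6, reg in bits 5-3, r/m in
   bits 2-0; 0xc0 selects the register-direct form (mod == 11). */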
28292 return 0xc0 + (enc1 << 3) + enc0;
28293 }
28294 return -1;
28295 }
28296
28297 /* Check whether x86 address PARTS is a pc-relative address. */
28298
28299 bool
28300 ix86_rip_relative_addr_p (struct ix86_address *parts)
28301 {
28302 rtx base, index, disp;
28303
28304 base = parts->base;
28305 index = parts->index;
28306 disp = parts->disp;
28307
28308 if (disp && !base && !index)
28309 {
28310 if (TARGET_64BIT)
28311 {
28312 rtx symbol = disp;
28313
28314 if (GET_CODE (disp) == CONST)
28315 symbol = XEXP (disp, 0);
28316 if (GET_CODE (symbol) == PLUS
28317 && CONST_INT_P (XEXP (symbol, 1)))
28318 symbol = XEXP (symbol, 0);
28319
28320 if (GET_CODE (symbol) == LABEL_REF
28321 || (GET_CODE (symbol) == SYMBOL_REF
28322 && SYMBOL_REF_TLS_MODEL (symbol) == 0)
28323 || (GET_CODE (symbol) == UNSPEC
28324 && (XINT (symbol, 1) == UNSPEC_GOTPCREL
28325 || XINT (symbol, 1) == UNSPEC_PCREL
28326 || XINT (symbol, 1) == UNSPEC_GOTNTPOFF)))
28327 return true;
28328 }
28329 }
28330 return false;
28331 }
28332
28333 /* Calculate the length of the memory address in the instruction encoding.
28334 Includes addr32 prefix, does not include the one-byte modrm, opcode,
28335 or other prefixes. We never generate addr32 prefix for LEA insn. */
28336
28337 int
28338 memory_address_length (rtx addr, bool lea)
28339 {
28340 struct ix86_address parts;
28341 rtx base, index, disp;
28342 int len;
28343 int ok;
28344
28345 if (GET_CODE (addr) == PRE_DEC
28346 || GET_CODE (addr) == POST_INC
28347 || GET_CODE (addr) == PRE_MODIFY
28348 || GET_CODE (addr) == POST_MODIFY)
28349 return 0;
28350
28351 ok = ix86_decompose_address (addr, &parts);
28352 gcc_assert (ok);
28353
28354 len = (parts.seg == ADDR_SPACE_GENERIC) ? 0 : 1;
28355
28356 /* If this is not an LEA instruction, add the length of the addr32 prefix. */
28357 if (TARGET_64BIT && !lea
28358 && (SImode_address_operand (addr, VOIDmode)
28359 || (parts.base && GET_MODE (parts.base) == SImode)
28360 || (parts.index && GET_MODE (parts.index) == SImode)))
28361 len++;
28362
28363 base = parts.base;
28364 index = parts.index;
28365 disp = parts.disp;
28366
28367 if (base && SUBREG_P (base))
28368 base = SUBREG_REG (base);
28369 if (index && SUBREG_P (index))
28370 index = SUBREG_REG (index);
28371
28372 gcc_assert (base == NULL_RTX || REG_P (base));
28373 gcc_assert (index == NULL_RTX || REG_P (index));
28374
28375 /* Rule of thumb:
28376 - esp as the base always wants an index,
28377 - ebp as the base always wants a displacement,
28378 - r12 as the base always wants an index,
28379 - r13 as the base always wants a displacement. */
28380
28381 /* Register Indirect. */
28382 if (base && !index && !disp)
28383 {
28384 /* esp (for its index) and ebp (for its displacement) need
28385 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
28386 code. */
28387 if (base == arg_pointer_rtx
28388 || base == frame_pointer_rtx
28389 || REGNO (base) == SP_REG
28390 || REGNO (base) == BP_REG
28391 || REGNO (base) == R12_REG
28392 || REGNO (base) == R13_REG)
28393 len++;
28394 }
28395
28396 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
28397 is not disp32, but disp32(%rip), so for disp32
28398 SIB byte is needed, unless print_operand_address
28399 optimizes it into disp32(%rip) or (%rip) is implied
28400 by UNSPEC. */
28401 else if (disp && !base && !index)
28402 {
28403 len += 4;
28404 if (!ix86_rip_relative_addr_p (&parts))
28405 len++;
28406 }
28407 else
28408 {
28409 /* Find the length of the displacement constant. */
28410 if (disp)
28411 {
28412 if (base && satisfies_constraint_K (disp))
28413 len += 1;
28414 else
28415 len += 4;
28416 }
28417 /* ebp always wants a displacement. Similarly r13. */
28418 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
28419 len++;
28420
28421 /* An index requires the two-byte modrm form.... */
28422 if (index
28423 /* ...like esp (or r12), which always wants an index. */
28424 || base == arg_pointer_rtx
28425 || base == frame_pointer_rtx
28426 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
28427 len++;
28428 }
28429
28430 return len;
28431 }
28432
28433 /* Compute the default value for the "length_immediate" attribute. When
28434 SHORTFORM is set, expect that the insn has an 8-bit immediate alternative. */
28435 int
28436 ix86_attr_length_immediate_default (rtx_insn *insn, bool shortform)
28437 {
28438 int len = 0;
28439 int i;
28440 extract_insn_cached (insn);
28441 for (i = recog_data.n_operands - 1; i >= 0; --i)
28442 if (CONSTANT_P (recog_data.operand[i]))
28443 {
28444 enum attr_mode mode = get_attr_mode (insn);
28445
28446 gcc_assert (!len);
28447 if (shortform && CONST_INT_P (recog_data.operand[i]))
28448 {
28449 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
28450 switch (mode)
28451 {
28452 case MODE_QI:
28453 len = 1;
28454 continue;
28455 case MODE_HI:
28456 ival = trunc_int_for_mode (ival, HImode);
28457 break;
28458 case MODE_SI:
28459 ival = trunc_int_for_mode (ival, SImode);
28460 break;
28461 default:
28462 break;
28463 }
28464 if (IN_RANGE (ival, -128, 127))
28465 {
28466 len = 1;
28467 continue;
28468 }
28469 }
28470 switch (mode)
28471 {
28472 case MODE_QI:
28473 len = 1;
28474 break;
28475 case MODE_HI:
28476 len = 2;
28477 break;
28478 case MODE_SI:
28479 len = 4;
28480 break;
28481 /* Immediates for DImode instructions are encoded
28482 as 32bit sign extended values. */
28483 case MODE_DI:
28484 len = 4;
28485 break;
28486 default:
28487 fatal_insn ("unknown insn mode", insn);
28488 }
28489 }
28490 return len;
28491 }
28492
28493 /* Compute default value for "length_address" attribute. */
28494 int
28495 ix86_attr_length_address_default (rtx_insn *insn)
28496 {
28497 int i;
28498
28499 if (get_attr_type (insn) == TYPE_LEA)
28500 {
28501 rtx set = PATTERN (insn), addr;
28502
28503 if (GET_CODE (set) == PARALLEL)
28504 set = XVECEXP (set, 0, 0);
28505
28506 gcc_assert (GET_CODE (set) == SET);
28507
28508 addr = SET_SRC (set);
28509
28510 return memory_address_length (addr, true);
28511 }
28512
28513 extract_insn_cached (insn);
28514 for (i = recog_data.n_operands - 1; i >= 0; --i)
28515 {
28516 rtx op = recog_data.operand[i];
28517 if (MEM_P (op))
28518 {
28519 constrain_operands_cached (insn, reload_completed);
28520 if (which_alternative != -1)
28521 {
28522 const char *constraints = recog_data.constraints[i];
28523 int alt = which_alternative;
28524
28525 while (*constraints == '=' || *constraints == '+')
28526 constraints++;
28527 while (alt-- > 0)
28528 while (*constraints++ != ',')
28529 ;
28530 /* Skip ignored operands. */
28531 if (*constraints == 'X')
28532 continue;
28533 }
28534
28535 int len = memory_address_length (XEXP (op, 0), false);
28536
28537 /* Account for segment prefix for non-default addr spaces. */
28538 if (!ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (op)))
28539 len++;
28540
28541 return len;
28542 }
28543 }
28544 return 0;
28545 }
28546
28547 /* Compute default value for "length_vex" attribute. It includes
28548 2 or 3 byte VEX prefix and 1 opcode byte. */
28549
28550 int
28551 ix86_attr_length_vex_default (rtx_insn *insn, bool has_0f_opcode,
28552 bool has_vex_w)
28553 {
28554 int i;
28555
28556 /* Only the 0f opcode map can use the 2-byte VEX prefix, and the VEX.W
28557 bit requires the 3-byte VEX prefix. */
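/* The 2-byte VEX prefix (C5) implies the 0F opcode map and cannot
   encode VEX.W, VEX.X or VEX.B, so those cases fall back to the
   3-byte (C4) form. */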
28558 if (!has_0f_opcode || has_vex_w)
28559 return 3 + 1;
28560
28561 /* We can always use 2 byte VEX prefix in 32bit. */
28562 if (!TARGET_64BIT)
28563 return 2 + 1;
28564
28565 extract_insn_cached (insn);
28566
28567 for (i = recog_data.n_operands - 1; i >= 0; --i)
28568 if (REG_P (recog_data.operand[i]))
28569 {
28570 /* REX.W bit uses 3 byte VEX prefix. */
28571 if (GET_MODE (recog_data.operand[i]) == DImode
28572 && GENERAL_REG_P (recog_data.operand[i]))
28573 return 3 + 1;
28574 }
28575 else
28576 {
28577 /* REX.X or REX.B bits use 3 byte VEX prefix. */
28578 if (MEM_P (recog_data.operand[i])
28579 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
28580 return 3 + 1;
28581 }
28582
28583 return 2 + 1;
28584 }
28585 \f
28586
28587 static bool
28588 ix86_class_likely_spilled_p (reg_class_t);
28589
28590 /* Return true if the lhs of INSN is a HW function argument register, and
28591 set *IS_SPILLED to true if it is a likely spilled HW register. */
28592 static bool
28593 insn_is_function_arg (rtx insn, bool* is_spilled)
28594 {
28595 rtx dst;
28596
28597 if (!NONDEBUG_INSN_P (insn))
28598 return false;
28599 /* Call instructions are not movable; ignore them. */
28600 if (CALL_P (insn))
28601 return false;
28602 insn = PATTERN (insn);
28603 if (GET_CODE (insn) == PARALLEL)
28604 insn = XVECEXP (insn, 0, 0);
28605 if (GET_CODE (insn) != SET)
28606 return false;
28607 dst = SET_DEST (insn);
28608 if (REG_P (dst) && HARD_REGISTER_P (dst)
28609 && ix86_function_arg_regno_p (REGNO (dst)))
28610 {
28611 /* Is it likely spilled HW register? */
28612 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
28613 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
28614 *is_spilled = true;
28615 return true;
28616 }
28617 return false;
28618 }
28619
28620 /* Add output dependencies for a chain of adjacent function arguments, but
28621 only if there is a move to a likely spilled HW register. Return the first
28622 argument if at least one dependence was added, or NULL otherwise. */
28623 static rtx_insn *
28624 add_parameter_dependencies (rtx_insn *call, rtx_insn *head)
28625 {
28626 rtx_insn *insn;
28627 rtx_insn *last = call;
28628 rtx_insn *first_arg = NULL;
28629 bool is_spilled = false;
28630
28631 head = PREV_INSN (head);
28632
28633 /* Find the argument-passing instruction nearest to the call. */
28634 while (true)
28635 {
28636 last = PREV_INSN (last);
28637 if (last == head)
28638 return NULL;
28639 if (!NONDEBUG_INSN_P (last))
28640 continue;
28641 if (insn_is_function_arg (last, &is_spilled))
28642 break;
28643 return NULL;
28644 }
28645
28646 first_arg = last;
28647 while (true)
28648 {
28649 insn = PREV_INSN (last);
28650 if (!INSN_P (insn))
28651 break;
28652 if (insn == head)
28653 break;
28654 if (!NONDEBUG_INSN_P (insn))
28655 {
28656 last = insn;
28657 continue;
28658 }
28659 if (insn_is_function_arg (insn, &is_spilled))
28660 {
28661 /* Add an output dependence between two function arguments if the chain
28662 of output arguments contains likely spilled HW registers. */
28663 if (is_spilled)
28664 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28665 first_arg = last = insn;
28666 }
28667 else
28668 break;
28669 }
28670 if (!is_spilled)
28671 return NULL;
28672 return first_arg;
28673 }
28674
28675 /* Add output or anti dependency from insn to first_arg to restrict its code
28676 motion. */
28677 static void
28678 avoid_func_arg_motion (rtx_insn *first_arg, rtx_insn *insn)
28679 {
28680 rtx set;
28681 rtx tmp;
28682
28683 /* Add anti dependencies for bounds stores. */
28684 if (INSN_P (insn)
28685 && GET_CODE (PATTERN (insn)) == PARALLEL
28686 && GET_CODE (XVECEXP (PATTERN (insn), 0, 0)) == UNSPEC
28687 && XINT (XVECEXP (PATTERN (insn), 0, 0), 1) == UNSPEC_BNDSTX)
28688 {
28689 add_dependence (first_arg, insn, REG_DEP_ANTI);
28690 return;
28691 }
28692
28693 set = single_set (insn);
28694 if (!set)
28695 return;
28696 tmp = SET_DEST (set);
28697 if (REG_P (tmp))
28698 {
28699 /* Add output dependency to the first function argument. */
28700 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
28701 return;
28702 }
28703 /* Add anti dependency. */
28704 add_dependence (first_arg, insn, REG_DEP_ANTI);
28705 }
28706
28707 /* Avoid cross-block motion of a function argument by adding a dependency
28708 from the first non-jump instruction in BB. */
28709 static void
28710 add_dependee_for_func_arg (rtx_insn *arg, basic_block bb)
28711 {
28712 rtx_insn *insn = BB_END (bb);
28713
28714 while (insn)
28715 {
28716 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
28717 {
28718 rtx set = single_set (insn);
28719 if (set)
28720 {
28721 avoid_func_arg_motion (arg, insn);
28722 return;
28723 }
28724 }
28725 if (insn == BB_HEAD (bb))
28726 return;
28727 insn = PREV_INSN (insn);
28728 }
28729 }
28730
28731 /* Hook for pre-reload schedule - avoid motion of function arguments
28732 passed in likely spilled HW registers. */
28733 static void
28734 ix86_dependencies_evaluation_hook (rtx_insn *head, rtx_insn *tail)
28735 {
28736 rtx_insn *insn;
28737 rtx_insn *first_arg = NULL;
28738 if (reload_completed)
28739 return;
28740 while (head != tail && DEBUG_INSN_P (head))
28741 head = NEXT_INSN (head);
28742 for (insn = tail; insn != head; insn = PREV_INSN (insn))
28743 if (INSN_P (insn) && CALL_P (insn))
28744 {
28745 first_arg = add_parameter_dependencies (insn, head);
28746 if (first_arg)
28747 {
28748 /* Add a dependee for the first argument to predecessors, but only if
28749 the region contains more than one block. */
28750 basic_block bb = BLOCK_FOR_INSN (insn);
28751 int rgn = CONTAINING_RGN (bb->index);
28752 int nr_blks = RGN_NR_BLOCKS (rgn);
28753 /* Skip trivial regions and region head blocks that can have
28754 predecessors outside of region. */
28755 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
28756 {
28757 edge e;
28758 edge_iterator ei;
28759
28760 /* Regions are SCCs with the exception of selective
28761 scheduling with pipelining of outer blocks enabled.
28762 So also check that immediate predecessors of a non-head
28763 block are in the same region. */
28764 FOR_EACH_EDGE (e, ei, bb->preds)
28765 {
28766 /* Avoid creating loop-carried dependencies by using the
28767 topological ordering in the region. */
28768 if (rgn == CONTAINING_RGN (e->src->index)
28769 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
28770 add_dependee_for_func_arg (first_arg, e->src);
28771 }
28772 }
28773 insn = first_arg;
28774 if (insn == head)
28775 break;
28776 }
28777 }
28778 else if (first_arg)
28779 avoid_func_arg_motion (first_arg, insn);
28780 }
28781
28782 /* Hook for pre-reload schedule - set the priority of moves from likely
28783 spilled HW registers to the maximum, to schedule them as soon as possible.
28784 These are moves from function argument registers at the top of the function
28785 entry and moves from function return value registers after a call. */
28786 static int
28787 ix86_adjust_priority (rtx_insn *insn, int priority)
28788 {
28789 rtx set;
28790
28791 if (reload_completed)
28792 return priority;
28793
28794 if (!NONDEBUG_INSN_P (insn))
28795 return priority;
28796
28797 set = single_set (insn);
28798 if (set)
28799 {
28800 rtx tmp = SET_SRC (set);
28801 if (REG_P (tmp)
28802 && HARD_REGISTER_P (tmp)
28803 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
28804 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
28805 return current_sched_info->sched_max_insns_priority;
28806 }
28807
28808 return priority;
28809 }
28810
28811 /* Prepare for scheduling pass. */
28812 static void
28813 ix86_sched_init_global (FILE *, int, int)
28814 {
28815 /* Install scheduling hooks for current CPU. Some of these hooks are used
28816 in time-critical parts of the scheduler, so we only set them up when
28817 they are actually used. */
28818 switch (ix86_tune)
28819 {
28820 case PROCESSOR_CORE2:
28821 case PROCESSOR_NEHALEM:
28822 case PROCESSOR_SANDYBRIDGE:
28823 case PROCESSOR_HASWELL:
28824 /* Do not perform multipass scheduling for pre-reload schedule
28825 to save compile time. */
28826 if (reload_completed)
28827 {
28828 ix86_core2i7_init_hooks ();
28829 break;
28830 }
28831 /* Fall through. */
28832 default:
28833 targetm.sched.dfa_post_advance_cycle = NULL;
28834 targetm.sched.first_cycle_multipass_init = NULL;
28835 targetm.sched.first_cycle_multipass_begin = NULL;
28836 targetm.sched.first_cycle_multipass_issue = NULL;
28837 targetm.sched.first_cycle_multipass_backtrack = NULL;
28838 targetm.sched.first_cycle_multipass_end = NULL;
28839 targetm.sched.first_cycle_multipass_fini = NULL;
28840 break;
28841 }
28842 }
28843
28844 \f
28845 /* Implement TARGET_STATIC_RTX_ALIGNMENT. */
28846
28847 static HOST_WIDE_INT
28848 ix86_static_rtx_alignment (machine_mode mode)
28849 {
28850 if (mode == DFmode)
28851 return 64;
28852 if (ALIGN_MODE_128 (mode))
28853 return MAX (128, GET_MODE_ALIGNMENT (mode));
28854 return GET_MODE_ALIGNMENT (mode);
28855 }
28856
28857 /* Implement TARGET_CONSTANT_ALIGNMENT. */
28858
28859 static HOST_WIDE_INT
28860 ix86_constant_alignment (const_tree exp, HOST_WIDE_INT align)
28861 {
28862 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
28863 || TREE_CODE (exp) == INTEGER_CST)
28864 {
28865 machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
28866 HOST_WIDE_INT mode_align = ix86_static_rtx_alignment (mode);
28867 return MAX (mode_align, align);
28868 }
28869 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
28870 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
28871 return BITS_PER_WORD;
28872
28873 return align;
28874 }
28875
28876 /* Implement TARGET_EMPTY_RECORD_P. */
28877
28878 static bool
28879 ix86_is_empty_record (const_tree type)
28880 {
28881 if (!TARGET_64BIT)
28882 return false;
28883 return default_is_empty_record (type);
28884 }
28885
28886 /* Implement TARGET_WARN_PARAMETER_PASSING_ABI. */
28887
28888 static void
28889 ix86_warn_parameter_passing_abi (cumulative_args_t cum_v, tree type)
28890 {
28891 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
28892
28893 if (!cum->warn_empty)
28894 return;
28895
28896 if (!TYPE_EMPTY_P (type))
28897 return;
28898
28899 const_tree ctx = get_ultimate_context (cum->decl);
28900 if (ctx != NULL_TREE
28901 && !TRANSLATION_UNIT_WARN_EMPTY_P (ctx))
28902 return;
28903
28904 /* If the actual size of the type is zero, then there is no change
28905 in how objects of this size are passed. */
28906 if (int_size_in_bytes (type) == 0)
28907 return;
28908
28909 warning (OPT_Wabi, "empty class %qT parameter passing ABI "
28910 "changes in -fabi-version=12 (GCC 8)", type);
28911
28912 /* Only warn once. */
28913 cum->warn_empty = false;
28914 }
28915
28916 /* Compute the alignment for a variable for Intel MCU psABI. TYPE is
28917 the data type, and ALIGN is the alignment that the object would
28918 ordinarily have. */
28919
28920 static int
28921 iamcu_alignment (tree type, int align)
28922 {
28923 machine_mode mode;
28924
28925 if (align < 32 || TYPE_USER_ALIGN (type))
28926 return align;
28927
28928 /* Intel MCU psABI specifies scalar types > 4 bytes aligned to 4
28929 bytes. */
28930 mode = TYPE_MODE (strip_array_types (type));
28931 switch (GET_MODE_CLASS (mode))
28932 {
28933 case MODE_INT:
28934 case MODE_COMPLEX_INT:
28935 case MODE_COMPLEX_FLOAT:
28936 case MODE_FLOAT:
28937 case MODE_DECIMAL_FLOAT:
28938 return 32;
28939 default:
28940 return align;
28941 }
28942 }
28943
28944 /* Compute the alignment for a static variable.
28945 TYPE is the data type, and ALIGN is the alignment that
28946 the object would ordinarily have. The value of this function is used
28947 instead of that alignment to align the object. */
28948
28949 int
28950 ix86_data_alignment (tree type, int align, bool opt)
28951 {
28952 /* GCC 4.8 and earlier used to incorrectly assume this alignment even
28953 for symbols from other compilation units or symbols that don't need
28954 to bind locally. In order to preserve some ABI compatibility with
28955 those compilers, ensure we don't decrease alignment from what we
28956 used to assume. */
28957
28958 int max_align_compat = MIN (256, MAX_OFILE_ALIGNMENT);
28959
28960 /* A data structure, equal or greater than the size of a cache line
28961 (64 bytes in the Pentium 4 and other recent Intel processors, including
28962 processors based on Intel Core microarchitecture) should be aligned
28963 so that its base address is a multiple of a cache line size. */
28964
28965 int max_align
28966 = MIN ((unsigned) ix86_tune_cost->prefetch_block * 8, MAX_OFILE_ALIGNMENT);
28967
28968 if (max_align < BITS_PER_WORD)
28969 max_align = BITS_PER_WORD;
28970
28971 switch (ix86_align_data_type)
28972 {
28973 case ix86_align_data_type_abi: opt = false; break;
28974 case ix86_align_data_type_compat: max_align = BITS_PER_WORD; break;
28975 case ix86_align_data_type_cacheline: break;
28976 }
28977
28978 if (TARGET_IAMCU)
28979 align = iamcu_alignment (type, align);
28980
28981 if (opt
28982 && AGGREGATE_TYPE_P (type)
28983 && TYPE_SIZE (type)
28984 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
28985 {
28986 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align_compat)
28987 && align < max_align_compat)
28988 align = max_align_compat;
28989 if (wi::geu_p (wi::to_wide (TYPE_SIZE (type)), max_align)
28990 && align < max_align)
28991 align = max_align;
28992 }
28993
28994 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
28995 to a 16-byte boundary. */
28996 if (TARGET_64BIT)
28997 {
28998 if ((opt ? AGGREGATE_TYPE_P (type) : TREE_CODE (type) == ARRAY_TYPE)
28999 && TYPE_SIZE (type)
29000 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29001 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29002 && align < 128)
29003 return 128;
29004 }
29005
29006 if (!opt)
29007 return align;
29008
29009 if (TREE_CODE (type) == ARRAY_TYPE)
29010 {
29011 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29012 return 64;
29013 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29014 return 128;
29015 }
29016 else if (TREE_CODE (type) == COMPLEX_TYPE)
29017 {
29018
29019 if (TYPE_MODE (type) == DCmode && align < 64)
29020 return 64;
29021 if ((TYPE_MODE (type) == XCmode
29022 || TYPE_MODE (type) == TCmode) && align < 128)
29023 return 128;
29024 }
29025 else if ((TREE_CODE (type) == RECORD_TYPE
29026 || TREE_CODE (type) == UNION_TYPE
29027 || TREE_CODE (type) == QUAL_UNION_TYPE)
29028 && TYPE_FIELDS (type))
29029 {
29030 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29031 return 64;
29032 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29033 return 128;
29034 }
29035 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29036 || TREE_CODE (type) == INTEGER_TYPE)
29037 {
29038 if (TYPE_MODE (type) == DFmode && align < 64)
29039 return 64;
29040 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29041 return 128;
29042 }
29043
29044 return align;
29045 }
29046
29047 /* Compute the alignment for a local variable or a stack slot. EXP is
29048 the data type or decl itself, MODE is the widest mode available and
29049 ALIGN is the alignment that the object would ordinarily have. The
29050 value of this macro is used instead of that alignment to align the
29051 object. */
29052
29053 unsigned int
29054 ix86_local_alignment (tree exp, machine_mode mode,
29055 unsigned int align)
29056 {
29057 tree type, decl;
29058
29059 if (exp && DECL_P (exp))
29060 {
29061 type = TREE_TYPE (exp);
29062 decl = exp;
29063 }
29064 else
29065 {
29066 type = exp;
29067 decl = NULL;
29068 }
29069
29070 /* Don't do dynamic stack realignment for long long objects with
29071 -mpreferred-stack-boundary=2. */
29072 if (!TARGET_64BIT
29073 && align == 64
29074 && ix86_preferred_stack_boundary < 64
29075 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
29076 && (!type || !TYPE_USER_ALIGN (type))
29077 && (!decl || !DECL_USER_ALIGN (decl)))
29078 align = 32;
29079
29080 /* If TYPE is NULL, we are allocating a stack slot for caller-save
29081 register in MODE. We will return the largest alignment of XF
29082 and DF. */
29083 if (!type)
29084 {
29085 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
29086 align = GET_MODE_ALIGNMENT (DFmode);
29087 return align;
29088 }
29089
29090 /* Don't increase alignment for Intel MCU psABI. */
29091 if (TARGET_IAMCU)
29092 return align;
29093
29094 /* The x86-64 ABI requires arrays of 16 bytes or more to be aligned
29095 to a 16-byte boundary. The exact wording is:
29096
29097 An array uses the same alignment as its elements, except that a local or
29098 global array variable of length at least 16 bytes or
29099 a C99 variable-length array variable always has alignment of at least 16 bytes.
29100
29101 This was added to allow the use of aligned SSE instructions on arrays. This
29102 rule is meant for static storage (where the compiler cannot do the analysis
29103 by itself). We follow it for automatic variables only when convenient.
29104 We fully control everything in the function being compiled, and functions
29105 from other units cannot rely on the alignment.
29106
29107 Exclude the va_list type. It is the common case of a local array where
29108 we cannot benefit from the alignment.
29109
29110 TODO: Probably one should optimize for size only when var is not escaping. */
29111 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
29112 && TARGET_SSE)
29113 {
29114 if (AGGREGATE_TYPE_P (type)
29115 && (va_list_type_node == NULL_TREE
29116 || (TYPE_MAIN_VARIANT (type)
29117 != TYPE_MAIN_VARIANT (va_list_type_node)))
29118 && TYPE_SIZE (type)
29119 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
29120 && wi::geu_p (wi::to_wide (TYPE_SIZE (type)), 128)
29121 && align < 128)
29122 return 128;
29123 }
29124 if (TREE_CODE (type) == ARRAY_TYPE)
29125 {
29126 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
29127 return 64;
29128 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
29129 return 128;
29130 }
29131 else if (TREE_CODE (type) == COMPLEX_TYPE)
29132 {
29133 if (TYPE_MODE (type) == DCmode && align < 64)
29134 return 64;
29135 if ((TYPE_MODE (type) == XCmode
29136 || TYPE_MODE (type) == TCmode) && align < 128)
29137 return 128;
29138 }
29139 else if ((TREE_CODE (type) == RECORD_TYPE
29140 || TREE_CODE (type) == UNION_TYPE
29141 || TREE_CODE (type) == QUAL_UNION_TYPE)
29142 && TYPE_FIELDS (type))
29143 {
29144 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
29145 return 64;
29146 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
29147 return 128;
29148 }
29149 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
29150 || TREE_CODE (type) == INTEGER_TYPE)
29151 {
29152
29153 if (TYPE_MODE (type) == DFmode && align < 64)
29154 return 64;
29155 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
29156 return 128;
29157 }
29158 return align;
29159 }
29160
29161 /* Compute the minimum required alignment for dynamic stack realignment
29162 purposes for a local variable, parameter or a stack slot. EXP is
29163 the data type or decl itself, MODE is its mode and ALIGN is the
29164 alignment that the object would ordinarily have. */
29165
29166 unsigned int
29167 ix86_minimum_alignment (tree exp, machine_mode mode,
29168 unsigned int align)
29169 {
29170 tree type, decl;
29171
29172 if (exp && DECL_P (exp))
29173 {
29174 type = TREE_TYPE (exp);
29175 decl = exp;
29176 }
29177 else
29178 {
29179 type = exp;
29180 decl = NULL;
29181 }
29182
29183 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
29184 return align;
29185
29186 /* Don't do dynamic stack realignment for long long objects with
29187 -mpreferred-stack-boundary=2. */
29188 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
29189 && (!type || !TYPE_USER_ALIGN (type))
29190 && (!decl || !DECL_USER_ALIGN (decl)))
29191 {
29192 gcc_checking_assert (!TARGET_STV);
29193 return 32;
29194 }
29195
29196 return align;
29197 }
29198 \f
29199 /* Find a location for the static chain incoming to a nested function.
29200 This is a register, unless all free registers are used by arguments. */
29201
29202 static rtx
29203 ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
29204 {
29205 unsigned regno;
29206
29207 /* While this function won't be called by the middle-end when a static
29208 chain isn't needed, it's also used throughout the backend so it's
29209 easiest to keep this check centralized. */
29210 if (DECL_P (fndecl_or_type) && !DECL_STATIC_CHAIN (fndecl_or_type))
29211 return NULL;
29212
29213 if (TARGET_64BIT)
29214 {
29215 /* We always use R10 in 64-bit mode. */
29216 regno = R10_REG;
29217 }
29218 else
29219 {
29220 const_tree fntype, fndecl;
29221 unsigned int ccvt;
29222
29223 /* By default in 32-bit mode we use ECX to pass the static chain. */
29224 regno = CX_REG;
29225
29226 if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
29227 {
29228 fntype = TREE_TYPE (fndecl_or_type);
29229 fndecl = fndecl_or_type;
29230 }
29231 else
29232 {
29233 fntype = fndecl_or_type;
29234 fndecl = NULL;
29235 }
29236
29237 ccvt = ix86_get_callcvt (fntype);
29238 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
29239 {
29240 /* Fastcall functions use ecx/edx for arguments, which leaves
29241 us with EAX for the static chain.
29242 Thiscall functions use ecx for arguments, which also
29243 leaves us with EAX for the static chain. */
29244 regno = AX_REG;
29245 }
29246 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
29247 {
29248 /* Thiscall functions use ecx for arguments, which leaves
29249 us with EAX and EDX for the static chain.
29250 We use EAX for ABI compatibility. */
29251 regno = AX_REG;
29252 }
29253 else if (ix86_function_regparm (fntype, fndecl) == 3)
29254 {
29255 /* For regparm 3, we have no free call-clobbered registers in
29256 which to store the static chain. In order to implement this,
29257 we have the trampoline push the static chain to the stack.
29258 However, we can't push a value below the return address when
29259 we call the nested function directly, so we have to use an
29260 alternate entry point. For this we use ESI, and have the
29261 alternate entry point push ESI, so that things appear the
29262 same once we're executing the nested function. */
29263 if (incoming_p)
29264 {
29265 if (fndecl == current_function_decl
29266 && !ix86_static_chain_on_stack)
29267 {
29268 gcc_assert (!reload_completed);
29269 ix86_static_chain_on_stack = true;
29270 }
29271 return gen_frame_mem (SImode,
29272 plus_constant (Pmode,
29273 arg_pointer_rtx, -8));
29274 }
29275 regno = SI_REG;
29276 }
29277 }
29278
29279 return gen_rtx_REG (Pmode, regno);
29280 }
29281
29282 /* Emit RTL insns to initialize the variable parts of a trampoline.
29283 FNDECL is the decl of the target address; M_TRAMP is a MEM for
29284 the trampoline, and CHAIN_VALUE is an RTX for the static chain
29285 to be passed to the target function. */
29286
29287 static void
29288 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
29289 {
29290 rtx mem, fnaddr;
29291 int opcode;
29292 int offset = 0;
29293
29294 fnaddr = XEXP (DECL_RTL (fndecl), 0);
29295
29296 if (TARGET_64BIT)
29297 {
29298 int size;
29299
29300 /* Load the function address to r11. Try to load address using
29301 the shorter movl instead of movabs. We may want to support
29302 movq for kernel mode, but the kernel does not use trampolines at
29303 the moment. FNADDR is a 32-bit address and may not be in
29304 DImode when ptr_mode == SImode. Always use movl in this
29305 case. */
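/* For reference, a sketch of the initialized trampoline bytes in the full
   movabs case (the movl variants below use prefix 0x41 instead of 0x49 and
   4-byte immediates):
     49 bb <8-byte fnaddr>   movabs $fnaddr, %r11
     49 ba <8-byte chain>    movabs $chain,  %r10
     49 ff e3 90             jmp *%r11; nop  */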
29306 if (ptr_mode == SImode
29307 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
29308 {
29309 fnaddr = copy_addr_to_reg (fnaddr);
29310
29311 mem = adjust_address (m_tramp, HImode, offset);
29312 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
29313
29314 mem = adjust_address (m_tramp, SImode, offset + 2);
29315 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
29316 offset += 6;
29317 }
29318 else
29319 {
29320 mem = adjust_address (m_tramp, HImode, offset);
29321 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
29322
29323 mem = adjust_address (m_tramp, DImode, offset + 2);
29324 emit_move_insn (mem, fnaddr);
29325 offset += 10;
29326 }
29327
29328 /* Load static chain using movabs to r10. Use the shorter movl
29329 instead of movabs when ptr_mode == SImode. */
29330 if (ptr_mode == SImode)
29331 {
29332 opcode = 0xba41;
29333 size = 6;
29334 }
29335 else
29336 {
29337 opcode = 0xba49;
29338 size = 10;
29339 }
29340
29341 mem = adjust_address (m_tramp, HImode, offset);
29342 emit_move_insn (mem, gen_int_mode (opcode, HImode));
29343
29344 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
29345 emit_move_insn (mem, chain_value);
29346 offset += size;
29347
29348 /* Jump to r11; the last (unused) byte is a nop, only there to
29349 pad the write out to a single 32-bit store. */
29350 mem = adjust_address (m_tramp, SImode, offset);
29351 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
29352 offset += 4;
29353 }
29354 else
29355 {
29356 rtx disp, chain;
29357
29358 /* Depending on the static chain location, either load a register
29359 with a constant, or push the constant to the stack. All of the
29360 instructions are the same size. */
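/* For reference, a sketch of the initialized 10-byte 32-bit trampoline
   (the first opcode depends on the static chain location):
     b9 <4-byte chain>   movl $chain, %ecx   (b8 for %eax, 68 for pushl)
     e9 <4-byte disp>    jmp  <target function>  */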
29361 chain = ix86_static_chain (fndecl, true);
29362 if (REG_P (chain))
29363 {
29364 switch (REGNO (chain))
29365 {
29366 case AX_REG:
29367 opcode = 0xb8; break;
29368 case CX_REG:
29369 opcode = 0xb9; break;
29370 default:
29371 gcc_unreachable ();
29372 }
29373 }
29374 else
29375 opcode = 0x68;
29376
29377 mem = adjust_address (m_tramp, QImode, offset);
29378 emit_move_insn (mem, gen_int_mode (opcode, QImode));
29379
29380 mem = adjust_address (m_tramp, SImode, offset + 1);
29381 emit_move_insn (mem, chain_value);
29382 offset += 5;
29383
29384 mem = adjust_address (m_tramp, QImode, offset);
29385 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
29386
29387 mem = adjust_address (m_tramp, SImode, offset + 1);
29388
29389 /* Compute offset from the end of the jmp to the target function.
29390 In the case where the trampoline stores the static chain on
29391 the stack, we need to skip the first insn which pushes the
29392 (call-saved) register static chain; this push is 1 byte. */
29393 offset += 5;
29394 disp = expand_binop (SImode, sub_optab, fnaddr,
29395 plus_constant (Pmode, XEXP (m_tramp, 0),
29396 offset - (MEM_P (chain) ? 1 : 0)),
29397 NULL_RTX, 1, OPTAB_DIRECT);
29398 emit_move_insn (mem, disp);
29399 }
29400
29401 gcc_assert (offset <= TRAMPOLINE_SIZE);
29402
29403 #ifdef HAVE_ENABLE_EXECUTE_STACK
29404 #ifdef CHECK_EXECUTE_STACK_ENABLED
29405 if (CHECK_EXECUTE_STACK_ENABLED)
29406 #endif
29407 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
29408 LCT_NORMAL, VOIDmode, XEXP (m_tramp, 0), Pmode);
29409 #endif
29410 }
29411
29412 static bool
29413 ix86_allocate_stack_slots_for_args (void)
29414 {
29415 /* Naked functions should not allocate stack slots for arguments. */
29416 return !ix86_function_naked (current_function_decl);
29417 }
29418
29419 static bool
29420 ix86_warn_func_return (tree decl)
29421 {
29422 /* Naked functions are implemented entirely in assembly, including the
29423 return sequence, so suppress warnings about this. */
29424 return !ix86_function_naked (decl);
29425 }
29426 \f
29427 /* The following file contains several enumerations and data structures
29428 built from the definitions in i386-builtin-types.def. */
29429
29430 #include "i386-builtin-types.inc"
29431
29432 /* Table for the ix86 builtin non-function types. */
29433 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
29434
29435 /* Retrieve an element from the above table, building some of
29436 the types lazily. */
29437
29438 static tree
29439 ix86_get_builtin_type (enum ix86_builtin_type tcode)
29440 {
29441 unsigned int index;
29442 tree type, itype;
29443
29444 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
29445
29446 type = ix86_builtin_type_tab[(int) tcode];
29447 if (type != NULL)
29448 return type;
29449
29450 gcc_assert (tcode > IX86_BT_LAST_PRIM);
29451 if (tcode <= IX86_BT_LAST_VECT)
29452 {
29453 machine_mode mode;
29454
29455 index = tcode - IX86_BT_LAST_PRIM - 1;
29456 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
29457 mode = ix86_builtin_type_vect_mode[index];
29458
29459 type = build_vector_type_for_mode (itype, mode);
29460 }
29461 else
29462 {
29463 int quals;
29464
29465 index = tcode - IX86_BT_LAST_VECT - 1;
29466 if (tcode <= IX86_BT_LAST_PTR)
29467 quals = TYPE_UNQUALIFIED;
29468 else
29469 quals = TYPE_QUAL_CONST;
29470
29471 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
29472 if (quals != TYPE_UNQUALIFIED)
29473 itype = build_qualified_type (itype, quals);
29474
29475 type = build_pointer_type (itype);
29476 }
29477
29478 ix86_builtin_type_tab[(int) tcode] = type;
29479 return type;
29480 }
29481
29482 /* Table for the ix86 builtin function types. */
29483 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
29484
29485 /* Retrieve an element from the above table, building some of
29486 the types lazily. */
29487
29488 static tree
29489 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
29490 {
29491 tree type;
29492
29493 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
29494
29495 type = ix86_builtin_func_type_tab[(int) tcode];
29496 if (type != NULL)
29497 return type;
29498
29499 if (tcode <= IX86_BT_LAST_FUNC)
29500 {
29501 unsigned start = ix86_builtin_func_start[(int) tcode];
29502 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
29503 tree rtype, atype, args = void_list_node;
29504 unsigned i;
29505
29506 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
29507 for (i = after - 1; i > start; --i)
29508 {
29509 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
29510 args = tree_cons (NULL, atype, args);
29511 }
29512
29513 type = build_function_type (rtype, args);
29514 }
29515 else
29516 {
29517 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
29518 enum ix86_builtin_func_type icode;
29519
29520 icode = ix86_builtin_func_alias_base[index];
29521 type = ix86_get_builtin_func_type (icode);
29522 }
29523
29524 ix86_builtin_func_type_tab[(int) tcode] = type;
29525 return type;
29526 }
29527
29528
29529 /* Codes for all the SSE/MMX builtins. Builtins not mentioned in any
29530 bdesc_* arrays below should come first, then builtins for each bdesc_*
29531 array in ascending order, so that we can use direct array accesses. */
29532 enum ix86_builtins
29533 {
29534 IX86_BUILTIN_MASKMOVQ,
29535 IX86_BUILTIN_LDMXCSR,
29536 IX86_BUILTIN_STMXCSR,
29537 IX86_BUILTIN_MASKMOVDQU,
29538 IX86_BUILTIN_PSLLDQ128,
29539 IX86_BUILTIN_CLFLUSH,
29540 IX86_BUILTIN_MONITOR,
29541 IX86_BUILTIN_MWAIT,
29542 IX86_BUILTIN_CLZERO,
29543 IX86_BUILTIN_VEC_INIT_V2SI,
29544 IX86_BUILTIN_VEC_INIT_V4HI,
29545 IX86_BUILTIN_VEC_INIT_V8QI,
29546 IX86_BUILTIN_VEC_EXT_V2DF,
29547 IX86_BUILTIN_VEC_EXT_V2DI,
29548 IX86_BUILTIN_VEC_EXT_V4SF,
29549 IX86_BUILTIN_VEC_EXT_V4SI,
29550 IX86_BUILTIN_VEC_EXT_V8HI,
29551 IX86_BUILTIN_VEC_EXT_V2SI,
29552 IX86_BUILTIN_VEC_EXT_V4HI,
29553 IX86_BUILTIN_VEC_EXT_V16QI,
29554 IX86_BUILTIN_VEC_SET_V2DI,
29555 IX86_BUILTIN_VEC_SET_V4SF,
29556 IX86_BUILTIN_VEC_SET_V4SI,
29557 IX86_BUILTIN_VEC_SET_V8HI,
29558 IX86_BUILTIN_VEC_SET_V4HI,
29559 IX86_BUILTIN_VEC_SET_V16QI,
29560 IX86_BUILTIN_GATHERSIV2DF,
29561 IX86_BUILTIN_GATHERSIV4DF,
29562 IX86_BUILTIN_GATHERDIV2DF,
29563 IX86_BUILTIN_GATHERDIV4DF,
29564 IX86_BUILTIN_GATHERSIV4SF,
29565 IX86_BUILTIN_GATHERSIV8SF,
29566 IX86_BUILTIN_GATHERDIV4SF,
29567 IX86_BUILTIN_GATHERDIV8SF,
29568 IX86_BUILTIN_GATHERSIV2DI,
29569 IX86_BUILTIN_GATHERSIV4DI,
29570 IX86_BUILTIN_GATHERDIV2DI,
29571 IX86_BUILTIN_GATHERDIV4DI,
29572 IX86_BUILTIN_GATHERSIV4SI,
29573 IX86_BUILTIN_GATHERSIV8SI,
29574 IX86_BUILTIN_GATHERDIV4SI,
29575 IX86_BUILTIN_GATHERDIV8SI,
29576 IX86_BUILTIN_VFMSUBSD3_MASK3,
29577 IX86_BUILTIN_VFMSUBSS3_MASK3,
29578 IX86_BUILTIN_GATHER3SIV8SF,
29579 IX86_BUILTIN_GATHER3SIV4SF,
29580 IX86_BUILTIN_GATHER3SIV4DF,
29581 IX86_BUILTIN_GATHER3SIV2DF,
29582 IX86_BUILTIN_GATHER3DIV8SF,
29583 IX86_BUILTIN_GATHER3DIV4SF,
29584 IX86_BUILTIN_GATHER3DIV4DF,
29585 IX86_BUILTIN_GATHER3DIV2DF,
29586 IX86_BUILTIN_GATHER3SIV8SI,
29587 IX86_BUILTIN_GATHER3SIV4SI,
29588 IX86_BUILTIN_GATHER3SIV4DI,
29589 IX86_BUILTIN_GATHER3SIV2DI,
29590 IX86_BUILTIN_GATHER3DIV8SI,
29591 IX86_BUILTIN_GATHER3DIV4SI,
29592 IX86_BUILTIN_GATHER3DIV4DI,
29593 IX86_BUILTIN_GATHER3DIV2DI,
29594 IX86_BUILTIN_SCATTERSIV8SF,
29595 IX86_BUILTIN_SCATTERSIV4SF,
29596 IX86_BUILTIN_SCATTERSIV4DF,
29597 IX86_BUILTIN_SCATTERSIV2DF,
29598 IX86_BUILTIN_SCATTERDIV8SF,
29599 IX86_BUILTIN_SCATTERDIV4SF,
29600 IX86_BUILTIN_SCATTERDIV4DF,
29601 IX86_BUILTIN_SCATTERDIV2DF,
29602 IX86_BUILTIN_SCATTERSIV8SI,
29603 IX86_BUILTIN_SCATTERSIV4SI,
29604 IX86_BUILTIN_SCATTERSIV4DI,
29605 IX86_BUILTIN_SCATTERSIV2DI,
29606 IX86_BUILTIN_SCATTERDIV8SI,
29607 IX86_BUILTIN_SCATTERDIV4SI,
29608 IX86_BUILTIN_SCATTERDIV4DI,
29609 IX86_BUILTIN_SCATTERDIV2DI,
29610 /* Alternate 4 and 8 element gather/scatter for the vectorizer
29611 where all operands are 32-byte or 64-byte wide respectively. */
29612 IX86_BUILTIN_GATHERALTSIV4DF,
29613 IX86_BUILTIN_GATHERALTDIV8SF,
29614 IX86_BUILTIN_GATHERALTSIV4DI,
29615 IX86_BUILTIN_GATHERALTDIV8SI,
29616 IX86_BUILTIN_GATHER3ALTDIV16SF,
29617 IX86_BUILTIN_GATHER3ALTDIV16SI,
29618 IX86_BUILTIN_GATHER3ALTSIV4DF,
29619 IX86_BUILTIN_GATHER3ALTDIV8SF,
29620 IX86_BUILTIN_GATHER3ALTSIV4DI,
29621 IX86_BUILTIN_GATHER3ALTDIV8SI,
29622 IX86_BUILTIN_GATHER3ALTSIV8DF,
29623 IX86_BUILTIN_GATHER3ALTSIV8DI,
29624 IX86_BUILTIN_GATHER3DIV16SF,
29625 IX86_BUILTIN_GATHER3DIV16SI,
29626 IX86_BUILTIN_GATHER3DIV8DF,
29627 IX86_BUILTIN_GATHER3DIV8DI,
29628 IX86_BUILTIN_GATHER3SIV16SF,
29629 IX86_BUILTIN_GATHER3SIV16SI,
29630 IX86_BUILTIN_GATHER3SIV8DF,
29631 IX86_BUILTIN_GATHER3SIV8DI,
29632 IX86_BUILTIN_SCATTERALTSIV8DF,
29633 IX86_BUILTIN_SCATTERALTDIV16SF,
29634 IX86_BUILTIN_SCATTERALTSIV8DI,
29635 IX86_BUILTIN_SCATTERALTDIV16SI,
29636 IX86_BUILTIN_SCATTERDIV16SF,
29637 IX86_BUILTIN_SCATTERDIV16SI,
29638 IX86_BUILTIN_SCATTERDIV8DF,
29639 IX86_BUILTIN_SCATTERDIV8DI,
29640 IX86_BUILTIN_SCATTERSIV16SF,
29641 IX86_BUILTIN_SCATTERSIV16SI,
29642 IX86_BUILTIN_SCATTERSIV8DF,
29643 IX86_BUILTIN_SCATTERSIV8DI,
29644 IX86_BUILTIN_GATHERPFQPD,
29645 IX86_BUILTIN_GATHERPFDPS,
29646 IX86_BUILTIN_GATHERPFDPD,
29647 IX86_BUILTIN_GATHERPFQPS,
29648 IX86_BUILTIN_SCATTERPFDPD,
29649 IX86_BUILTIN_SCATTERPFDPS,
29650 IX86_BUILTIN_SCATTERPFQPD,
29651 IX86_BUILTIN_SCATTERPFQPS,
29652 IX86_BUILTIN_CLWB,
29653 IX86_BUILTIN_CLFLUSHOPT,
29654 IX86_BUILTIN_INFQ,
29655 IX86_BUILTIN_HUGE_VALQ,
29656 IX86_BUILTIN_NANQ,
29657 IX86_BUILTIN_NANSQ,
29658 IX86_BUILTIN_XABORT,
29659 IX86_BUILTIN_ADDCARRYX32,
29660 IX86_BUILTIN_ADDCARRYX64,
29661 IX86_BUILTIN_SBB32,
29662 IX86_BUILTIN_SBB64,
29663 IX86_BUILTIN_RDRAND16_STEP,
29664 IX86_BUILTIN_RDRAND32_STEP,
29665 IX86_BUILTIN_RDRAND64_STEP,
29666 IX86_BUILTIN_RDSEED16_STEP,
29667 IX86_BUILTIN_RDSEED32_STEP,
29668 IX86_BUILTIN_RDSEED64_STEP,
29669 IX86_BUILTIN_MONITORX,
29670 IX86_BUILTIN_MWAITX,
29671 IX86_BUILTIN_CFSTRING,
29672 IX86_BUILTIN_CPU_INIT,
29673 IX86_BUILTIN_CPU_IS,
29674 IX86_BUILTIN_CPU_SUPPORTS,
29675 IX86_BUILTIN_READ_FLAGS,
29676 IX86_BUILTIN_WRITE_FLAGS,
29677
29678 /* All the remaining builtins are tracked in bdesc_* arrays in
29679 i386-builtin.def. Don't add any IX86_BUILTIN_* enumerators after
29680 this point. */
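/* For illustration (a hypothetical entry, not one from i386-builtin.def):
   a line such as
     BDESC (OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_foo",
	    IX86_BUILTIN_FOO, UNKNOWN, 0)
   expands here to the enumerator IX86_BUILTIN_FOO and, with the
   table-building definition of BDESC further below, to an entry in the
   corresponding bdesc_* array.  */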
29681 #define BDESC(mask, icode, name, code, comparison, flag) \
29682 code,
29683 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
29684 code, \
29685 IX86_BUILTIN__BDESC_##kindu##_FIRST = code,
29686 #define BDESC_END(kind, next_kind)
29687
29688 #include "i386-builtin.def"
29689
29690 #undef BDESC
29691 #undef BDESC_FIRST
29692 #undef BDESC_END
29693
29694 IX86_BUILTIN_MAX,
29695
29696 IX86_BUILTIN__BDESC_MAX_FIRST = IX86_BUILTIN_MAX,
29697
29698 /* Now just the aliases for bdesc_* start/end. */
29699 #define BDESC(mask, icode, name, code, comparison, flag)
29700 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag)
29701 #define BDESC_END(kind, next_kind) \
29702 IX86_BUILTIN__BDESC_##kind##_LAST \
29703 = IX86_BUILTIN__BDESC_##next_kind##_FIRST - 1,
29704
29705 #include "i386-builtin.def"
29706
29707 #undef BDESC
29708 #undef BDESC_FIRST
29709 #undef BDESC_END
29710
29711 /* Just to make sure there is no comma after the last enumerator. */
29712 IX86_BUILTIN__BDESC_MAX_LAST = IX86_BUILTIN__BDESC_MAX_FIRST
29713 };
29714
29715 /* Table for the ix86 builtin decls. */
29716 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
29717
29718 /* Table of all of the builtin functions that are possible with different ISA's
29719 but are waiting to be built until a function is declared to use that
29720 ISA. */
29721 struct builtin_isa {
29722 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
29723 HOST_WIDE_INT isa2; /* additional isa_flags this builtin is defined for */
29724 const char *name; /* function name */
29725 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
29726 unsigned char const_p:1; /* true if the declaration is constant */
29727 unsigned char pure_p:1; /* true if the declaration has pure attribute */
29728 bool leaf_p; /* true if the declaration has leaf attribute */
29729 bool nothrow_p; /* true if the declaration has nothrow attribute */
29730 bool set_and_not_built_p;
29731 };
29732
29733 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
29734
29735 /* Bits that can still enable any inclusion of a builtin. */
29736 static HOST_WIDE_INT deferred_isa_values = 0;
29737 static HOST_WIDE_INT deferred_isa_values2 = 0;
29738
29739 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save the MASK
29740 of which isa_flags to use in the ix86_builtins_isa array. Stores the
29741 function decl in the ix86_builtins array. Returns the function decl or
29742 NULL_TREE, if the builtin was not added.
29743
29744 If the front end has a special hook for builtin functions, delay adding
29745 builtin functions that aren't in the current ISA until the ISA is changed
29746 with function specific optimization. Doing so can save about 300K for the
29747 default compiler. When the builtin is expanded, check at that time whether
29748 it is valid.
29749
29750 If the front end doesn't have a special hook, record all builtins, even
29751 those not in the current ISA, in case the user uses function specific
29752 options for a different ISA, so that we don't get scope errors if a
29753 builtin is added in the middle of a function scope. */
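/* For example, a typical caller (from ix86_init_mmx_sse_builtins below) is:
     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
		  VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);  */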
29754
29755 static inline tree
29756 def_builtin (HOST_WIDE_INT mask, const char *name,
29757 enum ix86_builtin_func_type tcode,
29758 enum ix86_builtins code)
29759 {
29760 tree decl = NULL_TREE;
29761
29762 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
29763 {
29764 ix86_builtins_isa[(int) code].isa = mask;
29765
29766 /* OPTION_MASK_ISA_AVX512VL has a special meaning. Unlike the generic
29767 case, where any set bit means the built-in is enabled, this bit must be
29768 *and-ed* with another one. E.g.: OPTION_MASK_ISA_AVX512DQ
29769 | OPTION_MASK_ISA_AVX512VL means that *both* cpuid bits must be set for
29770 the built-in to be available. Handle this here. */
29771 if (mask & ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
29772 mask &= ~OPTION_MASK_ISA_AVX512VL;
29773
29774 mask &= ~OPTION_MASK_ISA_64BIT;
29775 if (mask == 0
29776 || (mask & ix86_isa_flags) != 0
29777 || (lang_hooks.builtin_function
29778 == lang_hooks.builtin_function_ext_scope))
29779
29780 {
29781 tree type = ix86_get_builtin_func_type (tcode);
29782 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29783 NULL, NULL_TREE);
29784 ix86_builtins[(int) code] = decl;
29785 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29786 }
29787 else
29788 {
29789 /* Just a MASK where set_and_not_built_p == true can potentially
29790 include a builtin. */
29791 deferred_isa_values |= mask;
29792 ix86_builtins[(int) code] = NULL_TREE;
29793 ix86_builtins_isa[(int) code].tcode = tcode;
29794 ix86_builtins_isa[(int) code].name = name;
29795 ix86_builtins_isa[(int) code].leaf_p = false;
29796 ix86_builtins_isa[(int) code].nothrow_p = false;
29797 ix86_builtins_isa[(int) code].const_p = false;
29798 ix86_builtins_isa[(int) code].pure_p = false;
29799 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29800 }
29801 }
29802
29803 return decl;
29804 }
29805
29806 /* Like def_builtin, but also marks the function decl "const". */
29807
29808 static inline tree
29809 def_builtin_const (HOST_WIDE_INT mask, const char *name,
29810 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29811 {
29812 tree decl = def_builtin (mask, name, tcode, code);
29813 if (decl)
29814 TREE_READONLY (decl) = 1;
29815 else
29816 ix86_builtins_isa[(int) code].const_p = true;
29817
29818 return decl;
29819 }
29820
29821 /* Like def_builtin, but also marks the function decl "pure". */
29822
29823 static inline tree
29824 def_builtin_pure (HOST_WIDE_INT mask, const char *name,
29825 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29826 {
29827 tree decl = def_builtin (mask, name, tcode, code);
29828 if (decl)
29829 DECL_PURE_P (decl) = 1;
29830 else
29831 ix86_builtins_isa[(int) code].pure_p = true;
29832
29833 return decl;
29834 }
29835
29836 /* Like def_builtin, but for additional isa2 flags. */
29837
29838 static inline tree
29839 def_builtin2 (HOST_WIDE_INT mask, const char *name,
29840 enum ix86_builtin_func_type tcode,
29841 enum ix86_builtins code)
29842 {
29843 tree decl = NULL_TREE;
29844
29845 ix86_builtins_isa[(int) code].isa2 = mask;
29846
29847 if (mask == 0
29848 || (mask & ix86_isa_flags2) != 0
29849 || (lang_hooks.builtin_function
29850 == lang_hooks.builtin_function_ext_scope))
29851
29852 {
29853 tree type = ix86_get_builtin_func_type (tcode);
29854 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29855 NULL, NULL_TREE);
29856 ix86_builtins[(int) code] = decl;
29857 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
29858 }
29859 else
29860 {
29861 /* Just a MASK where set_and_not_built_p == true can potentially
29862 include a builtin. */
29863 deferred_isa_values2 |= mask;
29864 ix86_builtins[(int) code] = NULL_TREE;
29865 ix86_builtins_isa[(int) code].tcode = tcode;
29866 ix86_builtins_isa[(int) code].name = name;
29867 ix86_builtins_isa[(int) code].leaf_p = false;
29868 ix86_builtins_isa[(int) code].nothrow_p = false;
29869 ix86_builtins_isa[(int) code].const_p = false;
29870 ix86_builtins_isa[(int) code].pure_p = false;
29871 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
29872 }
29873
29874 return decl;
29875 }
29876
29877 /* Like def_builtin2, but also marks the function decl "const". */
29878
29879 static inline tree
29880 def_builtin_const2 (HOST_WIDE_INT mask, const char *name,
29881 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29882 {
29883 tree decl = def_builtin2 (mask, name, tcode, code);
29884 if (decl)
29885 TREE_READONLY (decl) = 1;
29886 else
29887 ix86_builtins_isa[(int) code].const_p = true;
29888
29889 return decl;
29890 }
29891
29892 /* Like def_builtin2, but also marks the function decl "pure". */
29893
29894 static inline tree
29895 def_builtin_pure2 (HOST_WIDE_INT mask, const char *name,
29896 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
29897 {
29898 tree decl = def_builtin2 (mask, name, tcode, code);
29899 if (decl)
29900 DECL_PURE_P (decl) = 1;
29901 else
29902 ix86_builtins_isa[(int) code].pure_p = true;
29903
29904 return decl;
29905 }
29906
29907 /* Add any new builtin functions for a given ISA that may not have been
29908 declared. This saves a bit of space compared to adding all of the
29909 declarations to the tree, even if we didn't use them. */
29910
29911 static void
29912 ix86_add_new_builtins (HOST_WIDE_INT isa, HOST_WIDE_INT isa2)
29913 {
29914 if ((isa & deferred_isa_values) == 0
29915 && (isa2 & deferred_isa_values2) == 0)
29916 return;
29917
29918 /* Bits in ISA value can be removed from potential isa values. */
29919 deferred_isa_values &= ~isa;
29920 deferred_isa_values2 &= ~isa2;
29921
29922 int i;
29923 tree saved_current_target_pragma = current_target_pragma;
29924 current_target_pragma = NULL_TREE;
29925
29926 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
29927 {
29928 if (((ix86_builtins_isa[i].isa & isa) != 0
29929 || (ix86_builtins_isa[i].isa2 & isa2) != 0)
29930 && ix86_builtins_isa[i].set_and_not_built_p)
29931 {
29932 tree decl, type;
29933
29934 /* Don't define the builtin again. */
29935 ix86_builtins_isa[i].set_and_not_built_p = false;
29936
29937 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
29938 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
29939 type, i, BUILT_IN_MD, NULL,
29940 NULL_TREE);
29941
29942 ix86_builtins[i] = decl;
29943 if (ix86_builtins_isa[i].const_p)
29944 TREE_READONLY (decl) = 1;
29945 if (ix86_builtins_isa[i].pure_p)
29946 DECL_PURE_P (decl) = 1;
29947 if (ix86_builtins_isa[i].leaf_p)
29948 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
29949 NULL_TREE);
29950 if (ix86_builtins_isa[i].nothrow_p)
29951 TREE_NOTHROW (decl) = 1;
29952 }
29953 }
29954
29955 current_target_pragma = saved_current_target_pragma;
29956 }
29957
29958 /* Bits for builtin_description.flag. */
29959
29960 /* Set when we don't support the comparison natively, and should
29961 swap_comparison in order to support it. */
29962 #define BUILTIN_DESC_SWAP_OPERANDS 1
29963
29964 struct builtin_description
29965 {
29966 const HOST_WIDE_INT mask;
29967 const enum insn_code icode;
29968 const char *const name;
29969 const enum ix86_builtins code;
29970 const enum rtx_code comparison;
29971 const int flag;
29972 };
29973
29974 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
29975 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
29976 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
29977 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
29978 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
29979 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
29980 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
29981 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
29982 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
29983 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
29984 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
29985 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
29986 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
29987 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
29988 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
29989 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
29990 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
29991 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
29992 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
29993 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
29994 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
29995 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
29996 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
29997 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
29998 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
29999 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
30000 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
30001 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
30002 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
30003 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
30004 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
30005 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
30006 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
30007 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
30008 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
30009 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
30010 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
30011 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
30012 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
30013 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
30014 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
30015 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
30016 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
30017 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
30018 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
30019 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
30020 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
30021 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
30022 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
30023 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
30024 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
30025 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
30026
30027 #define BDESC(mask, icode, name, code, comparison, flag) \
30028 { mask, icode, name, code, comparison, flag },
30029 #define BDESC_FIRST(kind, kindu, mask, icode, name, code, comparison, flag) \
30030 static const struct builtin_description bdesc_##kind[] = \
30031 { \
30032 BDESC (mask, icode, name, code, comparison, flag)
30033 #define BDESC_END(kind, next_kind) \
30034 };
30035
30036 #include "i386-builtin.def"
30037
30038 #undef BDESC
30039 #undef BDESC_FIRST
30040 #undef BDESC_END
30041 \f
30042 /* TM vector builtins. */
30043
30044 /* Reuse the existing x86-specific `struct builtin_description' because
30045 we're lazy. Add casts to make them fit. */
30046 static const struct builtin_description bdesc_tm[] =
30047 {
30048 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30049 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30050 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
30051 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30052 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30053 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30054 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
30055
30056 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30057 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30058 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
30059 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30060 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30061 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30062 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
30063
30064 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30065 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30066 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
30067 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30068 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30069 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30070 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
30071
30072 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
30073 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
30074 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
30075 };
30076
30077 /* Initialize the transactional memory vector load/store builtins. */
30078
30079 static void
30080 ix86_init_tm_builtins (void)
30081 {
30082 enum ix86_builtin_func_type ftype;
30083 const struct builtin_description *d;
30084 size_t i;
30085 tree decl;
30086 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
30087 tree attrs_log, attrs_type_log;
30088
30089 if (!flag_tm)
30090 return;
30091
30092 /* If there are no builtins defined, we must be compiling in a
30093 language without trans-mem support. */
30094 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
30095 return;
30096
30097 /* Use whatever attributes a normal TM load has. */
30098 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
30099 attrs_load = DECL_ATTRIBUTES (decl);
30100 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30101 /* Use whatever attributes a normal TM store has. */
30102 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
30103 attrs_store = DECL_ATTRIBUTES (decl);
30104 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30105 /* Use whatever attributes a normal TM log has. */
30106 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
30107 attrs_log = DECL_ATTRIBUTES (decl);
30108 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
30109
30110 for (i = 0, d = bdesc_tm;
30111 i < ARRAY_SIZE (bdesc_tm);
30112 i++, d++)
30113 {
30114 if ((d->mask & ix86_isa_flags) != 0
30115 || (lang_hooks.builtin_function
30116 == lang_hooks.builtin_function_ext_scope))
30117 {
30118 tree type, attrs, attrs_type;
30119 enum built_in_function code = (enum built_in_function) d->code;
30120
30121 ftype = (enum ix86_builtin_func_type) d->flag;
30122 type = ix86_get_builtin_func_type (ftype);
30123
30124 if (BUILTIN_TM_LOAD_P (code))
30125 {
30126 attrs = attrs_load;
30127 attrs_type = attrs_type_load;
30128 }
30129 else if (BUILTIN_TM_STORE_P (code))
30130 {
30131 attrs = attrs_store;
30132 attrs_type = attrs_type_store;
30133 }
30134 else
30135 {
30136 attrs = attrs_log;
30137 attrs_type = attrs_type_log;
30138 }
30139 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
30140 /* The builtin name without the __builtin_
30141 prefix, for calling it directly. */
30142 d->name + strlen ("__builtin_"),
30143 attrs);
30144 /* add_builtin_function() will set the DECL_ATTRIBUTES, now
30145 set the TYPE_ATTRIBUTES. */
30146 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
30147
30148 set_builtin_decl (code, decl, false);
30149 }
30150 }
30151 }
30152
30153 /* Macros for verification of enum ix86_builtins order. */
30154 #define BDESC_VERIFY(x, y, z) \
30155 gcc_checking_assert ((x) == (enum ix86_builtins) ((y) + (z)))
30156 #define BDESC_VERIFYS(x, y, z) \
30157 STATIC_ASSERT ((x) == (enum ix86_builtins) ((y) + (z)))
30158
30159 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30160 IX86_BUILTIN__BDESC_COMI_LAST, 1);
30161 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30162 IX86_BUILTIN__BDESC_PCMPESTR_LAST, 1);
30163 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30164 IX86_BUILTIN__BDESC_PCMPISTR_LAST, 1);
30165 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_FIRST,
30166 IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST, 1);
30167 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30168 IX86_BUILTIN__BDESC_ARGS_LAST, 1);
30169 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_FIRST,
30170 IX86_BUILTIN__BDESC_ROUND_ARGS_LAST, 1);
30171 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30172 IX86_BUILTIN__BDESC_ARGS2_LAST, 1);
30173 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_FIRST,
30174 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST, 1);
30175 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30176 IX86_BUILTIN__BDESC_MPX_LAST, 1);
30177 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30178 IX86_BUILTIN__BDESC_MPX_CONST_LAST, 1);
30179 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_FIRST,
30180 IX86_BUILTIN__BDESC_MULTI_ARG_LAST, 1);
30181 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30182 IX86_BUILTIN__BDESC_CET_LAST, 1);
30183 BDESC_VERIFYS (IX86_BUILTIN_MAX,
30184 IX86_BUILTIN__BDESC_CET_NORMAL_LAST, 1);
30185
30186 /* Set up all the MMX/SSE builtins, even builtins for instructions that are
30187 not in the current target ISA, to allow the user to compile particular
30188 modules with target specific options that differ from the command line
30189 options. */
30190 static void
30191 ix86_init_mmx_sse_builtins (void)
30192 {
30193 const struct builtin_description * d;
30194 enum ix86_builtin_func_type ftype;
30195 size_t i;
30196
30197 /* Add all special builtins with variable number of operands. */
30198 for (i = 0, d = bdesc_special_args;
30199 i < ARRAY_SIZE (bdesc_special_args);
30200 i++, d++)
30201 {
30202 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST, i);
30203 if (d->name == 0)
30204 continue;
30205
30206 ftype = (enum ix86_builtin_func_type) d->flag;
30207 def_builtin (d->mask, d->name, ftype, d->code);
30208 }
30209 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST,
30210 IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST,
30211 ARRAY_SIZE (bdesc_special_args) - 1);
30212
30213 /* Add all builtins with variable number of operands. */
30214 for (i = 0, d = bdesc_args;
30215 i < ARRAY_SIZE (bdesc_args);
30216 i++, d++)
30217 {
30218 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS_FIRST, i);
30219 if (d->name == 0)
30220 continue;
30221
30222 ftype = (enum ix86_builtin_func_type) d->flag;
30223 def_builtin_const (d->mask, d->name, ftype, d->code);
30224 }
30225 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS_LAST,
30226 IX86_BUILTIN__BDESC_ARGS_FIRST,
30227 ARRAY_SIZE (bdesc_args) - 1);
30228
30229 /* Add all builtins with variable number of operands. */
30230 for (i = 0, d = bdesc_args2;
30231 i < ARRAY_SIZE (bdesc_args2);
30232 i++, d++)
30233 {
30234 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ARGS2_FIRST, i);
30235 if (d->name == 0)
30236 continue;
30237
30238 ftype = (enum ix86_builtin_func_type) d->flag;
30239 def_builtin_const2 (d->mask, d->name, ftype, d->code);
30240 }
30241 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ARGS2_LAST,
30242 IX86_BUILTIN__BDESC_ARGS2_FIRST,
30243 ARRAY_SIZE (bdesc_args2) - 1);
30244
30245 for (i = 0, d = bdesc_special_args2;
30246 i < ARRAY_SIZE (bdesc_special_args2);
30247 i++, d++)
30248 {
30249 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST, i);
30250 if (d->name == 0)
30251 continue;
30252
30253 ftype = (enum ix86_builtin_func_type) d->flag;
30254 def_builtin2 (d->mask, d->name, ftype, d->code);
30255 }
30256 BDESC_VERIFYS (IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST,
30257 IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST,
30258 ARRAY_SIZE (bdesc_special_args2) - 1);
30259
30260 /* Add all builtins with rounding. */
30261 for (i = 0, d = bdesc_round_args;
30262 i < ARRAY_SIZE (bdesc_round_args);
30263 i++, d++)
30264 {
30265 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST, i);
30266 if (d->name == 0)
30267 continue;
30268
30269 ftype = (enum ix86_builtin_func_type) d->flag;
30270 def_builtin_const (d->mask, d->name, ftype, d->code);
30271 }
30272 BDESC_VERIFYS (IX86_BUILTIN__BDESC_ROUND_ARGS_LAST,
30273 IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST,
30274 ARRAY_SIZE (bdesc_round_args) - 1);
30275
30276 /* pcmpestr[im] insns. */
30277 for (i = 0, d = bdesc_pcmpestr;
30278 i < ARRAY_SIZE (bdesc_pcmpestr);
30279 i++, d++)
30280 {
30281 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPESTR_FIRST, i);
30282 if (d->code == IX86_BUILTIN_PCMPESTRM128)
30283 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
30284 else
30285 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
30286 def_builtin_const (d->mask, d->name, ftype, d->code);
30287 }
30288 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPESTR_LAST,
30289 IX86_BUILTIN__BDESC_PCMPESTR_FIRST,
30290 ARRAY_SIZE (bdesc_pcmpestr) - 1);
30291
30292 /* pcmpistr[im] insns. */
30293 for (i = 0, d = bdesc_pcmpistr;
30294 i < ARRAY_SIZE (bdesc_pcmpistr);
30295 i++, d++)
30296 {
30297 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_PCMPISTR_FIRST, i);
30298 if (d->code == IX86_BUILTIN_PCMPISTRM128)
30299 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
30300 else
30301 ftype = INT_FTYPE_V16QI_V16QI_INT;
30302 def_builtin_const (d->mask, d->name, ftype, d->code);
30303 }
30304 BDESC_VERIFYS (IX86_BUILTIN__BDESC_PCMPISTR_LAST,
30305 IX86_BUILTIN__BDESC_PCMPISTR_FIRST,
30306 ARRAY_SIZE (bdesc_pcmpistr) - 1);
30307
30308 /* comi/ucomi insns. */
30309 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
30310 {
30311 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_COMI_FIRST, i);
30312 if (d->mask == OPTION_MASK_ISA_SSE2)
30313 ftype = INT_FTYPE_V2DF_V2DF;
30314 else
30315 ftype = INT_FTYPE_V4SF_V4SF;
30316 def_builtin_const (d->mask, d->name, ftype, d->code);
30317 }
30318 BDESC_VERIFYS (IX86_BUILTIN__BDESC_COMI_LAST,
30319 IX86_BUILTIN__BDESC_COMI_FIRST,
30320 ARRAY_SIZE (bdesc_comi) - 1);
30321
30322 /* SSE */
30323 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
30324 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
30325 def_builtin_pure (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
30326 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
30327
30328 /* SSE or 3DNow!A */
30329 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30330 /* As it uses V4HImode, we have to require -mmmx too. */
30331 | OPTION_MASK_ISA_MMX,
30332 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
30333 IX86_BUILTIN_MASKMOVQ);
30334
30335 /* SSE2 */
30336 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
30337 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
30338
30339 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
30340 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
30341 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
30342 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
30343
30344 /* SSE3. */
30345 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
30346 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
30347 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
30348 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
30349
30350 /* AES */
30351 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
30352 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
30353 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
30354 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
30355 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
30356 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
30357 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
30358 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
30359 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
30360 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
30361 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
30362 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
30363
30364 /* PCLMUL */
30365 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
30366 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
30367
30368 /* RDRND */
30369 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
30370 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
30371 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
30372 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
30373 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
30374 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
30375 IX86_BUILTIN_RDRAND64_STEP);
30376
30377 /* AVX2 */
30378 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
30379 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
30380 IX86_BUILTIN_GATHERSIV2DF);
30381
30382 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
30383 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
30384 IX86_BUILTIN_GATHERSIV4DF);
30385
30386 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
30387 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
30388 IX86_BUILTIN_GATHERDIV2DF);
30389
30390 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
30391 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
30392 IX86_BUILTIN_GATHERDIV4DF);
30393
30394 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
30395 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
30396 IX86_BUILTIN_GATHERSIV4SF);
30397
30398 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
30399 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
30400 IX86_BUILTIN_GATHERSIV8SF);
30401
30402 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
30403 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
30404 IX86_BUILTIN_GATHERDIV4SF);
30405
30406 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
30407 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
30408 IX86_BUILTIN_GATHERDIV8SF);
30409
30410 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
30411 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
30412 IX86_BUILTIN_GATHERSIV2DI);
30413
30414 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
30415 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
30416 IX86_BUILTIN_GATHERSIV4DI);
30417
30418 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
30419 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
30420 IX86_BUILTIN_GATHERDIV2DI);
30421
30422 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
30423 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
30424 IX86_BUILTIN_GATHERDIV4DI);
30425
30426 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
30427 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
30428 IX86_BUILTIN_GATHERSIV4SI);
30429
30430 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
30431 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
30432 IX86_BUILTIN_GATHERSIV8SI);
30433
30434 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
30435 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
30436 IX86_BUILTIN_GATHERDIV4SI);
30437
30438 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
30439 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
30440 IX86_BUILTIN_GATHERDIV8SI);
30441
30442 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
30443 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
30444 IX86_BUILTIN_GATHERALTSIV4DF);
30445
30446 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
30447 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
30448 IX86_BUILTIN_GATHERALTDIV8SF);
30449
30450 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
30451 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
30452 IX86_BUILTIN_GATHERALTSIV4DI);
30453
30454 def_builtin_pure (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
30455 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
30456 IX86_BUILTIN_GATHERALTDIV8SI);
30457
30458 /* AVX512F */
30459 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16sf",
30460 V16SF_FTYPE_V16SF_PCVOID_V16SI_HI_INT,
30461 IX86_BUILTIN_GATHER3SIV16SF);
30462
30463 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8df",
30464 V8DF_FTYPE_V8DF_PCVOID_V8SI_QI_INT,
30465 IX86_BUILTIN_GATHER3SIV8DF);
30466
30467 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16sf",
30468 V8SF_FTYPE_V8SF_PCVOID_V8DI_QI_INT,
30469 IX86_BUILTIN_GATHER3DIV16SF);
30470
30471 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8df",
30472 V8DF_FTYPE_V8DF_PCVOID_V8DI_QI_INT,
30473 IX86_BUILTIN_GATHER3DIV8DF);
30474
30475 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv16si",
30476 V16SI_FTYPE_V16SI_PCVOID_V16SI_HI_INT,
30477 IX86_BUILTIN_GATHER3SIV16SI);
30478
30479 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gathersiv8di",
30480 V8DI_FTYPE_V8DI_PCVOID_V8SI_QI_INT,
30481 IX86_BUILTIN_GATHER3SIV8DI);
30482
30483 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv16si",
30484 V8SI_FTYPE_V8SI_PCVOID_V8DI_QI_INT,
30485 IX86_BUILTIN_GATHER3DIV16SI);
30486
30487 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatherdiv8di",
30488 V8DI_FTYPE_V8DI_PCVOID_V8DI_QI_INT,
30489 IX86_BUILTIN_GATHER3DIV8DI);
30490
30491 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8df ",
30492 V8DF_FTYPE_V8DF_PCDOUBLE_V16SI_QI_INT,
30493 IX86_BUILTIN_GATHER3ALTSIV8DF);
30494
30495 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8sf ",
30496 V16SF_FTYPE_V16SF_PCFLOAT_V8DI_HI_INT,
30497 IX86_BUILTIN_GATHER3ALTDIV16SF);
30498
30499 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltsiv8di ",
30500 V8DI_FTYPE_V8DI_PCINT64_V16SI_QI_INT,
30501 IX86_BUILTIN_GATHER3ALTSIV8DI);
30502
30503 def_builtin_pure (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_gatheraltdiv8si ",
30504 V16SI_FTYPE_V16SI_PCINT_V8DI_HI_INT,
30505 IX86_BUILTIN_GATHER3ALTDIV16SI);
30506
30507 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16sf",
30508 VOID_FTYPE_PVOID_HI_V16SI_V16SF_INT,
30509 IX86_BUILTIN_SCATTERSIV16SF);
30510
30511 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8df",
30512 VOID_FTYPE_PVOID_QI_V8SI_V8DF_INT,
30513 IX86_BUILTIN_SCATTERSIV8DF);
30514
30515 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16sf",
30516 VOID_FTYPE_PVOID_QI_V8DI_V8SF_INT,
30517 IX86_BUILTIN_SCATTERDIV16SF);
30518
30519 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8df",
30520 VOID_FTYPE_PVOID_QI_V8DI_V8DF_INT,
30521 IX86_BUILTIN_SCATTERDIV8DF);
30522
30523 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv16si",
30524 VOID_FTYPE_PVOID_HI_V16SI_V16SI_INT,
30525 IX86_BUILTIN_SCATTERSIV16SI);
30526
30527 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scattersiv8di",
30528 VOID_FTYPE_PVOID_QI_V8SI_V8DI_INT,
30529 IX86_BUILTIN_SCATTERSIV8DI);
30530
30531 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv16si",
30532 VOID_FTYPE_PVOID_QI_V8DI_V8SI_INT,
30533 IX86_BUILTIN_SCATTERDIV16SI);
30534
30535 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatterdiv8di",
30536 VOID_FTYPE_PVOID_QI_V8DI_V8DI_INT,
30537 IX86_BUILTIN_SCATTERDIV8DI);
30538
30539 /* AVX512VL */
30540 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2df",
30541 V2DF_FTYPE_V2DF_PCVOID_V4SI_QI_INT,
30542 IX86_BUILTIN_GATHER3SIV2DF);
30543
30544 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4df",
30545 V4DF_FTYPE_V4DF_PCVOID_V4SI_QI_INT,
30546 IX86_BUILTIN_GATHER3SIV4DF);
30547
30548 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2df",
30549 V2DF_FTYPE_V2DF_PCVOID_V2DI_QI_INT,
30550 IX86_BUILTIN_GATHER3DIV2DF);
30551
30552 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4df",
30553 V4DF_FTYPE_V4DF_PCVOID_V4DI_QI_INT,
30554 IX86_BUILTIN_GATHER3DIV4DF);
30555
30556 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4sf",
30557 V4SF_FTYPE_V4SF_PCVOID_V4SI_QI_INT,
30558 IX86_BUILTIN_GATHER3SIV4SF);
30559
30560 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8sf",
30561 V8SF_FTYPE_V8SF_PCVOID_V8SI_QI_INT,
30562 IX86_BUILTIN_GATHER3SIV8SF);
30563
30564 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4sf",
30565 V4SF_FTYPE_V4SF_PCVOID_V2DI_QI_INT,
30566 IX86_BUILTIN_GATHER3DIV4SF);
30567
30568 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8sf",
30569 V4SF_FTYPE_V4SF_PCVOID_V4DI_QI_INT,
30570 IX86_BUILTIN_GATHER3DIV8SF);
30571
30572 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv2di",
30573 V2DI_FTYPE_V2DI_PCVOID_V4SI_QI_INT,
30574 IX86_BUILTIN_GATHER3SIV2DI);
30575
30576 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4di",
30577 V4DI_FTYPE_V4DI_PCVOID_V4SI_QI_INT,
30578 IX86_BUILTIN_GATHER3SIV4DI);
30579
30580 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div2di",
30581 V2DI_FTYPE_V2DI_PCVOID_V2DI_QI_INT,
30582 IX86_BUILTIN_GATHER3DIV2DI);
30583
30584 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4di",
30585 V4DI_FTYPE_V4DI_PCVOID_V4DI_QI_INT,
30586 IX86_BUILTIN_GATHER3DIV4DI);
30587
30588 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv4si",
30589 V4SI_FTYPE_V4SI_PCVOID_V4SI_QI_INT,
30590 IX86_BUILTIN_GATHER3SIV4SI);
30591
30592 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3siv8si",
30593 V8SI_FTYPE_V8SI_PCVOID_V8SI_QI_INT,
30594 IX86_BUILTIN_GATHER3SIV8SI);
30595
30596 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div4si",
30597 V4SI_FTYPE_V4SI_PCVOID_V2DI_QI_INT,
30598 IX86_BUILTIN_GATHER3DIV4SI);
30599
30600 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3div8si",
30601 V4SI_FTYPE_V4SI_PCVOID_V4DI_QI_INT,
30602 IX86_BUILTIN_GATHER3DIV8SI);
30603
30604 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4df ",
30605 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_QI_INT,
30606 IX86_BUILTIN_GATHER3ALTSIV4DF);
30607
30608 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8sf ",
30609 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_QI_INT,
30610 IX86_BUILTIN_GATHER3ALTDIV8SF);
30611
30612 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altsiv4di ",
30613 V4DI_FTYPE_V4DI_PCINT64_V8SI_QI_INT,
30614 IX86_BUILTIN_GATHER3ALTSIV4DI);
30615
30616 def_builtin_pure (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_gather3altdiv8si ",
30617 V8SI_FTYPE_V8SI_PCINT_V4DI_QI_INT,
30618 IX86_BUILTIN_GATHER3ALTDIV8SI);
30619
30620 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8sf",
30621 VOID_FTYPE_PVOID_QI_V8SI_V8SF_INT,
30622 IX86_BUILTIN_SCATTERSIV8SF);
30623
30624 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4sf",
30625 VOID_FTYPE_PVOID_QI_V4SI_V4SF_INT,
30626 IX86_BUILTIN_SCATTERSIV4SF);
30627
30628 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4df",
30629 VOID_FTYPE_PVOID_QI_V4SI_V4DF_INT,
30630 IX86_BUILTIN_SCATTERSIV4DF);
30631
30632 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2df",
30633 VOID_FTYPE_PVOID_QI_V4SI_V2DF_INT,
30634 IX86_BUILTIN_SCATTERSIV2DF);
30635
30636 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8sf",
30637 VOID_FTYPE_PVOID_QI_V4DI_V4SF_INT,
30638 IX86_BUILTIN_SCATTERDIV8SF);
30639
30640 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4sf",
30641 VOID_FTYPE_PVOID_QI_V2DI_V4SF_INT,
30642 IX86_BUILTIN_SCATTERDIV4SF);
30643
30644 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4df",
30645 VOID_FTYPE_PVOID_QI_V4DI_V4DF_INT,
30646 IX86_BUILTIN_SCATTERDIV4DF);
30647
30648 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2df",
30649 VOID_FTYPE_PVOID_QI_V2DI_V2DF_INT,
30650 IX86_BUILTIN_SCATTERDIV2DF);
30651
30652 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv8si",
30653 VOID_FTYPE_PVOID_QI_V8SI_V8SI_INT,
30654 IX86_BUILTIN_SCATTERSIV8SI);
30655
30656 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4si",
30657 VOID_FTYPE_PVOID_QI_V4SI_V4SI_INT,
30658 IX86_BUILTIN_SCATTERSIV4SI);
30659
30660 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv4di",
30661 VOID_FTYPE_PVOID_QI_V4SI_V4DI_INT,
30662 IX86_BUILTIN_SCATTERSIV4DI);
30663
30664 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scattersiv2di",
30665 VOID_FTYPE_PVOID_QI_V4SI_V2DI_INT,
30666 IX86_BUILTIN_SCATTERSIV2DI);
30667
30668 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv8si",
30669 VOID_FTYPE_PVOID_QI_V4DI_V4SI_INT,
30670 IX86_BUILTIN_SCATTERDIV8SI);
30671
30672 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4si",
30673 VOID_FTYPE_PVOID_QI_V2DI_V4SI_INT,
30674 IX86_BUILTIN_SCATTERDIV4SI);
30675
30676 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv4di",
30677 VOID_FTYPE_PVOID_QI_V4DI_V4DI_INT,
30678 IX86_BUILTIN_SCATTERDIV4DI);
30679
30680 def_builtin (OPTION_MASK_ISA_AVX512VL, "__builtin_ia32_scatterdiv2di",
30681 VOID_FTYPE_PVOID_QI_V2DI_V2DI_INT,
30682 IX86_BUILTIN_SCATTERDIV2DI);
30683 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8df ",
30684 VOID_FTYPE_PDOUBLE_QI_V16SI_V8DF_INT,
30685 IX86_BUILTIN_SCATTERALTSIV8DF);
30686
30687 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8sf ",
30688 VOID_FTYPE_PFLOAT_HI_V8DI_V16SF_INT,
30689 IX86_BUILTIN_SCATTERALTDIV16SF);
30690
30691 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltsiv8di ",
30692 VOID_FTYPE_PLONGLONG_QI_V16SI_V8DI_INT,
30693 IX86_BUILTIN_SCATTERALTSIV8DI);
30694
30695 def_builtin (OPTION_MASK_ISA_AVX512F, "__builtin_ia32_scatteraltdiv8si ",
30696 VOID_FTYPE_PINT_HI_V8DI_V16SI_INT,
30697 IX86_BUILTIN_SCATTERALTDIV16SI);
30698
30699 /* AVX512PF */
30700 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdpd",
30701 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30702 IX86_BUILTIN_GATHERPFDPD);
30703 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfdps",
30704 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30705 IX86_BUILTIN_GATHERPFDPS);
30706 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqpd",
30707 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30708 IX86_BUILTIN_GATHERPFQPD);
30709 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_gatherpfqps",
30710 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30711 IX86_BUILTIN_GATHERPFQPS);
30712 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdpd",
30713 VOID_FTYPE_QI_V8SI_PCVOID_INT_INT,
30714 IX86_BUILTIN_SCATTERPFDPD);
30715 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfdps",
30716 VOID_FTYPE_HI_V16SI_PCVOID_INT_INT,
30717 IX86_BUILTIN_SCATTERPFDPS);
30718 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqpd",
30719 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30720 IX86_BUILTIN_SCATTERPFQPD);
30721 def_builtin (OPTION_MASK_ISA_AVX512PF, "__builtin_ia32_scatterpfqps",
30722 VOID_FTYPE_QI_V8DI_PCVOID_INT_INT,
30723 IX86_BUILTIN_SCATTERPFQPS);
30724
30725 /* SHA */
30726 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg1",
30727 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG1);
30728 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1msg2",
30729 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1MSG2);
30730 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1nexte",
30731 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA1NEXTE);
30732 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha1rnds4",
30733 V4SI_FTYPE_V4SI_V4SI_INT, IX86_BUILTIN_SHA1RNDS4);
30734 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg1",
30735 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG1);
30736 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256msg2",
30737 V4SI_FTYPE_V4SI_V4SI, IX86_BUILTIN_SHA256MSG2);
30738 def_builtin_const (OPTION_MASK_ISA_SHA, "__builtin_ia32_sha256rnds2",
30739 V4SI_FTYPE_V4SI_V4SI_V4SI, IX86_BUILTIN_SHA256RNDS2);
30740
30741 /* RTM. */
30742 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
30743 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
30744
30745 /* MMX access to the vec_init patterns. */
30746 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
30747 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
30748
30749 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
30750 V4HI_FTYPE_HI_HI_HI_HI,
30751 IX86_BUILTIN_VEC_INIT_V4HI);
30752
30753 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
30754 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
30755 IX86_BUILTIN_VEC_INIT_V8QI);
30756
30757 /* Access to the vec_extract patterns. */
30758 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
30759 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
30760 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
30761 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
30762 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
30763 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
30764 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
30765 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
30766 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
30767 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
30768
30769 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30770 /* As it uses V4HImode, we have to require -mmmx too. */
30771 | OPTION_MASK_ISA_MMX,
30772 "__builtin_ia32_vec_ext_v4hi",
30773 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
30774
30775 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
30776 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
30777
30778 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
30779 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
30780
30781 /* Access to the vec_set patterns. */
30782 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
30783 "__builtin_ia32_vec_set_v2di",
30784 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
30785
30786 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
30787 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
30788
30789 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
30790 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
30791
30792 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
30793 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
30794
30795 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
30796 /* As it uses V4HImode, we have to require -mmmx too. */
30797 | OPTION_MASK_ISA_MMX,
30798 "__builtin_ia32_vec_set_v4hi",
30799 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
30800
30801 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
30802 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
30803
30804 /* RDSEED */
30805 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
30806 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
30807 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
30808 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
30809 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
30810 "__builtin_ia32_rdseed_di_step",
30811 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
30812
30813 /* ADCX */
30814 def_builtin (0, "__builtin_ia32_addcarryx_u32",
30815 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
30816 def_builtin (OPTION_MASK_ISA_64BIT,
30817 "__builtin_ia32_addcarryx_u64",
30818 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30819 IX86_BUILTIN_ADDCARRYX64);
30820
30821 /* SBB */
30822 def_builtin (0, "__builtin_ia32_sbb_u32",
30823 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_SBB32);
30824 def_builtin (OPTION_MASK_ISA_64BIT,
30825 "__builtin_ia32_sbb_u64",
30826 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
30827 IX86_BUILTIN_SBB64);
30828
30829 /* Read/write FLAGS. */
30830 def_builtin (0, "__builtin_ia32_readeflags_u32",
30831 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30832 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_readeflags_u64",
30833 UINT64_FTYPE_VOID, IX86_BUILTIN_READ_FLAGS);
30834 def_builtin (0, "__builtin_ia32_writeeflags_u32",
30835 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_WRITE_FLAGS);
30836 def_builtin (OPTION_MASK_ISA_64BIT, "__builtin_ia32_writeeflags_u64",
30837 VOID_FTYPE_UINT64, IX86_BUILTIN_WRITE_FLAGS);
30838
30839 /* CLFLUSHOPT. */
30840 def_builtin (OPTION_MASK_ISA_CLFLUSHOPT, "__builtin_ia32_clflushopt",
30841 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSHOPT);
30842
30843 /* CLWB. */
30844 def_builtin (OPTION_MASK_ISA_CLWB, "__builtin_ia32_clwb",
30845 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLWB);
30846
30847 /* MONITORX and MWAITX. */
30848 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_monitorx",
30849 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITORX);
30850 def_builtin (OPTION_MASK_ISA_MWAITX, "__builtin_ia32_mwaitx",
30851 VOID_FTYPE_UNSIGNED_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAITX);
30852
30853 /* CLZERO. */
30854 def_builtin (OPTION_MASK_ISA_CLZERO, "__builtin_ia32_clzero",
30855 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLZERO);
30856
30857 /* Add FMA4 multi-arg instructions. */
30858 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
30859 {
30860 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MULTI_ARG_FIRST, i);
30861 if (d->name == 0)
30862 continue;
30863
30864 ftype = (enum ix86_builtin_func_type) d->flag;
30865 def_builtin_const (d->mask, d->name, ftype, d->code);
30866 }
30867 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MULTI_ARG_LAST,
30868 IX86_BUILTIN__BDESC_MULTI_ARG_FIRST,
30869 ARRAY_SIZE (bdesc_multi_arg) - 1);
30870
30871 /* Add CET intrinsics. */
30872 for (i = 0, d = bdesc_cet; i < ARRAY_SIZE (bdesc_cet); i++, d++)
30873 {
30874 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_FIRST, i);
30875 if (d->name == 0)
30876 continue;
30877
30878 ftype = (enum ix86_builtin_func_type) d->flag;
30879 def_builtin2 (d->mask, d->name, ftype, d->code);
30880 }
30881 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_LAST,
30882 IX86_BUILTIN__BDESC_CET_FIRST,
30883 ARRAY_SIZE (bdesc_cet) - 1);
30884
30885 for (i = 0, d = bdesc_cet_rdssp;
30886 i < ARRAY_SIZE (bdesc_cet_rdssp);
30887 i++, d++)
30888 {
30889 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_CET_NORMAL_FIRST, i);
30890 if (d->name == 0)
30891 continue;
30892
30893 ftype = (enum ix86_builtin_func_type) d->flag;
30894 def_builtin2 (d->mask, d->name, ftype, d->code);
30895 }
30896 BDESC_VERIFYS (IX86_BUILTIN__BDESC_CET_NORMAL_LAST,
30897 IX86_BUILTIN__BDESC_CET_NORMAL_FIRST,
30898 ARRAY_SIZE (bdesc_cet_rdssp) - 1);
30899 }
30900
30901 static void
30902 ix86_init_mpx_builtins ()
30903 {
30904 const struct builtin_description * d;
30905 enum ix86_builtin_func_type ftype;
30906 tree decl;
30907 size_t i;
30908
30909 for (i = 0, d = bdesc_mpx;
30910 i < ARRAY_SIZE (bdesc_mpx);
30911 i++, d++)
30912 {
30913 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_FIRST, i);
30914 if (d->name == 0)
30915 continue;
30916
30917 ftype = (enum ix86_builtin_func_type) d->flag;
30918 decl = def_builtin2 (d->mask, d->name, ftype, d->code);
30919
30920 /* Without the leaf and nothrow flags for MPX builtins,
30921 abnormal edges may follow their calls when setjmp is
30922 present in the function.  Since there may be many MPX
30923 builtin calls, this causes lots of useless edges and
30924 enormous PHI nodes.  To avoid this we mark MPX builtins
30925 as leaf and nothrow. */
30926 if (decl)
30927 {
30928 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30929 NULL_TREE);
30930 TREE_NOTHROW (decl) = 1;
30931 }
30932 else
30933 {
30934 ix86_builtins_isa[(int)d->code].leaf_p = true;
30935 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30936 }
30937 }
30938 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_LAST,
30939 IX86_BUILTIN__BDESC_MPX_FIRST,
30940 ARRAY_SIZE (bdesc_mpx) - 1);
30941
30942 for (i = 0, d = bdesc_mpx_const;
30943 i < ARRAY_SIZE (bdesc_mpx_const);
30944 i++, d++)
30945 {
30946 BDESC_VERIFY (d->code, IX86_BUILTIN__BDESC_MPX_CONST_FIRST, i);
30947 if (d->name == 0)
30948 continue;
30949
30950 ftype = (enum ix86_builtin_func_type) d->flag;
30951 decl = def_builtin_const2 (d->mask, d->name, ftype, d->code);
30952
30953 if (decl)
30954 {
30955 DECL_ATTRIBUTES (decl) = build_tree_list (get_identifier ("leaf"),
30956 NULL_TREE);
30957 TREE_NOTHROW (decl) = 1;
30958 }
30959 else
30960 {
30961 ix86_builtins_isa[(int)d->code].leaf_p = true;
30962 ix86_builtins_isa[(int)d->code].nothrow_p = true;
30963 }
30964 }
30965 BDESC_VERIFYS (IX86_BUILTIN__BDESC_MPX_CONST_LAST,
30966 IX86_BUILTIN__BDESC_MPX_CONST_FIRST,
30967 ARRAY_SIZE (bdesc_mpx_const) - 1);
30968 }
30969 #undef BDESC_VERIFY
30970 #undef BDESC_VERIFYS
30971
30972 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
30973 to return a pointer to VERSION_DECL if the outcome of the expression
30974 formed by PREDICATE_CHAIN is true. This function will be called during
30975 version dispatch to decide which function version to execute. It returns
30976 the basic block at the end, to which more conditions can be added. */
30977
30978 static basic_block
30979 add_condition_to_bb (tree function_decl, tree version_decl,
30980 tree predicate_chain, basic_block new_bb)
30981 {
30982 gimple *return_stmt;
30983 tree convert_expr, result_var;
30984 gimple *convert_stmt;
30985 gimple *call_cond_stmt;
30986 gimple *if_else_stmt;
30987
30988 basic_block bb1, bb2, bb3;
30989 edge e12, e23;
30990
30991 tree cond_var, and_expr_var = NULL_TREE;
30992 gimple_seq gseq;
30993
30994 tree predicate_decl, predicate_arg;
30995
30996 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
30997
30998 gcc_assert (new_bb != NULL);
30999 gseq = bb_seq (new_bb);
31000
31001
31002 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
31003 build_fold_addr_expr (version_decl));
31004 result_var = create_tmp_var (ptr_type_node);
31005 convert_stmt = gimple_build_assign (result_var, convert_expr);
31006 return_stmt = gimple_build_return (result_var);
31007
31008 if (predicate_chain == NULL_TREE)
31009 {
31010 gimple_seq_add_stmt (&gseq, convert_stmt);
31011 gimple_seq_add_stmt (&gseq, return_stmt);
31012 set_bb_seq (new_bb, gseq);
31013 gimple_set_bb (convert_stmt, new_bb);
31014 gimple_set_bb (return_stmt, new_bb);
31015 pop_cfun ();
31016 return new_bb;
31017 }
31018
31019 while (predicate_chain != NULL)
31020 {
31021 cond_var = create_tmp_var (integer_type_node);
31022 predicate_decl = TREE_PURPOSE (predicate_chain);
31023 predicate_arg = TREE_VALUE (predicate_chain);
31024 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
31025 gimple_call_set_lhs (call_cond_stmt, cond_var);
31026
31027 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
31028 gimple_set_bb (call_cond_stmt, new_bb);
31029 gimple_seq_add_stmt (&gseq, call_cond_stmt);
31030
31031 predicate_chain = TREE_CHAIN (predicate_chain);
31032
31033 if (and_expr_var == NULL)
31034 and_expr_var = cond_var;
31035 else
31036 {
31037 gimple *assign_stmt;
31038 /* Use MIN_EXPR to check whether any of the conditions is zero:
31039 and_expr_var = MIN_EXPR <cond_var, and_expr_var>. */
31040 assign_stmt = gimple_build_assign (and_expr_var,
31041 build2 (MIN_EXPR, integer_type_node,
31042 cond_var, and_expr_var));
31043
31044 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
31045 gimple_set_bb (assign_stmt, new_bb);
31046 gimple_seq_add_stmt (&gseq, assign_stmt);
31047 }
31048 }
31049
31050 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
31051 integer_zero_node,
31052 NULL_TREE, NULL_TREE);
31053 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
31054 gimple_set_bb (if_else_stmt, new_bb);
31055 gimple_seq_add_stmt (&gseq, if_else_stmt);
31056
31057 gimple_seq_add_stmt (&gseq, convert_stmt);
31058 gimple_seq_add_stmt (&gseq, return_stmt);
31059 set_bb_seq (new_bb, gseq);
31060
31061 bb1 = new_bb;
31062 e12 = split_block (bb1, if_else_stmt);
31063 bb2 = e12->dest;
31064 e12->flags &= ~EDGE_FALLTHRU;
31065 e12->flags |= EDGE_TRUE_VALUE;
31066
31067 e23 = split_block (bb2, return_stmt);
31068
31069 gimple_set_bb (convert_stmt, bb2);
31070 gimple_set_bb (return_stmt, bb2);
31071
31072 bb3 = e23->dest;
31073 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
31074
31075 remove_edge (e23);
31076 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
31077
31078 pop_cfun ();
31079
31080 return bb3;
31081 }
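
/* Illustrative sketch (an addition, not from the original source; the
   foo* names are hypothetical): after the resolver has been populated by
   repeated calls to this function, its body is roughly equivalent to

     void *foo_resolver (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("haswell"))
         return (void *) foo_haswell;
       if (__builtin_cpu_supports ("avx2"))
         return (void *) foo_avx2;
       return (void *) foo_default;
     }

   with each call to this function appending one predicate test and its
   fall-through edge; the __builtin_cpu_init call itself is emitted by
   dispatch_function_versions.  */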
31082
31083 /* This parses the attribute arguments to target in DECL and determines
31084 the right builtin to use to match the platform specification.
31085 It returns the priority value for this version decl. If PREDICATE_LIST
31086 is not NULL, it stores the list of cpu features that need to be checked
31087 before dispatching this function. */
31088
31089 static unsigned int
31090 get_builtin_code_for_version (tree decl, tree *predicate_list)
31091 {
31092 tree attrs;
31093 struct cl_target_option cur_target;
31094 tree target_node;
31095 struct cl_target_option *new_target;
31096 const char *arg_str = NULL;
31097 const char *attrs_str = NULL;
31098 char *tok_str = NULL;
31099 char *token;
31100
31101 /* Priority of i386 features, greater value is higher priority. This is
31102 used to decide the order in which function dispatch must happen. For
31103 instance, a version specialized for SSE4.2 should be checked for dispatch
31104 before a version for SSE3, as SSE4.2 implies SSE3. */
31105 enum feature_priority
31106 {
31107 P_ZERO = 0,
31108 P_MMX,
31109 P_SSE,
31110 P_SSE2,
31111 P_SSE3,
31112 P_SSSE3,
31113 P_PROC_SSSE3,
31114 P_SSE4_A,
31115 P_PROC_SSE4_A,
31116 P_SSE4_1,
31117 P_SSE4_2,
31118 P_PROC_SSE4_2,
31119 P_POPCNT,
31120 P_AES,
31121 P_PCLMUL,
31122 P_AVX,
31123 P_PROC_AVX,
31124 P_BMI,
31125 P_PROC_BMI,
31126 P_FMA4,
31127 P_XOP,
31128 P_PROC_XOP,
31129 P_FMA,
31130 P_PROC_FMA,
31131 P_BMI2,
31132 P_AVX2,
31133 P_PROC_AVX2,
31134 P_AVX512F,
31135 P_PROC_AVX512F
31136 };
31137
31138 enum feature_priority priority = P_ZERO;
31139
31140 /* These are the target attribute strings for which a dispatcher is
31141 available, from fold_builtin_cpu. */
31142
31143 static struct _feature_list
31144 {
31145 const char *const name;
31146 const enum feature_priority priority;
31147 }
31148 const feature_list[] =
31149 {
31150 {"mmx", P_MMX},
31151 {"sse", P_SSE},
31152 {"sse2", P_SSE2},
31153 {"sse3", P_SSE3},
31154 {"sse4a", P_SSE4_A},
31155 {"ssse3", P_SSSE3},
31156 {"sse4.1", P_SSE4_1},
31157 {"sse4.2", P_SSE4_2},
31158 {"popcnt", P_POPCNT},
31159 {"aes", P_AES},
31160 {"pclmul", P_PCLMUL},
31161 {"avx", P_AVX},
31162 {"bmi", P_BMI},
31163 {"fma4", P_FMA4},
31164 {"xop", P_XOP},
31165 {"fma", P_FMA},
31166 {"bmi2", P_BMI2},
31167 {"avx2", P_AVX2},
31168 {"avx512f", P_AVX512F}
31169 };
31170
31171
31172 static unsigned int NUM_FEATURES
31173 = sizeof (feature_list) / sizeof (struct _feature_list);
31174
31175 unsigned int i;
31176
31177 tree predicate_chain = NULL_TREE;
31178 tree predicate_decl, predicate_arg;
31179
31180 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31181 gcc_assert (attrs != NULL);
31182
31183 attrs = TREE_VALUE (TREE_VALUE (attrs));
31184
31185 gcc_assert (TREE_CODE (attrs) == STRING_CST);
31186 attrs_str = TREE_STRING_POINTER (attrs);
31187
31188 /* Return priority zero for default function. */
31189 if (strcmp (attrs_str, "default") == 0)
31190 return 0;
31191
31192 /* Handle arch= if specified. For priority, set it to be 1 more than
31193 the best instruction set the processor can handle. For instance, if
31194 there is a version for atom and a version for ssse3 (the highest ISA
31195 priority for atom), the atom version must be checked for dispatch
31196 before the ssse3 version. */
31197 if (strstr (attrs_str, "arch=") != NULL)
31198 {
31199 cl_target_option_save (&cur_target, &global_options);
31200 target_node = ix86_valid_target_attribute_tree (attrs, &global_options,
31201 &global_options_set);
31202
31203 gcc_assert (target_node);
31204 new_target = TREE_TARGET_OPTION (target_node);
31205 gcc_assert (new_target);
31206
31207 if (new_target->arch_specified && new_target->arch > 0)
31208 {
31209 switch (new_target->arch)
31210 {
31211 case PROCESSOR_CORE2:
31212 arg_str = "core2";
31213 priority = P_PROC_SSSE3;
31214 break;
31215 case PROCESSOR_NEHALEM:
31216 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AES)
31217 {
31218 arg_str = "westmere";
31219 priority = P_AES;
31220 }
31221 else
31222 {
31223 /* We translate "arch=corei7" and "arch=nehalem" to
31224 "corei7" so that it will be mapped to M_INTEL_COREI7
31225 as cpu type to cover all M_INTEL_COREI7_XXXs. */
31226 arg_str = "corei7";
31227 priority = P_PROC_SSE4_2;
31228 }
31229 break;
31230 case PROCESSOR_SANDYBRIDGE:
31231 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_F16C)
31232 arg_str = "ivybridge";
31233 else
31234 arg_str = "sandybridge";
31235 priority = P_PROC_AVX;
31236 break;
31237 case PROCESSOR_HASWELL:
31238 case PROCESSOR_SKYLAKE_AVX512:
31239 if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VBMI)
31240 arg_str = "cannonlake";
31241 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_AVX512VL)
31242 arg_str = "skylake-avx512";
31243 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_XSAVES)
31244 arg_str = "skylake";
31245 else if (new_target->x_ix86_isa_flags & OPTION_MASK_ISA_ADX)
31246 arg_str = "broadwell";
31247 else
31248 arg_str = "haswell";
31249 priority = P_PROC_AVX2;
31250 break;
31251 case PROCESSOR_BONNELL:
31252 arg_str = "bonnell";
31253 priority = P_PROC_SSSE3;
31254 break;
31255 case PROCESSOR_KNL:
31256 arg_str = "knl";
31257 priority = P_PROC_AVX512F;
31258 break;
31259 case PROCESSOR_KNM:
31260 arg_str = "knm";
31261 priority = P_PROC_AVX512F;
31262 break;
31263 case PROCESSOR_SILVERMONT:
31264 arg_str = "silvermont";
31265 priority = P_PROC_SSE4_2;
31266 break;
31267 case PROCESSOR_AMDFAM10:
31268 arg_str = "amdfam10h";
31269 priority = P_PROC_SSE4_A;
31270 break;
31271 case PROCESSOR_BTVER1:
31272 arg_str = "btver1";
31273 priority = P_PROC_SSE4_A;
31274 break;
31275 case PROCESSOR_BTVER2:
31276 arg_str = "btver2";
31277 priority = P_PROC_BMI;
31278 break;
31279 case PROCESSOR_BDVER1:
31280 arg_str = "bdver1";
31281 priority = P_PROC_XOP;
31282 break;
31283 case PROCESSOR_BDVER2:
31284 arg_str = "bdver2";
31285 priority = P_PROC_FMA;
31286 break;
31287 case PROCESSOR_BDVER3:
31288 arg_str = "bdver3";
31289 priority = P_PROC_FMA;
31290 break;
31291 case PROCESSOR_BDVER4:
31292 arg_str = "bdver4";
31293 priority = P_PROC_AVX2;
31294 break;
31295 case PROCESSOR_ZNVER1:
31296 arg_str = "znver1";
31297 priority = P_PROC_AVX2;
31298 break;
31299 }
31300 }
31301
31302 cl_target_option_restore (&global_options, &cur_target);
31303
31304 if (predicate_list && arg_str == NULL)
31305 {
31306 error_at (DECL_SOURCE_LOCATION (decl),
31307 "No dispatcher found for the versioning attributes");
31308 return 0;
31309 }
31310
31311 if (predicate_list)
31312 {
31313 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
31314 /* For a C string literal the length includes the trailing NUL. */
31315 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
31316 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31317 predicate_chain);
31318 }
31319 }
31320
31321 /* Process feature name. */
31322 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
31323 strcpy (tok_str, attrs_str);
31324 token = strtok (tok_str, ",");
31325 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
31326
31327 while (token != NULL)
31328 {
31329 /* Do not process "arch=" */
31330 if (strncmp (token, "arch=", 5) == 0)
31331 {
31332 token = strtok (NULL, ",");
31333 continue;
31334 }
31335 for (i = 0; i < NUM_FEATURES; ++i)
31336 {
31337 if (strcmp (token, feature_list[i].name) == 0)
31338 {
31339 if (predicate_list)
31340 {
31341 predicate_arg = build_string_literal (
31342 strlen (feature_list[i].name) + 1,
31343 feature_list[i].name);
31344 predicate_chain = tree_cons (predicate_decl, predicate_arg,
31345 predicate_chain);
31346 }
31347 /* Find the maximum priority feature. */
31348 if (feature_list[i].priority > priority)
31349 priority = feature_list[i].priority;
31350
31351 break;
31352 }
31353 }
31354 if (predicate_list && i == NUM_FEATURES)
31355 {
31356 error_at (DECL_SOURCE_LOCATION (decl),
31357 "No dispatcher found for %s", token);
31358 return 0;
31359 }
31360 token = strtok (NULL, ",");
31361 }
31362 free (tok_str);
31363
31364 if (predicate_list && predicate_chain == NULL_TREE)
31365 {
31366 error_at (DECL_SOURCE_LOCATION (decl),
31367 "No dispatcher found for the versioning attributes : %s",
31368 attrs_str);
31369 return 0;
31370 }
31371 else if (predicate_list)
31372 {
31373 predicate_chain = nreverse (predicate_chain);
31374 *predicate_list = predicate_chain;
31375 }
31376
31377 return priority;
31378 }
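
/* Worked example (an illustrative addition): for a version declared as

     __attribute__ ((target ("arch=haswell,avx2")))
     int foo (void);

   this function returns P_PROC_AVX2 (the priority assigned to the arch=
   value, which is higher than P_AVX2), and when PREDICATE_LIST is
   non-NULL it stores a predicate chain equivalent to checking
   __builtin_cpu_is ("haswell") and __builtin_cpu_supports ("avx2").  */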
31379
31380 /* This compares the priority of target features in function DECL1
31381 and DECL2.  It returns a positive value if DECL1 has higher priority,
31382 a negative value if DECL2 has higher priority, and 0 if they are the
31383 same. */
31384
31385 static int
31386 ix86_compare_version_priority (tree decl1, tree decl2)
31387 {
31388 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
31389 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
31390
31391 return (int)priority1 - (int)priority2;
31392 }
31393
31394 /* V1 and V2 point to function versions with different priorities
31395 based on the target ISA. This function compares their priorities. */
31396
31397 static int
31398 feature_compare (const void *v1, const void *v2)
31399 {
31400 typedef struct _function_version_info
31401 {
31402 tree version_decl;
31403 tree predicate_chain;
31404 unsigned int dispatch_priority;
31405 } function_version_info;
31406
31407 const function_version_info c1 = *(const function_version_info *)v1;
31408 const function_version_info c2 = *(const function_version_info *)v2;
31409 return (c2.dispatch_priority - c1.dispatch_priority);
31410 }
31411
31412 /* This function generates the dispatch function for
31413 multi-versioned functions. DISPATCH_DECL is the function which will
31414 contain the dispatch logic. FNDECLS are the function choices for
31415 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
31416 in DISPATCH_DECL in which the dispatch code is generated. */
31417
31418 static int
31419 dispatch_function_versions (tree dispatch_decl,
31420 void *fndecls_p,
31421 basic_block *empty_bb)
31422 {
31423 tree default_decl;
31424 gimple *ifunc_cpu_init_stmt;
31425 gimple_seq gseq;
31426 int ix;
31427 tree ele;
31428 vec<tree> *fndecls;
31429 unsigned int num_versions = 0;
31430 unsigned int actual_versions = 0;
31431 unsigned int i;
31432
31433 struct _function_version_info
31434 {
31435 tree version_decl;
31436 tree predicate_chain;
31437 unsigned int dispatch_priority;
31438 } *function_version_info;
31439
31440 gcc_assert (dispatch_decl != NULL
31441 && fndecls_p != NULL
31442 && empty_bb != NULL);
31443
31444 /* fndecls_p is actually a vector. */
31445 fndecls = static_cast<vec<tree> *> (fndecls_p);
31446
31447 /* At least one more version other than the default. */
31448 num_versions = fndecls->length ();
31449 gcc_assert (num_versions >= 2);
31450
31451 function_version_info = (struct _function_version_info *)
31452 XNEWVEC (struct _function_version_info, (num_versions - 1));
31453
31454 /* The first version in the vector is the default decl. */
31455 default_decl = (*fndecls)[0];
31456
31457 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
31458
31459 gseq = bb_seq (*empty_bb);
31460 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
31461 constructors, so explicitly call __builtin_cpu_init here. */
31462 ifunc_cpu_init_stmt = gimple_build_call_vec (
31463 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
31464 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
31465 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
31466 set_bb_seq (*empty_bb, gseq);
31467
31468 pop_cfun ();
31469
31470
31471 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
31472 {
31473 tree version_decl = ele;
31474 tree predicate_chain = NULL_TREE;
31475 unsigned int priority;
31476 /* Get attribute string, parse it and find the right predicate decl.
31477 The predicate function could be a lengthy combination of many
31478 features, like arch-type and various isa-variants. */
31479 priority = get_builtin_code_for_version (version_decl,
31480 &predicate_chain);
31481
31482 if (predicate_chain == NULL_TREE)
31483 continue;
31484
31485 function_version_info [actual_versions].version_decl = version_decl;
31486 function_version_info [actual_versions].predicate_chain
31487 = predicate_chain;
31488 function_version_info [actual_versions].dispatch_priority = priority;
31489 actual_versions++;
31490 }
31491
31492 /* Sort the versions according to descending order of dispatch priority. The
31493 priority is based on the ISA. This is not a perfect solution. There
31494 could still be ambiguity. If more than one function version is suitable
31495 to execute, which one should be dispatched?  In the future, allow the user
31496 to specify a dispatch priority next to the version. */
31497 qsort (function_version_info, actual_versions,
31498 sizeof (struct _function_version_info), feature_compare);
31499
31500 for (i = 0; i < actual_versions; ++i)
31501 *empty_bb = add_condition_to_bb (dispatch_decl,
31502 function_version_info[i].version_decl,
31503 function_version_info[i].predicate_chain,
31504 *empty_bb);
31505
31506 /* Dispatch the default version at the end. */
31507 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
31508 NULL, *empty_bb);
31509
31510 free (function_version_info);
31511 return 0;
31512 }
31513
31514 /* This function changes the assembler name for functions that are
31515 versions. If DECL is a function version and has a "target"
31516 attribute, it appends the attribute string to its assembler name. */
31517
31518 static tree
31519 ix86_mangle_function_version_assembler_name (tree decl, tree id)
31520 {
31521 tree version_attr;
31522 const char *orig_name, *version_string;
31523 char *attr_str, *assembler_name;
31524
31525 if (DECL_DECLARED_INLINE_P (decl)
31526 && lookup_attribute ("gnu_inline",
31527 DECL_ATTRIBUTES (decl)))
31528 error_at (DECL_SOURCE_LOCATION (decl),
31529 "Function versions cannot be marked as gnu_inline,"
31530 " bodies have to be generated");
31531
31532 if (DECL_VIRTUAL_P (decl)
31533 || DECL_VINDEX (decl))
31534 sorry ("Virtual function multiversioning not supported");
31535
31536 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
31537
31538 /* target attribute string cannot be NULL. */
31539 gcc_assert (version_attr != NULL_TREE);
31540
31541 orig_name = IDENTIFIER_POINTER (id);
31542 version_string
31543 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
31544
31545 if (strcmp (version_string, "default") == 0)
31546 return id;
31547
31548 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
31549 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
31550
31551 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
31552
31553 /* Allow assembler name to be modified if already set. */
31554 if (DECL_ASSEMBLER_NAME_SET_P (decl))
31555 SET_DECL_RTL (decl, NULL);
31556
31557 tree ret = get_identifier (assembler_name);
31558 XDELETEVEC (attr_str);
31559 XDELETEVEC (assembler_name);
31560 return ret;
31561 }
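
/* Example (added for clarity): a non-default version declared as

     __attribute__ ((target ("sse4.2")))
     int foo (void);

   gets the assembler name "foo.sse4.2" here, while the default version
   keeps its original name.  With several features in the attribute, the
   string is first normalized by sorted_attr_string before being
   appended.  */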
31562
31563
31564 static tree
31565 ix86_mangle_decl_assembler_name (tree decl, tree id)
31566 {
31567 /* For function version, add the target suffix to the assembler name. */
31568 if (TREE_CODE (decl) == FUNCTION_DECL
31569 && DECL_FUNCTION_VERSIONED (decl))
31570 id = ix86_mangle_function_version_assembler_name (decl, id);
31571 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
31572 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
31573 #endif
31574
31575 return id;
31576 }
31577
31578 /* Make a dispatcher declaration for the multi-versioned function DECL.
31579 Calls to DECL will be replaced with calls to the dispatcher
31580 by the front-end. Returns the decl of the dispatcher function. */
31581
31582 static tree
31583 ix86_get_function_versions_dispatcher (void *decl)
31584 {
31585 tree fn = (tree) decl;
31586 struct cgraph_node *node = NULL;
31587 struct cgraph_node *default_node = NULL;
31588 struct cgraph_function_version_info *node_v = NULL;
31589 struct cgraph_function_version_info *first_v = NULL;
31590
31591 tree dispatch_decl = NULL;
31592
31593 struct cgraph_function_version_info *default_version_info = NULL;
31594
31595 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
31596
31597 node = cgraph_node::get (fn);
31598 gcc_assert (node != NULL);
31599
31600 node_v = node->function_version ();
31601 gcc_assert (node_v != NULL);
31602
31603 if (node_v->dispatcher_resolver != NULL)
31604 return node_v->dispatcher_resolver;
31605
31606 /* Find the default version and make it the first node. */
31607 first_v = node_v;
31608 /* Go to the beginning of the chain. */
31609 while (first_v->prev != NULL)
31610 first_v = first_v->prev;
31611 default_version_info = first_v;
31612 while (default_version_info != NULL)
31613 {
31614 if (is_function_default_version
31615 (default_version_info->this_node->decl))
31616 break;
31617 default_version_info = default_version_info->next;
31618 }
31619
31620 /* If there is no default node, just return NULL. */
31621 if (default_version_info == NULL)
31622 return NULL;
31623
31624 /* Make default info the first node. */
31625 if (first_v != default_version_info)
31626 {
31627 default_version_info->prev->next = default_version_info->next;
31628 if (default_version_info->next)
31629 default_version_info->next->prev = default_version_info->prev;
31630 first_v->prev = default_version_info;
31631 default_version_info->next = first_v;
31632 default_version_info->prev = NULL;
31633 }
31634
31635 default_node = default_version_info->this_node;
31636
31637 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
31638 if (targetm.has_ifunc_p ())
31639 {
31640 struct cgraph_function_version_info *it_v = NULL;
31641 struct cgraph_node *dispatcher_node = NULL;
31642 struct cgraph_function_version_info *dispatcher_version_info = NULL;
31643
31644 /* Right now, the dispatching is done via ifunc. */
31645 dispatch_decl = make_dispatcher_decl (default_node->decl);
31646
31647 dispatcher_node = cgraph_node::get_create (dispatch_decl);
31648 gcc_assert (dispatcher_node != NULL);
31649 dispatcher_node->dispatcher_function = 1;
31650 dispatcher_version_info
31651 = dispatcher_node->insert_new_function_version ();
31652 dispatcher_version_info->next = default_version_info;
31653 dispatcher_node->definition = 1;
31654
31655 /* Set the dispatcher for all the versions. */
31656 it_v = default_version_info;
31657 while (it_v != NULL)
31658 {
31659 it_v->dispatcher_resolver = dispatch_decl;
31660 it_v = it_v->next;
31661 }
31662 }
31663 else
31664 #endif
31665 {
31666 error_at (DECL_SOURCE_LOCATION (default_node->decl),
31667 "multiversioning needs ifunc which is not supported "
31668 "on this target");
31669 }
31670
31671 return dispatch_decl;
31672 }
31673
31674 /* Make the resolver function decl to dispatch the versions of
31675 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is the
31676 ifunc alias that will point to the created resolver. Create an
31677 empty basic block in the resolver and store the pointer in
31678 EMPTY_BB. Return the decl of the resolver function. */
31679
31680 static tree
31681 make_resolver_func (const tree default_decl,
31682 const tree ifunc_alias_decl,
31683 basic_block *empty_bb)
31684 {
31685 char *resolver_name;
31686 tree decl, type, decl_name, t;
31687
31688 /* IFUNCs have to be globally visible.  So, if the default_decl is
31689 not, then the name of the IFUNC should be made unique. */
31690 if (TREE_PUBLIC (default_decl) == 0)
31691 {
31692 char *ifunc_name = make_unique_name (default_decl, "ifunc", true);
31693 symtab->change_decl_assembler_name (ifunc_alias_decl,
31694 get_identifier (ifunc_name));
31695 XDELETEVEC (ifunc_name);
31696 }
31697
31698 resolver_name = make_unique_name (default_decl, "resolver", false);
31699
31700 /* The resolver function should return a (void *). */
31701 type = build_function_type_list (ptr_type_node, NULL_TREE);
31702
31703 decl = build_fn_decl (resolver_name, type);
31704 decl_name = get_identifier (resolver_name);
31705 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
31706
31707 DECL_NAME (decl) = decl_name;
31708 TREE_USED (decl) = 1;
31709 DECL_ARTIFICIAL (decl) = 1;
31710 DECL_IGNORED_P (decl) = 1;
31711 TREE_PUBLIC (decl) = 0;
31712 DECL_UNINLINABLE (decl) = 1;
31713
31714 /* Resolver is not external, body is generated. */
31715 DECL_EXTERNAL (decl) = 0;
31716 DECL_EXTERNAL (ifunc_alias_decl) = 0;
31717
31718 DECL_CONTEXT (decl) = NULL_TREE;
31719 DECL_INITIAL (decl) = make_node (BLOCK);
31720 DECL_STATIC_CONSTRUCTOR (decl) = 0;
31721
31722 if (DECL_COMDAT_GROUP (default_decl)
31723 || TREE_PUBLIC (default_decl))
31724 {
31725 /* In this case, each translation unit with a call to this
31726 versioned function will put out a resolver. Ensure it
31727 is comdat to keep just one copy. */
31728 DECL_COMDAT (decl) = 1;
31729 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
31730 }
31731 /* Build result decl and add to function_decl. */
31732 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
31733 DECL_ARTIFICIAL (t) = 1;
31734 DECL_IGNORED_P (t) = 1;
31735 DECL_RESULT (decl) = t;
31736
31737 gimplify_function_tree (decl);
31738 push_cfun (DECL_STRUCT_FUNCTION (decl));
31739 *empty_bb = init_lowered_empty_function (decl, false,
31740 profile_count::uninitialized ());
31741
31742 cgraph_node::add_new_function (decl, true);
31743 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
31744
31745 pop_cfun ();
31746
31747 gcc_assert (ifunc_alias_decl != NULL);
31748 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
31749 DECL_ATTRIBUTES (ifunc_alias_decl)
31750 = make_attribute ("ifunc", resolver_name,
31751 DECL_ATTRIBUTES (ifunc_alias_decl));
31752
31753 /* Create the alias for dispatch to resolver here. */
31754 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
31755 XDELETEVEC (resolver_name);
31756 return decl;
31757 }
31758
31759 /* Generate the dispatching code body to dispatch multi-versioned function
31760 DECL. The target hook is called to process the "target" attributes and
31761 provide the code to dispatch the right function at run-time. NODE points
31762 to the dispatcher decl whose body will be created. */
31763
31764 static tree
31765 ix86_generate_version_dispatcher_body (void *node_p)
31766 {
31767 tree resolver_decl;
31768 basic_block empty_bb;
31769 tree default_ver_decl;
31770 struct cgraph_node *versn;
31771 struct cgraph_node *node;
31772
31773 struct cgraph_function_version_info *node_version_info = NULL;
31774 struct cgraph_function_version_info *versn_info = NULL;
31775
31776 node = (cgraph_node *)node_p;
31777
31778 node_version_info = node->function_version ();
31779 gcc_assert (node->dispatcher_function
31780 && node_version_info != NULL);
31781
31782 if (node_version_info->dispatcher_resolver)
31783 return node_version_info->dispatcher_resolver;
31784
31785 /* The first version in the chain corresponds to the default version. */
31786 default_ver_decl = node_version_info->next->this_node->decl;
31787
31788 /* node is going to be an alias, so remove the finalized bit. */
31789 node->definition = false;
31790
31791 resolver_decl = make_resolver_func (default_ver_decl,
31792 node->decl, &empty_bb);
31793
31794 node_version_info->dispatcher_resolver = resolver_decl;
31795
31796 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
31797
31798 auto_vec<tree, 2> fn_ver_vec;
31799
31800 for (versn_info = node_version_info->next; versn_info;
31801 versn_info = versn_info->next)
31802 {
31803 versn = versn_info->this_node;
31804 /* Check for virtual functions here again, as by this time it should
31805 have been determined if this function needs a vtable index or
31806 not. This happens for methods in derived classes that override
31807 virtual methods in base classes but are not explicitly marked as
31808 virtual. */
31809 if (DECL_VINDEX (versn->decl))
31810 sorry ("Virtual function multiversioning not supported");
31811
31812 fn_ver_vec.safe_push (versn->decl);
31813 }
31814
31815 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
31816 cgraph_edge::rebuild_edges ();
31817 pop_cfun ();
31818 return resolver_decl;
31819 }
31820 /* This builds the processor_model struct type defined in
31821 libgcc/config/i386/cpuinfo.c */
31822
31823 static tree
31824 build_processor_model_struct (void)
31825 {
31826 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
31827 "__cpu_features"};
31828 tree field = NULL_TREE, field_chain = NULL_TREE;
31829 int i;
31830 tree type = make_node (RECORD_TYPE);
31831
31832 /* The first 3 fields are unsigned int. */
31833 for (i = 0; i < 3; ++i)
31834 {
31835 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31836 get_identifier (field_name[i]), unsigned_type_node);
31837 if (field_chain != NULL_TREE)
31838 DECL_CHAIN (field) = field_chain;
31839 field_chain = field;
31840 }
31841
31842 /* The last field is an array of unsigned integers of size one. */
31843 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
31844 get_identifier (field_name[3]),
31845 build_array_type (unsigned_type_node,
31846 build_index_type (size_one_node)));
31847 if (field_chain != NULL_TREE)
31848 DECL_CHAIN (field) = field_chain;
31849 field_chain = field;
31850
31851 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
31852 return type;
31853 }
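
/* For reference (an added note): the layout built above is meant to
   mirror the structure defined in libgcc/config/i386/cpuinfo.c, roughly

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };

   which libgcc initializes in __cpu_indicator_init and exposes as the
   __cpu_model variable read by fold_builtin_cpu below.  */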
31854
31855 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
31856
31857 static tree
31858 make_var_decl (tree type, const char *name)
31859 {
31860 tree new_decl;
31861
31862 new_decl = build_decl (UNKNOWN_LOCATION,
31863 VAR_DECL,
31864 get_identifier(name),
31865 type);
31866
31867 DECL_EXTERNAL (new_decl) = 1;
31868 TREE_STATIC (new_decl) = 1;
31869 TREE_PUBLIC (new_decl) = 1;
31870 DECL_INITIAL (new_decl) = 0;
31871 DECL_ARTIFICIAL (new_decl) = 0;
31872 DECL_PRESERVE_P (new_decl) = 1;
31873
31874 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
31875 assemble_variable (new_decl, 0, 0, 0);
31876
31877 return new_decl;
31878 }
31879
31880 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
31881 into an integer defined in libgcc/config/i386/cpuinfo.c */
31882
31883 static tree
31884 fold_builtin_cpu (tree fndecl, tree *args)
31885 {
31886 unsigned int i;
31887 enum ix86_builtins fn_code = (enum ix86_builtins)
31888 DECL_FUNCTION_CODE (fndecl);
31889 tree param_string_cst = NULL;
31890
31891 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
31892 enum processor_features
31893 {
31894 F_CMOV = 0,
31895 F_MMX,
31896 F_POPCNT,
31897 F_SSE,
31898 F_SSE2,
31899 F_SSE3,
31900 F_SSSE3,
31901 F_SSE4_1,
31902 F_SSE4_2,
31903 F_AVX,
31904 F_AVX2,
31905 F_SSE4_A,
31906 F_FMA4,
31907 F_XOP,
31908 F_FMA,
31909 F_AVX512F,
31910 F_BMI,
31911 F_BMI2,
31912 F_AES,
31913 F_PCLMUL,
31914 F_AVX512VL,
31915 F_AVX512BW,
31916 F_AVX512DQ,
31917 F_AVX512CD,
31918 F_AVX512ER,
31919 F_AVX512PF,
31920 F_AVX512VBMI,
31921 F_AVX512IFMA,
31922 F_AVX5124VNNIW,
31923 F_AVX5124FMAPS,
31924 F_AVX512VPOPCNTDQ,
31925 F_MAX
31926 };
31927
31928 /* These are the values for vendor types and cpu types and subtypes
31929 in cpuinfo.c.  Cpu types and subtypes should have the
31930 corresponding start value subtracted from them. */
31931 enum processor_model
31932 {
31933 M_INTEL = 1,
31934 M_AMD,
31935 M_CPU_TYPE_START,
31936 M_INTEL_BONNELL,
31937 M_INTEL_CORE2,
31938 M_INTEL_COREI7,
31939 M_AMDFAM10H,
31940 M_AMDFAM15H,
31941 M_INTEL_SILVERMONT,
31942 M_INTEL_KNL,
31943 M_AMD_BTVER1,
31944 M_AMD_BTVER2,
31945 M_AMDFAM17H,
31946 M_INTEL_KNM,
31947 M_CPU_SUBTYPE_START,
31948 M_INTEL_COREI7_NEHALEM,
31949 M_INTEL_COREI7_WESTMERE,
31950 M_INTEL_COREI7_SANDYBRIDGE,
31951 M_AMDFAM10H_BARCELONA,
31952 M_AMDFAM10H_SHANGHAI,
31953 M_AMDFAM10H_ISTANBUL,
31954 M_AMDFAM15H_BDVER1,
31955 M_AMDFAM15H_BDVER2,
31956 M_AMDFAM15H_BDVER3,
31957 M_AMDFAM15H_BDVER4,
31958 M_AMDFAM17H_ZNVER1,
31959 M_INTEL_COREI7_IVYBRIDGE,
31960 M_INTEL_COREI7_HASWELL,
31961 M_INTEL_COREI7_BROADWELL,
31962 M_INTEL_COREI7_SKYLAKE,
31963 M_INTEL_COREI7_SKYLAKE_AVX512,
31964 M_INTEL_COREI7_CANNONLAKE
31965 };
31966
31967 static struct _arch_names_table
31968 {
31969 const char *const name;
31970 const enum processor_model model;
31971 }
31972 const arch_names_table[] =
31973 {
31974 {"amd", M_AMD},
31975 {"intel", M_INTEL},
31976 {"atom", M_INTEL_BONNELL},
31977 {"slm", M_INTEL_SILVERMONT},
31978 {"core2", M_INTEL_CORE2},
31979 {"corei7", M_INTEL_COREI7},
31980 {"nehalem", M_INTEL_COREI7_NEHALEM},
31981 {"westmere", M_INTEL_COREI7_WESTMERE},
31982 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
31983 {"ivybridge", M_INTEL_COREI7_IVYBRIDGE},
31984 {"haswell", M_INTEL_COREI7_HASWELL},
31985 {"broadwell", M_INTEL_COREI7_BROADWELL},
31986 {"skylake", M_INTEL_COREI7_SKYLAKE},
31987 {"skylake-avx512", M_INTEL_COREI7_SKYLAKE_AVX512},
31988 {"cannonlake", M_INTEL_COREI7_CANNONLAKE},
31989 {"bonnell", M_INTEL_BONNELL},
31990 {"silvermont", M_INTEL_SILVERMONT},
31991 {"knl", M_INTEL_KNL},
31992 {"knm", M_INTEL_KNM},
31993 {"amdfam10h", M_AMDFAM10H},
31994 {"barcelona", M_AMDFAM10H_BARCELONA},
31995 {"shanghai", M_AMDFAM10H_SHANGHAI},
31996 {"istanbul", M_AMDFAM10H_ISTANBUL},
31997 {"btver1", M_AMD_BTVER1},
31998 {"amdfam15h", M_AMDFAM15H},
31999 {"bdver1", M_AMDFAM15H_BDVER1},
32000 {"bdver2", M_AMDFAM15H_BDVER2},
32001 {"bdver3", M_AMDFAM15H_BDVER3},
32002 {"bdver4", M_AMDFAM15H_BDVER4},
32003 {"btver2", M_AMD_BTVER2},
32004 {"amdfam17h", M_AMDFAM17H},
32005 {"znver1", M_AMDFAM17H_ZNVER1},
32006 };
32007
32008 static struct _isa_names_table
32009 {
32010 const char *const name;
32011 const enum processor_features feature;
32012 }
32013 const isa_names_table[] =
32014 {
32015 {"cmov", F_CMOV},
32016 {"mmx", F_MMX},
32017 {"popcnt", F_POPCNT},
32018 {"sse", F_SSE},
32019 {"sse2", F_SSE2},
32020 {"sse3", F_SSE3},
32021 {"ssse3", F_SSSE3},
32022 {"sse4a", F_SSE4_A},
32023 {"sse4.1", F_SSE4_1},
32024 {"sse4.2", F_SSE4_2},
32025 {"avx", F_AVX},
32026 {"fma4", F_FMA4},
32027 {"xop", F_XOP},
32028 {"fma", F_FMA},
32029 {"avx2", F_AVX2},
32030 {"avx512f", F_AVX512F},
32031 {"bmi", F_BMI},
32032 {"bmi2", F_BMI2},
32033 {"aes", F_AES},
32034 {"pclmul", F_PCLMUL},
32035 {"avx512vl",F_AVX512VL},
32036 {"avx512bw",F_AVX512BW},
32037 {"avx512dq",F_AVX512DQ},
32038 {"avx512cd",F_AVX512CD},
32039 {"avx512er",F_AVX512ER},
32040 {"avx512pf",F_AVX512PF},
32041 {"avx512vbmi",F_AVX512VBMI},
32042 {"avx512ifma",F_AVX512IFMA},
32043 {"avx5124vnniw",F_AVX5124VNNIW},
32044 {"avx5124fmaps",F_AVX5124FMAPS},
32045 {"avx512vpopcntdq",F_AVX512VPOPCNTDQ}
32046 };
32047
32048 tree __processor_model_type = build_processor_model_struct ();
32049 tree __cpu_model_var = make_var_decl (__processor_model_type,
32050 "__cpu_model");
32051
32052
32053 varpool_node::add (__cpu_model_var);
32054
32055 gcc_assert ((args != NULL) && (*args != NULL));
32056
32057 param_string_cst = *args;
32058 while (param_string_cst
32059 && TREE_CODE (param_string_cst) != STRING_CST)
32060 {
32061 /* *args must be an expr that can contain other EXPRs leading to a
32062 STRING_CST. */
32063 if (!EXPR_P (param_string_cst))
32064 {
32065 error ("Parameter to builtin must be a string constant or literal");
32066 return integer_zero_node;
32067 }
32068 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
32069 }
32070
32071 gcc_assert (param_string_cst);
32072
32073 if (fn_code == IX86_BUILTIN_CPU_IS)
32074 {
32075 tree ref;
32076 tree field;
32077 tree final;
32078
32079 unsigned int field_val = 0;
32080 unsigned int NUM_ARCH_NAMES
32081 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
32082
32083 for (i = 0; i < NUM_ARCH_NAMES; i++)
32084 if (strcmp (arch_names_table[i].name,
32085 TREE_STRING_POINTER (param_string_cst)) == 0)
32086 break;
32087
32088 if (i == NUM_ARCH_NAMES)
32089 {
32090 error ("Parameter to builtin not valid: %s",
32091 TREE_STRING_POINTER (param_string_cst));
32092 return integer_zero_node;
32093 }
32094
32095 field = TYPE_FIELDS (__processor_model_type);
32096 field_val = arch_names_table[i].model;
32097
32098 /* CPU types are stored in the next field. */
32099 if (field_val > M_CPU_TYPE_START
32100 && field_val < M_CPU_SUBTYPE_START)
32101 {
32102 field = DECL_CHAIN (field);
32103 field_val -= M_CPU_TYPE_START;
32104 }
32105
32106 /* CPU subtypes are stored in the next field. */
32107 if (field_val > M_CPU_SUBTYPE_START)
32108 {
32109 field = DECL_CHAIN (DECL_CHAIN (field));
32110 field_val -= M_CPU_SUBTYPE_START;
32111 }
32112
32113 /* Get the appropriate field in __cpu_model. */
32114 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32115 field, NULL_TREE);
32116
32117 /* Check the value. */
32118 final = build2 (EQ_EXPR, unsigned_type_node, ref,
32119 build_int_cstu (unsigned_type_node, field_val));
32120 return build1 (CONVERT_EXPR, integer_type_node, final);
32121 }
32122 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
32123 {
32124 tree ref;
32125 tree array_elt;
32126 tree field;
32127 tree final;
32128
32129 unsigned int field_val = 0;
32130 unsigned int NUM_ISA_NAMES
32131 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
32132
32133 for (i = 0; i < NUM_ISA_NAMES; i++)
32134 if (strcmp (isa_names_table[i].name,
32135 TREE_STRING_POINTER (param_string_cst)) == 0)
32136 break;
32137
32138 if (i == NUM_ISA_NAMES)
32139 {
32140 error ("Parameter to builtin not valid: %s",
32141 TREE_STRING_POINTER (param_string_cst));
32142 return integer_zero_node;
32143 }
32144
32145 field = TYPE_FIELDS (__processor_model_type);
32146 /* Get the last field, which is __cpu_features. */
32147 while (DECL_CHAIN (field))
32148 field = DECL_CHAIN (field);
32149
32150 /* Get the appropriate field: __cpu_model.__cpu_features */
32151 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
32152 field, NULL_TREE);
32153
32154 /* Access the 0th element of __cpu_features array. */
32155 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
32156 integer_zero_node, NULL_TREE, NULL_TREE);
32157
32158 field_val = (1 << isa_names_table[i].feature);
32159 /* Return __cpu_model.__cpu_features[0] & field_val */
32160 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
32161 build_int_cstu (unsigned_type_node, field_val));
32162 return build1 (CONVERT_EXPR, integer_type_node, final);
32163 }
32164 gcc_unreachable ();
32165 }
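
/* Worked example (an illustrative addition): with the tables above,
   __builtin_cpu_is ("westmere") folds into the expression

     __cpu_model.__cpu_subtype
       == (M_INTEL_COREI7_WESTMERE - M_CPU_SUBTYPE_START)

   and __builtin_cpu_supports ("avx2") folds into

     __cpu_model.__cpu_features[0] & (1 << F_AVX2)

   both converted back to int.  */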
32166
32167 static tree
32168 ix86_fold_builtin (tree fndecl, int n_args,
32169 tree *args, bool ignore ATTRIBUTE_UNUSED)
32170 {
32171 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32172 {
32173 enum ix86_builtins fn_code = (enum ix86_builtins)
32174 DECL_FUNCTION_CODE (fndecl);
32175 switch (fn_code)
32176 {
32177 case IX86_BUILTIN_CPU_IS:
32178 case IX86_BUILTIN_CPU_SUPPORTS:
32179 gcc_assert (n_args == 1);
32180 return fold_builtin_cpu (fndecl, args);
32181
32182 case IX86_BUILTIN_NANQ:
32183 case IX86_BUILTIN_NANSQ:
32184 {
32185 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32186 const char *str = c_getstr (*args);
32187 int quiet = fn_code == IX86_BUILTIN_NANQ;
32188 REAL_VALUE_TYPE real;
32189
32190 if (str && real_nan (&real, str, quiet, TYPE_MODE (type)))
32191 return build_real (type, real);
32192 return NULL_TREE;
32193 }
32194
32195 case IX86_BUILTIN_INFQ:
32196 case IX86_BUILTIN_HUGE_VALQ:
32197 {
32198 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32199 REAL_VALUE_TYPE inf;
32200 real_inf (&inf);
32201 return build_real (type, inf);
32202 }
32203
32204 case IX86_BUILTIN_TZCNT16:
32205 case IX86_BUILTIN_CTZS:
32206 case IX86_BUILTIN_TZCNT32:
32207 case IX86_BUILTIN_TZCNT64:
32208 gcc_assert (n_args == 1);
32209 if (TREE_CODE (args[0]) == INTEGER_CST)
32210 {
32211 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32212 tree arg = args[0];
32213 if (fn_code == IX86_BUILTIN_TZCNT16
32214 || fn_code == IX86_BUILTIN_CTZS)
32215 arg = fold_convert (short_unsigned_type_node, arg);
32216 if (integer_zerop (arg))
32217 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32218 else
32219 return fold_const_call (CFN_CTZ, type, arg);
32220 }
32221 break;
32222
32223 case IX86_BUILTIN_LZCNT16:
32224 case IX86_BUILTIN_CLZS:
32225 case IX86_BUILTIN_LZCNT32:
32226 case IX86_BUILTIN_LZCNT64:
32227 gcc_assert (n_args == 1);
32228 if (TREE_CODE (args[0]) == INTEGER_CST)
32229 {
32230 tree type = TREE_TYPE (TREE_TYPE (fndecl));
32231 tree arg = args[0];
32232 if (fn_code == IX86_BUILTIN_LZCNT16
32233 || fn_code == IX86_BUILTIN_CLZS)
32234 arg = fold_convert (short_unsigned_type_node, arg);
32235 if (integer_zerop (arg))
32236 return build_int_cst (type, TYPE_PRECISION (TREE_TYPE (arg)));
32237 else
32238 return fold_const_call (CFN_CLZ, type, arg);
32239 }
32240 break;
32241
32242 case IX86_BUILTIN_BEXTR32:
32243 case IX86_BUILTIN_BEXTR64:
32244 case IX86_BUILTIN_BEXTRI32:
32245 case IX86_BUILTIN_BEXTRI64:
32246 gcc_assert (n_args == 2);
32247 if (tree_fits_uhwi_p (args[1]))
32248 {
32249 unsigned HOST_WIDE_INT res = 0;
32250 unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
32251 unsigned int start = tree_to_uhwi (args[1]);
32252 unsigned int len = (start & 0xff00) >> 8;
32253 start &= 0xff;
32254 if (start >= prec || len == 0)
32255 res = 0;
32256 else if (!tree_fits_uhwi_p (args[0]))
32257 break;
32258 else
32259 res = tree_to_uhwi (args[0]) >> start;
32260 if (len > prec)
32261 len = prec;
32262 if (len < HOST_BITS_PER_WIDE_INT)
32263 res &= (HOST_WIDE_INT_1U << len) - 1;
32264 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32265 }
32266 break;
32267
32268 case IX86_BUILTIN_BZHI32:
32269 case IX86_BUILTIN_BZHI64:
32270 gcc_assert (n_args == 2);
32271 if (tree_fits_uhwi_p (args[1]))
32272 {
32273 unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
32274 if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
32275 return args[0];
32276 if (!tree_fits_uhwi_p (args[0]))
32277 break;
32278 unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
32279 res &= ~(HOST_WIDE_INT_M1U << idx);
32280 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32281 }
32282 break;
32283
32284 case IX86_BUILTIN_PDEP32:
32285 case IX86_BUILTIN_PDEP64:
32286 gcc_assert (n_args == 2);
32287 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32288 {
32289 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32290 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32291 unsigned HOST_WIDE_INT res = 0;
32292 unsigned HOST_WIDE_INT m, k = 1;
32293 for (m = 1; m; m <<= 1)
32294 if ((mask & m) != 0)
32295 {
32296 if ((src & k) != 0)
32297 res |= m;
32298 k <<= 1;
32299 }
32300 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32301 }
32302 break;
32303
32304 case IX86_BUILTIN_PEXT32:
32305 case IX86_BUILTIN_PEXT64:
32306 gcc_assert (n_args == 2);
32307 if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
32308 {
32309 unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
32310 unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
32311 unsigned HOST_WIDE_INT res = 0;
32312 unsigned HOST_WIDE_INT m, k = 1;
32313 for (m = 1; m; m <<= 1)
32314 if ((mask & m) != 0)
32315 {
32316 if ((src & m) != 0)
32317 res |= k;
32318 k <<= 1;
32319 }
32320 return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
32321 }
32322 break;
32323
32324 default:
32325 break;
32326 }
32327 }
32328
32329 #ifdef SUBTARGET_FOLD_BUILTIN
32330 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
32331 #endif
32332
32333 return NULL_TREE;
32334 }
32335
32336 /* Fold an MD builtin (use ix86_fold_builtin for folding into
32337 constant) in GIMPLE. */
32338
32339 bool
32340 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
32341 {
32342 gimple *stmt = gsi_stmt (*gsi);
32343 tree fndecl = gimple_call_fndecl (stmt);
32344 gcc_checking_assert (fndecl && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD);
32345 int n_args = gimple_call_num_args (stmt);
32346 enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
32347 tree decl = NULL_TREE;
32348 tree arg0, arg1;
32349
32350 switch (fn_code)
32351 {
32352 case IX86_BUILTIN_TZCNT32:
32353 decl = builtin_decl_implicit (BUILT_IN_CTZ);
32354 goto fold_tzcnt_lzcnt;
32355
32356 case IX86_BUILTIN_TZCNT64:
32357 decl = builtin_decl_implicit (BUILT_IN_CTZLL);
32358 goto fold_tzcnt_lzcnt;
32359
32360 case IX86_BUILTIN_LZCNT32:
32361 decl = builtin_decl_implicit (BUILT_IN_CLZ);
32362 goto fold_tzcnt_lzcnt;
32363
32364 case IX86_BUILTIN_LZCNT64:
32365 decl = builtin_decl_implicit (BUILT_IN_CLZLL);
32366 goto fold_tzcnt_lzcnt;
32367
32368 fold_tzcnt_lzcnt:
32369 gcc_assert (n_args == 1);
32370 arg0 = gimple_call_arg (stmt, 0);
32371 if (TREE_CODE (arg0) == SSA_NAME && decl && gimple_call_lhs (stmt))
32372 {
32373 int prec = TYPE_PRECISION (TREE_TYPE (arg0));
32374 /* If arg0 is provably non-zero, optimize into the generic
32375 __builtin_c[tl]z{,ll} function, which the middle-end handles
32376 better. */
32377 if (!expr_not_equal_to (arg0, wi::zero (prec)))
32378 return false;
32379
32380 location_t loc = gimple_location (stmt);
32381 gimple *g = gimple_build_call (decl, 1, arg0);
32382 gimple_set_location (g, loc);
32383 tree lhs = make_ssa_name (integer_type_node);
32384 gimple_call_set_lhs (g, lhs);
32385 gsi_insert_before (gsi, g, GSI_SAME_STMT);
32386 g = gimple_build_assign (gimple_call_lhs (stmt), NOP_EXPR, lhs);
32387 gimple_set_location (g, loc);
32388 gsi_replace (gsi, g, false);
32389 return true;
32390 }
32391 break;
32392
32393 case IX86_BUILTIN_BZHI32:
32394 case IX86_BUILTIN_BZHI64:
32395 gcc_assert (n_args == 2);
32396 arg1 = gimple_call_arg (stmt, 1);
32397 if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
32398 {
32399 unsigned int idx = tree_to_uhwi (arg1) & 0xff;
32400 arg0 = gimple_call_arg (stmt, 0);
32401 if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
32402 break;
32403 location_t loc = gimple_location (stmt);
32404 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32405 gimple_set_location (g, loc);
32406 gsi_replace (gsi, g, false);
32407 return true;
32408 }
32409 break;
32410
32411 case IX86_BUILTIN_PDEP32:
32412 case IX86_BUILTIN_PDEP64:
32413 case IX86_BUILTIN_PEXT32:
32414 case IX86_BUILTIN_PEXT64:
32415 gcc_assert (n_args == 2);
32416 arg1 = gimple_call_arg (stmt, 1);
32417 if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
32418 {
32419 location_t loc = gimple_location (stmt);
32420 arg0 = gimple_call_arg (stmt, 0);
32421 gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
32422 gimple_set_location (g, loc);
32423 gsi_replace (gsi, g, false);
32424 return true;
32425 }
32426 break;
32427
32428 default:
32429 break;
32430 }
32431
32432 return false;
32433 }
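
/* Example of the TZCNT/LZCNT fold above (an illustrative addition): in
   a function such as

     unsigned int
     first_bit (unsigned int x)
     {
       if (x == 0)
         __builtin_unreachable ();
       return __builtin_ia32_tzcnt_u32 (x);
     }

   value-range information can prove X non-zero, so the target builtin is
   replaced by the generic __builtin_ctz, which the middle-end can
   optimize more aggressively.  */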
32434
32435 /* Make builtins to detect cpu type and features supported.  NAME is the
32436 builtin name, CODE is the builtin code, FTYPE is the function type of the
32437 builtin, and IS_CONST says whether the builtin is marked TREE_READONLY.  */
32438
32439 static void
32440 make_cpu_type_builtin (const char* name, int code,
32441 enum ix86_builtin_func_type ftype, bool is_const)
32442 {
32443 tree decl;
32444 tree type;
32445
32446 type = ix86_get_builtin_func_type (ftype);
32447 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
32448 NULL, NULL_TREE);
32449 gcc_assert (decl != NULL_TREE);
32450 ix86_builtins[(int) code] = decl;
32451 TREE_READONLY (decl) = is_const;
32452 }
32453
32454 /* Make builtins to get CPU type and features supported. The created
32455 builtins are:
32456
32457 __builtin_cpu_init (), to detect cpu type and features,
32458 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
32459 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>.
32460 */
32461
32462 static void
32463 ix86_init_platform_type_builtins (void)
32464 {
32465 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
32466 INT_FTYPE_VOID, false);
32467 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
32468 INT_FTYPE_PCCHAR, true);
32469 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
32470 INT_FTYPE_PCCHAR, true);
32471 }
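
/* A minimal usage sketch for the builtins registered above (illustrative
   only; use_avx2 and use_fallback are hypothetical helpers):

     __builtin_cpu_init ();
     if (__builtin_cpu_is ("intel") && __builtin_cpu_supports ("avx2"))
       use_avx2 ();
     else
       use_fallback ();  */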
32472
32473 /* Internal method for ix86_init_builtins. */
32474
32475 static void
32476 ix86_init_builtins_va_builtins_abi (void)
32477 {
32478 tree ms_va_ref, sysv_va_ref;
32479 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
32480 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
32481 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
32482 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
32483
32484 if (!TARGET_64BIT)
32485 return;
32486 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
32487 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
32488 ms_va_ref = build_reference_type (ms_va_list_type_node);
32489 sysv_va_ref =
32490 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
32491
32492 fnvoid_va_end_ms =
32493 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32494 fnvoid_va_start_ms =
32495 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
32496 fnvoid_va_end_sysv =
32497 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
32498 fnvoid_va_start_sysv =
32499 build_varargs_function_type_list (void_type_node, sysv_va_ref,
32500 NULL_TREE);
32501 fnvoid_va_copy_ms =
32502 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
32503 NULL_TREE);
32504 fnvoid_va_copy_sysv =
32505 build_function_type_list (void_type_node, sysv_va_ref,
32506 sysv_va_ref, NULL_TREE);
32507
32508 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
32509 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
32510 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
32511 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
32512 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
32513 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
32514 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
32515 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32516 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
32517 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32518 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
32519 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
32520 }
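
/* Rough usage sketch for the ABI-specific va builtins registered above
   (illustrative only; msabi_vararg is a hypothetical function whose body
   just consumes one argument):

     void __attribute__ ((ms_abi)) msabi_vararg (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       int first = __builtin_va_arg (ap, int);
       __builtin_ms_va_end (ap);
       (void) first;
     }  */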
32521
32522 static void
32523 ix86_init_builtin_types (void)
32524 {
32525 tree float80_type_node, const_string_type_node;
32526
32527 /* The __float80 type. */
32528 float80_type_node = long_double_type_node;
32529 if (TYPE_MODE (float80_type_node) != XFmode)
32530 {
32531 if (float64x_type_node != NULL_TREE
32532 && TYPE_MODE (float64x_type_node) == XFmode)
32533 float80_type_node = float64x_type_node;
32534 else
32535 {
32536 /* Build a distinct 80-bit type for __float80.  */
32537 float80_type_node = make_node (REAL_TYPE);
32538
32539 TYPE_PRECISION (float80_type_node) = 80;
32540 layout_type (float80_type_node);
32541 }
32542 }
32543 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
32544
32545 /* The __float128 type. The node has already been created as
32546 _Float128, so we only need to register the __float128 name for
32547 it. */
32548 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
32549
32550 const_string_type_node
32551 = build_pointer_type (build_qualified_type
32552 (char_type_node, TYPE_QUAL_CONST));
32553
32554 /* This macro is built by i386-builtin-types.awk. */
32555 DEFINE_BUILTIN_PRIMITIVE_TYPES;
32556 }
32557
32558 static void
32559 ix86_init_builtins (void)
32560 {
32561 tree ftype, decl;
32562
32563 ix86_init_builtin_types ();
32564
32565 /* Builtins to get CPU type and features. */
32566 ix86_init_platform_type_builtins ();
32567
32568 /* TFmode support builtins. */
32569 def_builtin_const (0, "__builtin_infq",
32570 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
32571 def_builtin_const (0, "__builtin_huge_valq",
32572 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
32573
32574 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_CONST_STRING);
32575 decl = add_builtin_function ("__builtin_nanq", ftype, IX86_BUILTIN_NANQ,
32576 BUILT_IN_MD, "nanq", NULL_TREE);
32577 TREE_READONLY (decl) = 1;
32578 ix86_builtins[(int) IX86_BUILTIN_NANQ] = decl;
32579
32580 decl = add_builtin_function ("__builtin_nansq", ftype, IX86_BUILTIN_NANSQ,
32581 BUILT_IN_MD, "nansq", NULL_TREE);
32582 TREE_READONLY (decl) = 1;
32583 ix86_builtins[(int) IX86_BUILTIN_NANSQ] = decl;
32584
32585 /* We will expand them to a normal call if SSE isn't available, since
32586 they are used by libgcc.  */
32587 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
32588 decl = add_builtin_function ("__builtin_fabsq", ftype, IX86_BUILTIN_FABSQ,
32589 BUILT_IN_MD, "__fabstf2", NULL_TREE);
32590 TREE_READONLY (decl) = 1;
32591 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = decl;
32592
32593 ftype = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
32594 decl = add_builtin_function ("__builtin_copysignq", ftype,
32595 IX86_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
32596 "__copysigntf3", NULL_TREE);
32597 TREE_READONLY (decl) = 1;
32598 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = decl;
32599
32600 ix86_init_tm_builtins ();
32601 ix86_init_mmx_sse_builtins ();
32602 ix86_init_mpx_builtins ();
32603
32604 if (TARGET_LP64)
32605 ix86_init_builtins_va_builtins_abi ();
32606
32607 #ifdef SUBTARGET_INIT_BUILTINS
32608 SUBTARGET_INIT_BUILTINS;
32609 #endif
32610 }
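
/* Usage sketch for the types and TFmode builtins set up above
   (illustrative only; the w/W and q/Q literal suffixes are GCC
   extensions):

     __float80  e = 1.0w;
     __float128 q = __builtin_copysignq (__builtin_fabsq (1.0q),
                                         -__builtin_infq ());
     __float128 n = __builtin_nanq ("");  */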
32611
32612 /* Return the ix86 builtin for CODE. */
32613
32614 static tree
32615 ix86_builtin_decl (unsigned code, bool)
32616 {
32617 if (code >= IX86_BUILTIN_MAX)
32618 return error_mark_node;
32619
32620 return ix86_builtins[code];
32621 }
32622
32623 /* Errors in the source file can cause expand_expr to return const0_rtx
32624 where we expect a vector. To avoid crashing, use one of the vector
32625 clear instructions. */
32626 static rtx
32627 safe_vector_operand (rtx x, machine_mode mode)
32628 {
32629 if (x == const0_rtx)
32630 x = CONST0_RTX (mode);
32631 return x;
32632 }
32633
32634 /* Fix up modeless constants to fit the required mode.  */
32635 static rtx
32636 fixup_modeless_constant (rtx x, machine_mode mode)
32637 {
32638 if (GET_MODE (x) == VOIDmode)
32639 x = convert_to_mode (mode, x, 1);
32640 return x;
32641 }
32642
32643 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
32644
32645 static rtx
32646 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
32647 {
32648 rtx pat;
32649 tree arg0 = CALL_EXPR_ARG (exp, 0);
32650 tree arg1 = CALL_EXPR_ARG (exp, 1);
32651 rtx op0 = expand_normal (arg0);
32652 rtx op1 = expand_normal (arg1);
32653 machine_mode tmode = insn_data[icode].operand[0].mode;
32654 machine_mode mode0 = insn_data[icode].operand[1].mode;
32655 machine_mode mode1 = insn_data[icode].operand[2].mode;
32656
32657 if (VECTOR_MODE_P (mode0))
32658 op0 = safe_vector_operand (op0, mode0);
32659 if (VECTOR_MODE_P (mode1))
32660 op1 = safe_vector_operand (op1, mode1);
32661
32662 if (optimize || !target
32663 || GET_MODE (target) != tmode
32664 || !insn_data[icode].operand[0].predicate (target, tmode))
32665 target = gen_reg_rtx (tmode);
32666
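/* Some insns want their second source operand in TImode while the
   builtin passes a 32-bit integer; load it into a vector register and
   view that register as TImode.  */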
32667 if (GET_MODE (op1) == SImode && mode1 == TImode)
32668 {
32669 rtx x = gen_reg_rtx (V4SImode);
32670 emit_insn (gen_sse2_loadd (x, op1));
32671 op1 = gen_lowpart (TImode, x);
32672 }
32673
32674 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32675 op0 = copy_to_mode_reg (mode0, op0);
32676 if (!insn_data[icode].operand[2].predicate (op1, mode1))
32677 op1 = copy_to_mode_reg (mode1, op1);
32678
32679 pat = GEN_FCN (icode) (target, op0, op1);
32680 if (! pat)
32681 return 0;
32682
32683 emit_insn (pat);
32684
32685 return target;
32686 }
32687
32688 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
32689
32690 static rtx
32691 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
32692 enum ix86_builtin_func_type m_type,
32693 enum rtx_code sub_code)
32694 {
32695 rtx pat;
32696 int i;
32697 int nargs;
32698 bool comparison_p = false;
32699 bool tf_p = false;
32700 bool last_arg_constant = false;
32701 int num_memory = 0;
32702 struct {
32703 rtx op;
32704 machine_mode mode;
32705 } args[4];
32706
32707 machine_mode tmode = insn_data[icode].operand[0].mode;
32708
32709 switch (m_type)
32710 {
32711 case MULTI_ARG_4_DF2_DI_I:
32712 case MULTI_ARG_4_DF2_DI_I1:
32713 case MULTI_ARG_4_SF2_SI_I:
32714 case MULTI_ARG_4_SF2_SI_I1:
32715 nargs = 4;
32716 last_arg_constant = true;
32717 break;
32718
32719 case MULTI_ARG_3_SF:
32720 case MULTI_ARG_3_DF:
32721 case MULTI_ARG_3_SF2:
32722 case MULTI_ARG_3_DF2:
32723 case MULTI_ARG_3_DI:
32724 case MULTI_ARG_3_SI:
32725 case MULTI_ARG_3_SI_DI:
32726 case MULTI_ARG_3_HI:
32727 case MULTI_ARG_3_HI_SI:
32728 case MULTI_ARG_3_QI:
32729 case MULTI_ARG_3_DI2:
32730 case MULTI_ARG_3_SI2:
32731 case MULTI_ARG_3_HI2:
32732 case MULTI_ARG_3_QI2:
32733 nargs = 3;
32734 break;
32735
32736 case MULTI_ARG_2_SF:
32737 case MULTI_ARG_2_DF:
32738 case MULTI_ARG_2_DI:
32739 case MULTI_ARG_2_SI:
32740 case MULTI_ARG_2_HI:
32741 case MULTI_ARG_2_QI:
32742 nargs = 2;
32743 break;
32744
32745 case MULTI_ARG_2_DI_IMM:
32746 case MULTI_ARG_2_SI_IMM:
32747 case MULTI_ARG_2_HI_IMM:
32748 case MULTI_ARG_2_QI_IMM:
32749 nargs = 2;
32750 last_arg_constant = true;
32751 break;
32752
32753 case MULTI_ARG_1_SF:
32754 case MULTI_ARG_1_DF:
32755 case MULTI_ARG_1_SF2:
32756 case MULTI_ARG_1_DF2:
32757 case MULTI_ARG_1_DI:
32758 case MULTI_ARG_1_SI:
32759 case MULTI_ARG_1_HI:
32760 case MULTI_ARG_1_QI:
32761 case MULTI_ARG_1_SI_DI:
32762 case MULTI_ARG_1_HI_DI:
32763 case MULTI_ARG_1_HI_SI:
32764 case MULTI_ARG_1_QI_DI:
32765 case MULTI_ARG_1_QI_SI:
32766 case MULTI_ARG_1_QI_HI:
32767 nargs = 1;
32768 break;
32769
32770 case MULTI_ARG_2_DI_CMP:
32771 case MULTI_ARG_2_SI_CMP:
32772 case MULTI_ARG_2_HI_CMP:
32773 case MULTI_ARG_2_QI_CMP:
32774 nargs = 2;
32775 comparison_p = true;
32776 break;
32777
32778 case MULTI_ARG_2_SF_TF:
32779 case MULTI_ARG_2_DF_TF:
32780 case MULTI_ARG_2_DI_TF:
32781 case MULTI_ARG_2_SI_TF:
32782 case MULTI_ARG_2_HI_TF:
32783 case MULTI_ARG_2_QI_TF:
32784 nargs = 2;
32785 tf_p = true;
32786 break;
32787
32788 default:
32789 gcc_unreachable ();
32790 }
32791
32792 if (optimize || !target
32793 || GET_MODE (target) != tmode
32794 || !insn_data[icode].operand[0].predicate (target, tmode))
32795 target = gen_reg_rtx (tmode);
32796 else if (memory_operand (target, tmode))
32797 num_memory++;
32798
32799 gcc_assert (nargs <= 4);
32800
32801 for (i = 0; i < nargs; i++)
32802 {
32803 tree arg = CALL_EXPR_ARG (exp, i);
32804 rtx op = expand_normal (arg);
32805 int adjust = (comparison_p) ? 1 : 0;
32806 machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
32807
32808 if (last_arg_constant && i == nargs - 1)
32809 {
32810 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
32811 {
32812 enum insn_code new_icode = icode;
32813 switch (icode)
32814 {
32815 case CODE_FOR_xop_vpermil2v2df3:
32816 case CODE_FOR_xop_vpermil2v4sf3:
32817 case CODE_FOR_xop_vpermil2v4df3:
32818 case CODE_FOR_xop_vpermil2v8sf3:
32819 error ("the last argument must be a 2-bit immediate");
32820 return gen_reg_rtx (tmode);
32821 case CODE_FOR_xop_rotlv2di3:
32822 new_icode = CODE_FOR_rotlv2di3;
32823 goto xop_rotl;
32824 case CODE_FOR_xop_rotlv4si3:
32825 new_icode = CODE_FOR_rotlv4si3;
32826 goto xop_rotl;
32827 case CODE_FOR_xop_rotlv8hi3:
32828 new_icode = CODE_FOR_rotlv8hi3;
32829 goto xop_rotl;
32830 case CODE_FOR_xop_rotlv16qi3:
32831 new_icode = CODE_FOR_rotlv16qi3;
32832 xop_rotl:
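/* Rotating by COUNT is equivalent to rotating by COUNT modulo the
   element width, so a constant count can simply be masked into range;
   a non-constant count falls back to the generic rotl pattern, whose
   operand layout is asserted to match below.  */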
32833 if (CONST_INT_P (op))
32834 {
32835 int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
32836 op = GEN_INT (INTVAL (op) & mask);
32837 gcc_checking_assert
32838 (insn_data[icode].operand[i + 1].predicate (op, mode));
32839 }
32840 else
32841 {
32842 gcc_checking_assert
32843 (nargs == 2
32844 && insn_data[new_icode].operand[0].mode == tmode
32845 && insn_data[new_icode].operand[1].mode == tmode
32846 && insn_data[new_icode].operand[2].mode == mode
32847 && insn_data[new_icode].operand[0].predicate
32848 == insn_data[icode].operand[0].predicate
32849 && insn_data[new_icode].operand[1].predicate
32850 == insn_data[icode].operand[1].predicate);
32851 icode = new_icode;
32852 goto non_constant;
32853 }
32854 break;
32855 default:
32856 gcc_unreachable ();
32857 }
32858 }
32859 }
32860 else
32861 {
32862 non_constant:
32863 if (VECTOR_MODE_P (mode))
32864 op = safe_vector_operand (op, mode);
32865
32866 /* Count memory operands; when we aren't optimizing, only one memory
32867 operand is allowed (the rest are forced into registers below).  */
32868 if (memory_operand (op, mode))
32869 num_memory++;
32870
32871 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
32872
32873 if (optimize
32874 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
32875 || num_memory > 1)
32876 op = force_reg (mode, op);
32877 }
32878
32879 args[i].op = op;
32880 args[i].mode = mode;
32881 }
32882
32883 switch (nargs)
32884 {
32885 case 1:
32886 pat = GEN_FCN (icode) (target, args[0].op);
32887 break;
32888
32889 case 2:
32890 if (tf_p)
32891 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
32892 GEN_INT ((int)sub_code));
32893 else if (! comparison_p)
32894 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
32895 else
32896 {
32897 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
32898 args[0].op,
32899 args[1].op);
32900
32901 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
32902 }
32903 break;
32904
32905 case 3:
32906 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
32907 break;
32908
32909 case 4:
32910 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
32911 break;
32912
32913 default:
32914 gcc_unreachable ();
32915 }
32916
32917 if (! pat)
32918 return 0;
32919
32920 emit_insn (pat);
32921 return target;
32922 }
32923
32924 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
32925 insns with vec_merge. */
32926
32927 static rtx
32928 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
32929 rtx target)
32930 {
32931 rtx pat;
32932 tree arg0 = CALL_EXPR_ARG (exp, 0);
32933 rtx op1, op0 = expand_normal (arg0);
32934 machine_mode tmode = insn_data[icode].operand[0].mode;
32935 machine_mode mode0 = insn_data[icode].operand[1].mode;
32936
32937 if (optimize || !target
32938 || GET_MODE (target) != tmode
32939 || !insn_data[icode].operand[0].predicate (target, tmode))
32940 target = gen_reg_rtx (tmode);
32941
32942 if (VECTOR_MODE_P (mode0))
32943 op0 = safe_vector_operand (op0, mode0);
32944
32945 if ((optimize && !register_operand (op0, mode0))
32946 || !insn_data[icode].operand[1].predicate (op0, mode0))
32947 op0 = copy_to_mode_reg (mode0, op0);
32948
32949 op1 = op0;
32950 if (!insn_data[icode].operand[2].predicate (op1, mode0))
32951 op1 = copy_to_mode_reg (mode0, op1);
32952
32953 pat = GEN_FCN (icode) (target, op0, op1);
32954 if (! pat)
32955 return 0;
32956 emit_insn (pat);
32957 return target;
32958 }
32959
32960 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
32961
32962 static rtx
32963 ix86_expand_sse_compare (const struct builtin_description *d,
32964 tree exp, rtx target, bool swap)
32965 {
32966 rtx pat;
32967 tree arg0 = CALL_EXPR_ARG (exp, 0);
32968 tree arg1 = CALL_EXPR_ARG (exp, 1);
32969 rtx op0 = expand_normal (arg0);
32970 rtx op1 = expand_normal (arg1);
32971 rtx op2;
32972 machine_mode tmode = insn_data[d->icode].operand[0].mode;
32973 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
32974 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
32975 enum rtx_code comparison = d->comparison;
32976
32977 if (VECTOR_MODE_P (mode0))
32978 op0 = safe_vector_operand (op0, mode0);
32979 if (VECTOR_MODE_P (mode1))
32980 op1 = safe_vector_operand (op1, mode1);
32981
32982 /* Swap operands if we have a comparison that isn't available in
32983 hardware. */
32984 if (swap)
32985 std::swap (op0, op1);
32986
32987 if (optimize || !target
32988 || GET_MODE (target) != tmode
32989 || !insn_data[d->icode].operand[0].predicate (target, tmode))
32990 target = gen_reg_rtx (tmode);
32991
32992 if ((optimize && !register_operand (op0, mode0))
32993 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
32994 op0 = copy_to_mode_reg (mode0, op0);
32995 if ((optimize && !register_operand (op1, mode1))
32996 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
32997 op1 = copy_to_mode_reg (mode1, op1);
32998
32999 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
33000 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33001 if (! pat)
33002 return 0;
33003 emit_insn (pat);
33004 return target;
33005 }
33006
33007 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
33008
33009 static rtx
33010 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
33011 rtx target)
33012 {
33013 rtx pat;
33014 tree arg0 = CALL_EXPR_ARG (exp, 0);
33015 tree arg1 = CALL_EXPR_ARG (exp, 1);
33016 rtx op0 = expand_normal (arg0);
33017 rtx op1 = expand_normal (arg1);
33018 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33019 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33020 enum rtx_code comparison = d->comparison;
33021
33022 if (VECTOR_MODE_P (mode0))
33023 op0 = safe_vector_operand (op0, mode0);
33024 if (VECTOR_MODE_P (mode1))
33025 op1 = safe_vector_operand (op1, mode1);
33026
33027 /* Swap operands if we have a comparison that isn't available in
33028 hardware. */
33029 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
33030 std::swap (op0, op1);
33031
33032 target = gen_reg_rtx (SImode);
33033 emit_move_insn (target, const0_rtx);
33034 target = gen_rtx_SUBREG (QImode, target, 0);
33035
33036 if ((optimize && !register_operand (op0, mode0))
33037 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33038 op0 = copy_to_mode_reg (mode0, op0);
33039 if ((optimize && !register_operand (op1, mode1))
33040 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33041 op1 = copy_to_mode_reg (mode1, op1);
33042
33043 pat = GEN_FCN (d->icode) (op0, op1);
33044 if (! pat)
33045 return 0;
33046 emit_insn (pat);
33047 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33048 gen_rtx_fmt_ee (comparison, QImode,
33049 SET_DEST (pat),
33050 const0_rtx)));
33051
33052 return SUBREG_REG (target);
33053 }
33054
33055 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
33056
33057 static rtx
33058 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
33059 rtx target)
33060 {
33061 rtx pat;
33062 tree arg0 = CALL_EXPR_ARG (exp, 0);
33063 rtx op1, op0 = expand_normal (arg0);
33064 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33065 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33066
33067 if (optimize || target == 0
33068 || GET_MODE (target) != tmode
33069 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33070 target = gen_reg_rtx (tmode);
33071
33072 if (VECTOR_MODE_P (mode0))
33073 op0 = safe_vector_operand (op0, mode0);
33074
33075 if ((optimize && !register_operand (op0, mode0))
33076 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33077 op0 = copy_to_mode_reg (mode0, op0);
33078
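/* For the round insns handled here, d->comparison carries the
   rounding-mode immediate rather than an RTX comparison code.  */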
33079 op1 = GEN_INT (d->comparison);
33080
33081 pat = GEN_FCN (d->icode) (target, op0, op1);
33082 if (! pat)
33083 return 0;
33084 emit_insn (pat);
33085 return target;
33086 }
33087
33088 static rtx
33089 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
33090 tree exp, rtx target)
33091 {
33092 rtx pat;
33093 tree arg0 = CALL_EXPR_ARG (exp, 0);
33094 tree arg1 = CALL_EXPR_ARG (exp, 1);
33095 rtx op0 = expand_normal (arg0);
33096 rtx op1 = expand_normal (arg1);
33097 rtx op2;
33098 machine_mode tmode = insn_data[d->icode].operand[0].mode;
33099 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
33100 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
33101
33102 if (optimize || target == 0
33103 || GET_MODE (target) != tmode
33104 || !insn_data[d->icode].operand[0].predicate (target, tmode))
33105 target = gen_reg_rtx (tmode);
33106
33107 op0 = safe_vector_operand (op0, mode0);
33108 op1 = safe_vector_operand (op1, mode1);
33109
33110 if ((optimize && !register_operand (op0, mode0))
33111 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33112 op0 = copy_to_mode_reg (mode0, op0);
33113 if ((optimize && !register_operand (op1, mode1))
33114 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33115 op1 = copy_to_mode_reg (mode1, op1);
33116
33117 op2 = GEN_INT (d->comparison);
33118
33119 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
33120 if (! pat)
33121 return 0;
33122 emit_insn (pat);
33123 return target;
33124 }
33125
33126 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
33127
33128 static rtx
33129 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
33130 rtx target)
33131 {
33132 rtx pat;
33133 tree arg0 = CALL_EXPR_ARG (exp, 0);
33134 tree arg1 = CALL_EXPR_ARG (exp, 1);
33135 rtx op0 = expand_normal (arg0);
33136 rtx op1 = expand_normal (arg1);
33137 machine_mode mode0 = insn_data[d->icode].operand[0].mode;
33138 machine_mode mode1 = insn_data[d->icode].operand[1].mode;
33139 enum rtx_code comparison = d->comparison;
33140
33141 if (VECTOR_MODE_P (mode0))
33142 op0 = safe_vector_operand (op0, mode0);
33143 if (VECTOR_MODE_P (mode1))
33144 op1 = safe_vector_operand (op1, mode1);
33145
33146 target = gen_reg_rtx (SImode);
33147 emit_move_insn (target, const0_rtx);
33148 target = gen_rtx_SUBREG (QImode, target, 0);
33149
33150 if ((optimize && !register_operand (op0, mode0))
33151 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
33152 op0 = copy_to_mode_reg (mode0, op0);
33153 if ((optimize && !register_operand (op1, mode1))
33154 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
33155 op1 = copy_to_mode_reg (mode1, op1);
33156
33157 pat = GEN_FCN (d->icode) (op0, op1);
33158 if (! pat)
33159 return 0;
33160 emit_insn (pat);
33161 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33162 gen_rtx_fmt_ee (comparison, QImode,
33163 SET_DEST (pat),
33164 const0_rtx)));
33165
33166 return SUBREG_REG (target);
33167 }
33168
33169 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
33170
33171 static rtx
33172 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
33173 tree exp, rtx target)
33174 {
33175 rtx pat;
33176 tree arg0 = CALL_EXPR_ARG (exp, 0);
33177 tree arg1 = CALL_EXPR_ARG (exp, 1);
33178 tree arg2 = CALL_EXPR_ARG (exp, 2);
33179 tree arg3 = CALL_EXPR_ARG (exp, 3);
33180 tree arg4 = CALL_EXPR_ARG (exp, 4);
33181 rtx scratch0, scratch1;
33182 rtx op0 = expand_normal (arg0);
33183 rtx op1 = expand_normal (arg1);
33184 rtx op2 = expand_normal (arg2);
33185 rtx op3 = expand_normal (arg3);
33186 rtx op4 = expand_normal (arg4);
33187 machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
33188
33189 tmode0 = insn_data[d->icode].operand[0].mode;
33190 tmode1 = insn_data[d->icode].operand[1].mode;
33191 modev2 = insn_data[d->icode].operand[2].mode;
33192 modei3 = insn_data[d->icode].operand[3].mode;
33193 modev4 = insn_data[d->icode].operand[4].mode;
33194 modei5 = insn_data[d->icode].operand[5].mode;
33195 modeimm = insn_data[d->icode].operand[6].mode;
33196
33197 if (VECTOR_MODE_P (modev2))
33198 op0 = safe_vector_operand (op0, modev2);
33199 if (VECTOR_MODE_P (modev4))
33200 op2 = safe_vector_operand (op2, modev4);
33201
33202 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33203 op0 = copy_to_mode_reg (modev2, op0);
33204 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
33205 op1 = copy_to_mode_reg (modei3, op1);
33206 if ((optimize && !register_operand (op2, modev4))
33207 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
33208 op2 = copy_to_mode_reg (modev4, op2);
33209 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
33210 op3 = copy_to_mode_reg (modei5, op3);
33211
33212 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
33213 {
33214 error ("the fifth argument must be an 8-bit immediate");
33215 return const0_rtx;
33216 }
33217
33218 if (d->code == IX86_BUILTIN_PCMPESTRI128)
33219 {
33220 if (optimize || !target
33221 || GET_MODE (target) != tmode0
33222 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33223 target = gen_reg_rtx (tmode0);
33224
33225 scratch1 = gen_reg_rtx (tmode1);
33226
33227 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
33228 }
33229 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
33230 {
33231 if (optimize || !target
33232 || GET_MODE (target) != tmode1
33233 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33234 target = gen_reg_rtx (tmode1);
33235
33236 scratch0 = gen_reg_rtx (tmode0);
33237
33238 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
33239 }
33240 else
33241 {
33242 gcc_assert (d->flag);
33243
33244 scratch0 = gen_reg_rtx (tmode0);
33245 scratch1 = gen_reg_rtx (tmode1);
33246
33247 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
33248 }
33249
33250 if (! pat)
33251 return 0;
33252
33253 emit_insn (pat);
33254
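/* The flag-returning variants test FLAGS_REG in the CC mode given by
   d->flag and return the condition as a 0/1 value.  */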
33255 if (d->flag)
33256 {
33257 target = gen_reg_rtx (SImode);
33258 emit_move_insn (target, const0_rtx);
33259 target = gen_rtx_SUBREG (QImode, target, 0);
33260
33261 emit_insn
33262 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33263 gen_rtx_fmt_ee (EQ, QImode,
33264 gen_rtx_REG ((machine_mode) d->flag,
33265 FLAGS_REG),
33266 const0_rtx)));
33267 return SUBREG_REG (target);
33268 }
33269 else
33270 return target;
33271 }
33272
33273
33274 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
33275
33276 static rtx
33277 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
33278 tree exp, rtx target)
33279 {
33280 rtx pat;
33281 tree arg0 = CALL_EXPR_ARG (exp, 0);
33282 tree arg1 = CALL_EXPR_ARG (exp, 1);
33283 tree arg2 = CALL_EXPR_ARG (exp, 2);
33284 rtx scratch0, scratch1;
33285 rtx op0 = expand_normal (arg0);
33286 rtx op1 = expand_normal (arg1);
33287 rtx op2 = expand_normal (arg2);
33288 machine_mode tmode0, tmode1, modev2, modev3, modeimm;
33289
33290 tmode0 = insn_data[d->icode].operand[0].mode;
33291 tmode1 = insn_data[d->icode].operand[1].mode;
33292 modev2 = insn_data[d->icode].operand[2].mode;
33293 modev3 = insn_data[d->icode].operand[3].mode;
33294 modeimm = insn_data[d->icode].operand[4].mode;
33295
33296 if (VECTOR_MODE_P (modev2))
33297 op0 = safe_vector_operand (op0, modev2);
33298 if (VECTOR_MODE_P (modev3))
33299 op1 = safe_vector_operand (op1, modev3);
33300
33301 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
33302 op0 = copy_to_mode_reg (modev2, op0);
33303 if ((optimize && !register_operand (op1, modev3))
33304 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
33305 op1 = copy_to_mode_reg (modev3, op1);
33306
33307 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
33308 {
33309 error ("the third argument must be an 8-bit immediate");
33310 return const0_rtx;
33311 }
33312
33313 if (d->code == IX86_BUILTIN_PCMPISTRI128)
33314 {
33315 if (optimize || !target
33316 || GET_MODE (target) != tmode0
33317 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
33318 target = gen_reg_rtx (tmode0);
33319
33320 scratch1 = gen_reg_rtx (tmode1);
33321
33322 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
33323 }
33324 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
33325 {
33326 if (optimize || !target
33327 || GET_MODE (target) != tmode1
33328 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
33329 target = gen_reg_rtx (tmode1);
33330
33331 scratch0 = gen_reg_rtx (tmode0);
33332
33333 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
33334 }
33335 else
33336 {
33337 gcc_assert (d->flag);
33338
33339 scratch0 = gen_reg_rtx (tmode0);
33340 scratch1 = gen_reg_rtx (tmode1);
33341
33342 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
33343 }
33344
33345 if (! pat)
33346 return 0;
33347
33348 emit_insn (pat);
33349
33350 if (d->flag)
33351 {
33352 target = gen_reg_rtx (SImode);
33353 emit_move_insn (target, const0_rtx);
33354 target = gen_rtx_SUBREG (QImode, target, 0);
33355
33356 emit_insn
33357 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
33358 gen_rtx_fmt_ee (EQ, QImode,
33359 gen_rtx_REG ((machine_mode) d->flag,
33360 FLAGS_REG),
33361 const0_rtx)));
33362 return SUBREG_REG (target);
33363 }
33364 else
33365 return target;
33366 }
33367
33368 /* Subroutine of ix86_expand_builtin to take care of insns with a
33369 variable number of operands.  */
33370
33371 static rtx
33372 ix86_expand_args_builtin (const struct builtin_description *d,
33373 tree exp, rtx target)
33374 {
33375 rtx pat, real_target;
33376 unsigned int i, nargs;
33377 unsigned int nargs_constant = 0;
33378 unsigned int mask_pos = 0;
33379 int num_memory = 0;
33380 struct
33381 {
33382 rtx op;
33383 machine_mode mode;
33384 } args[6];
33385 bool second_arg_count = false;
33386 enum insn_code icode = d->icode;
33387 const struct insn_data_d *insn_p = &insn_data[icode];
33388 machine_mode tmode = insn_p->operand[0].mode;
33389 machine_mode rmode = VOIDmode;
33390 bool swap = false;
33391 enum rtx_code comparison = d->comparison;
33392
33393 switch ((enum ix86_builtin_func_type) d->flag)
33394 {
33395 case V2DF_FTYPE_V2DF_ROUND:
33396 case V4DF_FTYPE_V4DF_ROUND:
33397 case V8DF_FTYPE_V8DF_ROUND:
33398 case V4SF_FTYPE_V4SF_ROUND:
33399 case V8SF_FTYPE_V8SF_ROUND:
33400 case V16SF_FTYPE_V16SF_ROUND:
33401 case V4SI_FTYPE_V4SF_ROUND:
33402 case V8SI_FTYPE_V8SF_ROUND:
33403 case V16SI_FTYPE_V16SF_ROUND:
33404 return ix86_expand_sse_round (d, exp, target);
33405 case V4SI_FTYPE_V2DF_V2DF_ROUND:
33406 case V8SI_FTYPE_V4DF_V4DF_ROUND:
33407 case V16SI_FTYPE_V8DF_V8DF_ROUND:
33408 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
33409 case INT_FTYPE_V8SF_V8SF_PTEST:
33410 case INT_FTYPE_V4DI_V4DI_PTEST:
33411 case INT_FTYPE_V4DF_V4DF_PTEST:
33412 case INT_FTYPE_V4SF_V4SF_PTEST:
33413 case INT_FTYPE_V2DI_V2DI_PTEST:
33414 case INT_FTYPE_V2DF_V2DF_PTEST:
33415 return ix86_expand_sse_ptest (d, exp, target);
33416 case FLOAT128_FTYPE_FLOAT128:
33417 case FLOAT_FTYPE_FLOAT:
33418 case INT_FTYPE_INT:
33419 case UINT_FTYPE_UINT:
33420 case UINT16_FTYPE_UINT16:
33421 case UINT64_FTYPE_INT:
33422 case UINT64_FTYPE_UINT64:
33423 case INT64_FTYPE_INT64:
33424 case INT64_FTYPE_V4SF:
33425 case INT64_FTYPE_V2DF:
33426 case INT_FTYPE_V16QI:
33427 case INT_FTYPE_V8QI:
33428 case INT_FTYPE_V8SF:
33429 case INT_FTYPE_V4DF:
33430 case INT_FTYPE_V4SF:
33431 case INT_FTYPE_V2DF:
33432 case INT_FTYPE_V32QI:
33433 case V16QI_FTYPE_V16QI:
33434 case V8SI_FTYPE_V8SF:
33435 case V8SI_FTYPE_V4SI:
33436 case V8HI_FTYPE_V8HI:
33437 case V8HI_FTYPE_V16QI:
33438 case V8QI_FTYPE_V8QI:
33439 case V8SF_FTYPE_V8SF:
33440 case V8SF_FTYPE_V8SI:
33441 case V8SF_FTYPE_V4SF:
33442 case V8SF_FTYPE_V8HI:
33443 case V4SI_FTYPE_V4SI:
33444 case V4SI_FTYPE_V16QI:
33445 case V4SI_FTYPE_V4SF:
33446 case V4SI_FTYPE_V8SI:
33447 case V4SI_FTYPE_V8HI:
33448 case V4SI_FTYPE_V4DF:
33449 case V4SI_FTYPE_V2DF:
33450 case V4HI_FTYPE_V4HI:
33451 case V4DF_FTYPE_V4DF:
33452 case V4DF_FTYPE_V4SI:
33453 case V4DF_FTYPE_V4SF:
33454 case V4DF_FTYPE_V2DF:
33455 case V4SF_FTYPE_V4SF:
33456 case V4SF_FTYPE_V4SI:
33457 case V4SF_FTYPE_V8SF:
33458 case V4SF_FTYPE_V4DF:
33459 case V4SF_FTYPE_V8HI:
33460 case V4SF_FTYPE_V2DF:
33461 case V2DI_FTYPE_V2DI:
33462 case V2DI_FTYPE_V16QI:
33463 case V2DI_FTYPE_V8HI:
33464 case V2DI_FTYPE_V4SI:
33465 case V2DF_FTYPE_V2DF:
33466 case V2DF_FTYPE_V4SI:
33467 case V2DF_FTYPE_V4DF:
33468 case V2DF_FTYPE_V4SF:
33469 case V2DF_FTYPE_V2SI:
33470 case V2SI_FTYPE_V2SI:
33471 case V2SI_FTYPE_V4SF:
33472 case V2SI_FTYPE_V2SF:
33473 case V2SI_FTYPE_V2DF:
33474 case V2SF_FTYPE_V2SF:
33475 case V2SF_FTYPE_V2SI:
33476 case V32QI_FTYPE_V32QI:
33477 case V32QI_FTYPE_V16QI:
33478 case V16HI_FTYPE_V16HI:
33479 case V16HI_FTYPE_V8HI:
33480 case V8SI_FTYPE_V8SI:
33481 case V16HI_FTYPE_V16QI:
33482 case V8SI_FTYPE_V16QI:
33483 case V4DI_FTYPE_V16QI:
33484 case V8SI_FTYPE_V8HI:
33485 case V4DI_FTYPE_V8HI:
33486 case V4DI_FTYPE_V4SI:
33487 case V4DI_FTYPE_V2DI:
33488 case UQI_FTYPE_UQI:
33489 case UHI_FTYPE_UHI:
33490 case USI_FTYPE_USI:
33491 case USI_FTYPE_UQI:
33492 case USI_FTYPE_UHI:
33493 case UDI_FTYPE_UDI:
33494 case UHI_FTYPE_V16QI:
33495 case USI_FTYPE_V32QI:
33496 case UDI_FTYPE_V64QI:
33497 case V16QI_FTYPE_UHI:
33498 case V32QI_FTYPE_USI:
33499 case V64QI_FTYPE_UDI:
33500 case V8HI_FTYPE_UQI:
33501 case V16HI_FTYPE_UHI:
33502 case V32HI_FTYPE_USI:
33503 case V4SI_FTYPE_UQI:
33504 case V8SI_FTYPE_UQI:
33505 case V4SI_FTYPE_UHI:
33506 case V8SI_FTYPE_UHI:
33507 case UQI_FTYPE_V8HI:
33508 case UHI_FTYPE_V16HI:
33509 case USI_FTYPE_V32HI:
33510 case UQI_FTYPE_V4SI:
33511 case UQI_FTYPE_V8SI:
33512 case UHI_FTYPE_V16SI:
33513 case UQI_FTYPE_V2DI:
33514 case UQI_FTYPE_V4DI:
33515 case UQI_FTYPE_V8DI:
33516 case V16SI_FTYPE_UHI:
33517 case V2DI_FTYPE_UQI:
33518 case V4DI_FTYPE_UQI:
33519 case V16SI_FTYPE_INT:
33520 case V16SF_FTYPE_V8SF:
33521 case V16SI_FTYPE_V8SI:
33522 case V16SF_FTYPE_V4SF:
33523 case V16SI_FTYPE_V4SI:
33524 case V16SI_FTYPE_V16SF:
33525 case V16SI_FTYPE_V16SI:
33526 case V16SF_FTYPE_V16SF:
33527 case V8DI_FTYPE_UQI:
33528 case V8DI_FTYPE_V8DI:
33529 case V8DF_FTYPE_V4DF:
33530 case V8DF_FTYPE_V2DF:
33531 case V8DF_FTYPE_V8DF:
33532 nargs = 1;
33533 break;
33534 case V4SF_FTYPE_V4SF_VEC_MERGE:
33535 case V2DF_FTYPE_V2DF_VEC_MERGE:
33536 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
33537 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
33538 case V16QI_FTYPE_V16QI_V16QI:
33539 case V16QI_FTYPE_V8HI_V8HI:
33540 case V16SF_FTYPE_V16SF_V16SF:
33541 case V8QI_FTYPE_V8QI_V8QI:
33542 case V8QI_FTYPE_V4HI_V4HI:
33543 case V8HI_FTYPE_V8HI_V8HI:
33544 case V8HI_FTYPE_V16QI_V16QI:
33545 case V8HI_FTYPE_V4SI_V4SI:
33546 case V8SF_FTYPE_V8SF_V8SF:
33547 case V8SF_FTYPE_V8SF_V8SI:
33548 case V8DF_FTYPE_V8DF_V8DF:
33549 case V4SI_FTYPE_V4SI_V4SI:
33550 case V4SI_FTYPE_V8HI_V8HI:
33551 case V4SI_FTYPE_V2DF_V2DF:
33552 case V4HI_FTYPE_V4HI_V4HI:
33553 case V4HI_FTYPE_V8QI_V8QI:
33554 case V4HI_FTYPE_V2SI_V2SI:
33555 case V4DF_FTYPE_V4DF_V4DF:
33556 case V4DF_FTYPE_V4DF_V4DI:
33557 case V4SF_FTYPE_V4SF_V4SF:
33558 case V4SF_FTYPE_V4SF_V4SI:
33559 case V4SF_FTYPE_V4SF_V2SI:
33560 case V4SF_FTYPE_V4SF_V2DF:
33561 case V4SF_FTYPE_V4SF_UINT:
33562 case V4SF_FTYPE_V4SF_DI:
33563 case V4SF_FTYPE_V4SF_SI:
33564 case V2DI_FTYPE_V2DI_V2DI:
33565 case V2DI_FTYPE_V16QI_V16QI:
33566 case V2DI_FTYPE_V4SI_V4SI:
33567 case V2DI_FTYPE_V2DI_V16QI:
33568 case V2SI_FTYPE_V2SI_V2SI:
33569 case V2SI_FTYPE_V4HI_V4HI:
33570 case V2SI_FTYPE_V2SF_V2SF:
33571 case V2DF_FTYPE_V2DF_V2DF:
33572 case V2DF_FTYPE_V2DF_V4SF:
33573 case V2DF_FTYPE_V2DF_V2DI:
33574 case V2DF_FTYPE_V2DF_DI:
33575 case V2DF_FTYPE_V2DF_SI:
33576 case V2DF_FTYPE_V2DF_UINT:
33577 case V2SF_FTYPE_V2SF_V2SF:
33578 case V1DI_FTYPE_V1DI_V1DI:
33579 case V1DI_FTYPE_V8QI_V8QI:
33580 case V1DI_FTYPE_V2SI_V2SI:
33581 case V32QI_FTYPE_V16HI_V16HI:
33582 case V16HI_FTYPE_V8SI_V8SI:
33583 case V64QI_FTYPE_V64QI_V64QI:
33584 case V32QI_FTYPE_V32QI_V32QI:
33585 case V16HI_FTYPE_V32QI_V32QI:
33586 case V16HI_FTYPE_V16HI_V16HI:
33587 case V8SI_FTYPE_V4DF_V4DF:
33588 case V8SI_FTYPE_V8SI_V8SI:
33589 case V8SI_FTYPE_V16HI_V16HI:
33590 case V4DI_FTYPE_V4DI_V4DI:
33591 case V4DI_FTYPE_V8SI_V8SI:
33592 case V8DI_FTYPE_V64QI_V64QI:
33593 if (comparison == UNKNOWN)
33594 return ix86_expand_binop_builtin (icode, exp, target);
33595 nargs = 2;
33596 break;
33597 case V4SF_FTYPE_V4SF_V4SF_SWAP:
33598 case V2DF_FTYPE_V2DF_V2DF_SWAP:
33599 gcc_assert (comparison != UNKNOWN);
33600 nargs = 2;
33601 swap = true;
33602 break;
33603 case V16HI_FTYPE_V16HI_V8HI_COUNT:
33604 case V16HI_FTYPE_V16HI_SI_COUNT:
33605 case V8SI_FTYPE_V8SI_V4SI_COUNT:
33606 case V8SI_FTYPE_V8SI_SI_COUNT:
33607 case V4DI_FTYPE_V4DI_V2DI_COUNT:
33608 case V4DI_FTYPE_V4DI_INT_COUNT:
33609 case V8HI_FTYPE_V8HI_V8HI_COUNT:
33610 case V8HI_FTYPE_V8HI_SI_COUNT:
33611 case V4SI_FTYPE_V4SI_V4SI_COUNT:
33612 case V4SI_FTYPE_V4SI_SI_COUNT:
33613 case V4HI_FTYPE_V4HI_V4HI_COUNT:
33614 case V4HI_FTYPE_V4HI_SI_COUNT:
33615 case V2DI_FTYPE_V2DI_V2DI_COUNT:
33616 case V2DI_FTYPE_V2DI_SI_COUNT:
33617 case V2SI_FTYPE_V2SI_V2SI_COUNT:
33618 case V2SI_FTYPE_V2SI_SI_COUNT:
33619 case V1DI_FTYPE_V1DI_V1DI_COUNT:
33620 case V1DI_FTYPE_V1DI_SI_COUNT:
33621 nargs = 2;
33622 second_arg_count = true;
33623 break;
33624 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
33625 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
33626 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
33627 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
33628 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
33629 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
33630 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
33631 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
33632 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
33633 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
33634 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
33635 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
33636 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
33637 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
33638 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
33639 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
33640 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
33641 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
33642 nargs = 4;
33643 second_arg_count = true;
33644 break;
33645 case UINT64_FTYPE_UINT64_UINT64:
33646 case UINT_FTYPE_UINT_UINT:
33647 case UINT_FTYPE_UINT_USHORT:
33648 case UINT_FTYPE_UINT_UCHAR:
33649 case UINT16_FTYPE_UINT16_INT:
33650 case UINT8_FTYPE_UINT8_INT:
33651 case UQI_FTYPE_UQI_UQI:
33652 case UHI_FTYPE_UHI_UHI:
33653 case USI_FTYPE_USI_USI:
33654 case UDI_FTYPE_UDI_UDI:
33655 case V16SI_FTYPE_V8DF_V8DF:
33656 nargs = 2;
33657 break;
33658 case V2DI_FTYPE_V2DI_INT_CONVERT:
33659 nargs = 2;
33660 rmode = V1TImode;
33661 nargs_constant = 1;
33662 break;
33663 case V4DI_FTYPE_V4DI_INT_CONVERT:
33664 nargs = 2;
33665 rmode = V2TImode;
33666 nargs_constant = 1;
33667 break;
33668 case V8DI_FTYPE_V8DI_INT_CONVERT:
33669 nargs = 2;
33670 rmode = V4TImode;
33671 nargs_constant = 1;
33672 break;
33673 case V8HI_FTYPE_V8HI_INT:
33674 case V8HI_FTYPE_V8SF_INT:
33675 case V16HI_FTYPE_V16SF_INT:
33676 case V8HI_FTYPE_V4SF_INT:
33677 case V8SF_FTYPE_V8SF_INT:
33678 case V4SF_FTYPE_V16SF_INT:
33679 case V16SF_FTYPE_V16SF_INT:
33680 case V4SI_FTYPE_V4SI_INT:
33681 case V4SI_FTYPE_V8SI_INT:
33682 case V4HI_FTYPE_V4HI_INT:
33683 case V4DF_FTYPE_V4DF_INT:
33684 case V4DF_FTYPE_V8DF_INT:
33685 case V4SF_FTYPE_V4SF_INT:
33686 case V4SF_FTYPE_V8SF_INT:
33687 case V2DI_FTYPE_V2DI_INT:
33688 case V2DF_FTYPE_V2DF_INT:
33689 case V2DF_FTYPE_V4DF_INT:
33690 case V16HI_FTYPE_V16HI_INT:
33691 case V8SI_FTYPE_V8SI_INT:
33692 case V16SI_FTYPE_V16SI_INT:
33693 case V4SI_FTYPE_V16SI_INT:
33694 case V4DI_FTYPE_V4DI_INT:
33695 case V2DI_FTYPE_V4DI_INT:
33696 case V4DI_FTYPE_V8DI_INT:
33697 case QI_FTYPE_V4SF_INT:
33698 case QI_FTYPE_V2DF_INT:
33699 case UQI_FTYPE_UQI_UQI_CONST:
33700 case UHI_FTYPE_UHI_UQI:
33701 case USI_FTYPE_USI_UQI:
33702 case UDI_FTYPE_UDI_UQI:
33703 nargs = 2;
33704 nargs_constant = 1;
33705 break;
33706 case V16QI_FTYPE_V16QI_V16QI_V16QI:
33707 case V8SF_FTYPE_V8SF_V8SF_V8SF:
33708 case V4DF_FTYPE_V4DF_V4DF_V4DF:
33709 case V4SF_FTYPE_V4SF_V4SF_V4SF:
33710 case V2DF_FTYPE_V2DF_V2DF_V2DF:
33711 case V32QI_FTYPE_V32QI_V32QI_V32QI:
33712 case UHI_FTYPE_V16SI_V16SI_UHI:
33713 case UQI_FTYPE_V8DI_V8DI_UQI:
33714 case V16HI_FTYPE_V16SI_V16HI_UHI:
33715 case V16QI_FTYPE_V16SI_V16QI_UHI:
33716 case V16QI_FTYPE_V8DI_V16QI_UQI:
33717 case V16SF_FTYPE_V16SF_V16SF_UHI:
33718 case V16SF_FTYPE_V4SF_V16SF_UHI:
33719 case V16SI_FTYPE_SI_V16SI_UHI:
33720 case V16SI_FTYPE_V16HI_V16SI_UHI:
33721 case V16SI_FTYPE_V16QI_V16SI_UHI:
33722 case V8SF_FTYPE_V4SF_V8SF_UQI:
33723 case V4DF_FTYPE_V2DF_V4DF_UQI:
33724 case V8SI_FTYPE_V4SI_V8SI_UQI:
33725 case V8SI_FTYPE_SI_V8SI_UQI:
33726 case V4SI_FTYPE_V4SI_V4SI_UQI:
33727 case V4SI_FTYPE_SI_V4SI_UQI:
33728 case V4DI_FTYPE_V2DI_V4DI_UQI:
33729 case V4DI_FTYPE_DI_V4DI_UQI:
33730 case V2DI_FTYPE_V2DI_V2DI_UQI:
33731 case V2DI_FTYPE_DI_V2DI_UQI:
33732 case V64QI_FTYPE_V64QI_V64QI_UDI:
33733 case V64QI_FTYPE_V16QI_V64QI_UDI:
33734 case V64QI_FTYPE_QI_V64QI_UDI:
33735 case V32QI_FTYPE_V32QI_V32QI_USI:
33736 case V32QI_FTYPE_V16QI_V32QI_USI:
33737 case V32QI_FTYPE_QI_V32QI_USI:
33738 case V16QI_FTYPE_V16QI_V16QI_UHI:
33739 case V16QI_FTYPE_QI_V16QI_UHI:
33740 case V32HI_FTYPE_V8HI_V32HI_USI:
33741 case V32HI_FTYPE_HI_V32HI_USI:
33742 case V16HI_FTYPE_V8HI_V16HI_UHI:
33743 case V16HI_FTYPE_HI_V16HI_UHI:
33744 case V8HI_FTYPE_V8HI_V8HI_UQI:
33745 case V8HI_FTYPE_HI_V8HI_UQI:
33746 case V8SF_FTYPE_V8HI_V8SF_UQI:
33747 case V4SF_FTYPE_V8HI_V4SF_UQI:
33748 case V8SI_FTYPE_V8SF_V8SI_UQI:
33749 case V4SI_FTYPE_V4SF_V4SI_UQI:
33750 case V4DI_FTYPE_V4SF_V4DI_UQI:
33751 case V2DI_FTYPE_V4SF_V2DI_UQI:
33752 case V4SF_FTYPE_V4DI_V4SF_UQI:
33753 case V4SF_FTYPE_V2DI_V4SF_UQI:
33754 case V4DF_FTYPE_V4DI_V4DF_UQI:
33755 case V2DF_FTYPE_V2DI_V2DF_UQI:
33756 case V16QI_FTYPE_V8HI_V16QI_UQI:
33757 case V16QI_FTYPE_V16HI_V16QI_UHI:
33758 case V16QI_FTYPE_V4SI_V16QI_UQI:
33759 case V16QI_FTYPE_V8SI_V16QI_UQI:
33760 case V8HI_FTYPE_V4SI_V8HI_UQI:
33761 case V8HI_FTYPE_V8SI_V8HI_UQI:
33762 case V16QI_FTYPE_V2DI_V16QI_UQI:
33763 case V16QI_FTYPE_V4DI_V16QI_UQI:
33764 case V8HI_FTYPE_V2DI_V8HI_UQI:
33765 case V8HI_FTYPE_V4DI_V8HI_UQI:
33766 case V4SI_FTYPE_V2DI_V4SI_UQI:
33767 case V4SI_FTYPE_V4DI_V4SI_UQI:
33768 case V32QI_FTYPE_V32HI_V32QI_USI:
33769 case UHI_FTYPE_V16QI_V16QI_UHI:
33770 case USI_FTYPE_V32QI_V32QI_USI:
33771 case UDI_FTYPE_V64QI_V64QI_UDI:
33772 case UQI_FTYPE_V8HI_V8HI_UQI:
33773 case UHI_FTYPE_V16HI_V16HI_UHI:
33774 case USI_FTYPE_V32HI_V32HI_USI:
33775 case UQI_FTYPE_V4SI_V4SI_UQI:
33776 case UQI_FTYPE_V8SI_V8SI_UQI:
33777 case UQI_FTYPE_V2DI_V2DI_UQI:
33778 case UQI_FTYPE_V4DI_V4DI_UQI:
33779 case V4SF_FTYPE_V2DF_V4SF_UQI:
33780 case V4SF_FTYPE_V4DF_V4SF_UQI:
33781 case V16SI_FTYPE_V16SI_V16SI_UHI:
33782 case V16SI_FTYPE_V4SI_V16SI_UHI:
33783 case V2DI_FTYPE_V4SI_V2DI_UQI:
33784 case V2DI_FTYPE_V8HI_V2DI_UQI:
33785 case V2DI_FTYPE_V16QI_V2DI_UQI:
33786 case V4DI_FTYPE_V4DI_V4DI_UQI:
33787 case V4DI_FTYPE_V4SI_V4DI_UQI:
33788 case V4DI_FTYPE_V8HI_V4DI_UQI:
33789 case V4DI_FTYPE_V16QI_V4DI_UQI:
33790 case V4DI_FTYPE_V4DF_V4DI_UQI:
33791 case V2DI_FTYPE_V2DF_V2DI_UQI:
33792 case V4SI_FTYPE_V4DF_V4SI_UQI:
33793 case V4SI_FTYPE_V2DF_V4SI_UQI:
33794 case V4SI_FTYPE_V8HI_V4SI_UQI:
33795 case V4SI_FTYPE_V16QI_V4SI_UQI:
33796 case V4DI_FTYPE_V4DI_V4DI_V4DI:
33797 case V8DF_FTYPE_V2DF_V8DF_UQI:
33798 case V8DF_FTYPE_V4DF_V8DF_UQI:
33799 case V8DF_FTYPE_V8DF_V8DF_UQI:
33800 case V8SF_FTYPE_V8SF_V8SF_UQI:
33801 case V8SF_FTYPE_V8SI_V8SF_UQI:
33802 case V4DF_FTYPE_V4DF_V4DF_UQI:
33803 case V4SF_FTYPE_V4SF_V4SF_UQI:
33804 case V2DF_FTYPE_V2DF_V2DF_UQI:
33805 case V2DF_FTYPE_V4SF_V2DF_UQI:
33806 case V2DF_FTYPE_V4SI_V2DF_UQI:
33807 case V4SF_FTYPE_V4SI_V4SF_UQI:
33808 case V4DF_FTYPE_V4SF_V4DF_UQI:
33809 case V4DF_FTYPE_V4SI_V4DF_UQI:
33810 case V8SI_FTYPE_V8SI_V8SI_UQI:
33811 case V8SI_FTYPE_V8HI_V8SI_UQI:
33812 case V8SI_FTYPE_V16QI_V8SI_UQI:
33813 case V8DF_FTYPE_V8SI_V8DF_UQI:
33814 case V8DI_FTYPE_DI_V8DI_UQI:
33815 case V16SF_FTYPE_V8SF_V16SF_UHI:
33816 case V16SI_FTYPE_V8SI_V16SI_UHI:
33817 case V16HI_FTYPE_V16HI_V16HI_UHI:
33818 case V8HI_FTYPE_V16QI_V8HI_UQI:
33819 case V16HI_FTYPE_V16QI_V16HI_UHI:
33820 case V32HI_FTYPE_V32HI_V32HI_USI:
33821 case V32HI_FTYPE_V32QI_V32HI_USI:
33822 case V8DI_FTYPE_V16QI_V8DI_UQI:
33823 case V8DI_FTYPE_V2DI_V8DI_UQI:
33824 case V8DI_FTYPE_V4DI_V8DI_UQI:
33825 case V8DI_FTYPE_V8DI_V8DI_UQI:
33826 case V8DI_FTYPE_V8HI_V8DI_UQI:
33827 case V8DI_FTYPE_V8SI_V8DI_UQI:
33828 case V8HI_FTYPE_V8DI_V8HI_UQI:
33829 case V8SI_FTYPE_V8DI_V8SI_UQI:
33830 case V4SI_FTYPE_V4SI_V4SI_V4SI:
33831 case V16SI_FTYPE_V16SI_V16SI_V16SI:
33832 case V8DI_FTYPE_V8DI_V8DI_V8DI:
33833 case V32HI_FTYPE_V32HI_V32HI_V32HI:
33834 case V2DI_FTYPE_V2DI_V2DI_V2DI:
33835 case V16HI_FTYPE_V16HI_V16HI_V16HI:
33836 case V8SI_FTYPE_V8SI_V8SI_V8SI:
33837 case V8HI_FTYPE_V8HI_V8HI_V8HI:
33838 nargs = 3;
33839 break;
33840 case V32QI_FTYPE_V32QI_V32QI_INT:
33841 case V16HI_FTYPE_V16HI_V16HI_INT:
33842 case V16QI_FTYPE_V16QI_V16QI_INT:
33843 case V4DI_FTYPE_V4DI_V4DI_INT:
33844 case V8HI_FTYPE_V8HI_V8HI_INT:
33845 case V8SI_FTYPE_V8SI_V8SI_INT:
33846 case V8SI_FTYPE_V8SI_V4SI_INT:
33847 case V8SF_FTYPE_V8SF_V8SF_INT:
33848 case V8SF_FTYPE_V8SF_V4SF_INT:
33849 case V4SI_FTYPE_V4SI_V4SI_INT:
33850 case V4DF_FTYPE_V4DF_V4DF_INT:
33851 case V16SF_FTYPE_V16SF_V16SF_INT:
33852 case V16SF_FTYPE_V16SF_V4SF_INT:
33853 case V16SI_FTYPE_V16SI_V4SI_INT:
33854 case V4DF_FTYPE_V4DF_V2DF_INT:
33855 case V4SF_FTYPE_V4SF_V4SF_INT:
33856 case V2DI_FTYPE_V2DI_V2DI_INT:
33857 case V4DI_FTYPE_V4DI_V2DI_INT:
33858 case V2DF_FTYPE_V2DF_V2DF_INT:
33859 case UQI_FTYPE_V8DI_V8UDI_INT:
33860 case UQI_FTYPE_V8DF_V8DF_INT:
33861 case UQI_FTYPE_V2DF_V2DF_INT:
33862 case UQI_FTYPE_V4SF_V4SF_INT:
33863 case UHI_FTYPE_V16SI_V16SI_INT:
33864 case UHI_FTYPE_V16SF_V16SF_INT:
33865 case V64QI_FTYPE_V64QI_V64QI_INT:
33866 case V32HI_FTYPE_V32HI_V32HI_INT:
33867 case V16SI_FTYPE_V16SI_V16SI_INT:
33868 case V8DI_FTYPE_V8DI_V8DI_INT:
33869 nargs = 3;
33870 nargs_constant = 1;
33871 break;
33872 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
33873 nargs = 3;
33874 rmode = V4DImode;
33875 nargs_constant = 1;
33876 break;
33877 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
33878 nargs = 3;
33879 rmode = V2DImode;
33880 nargs_constant = 1;
33881 break;
33882 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
33883 nargs = 3;
33884 rmode = DImode;
33885 nargs_constant = 1;
33886 break;
33887 case V2DI_FTYPE_V2DI_UINT_UINT:
33888 nargs = 3;
33889 nargs_constant = 2;
33890 break;
33891 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
33892 nargs = 3;
33893 rmode = V8DImode;
33894 nargs_constant = 1;
33895 break;
33896 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
33897 nargs = 5;
33898 rmode = V8DImode;
33899 mask_pos = 2;
33900 nargs_constant = 1;
33901 break;
33902 case QI_FTYPE_V8DF_INT_UQI:
33903 case QI_FTYPE_V4DF_INT_UQI:
33904 case QI_FTYPE_V2DF_INT_UQI:
33905 case HI_FTYPE_V16SF_INT_UHI:
33906 case QI_FTYPE_V8SF_INT_UQI:
33907 case QI_FTYPE_V4SF_INT_UQI:
33908 nargs = 3;
33909 mask_pos = 1;
33910 nargs_constant = 1;
33911 break;
33912 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
33913 nargs = 5;
33914 rmode = V4DImode;
33915 mask_pos = 2;
33916 nargs_constant = 1;
33917 break;
33918 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
33919 nargs = 5;
33920 rmode = V2DImode;
33921 mask_pos = 2;
33922 nargs_constant = 1;
33923 break;
33924 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
33925 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
33926 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
33927 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
33928 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
33929 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
33930 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
33931 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
33932 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
33933 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
33934 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
33935 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
33936 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
33937 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
33938 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
33939 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
33940 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
33941 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
33942 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
33943 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
33944 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
33945 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
33946 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
33947 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
33948 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
33949 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
33950 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
33951 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
33952 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
33953 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
33954 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
33955 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
33956 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
33957 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
33958 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
33959 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
33960 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
33961 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
33962 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
33963 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
33964 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
33965 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
33966 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
33967 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
33968 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
33969 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
33970 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
33971 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
33972 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
33973 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
33974 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
33975 nargs = 4;
33976 break;
33977 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
33978 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
33979 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
33980 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
33981 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
33982 nargs = 4;
33983 nargs_constant = 1;
33984 break;
33985 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
33986 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
33987 case QI_FTYPE_V4DF_V4DF_INT_UQI:
33988 case QI_FTYPE_V8SF_V8SF_INT_UQI:
33989 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
33990 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
33991 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
33992 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
33993 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
33994 case USI_FTYPE_V32QI_V32QI_INT_USI:
33995 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
33996 case USI_FTYPE_V32HI_V32HI_INT_USI:
33997 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
33998 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
33999 case V32HI_FTYPE_V32HI_V32HI_V32HI_INT:
34000 case V16HI_FTYPE_V16HI_V16HI_V16HI_INT:
34001 case V8HI_FTYPE_V8HI_V8HI_V8HI_INT:
34002 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT:
34003 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT:
34004 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT:
34005 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT:
34006 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT:
34007 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
34008 nargs = 4;
34009 mask_pos = 1;
34010 nargs_constant = 1;
34011 break;
34012 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
34013 nargs = 4;
34014 nargs_constant = 2;
34015 break;
34016 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
34017 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
34018 nargs = 4;
34019 break;
34020 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
34021 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
34022 mask_pos = 1;
34023 nargs = 4;
34024 nargs_constant = 1;
34025 break;
34026 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
34027 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
34028 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
34029 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
34030 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
34031 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
34032 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
34033 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
34034 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
34035 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
34036 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
34037 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
34038 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
34039 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
34040 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
34041 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
34042 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
34043 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
34044 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
34045 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
34046 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
34047 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
34048 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
34049 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
34050 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
34051 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
34052 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
34053 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
34054 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
34055 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
34056 nargs = 4;
34057 mask_pos = 2;
34058 nargs_constant = 1;
34059 break;
34060 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
34061 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
34062 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
34063 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
34064 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
34065 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
34066 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
34067 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
34068 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
34069 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
34070 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
34071 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
34072 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
34073 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
34074 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
34075 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
34076 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
34077 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
34078 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
34079 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
34080 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
34081 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
34082 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
34083 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
34084 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
34085 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
34086 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
34087 nargs = 5;
34088 mask_pos = 2;
34089 nargs_constant = 1;
34090 break;
34091 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
34092 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
34093 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
34094 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
34095 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
34096 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
34097 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
34098 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
34099 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
34100 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
34101 nargs = 5;
34102 mask_pos = 1;
34103 nargs_constant = 1;
34104 break;
34105 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
34106 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
34107 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
34108 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
34109 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
34110 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
34111 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
34112 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
34113 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
34114 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
34115 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
34116 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
34117 nargs = 5;
34118 mask_pos = 1;
34119 nargs_constant = 2;
34120 break;
34121
34122 default:
34123 gcc_unreachable ();
34124 }
34125
34126 gcc_assert (nargs <= ARRAY_SIZE (args));
34127
34128 if (comparison != UNKNOWN)
34129 {
34130 gcc_assert (nargs == 2);
34131 return ix86_expand_sse_compare (d, exp, target, swap);
34132 }
34133
34134 if (rmode == VOIDmode || rmode == tmode)
34135 {
34136 if (optimize
34137 || target == 0
34138 || GET_MODE (target) != tmode
34139 || !insn_p->operand[0].predicate (target, tmode))
34140 target = gen_reg_rtx (tmode);
34141 else if (memory_operand (target, tmode))
34142 num_memory++;
34143 real_target = target;
34144 }
34145 else
34146 {
34147 real_target = gen_reg_rtx (tmode);
34148 target = lowpart_subreg (rmode, real_target, tmode);
34149 }
34150
34151 for (i = 0; i < nargs; i++)
34152 {
34153 tree arg = CALL_EXPR_ARG (exp, i);
34154 rtx op = expand_normal (arg);
34155 machine_mode mode = insn_p->operand[i + 1].mode;
34156 bool match = insn_p->operand[i + 1].predicate (op, mode);
34157
34158 if (second_arg_count && i == 1)
34159 {
34160 /* SIMD shift insns take either an 8-bit immediate or a
34161 register as the count, but the builtin functions take an int
34162 as the count.  If the count doesn't match, put it in a register.
34163 The instructions use a 64-bit count; if op is only
34164 32-bit, zero-extend it, since negative shift counts
34165 are undefined behavior and zero extension is more
34166 efficient.  */
34167 if (!match)
34168 {
34169 if (SCALAR_INT_MODE_P (GET_MODE (op)))
34170 op = convert_modes (mode, GET_MODE (op), op, 1);
34171 else
34172 op = lowpart_subreg (mode, op, GET_MODE (op));
34173 if (!insn_p->operand[i + 1].predicate (op, mode))
34174 op = copy_to_reg (op);
34175 }
34176 }
 34177 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
 34178 || (!mask_pos && (nargs - i) <= nargs_constant))
34179 {
34180 if (!match)
34181 switch (icode)
34182 {
34183 case CODE_FOR_avx_vinsertf128v4di:
34184 case CODE_FOR_avx_vextractf128v4di:
34185 error ("the last argument must be an 1-bit immediate");
34186 return const0_rtx;
34187
34188 case CODE_FOR_avx512f_cmpv8di3_mask:
34189 case CODE_FOR_avx512f_cmpv16si3_mask:
34190 case CODE_FOR_avx512f_ucmpv8di3_mask:
34191 case CODE_FOR_avx512f_ucmpv16si3_mask:
34192 case CODE_FOR_avx512vl_cmpv4di3_mask:
34193 case CODE_FOR_avx512vl_cmpv8si3_mask:
34194 case CODE_FOR_avx512vl_ucmpv4di3_mask:
34195 case CODE_FOR_avx512vl_ucmpv8si3_mask:
34196 case CODE_FOR_avx512vl_cmpv2di3_mask:
34197 case CODE_FOR_avx512vl_cmpv4si3_mask:
34198 case CODE_FOR_avx512vl_ucmpv2di3_mask:
34199 case CODE_FOR_avx512vl_ucmpv4si3_mask:
34200 error ("the last argument must be a 3-bit immediate");
34201 return const0_rtx;
34202
34203 case CODE_FOR_sse4_1_roundsd:
34204 case CODE_FOR_sse4_1_roundss:
34205
34206 case CODE_FOR_sse4_1_roundpd:
34207 case CODE_FOR_sse4_1_roundps:
34208 case CODE_FOR_avx_roundpd256:
34209 case CODE_FOR_avx_roundps256:
34210
34211 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
34212 case CODE_FOR_sse4_1_roundps_sfix:
34213 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
34214 case CODE_FOR_avx_roundps_sfix256:
34215
34216 case CODE_FOR_sse4_1_blendps:
34217 case CODE_FOR_avx_blendpd256:
34218 case CODE_FOR_avx_vpermilv4df:
34219 case CODE_FOR_avx_vpermilv4df_mask:
34220 case CODE_FOR_avx512f_getmantv8df_mask:
34221 case CODE_FOR_avx512f_getmantv16sf_mask:
34222 case CODE_FOR_avx512vl_getmantv8sf_mask:
34223 case CODE_FOR_avx512vl_getmantv4df_mask:
34224 case CODE_FOR_avx512vl_getmantv4sf_mask:
34225 case CODE_FOR_avx512vl_getmantv2df_mask:
34226 case CODE_FOR_avx512dq_rangepv8df_mask_round:
34227 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
34228 case CODE_FOR_avx512dq_rangepv4df_mask:
34229 case CODE_FOR_avx512dq_rangepv8sf_mask:
34230 case CODE_FOR_avx512dq_rangepv2df_mask:
34231 case CODE_FOR_avx512dq_rangepv4sf_mask:
34232 case CODE_FOR_avx_shufpd256_mask:
34233 error ("the last argument must be a 4-bit immediate");
34234 return const0_rtx;
34235
34236 case CODE_FOR_sha1rnds4:
34237 case CODE_FOR_sse4_1_blendpd:
34238 case CODE_FOR_avx_vpermilv2df:
34239 case CODE_FOR_avx_vpermilv2df_mask:
34240 case CODE_FOR_xop_vpermil2v2df3:
34241 case CODE_FOR_xop_vpermil2v4sf3:
34242 case CODE_FOR_xop_vpermil2v4df3:
34243 case CODE_FOR_xop_vpermil2v8sf3:
34244 case CODE_FOR_avx512f_vinsertf32x4_mask:
34245 case CODE_FOR_avx512f_vinserti32x4_mask:
34246 case CODE_FOR_avx512f_vextractf32x4_mask:
34247 case CODE_FOR_avx512f_vextracti32x4_mask:
34248 case CODE_FOR_sse2_shufpd:
34249 case CODE_FOR_sse2_shufpd_mask:
34250 case CODE_FOR_avx512dq_shuf_f64x2_mask:
34251 case CODE_FOR_avx512dq_shuf_i64x2_mask:
34252 case CODE_FOR_avx512vl_shuf_i32x4_mask:
34253 case CODE_FOR_avx512vl_shuf_f32x4_mask:
34254 error ("the last argument must be a 2-bit immediate");
34255 return const0_rtx;
34256
34257 case CODE_FOR_avx_vextractf128v4df:
34258 case CODE_FOR_avx_vextractf128v8sf:
34259 case CODE_FOR_avx_vextractf128v8si:
34260 case CODE_FOR_avx_vinsertf128v4df:
34261 case CODE_FOR_avx_vinsertf128v8sf:
34262 case CODE_FOR_avx_vinsertf128v8si:
34263 case CODE_FOR_avx512f_vinsertf64x4_mask:
34264 case CODE_FOR_avx512f_vinserti64x4_mask:
34265 case CODE_FOR_avx512f_vextractf64x4_mask:
34266 case CODE_FOR_avx512f_vextracti64x4_mask:
34267 case CODE_FOR_avx512dq_vinsertf32x8_mask:
34268 case CODE_FOR_avx512dq_vinserti32x8_mask:
34269 case CODE_FOR_avx512vl_vinsertv4df:
34270 case CODE_FOR_avx512vl_vinsertv4di:
34271 case CODE_FOR_avx512vl_vinsertv8sf:
34272 case CODE_FOR_avx512vl_vinsertv8si:
34273 error ("the last argument must be a 1-bit immediate");
34274 return const0_rtx;
34275
34276 case CODE_FOR_avx_vmcmpv2df3:
34277 case CODE_FOR_avx_vmcmpv4sf3:
34278 case CODE_FOR_avx_cmpv2df3:
34279 case CODE_FOR_avx_cmpv4sf3:
34280 case CODE_FOR_avx_cmpv4df3:
34281 case CODE_FOR_avx_cmpv8sf3:
34282 case CODE_FOR_avx512f_cmpv8df3_mask:
34283 case CODE_FOR_avx512f_cmpv16sf3_mask:
34284 case CODE_FOR_avx512f_vmcmpv2df3_mask:
34285 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
34286 error ("the last argument must be a 5-bit immediate");
34287 return const0_rtx;
34288
34289 default:
34290 switch (nargs_constant)
34291 {
34292 case 2:
 34293 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant)
 34294 (!mask_pos && (nargs - i) == nargs_constant))
34295 {
34296 error ("the next to last argument must be an 8-bit immediate");
34297 break;
34298 }
34299 /* FALLTHRU */
34300 case 1:
34301 error ("the last argument must be an 8-bit immediate");
34302 break;
34303 default:
34304 gcc_unreachable ();
34305 }
34306 return const0_rtx;
34307 }
34308 }
34309 else
34310 {
34311 if (VECTOR_MODE_P (mode))
34312 op = safe_vector_operand (op, mode);
34313
34314 /* If we aren't optimizing, only allow one memory operand to
34315 be generated. */
34316 if (memory_operand (op, mode))
34317 num_memory++;
34318
34319 op = fixup_modeless_constant (op, mode);
34320
34321 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34322 {
34323 if (optimize || !match || num_memory > 1)
34324 op = copy_to_mode_reg (mode, op);
34325 }
34326 else
34327 {
34328 op = copy_to_reg (op);
34329 op = lowpart_subreg (mode, op, GET_MODE (op));
34330 }
34331 }
34332
34333 args[i].op = op;
34334 args[i].mode = mode;
34335 }
34336
34337 switch (nargs)
34338 {
34339 case 1:
34340 pat = GEN_FCN (icode) (real_target, args[0].op);
34341 break;
34342 case 2:
34343 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
34344 break;
34345 case 3:
34346 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34347 args[2].op);
34348 break;
34349 case 4:
34350 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34351 args[2].op, args[3].op);
34352 break;
34353 case 5:
34354 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34355 args[2].op, args[3].op, args[4].op);
34356 break;
34357 case 6:
34358 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
34359 args[2].op, args[3].op, args[4].op,
34360 args[5].op);
34361 break;
34362 default:
34363 gcc_unreachable ();
34364 }
34365
34366 if (! pat)
34367 return 0;
34368
34369 emit_insn (pat);
34370 return target;
34371 }
34372
 34373 /* Transform a pattern of the following layout:
 34374 (set A
 34375 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
 34376 into:
 34377 (set A B)
 34378 i.e. drop the embedded rounding operand C. */
34379
34380 static rtx
34381 ix86_erase_embedded_rounding (rtx pat)
34382 {
34383 if (GET_CODE (pat) == INSN)
34384 pat = PATTERN (pat);
34385
34386 gcc_assert (GET_CODE (pat) == SET);
34387 rtx src = SET_SRC (pat);
 34388 gcc_assert (GET_CODE (src) == UNSPEC
 34389 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
 34390 gcc_assert (XVECLEN (src, 0) == 2);
 34391 rtx p0 = XVECEXP (src, 0, 0);
34392 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
34393 return res;
34394 }
34395
34396 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
34397 with rounding. */
34398 static rtx
34399 ix86_expand_sse_comi_round (const struct builtin_description *d,
34400 tree exp, rtx target)
34401 {
34402 rtx pat, set_dst;
34403 tree arg0 = CALL_EXPR_ARG (exp, 0);
34404 tree arg1 = CALL_EXPR_ARG (exp, 1);
34405 tree arg2 = CALL_EXPR_ARG (exp, 2);
34406 tree arg3 = CALL_EXPR_ARG (exp, 3);
34407 rtx op0 = expand_normal (arg0);
34408 rtx op1 = expand_normal (arg1);
34409 rtx op2 = expand_normal (arg2);
34410 rtx op3 = expand_normal (arg3);
34411 enum insn_code icode = d->icode;
34412 const struct insn_data_d *insn_p = &insn_data[icode];
34413 machine_mode mode0 = insn_p->operand[0].mode;
34414 machine_mode mode1 = insn_p->operand[1].mode;
34415 enum rtx_code comparison = UNEQ;
34416 bool need_ucomi = false;
34417
34418 /* See avxintrin.h for values. */
34419 enum rtx_code comi_comparisons[32] =
34420 {
34421 UNEQ, GT, GE, UNORDERED, LTGT, UNLE, UNLT, ORDERED, UNEQ, UNLT,
34422 UNLE, LT, LTGT, GE, GT, LT, UNEQ, GT, GE, UNORDERED, LTGT, UNLE,
34423 UNLT, ORDERED, UNEQ, UNLT, UNLE, LT, LTGT, GE, GT, LT
34424 };
34425 bool need_ucomi_values[32] =
34426 {
34427 true, false, false, true, true, false, false, true,
34428 true, false, false, true, true, false, false, true,
34429 false, true, true, false, false, true, true, false,
34430 false, true, true, false, false, true, true, false
34431 };
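 /* Both tables are indexed by the predicate immediate: e.g. _CMP_EQ_OQ (0x00)
 maps to UNEQ using the quiet (ucomi) compare, while _CMP_EQ_OS (0x10) maps
 to UNEQ using the signalling (comi) compare. */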
34432
34433 if (!CONST_INT_P (op2))
34434 {
34435 error ("the third argument must be comparison constant");
34436 return const0_rtx;
34437 }
34438 if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
34439 {
34440 error ("incorrect comparison mode");
34441 return const0_rtx;
34442 }
34443
34444 if (!insn_p->operand[2].predicate (op3, SImode))
34445 {
34446 error ("incorrect rounding operand");
34447 return const0_rtx;
34448 }
34449
34450 comparison = comi_comparisons[INTVAL (op2)];
34451 need_ucomi = need_ucomi_values[INTVAL (op2)];
34452
34453 if (VECTOR_MODE_P (mode0))
34454 op0 = safe_vector_operand (op0, mode0);
34455 if (VECTOR_MODE_P (mode1))
34456 op1 = safe_vector_operand (op1, mode1);
34457
34458 target = gen_reg_rtx (SImode);
34459 emit_move_insn (target, const0_rtx);
34460 target = gen_rtx_SUBREG (QImode, target, 0);
34461
34462 if ((optimize && !register_operand (op0, mode0))
34463 || !insn_p->operand[0].predicate (op0, mode0))
34464 op0 = copy_to_mode_reg (mode0, op0);
34465 if ((optimize && !register_operand (op1, mode1))
34466 || !insn_p->operand[1].predicate (op1, mode1))
34467 op1 = copy_to_mode_reg (mode1, op1);
34468
34469 if (need_ucomi)
34470 icode = icode == CODE_FOR_sse_comi_round
34471 ? CODE_FOR_sse_ucomi_round
34472 : CODE_FOR_sse2_ucomi_round;
34473
34474 pat = GEN_FCN (icode) (op0, op1, op3);
34475 if (! pat)
34476 return 0;
34477
34478 /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
34479 if (INTVAL (op3) == NO_ROUND)
34480 {
34481 pat = ix86_erase_embedded_rounding (pat);
34482 if (! pat)
34483 return 0;
34484
34485 set_dst = SET_DEST (pat);
34486 }
34487 else
34488 {
34489 gcc_assert (GET_CODE (pat) == SET);
34490 set_dst = SET_DEST (pat);
34491 }
34492
34493 emit_insn (pat);
34494 emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
34495 gen_rtx_fmt_ee (comparison, QImode,
34496 set_dst,
34497 const0_rtx)));
34498
34499 return SUBREG_REG (target);
34500 }
34501
34502 static rtx
34503 ix86_expand_round_builtin (const struct builtin_description *d,
34504 tree exp, rtx target)
34505 {
34506 rtx pat;
34507 unsigned int i, nargs;
34508 struct
34509 {
34510 rtx op;
34511 machine_mode mode;
34512 } args[6];
34513 enum insn_code icode = d->icode;
34514 const struct insn_data_d *insn_p = &insn_data[icode];
34515 machine_mode tmode = insn_p->operand[0].mode;
34516 unsigned int nargs_constant = 0;
34517 unsigned int redundant_embed_rnd = 0;
34518
34519 switch ((enum ix86_builtin_func_type) d->flag)
34520 {
34521 case UINT64_FTYPE_V2DF_INT:
34522 case UINT64_FTYPE_V4SF_INT:
34523 case UINT_FTYPE_V2DF_INT:
34524 case UINT_FTYPE_V4SF_INT:
34525 case INT64_FTYPE_V2DF_INT:
34526 case INT64_FTYPE_V4SF_INT:
34527 case INT_FTYPE_V2DF_INT:
34528 case INT_FTYPE_V4SF_INT:
34529 nargs = 2;
34530 break;
34531 case V4SF_FTYPE_V4SF_UINT_INT:
34532 case V4SF_FTYPE_V4SF_UINT64_INT:
34533 case V2DF_FTYPE_V2DF_UINT64_INT:
34534 case V4SF_FTYPE_V4SF_INT_INT:
34535 case V4SF_FTYPE_V4SF_INT64_INT:
34536 case V2DF_FTYPE_V2DF_INT64_INT:
34537 case V4SF_FTYPE_V4SF_V4SF_INT:
34538 case V2DF_FTYPE_V2DF_V2DF_INT:
34539 case V4SF_FTYPE_V4SF_V2DF_INT:
34540 case V2DF_FTYPE_V2DF_V4SF_INT:
34541 nargs = 3;
34542 break;
34543 case V8SF_FTYPE_V8DF_V8SF_QI_INT:
34544 case V8DF_FTYPE_V8DF_V8DF_QI_INT:
34545 case V8SI_FTYPE_V8DF_V8SI_QI_INT:
34546 case V8DI_FTYPE_V8DF_V8DI_QI_INT:
34547 case V8SF_FTYPE_V8DI_V8SF_QI_INT:
34548 case V8DF_FTYPE_V8DI_V8DF_QI_INT:
34549 case V16SF_FTYPE_V16SF_V16SF_HI_INT:
34550 case V8DI_FTYPE_V8SF_V8DI_QI_INT:
34551 case V16SF_FTYPE_V16SI_V16SF_HI_INT:
34552 case V16SI_FTYPE_V16SF_V16SI_HI_INT:
34553 case V8DF_FTYPE_V8SF_V8DF_QI_INT:
34554 case V16SF_FTYPE_V16HI_V16SF_HI_INT:
34555 case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
34556 case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
34557 nargs = 4;
34558 break;
34559 case V4SF_FTYPE_V4SF_V4SF_INT_INT:
34560 case V2DF_FTYPE_V2DF_V2DF_INT_INT:
34561 nargs_constant = 2;
34562 nargs = 4;
34563 break;
34564 case INT_FTYPE_V4SF_V4SF_INT_INT:
34565 case INT_FTYPE_V2DF_V2DF_INT_INT:
34566 return ix86_expand_sse_comi_round (d, exp, target);
34567 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
34568 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
34569 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
34570 case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
34571 case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
34572 case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
34573 case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
34574 case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
34575 nargs = 5;
34576 break;
34577 case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
34578 case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
34579 nargs_constant = 4;
34580 nargs = 5;
34581 break;
34582 case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
34583 case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
34584 case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
34585 case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
34586 nargs_constant = 3;
34587 nargs = 5;
34588 break;
34589 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
34590 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
34591 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
34592 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
34593 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
34594 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
34595 nargs = 6;
34596 nargs_constant = 4;
34597 break;
34598 case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
34599 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
34600 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
34601 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
34602 nargs = 6;
34603 nargs_constant = 3;
34604 break;
34605 default:
34606 gcc_unreachable ();
34607 }
34608 gcc_assert (nargs <= ARRAY_SIZE (args));
34609
34610 if (optimize
34611 || target == 0
34612 || GET_MODE (target) != tmode
34613 || !insn_p->operand[0].predicate (target, tmode))
34614 target = gen_reg_rtx (tmode);
34615
34616 for (i = 0; i < nargs; i++)
34617 {
34618 tree arg = CALL_EXPR_ARG (exp, i);
34619 rtx op = expand_normal (arg);
34620 machine_mode mode = insn_p->operand[i + 1].mode;
34621 bool match = insn_p->operand[i + 1].predicate (op, mode);
34622
34623 if (i == nargs - nargs_constant)
34624 {
34625 if (!match)
34626 {
34627 switch (icode)
34628 {
34629 case CODE_FOR_avx512f_getmantv8df_mask_round:
34630 case CODE_FOR_avx512f_getmantv16sf_mask_round:
34631 case CODE_FOR_avx512f_vgetmantv2df_round:
34632 case CODE_FOR_avx512f_vgetmantv2df_mask_round:
34633 case CODE_FOR_avx512f_vgetmantv4sf_round:
34634 case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
34635 error ("the immediate argument must be a 4-bit immediate");
34636 return const0_rtx;
34637 case CODE_FOR_avx512f_cmpv8df3_mask_round:
34638 case CODE_FOR_avx512f_cmpv16sf3_mask_round:
34639 case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
34640 case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
34641 error ("the immediate argument must be a 5-bit immediate");
34642 return const0_rtx;
34643 default:
34644 error ("the immediate argument must be an 8-bit immediate");
34645 return const0_rtx;
34646 }
34647 }
34648 }
 34649 else if (i == nargs - 1)
34650 {
34651 if (!insn_p->operand[nargs].predicate (op, SImode))
34652 {
34653 error ("incorrect rounding operand");
34654 return const0_rtx;
34655 }
34656
 34657 /* If there is no rounding, use the normal version of the pattern. */
34658 if (INTVAL (op) == NO_ROUND)
34659 redundant_embed_rnd = 1;
34660 }
34661 else
34662 {
34663 if (VECTOR_MODE_P (mode))
34664 op = safe_vector_operand (op, mode);
34665
34666 op = fixup_modeless_constant (op, mode);
34667
34668 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
34669 {
34670 if (optimize || !match)
34671 op = copy_to_mode_reg (mode, op);
34672 }
34673 else
34674 {
34675 op = copy_to_reg (op);
34676 op = lowpart_subreg (mode, op, GET_MODE (op));
34677 }
34678 }
34679
34680 args[i].op = op;
34681 args[i].mode = mode;
34682 }
34683
34684 switch (nargs)
34685 {
34686 case 1:
34687 pat = GEN_FCN (icode) (target, args[0].op);
34688 break;
34689 case 2:
34690 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
34691 break;
34692 case 3:
34693 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34694 args[2].op);
34695 break;
34696 case 4:
34697 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34698 args[2].op, args[3].op);
34699 break;
34700 case 5:
34701 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34702 args[2].op, args[3].op, args[4].op);
34703 break;
34704 case 6:
34705 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
34706 args[2].op, args[3].op, args[4].op,
34707 args[5].op);
34708 break;
34709 default:
34710 gcc_unreachable ();
34711 }
34712
34713 if (!pat)
34714 return 0;
34715
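 /* A rounding argument of NO_ROUND still selects the *_round pattern, so
 strip the now-redundant UNSPEC_EMBEDDED_ROUNDING wrapper to recover the
 plain form of the insn. */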
34716 if (redundant_embed_rnd)
34717 pat = ix86_erase_embedded_rounding (pat);
34718
34719 emit_insn (pat);
34720 return target;
34721 }
34722
34723 /* Subroutine of ix86_expand_builtin to take care of special insns
34724 with variable number of operands. */
34725
34726 static rtx
34727 ix86_expand_special_args_builtin (const struct builtin_description *d,
34728 tree exp, rtx target)
34729 {
34730 tree arg;
34731 rtx pat, op;
34732 unsigned int i, nargs, arg_adjust, memory;
34733 bool aligned_mem = false;
34734 struct
34735 {
34736 rtx op;
34737 machine_mode mode;
34738 } args[3];
34739 enum insn_code icode = d->icode;
34740 bool last_arg_constant = false;
34741 const struct insn_data_d *insn_p = &insn_data[icode];
34742 machine_mode tmode = insn_p->operand[0].mode;
34743 enum { load, store } klass;
34744
34745 switch ((enum ix86_builtin_func_type) d->flag)
34746 {
34747 case VOID_FTYPE_VOID:
34748 emit_insn (GEN_FCN (icode) (target));
34749 return 0;
34750 case VOID_FTYPE_UINT64:
34751 case VOID_FTYPE_UNSIGNED:
34752 nargs = 0;
34753 klass = store;
34754 memory = 0;
34755 break;
34756
34757 case INT_FTYPE_VOID:
34758 case USHORT_FTYPE_VOID:
34759 case UINT64_FTYPE_VOID:
34760 case UNSIGNED_FTYPE_VOID:
34761 nargs = 0;
34762 klass = load;
34763 memory = 0;
34764 break;
34765 case UINT64_FTYPE_PUNSIGNED:
34766 case V2DI_FTYPE_PV2DI:
34767 case V4DI_FTYPE_PV4DI:
34768 case V32QI_FTYPE_PCCHAR:
34769 case V16QI_FTYPE_PCCHAR:
34770 case V8SF_FTYPE_PCV4SF:
34771 case V8SF_FTYPE_PCFLOAT:
34772 case V4SF_FTYPE_PCFLOAT:
34773 case V4DF_FTYPE_PCV2DF:
34774 case V4DF_FTYPE_PCDOUBLE:
34775 case V2DF_FTYPE_PCDOUBLE:
34776 case VOID_FTYPE_PVOID:
34777 case V8DI_FTYPE_PV8DI:
34778 nargs = 1;
34779 klass = load;
34780 memory = 0;
34781 switch (icode)
34782 {
34783 case CODE_FOR_sse4_1_movntdqa:
34784 case CODE_FOR_avx2_movntdqa:
34785 case CODE_FOR_avx512f_movntdqa:
34786 aligned_mem = true;
34787 break;
34788 default:
34789 break;
34790 }
34791 break;
34792 case VOID_FTYPE_PV2SF_V4SF:
34793 case VOID_FTYPE_PV8DI_V8DI:
34794 case VOID_FTYPE_PV4DI_V4DI:
34795 case VOID_FTYPE_PV2DI_V2DI:
34796 case VOID_FTYPE_PCHAR_V32QI:
34797 case VOID_FTYPE_PCHAR_V16QI:
34798 case VOID_FTYPE_PFLOAT_V16SF:
34799 case VOID_FTYPE_PFLOAT_V8SF:
34800 case VOID_FTYPE_PFLOAT_V4SF:
34801 case VOID_FTYPE_PDOUBLE_V8DF:
34802 case VOID_FTYPE_PDOUBLE_V4DF:
34803 case VOID_FTYPE_PDOUBLE_V2DF:
34804 case VOID_FTYPE_PLONGLONG_LONGLONG:
34805 case VOID_FTYPE_PULONGLONG_ULONGLONG:
34806 case VOID_FTYPE_PINT_INT:
34807 nargs = 1;
34808 klass = store;
34809 /* Reserve memory operand for target. */
34810 memory = ARRAY_SIZE (args);
34811 switch (icode)
34812 {
34813 /* These builtins and instructions require the memory
34814 to be properly aligned. */
34815 case CODE_FOR_avx_movntv4di:
34816 case CODE_FOR_sse2_movntv2di:
34817 case CODE_FOR_avx_movntv8sf:
34818 case CODE_FOR_sse_movntv4sf:
34819 case CODE_FOR_sse4a_vmmovntv4sf:
34820 case CODE_FOR_avx_movntv4df:
34821 case CODE_FOR_sse2_movntv2df:
34822 case CODE_FOR_sse4a_vmmovntv2df:
34823 case CODE_FOR_sse2_movntidi:
34824 case CODE_FOR_sse_movntq:
34825 case CODE_FOR_sse2_movntisi:
34826 case CODE_FOR_avx512f_movntv16sf:
34827 case CODE_FOR_avx512f_movntv8df:
34828 case CODE_FOR_avx512f_movntv8di:
34829 aligned_mem = true;
34830 break;
34831 default:
34832 break;
34833 }
34834 break;
34835 case V4SF_FTYPE_V4SF_PCV2SF:
34836 case V2DF_FTYPE_V2DF_PCDOUBLE:
34837 nargs = 2;
34838 klass = load;
34839 memory = 1;
34840 break;
34841 case V8SF_FTYPE_PCV8SF_V8SI:
34842 case V4DF_FTYPE_PCV4DF_V4DI:
34843 case V4SF_FTYPE_PCV4SF_V4SI:
34844 case V2DF_FTYPE_PCV2DF_V2DI:
34845 case V8SI_FTYPE_PCV8SI_V8SI:
34846 case V4DI_FTYPE_PCV4DI_V4DI:
34847 case V4SI_FTYPE_PCV4SI_V4SI:
34848 case V2DI_FTYPE_PCV2DI_V2DI:
34849 case VOID_FTYPE_INT_INT64:
34850 nargs = 2;
34851 klass = load;
34852 memory = 0;
34853 break;
34854 case VOID_FTYPE_PV8DF_V8DF_UQI:
34855 case VOID_FTYPE_PV4DF_V4DF_UQI:
34856 case VOID_FTYPE_PV2DF_V2DF_UQI:
34857 case VOID_FTYPE_PV16SF_V16SF_UHI:
34858 case VOID_FTYPE_PV8SF_V8SF_UQI:
34859 case VOID_FTYPE_PV4SF_V4SF_UQI:
34860 case VOID_FTYPE_PV8DI_V8DI_UQI:
34861 case VOID_FTYPE_PV4DI_V4DI_UQI:
34862 case VOID_FTYPE_PV2DI_V2DI_UQI:
34863 case VOID_FTYPE_PV16SI_V16SI_UHI:
34864 case VOID_FTYPE_PV8SI_V8SI_UQI:
34865 case VOID_FTYPE_PV4SI_V4SI_UQI:
34866 case VOID_FTYPE_PV64QI_V64QI_UDI:
34867 case VOID_FTYPE_PV32HI_V32HI_USI:
34868 case VOID_FTYPE_PV32QI_V32QI_USI:
34869 case VOID_FTYPE_PV16QI_V16QI_UHI:
34870 case VOID_FTYPE_PV16HI_V16HI_UHI:
34871 case VOID_FTYPE_PV8HI_V8HI_UQI:
34872 switch (icode)
34873 {
34874 /* These builtins and instructions require the memory
34875 to be properly aligned. */
34876 case CODE_FOR_avx512f_storev16sf_mask:
34877 case CODE_FOR_avx512f_storev16si_mask:
34878 case CODE_FOR_avx512f_storev8df_mask:
34879 case CODE_FOR_avx512f_storev8di_mask:
34880 case CODE_FOR_avx512vl_storev8sf_mask:
34881 case CODE_FOR_avx512vl_storev8si_mask:
34882 case CODE_FOR_avx512vl_storev4df_mask:
34883 case CODE_FOR_avx512vl_storev4di_mask:
34884 case CODE_FOR_avx512vl_storev4sf_mask:
34885 case CODE_FOR_avx512vl_storev4si_mask:
34886 case CODE_FOR_avx512vl_storev2df_mask:
34887 case CODE_FOR_avx512vl_storev2di_mask:
34888 aligned_mem = true;
34889 break;
34890 default:
34891 break;
34892 }
34893 /* FALLTHRU */
34894 case VOID_FTYPE_PV8SF_V8SI_V8SF:
34895 case VOID_FTYPE_PV4DF_V4DI_V4DF:
34896 case VOID_FTYPE_PV4SF_V4SI_V4SF:
34897 case VOID_FTYPE_PV2DF_V2DI_V2DF:
34898 case VOID_FTYPE_PV8SI_V8SI_V8SI:
34899 case VOID_FTYPE_PV4DI_V4DI_V4DI:
34900 case VOID_FTYPE_PV4SI_V4SI_V4SI:
34901 case VOID_FTYPE_PV2DI_V2DI_V2DI:
34902 case VOID_FTYPE_PV8SI_V8DI_UQI:
34903 case VOID_FTYPE_PV8HI_V8DI_UQI:
34904 case VOID_FTYPE_PV16HI_V16SI_UHI:
34905 case VOID_FTYPE_PV16QI_V8DI_UQI:
34906 case VOID_FTYPE_PV16QI_V16SI_UHI:
34907 case VOID_FTYPE_PV4SI_V4DI_UQI:
34908 case VOID_FTYPE_PV4SI_V2DI_UQI:
34909 case VOID_FTYPE_PV8HI_V4DI_UQI:
34910 case VOID_FTYPE_PV8HI_V2DI_UQI:
34911 case VOID_FTYPE_PV8HI_V8SI_UQI:
34912 case VOID_FTYPE_PV8HI_V4SI_UQI:
34913 case VOID_FTYPE_PV16QI_V4DI_UQI:
34914 case VOID_FTYPE_PV16QI_V2DI_UQI:
34915 case VOID_FTYPE_PV16QI_V8SI_UQI:
34916 case VOID_FTYPE_PV16QI_V4SI_UQI:
34917 case VOID_FTYPE_PCHAR_V64QI_UDI:
34918 case VOID_FTYPE_PCHAR_V32QI_USI:
34919 case VOID_FTYPE_PCHAR_V16QI_UHI:
34920 case VOID_FTYPE_PSHORT_V32HI_USI:
34921 case VOID_FTYPE_PSHORT_V16HI_UHI:
34922 case VOID_FTYPE_PSHORT_V8HI_UQI:
34923 case VOID_FTYPE_PINT_V16SI_UHI:
34924 case VOID_FTYPE_PINT_V8SI_UQI:
34925 case VOID_FTYPE_PINT_V4SI_UQI:
34926 case VOID_FTYPE_PINT64_V8DI_UQI:
34927 case VOID_FTYPE_PINT64_V4DI_UQI:
34928 case VOID_FTYPE_PINT64_V2DI_UQI:
34929 case VOID_FTYPE_PDOUBLE_V8DF_UQI:
34930 case VOID_FTYPE_PDOUBLE_V4DF_UQI:
34931 case VOID_FTYPE_PDOUBLE_V2DF_UQI:
34932 case VOID_FTYPE_PFLOAT_V16SF_UHI:
34933 case VOID_FTYPE_PFLOAT_V8SF_UQI:
34934 case VOID_FTYPE_PFLOAT_V4SF_UQI:
34935 case VOID_FTYPE_PV32QI_V32HI_USI:
34936 case VOID_FTYPE_PV16QI_V16HI_UHI:
34937 case VOID_FTYPE_PV8QI_V8HI_UQI:
34938 nargs = 2;
34939 klass = store;
34940 /* Reserve memory operand for target. */
34941 memory = ARRAY_SIZE (args);
34942 break;
34943 case V4SF_FTYPE_PCV4SF_V4SF_UQI:
34944 case V8SF_FTYPE_PCV8SF_V8SF_UQI:
34945 case V16SF_FTYPE_PCV16SF_V16SF_UHI:
34946 case V4SI_FTYPE_PCV4SI_V4SI_UQI:
34947 case V8SI_FTYPE_PCV8SI_V8SI_UQI:
34948 case V16SI_FTYPE_PCV16SI_V16SI_UHI:
34949 case V2DF_FTYPE_PCV2DF_V2DF_UQI:
34950 case V4DF_FTYPE_PCV4DF_V4DF_UQI:
34951 case V8DF_FTYPE_PCV8DF_V8DF_UQI:
34952 case V2DI_FTYPE_PCV2DI_V2DI_UQI:
34953 case V4DI_FTYPE_PCV4DI_V4DI_UQI:
34954 case V8DI_FTYPE_PCV8DI_V8DI_UQI:
34955 case V64QI_FTYPE_PCV64QI_V64QI_UDI:
34956 case V32HI_FTYPE_PCV32HI_V32HI_USI:
34957 case V32QI_FTYPE_PCV32QI_V32QI_USI:
34958 case V16QI_FTYPE_PCV16QI_V16QI_UHI:
34959 case V16HI_FTYPE_PCV16HI_V16HI_UHI:
34960 case V8HI_FTYPE_PCV8HI_V8HI_UQI:
34961 switch (icode)
34962 {
34963 /* These builtins and instructions require the memory
34964 to be properly aligned. */
34965 case CODE_FOR_avx512f_loadv16sf_mask:
34966 case CODE_FOR_avx512f_loadv16si_mask:
34967 case CODE_FOR_avx512f_loadv8df_mask:
34968 case CODE_FOR_avx512f_loadv8di_mask:
34969 case CODE_FOR_avx512vl_loadv8sf_mask:
34970 case CODE_FOR_avx512vl_loadv8si_mask:
34971 case CODE_FOR_avx512vl_loadv4df_mask:
34972 case CODE_FOR_avx512vl_loadv4di_mask:
34973 case CODE_FOR_avx512vl_loadv4sf_mask:
34974 case CODE_FOR_avx512vl_loadv4si_mask:
34975 case CODE_FOR_avx512vl_loadv2df_mask:
34976 case CODE_FOR_avx512vl_loadv2di_mask:
34977 case CODE_FOR_avx512bw_loadv64qi_mask:
34978 case CODE_FOR_avx512vl_loadv32qi_mask:
34979 case CODE_FOR_avx512vl_loadv16qi_mask:
34980 case CODE_FOR_avx512bw_loadv32hi_mask:
34981 case CODE_FOR_avx512vl_loadv16hi_mask:
34982 case CODE_FOR_avx512vl_loadv8hi_mask:
34983 aligned_mem = true;
34984 break;
34985 default:
34986 break;
 34987 }
 /* FALLTHRU */
 34988 case V64QI_FTYPE_PCCHAR_V64QI_UDI:
34989 case V32QI_FTYPE_PCCHAR_V32QI_USI:
34990 case V16QI_FTYPE_PCCHAR_V16QI_UHI:
34991 case V32HI_FTYPE_PCSHORT_V32HI_USI:
34992 case V16HI_FTYPE_PCSHORT_V16HI_UHI:
34993 case V8HI_FTYPE_PCSHORT_V8HI_UQI:
34994 case V16SI_FTYPE_PCINT_V16SI_UHI:
34995 case V8SI_FTYPE_PCINT_V8SI_UQI:
34996 case V4SI_FTYPE_PCINT_V4SI_UQI:
34997 case V8DI_FTYPE_PCINT64_V8DI_UQI:
34998 case V4DI_FTYPE_PCINT64_V4DI_UQI:
34999 case V2DI_FTYPE_PCINT64_V2DI_UQI:
35000 case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
35001 case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
35002 case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
35003 case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
35004 case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
35005 case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
35006 nargs = 3;
35007 klass = load;
35008 memory = 0;
35009 break;
35010 case VOID_FTYPE_UINT_UINT_UINT:
35011 case VOID_FTYPE_UINT64_UINT_UINT:
35012 case UCHAR_FTYPE_UINT_UINT_UINT:
35013 case UCHAR_FTYPE_UINT64_UINT_UINT:
35014 nargs = 3;
35015 klass = load;
35016 memory = ARRAY_SIZE (args);
35017 last_arg_constant = true;
35018 break;
35019 default:
35020 gcc_unreachable ();
35021 }
35022
35023 gcc_assert (nargs <= ARRAY_SIZE (args));
35024
35025 if (klass == store)
35026 {
35027 arg = CALL_EXPR_ARG (exp, 0);
35028 op = expand_normal (arg);
35029 gcc_assert (target == 0);
35030 if (memory)
35031 {
35032 op = ix86_zero_extend_to_Pmode (op);
35033 target = gen_rtx_MEM (tmode, op);
35034 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
35035 on it. Try to improve it using get_pointer_alignment,
35036 and if the special builtin is one that requires strict
 35037 mode alignment, also from its GET_MODE_ALIGNMENT.
35038 Failure to do so could lead to ix86_legitimate_combined_insn
35039 rejecting all changes to such insns. */
35040 unsigned int align = get_pointer_alignment (arg);
35041 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
35042 align = GET_MODE_ALIGNMENT (tmode);
35043 if (MEM_ALIGN (target) < align)
35044 set_mem_align (target, align);
35045 }
35046 else
35047 target = force_reg (tmode, op);
35048 arg_adjust = 1;
35049 }
35050 else
35051 {
35052 arg_adjust = 0;
35053 if (optimize
35054 || target == 0
35055 || !register_operand (target, tmode)
35056 || GET_MODE (target) != tmode)
35057 target = gen_reg_rtx (tmode);
35058 }
35059
35060 for (i = 0; i < nargs; i++)
35061 {
35062 machine_mode mode = insn_p->operand[i + 1].mode;
35063 bool match;
35064
35065 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
35066 op = expand_normal (arg);
35067 match = insn_p->operand[i + 1].predicate (op, mode);
35068
35069 if (last_arg_constant && (i + 1) == nargs)
35070 {
35071 if (!match)
35072 {
35073 if (icode == CODE_FOR_lwp_lwpvalsi3
35074 || icode == CODE_FOR_lwp_lwpinssi3
35075 || icode == CODE_FOR_lwp_lwpvaldi3
35076 || icode == CODE_FOR_lwp_lwpinsdi3)
35077 error ("the last argument must be a 32-bit immediate");
35078 else
35079 error ("the last argument must be an 8-bit immediate");
35080 return const0_rtx;
35081 }
35082 }
35083 else
35084 {
35085 if (i == memory)
35086 {
35087 /* This must be the memory operand. */
35088 op = ix86_zero_extend_to_Pmode (op);
35089 op = gen_rtx_MEM (mode, op);
35090 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
35091 on it. Try to improve it using get_pointer_alignment,
35092 and if the special builtin is one that requires strict
 35093 mode alignment, also from its GET_MODE_ALIGNMENT.
35094 Failure to do so could lead to ix86_legitimate_combined_insn
35095 rejecting all changes to such insns. */
35096 unsigned int align = get_pointer_alignment (arg);
35097 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
35098 align = GET_MODE_ALIGNMENT (mode);
35099 if (MEM_ALIGN (op) < align)
35100 set_mem_align (op, align);
35101 }
35102 else
35103 {
 35104 /* This must be a register. */
35105 if (VECTOR_MODE_P (mode))
35106 op = safe_vector_operand (op, mode);
35107
35108 op = fixup_modeless_constant (op, mode);
35109
35110 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
35111 op = copy_to_mode_reg (mode, op);
35112 else
35113 {
35114 op = copy_to_reg (op);
35115 op = lowpart_subreg (mode, op, GET_MODE (op));
35116 }
35117 }
35118 }
35119
35120 args[i].op = op;
35121 args[i].mode = mode;
35122 }
35123
35124 switch (nargs)
35125 {
35126 case 0:
35127 pat = GEN_FCN (icode) (target);
35128 break;
35129 case 1:
35130 pat = GEN_FCN (icode) (target, args[0].op);
35131 break;
35132 case 2:
35133 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
35134 break;
35135 case 3:
35136 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
35137 break;
35138 default:
35139 gcc_unreachable ();
35140 }
35141
35142 if (! pat)
35143 return 0;
35144 emit_insn (pat);
35145 return klass == store ? 0 : target;
35146 }
35147
35148 /* Return the integer constant in ARG. Constrain it to be in the range
35149 of the subparts of VEC_TYPE; issue an error if not. */
35150
35151 static int
35152 get_element_number (tree vec_type, tree arg)
35153 {
35154 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
35155
35156 if (!tree_fits_uhwi_p (arg)
35157 || (elt = tree_to_uhwi (arg), elt > max))
35158 {
35159 error ("selector must be an integer constant in the range 0..%wi", max);
35160 return 0;
35161 }
35162
35163 return elt;
35164 }
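 /* Example: for a V4SF argument TYPE_VECTOR_SUBPARTS is 4, so only selector
 values 0..3 are accepted; anything else is diagnosed above and 0 is
 returned as a safe fallback. */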
35165
35166 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35167 ix86_expand_vector_init. We DO have language-level syntax for this, in
35168 the form of (type){ init-list }. Except that since we can't place emms
35169 instructions from inside the compiler, we can't allow the use of MMX
35170 registers unless the user explicitly asks for it. So we do *not* define
35171 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
 35172 we have builtins invoked by mmintrin.h that give us license to emit
35173 these sorts of instructions. */
35174
35175 static rtx
35176 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
35177 {
35178 machine_mode tmode = TYPE_MODE (type);
35179 machine_mode inner_mode = GET_MODE_INNER (tmode);
35180 int i, n_elt = GET_MODE_NUNITS (tmode);
35181 rtvec v = rtvec_alloc (n_elt);
35182
35183 gcc_assert (VECTOR_MODE_P (tmode));
35184 gcc_assert (call_expr_nargs (exp) == n_elt);
35185
35186 for (i = 0; i < n_elt; ++i)
35187 {
35188 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
35189 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
35190 }
35191
35192 if (!target || !register_operand (target, tmode))
35193 target = gen_reg_rtx (tmode);
35194
35195 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
35196 return target;
35197 }
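 /* For instance, the V4HI variant backs __builtin_ia32_vec_init_v4hi, which
 mmintrin.h uses to implement _mm_set_pi16 without exposing MMX vector
 initializers at the language level. */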
35198
35199 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35200 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
35201 had a language-level syntax for referencing vector elements. */
35202
35203 static rtx
35204 ix86_expand_vec_ext_builtin (tree exp, rtx target)
35205 {
35206 machine_mode tmode, mode0;
35207 tree arg0, arg1;
35208 int elt;
35209 rtx op0;
35210
35211 arg0 = CALL_EXPR_ARG (exp, 0);
35212 arg1 = CALL_EXPR_ARG (exp, 1);
35213
35214 op0 = expand_normal (arg0);
35215 elt = get_element_number (TREE_TYPE (arg0), arg1);
35216
35217 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35218 mode0 = TYPE_MODE (TREE_TYPE (arg0));
35219 gcc_assert (VECTOR_MODE_P (mode0));
35220
35221 op0 = force_reg (mode0, op0);
35222
35223 if (optimize || !target || !register_operand (target, tmode))
35224 target = gen_reg_rtx (tmode);
35225
35226 ix86_expand_vector_extract (true, target, op0, elt);
35227
35228 return target;
35229 }
35230
35231 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
35232 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
35233 a language-level syntax for referencing vector elements. */
35234
35235 static rtx
35236 ix86_expand_vec_set_builtin (tree exp)
35237 {
35238 machine_mode tmode, mode1;
35239 tree arg0, arg1, arg2;
35240 int elt;
35241 rtx op0, op1, target;
35242
35243 arg0 = CALL_EXPR_ARG (exp, 0);
35244 arg1 = CALL_EXPR_ARG (exp, 1);
35245 arg2 = CALL_EXPR_ARG (exp, 2);
35246
35247 tmode = TYPE_MODE (TREE_TYPE (arg0));
35248 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
35249 gcc_assert (VECTOR_MODE_P (tmode));
35250
35251 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
35252 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
35253 elt = get_element_number (TREE_TYPE (arg0), arg2);
35254
35255 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
35256 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
35257
35258 op0 = force_reg (tmode, op0);
35259 op1 = force_reg (mode1, op1);
35260
35261 /* OP0 is the source of these builtin functions and shouldn't be
35262 modified. Create a copy, use it and return it as target. */
35263 target = gen_reg_rtx (tmode);
35264 emit_move_insn (target, op0);
35265 ix86_expand_vector_set (true, target, op1, elt);
35266
35267 return target;
35268 }
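 /* For instance, the V8HI variant backs __builtin_ia32_vec_set_v8hi, which
 emmintrin.h uses to implement _mm_insert_epi16. */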
35269
35270 /* Emit conditional move of SRC to DST with condition
35271 OP1 CODE OP2. */
35272 static void
35273 ix86_emit_cmove (rtx dst, rtx src, enum rtx_code code, rtx op1, rtx op2)
35274 {
35275 rtx t;
35276
35277 if (TARGET_CMOVE)
35278 {
35279 t = ix86_expand_compare (code, op1, op2);
35280 emit_insn (gen_rtx_SET (dst, gen_rtx_IF_THEN_ELSE (GET_MODE (dst), t,
35281 src, dst)));
35282 }
35283 else
35284 {
35285 rtx_code_label *nomove = gen_label_rtx ();
35286 emit_cmp_and_jump_insns (op1, op2, reverse_condition (code),
35287 const0_rtx, GET_MODE (op1), 1, nomove);
35288 emit_move_insn (dst, src);
35289 emit_label (nomove);
35290 }
35291 }
35292
 35293 /* Choose the max of DST and SRC and put it in DST. */
35294 static void
35295 ix86_emit_move_max (rtx dst, rtx src)
35296 {
35297 ix86_emit_cmove (dst, src, LTU, dst, src);
35298 }
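 /* The LTU condition means SRC replaces DST only when DST < SRC (unsigned),
 so DST ends up holding the unsigned maximum; without TARGET_CMOVE,
 ix86_emit_cmove falls back to a compare-and-branch sequence. */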
35299
35300 /* Expand an expression EXP that calls a built-in function,
35301 with result going to TARGET if that's convenient
35302 (and in mode MODE if that's convenient).
35303 SUBTARGET may be used as the target for computing one of EXP's operands.
35304 IGNORE is nonzero if the value is to be ignored. */
35305
35306 static rtx
35307 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
35308 machine_mode mode, int ignore)
35309 {
35310 size_t i;
35311 enum insn_code icode, icode2;
35312 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
35313 tree arg0, arg1, arg2, arg3, arg4;
35314 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
35315 machine_mode mode0, mode1, mode2, mode3, mode4;
35316 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
35317
35318 /* For CPU builtins that can be folded, fold first and expand the fold. */
35319 switch (fcode)
35320 {
35321 case IX86_BUILTIN_CPU_INIT:
35322 {
35323 /* Make it call __cpu_indicator_init in libgcc. */
35324 tree call_expr, fndecl, type;
35325 type = build_function_type_list (integer_type_node, NULL_TREE);
35326 fndecl = build_fn_decl ("__cpu_indicator_init", type);
35327 call_expr = build_call_expr (fndecl, 0);
35328 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
35329 }
35330 case IX86_BUILTIN_CPU_IS:
35331 case IX86_BUILTIN_CPU_SUPPORTS:
35332 {
35333 tree arg0 = CALL_EXPR_ARG (exp, 0);
35334 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
35335 gcc_assert (fold_expr != NULL_TREE);
35336 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
35337 }
35338 }
35339
35340 /* Determine whether the builtin function is available under the current ISA.
35341 Originally the builtin was not created if it wasn't applicable to the
35342 current ISA based on the command line switches. With function specific
35343 options, we need to check in the context of the function making the call
 35344 whether it is supported. Treat AVX512VL, GFNI and MMX specially. For other
 35345 flags, if isa includes more than one ISA bit, treat those as requiring any
35346 of them. For AVX512VL, require both AVX512VL and the non-AVX512VL
35347 ISAs. Likewise for MMX, require both MMX and the non-MMX ISAs.
35348 Similarly for 64BIT, but we shouldn't be building such builtins
35349 at all, -m64 is a whole TU option. */
35350 if (((ix86_builtins_isa[fcode].isa
35351 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35352 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI))
35353 && !(ix86_builtins_isa[fcode].isa
35354 & ~(OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA_MMX
35355 | OPTION_MASK_ISA_64BIT | OPTION_MASK_ISA_GFNI)
35356 & ix86_isa_flags))
35357 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_AVX512VL)
35358 && !(ix86_isa_flags & OPTION_MASK_ISA_AVX512VL))
35359 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_GFNI)
35360 && !(ix86_isa_flags & OPTION_MASK_ISA_GFNI))
35361 || ((ix86_builtins_isa[fcode].isa & OPTION_MASK_ISA_MMX)
35362 && !(ix86_isa_flags & OPTION_MASK_ISA_MMX))
35363 || (ix86_builtins_isa[fcode].isa2
35364 && !(ix86_builtins_isa[fcode].isa2 & ix86_isa_flags2)))
35365 {
35366 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa,
35367 ix86_builtins_isa[fcode].isa2, 0, 0,
35368 NULL, NULL, (enum fpmath_unit) 0,
35369 false);
35370 if (!opts)
35371 error ("%qE needs unknown isa option", fndecl);
35372 else
35373 {
35374 gcc_assert (opts != NULL);
35375 error ("%qE needs isa option %s", fndecl, opts);
35376 free (opts);
35377 }
35378 return expand_call (exp, target, ignore);
35379 }
35380
35381 switch (fcode)
35382 {
35383 case IX86_BUILTIN_BNDMK:
35384 if (!target
35385 || GET_MODE (target) != BNDmode
35386 || !register_operand (target, BNDmode))
35387 target = gen_reg_rtx (BNDmode);
35388
35389 arg0 = CALL_EXPR_ARG (exp, 0);
35390 arg1 = CALL_EXPR_ARG (exp, 1);
35391
35392 op0 = expand_normal (arg0);
35393 op1 = expand_normal (arg1);
35394
35395 if (!register_operand (op0, Pmode))
35396 op0 = ix86_zero_extend_to_Pmode (op0);
35397 if (!register_operand (op1, Pmode))
35398 op1 = ix86_zero_extend_to_Pmode (op1);
35399
 35400 /* Builtin arg1 is the size of the block, but instruction op1 should
 35401 be (size - 1). */
35402 op1 = expand_simple_binop (Pmode, PLUS, op1, constm1_rtx,
35403 NULL_RTX, 1, OPTAB_DIRECT);
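 /* E.g. a requested size of 16 makes op1 == 15, so the bndmk emitted below
 creates bounds covering the 16 bytes [op0, op0 + 15]. */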
35404
35405 emit_insn (BNDmode == BND64mode
35406 ? gen_bnd64_mk (target, op0, op1)
35407 : gen_bnd32_mk (target, op0, op1));
35408 return target;
35409
35410 case IX86_BUILTIN_BNDSTX:
35411 arg0 = CALL_EXPR_ARG (exp, 0);
35412 arg1 = CALL_EXPR_ARG (exp, 1);
35413 arg2 = CALL_EXPR_ARG (exp, 2);
35414
35415 op0 = expand_normal (arg0);
35416 op1 = expand_normal (arg1);
35417 op2 = expand_normal (arg2);
35418
35419 if (!register_operand (op0, Pmode))
35420 op0 = ix86_zero_extend_to_Pmode (op0);
35421 if (!register_operand (op1, BNDmode))
35422 op1 = copy_to_mode_reg (BNDmode, op1);
35423 if (!register_operand (op2, Pmode))
35424 op2 = ix86_zero_extend_to_Pmode (op2);
35425
35426 emit_insn (BNDmode == BND64mode
35427 ? gen_bnd64_stx (op2, op0, op1)
35428 : gen_bnd32_stx (op2, op0, op1));
35429 return 0;
35430
35431 case IX86_BUILTIN_BNDLDX:
35432 if (!target
35433 || GET_MODE (target) != BNDmode
35434 || !register_operand (target, BNDmode))
35435 target = gen_reg_rtx (BNDmode);
35436
35437 arg0 = CALL_EXPR_ARG (exp, 0);
35438 arg1 = CALL_EXPR_ARG (exp, 1);
35439
35440 op0 = expand_normal (arg0);
35441 op1 = expand_normal (arg1);
35442
35443 if (!register_operand (op0, Pmode))
35444 op0 = ix86_zero_extend_to_Pmode (op0);
35445 if (!register_operand (op1, Pmode))
35446 op1 = ix86_zero_extend_to_Pmode (op1);
35447
35448 emit_insn (BNDmode == BND64mode
35449 ? gen_bnd64_ldx (target, op0, op1)
35450 : gen_bnd32_ldx (target, op0, op1));
35451 return target;
35452
35453 case IX86_BUILTIN_BNDCL:
35454 arg0 = CALL_EXPR_ARG (exp, 0);
35455 arg1 = CALL_EXPR_ARG (exp, 1);
35456
35457 op0 = expand_normal (arg0);
35458 op1 = expand_normal (arg1);
35459
35460 if (!register_operand (op0, Pmode))
35461 op0 = ix86_zero_extend_to_Pmode (op0);
35462 if (!register_operand (op1, BNDmode))
35463 op1 = copy_to_mode_reg (BNDmode, op1);
35464
35465 emit_insn (BNDmode == BND64mode
35466 ? gen_bnd64_cl (op1, op0)
35467 : gen_bnd32_cl (op1, op0));
35468 return 0;
35469
35470 case IX86_BUILTIN_BNDCU:
35471 arg0 = CALL_EXPR_ARG (exp, 0);
35472 arg1 = CALL_EXPR_ARG (exp, 1);
35473
35474 op0 = expand_normal (arg0);
35475 op1 = expand_normal (arg1);
35476
35477 if (!register_operand (op0, Pmode))
35478 op0 = ix86_zero_extend_to_Pmode (op0);
35479 if (!register_operand (op1, BNDmode))
35480 op1 = copy_to_mode_reg (BNDmode, op1);
35481
35482 emit_insn (BNDmode == BND64mode
35483 ? gen_bnd64_cu (op1, op0)
35484 : gen_bnd32_cu (op1, op0));
35485 return 0;
35486
35487 case IX86_BUILTIN_BNDRET:
35488 arg0 = CALL_EXPR_ARG (exp, 0);
35489 target = chkp_get_rtl_bounds (arg0);
35490
 35491 /* If no bounds were specified for the returned value,
 35492 then use INIT bounds. This usually happens when
 35493 some built-in function is expanded. */
35494 if (!target)
35495 {
35496 rtx t1 = gen_reg_rtx (Pmode);
35497 rtx t2 = gen_reg_rtx (Pmode);
35498 target = gen_reg_rtx (BNDmode);
35499 emit_move_insn (t1, const0_rtx);
35500 emit_move_insn (t2, constm1_rtx);
35501 emit_insn (BNDmode == BND64mode
35502 ? gen_bnd64_mk (target, t1, t2)
35503 : gen_bnd32_mk (target, t1, t2));
35504 }
35505
35506 gcc_assert (target && REG_P (target));
35507 return target;
35508
35509 case IX86_BUILTIN_BNDNARROW:
35510 {
35511 rtx m1, m1h1, m1h2, lb, ub, t1;
35512
35513 /* Return value and lb. */
35514 arg0 = CALL_EXPR_ARG (exp, 0);
35515 /* Bounds. */
35516 arg1 = CALL_EXPR_ARG (exp, 1);
35517 /* Size. */
35518 arg2 = CALL_EXPR_ARG (exp, 2);
35519
35520 lb = expand_normal (arg0);
35521 op1 = expand_normal (arg1);
35522 op2 = expand_normal (arg2);
35523
 35524 /* The size was passed, but we need to use (size - 1), as for bndmk. */
35525 op2 = expand_simple_binop (Pmode, PLUS, op2, constm1_rtx,
35526 NULL_RTX, 1, OPTAB_DIRECT);
35527
 35528 /* Add LB to the size and invert the result to get UB. */
35529 op2 = expand_simple_binop (Pmode, PLUS, op2, lb,
35530 op2, 1, OPTAB_DIRECT);
35531 ub = expand_simple_unop (Pmode, NOT, op2, op2, 1);
35532
35533 if (!register_operand (lb, Pmode))
35534 lb = ix86_zero_extend_to_Pmode (lb);
35535 if (!register_operand (ub, Pmode))
35536 ub = ix86_zero_extend_to_Pmode (ub);
35537
35538 /* We need to move bounds to memory before any computations. */
35539 if (MEM_P (op1))
35540 m1 = op1;
35541 else
35542 {
35543 m1 = assign_386_stack_local (BNDmode, SLOT_TEMP);
35544 emit_move_insn (m1, op1);
35545 }
35546
 35547 /* Generate mem expressions to be used for access to LB and UB. */
35548 m1h1 = adjust_address (m1, Pmode, 0);
35549 m1h2 = adjust_address (m1, Pmode, GET_MODE_SIZE (Pmode));
35550
35551 t1 = gen_reg_rtx (Pmode);
35552
35553 /* Compute LB. */
35554 emit_move_insn (t1, m1h1);
35555 ix86_emit_move_max (t1, lb);
35556 emit_move_insn (m1h1, t1);
35557
35558 /* Compute UB. UB is stored in 1's complement form. Therefore
35559 we also use max here. */
35560 emit_move_insn (t1, m1h2);
35561 ix86_emit_move_max (t1, ub);
35562 emit_move_insn (m1h2, t1);
35563
35564 op2 = gen_reg_rtx (BNDmode);
35565 emit_move_insn (op2, m1);
35566
35567 return chkp_join_splitted_slot (lb, op2);
35568 }
35569
35570 case IX86_BUILTIN_BNDINT:
35571 {
35572 rtx res, rh1, rh2, lb1, lb2, ub1, ub2;
35573
35574 if (!target
35575 || GET_MODE (target) != BNDmode
35576 || !register_operand (target, BNDmode))
35577 target = gen_reg_rtx (BNDmode);
35578
35579 arg0 = CALL_EXPR_ARG (exp, 0);
35580 arg1 = CALL_EXPR_ARG (exp, 1);
35581
35582 op0 = expand_normal (arg0);
35583 op1 = expand_normal (arg1);
35584
35585 res = assign_386_stack_local (BNDmode, SLOT_TEMP);
35586 rh1 = adjust_address (res, Pmode, 0);
35587 rh2 = adjust_address (res, Pmode, GET_MODE_SIZE (Pmode));
35588
35589 /* Put first bounds to temporaries. */
35590 lb1 = gen_reg_rtx (Pmode);
35591 ub1 = gen_reg_rtx (Pmode);
35592 if (MEM_P (op0))
35593 {
35594 emit_move_insn (lb1, adjust_address (op0, Pmode, 0));
35595 emit_move_insn (ub1, adjust_address (op0, Pmode,
35596 GET_MODE_SIZE (Pmode)));
35597 }
35598 else
35599 {
35600 emit_move_insn (res, op0);
35601 emit_move_insn (lb1, rh1);
35602 emit_move_insn (ub1, rh2);
35603 }
35604
35605 /* Put second bounds to temporaries. */
35606 lb2 = gen_reg_rtx (Pmode);
35607 ub2 = gen_reg_rtx (Pmode);
35608 if (MEM_P (op1))
35609 {
35610 emit_move_insn (lb2, adjust_address (op1, Pmode, 0));
35611 emit_move_insn (ub2, adjust_address (op1, Pmode,
35612 GET_MODE_SIZE (Pmode)));
35613 }
35614 else
35615 {
35616 emit_move_insn (res, op1);
35617 emit_move_insn (lb2, rh1);
35618 emit_move_insn (ub2, rh2);
35619 }
35620
35621 /* Compute LB. */
35622 ix86_emit_move_max (lb1, lb2);
35623 emit_move_insn (rh1, lb1);
35624
35625 /* Compute UB. UB is stored in 1's complement form. Therefore
35626 we also use max here. */
35627 ix86_emit_move_max (ub1, ub2);
35628 emit_move_insn (rh2, ub1);
35629
35630 emit_move_insn (target, res);
35631
35632 return target;
35633 }
35634
35635 case IX86_BUILTIN_SIZEOF:
35636 {
35637 tree name;
35638 rtx symbol;
35639
35640 if (!target
35641 || GET_MODE (target) != Pmode
35642 || !register_operand (target, Pmode))
35643 target = gen_reg_rtx (Pmode);
35644
35645 arg0 = CALL_EXPR_ARG (exp, 0);
35646 gcc_assert (VAR_P (arg0));
35647
35648 name = DECL_ASSEMBLER_NAME (arg0);
35649 symbol = gen_rtx_SYMBOL_REF (Pmode, IDENTIFIER_POINTER (name));
35650
35651 emit_insn (Pmode == SImode
35652 ? gen_move_size_reloc_si (target, symbol)
35653 : gen_move_size_reloc_di (target, symbol));
35654
35655 return target;
35656 }
35657
35658 case IX86_BUILTIN_BNDLOWER:
35659 {
35660 rtx mem, hmem;
35661
35662 if (!target
35663 || GET_MODE (target) != Pmode
35664 || !register_operand (target, Pmode))
35665 target = gen_reg_rtx (Pmode);
35666
35667 arg0 = CALL_EXPR_ARG (exp, 0);
35668 op0 = expand_normal (arg0);
35669
35670 /* We need to move bounds to memory first. */
35671 if (MEM_P (op0))
35672 mem = op0;
35673 else
35674 {
35675 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35676 emit_move_insn (mem, op0);
35677 }
35678
35679 /* Generate mem expression to access LB and load it. */
35680 hmem = adjust_address (mem, Pmode, 0);
35681 emit_move_insn (target, hmem);
35682
35683 return target;
35684 }
35685
35686 case IX86_BUILTIN_BNDUPPER:
35687 {
35688 rtx mem, hmem, res;
35689
35690 if (!target
35691 || GET_MODE (target) != Pmode
35692 || !register_operand (target, Pmode))
35693 target = gen_reg_rtx (Pmode);
35694
35695 arg0 = CALL_EXPR_ARG (exp, 0);
35696 op0 = expand_normal (arg0);
35697
35698 /* We need to move bounds to memory first. */
35699 if (MEM_P (op0))
35700 mem = op0;
35701 else
35702 {
35703 mem = assign_386_stack_local (BNDmode, SLOT_TEMP);
35704 emit_move_insn (mem, op0);
35705 }
35706
35707 /* Generate mem expression to access UB. */
35708 hmem = adjust_address (mem, Pmode, GET_MODE_SIZE (Pmode));
35709
 35710 /* We need to invert all bits of UB. */
35711 res = expand_simple_unop (Pmode, NOT, hmem, target, 1);
35712
35713 if (res != target)
35714 emit_move_insn (target, res);
35715
35716 return target;
35717 }
35718
35719 case IX86_BUILTIN_MASKMOVQ:
35720 case IX86_BUILTIN_MASKMOVDQU:
35721 icode = (fcode == IX86_BUILTIN_MASKMOVQ
35722 ? CODE_FOR_mmx_maskmovq
35723 : CODE_FOR_sse2_maskmovdqu);
35724 /* Note the arg order is different from the operand order. */
35725 arg1 = CALL_EXPR_ARG (exp, 0);
35726 arg2 = CALL_EXPR_ARG (exp, 1);
35727 arg0 = CALL_EXPR_ARG (exp, 2);
35728 op0 = expand_normal (arg0);
35729 op1 = expand_normal (arg1);
35730 op2 = expand_normal (arg2);
35731 mode0 = insn_data[icode].operand[0].mode;
35732 mode1 = insn_data[icode].operand[1].mode;
35733 mode2 = insn_data[icode].operand[2].mode;
35734
35735 op0 = ix86_zero_extend_to_Pmode (op0);
35736 op0 = gen_rtx_MEM (mode1, op0);
35737
35738 if (!insn_data[icode].operand[0].predicate (op0, mode0))
35739 op0 = copy_to_mode_reg (mode0, op0);
35740 if (!insn_data[icode].operand[1].predicate (op1, mode1))
35741 op1 = copy_to_mode_reg (mode1, op1);
35742 if (!insn_data[icode].operand[2].predicate (op2, mode2))
35743 op2 = copy_to_mode_reg (mode2, op2);
35744 pat = GEN_FCN (icode) (op0, op1, op2);
35745 if (! pat)
35746 return 0;
35747 emit_insn (pat);
35748 return 0;
35749
35750 case IX86_BUILTIN_LDMXCSR:
35751 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
35752 target = assign_386_stack_local (SImode, SLOT_TEMP);
35753 emit_move_insn (target, op0);
35754 emit_insn (gen_sse_ldmxcsr (target));
35755 return 0;
35756
35757 case IX86_BUILTIN_STMXCSR:
35758 target = assign_386_stack_local (SImode, SLOT_TEMP);
35759 emit_insn (gen_sse_stmxcsr (target));
35760 return copy_to_mode_reg (SImode, target);
35761
35762 case IX86_BUILTIN_CLFLUSH:
35763 arg0 = CALL_EXPR_ARG (exp, 0);
35764 op0 = expand_normal (arg0);
35765 icode = CODE_FOR_sse2_clflush;
35766 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35767 op0 = ix86_zero_extend_to_Pmode (op0);
35768
35769 emit_insn (gen_sse2_clflush (op0));
35770 return 0;
35771
35772 case IX86_BUILTIN_CLWB:
35773 arg0 = CALL_EXPR_ARG (exp, 0);
35774 op0 = expand_normal (arg0);
35775 icode = CODE_FOR_clwb;
35776 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35777 op0 = ix86_zero_extend_to_Pmode (op0);
35778
35779 emit_insn (gen_clwb (op0));
35780 return 0;
35781
35782 case IX86_BUILTIN_CLFLUSHOPT:
35783 arg0 = CALL_EXPR_ARG (exp, 0);
35784 op0 = expand_normal (arg0);
35785 icode = CODE_FOR_clflushopt;
35786 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
35787 op0 = ix86_zero_extend_to_Pmode (op0);
35788
35789 emit_insn (gen_clflushopt (op0));
35790 return 0;
35791
35792 case IX86_BUILTIN_MONITOR:
35793 case IX86_BUILTIN_MONITORX:
35794 arg0 = CALL_EXPR_ARG (exp, 0);
35795 arg1 = CALL_EXPR_ARG (exp, 1);
35796 arg2 = CALL_EXPR_ARG (exp, 2);
35797 op0 = expand_normal (arg0);
35798 op1 = expand_normal (arg1);
35799 op2 = expand_normal (arg2);
35800 if (!REG_P (op0))
35801 op0 = ix86_zero_extend_to_Pmode (op0);
35802 if (!REG_P (op1))
35803 op1 = copy_to_mode_reg (SImode, op1);
35804 if (!REG_P (op2))
35805 op2 = copy_to_mode_reg (SImode, op2);
35806
35807 emit_insn (fcode == IX86_BUILTIN_MONITOR
35808 ? ix86_gen_monitor (op0, op1, op2)
35809 : ix86_gen_monitorx (op0, op1, op2));
35810 return 0;
35811
35812 case IX86_BUILTIN_MWAIT:
35813 arg0 = CALL_EXPR_ARG (exp, 0);
35814 arg1 = CALL_EXPR_ARG (exp, 1);
35815 op0 = expand_normal (arg0);
35816 op1 = expand_normal (arg1);
35817 if (!REG_P (op0))
35818 op0 = copy_to_mode_reg (SImode, op0);
35819 if (!REG_P (op1))
35820 op1 = copy_to_mode_reg (SImode, op1);
35821 emit_insn (gen_sse3_mwait (op0, op1));
35822 return 0;
35823
35824 case IX86_BUILTIN_MWAITX:
35825 arg0 = CALL_EXPR_ARG (exp, 0);
35826 arg1 = CALL_EXPR_ARG (exp, 1);
35827 arg2 = CALL_EXPR_ARG (exp, 2);
35828 op0 = expand_normal (arg0);
35829 op1 = expand_normal (arg1);
35830 op2 = expand_normal (arg2);
35831 if (!REG_P (op0))
35832 op0 = copy_to_mode_reg (SImode, op0);
35833 if (!REG_P (op1))
35834 op1 = copy_to_mode_reg (SImode, op1);
35835 if (!REG_P (op2))
35836 op2 = copy_to_mode_reg (SImode, op2);
35837 emit_insn (gen_mwaitx (op0, op1, op2));
35838 return 0;
35839
35840 case IX86_BUILTIN_CLZERO:
35841 arg0 = CALL_EXPR_ARG (exp, 0);
35842 op0 = expand_normal (arg0);
35843 if (!REG_P (op0))
35844 op0 = ix86_zero_extend_to_Pmode (op0);
35845 emit_insn (ix86_gen_clzero (op0));
35846 return 0;
35847
35848 case IX86_BUILTIN_VEC_INIT_V2SI:
35849 case IX86_BUILTIN_VEC_INIT_V4HI:
35850 case IX86_BUILTIN_VEC_INIT_V8QI:
35851 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
35852
35853 case IX86_BUILTIN_VEC_EXT_V2DF:
35854 case IX86_BUILTIN_VEC_EXT_V2DI:
35855 case IX86_BUILTIN_VEC_EXT_V4SF:
35856 case IX86_BUILTIN_VEC_EXT_V4SI:
35857 case IX86_BUILTIN_VEC_EXT_V8HI:
35858 case IX86_BUILTIN_VEC_EXT_V2SI:
35859 case IX86_BUILTIN_VEC_EXT_V4HI:
35860 case IX86_BUILTIN_VEC_EXT_V16QI:
35861 return ix86_expand_vec_ext_builtin (exp, target);
35862
35863 case IX86_BUILTIN_VEC_SET_V2DI:
35864 case IX86_BUILTIN_VEC_SET_V4SF:
35865 case IX86_BUILTIN_VEC_SET_V4SI:
35866 case IX86_BUILTIN_VEC_SET_V8HI:
35867 case IX86_BUILTIN_VEC_SET_V4HI:
35868 case IX86_BUILTIN_VEC_SET_V16QI:
35869 return ix86_expand_vec_set_builtin (exp);
35870
35871 case IX86_BUILTIN_NANQ:
35872 case IX86_BUILTIN_NANSQ:
35873 return expand_call (exp, target, ignore);
35874
35875 case IX86_BUILTIN_RDPMC:
35876 case IX86_BUILTIN_RDTSC:
35877 case IX86_BUILTIN_RDTSCP:
35878 case IX86_BUILTIN_XGETBV:
35879
35880 op0 = gen_reg_rtx (DImode);
35881 op1 = gen_reg_rtx (DImode);
35882
35883 if (fcode == IX86_BUILTIN_RDPMC)
35884 {
35885 arg0 = CALL_EXPR_ARG (exp, 0);
35886 op2 = expand_normal (arg0);
35887 if (!register_operand (op2, SImode))
35888 op2 = copy_to_mode_reg (SImode, op2);
35889
35890 insn = (TARGET_64BIT
35891 ? gen_rdpmc_rex64 (op0, op1, op2)
35892 : gen_rdpmc (op0, op2));
35893 emit_insn (insn);
35894 }
35895 else if (fcode == IX86_BUILTIN_XGETBV)
35896 {
35897 arg0 = CALL_EXPR_ARG (exp, 0);
35898 op2 = expand_normal (arg0);
35899 if (!register_operand (op2, SImode))
35900 op2 = copy_to_mode_reg (SImode, op2);
35901
35902 insn = (TARGET_64BIT
35903 ? gen_xgetbv_rex64 (op0, op1, op2)
35904 : gen_xgetbv (op0, op2));
35905 emit_insn (insn);
35906 }
35907 else if (fcode == IX86_BUILTIN_RDTSC)
35908 {
35909 insn = (TARGET_64BIT
35910 ? gen_rdtsc_rex64 (op0, op1)
35911 : gen_rdtsc (op0));
35912 emit_insn (insn);
35913 }
35914 else
35915 {
35916 op2 = gen_reg_rtx (SImode);
35917
35918 insn = (TARGET_64BIT
35919 ? gen_rdtscp_rex64 (op0, op1, op2)
35920 : gen_rdtscp (op0, op2));
35921 emit_insn (insn);
35922
35923 arg0 = CALL_EXPR_ARG (exp, 0);
35924 op4 = expand_normal (arg0);
35925 if (!address_operand (op4, VOIDmode))
35926 {
35927 op4 = convert_memory_address (Pmode, op4);
35928 op4 = copy_addr_to_reg (op4);
35929 }
35930 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
35931 }
35932
35933 if (target == 0)
35934 {
35935 /* mode is VOIDmode if __builtin_rd* has been called
 35936 without an lhs. */
35937 if (mode == VOIDmode)
35938 return target;
35939 target = gen_reg_rtx (mode);
35940 }
35941
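 /* On 64-bit targets the *_rex64 patterns above return the result as two
 DImode halves (low part in op0, high part in op1); combine them into the
 final 64-bit value as (op1 << 32) | op0. */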
35942 if (TARGET_64BIT)
35943 {
35944 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
35945 op1, 1, OPTAB_DIRECT);
35946 op0 = expand_simple_binop (DImode, IOR, op0, op1,
35947 op0, 1, OPTAB_DIRECT);
35948 }
35949
35950 emit_move_insn (target, op0);
35951 return target;
35952
35953 case IX86_BUILTIN_FXSAVE:
35954 case IX86_BUILTIN_FXRSTOR:
35955 case IX86_BUILTIN_FXSAVE64:
35956 case IX86_BUILTIN_FXRSTOR64:
35957 case IX86_BUILTIN_FNSTENV:
35958 case IX86_BUILTIN_FLDENV:
35959 mode0 = BLKmode;
35960 switch (fcode)
35961 {
35962 case IX86_BUILTIN_FXSAVE:
35963 icode = CODE_FOR_fxsave;
35964 break;
35965 case IX86_BUILTIN_FXRSTOR:
35966 icode = CODE_FOR_fxrstor;
35967 break;
35968 case IX86_BUILTIN_FXSAVE64:
35969 icode = CODE_FOR_fxsave64;
35970 break;
35971 case IX86_BUILTIN_FXRSTOR64:
35972 icode = CODE_FOR_fxrstor64;
35973 break;
35974 case IX86_BUILTIN_FNSTENV:
35975 icode = CODE_FOR_fnstenv;
35976 break;
35977 case IX86_BUILTIN_FLDENV:
35978 icode = CODE_FOR_fldenv;
35979 break;
35980 default:
35981 gcc_unreachable ();
35982 }
35983
35984 arg0 = CALL_EXPR_ARG (exp, 0);
35985 op0 = expand_normal (arg0);
35986
35987 if (!address_operand (op0, VOIDmode))
35988 {
35989 op0 = convert_memory_address (Pmode, op0);
35990 op0 = copy_addr_to_reg (op0);
35991 }
35992 op0 = gen_rtx_MEM (mode0, op0);
35993
35994 pat = GEN_FCN (icode) (op0);
35995 if (pat)
35996 emit_insn (pat);
35997 return 0;
35998
35999 case IX86_BUILTIN_XSETBV:
36000 arg0 = CALL_EXPR_ARG (exp, 0);
36001 arg1 = CALL_EXPR_ARG (exp, 1);
36002 op0 = expand_normal (arg0);
36003 op1 = expand_normal (arg1);
36004
36005 if (!REG_P (op0))
36006 op0 = copy_to_mode_reg (SImode, op0);
36007
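/* XSETBV takes the 64-bit value in EDX:EAX; on 64-bit targets split
   op1 into its low and high 32-bit halves.  */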
36008 if (TARGET_64BIT)
36009 {
36010 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36011 NULL, 1, OPTAB_DIRECT);
36012
36013 op2 = gen_lowpart (SImode, op2);
36014 op1 = gen_lowpart (SImode, op1);
36015 if (!REG_P (op1))
36016 op1 = copy_to_mode_reg (SImode, op1);
36017 if (!REG_P (op2))
36018 op2 = copy_to_mode_reg (SImode, op2);
36019 icode = CODE_FOR_xsetbv_rex64;
36020 pat = GEN_FCN (icode) (op0, op1, op2);
36021 }
36022 else
36023 {
36024 if (!REG_P (op1))
36025 op1 = copy_to_mode_reg (DImode, op1);
36026 icode = CODE_FOR_xsetbv;
36027 pat = GEN_FCN (icode) (op0, op1);
36028 }
36029 if (pat)
36030 emit_insn (pat);
36031 return 0;
36032
36033 case IX86_BUILTIN_XSAVE:
36034 case IX86_BUILTIN_XRSTOR:
36035 case IX86_BUILTIN_XSAVE64:
36036 case IX86_BUILTIN_XRSTOR64:
36037 case IX86_BUILTIN_XSAVEOPT:
36038 case IX86_BUILTIN_XSAVEOPT64:
36039 case IX86_BUILTIN_XSAVES:
36040 case IX86_BUILTIN_XRSTORS:
36041 case IX86_BUILTIN_XSAVES64:
36042 case IX86_BUILTIN_XRSTORS64:
36043 case IX86_BUILTIN_XSAVEC:
36044 case IX86_BUILTIN_XSAVEC64:
36045 arg0 = CALL_EXPR_ARG (exp, 0);
36046 arg1 = CALL_EXPR_ARG (exp, 1);
36047 op0 = expand_normal (arg0);
36048 op1 = expand_normal (arg1);
36049
36050 if (!address_operand (op0, VOIDmode))
36051 {
36052 op0 = convert_memory_address (Pmode, op0);
36053 op0 = copy_addr_to_reg (op0);
36054 }
36055 op0 = gen_rtx_MEM (BLKmode, op0);
36056
36057 op1 = force_reg (DImode, op1);
36058
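/* The rex64 variants of the xsave/xrstor patterns take the
   requested-feature mask split into two SImode halves (EDX:EAX).  */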
36059 if (TARGET_64BIT)
36060 {
36061 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
36062 NULL, 1, OPTAB_DIRECT);
36063 switch (fcode)
36064 {
36065 case IX86_BUILTIN_XSAVE:
36066 icode = CODE_FOR_xsave_rex64;
36067 break;
36068 case IX86_BUILTIN_XRSTOR:
36069 icode = CODE_FOR_xrstor_rex64;
36070 break;
36071 case IX86_BUILTIN_XSAVE64:
36072 icode = CODE_FOR_xsave64;
36073 break;
36074 case IX86_BUILTIN_XRSTOR64:
36075 icode = CODE_FOR_xrstor64;
36076 break;
36077 case IX86_BUILTIN_XSAVEOPT:
36078 icode = CODE_FOR_xsaveopt_rex64;
36079 break;
36080 case IX86_BUILTIN_XSAVEOPT64:
36081 icode = CODE_FOR_xsaveopt64;
36082 break;
36083 case IX86_BUILTIN_XSAVES:
36084 icode = CODE_FOR_xsaves_rex64;
36085 break;
36086 case IX86_BUILTIN_XRSTORS:
36087 icode = CODE_FOR_xrstors_rex64;
36088 break;
36089 case IX86_BUILTIN_XSAVES64:
36090 icode = CODE_FOR_xsaves64;
36091 break;
36092 case IX86_BUILTIN_XRSTORS64:
36093 icode = CODE_FOR_xrstors64;
36094 break;
36095 case IX86_BUILTIN_XSAVEC:
36096 icode = CODE_FOR_xsavec_rex64;
36097 break;
36098 case IX86_BUILTIN_XSAVEC64:
36099 icode = CODE_FOR_xsavec64;
36100 break;
36101 default:
36102 gcc_unreachable ();
36103 }
36104
36105 op2 = gen_lowpart (SImode, op2);
36106 op1 = gen_lowpart (SImode, op1);
36107 pat = GEN_FCN (icode) (op0, op1, op2);
36108 }
36109 else
36110 {
36111 switch (fcode)
36112 {
36113 case IX86_BUILTIN_XSAVE:
36114 icode = CODE_FOR_xsave;
36115 break;
36116 case IX86_BUILTIN_XRSTOR:
36117 icode = CODE_FOR_xrstor;
36118 break;
36119 case IX86_BUILTIN_XSAVEOPT:
36120 icode = CODE_FOR_xsaveopt;
36121 break;
36122 case IX86_BUILTIN_XSAVES:
36123 icode = CODE_FOR_xsaves;
36124 break;
36125 case IX86_BUILTIN_XRSTORS:
36126 icode = CODE_FOR_xrstors;
36127 break;
36128 case IX86_BUILTIN_XSAVEC:
36129 icode = CODE_FOR_xsavec;
36130 break;
36131 default:
36132 gcc_unreachable ();
36133 }
36134 pat = GEN_FCN (icode) (op0, op1);
36135 }
36136
36137 if (pat)
36138 emit_insn (pat);
36139 return 0;
36140
36141 case IX86_BUILTIN_LLWPCB:
36142 arg0 = CALL_EXPR_ARG (exp, 0);
36143 op0 = expand_normal (arg0);
36144 icode = CODE_FOR_lwp_llwpcb;
36145 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
36146 op0 = ix86_zero_extend_to_Pmode (op0);
36147 emit_insn (gen_lwp_llwpcb (op0));
36148 return 0;
36149
36150 case IX86_BUILTIN_SLWPCB:
36151 icode = CODE_FOR_lwp_slwpcb;
36152 if (!target
36153 || !insn_data[icode].operand[0].predicate (target, Pmode))
36154 target = gen_reg_rtx (Pmode);
36155 emit_insn (gen_lwp_slwpcb (target));
36156 return target;
36157
36158 case IX86_BUILTIN_BEXTRI32:
36159 case IX86_BUILTIN_BEXTRI64:
36160 arg0 = CALL_EXPR_ARG (exp, 0);
36161 arg1 = CALL_EXPR_ARG (exp, 1);
36162 op0 = expand_normal (arg0);
36163 op1 = expand_normal (arg1);
36164 icode = (fcode == IX86_BUILTIN_BEXTRI32
36165 ? CODE_FOR_tbm_bextri_si
36166 : CODE_FOR_tbm_bextri_di);
36167 if (!CONST_INT_P (op1))
36168 {
36169 error ("last argument must be an immediate");
36170 return const0_rtx;
36171 }
36172 else
36173 {
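/* The control immediate encodes the bit-field start position in
   bits 7:0 and its length in bits 15:8; split it into the two
   operands expected by the bextri pattern.  */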
36174 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
36175 unsigned char lsb_index = INTVAL (op1) & 0xFF;
36176 op1 = GEN_INT (length);
36177 op2 = GEN_INT (lsb_index);
36178 pat = GEN_FCN (icode) (target, op0, op1, op2);
36179 if (pat)
36180 emit_insn (pat);
36181 return target;
36182 }
36183
36184 case IX86_BUILTIN_RDRAND16_STEP:
36185 icode = CODE_FOR_rdrandhi_1;
36186 mode0 = HImode;
36187 goto rdrand_step;
36188
36189 case IX86_BUILTIN_RDRAND32_STEP:
36190 icode = CODE_FOR_rdrandsi_1;
36191 mode0 = SImode;
36192 goto rdrand_step;
36193
36194 case IX86_BUILTIN_RDRAND64_STEP:
36195 icode = CODE_FOR_rdranddi_1;
36196 mode0 = DImode;
36197
36198 rdrand_step:
36199 arg0 = CALL_EXPR_ARG (exp, 0);
36200 op1 = expand_normal (arg0);
36201 if (!address_operand (op1, VOIDmode))
36202 {
36203 op1 = convert_memory_address (Pmode, op1);
36204 op1 = copy_addr_to_reg (op1);
36205 }
36206
36207 op0 = gen_reg_rtx (mode0);
36208 emit_insn (GEN_FCN (icode) (op0));
36209
36210 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36211
36212 op1 = gen_reg_rtx (SImode);
36213 emit_move_insn (op1, CONST1_RTX (SImode));
36214
36215 /* Emit SImode conditional move. */
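/* The builtin's return value is 1 when CF is set (a random value
   was delivered) and the value left in the destination otherwise.  */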
36216 if (mode0 == HImode)
36217 {
36218 if (TARGET_ZERO_EXTEND_WITH_AND
36219 && optimize_function_for_speed_p (cfun))
36220 {
36221 op2 = force_reg (SImode, const0_rtx);
36222
36223 emit_insn (gen_movstricthi
36224 (gen_lowpart (HImode, op2), op0));
36225 }
36226 else
36227 {
36228 op2 = gen_reg_rtx (SImode);
36229
36230 emit_insn (gen_zero_extendhisi2 (op2, op0));
36231 }
36232 }
36233 else if (mode0 == SImode)
36234 op2 = op0;
36235 else
36236 op2 = gen_rtx_SUBREG (SImode, op0, 0);
36237
36238 if (target == 0
36239 || !register_operand (target, SImode))
36240 target = gen_reg_rtx (SImode);
36241
36242 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
36243 const0_rtx);
36244 emit_insn (gen_rtx_SET (target,
36245 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
36246 return target;
36247
36248 case IX86_BUILTIN_RDSEED16_STEP:
36249 icode = CODE_FOR_rdseedhi_1;
36250 mode0 = HImode;
36251 goto rdseed_step;
36252
36253 case IX86_BUILTIN_RDSEED32_STEP:
36254 icode = CODE_FOR_rdseedsi_1;
36255 mode0 = SImode;
36256 goto rdseed_step;
36257
36258 case IX86_BUILTIN_RDSEED64_STEP:
36259 icode = CODE_FOR_rdseeddi_1;
36260 mode0 = DImode;
36261
36262 rdseed_step:
36263 arg0 = CALL_EXPR_ARG (exp, 0);
36264 op1 = expand_normal (arg0);
36265 if (!address_operand (op1, VOIDmode))
36266 {
36267 op1 = convert_memory_address (Pmode, op1);
36268 op1 = copy_addr_to_reg (op1);
36269 }
36270
36271 op0 = gen_reg_rtx (mode0);
36272 emit_insn (GEN_FCN (icode) (op0));
36273
36274 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
36275
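/* The carry flag indicates whether a seed was delivered; materialize
   it as a QImode 0/1 value and zero-extend it into the SImode
   return value.  */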
36276 op2 = gen_reg_rtx (QImode);
36277
36278 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
36279 const0_rtx);
36280 emit_insn (gen_rtx_SET (op2, pat));
36281
36282 if (target == 0
36283 || !register_operand (target, SImode))
36284 target = gen_reg_rtx (SImode);
36285
36286 emit_insn (gen_zero_extendqisi2 (target, op2));
36287 return target;
36288
36289 case IX86_BUILTIN_SBB32:
36290 icode = CODE_FOR_subborrowsi;
36291 icode2 = CODE_FOR_subborrowsi_0;
36292 mode0 = SImode;
36293 mode1 = DImode;
36294 mode2 = CCmode;
36295 goto handlecarry;
36296
36297 case IX86_BUILTIN_SBB64:
36298 icode = CODE_FOR_subborrowdi;
36299 icode2 = CODE_FOR_subborrowdi_0;
36300 mode0 = DImode;
36301 mode1 = TImode;
36302 mode2 = CCmode;
36303 goto handlecarry;
36304
36305 case IX86_BUILTIN_ADDCARRYX32:
36306 icode = CODE_FOR_addcarrysi;
36307 icode2 = CODE_FOR_addcarrysi_0;
36308 mode0 = SImode;
36309 mode1 = DImode;
36310 mode2 = CCCmode;
36311 goto handlecarry;
36312
36313 case IX86_BUILTIN_ADDCARRYX64:
36314 icode = CODE_FOR_addcarrydi;
36315 icode2 = CODE_FOR_addcarrydi_0;
36316 mode0 = DImode;
36317 mode1 = TImode;
36318 mode2 = CCCmode;
36319
36320 handlecarry:
36321 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
36322 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
36323 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
36324 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
36325
36326 op1 = expand_normal (arg0);
36327 if (!integer_zerop (arg0))
36328 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
36329
36330 op2 = expand_normal (arg1);
36331 if (!register_operand (op2, mode0))
36332 op2 = copy_to_mode_reg (mode0, op2);
36333
36334 op3 = expand_normal (arg2);
36335 if (!register_operand (op3, mode0))
36336 op3 = copy_to_mode_reg (mode0, op3);
36337
36338 op4 = expand_normal (arg3);
36339 if (!address_operand (op4, VOIDmode))
36340 {
36341 op4 = convert_memory_address (Pmode, op4);
36342 op4 = copy_addr_to_reg (op4);
36343 }
36344
36345 op0 = gen_reg_rtx (mode0);
36346 if (integer_zerop (arg0))
36347 {
36348 /* If arg0 is 0, optimize right away into an add or sub
36349 instruction that sets CCCmode flags. */
36350 op1 = gen_rtx_REG (mode2, FLAGS_REG);
36351 emit_insn (GEN_FCN (icode2) (op0, op2, op3));
36352 }
36353 else
36354 {
36355 /* Generate CF from input operand. */
36356 emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
36357
36358 /* Generate instruction that consumes CF. */
36359 op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
36360 pat = gen_rtx_LTU (mode1, op1, const0_rtx);
36361 pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
36362 emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
36363 }
36364
36365 /* Return current CF value. */
36366 if (target == 0)
36367 target = gen_reg_rtx (QImode);
36368
36369 pat = gen_rtx_LTU (QImode, op1, const0_rtx);
36370 emit_insn (gen_rtx_SET (target, pat));
36371
36372 /* Store the result. */
36373 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
36374
36375 return target;
36376
36377 case IX86_BUILTIN_READ_FLAGS:
36378 emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
36379
36380 if (optimize
36381 || target == NULL_RTX
36382 || !nonimmediate_operand (target, word_mode)
36383 || GET_MODE (target) != word_mode)
36384 target = gen_reg_rtx (word_mode);
36385
36386 emit_insn (gen_pop (target));
36387 return target;
36388
36389 case IX86_BUILTIN_WRITE_FLAGS:
36390
36391 arg0 = CALL_EXPR_ARG (exp, 0);
36392 op0 = expand_normal (arg0);
36393 if (!general_no_elim_operand (op0, word_mode))
36394 op0 = copy_to_mode_reg (word_mode, op0);
36395
36396 emit_insn (gen_push (op0));
36397 emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
36398 return 0;
36399
36400 case IX86_BUILTIN_KTESTC8:
36401 icode = CODE_FOR_ktestqi;
36402 mode3 = CCCmode;
36403 goto kortest;
36404
36405 case IX86_BUILTIN_KTESTZ8:
36406 icode = CODE_FOR_ktestqi;
36407 mode3 = CCZmode;
36408 goto kortest;
36409
36410 case IX86_BUILTIN_KTESTC16:
36411 icode = CODE_FOR_ktesthi;
36412 mode3 = CCCmode;
36413 goto kortest;
36414
36415 case IX86_BUILTIN_KTESTZ16:
36416 icode = CODE_FOR_ktesthi;
36417 mode3 = CCZmode;
36418 goto kortest;
36419
36420 case IX86_BUILTIN_KTESTC32:
36421 icode = CODE_FOR_ktestsi;
36422 mode3 = CCCmode;
36423 goto kortest;
36424
36425 case IX86_BUILTIN_KTESTZ32:
36426 icode = CODE_FOR_ktestsi;
36427 mode3 = CCZmode;
36428 goto kortest;
36429
36430 case IX86_BUILTIN_KTESTC64:
36431 icode = CODE_FOR_ktestdi;
36432 mode3 = CCCmode;
36433 goto kortest;
36434
36435 case IX86_BUILTIN_KTESTZ64:
36436 icode = CODE_FOR_ktestdi;
36437 mode3 = CCZmode;
36438 goto kortest;
36439
36440 case IX86_BUILTIN_KORTESTC8:
36441 icode = CODE_FOR_kortestqi;
36442 mode3 = CCCmode;
36443 goto kortest;
36444
36445 case IX86_BUILTIN_KORTESTZ8:
36446 icode = CODE_FOR_kortestqi;
36447 mode3 = CCZmode;
36448 goto kortest;
36449
36450 case IX86_BUILTIN_KORTESTC16:
36451 icode = CODE_FOR_kortesthi;
36452 mode3 = CCCmode;
36453 goto kortest;
36454
36455 case IX86_BUILTIN_KORTESTZ16:
36456 icode = CODE_FOR_kortesthi;
36457 mode3 = CCZmode;
36458 goto kortest;
36459
36460 case IX86_BUILTIN_KORTESTC32:
36461 icode = CODE_FOR_kortestsi;
36462 mode3 = CCCmode;
36463 goto kortest;
36464
36465 case IX86_BUILTIN_KORTESTZ32:
36466 icode = CODE_FOR_kortestsi;
36467 mode3 = CCZmode;
36468 goto kortest;
36469
36470 case IX86_BUILTIN_KORTESTC64:
36471 icode = CODE_FOR_kortestdi;
36472 mode3 = CCCmode;
36473 goto kortest;
36474
36475 case IX86_BUILTIN_KORTESTZ64:
36476 icode = CODE_FOR_kortestdi;
36477 mode3 = CCZmode;
36478
36479 kortest:
36480 arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
36481 arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
36482 op0 = expand_normal (arg0);
36483 op1 = expand_normal (arg1);
36484
36485 mode0 = insn_data[icode].operand[0].mode;
36486 mode1 = insn_data[icode].operand[1].mode;
36487
36488 if (GET_MODE (op0) != VOIDmode)
36489 op0 = force_reg (GET_MODE (op0), op0);
36490
36491 op0 = gen_lowpart (mode0, op0);
36492
36493 if (!insn_data[icode].operand[0].predicate (op0, mode0))
36494 op0 = copy_to_mode_reg (mode0, op0);
36495
36496 if (GET_MODE (op1) != VOIDmode)
36497 op1 = force_reg (GET_MODE (op1), op1);
36498
36499 op1 = gen_lowpart (mode1, op1);
36500
36501 if (!insn_data[icode].operand[1].predicate (op1, mode1))
36502 op1 = copy_to_mode_reg (mode1, op1);
36503
36504 target = gen_reg_rtx (QImode);
36505
36506 /* Emit kortest. */
36507 emit_insn (GEN_FCN (icode) (op0, op1));
36508 /* And use setcc to return result from flags. */
36509 ix86_expand_setcc (target, EQ,
36510 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
36511 return target;
36512
36513 case IX86_BUILTIN_GATHERSIV2DF:
36514 icode = CODE_FOR_avx2_gathersiv2df;
36515 goto gather_gen;
36516 case IX86_BUILTIN_GATHERSIV4DF:
36517 icode = CODE_FOR_avx2_gathersiv4df;
36518 goto gather_gen;
36519 case IX86_BUILTIN_GATHERDIV2DF:
36520 icode = CODE_FOR_avx2_gatherdiv2df;
36521 goto gather_gen;
36522 case IX86_BUILTIN_GATHERDIV4DF:
36523 icode = CODE_FOR_avx2_gatherdiv4df;
36524 goto gather_gen;
36525 case IX86_BUILTIN_GATHERSIV4SF:
36526 icode = CODE_FOR_avx2_gathersiv4sf;
36527 goto gather_gen;
36528 case IX86_BUILTIN_GATHERSIV8SF:
36529 icode = CODE_FOR_avx2_gathersiv8sf;
36530 goto gather_gen;
36531 case IX86_BUILTIN_GATHERDIV4SF:
36532 icode = CODE_FOR_avx2_gatherdiv4sf;
36533 goto gather_gen;
36534 case IX86_BUILTIN_GATHERDIV8SF:
36535 icode = CODE_FOR_avx2_gatherdiv8sf;
36536 goto gather_gen;
36537 case IX86_BUILTIN_GATHERSIV2DI:
36538 icode = CODE_FOR_avx2_gathersiv2di;
36539 goto gather_gen;
36540 case IX86_BUILTIN_GATHERSIV4DI:
36541 icode = CODE_FOR_avx2_gathersiv4di;
36542 goto gather_gen;
36543 case IX86_BUILTIN_GATHERDIV2DI:
36544 icode = CODE_FOR_avx2_gatherdiv2di;
36545 goto gather_gen;
36546 case IX86_BUILTIN_GATHERDIV4DI:
36547 icode = CODE_FOR_avx2_gatherdiv4di;
36548 goto gather_gen;
36549 case IX86_BUILTIN_GATHERSIV4SI:
36550 icode = CODE_FOR_avx2_gathersiv4si;
36551 goto gather_gen;
36552 case IX86_BUILTIN_GATHERSIV8SI:
36553 icode = CODE_FOR_avx2_gathersiv8si;
36554 goto gather_gen;
36555 case IX86_BUILTIN_GATHERDIV4SI:
36556 icode = CODE_FOR_avx2_gatherdiv4si;
36557 goto gather_gen;
36558 case IX86_BUILTIN_GATHERDIV8SI:
36559 icode = CODE_FOR_avx2_gatherdiv8si;
36560 goto gather_gen;
36561 case IX86_BUILTIN_GATHERALTSIV4DF:
36562 icode = CODE_FOR_avx2_gathersiv4df;
36563 goto gather_gen;
36564 case IX86_BUILTIN_GATHERALTDIV8SF:
36565 icode = CODE_FOR_avx2_gatherdiv8sf;
36566 goto gather_gen;
36567 case IX86_BUILTIN_GATHERALTSIV4DI:
36568 icode = CODE_FOR_avx2_gathersiv4di;
36569 goto gather_gen;
36570 case IX86_BUILTIN_GATHERALTDIV8SI:
36571 icode = CODE_FOR_avx2_gatherdiv8si;
36572 goto gather_gen;
36573 case IX86_BUILTIN_GATHER3SIV16SF:
36574 icode = CODE_FOR_avx512f_gathersiv16sf;
36575 goto gather_gen;
36576 case IX86_BUILTIN_GATHER3SIV8DF:
36577 icode = CODE_FOR_avx512f_gathersiv8df;
36578 goto gather_gen;
36579 case IX86_BUILTIN_GATHER3DIV16SF:
36580 icode = CODE_FOR_avx512f_gatherdiv16sf;
36581 goto gather_gen;
36582 case IX86_BUILTIN_GATHER3DIV8DF:
36583 icode = CODE_FOR_avx512f_gatherdiv8df;
36584 goto gather_gen;
36585 case IX86_BUILTIN_GATHER3SIV16SI:
36586 icode = CODE_FOR_avx512f_gathersiv16si;
36587 goto gather_gen;
36588 case IX86_BUILTIN_GATHER3SIV8DI:
36589 icode = CODE_FOR_avx512f_gathersiv8di;
36590 goto gather_gen;
36591 case IX86_BUILTIN_GATHER3DIV16SI:
36592 icode = CODE_FOR_avx512f_gatherdiv16si;
36593 goto gather_gen;
36594 case IX86_BUILTIN_GATHER3DIV8DI:
36595 icode = CODE_FOR_avx512f_gatherdiv8di;
36596 goto gather_gen;
36597 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36598 icode = CODE_FOR_avx512f_gathersiv8df;
36599 goto gather_gen;
36600 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36601 icode = CODE_FOR_avx512f_gatherdiv16sf;
36602 goto gather_gen;
36603 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36604 icode = CODE_FOR_avx512f_gathersiv8di;
36605 goto gather_gen;
36606 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36607 icode = CODE_FOR_avx512f_gatherdiv16si;
36608 goto gather_gen;
36609 case IX86_BUILTIN_GATHER3SIV2DF:
36610 icode = CODE_FOR_avx512vl_gathersiv2df;
36611 goto gather_gen;
36612 case IX86_BUILTIN_GATHER3SIV4DF:
36613 icode = CODE_FOR_avx512vl_gathersiv4df;
36614 goto gather_gen;
36615 case IX86_BUILTIN_GATHER3DIV2DF:
36616 icode = CODE_FOR_avx512vl_gatherdiv2df;
36617 goto gather_gen;
36618 case IX86_BUILTIN_GATHER3DIV4DF:
36619 icode = CODE_FOR_avx512vl_gatherdiv4df;
36620 goto gather_gen;
36621 case IX86_BUILTIN_GATHER3SIV4SF:
36622 icode = CODE_FOR_avx512vl_gathersiv4sf;
36623 goto gather_gen;
36624 case IX86_BUILTIN_GATHER3SIV8SF:
36625 icode = CODE_FOR_avx512vl_gathersiv8sf;
36626 goto gather_gen;
36627 case IX86_BUILTIN_GATHER3DIV4SF:
36628 icode = CODE_FOR_avx512vl_gatherdiv4sf;
36629 goto gather_gen;
36630 case IX86_BUILTIN_GATHER3DIV8SF:
36631 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36632 goto gather_gen;
36633 case IX86_BUILTIN_GATHER3SIV2DI:
36634 icode = CODE_FOR_avx512vl_gathersiv2di;
36635 goto gather_gen;
36636 case IX86_BUILTIN_GATHER3SIV4DI:
36637 icode = CODE_FOR_avx512vl_gathersiv4di;
36638 goto gather_gen;
36639 case IX86_BUILTIN_GATHER3DIV2DI:
36640 icode = CODE_FOR_avx512vl_gatherdiv2di;
36641 goto gather_gen;
36642 case IX86_BUILTIN_GATHER3DIV4DI:
36643 icode = CODE_FOR_avx512vl_gatherdiv4di;
36644 goto gather_gen;
36645 case IX86_BUILTIN_GATHER3SIV4SI:
36646 icode = CODE_FOR_avx512vl_gathersiv4si;
36647 goto gather_gen;
36648 case IX86_BUILTIN_GATHER3SIV8SI:
36649 icode = CODE_FOR_avx512vl_gathersiv8si;
36650 goto gather_gen;
36651 case IX86_BUILTIN_GATHER3DIV4SI:
36652 icode = CODE_FOR_avx512vl_gatherdiv4si;
36653 goto gather_gen;
36654 case IX86_BUILTIN_GATHER3DIV8SI:
36655 icode = CODE_FOR_avx512vl_gatherdiv8si;
36656 goto gather_gen;
36657 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36658 icode = CODE_FOR_avx512vl_gathersiv4df;
36659 goto gather_gen;
36660 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36661 icode = CODE_FOR_avx512vl_gatherdiv8sf;
36662 goto gather_gen;
36663 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36664 icode = CODE_FOR_avx512vl_gathersiv4di;
36665 goto gather_gen;
36666 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36667 icode = CODE_FOR_avx512vl_gatherdiv8si;
36668 goto gather_gen;
36669 case IX86_BUILTIN_SCATTERSIV16SF:
36670 icode = CODE_FOR_avx512f_scattersiv16sf;
36671 goto scatter_gen;
36672 case IX86_BUILTIN_SCATTERSIV8DF:
36673 icode = CODE_FOR_avx512f_scattersiv8df;
36674 goto scatter_gen;
36675 case IX86_BUILTIN_SCATTERDIV16SF:
36676 icode = CODE_FOR_avx512f_scatterdiv16sf;
36677 goto scatter_gen;
36678 case IX86_BUILTIN_SCATTERDIV8DF:
36679 icode = CODE_FOR_avx512f_scatterdiv8df;
36680 goto scatter_gen;
36681 case IX86_BUILTIN_SCATTERSIV16SI:
36682 icode = CODE_FOR_avx512f_scattersiv16si;
36683 goto scatter_gen;
36684 case IX86_BUILTIN_SCATTERSIV8DI:
36685 icode = CODE_FOR_avx512f_scattersiv8di;
36686 goto scatter_gen;
36687 case IX86_BUILTIN_SCATTERDIV16SI:
36688 icode = CODE_FOR_avx512f_scatterdiv16si;
36689 goto scatter_gen;
36690 case IX86_BUILTIN_SCATTERDIV8DI:
36691 icode = CODE_FOR_avx512f_scatterdiv8di;
36692 goto scatter_gen;
36693 case IX86_BUILTIN_SCATTERSIV8SF:
36694 icode = CODE_FOR_avx512vl_scattersiv8sf;
36695 goto scatter_gen;
36696 case IX86_BUILTIN_SCATTERSIV4SF:
36697 icode = CODE_FOR_avx512vl_scattersiv4sf;
36698 goto scatter_gen;
36699 case IX86_BUILTIN_SCATTERSIV4DF:
36700 icode = CODE_FOR_avx512vl_scattersiv4df;
36701 goto scatter_gen;
36702 case IX86_BUILTIN_SCATTERSIV2DF:
36703 icode = CODE_FOR_avx512vl_scattersiv2df;
36704 goto scatter_gen;
36705 case IX86_BUILTIN_SCATTERDIV8SF:
36706 icode = CODE_FOR_avx512vl_scatterdiv8sf;
36707 goto scatter_gen;
36708 case IX86_BUILTIN_SCATTERDIV4SF:
36709 icode = CODE_FOR_avx512vl_scatterdiv4sf;
36710 goto scatter_gen;
36711 case IX86_BUILTIN_SCATTERDIV4DF:
36712 icode = CODE_FOR_avx512vl_scatterdiv4df;
36713 goto scatter_gen;
36714 case IX86_BUILTIN_SCATTERDIV2DF:
36715 icode = CODE_FOR_avx512vl_scatterdiv2df;
36716 goto scatter_gen;
36717 case IX86_BUILTIN_SCATTERSIV8SI:
36718 icode = CODE_FOR_avx512vl_scattersiv8si;
36719 goto scatter_gen;
36720 case IX86_BUILTIN_SCATTERSIV4SI:
36721 icode = CODE_FOR_avx512vl_scattersiv4si;
36722 goto scatter_gen;
36723 case IX86_BUILTIN_SCATTERSIV4DI:
36724 icode = CODE_FOR_avx512vl_scattersiv4di;
36725 goto scatter_gen;
36726 case IX86_BUILTIN_SCATTERSIV2DI:
36727 icode = CODE_FOR_avx512vl_scattersiv2di;
36728 goto scatter_gen;
36729 case IX86_BUILTIN_SCATTERDIV8SI:
36730 icode = CODE_FOR_avx512vl_scatterdiv8si;
36731 goto scatter_gen;
36732 case IX86_BUILTIN_SCATTERDIV4SI:
36733 icode = CODE_FOR_avx512vl_scatterdiv4si;
36734 goto scatter_gen;
36735 case IX86_BUILTIN_SCATTERDIV4DI:
36736 icode = CODE_FOR_avx512vl_scatterdiv4di;
36737 goto scatter_gen;
36738 case IX86_BUILTIN_SCATTERDIV2DI:
36739 icode = CODE_FOR_avx512vl_scatterdiv2di;
36740 goto scatter_gen;
36741 case IX86_BUILTIN_GATHERPFDPD:
36742 icode = CODE_FOR_avx512pf_gatherpfv8sidf;
36743 goto vec_prefetch_gen;
36744 case IX86_BUILTIN_SCATTERALTSIV8DF:
36745 icode = CODE_FOR_avx512f_scattersiv8df;
36746 goto scatter_gen;
36747 case IX86_BUILTIN_SCATTERALTDIV16SF:
36748 icode = CODE_FOR_avx512f_scatterdiv16sf;
36749 goto scatter_gen;
36750 case IX86_BUILTIN_SCATTERALTSIV8DI:
36751 icode = CODE_FOR_avx512f_scattersiv8di;
36752 goto scatter_gen;
36753 case IX86_BUILTIN_SCATTERALTDIV16SI:
36754 icode = CODE_FOR_avx512f_scatterdiv16si;
36755 goto scatter_gen;
36756 case IX86_BUILTIN_GATHERPFDPS:
36757 icode = CODE_FOR_avx512pf_gatherpfv16sisf;
36758 goto vec_prefetch_gen;
36759 case IX86_BUILTIN_GATHERPFQPD:
36760 icode = CODE_FOR_avx512pf_gatherpfv8didf;
36761 goto vec_prefetch_gen;
36762 case IX86_BUILTIN_GATHERPFQPS:
36763 icode = CODE_FOR_avx512pf_gatherpfv8disf;
36764 goto vec_prefetch_gen;
36765 case IX86_BUILTIN_SCATTERPFDPD:
36766 icode = CODE_FOR_avx512pf_scatterpfv8sidf;
36767 goto vec_prefetch_gen;
36768 case IX86_BUILTIN_SCATTERPFDPS:
36769 icode = CODE_FOR_avx512pf_scatterpfv16sisf;
36770 goto vec_prefetch_gen;
36771 case IX86_BUILTIN_SCATTERPFQPD:
36772 icode = CODE_FOR_avx512pf_scatterpfv8didf;
36773 goto vec_prefetch_gen;
36774 case IX86_BUILTIN_SCATTERPFQPS:
36775 icode = CODE_FOR_avx512pf_scatterpfv8disf;
36776 goto vec_prefetch_gen;
36777
36778 gather_gen:
36779 rtx half;
36780 rtx (*gen) (rtx, rtx);
36781
36782 arg0 = CALL_EXPR_ARG (exp, 0);
36783 arg1 = CALL_EXPR_ARG (exp, 1);
36784 arg2 = CALL_EXPR_ARG (exp, 2);
36785 arg3 = CALL_EXPR_ARG (exp, 3);
36786 arg4 = CALL_EXPR_ARG (exp, 4);
36787 op0 = expand_normal (arg0);
36788 op1 = expand_normal (arg1);
36789 op2 = expand_normal (arg2);
36790 op3 = expand_normal (arg3);
36791 op4 = expand_normal (arg4);
36792 /* Note the arg order is different from the operand order. */
36793 mode0 = insn_data[icode].operand[1].mode;
36794 mode2 = insn_data[icode].operand[3].mode;
36795 mode3 = insn_data[icode].operand[4].mode;
36796 mode4 = insn_data[icode].operand[5].mode;
36797
36798 if (target == NULL_RTX
36799 || GET_MODE (target) != insn_data[icode].operand[0].mode
36800 || !insn_data[icode].operand[0].predicate (target,
36801 GET_MODE (target)))
36802 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
36803 else
36804 subtarget = target;
36805
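/* For the *ALT* gather variants the index vector (or the source and
   mask vectors) has twice as many elements as are actually used;
   extract the low half before emitting the gather.  */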
36806 switch (fcode)
36807 {
36808 case IX86_BUILTIN_GATHER3ALTSIV8DF:
36809 case IX86_BUILTIN_GATHER3ALTSIV8DI:
36810 half = gen_reg_rtx (V8SImode);
36811 if (!nonimmediate_operand (op2, V16SImode))
36812 op2 = copy_to_mode_reg (V16SImode, op2);
36813 emit_insn (gen_vec_extract_lo_v16si (half, op2));
36814 op2 = half;
36815 break;
36816 case IX86_BUILTIN_GATHER3ALTSIV4DF:
36817 case IX86_BUILTIN_GATHER3ALTSIV4DI:
36818 case IX86_BUILTIN_GATHERALTSIV4DF:
36819 case IX86_BUILTIN_GATHERALTSIV4DI:
36820 half = gen_reg_rtx (V4SImode);
36821 if (!nonimmediate_operand (op2, V8SImode))
36822 op2 = copy_to_mode_reg (V8SImode, op2);
36823 emit_insn (gen_vec_extract_lo_v8si (half, op2));
36824 op2 = half;
36825 break;
36826 case IX86_BUILTIN_GATHER3ALTDIV16SF:
36827 case IX86_BUILTIN_GATHER3ALTDIV16SI:
36828 half = gen_reg_rtx (mode0);
36829 if (mode0 == V8SFmode)
36830 gen = gen_vec_extract_lo_v16sf;
36831 else
36832 gen = gen_vec_extract_lo_v16si;
36833 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36834 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36835 emit_insn (gen (half, op0));
36836 op0 = half;
36837 if (GET_MODE (op3) != VOIDmode)
36838 {
36839 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36840 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36841 emit_insn (gen (half, op3));
36842 op3 = half;
36843 }
36844 break;
36845 case IX86_BUILTIN_GATHER3ALTDIV8SF:
36846 case IX86_BUILTIN_GATHER3ALTDIV8SI:
36847 case IX86_BUILTIN_GATHERALTDIV8SF:
36848 case IX86_BUILTIN_GATHERALTDIV8SI:
36849 half = gen_reg_rtx (mode0);
36850 if (mode0 == V4SFmode)
36851 gen = gen_vec_extract_lo_v8sf;
36852 else
36853 gen = gen_vec_extract_lo_v8si;
36854 if (!nonimmediate_operand (op0, GET_MODE (op0)))
36855 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
36856 emit_insn (gen (half, op0));
36857 op0 = half;
36858 if (GET_MODE (op3) != VOIDmode)
36859 {
36860 if (!nonimmediate_operand (op3, GET_MODE (op3)))
36861 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
36862 emit_insn (gen (half, op3));
36863 op3 = half;
36864 }
36865 break;
36866 default:
36867 break;
36868 }
36869
36870 /* Force the memory operand to be addressed with just a base
36871 register here; we don't want to do this for the memory operands
36872 of other builtin functions. */
36873 op1 = ix86_zero_extend_to_Pmode (op1);
36874
36875 if (!insn_data[icode].operand[1].predicate (op0, mode0))
36876 op0 = copy_to_mode_reg (mode0, op0);
36877 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
36878 op1 = copy_to_mode_reg (Pmode, op1);
36879 if (!insn_data[icode].operand[3].predicate (op2, mode2))
36880 op2 = copy_to_mode_reg (mode2, op2);
36881
36882 op3 = fixup_modeless_constant (op3, mode3);
36883
36884 if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
36885 {
36886 if (!insn_data[icode].operand[4].predicate (op3, mode3))
36887 op3 = copy_to_mode_reg (mode3, op3);
36888 }
36889 else
36890 {
36891 op3 = copy_to_reg (op3);
36892 op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
36893 }
36894 if (!insn_data[icode].operand[5].predicate (op4, mode4))
36895 {
36896 error ("the last argument must be scale 1, 2, 4, 8");
36897 return const0_rtx;
36898 }
36899
36900 /* Optimize. If mask is known to have all high bits set,
36901 replace op0 with pc_rtx to signal that the instruction
36902 overwrites the whole destination and doesn't use its
36903 previous contents. */
36904 if (optimize)
36905 {
36906 if (TREE_CODE (arg3) == INTEGER_CST)
36907 {
36908 if (integer_all_onesp (arg3))
36909 op0 = pc_rtx;
36910 }
36911 else if (TREE_CODE (arg3) == VECTOR_CST)
36912 {
36913 unsigned int negative = 0;
36914 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
36915 {
36916 tree cst = VECTOR_CST_ELT (arg3, i);
36917 if (TREE_CODE (cst) == INTEGER_CST
36918 && tree_int_cst_sign_bit (cst))
36919 negative++;
36920 else if (TREE_CODE (cst) == REAL_CST
36921 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
36922 negative++;
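/* RDTSCP additionally returns the IA32_TSC_AUX value in op2;
   store it through the builtin's pointer argument.  */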
36923 }
36924 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
36925 op0 = pc_rtx;
36926 }
36927 else if (TREE_CODE (arg3) == SSA_NAME
36928 && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
36929 {
36930 /* Recognize also when mask is like:
36931 __v2df src = _mm_setzero_pd ();
36932 __v2df mask = _mm_cmpeq_pd (src, src);
36933 or
36934 __v8sf src = _mm256_setzero_ps ();
36935 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
36936 as that is a cheaper way to load all ones into
36937 a register than having to load a constant from
36938 memory. */
36939 gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
36940 if (is_gimple_call (def_stmt))
36941 {
36942 tree fndecl = gimple_call_fndecl (def_stmt);
36943 if (fndecl
36944 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
36945 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
36946 {
36947 case IX86_BUILTIN_CMPPD:
36948 case IX86_BUILTIN_CMPPS:
36949 case IX86_BUILTIN_CMPPD256:
36950 case IX86_BUILTIN_CMPPS256:
36951 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
36952 break;
36953 /* FALLTHRU */
36954 case IX86_BUILTIN_CMPEQPD:
36955 case IX86_BUILTIN_CMPEQPS:
36956 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
36957 && initializer_zerop (gimple_call_arg (def_stmt,
36958 1)))
36959 op0 = pc_rtx;
36960 break;
36961 default:
36962 break;
36963 }
36964 }
36965 }
36966 }
36967
36968 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
36969 if (! pat)
36970 return const0_rtx;
36971 emit_insn (pat);
36972
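/* Some gathers use a destination mode wider than the builtin's
   return type; copy the low half of SUBTARGET into TARGET in
   those cases.  */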
36973 switch (fcode)
36974 {
36975 case IX86_BUILTIN_GATHER3DIV16SF:
36976 if (target == NULL_RTX)
36977 target = gen_reg_rtx (V8SFmode);
36978 emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
36979 break;
36980 case IX86_BUILTIN_GATHER3DIV16SI:
36981 if (target == NULL_RTX)
36982 target = gen_reg_rtx (V8SImode);
36983 emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
36984 break;
36985 case IX86_BUILTIN_GATHER3DIV8SF:
36986 case IX86_BUILTIN_GATHERDIV8SF:
36987 if (target == NULL_RTX)
36988 target = gen_reg_rtx (V4SFmode);
36989 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
36990 break;
36991 case IX86_BUILTIN_GATHER3DIV8SI:
36992 case IX86_BUILTIN_GATHERDIV8SI:
36993 if (target == NULL_RTX)
36994 target = gen_reg_rtx (V4SImode);
36995 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
36996 break;
36997 default:
36998 target = subtarget;
36999 break;
37000 }
37001 return target;
37002
37003 scatter_gen:
37004 arg0 = CALL_EXPR_ARG (exp, 0);
37005 arg1 = CALL_EXPR_ARG (exp, 1);
37006 arg2 = CALL_EXPR_ARG (exp, 2);
37007 arg3 = CALL_EXPR_ARG (exp, 3);
37008 arg4 = CALL_EXPR_ARG (exp, 4);
37009 op0 = expand_normal (arg0);
37010 op1 = expand_normal (arg1);
37011 op2 = expand_normal (arg2);
37012 op3 = expand_normal (arg3);
37013 op4 = expand_normal (arg4);
37014 mode1 = insn_data[icode].operand[1].mode;
37015 mode2 = insn_data[icode].operand[2].mode;
37016 mode3 = insn_data[icode].operand[3].mode;
37017 mode4 = insn_data[icode].operand[4].mode;
37018
37019 /* Scatter instruction stores operand op3 to memory with
37020 indices from op2 and scale from op4 under writemask op1.
37021 If index operand op2 has more elements than source operand
37022 op3, one needs to use only its low half. And vice versa. */
37023 switch (fcode)
37024 {
37025 case IX86_BUILTIN_SCATTERALTSIV8DF:
37026 case IX86_BUILTIN_SCATTERALTSIV8DI:
37027 half = gen_reg_rtx (V8SImode);
37028 if (!nonimmediate_operand (op2, V16SImode))
37029 op2 = copy_to_mode_reg (V16SImode, op2);
37030 emit_insn (gen_vec_extract_lo_v16si (half, op2));
37031 op2 = half;
37032 break;
37033 case IX86_BUILTIN_SCATTERALTDIV16SF:
37034 case IX86_BUILTIN_SCATTERALTDIV16SI:
37035 half = gen_reg_rtx (mode3);
37036 if (mode3 == V8SFmode)
37037 gen = gen_vec_extract_lo_v16sf;
37038 else
37039 gen = gen_vec_extract_lo_v16si;
37040 if (!nonimmediate_operand (op3, GET_MODE (op3)))
37041 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
37042 emit_insn (gen (half, op3));
37043 op3 = half;
37044 break;
37045 default:
37046 break;
37047 }
37048
37049 /* Force the memory operand to be addressed with just a base
37050 register here; we don't want to do this for the memory operands
37051 of other builtin functions. */
37052 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
37053
37054 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
37055 op0 = copy_to_mode_reg (Pmode, op0);
37056
37057 op1 = fixup_modeless_constant (op1, mode1);
37058
37059 if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
37060 {
37061 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37062 op1 = copy_to_mode_reg (mode1, op1);
37063 }
37064 else
37065 {
37066 op1 = copy_to_reg (op1);
37067 op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
37068 }
37069
37070 if (!insn_data[icode].operand[2].predicate (op2, mode2))
37071 op2 = copy_to_mode_reg (mode2, op2);
37072
37073 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37074 op3 = copy_to_mode_reg (mode3, op3);
37075
37076 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37077 {
37078 error ("the last argument must be scale 1, 2, 4, 8");
37079 return const0_rtx;
37080 }
37081
37082 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37083 if (! pat)
37084 return const0_rtx;
37085
37086 emit_insn (pat);
37087 return 0;
37088
37089 vec_prefetch_gen:
37090 arg0 = CALL_EXPR_ARG (exp, 0);
37091 arg1 = CALL_EXPR_ARG (exp, 1);
37092 arg2 = CALL_EXPR_ARG (exp, 2);
37093 arg3 = CALL_EXPR_ARG (exp, 3);
37094 arg4 = CALL_EXPR_ARG (exp, 4);
37095 op0 = expand_normal (arg0);
37096 op1 = expand_normal (arg1);
37097 op2 = expand_normal (arg2);
37098 op3 = expand_normal (arg3);
37099 op4 = expand_normal (arg4);
37100 mode0 = insn_data[icode].operand[0].mode;
37101 mode1 = insn_data[icode].operand[1].mode;
37102 mode3 = insn_data[icode].operand[3].mode;
37103 mode4 = insn_data[icode].operand[4].mode;
37104
37105 op0 = fixup_modeless_constant (op0, mode0);
37106
37107 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
37108 {
37109 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37110 op0 = copy_to_mode_reg (mode0, op0);
37111 }
37112 else
37113 {
37114 op0 = copy_to_reg (op0);
37115 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
37116 }
37117
37118 if (!insn_data[icode].operand[1].predicate (op1, mode1))
37119 op1 = copy_to_mode_reg (mode1, op1);
37120
37121 /* Force the memory operand to be addressed with just a base
37122 register here; we don't want to do this for the memory operands
37123 of other builtin functions. */
37124 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
37125
37126 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
37127 op2 = copy_to_mode_reg (Pmode, op2);
37128
37129 if (!insn_data[icode].operand[3].predicate (op3, mode3))
37130 {
37131 error ("the fourth argument must be scale 1, 2, 4, 8");
37132 return const0_rtx;
37133 }
37134
37135 if (!insn_data[icode].operand[4].predicate (op4, mode4))
37136 {
37137 error ("incorrect hint operand");
37138 return const0_rtx;
37139 }
37140
37141 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
37142 if (! pat)
37143 return const0_rtx;
37144
37145 emit_insn (pat);
37146
37147 return 0;
37148
37149 case IX86_BUILTIN_XABORT:
37150 icode = CODE_FOR_xabort;
37151 arg0 = CALL_EXPR_ARG (exp, 0);
37152 op0 = expand_normal (arg0);
37153 mode0 = insn_data[icode].operand[0].mode;
37154 if (!insn_data[icode].operand[0].predicate (op0, mode0))
37155 {
37156 error ("the xabort's argument must be an 8-bit immediate");
37157 return const0_rtx;
37158 }
37159 emit_insn (gen_xabort (op0));
37160 return 0;
37161
37162 case IX86_BUILTIN_RSTORSSP:
37163 case IX86_BUILTIN_CLRSSBSY:
37164 arg0 = CALL_EXPR_ARG (exp, 0);
37165 op0 = expand_normal (arg0);
37166 icode = (fcode == IX86_BUILTIN_RSTORSSP
37167 ? CODE_FOR_rstorssp
37168 : CODE_FOR_clrssbsy);
37169 if (!address_operand (op0, VOIDmode))
37170 {
37171 op1 = convert_memory_address (Pmode, op0);
37172 op0 = copy_addr_to_reg (op1);
37173 }
37174 emit_insn (GEN_FCN (icode) (gen_rtx_MEM (Pmode, op0)));
37175 return 0;
37176
37177 case IX86_BUILTIN_WRSSD:
37178 case IX86_BUILTIN_WRSSQ:
37179 case IX86_BUILTIN_WRUSSD:
37180 case IX86_BUILTIN_WRUSSQ:
37181 arg0 = CALL_EXPR_ARG (exp, 0);
37182 op0 = expand_normal (arg0);
37183 arg1 = CALL_EXPR_ARG (exp, 1);
37184 op1 = expand_normal (arg1);
37185 switch (fcode)
37186 {
37187 case IX86_BUILTIN_WRSSD:
37188 icode = CODE_FOR_wrsssi;
37189 mode = SImode;
37190 break;
37191 case IX86_BUILTIN_WRSSQ:
37192 icode = CODE_FOR_wrssdi;
37193 mode = DImode;
37194 break;
37195 case IX86_BUILTIN_WRUSSD:
37196 icode = CODE_FOR_wrusssi;
37197 mode = SImode;
37198 break;
37199 case IX86_BUILTIN_WRUSSQ:
37200 icode = CODE_FOR_wrussdi;
37201 mode = DImode;
37202 break;
37203 }
37204 op0 = force_reg (mode, op0);
37205 if (!address_operand (op1, VOIDmode))
37206 {
37207 op2 = convert_memory_address (Pmode, op1);
37208 op1 = copy_addr_to_reg (op2);
37209 }
37210 emit_insn (GEN_FCN (icode) (op0, gen_rtx_MEM (mode, op1)));
37211 return 0;
37212
37213 default:
37214 break;
37215 }
37216
37217 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
37218 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
37219 {
37220 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
37221 return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
37222 target);
37223 }
37224
37225 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
37226 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
37227 {
37228 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
37229 switch (fcode)
37230 {
37231 case IX86_BUILTIN_FABSQ:
37232 case IX86_BUILTIN_COPYSIGNQ:
37233 if (!TARGET_SSE)
37234 /* Emit a normal call if SSE isn't available. */
37235 return expand_call (exp, target, ignore);
37236 /* FALLTHRU */
37237 default:
37238 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
37239 }
37240 }
37241
37242 if (fcode >= IX86_BUILTIN__BDESC_ARGS2_FIRST
37243 && fcode <= IX86_BUILTIN__BDESC_ARGS2_LAST)
37244 {
37245 i = fcode - IX86_BUILTIN__BDESC_ARGS2_FIRST;
37246 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
37247 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
37248 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
37249 int masked = 1;
37250 machine_mode mode, wide_mode, nar_mode;
37251
37252 nar_mode = V4SFmode;
37253 mode = V16SFmode;
37254 wide_mode = V64SFmode;
37255 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
37256 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
37257
37258 switch (fcode)
37259 {
37260 case IX86_BUILTIN_4FMAPS:
37261 fcn = gen_avx5124fmaddps_4fmaddps;
37262 masked = 0;
37263 goto v4fma_expand;
37264
37265 case IX86_BUILTIN_4DPWSSD:
37266 nar_mode = V4SImode;
37267 mode = V16SImode;
37268 wide_mode = V64SImode;
37269 fcn = gen_avx5124vnniw_vp4dpwssd;
37270 masked = 0;
37271 goto v4fma_expand;
37272
37273 case IX86_BUILTIN_4DPWSSDS:
37274 nar_mode = V4SImode;
37275 mode = V16SImode;
37276 wide_mode = V64SImode;
37277 fcn = gen_avx5124vnniw_vp4dpwssds;
37278 masked = 0;
37279 goto v4fma_expand;
37280
37281 case IX86_BUILTIN_4FNMAPS:
37282 fcn = gen_avx5124fmaddps_4fnmaddps;
37283 masked = 0;
37284 goto v4fma_expand;
37285
37286 case IX86_BUILTIN_4FNMAPS_MASK:
37287 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
37288 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
37289 goto v4fma_expand;
37290
37291 case IX86_BUILTIN_4DPWSSD_MASK:
37292 nar_mode = V4SImode;
37293 mode = V16SImode;
37294 wide_mode = V64SImode;
37295 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
37296 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
37297 goto v4fma_expand;
37298
37299 case IX86_BUILTIN_4DPWSSDS_MASK:
37300 nar_mode = V4SImode;
37301 mode = V16SImode;
37302 wide_mode = V64SImode;
37303 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
37304 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
37305 goto v4fma_expand;
37306
37307 case IX86_BUILTIN_4FMAPS_MASK:
37308 {
37309 tree args[4];
37310 rtx ops[4];
37311 rtx wide_reg;
37312 rtx accum;
37313 rtx addr;
37314 rtx mem;
37315
37316 v4fma_expand:
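/* The 4FMA/4VNNIW instructions read a group of four consecutive
   registers; build one wide pseudo holding the four source operands
   back to back.  */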
37317 wide_reg = gen_reg_rtx (wide_mode);
37318 for (i = 0; i < 4; i++)
37319 {
37320 args[i] = CALL_EXPR_ARG (exp, i);
37321 ops[i] = expand_normal (args[i]);
37322
37323 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
37324 ops[i]);
37325 }
37326
37327 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37328 accum = force_reg (mode, accum);
37329
37330 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37331 addr = force_reg (Pmode, addr);
37332
37333 mem = gen_rtx_MEM (nar_mode, addr);
37334
37335 target = gen_reg_rtx (mode);
37336
37337 emit_move_insn (target, accum);
37338
37339 if (! masked)
37340 emit_insn (fcn (target, accum, wide_reg, mem));
37341 else
37342 {
37343 rtx merge, mask;
37344 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37345
37346 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37347
37348 if (CONST_INT_P (mask))
37349 mask = fixup_modeless_constant (mask, HImode);
37350
37351 mask = force_reg (HImode, mask);
37352
37353 if (GET_MODE (mask) != HImode)
37354 mask = gen_rtx_SUBREG (HImode, mask, 0);
37355
37356 /* If merge is 0 then we're about to emit z-masked variant. */
37357 if (const0_operand (merge, mode))
37358 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37359 /* If merge is the same as accum then emit merge-masked variant. */
37360 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37361 {
37362 merge = force_reg (mode, merge);
37363 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37364 }
37365 /* Merging with an unknown value can happen if we z-mask with -O0. */
37366 else
37367 {
37368 target = gen_reg_rtx (mode);
37369 emit_move_insn (target, merge);
37370 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37371 }
37372 }
37373 return target;
37374 }
37375
37376 case IX86_BUILTIN_4FNMASS:
37377 fcn = gen_avx5124fmaddps_4fnmaddss;
37378 masked = 0;
37379 goto s4fma_expand;
37380
37381 case IX86_BUILTIN_4FMASS:
37382 fcn = gen_avx5124fmaddps_4fmaddss;
37383 masked = 0;
37384 goto s4fma_expand;
37385
37386 case IX86_BUILTIN_4FNMASS_MASK:
37387 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
37388 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
37389 goto s4fma_expand;
37390
37391 case IX86_BUILTIN_4FMASS_MASK:
37392 {
37393 tree args[4];
37394 rtx ops[4];
37395 rtx wide_reg;
37396 rtx accum;
37397 rtx addr;
37398 rtx mem;
37399
37400 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
37401 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
37402
37403 s4fma_expand:
37404 mode = V4SFmode;
37405 wide_reg = gen_reg_rtx (V64SFmode);
37406 for (i = 0; i < 4; i++)
37407 {
37408 rtx tmp;
37409 args[i] = CALL_EXPR_ARG (exp, i);
37410 ops[i] = expand_normal (args[i]);
37411
37412 tmp = gen_reg_rtx (SFmode);
37413 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
37414
37415 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
37416 gen_rtx_SUBREG (V16SFmode, tmp, 0));
37417 }
37418
37419 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
37420 accum = force_reg (V4SFmode, accum);
37421
37422 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
37423 addr = force_reg (Pmode, addr);
37424
37425 mem = gen_rtx_MEM (V4SFmode, addr);
37426
37427 target = gen_reg_rtx (V4SFmode);
37428
37429 emit_move_insn (target, accum);
37430
37431 if (! masked)
37432 emit_insn (fcn (target, accum, wide_reg, mem));
37433 else
37434 {
37435 rtx merge, mask;
37436 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
37437
37438 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
37439
37440 if (CONST_INT_P (mask))
37441 mask = fixup_modeless_constant (mask, QImode);
37442
37443 mask = force_reg (QImode, mask);
37444
37445 if (GET_MODE (mask) != QImode)
37446 mask = gen_rtx_SUBREG (QImode, mask, 0);
37447
37448 /* If merge is 0 then we're about to emit z-masked variant. */
37449 if (const0_operand (merge, mode))
37450 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
37451 /* If merge is the same as accum then emit merge-masked
37452 variant. */
37453 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
37454 {
37455 merge = force_reg (mode, merge);
37456 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
37457 }
37458 /* Merging with an unknown value can happen if we z-mask
37459 with -O0. */
37460 else
37461 {
37462 target = gen_reg_rtx (mode);
37463 emit_move_insn (target, merge);
37464 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
37465 }
37466 }
37467 return target;
37468 }
37469 case IX86_BUILTIN_RDPID:
37470 return ix86_expand_special_args_builtin (bdesc_args2 + i, exp,
37471 target);
37472 default:
37473 return ix86_expand_args_builtin (bdesc_args2 + i, exp, target);
37474 }
37475 }
37476
37477 if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST
37478 && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS2_LAST)
37479 {
37480 i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS2_FIRST;
37481 return ix86_expand_special_args_builtin (bdesc_special_args2 + i, exp,
37482 target);
37483 }
37484
37485 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
37486 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
37487 {
37488 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
37489 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
37490 }
37491
37492 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
37493 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
37494 {
37495 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
37496 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
37497 }
37498
37499 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
37500 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
37501 {
37502 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
37503 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
37504 }
37505
37506 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
37507 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
37508 {
37509 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
37510 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
37511 }
37512
37513 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
37514 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
37515 {
37516 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
37517 const struct builtin_description *d = bdesc_multi_arg + i;
37518 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
37519 (enum ix86_builtin_func_type)
37520 d->flag, d->comparison);
37521 }
37522
37523 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
37524 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
37525 {
37526 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
37527 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
37528 target);
37529 }
37530
37531 if (fcode >= IX86_BUILTIN__BDESC_CET_NORMAL_FIRST
37532 && fcode <= IX86_BUILTIN__BDESC_CET_NORMAL_LAST)
37533 {
37534 i = fcode - IX86_BUILTIN__BDESC_CET_NORMAL_FIRST;
37535 return ix86_expand_args_builtin (bdesc_cet_rdssp + i, exp,
37536 target);
37537 }
37538
37539 gcc_unreachable ();
37540 }
37541
37542 /* This returns the target-specific builtin with code CODE if
37543 current_function_decl has visibility on this builtin, which is checked
37544 using isa flags. Returns NULL_TREE otherwise. */
37545
37546 static tree ix86_get_builtin (enum ix86_builtins code)
37547 {
37548 struct cl_target_option *opts;
37549 tree target_tree = NULL_TREE;
37550
37551 /* Determine the isa flags of current_function_decl. */
37552
37553 if (current_function_decl)
37554 target_tree = DECL_FUNCTION_SPECIFIC_TARGET (current_function_decl);
37555
37556 if (target_tree == NULL)
37557 target_tree = target_option_default_node;
37558
37559 opts = TREE_TARGET_OPTION (target_tree);
37560
37561 if ((ix86_builtins_isa[(int) code].isa & opts->x_ix86_isa_flags)
37562 || (ix86_builtins_isa[(int) code].isa2 & opts->x_ix86_isa_flags2))
37563 return ix86_builtin_decl (code, true);
37564 else
37565 return NULL_TREE;
37566 }
37567
37568 /* Return function decl for target specific builtin
37569 for the given MPX builtin passed in FCODE. */
37570 static tree
37571 ix86_builtin_mpx_function (unsigned fcode)
37572 {
37573 switch (fcode)
37574 {
37575 case BUILT_IN_CHKP_BNDMK:
37576 return ix86_builtins[IX86_BUILTIN_BNDMK];
37577
37578 case BUILT_IN_CHKP_BNDSTX:
37579 return ix86_builtins[IX86_BUILTIN_BNDSTX];
37580
37581 case BUILT_IN_CHKP_BNDLDX:
37582 return ix86_builtins[IX86_BUILTIN_BNDLDX];
37583
37584 case BUILT_IN_CHKP_BNDCL:
37585 return ix86_builtins[IX86_BUILTIN_BNDCL];
37586
37587 case BUILT_IN_CHKP_BNDCU:
37588 return ix86_builtins[IX86_BUILTIN_BNDCU];
37589
37590 case BUILT_IN_CHKP_BNDRET:
37591 return ix86_builtins[IX86_BUILTIN_BNDRET];
37592
37593 case BUILT_IN_CHKP_INTERSECT:
37594 return ix86_builtins[IX86_BUILTIN_BNDINT];
37595
37596 case BUILT_IN_CHKP_NARROW:
37597 return ix86_builtins[IX86_BUILTIN_BNDNARROW];
37598
37599 case BUILT_IN_CHKP_SIZEOF:
37600 return ix86_builtins[IX86_BUILTIN_SIZEOF];
37601
37602 case BUILT_IN_CHKP_EXTRACT_LOWER:
37603 return ix86_builtins[IX86_BUILTIN_BNDLOWER];
37604
37605 case BUILT_IN_CHKP_EXTRACT_UPPER:
37606 return ix86_builtins[IX86_BUILTIN_BNDUPPER];
37607
37608 default:
37609 return NULL_TREE;
37610 }
37611
37612 gcc_unreachable ();
37613 }
37614
37615 /* Helper function for ix86_load_bounds and ix86_store_bounds.
37616
37617 Return an address to be used to load/store bounds for pointer
37618 passed in SLOT.
37619
37620 SLOT_NO is an integer constant holding number of a target
37621 dependent special slot to be used in case SLOT is not a memory.
37622
37623 SPECIAL_BASE is a pointer to be used as a base of fake address
37624 to access special slots in Bounds Table. SPECIAL_BASE[-1],
37625 SPECIAL_BASE[-2] etc. will be used as fake pointer locations. */
37626
37627 static rtx
37628 ix86_get_arg_address_for_bt (rtx slot, rtx slot_no, rtx special_base)
37629 {
37630 rtx addr = NULL;
37631
37632 /* NULL slot means we pass bounds for pointer not passed to the
37633 function at all. Register slot means we pass pointer in a
37634 register. In both these cases bounds are passed via Bounds
37635 Table. Since we do not have actual pointer stored in memory,
37636 we have to use fake addresses to access Bounds Table. We
37637 start with (special_base - sizeof (void*)) and decrease this
37638 address by pointer size to get addresses for other slots. */
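/* For example, with 64-bit pointers slot 0 is addressed at
   special_base - 8, slot 1 at special_base - 16, and so on.  */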
37639 if (!slot || REG_P (slot))
37640 {
37641 gcc_assert (CONST_INT_P (slot_no));
37642 addr = plus_constant (Pmode, special_base,
37643 -(INTVAL (slot_no) + 1) * GET_MODE_SIZE (Pmode));
37644 }
37645 /* If pointer is passed in a memory then its address is used to
37646 access Bounds Table. */
37647 else if (MEM_P (slot))
37648 {
37649 addr = XEXP (slot, 0);
37650 if (!register_operand (addr, Pmode))
37651 addr = copy_addr_to_reg (addr);
37652 }
37653 else
37654 gcc_unreachable ();
37655
37656 return addr;
37657 }
37658
37659 /* Expand pass uses this hook to load bounds for function parameter
37660 PTR passed in SLOT in case its bounds are not passed in a register.
37661
37662 If SLOT is a memory, then bounds are loaded as for regular pointer
37663 loaded from memory. PTR may be NULL in case SLOT is a memory.
37664 In such case value of PTR (if required) may be loaded from SLOT.
37665
37666 If SLOT is NULL or a register then SLOT_NO is an integer constant
37667 holding number of the target dependent special slot which should be
37668 used to obtain bounds.
37669
37670 Return loaded bounds. */
37671
37672 static rtx
37673 ix86_load_bounds (rtx slot, rtx ptr, rtx slot_no)
37674 {
37675 rtx reg = gen_reg_rtx (BNDmode);
37676 rtx addr;
37677
37678 /* Get address to be used to access Bounds Table. Special slots start
37679 at the location of return address of the current function. */
37680 addr = ix86_get_arg_address_for_bt (slot, slot_no, arg_pointer_rtx);
37681
37682 /* Load pointer value from a memory if we don't have it. */
37683 if (!ptr)
37684 {
37685 gcc_assert (MEM_P (slot));
37686 ptr = copy_addr_to_reg (slot);
37687 }
37688
37689 if (!register_operand (ptr, Pmode))
37690 ptr = ix86_zero_extend_to_Pmode (ptr);
37691
37692 emit_insn (BNDmode == BND64mode
37693 ? gen_bnd64_ldx (reg, addr, ptr)
37694 : gen_bnd32_ldx (reg, addr, ptr));
37695
37696 return reg;
37697 }
37698
37699 /* Expand pass uses this hook to store BOUNDS for call argument PTR
37700 passed in SLOT in case BOUNDS are not passed in a register.
37701
37702 If SLOT is a memory, then BOUNDS are stored as for regular pointer
37703 stored in memory. PTR may be NULL in case SLOT is a memory.
37704 In such case value of PTR (if required) may be loaded from SLOT.
37705
37706 If SLOT is NULL or a register then SLOT_NO is an integer constant
37707 holding number of the target dependent special slot which should be
37708 used to store BOUNDS. */
37709
37710 static void
37711 ix86_store_bounds (rtx ptr, rtx slot, rtx bounds, rtx slot_no)
37712 {
37713 rtx addr;
37714
37715 /* Get address to be used to access Bounds Table. Special slots start
37716 at the location of return address of a called function. */
37717 addr = ix86_get_arg_address_for_bt (slot, slot_no, stack_pointer_rtx);
37718
37719 /* Load pointer value from a memory if we don't have it. */
37720 if (!ptr)
37721 {
37722 gcc_assert (MEM_P (slot));
37723 ptr = copy_addr_to_reg (slot);
37724 }
37725
37726 if (!register_operand (ptr, Pmode))
37727 ptr = ix86_zero_extend_to_Pmode (ptr);
37728
37729 gcc_assert (POINTER_BOUNDS_MODE_P (GET_MODE (bounds)));
37730 if (!register_operand (bounds, BNDmode))
37731 bounds = copy_to_mode_reg (BNDmode, bounds);
37732
37733 emit_insn (BNDmode == BND64mode
37734 ? gen_bnd64_stx (addr, ptr, bounds)
37735 : gen_bnd32_stx (addr, ptr, bounds));
37736 }
37737
37738 /* Load and return bounds returned by function in SLOT. */
37739
37740 static rtx
37741 ix86_load_returned_bounds (rtx slot)
37742 {
37743 rtx res;
37744
37745 gcc_assert (REG_P (slot));
37746 res = gen_reg_rtx (BNDmode);
37747 emit_move_insn (res, slot);
37748
37749 return res;
37750 }
37751
37752 /* Store BOUNDS returned by function into SLOT. */
37753
37754 static void
37755 ix86_store_returned_bounds (rtx slot, rtx bounds)
37756 {
37757 gcc_assert (REG_P (slot));
37758 emit_move_insn (slot, bounds);
37759 }
37760
37761 /* Returns a function decl for a vectorized version of the combined function
37762 with combined_fn code FN and the result vector type TYPE, or NULL_TREE
37763 if it is not available. */
37764
37765 static tree
37766 ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
37767 tree type_in)
37768 {
37769 machine_mode in_mode, out_mode;
37770 int in_n, out_n;
37771
37772 if (TREE_CODE (type_out) != VECTOR_TYPE
37773 || TREE_CODE (type_in) != VECTOR_TYPE)
37774 return NULL_TREE;
37775
37776 out_mode = TYPE_MODE (TREE_TYPE (type_out));
37777 out_n = TYPE_VECTOR_SUBPARTS (type_out);
37778 in_mode = TYPE_MODE (TREE_TYPE (type_in));
37779 in_n = TYPE_VECTOR_SUBPARTS (type_in);
37780
37781 switch (fn)
37782 {
37783 CASE_CFN_EXP2:
37784 if (out_mode == SFmode && in_mode == SFmode)
37785 {
37786 if (out_n == 16 && in_n == 16)
37787 return ix86_get_builtin (IX86_BUILTIN_EXP2PS);
37788 }
37789 break;
37790
37791 CASE_CFN_IFLOOR:
37792 CASE_CFN_LFLOOR:
37793 CASE_CFN_LLFLOOR:
37794 /* The round insn does not trap on denormals. */
37795 if (flag_trapping_math || !TARGET_SSE4_1)
37796 break;
37797
37798 if (out_mode == SImode && in_mode == DFmode)
37799 {
37800 if (out_n == 4 && in_n == 2)
37801 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX);
37802 else if (out_n == 8 && in_n == 4)
37803 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256);
37804 else if (out_n == 16 && in_n == 8)
37805 return ix86_get_builtin (IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX512);
37806 }
37807 if (out_mode == SImode && in_mode == SFmode)
37808 {
37809 if (out_n == 4 && in_n == 4)
37810 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX);
37811 else if (out_n == 8 && in_n == 8)
37812 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX256);
37813 else if (out_n == 16 && in_n == 16)
37814 return ix86_get_builtin (IX86_BUILTIN_FLOORPS_SFIX512);
37815 }
37816 break;
37817
37818 CASE_CFN_ICEIL:
37819 CASE_CFN_LCEIL:
37820 CASE_CFN_LLCEIL:
37821 /* The round insn does not trap on denormals. */
37822 if (flag_trapping_math || !TARGET_SSE4_1)
37823 break;
37824
37825 if (out_mode == SImode && in_mode == DFmode)
37826 {
37827 if (out_n == 4 && in_n == 2)
37828 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX);
37829 else if (out_n == 8 && in_n == 4)
37830 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256);
37831 else if (out_n == 16 && in_n == 8)
37832 return ix86_get_builtin (IX86_BUILTIN_CEILPD_VEC_PACK_SFIX512);
37833 }
37834 if (out_mode == SImode && in_mode == SFmode)
37835 {
37836 if (out_n == 4 && in_n == 4)
37837 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX);
37838 else if (out_n == 8 && in_n == 8)
37839 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX256);
37840 else if (out_n == 16 && in_n == 16)
37841 return ix86_get_builtin (IX86_BUILTIN_CEILPS_SFIX512);
37842 }
37843 break;
37844
37845 CASE_CFN_IRINT:
37846 CASE_CFN_LRINT:
37847 CASE_CFN_LLRINT:
37848 if (out_mode == SImode && in_mode == DFmode)
37849 {
37850 if (out_n == 4 && in_n == 2)
37851 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX);
37852 else if (out_n == 8 && in_n == 4)
37853 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX256);
37854 else if (out_n == 16 && in_n == 8)
37855 return ix86_get_builtin (IX86_BUILTIN_VEC_PACK_SFIX512);
37856 }
37857 if (out_mode == SImode && in_mode == SFmode)
37858 {
37859 if (out_n == 4 && in_n == 4)
37860 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ);
37861 else if (out_n == 8 && in_n == 8)
37862 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ256);
37863 else if (out_n == 16 && in_n == 16)
37864 return ix86_get_builtin (IX86_BUILTIN_CVTPS2DQ512);
37865 }
37866 break;
37867
37868 CASE_CFN_IROUND:
37869 CASE_CFN_LROUND:
37870 CASE_CFN_LLROUND:
37871 /* The round insn does not trap on denormals. */
37872 if (flag_trapping_math || !TARGET_SSE4_1)
37873 break;
37874
37875 if (out_mode == SImode && in_mode == DFmode)
37876 {
37877 if (out_n == 4 && in_n == 2)
37878 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX);
37879 else if (out_n == 8 && in_n == 4)
37880 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256);
37881 else if (out_n == 16 && in_n == 8)
37882 return ix86_get_builtin (IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX512);
37883 }
37884 if (out_mode == SImode && in_mode == SFmode)
37885 {
37886 if (out_n == 4 && in_n == 4)
37887 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX);
37888 else if (out_n == 8 && in_n == 8)
37889 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX256);
37890 else if (out_n == 16 && in_n == 16)
37891 return ix86_get_builtin (IX86_BUILTIN_ROUNDPS_AZ_SFIX512);
37892 }
37893 break;
37894
37895 CASE_CFN_FLOOR:
37896 /* The round insn does not trap on denormals. */
37897 if (flag_trapping_math || !TARGET_SSE4_1)
37898 break;
37899
37900 if (out_mode == DFmode && in_mode == DFmode)
37901 {
37902 if (out_n == 2 && in_n == 2)
37903 return ix86_get_builtin (IX86_BUILTIN_FLOORPD);
37904 else if (out_n == 4 && in_n == 4)
37905 return ix86_get_builtin (IX86_BUILTIN_FLOORPD256);
37906 else if (out_n == 8 && in_n == 8)
37907 return ix86_get_builtin (IX86_BUILTIN_FLOORPD512);
37908 }
37909 if (out_mode == SFmode && in_mode == SFmode)
37910 {
37911 if (out_n == 4 && in_n == 4)
37912 return ix86_get_builtin (IX86_BUILTIN_FLOORPS);
37913 else if (out_n == 8 && in_n == 8)
37914 return ix86_get_builtin (IX86_BUILTIN_FLOORPS256);
37915 else if (out_n == 16 && in_n == 16)
37916 return ix86_get_builtin (IX86_BUILTIN_FLOORPS512);
37917 }
37918 break;
37919
37920 CASE_CFN_CEIL:
37921 /* The round insn does not trap on denormals. */
37922 if (flag_trapping_math || !TARGET_SSE4_1)
37923 break;
37924
37925 if (out_mode == DFmode && in_mode == DFmode)
37926 {
37927 if (out_n == 2 && in_n == 2)
37928 return ix86_get_builtin (IX86_BUILTIN_CEILPD);
37929 else if (out_n == 4 && in_n == 4)
37930 return ix86_get_builtin (IX86_BUILTIN_CEILPD256);
37931 else if (out_n == 8 && in_n == 8)
37932 return ix86_get_builtin (IX86_BUILTIN_CEILPD512);
37933 }
37934 if (out_mode == SFmode && in_mode == SFmode)
37935 {
37936 if (out_n == 4 && in_n == 4)
37937 return ix86_get_builtin (IX86_BUILTIN_CEILPS);
37938 else if (out_n == 8 && in_n == 8)
37939 return ix86_get_builtin (IX86_BUILTIN_CEILPS256);
37940 else if (out_n == 16 && in_n == 16)
37941 return ix86_get_builtin (IX86_BUILTIN_CEILPS512);
37942 }
37943 break;
37944
37945 CASE_CFN_TRUNC:
37946 /* The round insn does not trap on denormals. */
37947 if (flag_trapping_math || !TARGET_SSE4_1)
37948 break;
37949
37950 if (out_mode == DFmode && in_mode == DFmode)
37951 {
37952 if (out_n == 2 && in_n == 2)
37953 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD);
37954 else if (out_n == 4 && in_n == 4)
37955 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD256);
37956 else if (out_n == 8 && in_n == 8)
37957 return ix86_get_builtin (IX86_BUILTIN_TRUNCPD512);
37958 }
37959 if (out_mode == SFmode && in_mode == SFmode)
37960 {
37961 if (out_n == 4 && in_n == 4)
37962 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS);
37963 else if (out_n == 8 && in_n == 8)
37964 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS256);
37965 else if (out_n == 16 && in_n == 16)
37966 return ix86_get_builtin (IX86_BUILTIN_TRUNCPS512);
37967 }
37968 break;
37969
37970 CASE_CFN_RINT:
37971 /* The round insn does not trap on denormals. */
37972 if (flag_trapping_math || !TARGET_SSE4_1)
37973 break;
37974
37975 if (out_mode == DFmode && in_mode == DFmode)
37976 {
37977 if (out_n == 2 && in_n == 2)
37978 return ix86_get_builtin (IX86_BUILTIN_RINTPD);
37979 else if (out_n == 4 && in_n == 4)
37980 return ix86_get_builtin (IX86_BUILTIN_RINTPD256);
37981 }
37982 if (out_mode == SFmode && in_mode == SFmode)
37983 {
37984 if (out_n == 4 && in_n == 4)
37985 return ix86_get_builtin (IX86_BUILTIN_RINTPS);
37986 else if (out_n == 8 && in_n == 8)
37987 return ix86_get_builtin (IX86_BUILTIN_RINTPS256);
37988 }
37989 break;
37990
37991 CASE_CFN_FMA:
37992 if (out_mode == DFmode && in_mode == DFmode)
37993 {
37994 if (out_n == 2 && in_n == 2)
37995 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD);
37996 if (out_n == 4 && in_n == 4)
37997 return ix86_get_builtin (IX86_BUILTIN_VFMADDPD256);
37998 }
37999 if (out_mode == SFmode && in_mode == SFmode)
38000 {
38001 if (out_n == 4 && in_n == 4)
38002 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS);
38003 if (out_n == 8 && in_n == 8)
38004 return ix86_get_builtin (IX86_BUILTIN_VFMADDPS256);
38005 }
38006 break;
38007
38008 default:
38009 break;
38010 }
38011
38012 /* Dispatch to a handler for a vectorization library. */
38013 if (ix86_veclib_handler)
38014 return ix86_veclib_handler (combined_fn (fn), type_out, type_in);
38015
38016 return NULL_TREE;
38017 }
38018
38019 /* Handler for an SVML-style interface to
38020 a library with vectorized intrinsics. */
38021
38022 static tree
38023 ix86_veclibabi_svml (combined_fn fn, tree type_out, tree type_in)
38024 {
38025 char name[20];
38026 tree fntype, new_fndecl, args;
38027 unsigned arity;
38028 const char *bname;
38029 machine_mode el_mode, in_mode;
38030 int n, in_n;
38031
38032 /* The SVML is suitable for unsafe math only. */
38033 if (!flag_unsafe_math_optimizations)
38034 return NULL_TREE;
38035
38036 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38037 n = TYPE_VECTOR_SUBPARTS (type_out);
38038 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38039 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38040 if (el_mode != in_mode
38041 || n != in_n)
38042 return NULL_TREE;
38043
38044 switch (fn)
38045 {
38046 CASE_CFN_EXP:
38047 CASE_CFN_LOG:
38048 CASE_CFN_LOG10:
38049 CASE_CFN_POW:
38050 CASE_CFN_TANH:
38051 CASE_CFN_TAN:
38052 CASE_CFN_ATAN:
38053 CASE_CFN_ATAN2:
38054 CASE_CFN_ATANH:
38055 CASE_CFN_CBRT:
38056 CASE_CFN_SINH:
38057 CASE_CFN_SIN:
38058 CASE_CFN_ASINH:
38059 CASE_CFN_ASIN:
38060 CASE_CFN_COSH:
38061 CASE_CFN_COS:
38062 CASE_CFN_ACOSH:
38063 CASE_CFN_ACOS:
38064 if ((el_mode != DFmode || n != 2)
38065 && (el_mode != SFmode || n != 4))
38066 return NULL_TREE;
38067 break;
38068
38069 default:
38070 return NULL_TREE;
38071 }
38072
38073 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38074 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38075
38076 if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOGF)
38077 strcpy (name, "vmlsLn4");
38078 else if (DECL_FUNCTION_CODE (fndecl) == BUILT_IN_LOG)
38079 strcpy (name, "vmldLn2");
38080 else if (n == 4)
38081 {
38082 sprintf (name, "vmls%s", bname+10);
38083 name[strlen (name)-1] = '4';
38084 }
38085 else
38086 sprintf (name, "vmld%s2", bname+10);
38087
38088 /* Convert to uppercase. */
38089 name[4] &= ~0x20;
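  /* For illustration: for a 4-wide SFmode sinf the code above yields
     "vmlsSin4", and for a 2-wide DFmode sin it yields "vmldSin2"
     (bname + 10 skips the "__builtin_" prefix). */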
38090
38091 arity = 0;
38092 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38093 arity++;
38094
38095 if (arity == 1)
38096 fntype = build_function_type_list (type_out, type_in, NULL);
38097 else
38098 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38099
38100 /* Build a function declaration for the vectorized function. */
38101 new_fndecl = build_decl (BUILTINS_LOCATION,
38102 FUNCTION_DECL, get_identifier (name), fntype);
38103 TREE_PUBLIC (new_fndecl) = 1;
38104 DECL_EXTERNAL (new_fndecl) = 1;
38105 DECL_IS_NOVOPS (new_fndecl) = 1;
38106 TREE_READONLY (new_fndecl) = 1;
38107
38108 return new_fndecl;
38109 }
38110
38111 /* Handler for an ACML-style interface to
38112 a library with vectorized intrinsics. */
38113
38114 static tree
38115 ix86_veclibabi_acml (combined_fn fn, tree type_out, tree type_in)
38116 {
38117 char name[20] = "__vr.._";
38118 tree fntype, new_fndecl, args;
38119 unsigned arity;
38120 const char *bname;
38121 machine_mode el_mode, in_mode;
38122 int n, in_n;
38123
38124 /* The ACML is 64-bit only and suitable for unsafe math only, as
38125 it does not correctly support parts of IEEE with the required
38126 precision, such as denormals. */
38127 if (!TARGET_64BIT
38128 || !flag_unsafe_math_optimizations)
38129 return NULL_TREE;
38130
38131 el_mode = TYPE_MODE (TREE_TYPE (type_out));
38132 n = TYPE_VECTOR_SUBPARTS (type_out);
38133 in_mode = TYPE_MODE (TREE_TYPE (type_in));
38134 in_n = TYPE_VECTOR_SUBPARTS (type_in);
38135 if (el_mode != in_mode
38136 || n != in_n)
38137 return NULL_TREE;
38138
38139 switch (fn)
38140 {
38141 CASE_CFN_SIN:
38142 CASE_CFN_COS:
38143 CASE_CFN_EXP:
38144 CASE_CFN_LOG:
38145 CASE_CFN_LOG2:
38146 CASE_CFN_LOG10:
38147 if (el_mode == DFmode && n == 2)
38148 {
38149 name[4] = 'd';
38150 name[5] = '2';
38151 }
38152 else if (el_mode == SFmode && n == 4)
38153 {
38154 name[4] = 's';
38155 name[5] = '4';
38156 }
38157 else
38158 return NULL_TREE;
38159 break;
38160
38161 default:
38162 return NULL_TREE;
38163 }
38164
38165 tree fndecl = mathfn_built_in (TREE_TYPE (type_in), fn);
38166 bname = IDENTIFIER_POINTER (DECL_NAME (fndecl));
38167 sprintf (name + 7, "%s", bname+10);
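  /* For illustration: the "__vr.._" template filled in above gives
     e.g. "__vrd2_sin" for a 2-wide DFmode sin and "__vrs4_sinf" for a
     4-wide SFmode sinf. */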
38168
38169 arity = 0;
38170 for (args = DECL_ARGUMENTS (fndecl); args; args = TREE_CHAIN (args))
38171 arity++;
38172
38173 if (arity == 1)
38174 fntype = build_function_type_list (type_out, type_in, NULL);
38175 else
38176 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
38177
38178 /* Build a function declaration for the vectorized function. */
38179 new_fndecl = build_decl (BUILTINS_LOCATION,
38180 FUNCTION_DECL, get_identifier (name), fntype);
38181 TREE_PUBLIC (new_fndecl) = 1;
38182 DECL_EXTERNAL (new_fndecl) = 1;
38183 DECL_IS_NOVOPS (new_fndecl) = 1;
38184 TREE_READONLY (new_fndecl) = 1;
38185
38186 return new_fndecl;
38187 }
38188
38189 /* Returns a decl of a function that implements gather load with
38190 memory type MEM_VECTYPE and index type INDEX_TYPE and SCALE.
38191 Return NULL_TREE if it is not available. */
38192
38193 static tree
38194 ix86_vectorize_builtin_gather (const_tree mem_vectype,
38195 const_tree index_type, int scale)
38196 {
38197 bool si;
38198 enum ix86_builtins code;
38199
38200 if (! TARGET_AVX2)
38201 return NULL_TREE;
38202
38203 if ((TREE_CODE (index_type) != INTEGER_TYPE
38204 && !POINTER_TYPE_P (index_type))
38205 || (TYPE_MODE (index_type) != SImode
38206 && TYPE_MODE (index_type) != DImode))
38207 return NULL_TREE;
38208
38209 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38210 return NULL_TREE;
38211
38212 /* v*gather* insn sign extends index to pointer mode. */
38213 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38214 && TYPE_UNSIGNED (index_type))
38215 return NULL_TREE;
38216
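  /* Scale can be 1, 2, 4 or 8 (a power of two), as in the scatter
     case below. */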
38217 if (scale <= 0
38218 || scale > 8
38219 || (scale & (scale - 1)) != 0)
38220 return NULL_TREE;
38221
38222 si = TYPE_MODE (index_type) == SImode;
38223 switch (TYPE_MODE (mem_vectype))
38224 {
38225 case E_V2DFmode:
38226 if (TARGET_AVX512VL)
38227 code = si ? IX86_BUILTIN_GATHER3SIV2DF : IX86_BUILTIN_GATHER3DIV2DF;
38228 else
38229 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
38230 break;
38231 case E_V4DFmode:
38232 if (TARGET_AVX512VL)
38233 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DF : IX86_BUILTIN_GATHER3DIV4DF;
38234 else
38235 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
38236 break;
38237 case E_V2DImode:
38238 if (TARGET_AVX512VL)
38239 code = si ? IX86_BUILTIN_GATHER3SIV2DI : IX86_BUILTIN_GATHER3DIV2DI;
38240 else
38241 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
38242 break;
38243 case E_V4DImode:
38244 if (TARGET_AVX512VL)
38245 code = si ? IX86_BUILTIN_GATHER3ALTSIV4DI : IX86_BUILTIN_GATHER3DIV4DI;
38246 else
38247 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
38248 break;
38249 case E_V4SFmode:
38250 if (TARGET_AVX512VL)
38251 code = si ? IX86_BUILTIN_GATHER3SIV4SF : IX86_BUILTIN_GATHER3DIV4SF;
38252 else
38253 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
38254 break;
38255 case E_V8SFmode:
38256 if (TARGET_AVX512VL)
38257 code = si ? IX86_BUILTIN_GATHER3SIV8SF : IX86_BUILTIN_GATHER3ALTDIV8SF;
38258 else
38259 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
38260 break;
38261 case E_V4SImode:
38262 if (TARGET_AVX512VL)
38263 code = si ? IX86_BUILTIN_GATHER3SIV4SI : IX86_BUILTIN_GATHER3DIV4SI;
38264 else
38265 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
38266 break;
38267 case E_V8SImode:
38268 if (TARGET_AVX512VL)
38269 code = si ? IX86_BUILTIN_GATHER3SIV8SI : IX86_BUILTIN_GATHER3ALTDIV8SI;
38270 else
38271 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
38272 break;
38273 case E_V8DFmode:
38274 if (TARGET_AVX512F)
38275 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DF : IX86_BUILTIN_GATHER3DIV8DF;
38276 else
38277 return NULL_TREE;
38278 break;
38279 case E_V8DImode:
38280 if (TARGET_AVX512F)
38281 code = si ? IX86_BUILTIN_GATHER3ALTSIV8DI : IX86_BUILTIN_GATHER3DIV8DI;
38282 else
38283 return NULL_TREE;
38284 break;
38285 case E_V16SFmode:
38286 if (TARGET_AVX512F)
38287 code = si ? IX86_BUILTIN_GATHER3SIV16SF : IX86_BUILTIN_GATHER3ALTDIV16SF;
38288 else
38289 return NULL_TREE;
38290 break;
38291 case E_V16SImode:
38292 if (TARGET_AVX512F)
38293 code = si ? IX86_BUILTIN_GATHER3SIV16SI : IX86_BUILTIN_GATHER3ALTDIV16SI;
38294 else
38295 return NULL_TREE;
38296 break;
38297 default:
38298 return NULL_TREE;
38299 }
38300
38301 return ix86_get_builtin (code);
38302 }
38303
38304 /* Returns a decl of a function that implements scatter store with
38305 register type VECTYPE and index type INDEX_TYPE and SCALE.
38306 Return NULL_TREE if it is not available. */
38307
38308 static tree
38309 ix86_vectorize_builtin_scatter (const_tree vectype,
38310 const_tree index_type, int scale)
38311 {
38312 bool si;
38313 enum ix86_builtins code;
38314
38315 if (!TARGET_AVX512F)
38316 return NULL_TREE;
38317
38318 if ((TREE_CODE (index_type) != INTEGER_TYPE
38319 && !POINTER_TYPE_P (index_type))
38320 || (TYPE_MODE (index_type) != SImode
38321 && TYPE_MODE (index_type) != DImode))
38322 return NULL_TREE;
38323
38324 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
38325 return NULL_TREE;
38326
38327 /* v*scatter* insn sign extends index to pointer mode. */
38328 if (TYPE_PRECISION (index_type) < POINTER_SIZE
38329 && TYPE_UNSIGNED (index_type))
38330 return NULL_TREE;
38331
38332 /* Scale can be 1, 2, 4 or 8. */
38333 if (scale <= 0
38334 || scale > 8
38335 || (scale & (scale - 1)) != 0)
38336 return NULL_TREE;
38337
38338 si = TYPE_MODE (index_type) == SImode;
38339 switch (TYPE_MODE (vectype))
38340 {
38341 case E_V8DFmode:
38342 code = si ? IX86_BUILTIN_SCATTERALTSIV8DF : IX86_BUILTIN_SCATTERDIV8DF;
38343 break;
38344 case E_V8DImode:
38345 code = si ? IX86_BUILTIN_SCATTERALTSIV8DI : IX86_BUILTIN_SCATTERDIV8DI;
38346 break;
38347 case E_V16SFmode:
38348 code = si ? IX86_BUILTIN_SCATTERSIV16SF : IX86_BUILTIN_SCATTERALTDIV16SF;
38349 break;
38350 case E_V16SImode:
38351 code = si ? IX86_BUILTIN_SCATTERSIV16SI : IX86_BUILTIN_SCATTERALTDIV16SI;
38352 break;
38353 default:
38354 return NULL_TREE;
38355 }
38356
38357 return ix86_builtins[code];
38358 }
38359
38360 /* Return true if it is safe to use the rsqrt optabs to optimize
38361 1.0/sqrt. */
38362
38363 static bool
38364 use_rsqrt_p ()
38365 {
38366 return (TARGET_SSE_MATH
38367 && flag_finite_math_only
38368 && !flag_trapping_math
38369 && flag_unsafe_math_optimizations);
38370 }
38371
38372 /* Returns a decl for a target-specific builtin that implements
38373 the reciprocal of the function, or NULL_TREE if not available. */
38374
38375 static tree
38376 ix86_builtin_reciprocal (tree fndecl)
38377 {
38378 switch (DECL_FUNCTION_CODE (fndecl))
38379 {
38380 /* Vectorized version of sqrt to rsqrt conversion. */
38381 case IX86_BUILTIN_SQRTPS_NR:
38382 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR);
38383
38384 case IX86_BUILTIN_SQRTPS_NR256:
38385 return ix86_get_builtin (IX86_BUILTIN_RSQRTPS_NR256);
38386
38387 default:
38388 return NULL_TREE;
38389 }
38390 }
38391 \f
38392 /* Helper for avx_vpermilps256_operand et al. This is also used by
38393 the expansion functions to turn the parallel back into a mask.
38394 The return value is 0 for no match and the imm8+1 for a match. */
38395
38396 int
38397 avx_vpermilp_parallel (rtx par, machine_mode mode)
38398 {
38399 unsigned i, nelt = GET_MODE_NUNITS (mode);
38400 unsigned mask = 0;
38401 unsigned char ipar[16] = {}; /* Silence -Wuninitialized warning. */
38402
38403 if (XVECLEN (par, 0) != (int) nelt)
38404 return 0;
38405
38406 /* Validate that all of the elements are constants, and not totally
38407 out of range. Copy the data into an integral array to make the
38408 subsequent checks easier. */
38409 for (i = 0; i < nelt; ++i)
38410 {
38411 rtx er = XVECEXP (par, 0, i);
38412 unsigned HOST_WIDE_INT ei;
38413
38414 if (!CONST_INT_P (er))
38415 return 0;
38416 ei = INTVAL (er);
38417 if (ei >= nelt)
38418 return 0;
38419 ipar[i] = ei;
38420 }
38421
38422 switch (mode)
38423 {
38424 case E_V8DFmode:
38425 /* In the 512-bit DFmode case, we can only move elements within
38426 a 128-bit lane. First fill the second part of the mask,
38427 then fallthru. */
38428 for (i = 4; i < 6; ++i)
38429 {
38430 if (ipar[i] < 4 || ipar[i] >= 6)
38431 return 0;
38432 mask |= (ipar[i] - 4) << i;
38433 }
38434 for (i = 6; i < 8; ++i)
38435 {
38436 if (ipar[i] < 6)
38437 return 0;
38438 mask |= (ipar[i] - 6) << i;
38439 }
38440 /* FALLTHRU */
38441
38442 case E_V4DFmode:
38443 /* In the 256-bit DFmode case, we can only move elements within
38444 a 128-bit lane. */
38445 for (i = 0; i < 2; ++i)
38446 {
38447 if (ipar[i] >= 2)
38448 return 0;
38449 mask |= ipar[i] << i;
38450 }
38451 for (i = 2; i < 4; ++i)
38452 {
38453 if (ipar[i] < 2)
38454 return 0;
38455 mask |= (ipar[i] - 2) << i;
38456 }
38457 break;
38458
38459 case E_V16SFmode:
38460 /* In the 512-bit SFmode case, the permutation in the upper 256 bits
38461 must mirror the permutation in the lower 256 bits. */
38462 for (i = 0; i < 8; ++i)
38463 if (ipar[i] + 8 != ipar[i + 8])
38464 return 0;
38465 /* FALLTHRU */
38466
38467 case E_V8SFmode:
38468 /* In the 256-bit SFmode case, we have full freedom of
38469 movement within the low 128-bit lane, but the high 128-bit
38470 lane must mirror the exact same pattern. */
38471 for (i = 0; i < 4; ++i)
38472 if (ipar[i] + 4 != ipar[i + 4])
38473 return 0;
38474 nelt = 4;
38475 /* FALLTHRU */
38476
38477 case E_V2DFmode:
38478 case E_V4SFmode:
38479 /* In the 128-bit case, we've full freedom in the placement of
38480 the elements from the source operand. */
38481 for (i = 0; i < nelt; ++i)
38482 mask |= ipar[i] << (i * (nelt / 2));
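      /* For illustration: a V4SFmode parallel (2 3 2 3) yields mask 0xee
         here; the caller recovers the imm8 by subtracting one from the
         return value. */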
38483 break;
38484
38485 default:
38486 gcc_unreachable ();
38487 }
38488
38489 /* Make sure success has a non-zero value by adding one. */
38490 return mask + 1;
38491 }
38492
38493 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
38494 the expansion functions to turn the parallel back into a mask.
38495 The return value is 0 for no match and the imm8+1 for a match. */
38496
38497 int
38498 avx_vperm2f128_parallel (rtx par, machine_mode mode)
38499 {
38500 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
38501 unsigned mask = 0;
38502 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
38503
38504 if (XVECLEN (par, 0) != (int) nelt)
38505 return 0;
38506
38507 /* Validate that all of the elements are constants, and not totally
38508 out of range. Copy the data into an integral array to make the
38509 subsequent checks easier. */
38510 for (i = 0; i < nelt; ++i)
38511 {
38512 rtx er = XVECEXP (par, 0, i);
38513 unsigned HOST_WIDE_INT ei;
38514
38515 if (!CONST_INT_P (er))
38516 return 0;
38517 ei = INTVAL (er);
38518 if (ei >= 2 * nelt)
38519 return 0;
38520 ipar[i] = ei;
38521 }
38522
38523 /* Validate that the halves of the permute are halves. */
38524 for (i = 0; i < nelt2 - 1; ++i)
38525 if (ipar[i] + 1 != ipar[i + 1])
38526 return 0;
38527 for (i = nelt2; i < nelt - 1; ++i)
38528 if (ipar[i] + 1 != ipar[i + 1])
38529 return 0;
38530
38531 /* Reconstruct the mask. */
38532 for (i = 0; i < 2; ++i)
38533 {
38534 unsigned e = ipar[i * nelt2];
38535 if (e % nelt2)
38536 return 0;
38537 e /= nelt2;
38538 mask |= e << (i * 4);
38539 }
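  /* For illustration: a V8SFmode parallel (0 1 2 3 12 13 14 15) selects the
     low half of the first operand and the high half of the second; the loop
     above builds mask 0x30, and the function returns 0x31 (imm8 plus one). */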
38540
38541 /* Make sure success has a non-zero value by adding one. */
38542 return mask + 1;
38543 }
38544 \f
38545 /* Return a register priority for hard reg REGNO. */
38546 static int
38547 ix86_register_priority (int hard_regno)
38548 {
38549 /* ebp and r13 as the base always wants a displacement, r12 as the
38550 base always wants an index. So discourage their usage in an
38551 address. */
38552 if (hard_regno == R12_REG || hard_regno == R13_REG)
38553 return 0;
38554 if (hard_regno == BP_REG)
38555 return 1;
38556 /* New x86-64 int registers result in bigger code size. Discourage
38557 them. */
38558 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
38559 return 2;
38560 /* New x86-64 SSE registers result in bigger code size. Discourage
38561 them. */
38562 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
38563 return 2;
38564 /* Usage of AX register results in smaller code. Prefer it. */
38565 if (hard_regno == AX_REG)
38566 return 4;
38567 return 3;
38568 }
38569
38570 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
38571
38572 Put float CONST_DOUBLE in the constant pool instead of fp regs.
38573 QImode must go into class Q_REGS.
38574 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
38575 movdf to do mem-to-mem moves through integer regs. */
38576
38577 static reg_class_t
38578 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
38579 {
38580 machine_mode mode = GET_MODE (x);
38581
38582 /* We're only allowed to return a subclass of CLASS. Many of the
38583 following checks fail for NO_REGS, so eliminate that early. */
38584 if (regclass == NO_REGS)
38585 return NO_REGS;
38586
38587 /* All classes can load zeros. */
38588 if (x == CONST0_RTX (mode))
38589 return regclass;
38590
38591 /* Force constants into memory if we are loading a (nonzero) constant into
38592 an MMX, SSE or MASK register. This is because there are no MMX/SSE/MASK
38593 instructions to load from a constant. */
38594 if (CONSTANT_P (x)
38595 && (MAYBE_MMX_CLASS_P (regclass)
38596 || MAYBE_SSE_CLASS_P (regclass)
38597 || MAYBE_MASK_CLASS_P (regclass)))
38598 return NO_REGS;
38599
38600 /* Floating-point constants need more complex checks. */
38601 if (CONST_DOUBLE_P (x))
38602 {
38603 /* General regs can load everything. */
38604 if (INTEGER_CLASS_P (regclass))
38605 return regclass;
38606
38607 /* Floats can load 0 and 1 plus some others. Note that we eliminated
38608 zero above. We only want to wind up preferring 80387 registers if
38609 we plan on doing computation with them. */
38610 if (IS_STACK_MODE (mode)
38611 && standard_80387_constant_p (x) > 0)
38612 {
38613 /* Limit class to FP regs. */
38614 if (FLOAT_CLASS_P (regclass))
38615 return FLOAT_REGS;
38616 else if (regclass == FP_TOP_SSE_REGS)
38617 return FP_TOP_REG;
38618 else if (regclass == FP_SECOND_SSE_REGS)
38619 return FP_SECOND_REG;
38620 }
38621
38622 return NO_REGS;
38623 }
38624
38625 /* Prefer SSE regs only, if we can use them for math. */
38626 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38627 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
38628
38629 /* Generally when we see PLUS here, it's the function invariant
38630 (plus soft-fp const_int). Which can only be computed into general
38631 regs. */
38632 if (GET_CODE (x) == PLUS)
38633 return INTEGER_CLASS_P (regclass) ? regclass : NO_REGS;
38634
38635 /* QImode constants are easy to load, but non-constant QImode data
38636 must go into Q_REGS. */
38637 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
38638 {
38639 if (Q_CLASS_P (regclass))
38640 return regclass;
38641 else if (reg_class_subset_p (Q_REGS, regclass))
38642 return Q_REGS;
38643 else
38644 return NO_REGS;
38645 }
38646
38647 return regclass;
38648 }
38649
38650 /* Discourage putting floating-point values in SSE registers unless
38651 SSE math is being used, and likewise for the 387 registers. */
38652 static reg_class_t
38653 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
38654 {
38655 machine_mode mode = GET_MODE (x);
38656
38657 /* Restrict the output reload class to the register bank that we are doing
38658 math on. If we would like not to return a subset of CLASS, reject this
38659 alternative: if reload cannot do this, it will still use its choice. */
38660 mode = GET_MODE (x);
38661 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
38662 return MAYBE_SSE_CLASS_P (regclass) ? ALL_SSE_REGS : NO_REGS;
38663
38664 if (IS_STACK_MODE (mode))
38665 {
38666 if (regclass == FP_TOP_SSE_REGS)
38667 return FP_TOP_REG;
38668 else if (regclass == FP_SECOND_SSE_REGS)
38669 return FP_SECOND_REG;
38670 else
38671 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
38672 }
38673
38674 return regclass;
38675 }
38676
38677 static reg_class_t
38678 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
38679 machine_mode mode, secondary_reload_info *sri)
38680 {
38681 /* Double-word spills from general registers to non-offsettable memory
38682 references (zero-extended addresses) require special handling. */
38683 if (TARGET_64BIT
38684 && MEM_P (x)
38685 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
38686 && INTEGER_CLASS_P (rclass)
38687 && !offsettable_memref_p (x))
38688 {
38689 sri->icode = (in_p
38690 ? CODE_FOR_reload_noff_load
38691 : CODE_FOR_reload_noff_store);
38692 /* Add the cost of moving address to a temporary. */
38693 sri->extra_cost = 1;
38694
38695 return NO_REGS;
38696 }
38697
38698 /* QImode spills from non-QI registers require an
38699 intermediate register on 32-bit targets. */
38700 if (mode == QImode
38701 && ((!TARGET_64BIT && !in_p
38702 && INTEGER_CLASS_P (rclass)
38703 && MAYBE_NON_Q_CLASS_P (rclass))
38704 || (!TARGET_AVX512DQ
38705 && MAYBE_MASK_CLASS_P (rclass))))
38706 {
38707 int regno = true_regnum (x);
38708
38709 /* Return Q_REGS if the operand is in memory. */
38710 if (regno == -1)
38711 return Q_REGS;
38712
38713 return NO_REGS;
38714 }
38715
38716 /* This condition handles corner case where an expression involving
38717 pointers gets vectorized. We're trying to use the address of a
38718 stack slot as a vector initializer.
38719
38720 (set (reg:V2DI 74 [ vect_cst_.2 ])
38721 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
38722
38723 Eventually frame gets turned into sp+offset like this:
38724
38725 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38726 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38727 (const_int 392 [0x188]))))
38728
38729 That later gets turned into:
38730
38731 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38732 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
38733 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
38734
38735 We'll have the following reload recorded:
38736
38737 Reload 0: reload_in (DI) =
38738 (plus:DI (reg/f:DI 7 sp)
38739 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
38740 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38741 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
38742 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
38743 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
38744 reload_reg_rtx: (reg:V2DI 22 xmm1)
38745
38746 Which isn't going to work since SSE instructions can't handle scalar
38747 additions. Returning GENERAL_REGS forces the addition into integer
38748 register and reload can handle subsequent reloads without problems. */
38749
38750 if (in_p && GET_CODE (x) == PLUS
38751 && SSE_CLASS_P (rclass)
38752 && SCALAR_INT_MODE_P (mode))
38753 return GENERAL_REGS;
38754
38755 return NO_REGS;
38756 }
38757
38758 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
38759
38760 static bool
38761 ix86_class_likely_spilled_p (reg_class_t rclass)
38762 {
38763 switch (rclass)
38764 {
38765 case AREG:
38766 case DREG:
38767 case CREG:
38768 case BREG:
38769 case AD_REGS:
38770 case SIREG:
38771 case DIREG:
38772 case SSE_FIRST_REG:
38773 case FP_TOP_REG:
38774 case FP_SECOND_REG:
38775 case BND_REGS:
38776 return true;
38777
38778 default:
38779 break;
38780 }
38781
38782 return false;
38783 }
38784
38785 /* If we are copying between registers from different register sets
38786 (e.g. FP and integer), we may need a memory location.
38787
38788 The function can't work reliably when one of the CLASSES is a class
38789 containing registers from multiple sets. We avoid this by never combining
38790 different sets in a single alternative in the machine description.
38791 Ensure that this constraint holds to avoid unexpected surprises.
38792
38793 When STRICT is false, we are being called from REGISTER_MOVE_COST,
38794 so do not enforce these sanity checks.
38795
38796 To optimize register_move_cost performance, define inline variant. */
38797
38798 static inline bool
38799 inline_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38800 reg_class_t class2, int strict)
38801 {
38802 if (lra_in_progress && (class1 == NO_REGS || class2 == NO_REGS))
38803 return false;
38804
38805 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
38806 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
38807 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
38808 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
38809 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
38810 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2)
38811 || MAYBE_MASK_CLASS_P (class1) != MASK_CLASS_P (class1)
38812 || MAYBE_MASK_CLASS_P (class2) != MASK_CLASS_P (class2))
38813 {
38814 gcc_assert (!strict || lra_in_progress);
38815 return true;
38816 }
38817
38818 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
38819 return true;
38820
38821 /* Between mask and general, we have moves no larger than word size. */
38822 if ((MASK_CLASS_P (class1) != MASK_CLASS_P (class2))
38823 && (GET_MODE_SIZE (mode) > UNITS_PER_WORD))
38824 return true;
38825
38826 /* ??? This is a lie. We do have moves between mmx/general, and for
38827 mmx/sse2. But by saying we need secondary memory we discourage the
38828 register allocator from using the mmx registers unless needed. */
38829 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
38830 return true;
38831
38832 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
38833 {
38834 /* SSE1 doesn't have any direct moves from other classes. */
38835 if (!TARGET_SSE2)
38836 return true;
38837
38838 /* If the target says that inter-unit moves are more expensive
38839 than moving through memory, then don't generate them. */
38840 if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
38841 || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
38842 return true;
38843
38844 /* Between SSE and general, we have moves no larger than word size. */
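      /* E.g. a TImode value moving between SSE and general registers on a
         64-bit target has to go through memory. */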
38845 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
38846 return true;
38847 }
38848
38849 return false;
38850 }
38851
38852 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
38853
38854 static bool
38855 ix86_secondary_memory_needed (machine_mode mode, reg_class_t class1,
38856 reg_class_t class2)
38857 {
38858 return inline_secondary_memory_needed (mode, class1, class2, true);
38859 }
38860
38861 /* Implement TARGET_SECONDARY_MEMORY_NEEDED_MODE.
38862
38863 get_secondary_mem widens integral modes to BITS_PER_WORD.
38864 There is no need to emit full 64 bit move on 64 bit targets
38865 for integral modes that can be moved using 32 bit move. */
38866
38867 static machine_mode
38868 ix86_secondary_memory_needed_mode (machine_mode mode)
38869 {
38870 if (GET_MODE_BITSIZE (mode) < 32 && INTEGRAL_MODE_P (mode))
38871 return mode_for_size (32, GET_MODE_CLASS (mode), 0).require ();
38872 return mode;
38873 }
38874
38875 /* Implement the TARGET_CLASS_MAX_NREGS hook.
38876
38877 On the 80386, this is the size of MODE in words,
38878 except in the FP regs, where a single reg is always enough. */
38879
38880 static unsigned char
38881 ix86_class_max_nregs (reg_class_t rclass, machine_mode mode)
38882 {
38883 if (MAYBE_INTEGER_CLASS_P (rclass))
38884 {
38885 if (mode == XFmode)
38886 return (TARGET_64BIT ? 2 : 3);
38887 else if (mode == XCmode)
38888 return (TARGET_64BIT ? 4 : 6);
38889 else
38890 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
38891 }
38892 else
38893 {
38894 if (COMPLEX_MODE_P (mode))
38895 return 2;
38896 else
38897 return 1;
38898 }
38899 }
38900
38901 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
38902
38903 static bool
38904 ix86_can_change_mode_class (machine_mode from, machine_mode to,
38905 reg_class_t regclass)
38906 {
38907 if (from == to)
38908 return true;
38909
38910 /* x87 registers can't do subreg at all, as all values are reformatted
38911 to extended precision. */
38912 if (MAYBE_FLOAT_CLASS_P (regclass))
38913 return false;
38914
38915 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
38916 {
38917 /* Vector registers do not support QI or HImode loads. If we don't
38918 disallow a change to these modes, reload will assume it's ok to
38919 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
38920 the vec_dupv4hi pattern. */
38921 if (GET_MODE_SIZE (from) < 4)
38922 return false;
38923 }
38924
38925 return true;
38926 }
38927
38928 /* Return index of MODE in the sse load/store tables. */
38929
38930 static inline int
38931 sse_store_index (machine_mode mode)
38932 {
38933 switch (GET_MODE_SIZE (mode))
38934 {
38935 case 4:
38936 return 0;
38937 case 8:
38938 return 1;
38939 case 16:
38940 return 2;
38941 case 32:
38942 return 3;
38943 case 64:
38944 return 4;
38945 default:
38946 return -1;
38947 }
38948 }
38949
38950 /* Return the cost of moving data of mode M between a
38951 register and memory. A value of 2 is the default; this cost is
38952 relative to those in `REGISTER_MOVE_COST'.
38953
38954 This function is used extensively by register_move_cost, which is used to
38955 build tables at startup, so keep it inline.
38956 When IN is 2, return the maximum of the in and out move cost.
38957
38958 If moving between registers and memory is more expensive than
38959 between two registers, you should define this macro to express the
38960 relative cost.
38961
38962 Also model the increased cost of moving QImode registers in
38963 non-Q_REGS classes.
38964 */
38965 static inline int
38966 inline_memory_move_cost (machine_mode mode, enum reg_class regclass,
38967 int in)
38968 {
38969 int cost;
38970 if (FLOAT_CLASS_P (regclass))
38971 {
38972 int index;
38973 switch (mode)
38974 {
38975 case E_SFmode:
38976 index = 0;
38977 break;
38978 case E_DFmode:
38979 index = 1;
38980 break;
38981 case E_XFmode:
38982 index = 2;
38983 break;
38984 default:
38985 return 100;
38986 }
38987 if (in == 2)
38988 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
38989 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
38990 }
38991 if (SSE_CLASS_P (regclass))
38992 {
38993 int index = sse_store_index (mode);
38994 if (index == -1)
38995 return 100;
38996 if (in == 2)
38997 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
38998 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
38999 }
39000 if (MMX_CLASS_P (regclass))
39001 {
39002 int index;
39003 switch (GET_MODE_SIZE (mode))
39004 {
39005 case 4:
39006 index = 0;
39007 break;
39008 case 8:
39009 index = 1;
39010 break;
39011 default:
39012 return 100;
39013 }
39014 if (in == 2)
39015 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
39016 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
39017 }
39018 switch (GET_MODE_SIZE (mode))
39019 {
39020 case 1:
39021 if (Q_CLASS_P (regclass) || TARGET_64BIT)
39022 {
39023 if (!in)
39024 return ix86_cost->int_store[0];
39025 if (TARGET_PARTIAL_REG_DEPENDENCY
39026 && optimize_function_for_speed_p (cfun))
39027 cost = ix86_cost->movzbl_load;
39028 else
39029 cost = ix86_cost->int_load[0];
39030 if (in == 2)
39031 return MAX (cost, ix86_cost->int_store[0]);
39032 return cost;
39033 }
39034 else
39035 {
39036 if (in == 2)
39037 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
39038 if (in)
39039 return ix86_cost->movzbl_load;
39040 else
39041 return ix86_cost->int_store[0] + 4;
39042 }
39043 break;
39044 case 2:
39045 if (in == 2)
39046 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
39047 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
39048 default:
39049 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
39050 if (mode == TFmode)
39051 mode = XFmode;
39052 if (in == 2)
39053 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
39054 else if (in)
39055 cost = ix86_cost->int_load[2];
39056 else
39057 cost = ix86_cost->int_store[2];
39058 return cost * CEIL ((int) GET_MODE_SIZE (mode), UNITS_PER_WORD);
39059 }
39060 }
39061
39062 static int
39063 ix86_memory_move_cost (machine_mode mode, reg_class_t regclass,
39064 bool in)
39065 {
39066 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
39067 }
39068
39069
39070 /* Return the cost of moving data from a register in class CLASS1 to
39071 one in class CLASS2.
39072
39073 It is not required that the cost always equal 2 when FROM is the same as TO;
39074 on some machines it is expensive to move between registers if they are not
39075 general registers. */
39076
39077 static int
39078 ix86_register_move_cost (machine_mode mode, reg_class_t class1_i,
39079 reg_class_t class2_i)
39080 {
39081 enum reg_class class1 = (enum reg_class) class1_i;
39082 enum reg_class class2 = (enum reg_class) class2_i;
39083
39084 /* In case we require secondary memory, compute cost of the store followed
39085 by load. In order to avoid bad register allocation choices, we need
39086 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
39087
39088 if (inline_secondary_memory_needed (mode, class1, class2, false))
39089 {
39090 int cost = 1;
39091
39092 cost += inline_memory_move_cost (mode, class1, 2);
39093 cost += inline_memory_move_cost (mode, class2, 2);
39094
39095 /* When copying from a general purpose register we may emit multiple
39096 stores followed by a single load, causing a memory size mismatch stall.
39097 Count this as an arbitrarily high cost of 20. */
39098 if (GET_MODE_BITSIZE (mode) > BITS_PER_WORD
39099 && TARGET_MEMORY_MISMATCH_STALL
39100 && targetm.class_max_nregs (class1, mode)
39101 > targetm.class_max_nregs (class2, mode))
39102 cost += 20;
39103
39104 /* In the case of FP/MMX moves, the registers actually overlap, and we
39105 have to switch modes in order to treat them differently. */
39106 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
39107 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
39108 cost += 20;
39109
39110 return cost;
39111 }
39112
39113 /* Moves between SSE/MMX and integer unit are expensive. */
39114 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
39115 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
39116
39117 /* ??? By keeping returned value relatively high, we limit the number
39118 of moves between integer and MMX/SSE registers for all targets.
39119 Additionally, a high value prevents problems with x86_modes_tieable_p(),
39120 where integer modes in MMX/SSE registers are not tieable
39121 because of missing QImode and HImode moves to, from or between
39122 MMX/SSE registers. */
39123 return MAX (8, MMX_CLASS_P (class1) || MMX_CLASS_P (class2)
39124 ? ix86_cost->mmxsse_to_integer : ix86_cost->ssemmx_to_integer);
39125
39126 if (MAYBE_FLOAT_CLASS_P (class1))
39127 return ix86_cost->fp_move;
39128 if (MAYBE_SSE_CLASS_P (class1))
39129 {
39130 if (GET_MODE_BITSIZE (mode) <= 128)
39131 return ix86_cost->xmm_move;
39132 if (GET_MODE_BITSIZE (mode) <= 256)
39133 return ix86_cost->ymm_move;
39134 return ix86_cost->zmm_move;
39135 }
39136 if (MAYBE_MMX_CLASS_P (class1))
39137 return ix86_cost->mmx_move;
39138 return 2;
39139 }
39140
39141 /* Implement TARGET_HARD_REGNO_NREGS. This is ordinarily the length in
39142 words of a value of mode MODE but can be less for certain modes in
39143 special long registers.
39144
39145 Actually there are no two word move instructions for consecutive
39146 registers. And only registers 0-3 may have mov byte instructions
39147 applied to them. */
39148
39149 static unsigned int
39150 ix86_hard_regno_nregs (unsigned int regno, machine_mode mode)
39151 {
39152 if (GENERAL_REGNO_P (regno))
39153 {
39154 if (mode == XFmode)
39155 return TARGET_64BIT ? 2 : 3;
39156 if (mode == XCmode)
39157 return TARGET_64BIT ? 4 : 6;
39158 return CEIL (GET_MODE_SIZE (mode), UNITS_PER_WORD);
39159 }
39160 if (COMPLEX_MODE_P (mode))
39161 return 2;
39162 if (mode == V64SFmode || mode == V64SImode)
39163 return 4;
39164 return 1;
39165 }
39166
39167 /* Implement TARGET_HARD_REGNO_MODE_OK. */
39168
39169 static bool
39170 ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
39171 {
39172 /* Flags and only flags can only hold CCmode values. */
39173 if (CC_REGNO_P (regno))
39174 return GET_MODE_CLASS (mode) == MODE_CC;
39175 if (GET_MODE_CLASS (mode) == MODE_CC
39176 || GET_MODE_CLASS (mode) == MODE_RANDOM
39177 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
39178 return false;
39179 if (STACK_REGNO_P (regno))
39180 return VALID_FP_MODE_P (mode);
39181 if (MASK_REGNO_P (regno))
39182 return (VALID_MASK_REG_MODE (mode)
39183 || (TARGET_AVX512BW
39184 && VALID_MASK_AVX512BW_MODE (mode)));
39185 if (BND_REGNO_P (regno))
39186 return VALID_BND_REG_MODE (mode);
39187 if (SSE_REGNO_P (regno))
39188 {
39189 /* We implement the move patterns for all vector modes into and
39190 out of SSE registers, even when no operation instructions
39191 are available. */
39192
39193 /* For AVX-512 we allow, regardless of regno:
39194 - XI mode
39195 - any of 512-bit wide vector mode
39196 - any scalar mode. */
39197 if (TARGET_AVX512F
39198 && (mode == XImode
39199 || VALID_AVX512F_REG_MODE (mode)
39200 || VALID_AVX512F_SCALAR_MODE (mode)))
39201 return true;
39202
39203 /* For AVX-5124FMAPS allow V64SFmode for special regnos. */
39204 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39205 && MOD4_SSE_REGNO_P (regno)
39206 && mode == V64SFmode)
39207 return true;
39208
39209 /* For AVX-5124VNNIW allow V64SImode for special regnos. */
39210 if ((TARGET_AVX5124FMAPS || TARGET_AVX5124VNNIW)
39211 && MOD4_SSE_REGNO_P (regno)
39212 && mode == V64SImode)
39213 return true;
39214
39215 /* TODO check for QI/HI scalars. */
39216 /* AVX512VL allows sse regs16+ for 128/256 bit modes. */
39217 if (TARGET_AVX512VL
39218 && (mode == OImode
39219 || mode == TImode
39220 || VALID_AVX256_REG_MODE (mode)
39221 || VALID_AVX512VL_128_REG_MODE (mode)))
39222 return true;
39223
39224 /* xmm16-xmm31 are only available for AVX-512. */
39225 if (EXT_REX_SSE_REGNO_P (regno))
39226 return false;
39227
39228 /* OImode and AVX modes are available only when AVX is enabled. */
39229 return ((TARGET_AVX
39230 && VALID_AVX256_REG_OR_OI_MODE (mode))
39231 || VALID_SSE_REG_MODE (mode)
39232 || VALID_SSE2_REG_MODE (mode)
39233 || VALID_MMX_REG_MODE (mode)
39234 || VALID_MMX_REG_MODE_3DNOW (mode));
39235 }
39236 if (MMX_REGNO_P (regno))
39237 {
39238 /* We implement the move patterns for 3DNOW modes even in MMX mode,
39239 so if the register is available at all, then we can move data of
39240 the given mode into or out of it. */
39241 return (VALID_MMX_REG_MODE (mode)
39242 || VALID_MMX_REG_MODE_3DNOW (mode));
39243 }
39244
39245 if (mode == QImode)
39246 {
39247 /* Take care for QImode values - they can be in non-QI regs,
39248 but then they do cause partial register stalls. */
39249 if (ANY_QI_REGNO_P (regno))
39250 return true;
39251 if (!TARGET_PARTIAL_REG_STALL)
39252 return true;
39253 /* LRA checks if the hard register is OK for the given mode.
39254 QImode values can live in non-QI regs, so we allow all
39255 registers here. */
39256 if (lra_in_progress)
39257 return true;
39258 return !can_create_pseudo_p ();
39259 }
39260 /* We handle both integer and floats in the general purpose registers. */
39261 else if (VALID_INT_MODE_P (mode))
39262 return true;
39263 else if (VALID_FP_MODE_P (mode))
39264 return true;
39265 else if (VALID_DFP_MODE_P (mode))
39266 return true;
39267 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
39268 on to use that value in smaller contexts, this can easily force a
39269 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
39270 supporting DImode, allow it. */
39271 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
39272 return true;
39273
39274 return false;
39275 }
39276
39277 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that
39278 saves SSE registers across calls is Win64 (thus no need to check the
39279 current ABI here), and with AVX enabled Win64 only guarantees that
39280 the low 16 bytes are saved. */
39281
39282 static bool
39283 ix86_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
39284 {
39285 return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
39286 }
39287
39288 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
39289 tieable integer mode. */
39290
39291 static bool
39292 ix86_tieable_integer_mode_p (machine_mode mode)
39293 {
39294 switch (mode)
39295 {
39296 case E_HImode:
39297 case E_SImode:
39298 return true;
39299
39300 case E_QImode:
39301 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
39302
39303 case E_DImode:
39304 return TARGET_64BIT;
39305
39306 default:
39307 return false;
39308 }
39309 }
39310
39311 /* Implement TARGET_MODES_TIEABLE_P.
39312
39313 Return true if MODE1 is accessible in a register that can hold MODE2
39314 without copying. That is, all register classes that can hold MODE2
39315 can also hold MODE1. */
39316
39317 static bool
39318 ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
39319 {
39320 if (mode1 == mode2)
39321 return true;
39322
39323 if (ix86_tieable_integer_mode_p (mode1)
39324 && ix86_tieable_integer_mode_p (mode2))
39325 return true;
39326
39327 /* MODE2 being XFmode implies fp stack or general regs, which means we
39328 can tie any smaller floating point modes to it. Note that we do not
39329 tie this with TFmode. */
39330 if (mode2 == XFmode)
39331 return mode1 == SFmode || mode1 == DFmode;
39332
39333 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
39334 that we can tie it with SFmode. */
39335 if (mode2 == DFmode)
39336 return mode1 == SFmode;
39337
39338 /* If MODE2 is only appropriate for an SSE register, then tie with
39339 any other mode acceptable to SSE registers. */
39340 if (GET_MODE_SIZE (mode2) == 32
39341 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39342 return (GET_MODE_SIZE (mode1) == 32
39343 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39344 if (GET_MODE_SIZE (mode2) == 16
39345 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
39346 return (GET_MODE_SIZE (mode1) == 16
39347 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
39348
39349 /* If MODE2 is appropriate for an MMX register, then tie
39350 with any other mode acceptable to MMX registers. */
39351 if (GET_MODE_SIZE (mode2) == 8
39352 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
39353 return (GET_MODE_SIZE (mode1) == 8
39354 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
39355
39356 return false;
39357 }
39358
39359 /* Return the cost of moving between two registers of mode MODE. */
39360
39361 static int
39362 ix86_set_reg_reg_cost (machine_mode mode)
39363 {
39364 unsigned int units = UNITS_PER_WORD;
39365
39366 switch (GET_MODE_CLASS (mode))
39367 {
39368 default:
39369 break;
39370
39371 case MODE_CC:
39372 units = GET_MODE_SIZE (CCmode);
39373 break;
39374
39375 case MODE_FLOAT:
39376 if ((TARGET_SSE && mode == TFmode)
39377 || (TARGET_80387 && mode == XFmode)
39378 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
39379 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
39380 units = GET_MODE_SIZE (mode);
39381 break;
39382
39383 case MODE_COMPLEX_FLOAT:
39384 if ((TARGET_SSE && mode == TCmode)
39385 || (TARGET_80387 && mode == XCmode)
39386 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
39387 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
39388 units = GET_MODE_SIZE (mode);
39389 break;
39390
39391 case MODE_VECTOR_INT:
39392 case MODE_VECTOR_FLOAT:
39393 if ((TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
39394 || (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
39395 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
39396 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
39397 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
39398 units = GET_MODE_SIZE (mode);
39399 }
39400
39401 /* Return the cost of moving between two registers of mode MODE,
39402 assuming that the move will be in pieces of at most UNITS bytes. */
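  /* For illustration: a DImode move on a 32-bit target leaves UNITS at
     UNITS_PER_WORD (4), so the cost below is COSTS_N_INSNS (2). */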
39403 return COSTS_N_INSNS (CEIL (GET_MODE_SIZE (mode), units));
39404 }
39405
39406 /* Return cost of vector operation in MODE given that scalar version has
39407 COST. If PARALLEL is true assume that CPU has more than one unit
39408 performing the operation. */
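/* For illustration: with PARALLEL false the scalar COST is multiplied by the
   number of vector elements, while with PARALLEL true a 256-bit operation on
   a TARGET_AVX128_OPTIMAL target costs twice the scalar COST. */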
39409
39410 static int
39411 ix86_vec_cost (machine_mode mode, int cost, bool parallel)
39412 {
39413 if (!VECTOR_MODE_P (mode))
39414 return cost;
39415
39416 if (!parallel)
39417 return cost * GET_MODE_NUNITS (mode);
39418 if (GET_MODE_BITSIZE (mode) == 128
39419 && TARGET_SSE_SPLIT_REGS)
39420 return cost * 2;
39421 if (GET_MODE_BITSIZE (mode) > 128
39422 && TARGET_AVX128_OPTIMAL)
39423 return cost * GET_MODE_BITSIZE (mode) / 128;
39424 return cost;
39425 }
39426
39427 /* Return cost of multiplication in MODE. */
39428
39429 static int
39430 ix86_multiplication_cost (const struct processor_costs *cost,
39431 enum machine_mode mode)
39432 {
39433 machine_mode inner_mode = mode;
39434 if (VECTOR_MODE_P (mode))
39435 inner_mode = GET_MODE_INNER (mode);
39436
39437 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39438 return inner_mode == DFmode ? cost->mulsd : cost->mulss;
39439 else if (X87_FLOAT_MODE_P (mode))
39440 return cost->fmul;
39441 else if (FLOAT_MODE_P (mode))
39442 return ix86_vec_cost (mode,
39443 inner_mode == DFmode
39444 ? cost->mulsd : cost->mulss, true);
39445 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39446 {
39447 /* V*QImode is emulated with 7-13 insns. */
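      /* That range follows from the code below: two multiplies plus an
         EXTRA of 5 (XOP, V16QImode only), 6 (SSSE3) or 11 (otherwise)
         SSE ops. */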
39448 if (mode == V16QImode || mode == V32QImode)
39449 {
39450 int extra = 11;
39451 if (TARGET_XOP && mode == V16QImode)
39452 extra = 5;
39453 else if (TARGET_SSSE3)
39454 extra = 6;
39455 return ix86_vec_cost (mode,
39456 cost->mulss * 2 + cost->sse_op * extra,
39457 true);
39458 }
39459 /* V*DImode is emulated with 5-8 insns. */
39460 else if (mode == V2DImode || mode == V4DImode)
39461 {
39462 if (TARGET_XOP && mode == V2DImode)
39463 return ix86_vec_cost (mode,
39464 cost->mulss * 2 + cost->sse_op * 3,
39465 true);
39466 else
39467 return ix86_vec_cost (mode,
39468 cost->mulss * 3 + cost->sse_op * 5,
39469 true);
39470 }
39471 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
39472 insns, including two PMULUDQ. */
39473 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
39474 return ix86_vec_cost (mode, cost->mulss * 2 + cost->sse_op * 5,
39475 true);
39476 else
39477 return ix86_vec_cost (mode, cost->mulss, true);
39478 }
39479 else
39480 return (cost->mult_init[MODE_INDEX (mode)] + cost->mult_bit * 7);
39481 }
39482
39483 /* Return cost of division in MODE. */
39484
39485 static int
39486 ix86_division_cost (const struct processor_costs *cost,
39487 enum machine_mode mode)
39488 {
39489 machine_mode inner_mode = mode;
39490 if (VECTOR_MODE_P (mode))
39491 inner_mode = GET_MODE_INNER (mode);
39492
39493 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39494 return inner_mode == DFmode ? cost->divsd : cost->divss;
39495 else if (X87_FLOAT_MODE_P (mode))
39496 return cost->fdiv;
39497 else if (FLOAT_MODE_P (mode))
39498 return ix86_vec_cost (mode,
39499 inner_mode == DFmode ? cost->divsd : cost->divss,
39500 true);
39501 else
39502 return cost->divide[MODE_INDEX (mode)];
39503 }
39504
39505 /* Return cost of shift in MODE.
39506 If CONSTANT_OP1 is true, the op1 value is known and set in OP1_VAL.
39507 AND_IN_OP1 specifies whether op1 is the result of an AND, and
39508 SHIFT_AND_TRUNCATE whether op1 is the result of a subreg.
39509
39510 SKIP_OP0/1 is set to true if cost of OP0/1 should be ignored. */
39511
39512 static int
39513 ix86_shift_rotate_cost (const struct processor_costs *cost,
39514 enum machine_mode mode, bool constant_op1,
39515 HOST_WIDE_INT op1_val,
39516 bool speed,
39517 bool and_in_op1,
39518 bool shift_and_truncate,
39519 bool *skip_op0, bool *skip_op1)
39520 {
39521 if (skip_op0)
39522 *skip_op0 = *skip_op1 = false;
39523 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39524 {
39525 /* V*QImode is emulated with 1-11 insns. */
39526 if (mode == V16QImode || mode == V32QImode)
39527 {
39528 int count = 11;
39529 if (TARGET_XOP && mode == V16QImode)
39530 {
39531 /* For XOP we use vpshab, which requires a broadcast of the
39532 value to the variable shift insn. For constants this
39533 means a V16Q const in mem; even when we can perform the
39534 shift with one insn set the cost to prefer paddb. */
39535 if (constant_op1)
39536 {
39537 if (skip_op1)
39538 *skip_op1 = true;
39539 return ix86_vec_cost (mode,
39540 cost->sse_op
39541 + (speed
39542 ? 2
39543 : COSTS_N_BYTES
39544 (GET_MODE_UNIT_SIZE (mode))), true);
39545 }
39546 count = 3;
39547 }
39548 else if (TARGET_SSSE3)
39549 count = 7;
39550 return ix86_vec_cost (mode, cost->sse_op * count, true);
39551 }
39552 else
39553 return ix86_vec_cost (mode, cost->sse_op, true);
39554 }
39555 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39556 {
39557 if (constant_op1)
39558 {
39559 if (op1_val > 32)
39560 return cost->shift_const + COSTS_N_INSNS (2);
39561 else
39562 return cost->shift_const * 2;
39563 }
39564 else
39565 {
39566 if (and_in_op1)
39567 return cost->shift_var * 2;
39568 else
39569 return cost->shift_var * 6 + COSTS_N_INSNS (2);
39570 }
39571 }
39572 else
39573 {
39574 if (constant_op1)
39575 return cost->shift_const;
39576 else if (shift_and_truncate)
39577 {
39578 if (skip_op0)
39579 *skip_op0 = *skip_op1 = true;
39580 /* Return the cost after shift-and truncation. */
39581 return cost->shift_var;
39582 }
39583 else
39584 return cost->shift_var;
39585 }
39586 return cost->shift_const;
39587 }
39588
39589 /* Compute a (partial) cost for rtx X. Return true if the complete
39590 cost has been computed, and false if subexpressions should be
39591 scanned. In either case, *TOTAL contains the cost result. */
39592
39593 static bool
39594 ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
39595 int *total, bool speed)
39596 {
39597 rtx mask;
39598 enum rtx_code code = GET_CODE (x);
39599 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
39600 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
39601 int src_cost;
39602
39603 switch (code)
39604 {
39605 case SET:
39606 if (register_operand (SET_DEST (x), VOIDmode)
39607 && reg_or_0_operand (SET_SRC (x), VOIDmode))
39608 {
39609 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
39610 return true;
39611 }
39612
39613 if (register_operand (SET_SRC (x), VOIDmode))
39614 /* Avoid potentially incorrect high cost from rtx_costs
39615 for non-tieable SUBREGs. */
39616 src_cost = 0;
39617 else
39618 {
39619 src_cost = rtx_cost (SET_SRC (x), mode, SET, 1, speed);
39620
39621 if (CONSTANT_P (SET_SRC (x)))
39622 /* Constant costs assume a base value of COSTS_N_INSNS (1) and add
39623 a small value, possibly zero for cheap constants. */
39624 src_cost += COSTS_N_INSNS (1);
39625 }
39626
39627 *total = src_cost + rtx_cost (SET_DEST (x), mode, SET, 0, speed);
39628 return true;
39629
39630 case CONST_INT:
39631 case CONST:
39632 case LABEL_REF:
39633 case SYMBOL_REF:
39634 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
39635 *total = 3;
39636 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
39637 *total = 2;
39638 else if (flag_pic && SYMBOLIC_CONST (x)
39639 && !(TARGET_64BIT
39640 && (GET_CODE (x) == LABEL_REF
39641 || (GET_CODE (x) == SYMBOL_REF
39642 && SYMBOL_REF_LOCAL_P (x))))
39643 /* Use 0 cost for CONST to improve its propagation. */
39644 && (TARGET_64BIT || GET_CODE (x) != CONST))
39645 *total = 1;
39646 else
39647 *total = 0;
39648 return true;
39649
39650 case CONST_DOUBLE:
39651 if (IS_STACK_MODE (mode))
39652 switch (standard_80387_constant_p (x))
39653 {
39654 case -1:
39655 case 0:
39656 break;
39657 case 1: /* 0.0 */
39658 *total = 1;
39659 return true;
39660 default: /* Other constants */
39661 *total = 2;
39662 return true;
39663 }
39664 /* FALLTHRU */
39665
39666 case CONST_VECTOR:
39667 switch (standard_sse_constant_p (x, mode))
39668 {
39669 case 0:
39670 break;
39671 case 1: /* 0: xor eliminates false dependency */
39672 *total = 0;
39673 return true;
39674 default: /* -1: cmp contains false dependency */
39675 *total = 1;
39676 return true;
39677 }
39678 /* FALLTHRU */
39679
39680 case CONST_WIDE_INT:
39681 /* Fall back to (MEM (SYMBOL_REF)), since that's where
39682 it'll probably end up. Add a penalty for size. */
39683 *total = (COSTS_N_INSNS (1)
39684 + (!TARGET_64BIT && flag_pic)
39685 + (GET_MODE_SIZE (mode) <= 4
39686 ? 0 : GET_MODE_SIZE (mode) <= 8 ? 1 : 2));
39687 return true;
39688
39689 case ZERO_EXTEND:
39690 /* The zero extension is often completely free on x86_64, so make
39691 it as cheap as possible. */
39692 if (TARGET_64BIT && mode == DImode
39693 && GET_MODE (XEXP (x, 0)) == SImode)
39694 *total = 1;
39695 else if (TARGET_ZERO_EXTEND_WITH_AND)
39696 *total = cost->add;
39697 else
39698 *total = cost->movzx;
39699 return false;
39700
39701 case SIGN_EXTEND:
39702 *total = cost->movsx;
39703 return false;
39704
39705 case ASHIFT:
39706 if (SCALAR_INT_MODE_P (mode)
39707 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
39708 && CONST_INT_P (XEXP (x, 1)))
39709 {
39710 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39711 if (value == 1)
39712 {
39713 *total = cost->add;
39714 return false;
39715 }
39716 if ((value == 2 || value == 3)
39717 && cost->lea <= cost->shift_const)
39718 {
39719 *total = cost->lea;
39720 return false;
39721 }
39722 }
39723 /* FALLTHRU */
39724
39725 case ROTATE:
39726 case ASHIFTRT:
39727 case LSHIFTRT:
39728 case ROTATERT:
39729 bool skip_op0, skip_op1;
39730 *total = ix86_shift_rotate_cost (cost, mode, CONSTANT_P (XEXP (x, 1)),
39731 CONST_INT_P (XEXP (x, 1))
39732 ? INTVAL (XEXP (x, 1)) : -1,
39733 speed,
39734 GET_CODE (XEXP (x, 1)) == AND,
39735 SUBREG_P (XEXP (x, 1))
39736 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND,
39737 &skip_op0, &skip_op1);
39738 if (skip_op0 || skip_op1)
39739 {
39740 if (!skip_op0)
39741 *total += rtx_cost (XEXP (x, 0), mode, code, 0, speed);
39742 if (!skip_op1)
39743 *total += rtx_cost (XEXP (x, 1), mode, code, 0, speed);
39744 return true;
39745 }
39746 return false;
39747
39748 case FMA:
39749 {
39750 rtx sub;
39751
39752 gcc_assert (FLOAT_MODE_P (mode));
39753 gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
39754
39755 *total = ix86_vec_cost (mode,
39756 mode == SFmode ? cost->fmass : cost->fmasd,
39757 true);
39758 *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
39759
39760 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
39761 sub = XEXP (x, 0);
39762 if (GET_CODE (sub) == NEG)
39763 sub = XEXP (sub, 0);
39764 *total += rtx_cost (sub, mode, FMA, 0, speed);
39765
39766 sub = XEXP (x, 2);
39767 if (GET_CODE (sub) == NEG)
39768 sub = XEXP (sub, 0);
39769 *total += rtx_cost (sub, mode, FMA, 2, speed);
39770 return true;
39771 }
39772
39773 case MULT:
39774 if (!FLOAT_MODE_P (mode) && !VECTOR_MODE_P (mode))
39775 {
39776 rtx op0 = XEXP (x, 0);
39777 rtx op1 = XEXP (x, 1);
39778 int nbits;
39779 if (CONST_INT_P (XEXP (x, 1)))
39780 {
39781 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
39782 for (nbits = 0; value != 0; value &= value - 1)
39783 nbits++;
39784 }
39785 else
39786 /* This is arbitrary. */
39787 nbits = 7;
39788
39789 /* Compute costs correctly for widening multiplication. */
39790 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
39791 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
39792 == GET_MODE_SIZE (mode))
39793 {
39794 int is_mulwiden = 0;
39795 machine_mode inner_mode = GET_MODE (op0);
39796
39797 if (GET_CODE (op0) == GET_CODE (op1))
39798 is_mulwiden = 1, op1 = XEXP (op1, 0);
39799 else if (CONST_INT_P (op1))
39800 {
39801 if (GET_CODE (op0) == SIGN_EXTEND)
39802 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
39803 == INTVAL (op1);
39804 else
39805 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
39806 }
39807
39808 if (is_mulwiden)
39809 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
39810 }
39811
39812 *total = (cost->mult_init[MODE_INDEX (mode)]
39813 + nbits * cost->mult_bit
39814 + rtx_cost (op0, mode, outer_code, opno, speed)
39815 + rtx_cost (op1, mode, outer_code, opno, speed));
39816
39817 return true;
39818 }
39819 *total = ix86_multiplication_cost (cost, mode);
39820 return false;
39821
39822 case DIV:
39823 case UDIV:
39824 case MOD:
39825 case UMOD:
39826 *total = ix86_division_cost (cost, mode);
39827 return false;
39828
39829 case PLUS:
39830 if (GET_MODE_CLASS (mode) == MODE_INT
39831 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
39832 {
39833 if (GET_CODE (XEXP (x, 0)) == PLUS
39834 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
39835 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
39836 && CONSTANT_P (XEXP (x, 1)))
39837 {
39838 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
39839 if (val == 2 || val == 4 || val == 8)
39840 {
39841 *total = cost->lea;
39842 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39843 outer_code, opno, speed);
39844 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0), mode,
39845 outer_code, opno, speed);
39846 *total += rtx_cost (XEXP (x, 1), mode,
39847 outer_code, opno, speed);
39848 return true;
39849 }
39850 }
39851 else if (GET_CODE (XEXP (x, 0)) == MULT
39852 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
39853 {
39854 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
39855 if (val == 2 || val == 4 || val == 8)
39856 {
39857 *total = cost->lea;
39858 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39859 outer_code, opno, speed);
39860 *total += rtx_cost (XEXP (x, 1), mode,
39861 outer_code, opno, speed);
39862 return true;
39863 }
39864 }
39865 else if (GET_CODE (XEXP (x, 0)) == PLUS)
39866 {
39867 /* Add with carry, ignore the cost of adding a carry flag. */
39868 if (ix86_carry_flag_operator (XEXP (XEXP (x, 0), 0), mode))
39869 *total = cost->add;
39870 else
39871 {
39872 *total = cost->lea;
39873 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39874 outer_code, opno, speed);
39875 }
39876
39877 *total += rtx_cost (XEXP (XEXP (x, 0), 1), mode,
39878 outer_code, opno, speed);
39879 *total += rtx_cost (XEXP (x, 1), mode,
39880 outer_code, opno, speed);
39881 return true;
39882 }
39883 }
39884 /* FALLTHRU */
39885
39886 case MINUS:
39887 /* Subtract with borrow, ignore the cost of subtracting a carry flag. */
39888 if (GET_MODE_CLASS (mode) == MODE_INT
39889 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD
39890 && GET_CODE (XEXP (x, 0)) == MINUS
39891 && ix86_carry_flag_operator (XEXP (XEXP (x, 0), 1), mode))
39892 {
39893 *total = cost->add;
39894 *total += rtx_cost (XEXP (XEXP (x, 0), 0), mode,
39895 outer_code, opno, speed);
39896 *total += rtx_cost (XEXP (x, 1), mode,
39897 outer_code, opno, speed);
39898 return true;
39899 }
39900
39901 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39902 {
39903 *total = cost->addss;
39904 return false;
39905 }
39906 else if (X87_FLOAT_MODE_P (mode))
39907 {
39908 *total = cost->fadd;
39909 return false;
39910 }
39911 else if (FLOAT_MODE_P (mode))
39912 {
39913 *total = ix86_vec_cost (mode, cost->addss, true);
39914 return false;
39915 }
39916 /* FALLTHRU */
39917
39918 case AND:
39919 case IOR:
39920 case XOR:
39921 if (GET_MODE_CLASS (mode) == MODE_INT
39922 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39923 {
39924 *total = (cost->add * 2
39925 + (rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed)
39926 << (GET_MODE (XEXP (x, 0)) != DImode))
39927 + (rtx_cost (XEXP (x, 1), mode, outer_code, opno, speed)
39928 << (GET_MODE (XEXP (x, 1)) != DImode)));
39929 return true;
39930 }
39931 /* FALLTHRU */
39932
39933 case NEG:
39934 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
39935 {
39936 *total = cost->sse_op;
39937 return false;
39938 }
39939 else if (X87_FLOAT_MODE_P (mode))
39940 {
39941 *total = cost->fchs;
39942 return false;
39943 }
39944 else if (FLOAT_MODE_P (mode))
39945 {
39946 *total = ix86_vec_cost (mode, cost->sse_op, true);
39947 return false;
39948 }
39949 /* FALLTHRU */
39950
39951 case NOT:
39952 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
39953 *total = ix86_vec_cost (mode, cost->sse_op, true);
39954 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
39955 *total = cost->add * 2;
39956 else
39957 *total = cost->add;
39958 return false;
39959
39960 case COMPARE:
39961 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
39962 && XEXP (XEXP (x, 0), 1) == const1_rtx
39963 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
39964 && XEXP (x, 1) == const0_rtx)
39965 {
39966 /* This kind of construct is implemented using test[bwl].
39967 Treat it as if we had an AND. */
39968 mode = GET_MODE (XEXP (XEXP (x, 0), 0));
39969 *total = (cost->add
39970 + rtx_cost (XEXP (XEXP (x, 0), 0), mode, outer_code,
39971 opno, speed)
39972 + rtx_cost (const1_rtx, mode, outer_code, opno, speed));
39973 return true;
39974 }
39975
39976 /* The embedded comparison operand is completely free. */
39977 if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
39978 && XEXP (x, 1) == const0_rtx)
39979 *total = 0;
39980
39981 return false;
39982
39983 case FLOAT_EXTEND:
39984 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
39985 *total = 0;
39986 else
39987 *total = ix86_vec_cost (mode, cost->addss, true);
39988 return false;
39989
39990 case FLOAT_TRUNCATE:
39991 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
39992 *total = cost->fadd;
39993 else
39994 *total = ix86_vec_cost (mode, cost->addss, true);
39995 return false;
39996
39997 case ABS:
39998 /* SSE requires memory load for the constant operand. It may make
39999 sense to account for this. Of course the constant operand may or
40000 may not be reused. */
40001 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40002 *total = cost->sse_op;
40003 else if (X87_FLOAT_MODE_P (mode))
40004 *total = cost->fabs;
40005 else if (FLOAT_MODE_P (mode))
40006 *total = ix86_vec_cost (mode, cost->sse_op, true);
40007 return false;
40008
40009 case SQRT:
40010 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
40011 *total = mode == SFmode ? cost->sqrtss : cost->sqrtsd;
40012 else if (X87_FLOAT_MODE_P (mode))
40013 *total = cost->fsqrt;
40014 else if (FLOAT_MODE_P (mode))
40015 *total = ix86_vec_cost (mode,
40016 mode == SFmode ? cost->sqrtss : cost->sqrtsd,
40017 true);
40018 return false;
40019
40020 case UNSPEC:
40021 if (XINT (x, 1) == UNSPEC_TP)
40022 *total = 0;
40023 return false;
40024
40025 case VEC_SELECT:
40026 case VEC_CONCAT:
40027 case VEC_DUPLICATE:
40028 /* ??? Assume all of these vector manipulation patterns are
40029 recognizable. In which case they all pretty much have the
40030 same cost. */
40031 *total = cost->sse_op;
40032 return true;
40033 case VEC_MERGE:
40034 mask = XEXP (x, 2);
40035 /* This is a masked instruction; assume the same cost
40036 as the nonmasked variant. */
40037 if (TARGET_AVX512F && register_operand (mask, GET_MODE (mask)))
40038 *total = rtx_cost (XEXP (x, 0), mode, outer_code, opno, speed);
40039 else
40040 *total = cost->sse_op;
40041 return true;
40042
40043 default:
40044 return false;
40045 }
40046 }
40047
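/* Illustrative example for the PLUS handling above (a sketch, not an
   exhaustive description): an address-like expression such as
     (plus (plus (mult (reg) (const_int 4)) (reg)) (const_int 8))
   is recognized as a single lea, so it is priced as cost->lea plus the
   costs of its register operands rather than as a multiply and two
   additions. */
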
40048 #if TARGET_MACHO
40049
40050 static int current_machopic_label_num;
40051
40052 /* Given a symbol name and its associated stub, write out the
40053 definition of the stub. */
40054
40055 void
40056 machopic_output_stub (FILE *file, const char *symb, const char *stub)
40057 {
40058 unsigned int length;
40059 char *binder_name, *symbol_name, lazy_ptr_name[32];
40060 int label = ++current_machopic_label_num;
40061
40062 /* For 64-bit we shouldn't get here. */
40063 gcc_assert (!TARGET_64BIT);
40064
40065 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
40066 symb = targetm.strip_name_encoding (symb);
40067
40068 length = strlen (stub);
40069 binder_name = XALLOCAVEC (char, length + 32);
40070 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
40071
40072 length = strlen (symb);
40073 symbol_name = XALLOCAVEC (char, length + 32);
40074 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
40075
40076 sprintf (lazy_ptr_name, "L%d$lz", label);
40077
40078 if (MACHOPIC_ATT_STUB)
40079 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
40080 else if (MACHOPIC_PURE)
40081 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
40082 else
40083 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
40084
40085 fprintf (file, "%s:\n", stub);
40086 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40087
40088 if (MACHOPIC_ATT_STUB)
40089 {
40090 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
40091 }
40092 else if (MACHOPIC_PURE)
40093 {
40094 /* PIC stub. */
40095 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40096 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
40097 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
40098 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
40099 label, lazy_ptr_name, label);
40100 fprintf (file, "\tjmp\t*%%ecx\n");
40101 }
40102 else
40103 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
40104
40105 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
40106 it needs no stub-binding-helper. */
40107 if (MACHOPIC_ATT_STUB)
40108 return;
40109
40110 fprintf (file, "%s:\n", binder_name);
40111
40112 if (MACHOPIC_PURE)
40113 {
40114 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
40115 fprintf (file, "\tpushl\t%%ecx\n");
40116 }
40117 else
40118 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
40119
40120 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
40121
40122 /* N.B. Keep the correspondence of these
40123 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
40124 old-pic/new-pic/non-pic stubs; altering this will break
40125 compatibility with existing dylibs. */
40126 if (MACHOPIC_PURE)
40127 {
40128 /* 25-byte PIC stub using "CALL get_pc_thunk". */
40129 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
40130 }
40131 else
40132 /* 16-byte -mdynamic-no-pic stub. */
40133 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
40134
40135 fprintf (file, "%s:\n", lazy_ptr_name);
40136 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
40137 fprintf (file, ASM_LONG "%s\n", binder_name);
40138 }
40139 #endif /* TARGET_MACHO */
40140
40141 /* Order the registers for the register allocator. */
40142
40143 void
40144 x86_order_regs_for_local_alloc (void)
40145 {
40146 int pos = 0;
40147 int i;
40148
40149 /* First allocate the local general purpose registers. */
40150 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40151 if (GENERAL_REGNO_P (i) && call_used_regs[i])
40152 reg_alloc_order [pos++] = i;
40153
40154 /* Global general purpose registers. */
40155 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
40156 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
40157 reg_alloc_order [pos++] = i;
40158
40159 /* x87 registers come first in case we are doing FP math
40160 using them. */
40161 if (!TARGET_SSE_MATH)
40162 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40163 reg_alloc_order [pos++] = i;
40164
40165 /* SSE registers. */
40166 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
40167 reg_alloc_order [pos++] = i;
40168 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
40169 reg_alloc_order [pos++] = i;
40170
40171 /* Extended REX SSE registers. */
40172 for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
40173 reg_alloc_order [pos++] = i;
40174
40175 /* Mask register. */
40176 for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
40177 reg_alloc_order [pos++] = i;
40178
40179 /* MPX bound registers. */
40180 for (i = FIRST_BND_REG; i <= LAST_BND_REG; i++)
40181 reg_alloc_order [pos++] = i;
40182
40183 /* x87 registers. */
40184 if (TARGET_SSE_MATH)
40185 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
40186 reg_alloc_order [pos++] = i;
40187
40188 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
40189 reg_alloc_order [pos++] = i;
40190
40191 /* Initialize the rest of the array, as we do not allocate some registers
40192 at all. */
40193 while (pos < FIRST_PSEUDO_REGISTER)
40194 reg_alloc_order [pos++] = 0;
40195 }
40196
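/* Rough summary of the resulting order (illustrative only): call-clobbered
   general registers first, then call-saved general registers, the x87 stack
   registers when x87 is used for FP math, then SSE, REX SSE and extended
   REX SSE registers, mask registers, MPX bound registers, the x87 registers
   otherwise (when SSE math is in use), MMX registers, and finally zero
   padding for slots that are never allocated. */
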
40197 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
40198 in struct attribute_spec.handler. */
40199 static tree
40200 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
40201 tree args,
40202 int,
40203 bool *no_add_attrs)
40204 {
40205 if (TREE_CODE (*node) != FUNCTION_TYPE
40206 && TREE_CODE (*node) != METHOD_TYPE
40207 && TREE_CODE (*node) != FIELD_DECL
40208 && TREE_CODE (*node) != TYPE_DECL)
40209 {
40210 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40211 name);
40212 *no_add_attrs = true;
40213 return NULL_TREE;
40214 }
40215 if (TARGET_64BIT)
40216 {
40217 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
40218 name);
40219 *no_add_attrs = true;
40220 return NULL_TREE;
40221 }
40222 if (is_attribute_p ("callee_pop_aggregate_return", name))
40223 {
40224 tree cst;
40225
40226 cst = TREE_VALUE (args);
40227 if (TREE_CODE (cst) != INTEGER_CST)
40228 {
40229 warning (OPT_Wattributes,
40230 "%qE attribute requires an integer constant argument",
40231 name);
40232 *no_add_attrs = true;
40233 }
40234 else if (compare_tree_int (cst, 0) != 0
40235 && compare_tree_int (cst, 1) != 0)
40236 {
40237 warning (OPT_Wattributes,
40238 "argument to %qE attribute is neither zero, nor one",
40239 name);
40240 *no_add_attrs = true;
40241 }
40242
40243 return NULL_TREE;
40244 }
40245
40246 return NULL_TREE;
40247 }
40248
40249 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
40250 struct attribute_spec.handler. */
40251 static tree
40252 ix86_handle_abi_attribute (tree *node, tree name, tree, int,
40253 bool *no_add_attrs)
40254 {
40255 if (TREE_CODE (*node) != FUNCTION_TYPE
40256 && TREE_CODE (*node) != METHOD_TYPE
40257 && TREE_CODE (*node) != FIELD_DECL
40258 && TREE_CODE (*node) != TYPE_DECL)
40259 {
40260 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40261 name);
40262 *no_add_attrs = true;
40263 return NULL_TREE;
40264 }
40265
40266 /* Can combine regparm with all attributes but fastcall. */
40267 if (is_attribute_p ("ms_abi", name))
40268 {
40269 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
40270 {
40271 error ("ms_abi and sysv_abi attributes are not compatible");
40272 }
40273
40274 return NULL_TREE;
40275 }
40276 else if (is_attribute_p ("sysv_abi", name))
40277 {
40278 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
40279 {
40280 error ("ms_abi and sysv_abi attributes are not compatible");
40281 }
40282
40283 return NULL_TREE;
40284 }
40285
40286 return NULL_TREE;
40287 }
40288
40289 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
40290 struct attribute_spec.handler. */
40291 static tree
40292 ix86_handle_struct_attribute (tree *node, tree name, tree, int,
40293 bool *no_add_attrs)
40294 {
40295 tree *type = NULL;
40296 if (DECL_P (*node))
40297 {
40298 if (TREE_CODE (*node) == TYPE_DECL)
40299 type = &TREE_TYPE (*node);
40300 }
40301 else
40302 type = node;
40303
40304 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
40305 {
40306 warning (OPT_Wattributes, "%qE attribute ignored",
40307 name);
40308 *no_add_attrs = true;
40309 }
40310
40311 else if ((is_attribute_p ("ms_struct", name)
40312 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
40313 || ((is_attribute_p ("gcc_struct", name)
40314 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
40315 {
40316 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
40317 name);
40318 *no_add_attrs = true;
40319 }
40320
40321 return NULL_TREE;
40322 }
40323
40324 static tree
40325 ix86_handle_fndecl_attribute (tree *node, tree name, tree, int,
40326 bool *no_add_attrs)
40327 {
40328 if (TREE_CODE (*node) != FUNCTION_DECL)
40329 {
40330 warning (OPT_Wattributes, "%qE attribute only applies to functions",
40331 name);
40332 *no_add_attrs = true;
40333 }
40334 return NULL_TREE;
40335 }
40336
40337 static tree
40338 ix86_handle_no_caller_saved_registers_attribute (tree *, tree, tree,
40339 int, bool *)
40340 {
40341 return NULL_TREE;
40342 }
40343
40344 static tree
40345 ix86_handle_interrupt_attribute (tree *node, tree, tree, int, bool *)
40346 {
40347 /* DECL_RESULT and DECL_ARGUMENTS do not exist here yet,
40348 but the function type contains args and return type data. */
40349 tree func_type = *node;
40350 tree return_type = TREE_TYPE (func_type);
40351
40352 int nargs = 0;
40353 tree current_arg_type = TYPE_ARG_TYPES (func_type);
40354 while (current_arg_type
40355 && ! VOID_TYPE_P (TREE_VALUE (current_arg_type)))
40356 {
40357 if (nargs == 0)
40358 {
40359 if (! POINTER_TYPE_P (TREE_VALUE (current_arg_type)))
40360 error ("interrupt service routine should have a pointer "
40361 "as the first argument");
40362 }
40363 else if (nargs == 1)
40364 {
40365 if (TREE_CODE (TREE_VALUE (current_arg_type)) != INTEGER_TYPE
40366 || TYPE_MODE (TREE_VALUE (current_arg_type)) != word_mode)
40367 error ("interrupt service routine should have unsigned %s"
40368 "int as the second argument",
40369 TARGET_64BIT
40370 ? (TARGET_X32 ? "long long " : "long ")
40371 : "");
40372 }
40373 nargs++;
40374 current_arg_type = TREE_CHAIN (current_arg_type);
40375 }
40376 if (!nargs || nargs > 2)
40377 error ("interrupt service routine can only have a pointer argument "
40378 "and an optional integer argument");
40379 if (! VOID_TYPE_P (return_type))
40380 error ("interrupt service routine can't have non-void return value");
40381
40382 return NULL_TREE;
40383 }
40384
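/* Illustrative sketch of what the checks above accept (hedged; see the GCC
   manual for the authoritative description of the "interrupt" attribute):
   a handler is expected to look roughly like
     void handler (void *frame);
   or, for exceptions that also push an error code,
     void handler (void *frame, uword_t error_code);
   where the error-code argument, if present, must be an integer of
   word_mode width. */
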
40385 static bool
40386 ix86_ms_bitfield_layout_p (const_tree record_type)
40387 {
40388 return ((TARGET_MS_BITFIELD_LAYOUT
40389 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
40390 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
40391 }
40392
40393 /* Returns an expression indicating where the this parameter is
40394 located on entry to the FUNCTION. */
40395
40396 static rtx
40397 x86_this_parameter (tree function)
40398 {
40399 tree type = TREE_TYPE (function);
40400 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
40401 int nregs;
40402
40403 if (TARGET_64BIT)
40404 {
40405 const int *parm_regs;
40406
40407 if (ix86_function_type_abi (type) == MS_ABI)
40408 parm_regs = x86_64_ms_abi_int_parameter_registers;
40409 else
40410 parm_regs = x86_64_int_parameter_registers;
40411 return gen_rtx_REG (Pmode, parm_regs[aggr]);
40412 }
40413
40414 nregs = ix86_function_regparm (type, function);
40415
40416 if (nregs > 0 && !stdarg_p (type))
40417 {
40418 int regno;
40419 unsigned int ccvt = ix86_get_callcvt (type);
40420
40421 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40422 regno = aggr ? DX_REG : CX_REG;
40423 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40424 {
40425 regno = CX_REG;
40426 if (aggr)
40427 return gen_rtx_MEM (SImode,
40428 plus_constant (Pmode, stack_pointer_rtx, 4));
40429 }
40430 else
40431 {
40432 regno = AX_REG;
40433 if (aggr)
40434 {
40435 regno = DX_REG;
40436 if (nregs == 1)
40437 return gen_rtx_MEM (SImode,
40438 plus_constant (Pmode,
40439 stack_pointer_rtx, 4));
40440 }
40441 }
40442 return gen_rtx_REG (SImode, regno);
40443 }
40444
40445 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
40446 aggr ? 8 : 4));
40447 }
40448
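/* For illustration (a sketch based on the table lookups above): under the
   64-bit SysV ABI `this' arrives in %rdi, or in %rsi when the function
   returns an aggregate in memory and the hidden return pointer takes the
   first register slot; under the 64-bit MS ABI the corresponding registers
   are %rcx and %rdx.  32-bit conventions fall back to the stack or to the
   regparm/fastcall/thiscall registers handled above. */
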
40449 /* Determine whether x86_output_mi_thunk can succeed. */
40450
40451 static bool
40452 x86_can_output_mi_thunk (const_tree, HOST_WIDE_INT, HOST_WIDE_INT vcall_offset,
40453 const_tree function)
40454 {
40455 /* 64-bit can handle anything. */
40456 if (TARGET_64BIT)
40457 return true;
40458
40459 /* For 32-bit, everything's fine if we have one free register. */
40460 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
40461 return true;
40462
40463 /* Need a free register for vcall_offset. */
40464 if (vcall_offset)
40465 return false;
40466
40467 /* Need a free register for GOT references. */
40468 if (flag_pic && !targetm.binds_local_p (function))
40469 return false;
40470
40471 /* Otherwise ok. */
40472 return true;
40473 }
40474
40475 /* Output the assembler code for a thunk function. THUNK_DECL is the
40476 declaration for the thunk function itself, FUNCTION is the decl for
40477 the target function. DELTA is an immediate constant offset to be
40478 added to THIS. If VCALL_OFFSET is nonzero, the word at
40479 *(*this + vcall_offset) should be added to THIS. */
40480
40481 static void
40482 x86_output_mi_thunk (FILE *file, tree, HOST_WIDE_INT delta,
40483 HOST_WIDE_INT vcall_offset, tree function)
40484 {
40485 rtx this_param = x86_this_parameter (function);
40486 rtx this_reg, tmp, fnaddr;
40487 unsigned int tmp_regno;
40488 rtx_insn *insn;
40489
40490 if (TARGET_64BIT)
40491 tmp_regno = R10_REG;
40492 else
40493 {
40494 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
40495 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
40496 tmp_regno = AX_REG;
40497 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
40498 tmp_regno = DX_REG;
40499 else
40500 tmp_regno = CX_REG;
40501 }
40502
40503 emit_note (NOTE_INSN_PROLOGUE_END);
40504
40505 /* CET is enabled; insert an ENDBR instruction. */
40506 if ((flag_cf_protection & CF_BRANCH) && TARGET_IBT)
40507 emit_insn (gen_nop_endbr ());
40508
40509 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
40510 pull it in now and let DELTA benefit. */
40511 if (REG_P (this_param))
40512 this_reg = this_param;
40513 else if (vcall_offset)
40514 {
40515 /* Put the this parameter into %eax. */
40516 this_reg = gen_rtx_REG (Pmode, AX_REG);
40517 emit_move_insn (this_reg, this_param);
40518 }
40519 else
40520 this_reg = NULL_RTX;
40521
40522 /* Adjust the this parameter by a fixed constant. */
40523 if (delta)
40524 {
40525 rtx delta_rtx = GEN_INT (delta);
40526 rtx delta_dst = this_reg ? this_reg : this_param;
40527
40528 if (TARGET_64BIT)
40529 {
40530 if (!x86_64_general_operand (delta_rtx, Pmode))
40531 {
40532 tmp = gen_rtx_REG (Pmode, tmp_regno);
40533 emit_move_insn (tmp, delta_rtx);
40534 delta_rtx = tmp;
40535 }
40536 }
40537
40538 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
40539 }
40540
40541 /* Adjust the this parameter by a value stored in the vtable. */
40542 if (vcall_offset)
40543 {
40544 rtx vcall_addr, vcall_mem, this_mem;
40545
40546 tmp = gen_rtx_REG (Pmode, tmp_regno);
40547
40548 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
40549 if (Pmode != ptr_mode)
40550 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
40551 emit_move_insn (tmp, this_mem);
40552
40553 /* Adjust the this parameter. */
40554 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
40555 if (TARGET_64BIT
40556 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
40557 {
40558 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
40559 emit_move_insn (tmp2, GEN_INT (vcall_offset));
40560 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
40561 }
40562
40563 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
40564 if (Pmode != ptr_mode)
40565 emit_insn (gen_addsi_1_zext (this_reg,
40566 gen_rtx_REG (ptr_mode,
40567 REGNO (this_reg)),
40568 vcall_mem));
40569 else
40570 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
40571 }
40572
40573 /* If necessary, drop THIS back to its stack slot. */
40574 if (this_reg && this_reg != this_param)
40575 emit_move_insn (this_param, this_reg);
40576
40577 fnaddr = XEXP (DECL_RTL (function), 0);
40578 if (TARGET_64BIT)
40579 {
40580 if (!flag_pic || targetm.binds_local_p (function)
40581 || TARGET_PECOFF)
40582 ;
40583 else
40584 {
40585 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
40586 tmp = gen_rtx_CONST (Pmode, tmp);
40587 fnaddr = gen_const_mem (Pmode, tmp);
40588 }
40589 }
40590 else
40591 {
40592 if (!flag_pic || targetm.binds_local_p (function))
40593 ;
40594 #if TARGET_MACHO
40595 else if (TARGET_MACHO)
40596 {
40597 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
40598 fnaddr = XEXP (fnaddr, 0);
40599 }
40600 #endif /* TARGET_MACHO */
40601 else
40602 {
40603 tmp = gen_rtx_REG (Pmode, CX_REG);
40604 output_set_got (tmp, NULL_RTX);
40605
40606 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
40607 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
40608 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
40609 fnaddr = gen_const_mem (Pmode, fnaddr);
40610 }
40611 }
40612
40613 /* Our sibling call patterns do not allow memories, because we have no
40614 predicate that can distinguish between frame and non-frame memory.
40615 For our purposes here, we can get away with (ab)using a jump pattern,
40616 because we're going to do no optimization. */
40617 if (MEM_P (fnaddr))
40618 {
40619 if (sibcall_insn_operand (fnaddr, word_mode))
40620 {
40621 fnaddr = XEXP (DECL_RTL (function), 0);
40622 tmp = gen_rtx_MEM (QImode, fnaddr);
40623 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40624 tmp = emit_call_insn (tmp);
40625 SIBLING_CALL_P (tmp) = 1;
40626 }
40627 else
40628 emit_jump_insn (gen_indirect_jump (fnaddr));
40629 }
40630 else
40631 {
40632 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
40633 {
40634 // CM_LARGE_PIC always uses pseudo PIC register which is
40635 // uninitialized. Since FUNCTION is local and calling it
40636 // doesn't go through PLT, we use scratch register %r11 as
40637 // PIC register and initialize it here.
40638 pic_offset_table_rtx = gen_rtx_REG (Pmode, R11_REG);
40639 ix86_init_large_pic_reg (tmp_regno);
40640 fnaddr = legitimize_pic_address (fnaddr,
40641 gen_rtx_REG (Pmode, tmp_regno));
40642 }
40643
40644 if (!sibcall_insn_operand (fnaddr, word_mode))
40645 {
40646 tmp = gen_rtx_REG (word_mode, tmp_regno);
40647 if (GET_MODE (fnaddr) != word_mode)
40648 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
40649 emit_move_insn (tmp, fnaddr);
40650 fnaddr = tmp;
40651 }
40652
40653 tmp = gen_rtx_MEM (QImode, fnaddr);
40654 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
40655 tmp = emit_call_insn (tmp);
40656 SIBLING_CALL_P (tmp) = 1;
40657 }
40658 emit_barrier ();
40659
40660 /* Emit just enough of rest_of_compilation to get the insns emitted.
40661 Note that use_thunk calls assemble_start_function et al. */
40662 insn = get_insns ();
40663 shorten_branches (insn);
40664 final_start_function (insn, file, 1);
40665 final (insn, file, 1);
40666 final_end_function ();
40667 }
40668
40669 static void
40670 x86_file_start (void)
40671 {
40672 default_file_start ();
40673 if (TARGET_16BIT)
40674 fputs ("\t.code16gcc\n", asm_out_file);
40675 #if TARGET_MACHO
40676 darwin_file_start ();
40677 #endif
40678 if (X86_FILE_START_VERSION_DIRECTIVE)
40679 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
40680 if (X86_FILE_START_FLTUSED)
40681 fputs ("\t.global\t__fltused\n", asm_out_file);
40682 if (ix86_asm_dialect == ASM_INTEL)
40683 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
40684 }
40685
40686 int
40687 x86_field_alignment (tree type, int computed)
40688 {
40689 machine_mode mode;
40690
40691 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
40692 return computed;
40693 if (TARGET_IAMCU)
40694 return iamcu_alignment (type, computed);
40695 mode = TYPE_MODE (strip_array_types (type));
40696 if (mode == DFmode || mode == DCmode
40697 || GET_MODE_CLASS (mode) == MODE_INT
40698 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
40699 return MIN (32, computed);
40700 return computed;
40701 }
40702
40703 /* Print call to TARGET to FILE. */
40704
40705 static void
40706 x86_print_call_or_nop (FILE *file, const char *target)
40707 {
40708 if (flag_nop_mcount)
40709 /* 5 byte nop: nopl 0(%[re]ax,%[re]ax,1) */
40710 fprintf (file, "1:" ASM_BYTE "0x0f, 0x1f, 0x44, 0x00, 0x00\n");
40711 else
40712 fprintf (file, "1:\tcall\t%s\n", target);
40713 }
40714
40715 /* Output assembler code to FILE to increment profiler label # LABELNO
40716 for profiling a function entry. */
40717 void
40718 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
40719 {
40720 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
40721 : MCOUNT_NAME);
40722 if (TARGET_64BIT)
40723 {
40724 #ifndef NO_PROFILE_COUNTERS
40725 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
40726 #endif
40727
40728 if (!TARGET_PECOFF && flag_pic)
40729 fprintf (file, "1:\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
40730 else
40731 x86_print_call_or_nop (file, mcount_name);
40732 }
40733 else if (flag_pic)
40734 {
40735 #ifndef NO_PROFILE_COUNTERS
40736 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
40737 LPREFIX, labelno);
40738 #endif
40739 fprintf (file, "1:\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
40740 }
40741 else
40742 {
40743 #ifndef NO_PROFILE_COUNTERS
40744 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
40745 LPREFIX, labelno);
40746 #endif
40747 x86_print_call_or_nop (file, mcount_name);
40748 }
40749
40750 if (flag_record_mcount)
40751 {
40752 fprintf (file, "\t.section __mcount_loc, \"a\",@progbits\n");
40753 fprintf (file, "\t.%s 1b\n", TARGET_64BIT ? "quad" : "long");
40754 fprintf (file, "\t.previous\n");
40755 }
40756 }
40757
40758 /* We don't have exact information about the insn sizes, but we may assume
40759 quite safely that we are informed about all 1 byte insns and memory
40760 address sizes. This is enough to eliminate unnecessary padding in
40761 99% of cases. */
40762
40763 int
40764 ix86_min_insn_size (rtx_insn *insn)
40765 {
40766 int l = 0, len;
40767
40768 if (!INSN_P (insn) || !active_insn_p (insn))
40769 return 0;
40770
40771 /* Discard alignments we've emitted and jump instructions. */
40772 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
40773 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
40774 return 0;
40775
40776 /* Important case - calls are always 5 bytes.
40777 It is common to have many calls in a row. */
40778 if (CALL_P (insn)
40779 && symbolic_reference_mentioned_p (PATTERN (insn))
40780 && !SIBLING_CALL_P (insn))
40781 return 5;
40782 len = get_attr_length (insn);
40783 if (len <= 1)
40784 return 1;
40785
40786 /* For normal instructions we rely on get_attr_length being exact,
40787 with a few exceptions. */
40788 if (!JUMP_P (insn))
40789 {
40790 enum attr_type type = get_attr_type (insn);
40791
40792 switch (type)
40793 {
40794 case TYPE_MULTI:
40795 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
40796 || asm_noperands (PATTERN (insn)) >= 0)
40797 return 0;
40798 break;
40799 case TYPE_OTHER:
40800 case TYPE_FCMP:
40801 break;
40802 default:
40803 /* Otherwise trust get_attr_length. */
40804 return len;
40805 }
40806
40807 l = get_attr_length_address (insn);
40808 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
40809 l = 4;
40810 }
40811 if (l)
40812 return 1+l;
40813 else
40814 return 2;
40815 }
40816
40817 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
40818
40819 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
40820 16-byte window. */
40821
40822 static void
40823 ix86_avoid_jump_mispredicts (void)
40824 {
40825 rtx_insn *insn, *start = get_insns ();
40826 int nbytes = 0, njumps = 0;
40827 bool isjump = false;
40828
40829 /* Look for all minimal intervals of instructions containing 4 jumps.
40830 The intervals are bounded by START and INSN. NBYTES is the total
40831 size of instructions in the interval including INSN and not including
40832 START. When the NBYTES is smaller than 16 bytes, it is possible
40833 that the end of START and INSN ends up in the same 16byte page.
40834
40835 The smallest offset in the page INSN can start is the case where START
40836 ends on the offset 0. Offset of INSN is then NBYTES - sizeof (INSN).
40837 We add p2align to 16byte window with maxskip 15 - NBYTES + sizeof (INSN).
40838
40839 Don't consider asm goto as jump, while it can contain a jump, it doesn't
40840 have to, control transfer to label(s) can be performed through other
40841 means, and also we estimate minimum length of all asm stmts as 0. */
40842 for (insn = start; insn; insn = NEXT_INSN (insn))
40843 {
40844 int min_size;
40845
40846 if (LABEL_P (insn))
40847 {
40848 int align = label_to_alignment (insn);
40849 int max_skip = label_to_max_skip (insn);
40850
40851 if (max_skip > 15)
40852 max_skip = 15;
40853 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
40854 already in the current 16 byte page, because otherwise
40855 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
40856 bytes to reach 16 byte boundary. */
40857 if (align <= 0
40858 || (align <= 3 && max_skip != (1 << align) - 1))
40859 max_skip = 0;
40860 if (dump_file)
40861 fprintf (dump_file, "Label %i with max_skip %i\n",
40862 INSN_UID (insn), max_skip);
40863 if (max_skip)
40864 {
40865 while (nbytes + max_skip >= 16)
40866 {
40867 start = NEXT_INSN (start);
40868 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40869 || CALL_P (start))
40870 njumps--, isjump = true;
40871 else
40872 isjump = false;
40873 nbytes -= ix86_min_insn_size (start);
40874 }
40875 }
40876 continue;
40877 }
40878
40879 min_size = ix86_min_insn_size (insn);
40880 nbytes += min_size;
40881 if (dump_file)
40882 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
40883 INSN_UID (insn), min_size);
40884 if ((JUMP_P (insn) && asm_noperands (PATTERN (insn)) < 0)
40885 || CALL_P (insn))
40886 njumps++;
40887 else
40888 continue;
40889
40890 while (njumps > 3)
40891 {
40892 start = NEXT_INSN (start);
40893 if ((JUMP_P (start) && asm_noperands (PATTERN (start)) < 0)
40894 || CALL_P (start))
40895 njumps--, isjump = true;
40896 else
40897 isjump = false;
40898 nbytes -= ix86_min_insn_size (start);
40899 }
40900 gcc_assert (njumps >= 0);
40901 if (dump_file)
40902 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
40903 INSN_UID (start), INSN_UID (insn), nbytes);
40904
40905 if (njumps == 3 && isjump && nbytes < 16)
40906 {
40907 int padsize = 15 - nbytes + ix86_min_insn_size (insn);
40908
40909 if (dump_file)
40910 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
40911 INSN_UID (insn), padsize);
40912 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
40913 }
40914 }
40915 }
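/* Worked example of the padding computation above (illustrative only): if
   the fourth jump would make nbytes 13 and the jump itself is 2 bytes long,
   padsize is 15 - 13 + 2 = 4, so the emitted pad lets the assembler skip up
   to 4 bytes and the four jumps can no longer share a single 16-byte fetch
   window. */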
40916 #endif
40917
40918 /* AMD Athlon works faster
40919 when RET is not the destination of a conditional jump or directly preceded
40920 by another jump instruction. We avoid the penalty by inserting a NOP just
40921 before the RET instruction in such cases. */
40922 static void
40923 ix86_pad_returns (void)
40924 {
40925 edge e;
40926 edge_iterator ei;
40927
40928 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
40929 {
40930 basic_block bb = e->src;
40931 rtx_insn *ret = BB_END (bb);
40932 rtx_insn *prev;
40933 bool replace = false;
40934
40935 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
40936 || optimize_bb_for_size_p (bb))
40937 continue;
40938 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
40939 if (active_insn_p (prev) || LABEL_P (prev))
40940 break;
40941 if (prev && LABEL_P (prev))
40942 {
40943 edge e;
40944 edge_iterator ei;
40945
40946 FOR_EACH_EDGE (e, ei, bb->preds)
40947 if (EDGE_FREQUENCY (e) && e->src->index >= 0
40948 && !(e->flags & EDGE_FALLTHRU))
40949 {
40950 replace = true;
40951 break;
40952 }
40953 }
40954 if (!replace)
40955 {
40956 prev = prev_active_insn (ret);
40957 if (prev
40958 && ((JUMP_P (prev) && any_condjump_p (prev))
40959 || CALL_P (prev)))
40960 replace = true;
40961 /* Empty functions get branch mispredict even when
40962 the jump destination is not visible to us. */
40963 if (!prev && !optimize_function_for_size_p (cfun))
40964 replace = true;
40965 }
40966 if (replace)
40967 {
40968 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
40969 delete_insn (ret);
40970 }
40971 }
40972 }
40973
40974 /* Count the minimum number of instructions in BB. Return 4 if the
40975 number of instructions >= 4. */
40976
40977 static int
40978 ix86_count_insn_bb (basic_block bb)
40979 {
40980 rtx_insn *insn;
40981 int insn_count = 0;
40982
40983 /* Count number of instructions in this block. Return 4 if the number
40984 of instructions >= 4. */
40985 FOR_BB_INSNS (bb, insn)
40986 {
40987 /* This only happens in exit blocks. */
40988 if (JUMP_P (insn)
40989 && ANY_RETURN_P (PATTERN (insn)))
40990 break;
40991
40992 if (NONDEBUG_INSN_P (insn)
40993 && GET_CODE (PATTERN (insn)) != USE
40994 && GET_CODE (PATTERN (insn)) != CLOBBER)
40995 {
40996 insn_count++;
40997 if (insn_count >= 4)
40998 return insn_count;
40999 }
41000 }
41001
41002 return insn_count;
41003 }
41004
41005
41006 /* Count the minimum number of instructions in code path in BB.
41007 Return 4 if the number of instructions >= 4. */
41008
41009 static int
41010 ix86_count_insn (basic_block bb)
41011 {
41012 edge e;
41013 edge_iterator ei;
41014 int min_prev_count;
41015
41016 /* Only bother counting instructions along paths with no
41017 more than 2 basic blocks between entry and exit. Given
41018 that BB has an edge to exit, determine if a predecessor
41019 of BB has an edge from entry. If so, compute the number
41020 of instructions in the predecessor block. If there
41021 happen to be multiple such blocks, compute the minimum. */
41022 min_prev_count = 4;
41023 FOR_EACH_EDGE (e, ei, bb->preds)
41024 {
41025 edge prev_e;
41026 edge_iterator prev_ei;
41027
41028 if (e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41029 {
41030 min_prev_count = 0;
41031 break;
41032 }
41033 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
41034 {
41035 if (prev_e->src == ENTRY_BLOCK_PTR_FOR_FN (cfun))
41036 {
41037 int count = ix86_count_insn_bb (e->src);
41038 if (count < min_prev_count)
41039 min_prev_count = count;
41040 break;
41041 }
41042 }
41043 }
41044
41045 if (min_prev_count < 4)
41046 min_prev_count += ix86_count_insn_bb (bb);
41047
41048 return min_prev_count;
41049 }
41050
41051 /* Pad short function to 4 instructions. */
41052
41053 static void
41054 ix86_pad_short_function (void)
41055 {
41056 edge e;
41057 edge_iterator ei;
41058
41059 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41060 {
41061 rtx_insn *ret = BB_END (e->src);
41062 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
41063 {
41064 int insn_count = ix86_count_insn (e->src);
41065
41066 /* Pad short function. */
41067 if (insn_count < 4)
41068 {
41069 rtx_insn *insn = ret;
41070
41071 /* Find epilogue. */
41072 while (insn
41073 && (!NOTE_P (insn)
41074 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
41075 insn = PREV_INSN (insn);
41076
41077 if (!insn)
41078 insn = ret;
41079
41080 /* Two NOPs count as one instruction. */
41081 insn_count = 2 * (4 - insn_count);
41082 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
41083 }
41084 }
41085 }
41086 }
41087
41088 /* Fix up a Windows system unwinder issue. If an EH region falls through into
41089 the epilogue, the Windows system unwinder will apply epilogue logic and
41090 produce incorrect offsets. This can be avoided by adding a nop between
41091 the last insn that can throw and the first insn of the epilogue. */
41092
41093 static void
41094 ix86_seh_fixup_eh_fallthru (void)
41095 {
41096 edge e;
41097 edge_iterator ei;
41098
41099 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
41100 {
41101 rtx_insn *insn, *next;
41102
41103 /* Find the beginning of the epilogue. */
41104 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
41105 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
41106 break;
41107 if (insn == NULL)
41108 continue;
41109
41110 /* We only care about preceding insns that can throw. */
41111 insn = prev_active_insn (insn);
41112 if (insn == NULL || !can_throw_internal (insn))
41113 continue;
41114
41115 /* Do not separate calls from their debug information. */
41116 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
41117 if (NOTE_P (next)
41118 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
41119 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
41120 insn = next;
41121 else
41122 break;
41123
41124 emit_insn_after (gen_nops (const1_rtx), insn);
41125 }
41126 }
41127
41128 /* Given a register number BASE, the lowest of a group of registers, update
41129 regsets IN and OUT with the registers that should be avoided in input
41130 and output operands respectively when trying to avoid generating a modr/m
41131 byte for -fmitigate-rop. */
41132
41133 static void
41134 set_rop_modrm_reg_bits (int base, HARD_REG_SET &in, HARD_REG_SET &out)
41135 {
41136 SET_HARD_REG_BIT (out, base);
41137 SET_HARD_REG_BIT (out, base + 1);
41138 SET_HARD_REG_BIT (in, base + 2);
41139 SET_HARD_REG_BIT (in, base + 3);
41140 }
41141
41142 /* Called if -fmitigate-rop is in effect. Try to rewrite instructions so
41143 that certain encodings of modr/m bytes do not occur. */
41144 static void
41145 ix86_mitigate_rop (void)
41146 {
41147 HARD_REG_SET input_risky;
41148 HARD_REG_SET output_risky;
41149 HARD_REG_SET inout_risky;
41150
41151 CLEAR_HARD_REG_SET (output_risky);
41152 CLEAR_HARD_REG_SET (input_risky);
41153 SET_HARD_REG_BIT (output_risky, AX_REG);
41154 SET_HARD_REG_BIT (output_risky, CX_REG);
41155 SET_HARD_REG_BIT (input_risky, BX_REG);
41156 SET_HARD_REG_BIT (input_risky, DX_REG);
41157 set_rop_modrm_reg_bits (FIRST_SSE_REG, input_risky, output_risky);
41158 set_rop_modrm_reg_bits (FIRST_REX_INT_REG, input_risky, output_risky);
41159 set_rop_modrm_reg_bits (FIRST_REX_SSE_REG, input_risky, output_risky);
41160 set_rop_modrm_reg_bits (FIRST_EXT_REX_SSE_REG, input_risky, output_risky);
41161 set_rop_modrm_reg_bits (FIRST_MASK_REG, input_risky, output_risky);
41162 set_rop_modrm_reg_bits (FIRST_BND_REG, input_risky, output_risky);
41163 COPY_HARD_REG_SET (inout_risky, input_risky);
41164 IOR_HARD_REG_SET (inout_risky, output_risky);
41165
41166 df_note_add_problem ();
41167 /* Fix up what stack-regs did. */
41168 df_insn_rescan_all ();
41169 df_analyze ();
41170
41171 regrename_init (true);
41172 regrename_analyze (NULL);
41173
41174 auto_vec<du_head_p> cands;
41175
41176 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
41177 {
41178 if (!NONDEBUG_INSN_P (insn))
41179 continue;
41180
41181 if (GET_CODE (PATTERN (insn)) == USE
41182 || GET_CODE (PATTERN (insn)) == CLOBBER)
41183 continue;
41184
41185 extract_insn (insn);
41186
41187 int opno0, opno1;
41188 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41189 recog_data.n_operands, &opno0,
41190 &opno1);
41191
41192 if (!ix86_rop_should_change_byte_p (modrm))
41193 continue;
41194
41195 insn_rr_info *info = &insn_rr[INSN_UID (insn)];
41196
41197 /* This happens when regrename has to fail a block. */
41198 if (!info->op_info)
41199 continue;
41200
41201 if (info->op_info[opno0].n_chains != 0)
41202 {
41203 gcc_assert (info->op_info[opno0].n_chains == 1);
41204 du_head_p op0c;
41205 op0c = regrename_chain_from_id (info->op_info[opno0].heads[0]->id);
41206 if (op0c->target_data_1 + op0c->target_data_2 == 0
41207 && !op0c->cannot_rename)
41208 cands.safe_push (op0c);
41209
41210 op0c->target_data_1++;
41211 }
41212 if (info->op_info[opno1].n_chains != 0)
41213 {
41214 gcc_assert (info->op_info[opno1].n_chains == 1);
41215 du_head_p op1c;
41216 op1c = regrename_chain_from_id (info->op_info[opno1].heads[0]->id);
41217 if (op1c->target_data_1 + op1c->target_data_2 == 0
41218 && !op1c->cannot_rename)
41219 cands.safe_push (op1c);
41220
41221 op1c->target_data_2++;
41222 }
41223 }
41224
41225 int i;
41226 du_head_p head;
41227 FOR_EACH_VEC_ELT (cands, i, head)
41228 {
41229 int old_reg, best_reg;
41230 HARD_REG_SET unavailable;
41231
41232 CLEAR_HARD_REG_SET (unavailable);
41233 if (head->target_data_1)
41234 IOR_HARD_REG_SET (unavailable, output_risky);
41235 if (head->target_data_2)
41236 IOR_HARD_REG_SET (unavailable, input_risky);
41237
41238 int n_uses;
41239 reg_class superclass = regrename_find_superclass (head, &n_uses,
41240 &unavailable);
41241 old_reg = head->regno;
41242 best_reg = find_rename_reg (head, superclass, &unavailable,
41243 old_reg, false);
41244 bool ok = regrename_do_replace (head, best_reg);
41245 gcc_assert (ok);
41246 if (dump_file)
41247 fprintf (dump_file, "Chain %d renamed as %s in %s\n", head->id,
41248 reg_names[best_reg], reg_class_names[superclass]);
41249
41250 }
41251
41252 regrename_finish ();
41253
41254 df_analyze ();
41255
41256 basic_block bb;
41257 regset_head live;
41258
41259 INIT_REG_SET (&live);
41260
41261 FOR_EACH_BB_FN (bb, cfun)
41262 {
41263 rtx_insn *insn;
41264
41265 COPY_REG_SET (&live, DF_LR_OUT (bb));
41266 df_simulate_initialize_backwards (bb, &live);
41267
41268 FOR_BB_INSNS_REVERSE (bb, insn)
41269 {
41270 if (!NONDEBUG_INSN_P (insn))
41271 continue;
41272
41273 df_simulate_one_insn_backwards (bb, insn, &live);
41274
41275 if (GET_CODE (PATTERN (insn)) == USE
41276 || GET_CODE (PATTERN (insn)) == CLOBBER)
41277 continue;
41278
41279 extract_insn (insn);
41280 constrain_operands_cached (insn, reload_completed);
41281 int opno0, opno1;
41282 int modrm = ix86_get_modrm_for_rop (insn, recog_data.operand,
41283 recog_data.n_operands, &opno0,
41284 &opno1);
41285 if (modrm < 0
41286 || !ix86_rop_should_change_byte_p (modrm)
41287 || opno0 == opno1)
41288 continue;
41289
41290 rtx oldreg = recog_data.operand[opno1];
41291 preprocess_constraints (insn);
41292 const operand_alternative *alt = which_op_alt ();
41293
41294 int i;
41295 for (i = 0; i < recog_data.n_operands; i++)
41296 if (i != opno1
41297 && alt[i].earlyclobber
41298 && reg_overlap_mentioned_p (recog_data.operand[i],
41299 oldreg))
41300 break;
41301
41302 if (i < recog_data.n_operands)
41303 continue;
41304
41305 if (dump_file)
41306 fprintf (dump_file,
41307 "attempting to fix modrm byte in insn %d:"
41308 " reg %d class %s", INSN_UID (insn), REGNO (oldreg),
41309 reg_class_names[alt[opno1].cl]);
41310
41311 HARD_REG_SET unavailable;
41312 REG_SET_TO_HARD_REG_SET (unavailable, &live);
41313 SET_HARD_REG_BIT (unavailable, REGNO (oldreg));
41314 IOR_COMPL_HARD_REG_SET (unavailable, call_used_reg_set);
41315 IOR_HARD_REG_SET (unavailable, fixed_reg_set);
41316 IOR_HARD_REG_SET (unavailable, output_risky);
41317 IOR_COMPL_HARD_REG_SET (unavailable,
41318 reg_class_contents[alt[opno1].cl]);
41319
41320 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
41321 if (!TEST_HARD_REG_BIT (unavailable, i))
41322 break;
41323 if (i == FIRST_PSEUDO_REGISTER)
41324 {
41325 if (dump_file)
41326 fprintf (dump_file, ", none available\n");
41327 continue;
41328 }
41329 if (dump_file)
41330 fprintf (dump_file, " -> %d\n", i);
41331 rtx newreg = gen_rtx_REG (recog_data.operand_mode[opno1], i);
41332 validate_change (insn, recog_data.operand_loc[opno1], newreg, false);
41333 insn = emit_insn_before (gen_move_insn (newreg, oldreg), insn);
41334 }
41335 }
41336 }
41337
41338 /* Implement machine specific optimizations. We implement padding of returns
41339 for K8 CPUs and a pass to avoid 4 jumps in a single 16-byte window. */
41340 static void
41341 ix86_reorg (void)
41342 {
41343 /* We are freeing block_for_insn in the toplev to keep compatibility
41344 with old MDEP_REORGS that are not CFG based. Recompute it now. */
41345 compute_bb_for_insn ();
41346
41347 if (flag_mitigate_rop)
41348 ix86_mitigate_rop ();
41349
41350 if (TARGET_SEH && current_function_has_exception_handlers ())
41351 ix86_seh_fixup_eh_fallthru ();
41352
41353 if (optimize && optimize_function_for_speed_p (cfun))
41354 {
41355 if (TARGET_PAD_SHORT_FUNCTION)
41356 ix86_pad_short_function ();
41357 else if (TARGET_PAD_RETURNS)
41358 ix86_pad_returns ();
41359 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
41360 if (TARGET_FOUR_JUMP_LIMIT)
41361 ix86_avoid_jump_mispredicts ();
41362 #endif
41363 }
41364 }
41365
41366 /* Return nonzero when a QImode register that must be represented via a REX
41367 prefix is used. */
41368 bool
41369 x86_extended_QIreg_mentioned_p (rtx_insn *insn)
41370 {
41371 int i;
41372 extract_insn_cached (insn);
41373 for (i = 0; i < recog_data.n_operands; i++)
41374 if (GENERAL_REG_P (recog_data.operand[i])
41375 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
41376 return true;
41377 return false;
41378 }
41379
41380 /* Return true when INSN mentions register that must be encoded using REX
41381 prefix. */
41382 bool
41383 x86_extended_reg_mentioned_p (rtx insn)
41384 {
41385 subrtx_iterator::array_type array;
41386 FOR_EACH_SUBRTX (iter, array, INSN_P (insn) ? PATTERN (insn) : insn, NONCONST)
41387 {
41388 const_rtx x = *iter;
41389 if (REG_P (x)
41390 && (REX_INT_REGNO_P (REGNO (x)) || REX_SSE_REGNO_P (REGNO (x))))
41391 return true;
41392 }
41393 return false;
41394 }
41395
41396 /* If profitable, negate (without causing overflow) integer constant
41397 of mode MODE at location LOC. Return true in this case. */
41398 bool
41399 x86_maybe_negate_const_int (rtx *loc, machine_mode mode)
41400 {
41401 HOST_WIDE_INT val;
41402
41403 if (!CONST_INT_P (*loc))
41404 return false;
41405
41406 switch (mode)
41407 {
41408 case E_DImode:
41409 /* DImode x86_64 constants must fit in 32 bits. */
41410 gcc_assert (x86_64_immediate_operand (*loc, mode));
41411
41412 mode = SImode;
41413 break;
41414
41415 case E_SImode:
41416 case E_HImode:
41417 case E_QImode:
41418 break;
41419
41420 default:
41421 gcc_unreachable ();
41422 }
41423
41424 /* Avoid overflows. */
41425 if (mode_signbit_p (mode, *loc))
41426 return false;
41427
41428 val = INTVAL (*loc);
41429
41430 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
41431 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
41432 if ((val < 0 && val != -128)
41433 || val == 128)
41434 {
41435 *loc = GEN_INT (-val);
41436 return true;
41437 }
41438
41439 return false;
41440 }
41441
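/* For illustration: with *loc == -4 the function above rewrites the
   constant to 4 and returns true, so the caller can emit `subl $4'
   instead of `addl $-4'; 128 is likewise negated to -128, because -128
   still fits in a sign-extended 8-bit immediate while 128 does not. */
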
41442 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
41443 optabs would emit if we didn't have TFmode patterns. */
41444
41445 void
41446 x86_emit_floatuns (rtx operands[2])
41447 {
41448 rtx_code_label *neglab, *donelab;
41449 rtx i0, i1, f0, in, out;
41450 machine_mode mode, inmode;
41451
41452 inmode = GET_MODE (operands[1]);
41453 gcc_assert (inmode == SImode || inmode == DImode);
41454
41455 out = operands[0];
41456 in = force_reg (inmode, operands[1]);
41457 mode = GET_MODE (out);
41458 neglab = gen_label_rtx ();
41459 donelab = gen_label_rtx ();
41460 f0 = gen_reg_rtx (mode);
41461
41462 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
41463
41464 expand_float (out, in, 0);
41465
41466 emit_jump_insn (gen_jump (donelab));
41467 emit_barrier ();
41468
41469 emit_label (neglab);
41470
41471 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
41472 1, OPTAB_DIRECT);
41473 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
41474 1, OPTAB_DIRECT);
41475 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
41476
41477 expand_float (f0, i0, 0);
41478
41479 emit_insn (gen_rtx_SET (out, gen_rtx_PLUS (mode, f0, f0)));
41480
41481 emit_label (donelab);
41482 }
41483 \f
41484 static bool canonicalize_perm (struct expand_vec_perm_d *d);
41485 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
41486 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
41487 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
41488
41489 /* Get a vector mode of the same size as the original but with elements
41490 twice as wide. This is only guaranteed to apply to integral vectors. */
41491
41492 static inline machine_mode
41493 get_mode_wider_vector (machine_mode o)
41494 {
41495 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
41496 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
41497 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
41498 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
41499 return n;
41500 }
41501
41502 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
41503 fill target with val via vec_duplicate. */
41504
41505 static bool
41506 ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
41507 {
41508 bool ok;
41509 rtx_insn *insn;
41510 rtx dup;
41511
41512 /* First attempt to recognize VAL as-is. */
41513 dup = gen_vec_duplicate (mode, val);
41514 insn = emit_insn (gen_rtx_SET (target, dup));
41515 if (recog_memoized (insn) < 0)
41516 {
41517 rtx_insn *seq;
41518 machine_mode innermode = GET_MODE_INNER (mode);
41519 rtx reg;
41520
41521 /* If that fails, force VAL into a register. */
41522
41523 start_sequence ();
41524 reg = force_reg (innermode, val);
41525 if (GET_MODE (reg) != innermode)
41526 reg = gen_lowpart (innermode, reg);
41527 SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
41528 seq = get_insns ();
41529 end_sequence ();
41530 if (seq)
41531 emit_insn_before (seq, insn);
41532
41533 ok = recog_memoized (insn) >= 0;
41534 gcc_assert (ok);
41535 }
41536 return true;
41537 }
41538
41539 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41540 with all elements equal to VAR. Return true if successful. */
41541
41542 static bool
41543 ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
41544 rtx target, rtx val)
41545 {
41546 bool ok;
41547
41548 switch (mode)
41549 {
41550 case E_V2SImode:
41551 case E_V2SFmode:
41552 if (!mmx_ok)
41553 return false;
41554 /* FALLTHRU */
41555
41556 case E_V4DFmode:
41557 case E_V4DImode:
41558 case E_V8SFmode:
41559 case E_V8SImode:
41560 case E_V2DFmode:
41561 case E_V2DImode:
41562 case E_V4SFmode:
41563 case E_V4SImode:
41564 case E_V16SImode:
41565 case E_V8DImode:
41566 case E_V16SFmode:
41567 case E_V8DFmode:
41568 return ix86_vector_duplicate_value (mode, target, val);
41569
41570 case E_V4HImode:
41571 if (!mmx_ok)
41572 return false;
41573 if (TARGET_SSE || TARGET_3DNOW_A)
41574 {
41575 rtx x;
41576
41577 val = gen_lowpart (SImode, val);
41578 x = gen_rtx_TRUNCATE (HImode, val);
41579 x = gen_rtx_VEC_DUPLICATE (mode, x);
41580 emit_insn (gen_rtx_SET (target, x));
41581 return true;
41582 }
41583 goto widen;
41584
41585 case E_V8QImode:
41586 if (!mmx_ok)
41587 return false;
41588 goto widen;
41589
41590 case E_V8HImode:
41591 if (TARGET_AVX2)
41592 return ix86_vector_duplicate_value (mode, target, val);
41593
41594 if (TARGET_SSE2)
41595 {
41596 struct expand_vec_perm_d dperm;
41597 rtx tmp1, tmp2;
41598
41599 permute:
41600 memset (&dperm, 0, sizeof (dperm));
41601 dperm.target = target;
41602 dperm.vmode = mode;
41603 dperm.nelt = GET_MODE_NUNITS (mode);
41604 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
41605 dperm.one_operand_p = true;
41606
41607 /* Extend to SImode using a paradoxical SUBREG. */
41608 tmp1 = gen_reg_rtx (SImode);
41609 emit_move_insn (tmp1, gen_lowpart (SImode, val));
41610
41611 /* Insert the SImode value as low element of a V4SImode vector. */
41612 tmp2 = gen_reg_rtx (V4SImode);
41613 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
41614 emit_move_insn (dperm.op0, gen_lowpart (mode, tmp2));
41615
41616 ok = (expand_vec_perm_1 (&dperm)
41617 || expand_vec_perm_broadcast_1 (&dperm));
41618 gcc_assert (ok);
41619 return ok;
41620 }
41621 goto widen;
41622
41623 case E_V16QImode:
41624 if (TARGET_AVX2)
41625 return ix86_vector_duplicate_value (mode, target, val);
41626
41627 if (TARGET_SSE2)
41628 goto permute;
41629 goto widen;
41630
41631 widen:
41632 /* Replicate the value once into the next wider mode and recurse. */
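      /* For instance, a QImode value 0xab becomes the HImode value 0xabab,
	 the duplicate is built in the twice-as-wide vector mode (e.g.
	 V8HImode when MODE is V16QImode), and the result is viewed back in
	 MODE via a lowpart move.  */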
41633 {
41634 machine_mode smode, wsmode, wvmode;
41635 rtx x;
41636
41637 smode = GET_MODE_INNER (mode);
41638 wvmode = get_mode_wider_vector (mode);
41639 wsmode = GET_MODE_INNER (wvmode);
41640
41641 val = convert_modes (wsmode, smode, val, true);
41642 x = expand_simple_binop (wsmode, ASHIFT, val,
41643 GEN_INT (GET_MODE_BITSIZE (smode)),
41644 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41645 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
41646
41647 x = gen_reg_rtx (wvmode);
41648 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
41649 gcc_assert (ok);
41650 emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
41651 return ok;
41652 }
41653
41654 case E_V16HImode:
41655 case E_V32QImode:
41656 if (TARGET_AVX2)
41657 return ix86_vector_duplicate_value (mode, target, val);
41658 else
41659 {
41660 machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
41661 rtx x = gen_reg_rtx (hvmode);
41662
41663 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41664 gcc_assert (ok);
41665
41666 x = gen_rtx_VEC_CONCAT (mode, x, x);
41667 emit_insn (gen_rtx_SET (target, x));
41668 }
41669 return true;
41670
41671 case E_V64QImode:
41672 case E_V32HImode:
41673 if (TARGET_AVX512BW)
41674 return ix86_vector_duplicate_value (mode, target, val);
41675 else
41676 {
41677 machine_mode hvmode = (mode == V32HImode ? V16HImode : V32QImode);
41678 rtx x = gen_reg_rtx (hvmode);
41679
41680 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
41681 gcc_assert (ok);
41682
41683 x = gen_rtx_VEC_CONCAT (mode, x, x);
41684 emit_insn (gen_rtx_SET (target, x));
41685 }
41686 return true;
41687
41688 default:
41689 return false;
41690 }
41691 }
41692
41693 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41694 whose ONE_VAR element is VAR, and other elements are zero. Return true
41695 if successful. */
41696
41697 static bool
41698 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
41699 rtx target, rtx var, int one_var)
41700 {
41701 machine_mode vsimode;
41702 rtx new_target;
41703 rtx x, tmp;
41704 bool use_vector_set = false;
41705
41706 switch (mode)
41707 {
41708 case E_V2DImode:
41709 /* For SSE4.1, we normally use vector set. But if the second
41710 element is zero and inter-unit moves are OK, we use movq
41711 instead. */
41712 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
41713 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
41714 && one_var == 0));
41715 break;
41716 case E_V16QImode:
41717 case E_V4SImode:
41718 case E_V4SFmode:
41719 use_vector_set = TARGET_SSE4_1;
41720 break;
41721 case E_V8HImode:
41722 use_vector_set = TARGET_SSE2;
41723 break;
41724 case E_V4HImode:
41725 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
41726 break;
41727 case E_V32QImode:
41728 case E_V16HImode:
41729 case E_V8SImode:
41730 case E_V8SFmode:
41731 case E_V4DFmode:
41732 use_vector_set = TARGET_AVX;
41733 break;
41734 case E_V4DImode:
41735 /* Use ix86_expand_vector_set in 64bit mode only. */
41736 use_vector_set = TARGET_AVX && TARGET_64BIT;
41737 break;
41738 default:
41739 break;
41740 }
41741
41742 if (use_vector_set)
41743 {
41744 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
41745 var = force_reg (GET_MODE_INNER (mode), var);
41746 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41747 return true;
41748 }
41749
41750 switch (mode)
41751 {
41752 case E_V2SFmode:
41753 case E_V2SImode:
41754 if (!mmx_ok)
41755 return false;
41756 /* FALLTHRU */
41757
41758 case E_V2DFmode:
41759 case E_V2DImode:
41760 if (one_var != 0)
41761 return false;
41762 var = force_reg (GET_MODE_INNER (mode), var);
41763 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
41764 emit_insn (gen_rtx_SET (target, x));
41765 return true;
41766
41767 case E_V4SFmode:
41768 case E_V4SImode:
41769 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
41770 new_target = gen_reg_rtx (mode);
41771 else
41772 new_target = target;
41773 var = force_reg (GET_MODE_INNER (mode), var);
41774 x = gen_rtx_VEC_DUPLICATE (mode, var);
41775 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
41776 emit_insn (gen_rtx_SET (new_target, x));
41777 if (one_var != 0)
41778 {
41779 /* We need to shuffle the value to the correct position, so
41780 create a new pseudo to store the intermediate result. */
41781
41782 /* With SSE2, we can use the integer shuffle insns. */
41783 if (mode != V4SFmode && TARGET_SSE2)
41784 {
41785 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
41786 const1_rtx,
41787 GEN_INT (one_var == 1 ? 0 : 1),
41788 GEN_INT (one_var == 2 ? 0 : 1),
41789 GEN_INT (one_var == 3 ? 0 : 1)));
41790 if (target != new_target)
41791 emit_move_insn (target, new_target);
41792 return true;
41793 }
41794
41795 /* Otherwise convert the intermediate result to V4SFmode and
41796 use the SSE1 shuffle instructions. */
41797 if (mode != V4SFmode)
41798 {
41799 tmp = gen_reg_rtx (V4SFmode);
41800 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
41801 }
41802 else
41803 tmp = new_target;
41804
41805 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
41806 const1_rtx,
41807 GEN_INT (one_var == 1 ? 0 : 1),
41808 GEN_INT (one_var == 2 ? 0+4 : 1+4),
41809 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
41810
41811 if (mode != V4SFmode)
41812 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
41813 else if (tmp != target)
41814 emit_move_insn (target, tmp);
41815 }
41816 else if (target != new_target)
41817 emit_move_insn (target, new_target);
41818 return true;
41819
41820 case E_V8HImode:
41821 case E_V16QImode:
41822 vsimode = V4SImode;
41823 goto widen;
41824 case E_V4HImode:
41825 case E_V8QImode:
41826 if (!mmx_ok)
41827 return false;
41828 vsimode = V2SImode;
41829 goto widen;
41830 widen:
41831 if (one_var != 0)
41832 return false;
41833
41834 /* Zero extend the variable element to SImode and recurse. */
41835 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
41836
41837 x = gen_reg_rtx (vsimode);
41838 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
41839 var, one_var))
41840 gcc_unreachable ();
41841
41842 emit_move_insn (target, gen_lowpart (mode, x));
41843 return true;
41844
41845 default:
41846 return false;
41847 }
41848 }
41849
41850 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
41851 consisting of the values in VALS. It is known that all elements
41852 except ONE_VAR are constants. Return true if successful. */
41853
41854 static bool
41855 ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
41856 rtx target, rtx vals, int one_var)
41857 {
41858 rtx var = XVECEXP (vals, 0, one_var);
41859 machine_mode wmode;
41860 rtx const_vec, x;
41861
41862 const_vec = copy_rtx (vals);
41863 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
41864 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
41865
41866 switch (mode)
41867 {
41868 case E_V2DFmode:
41869 case E_V2DImode:
41870 case E_V2SFmode:
41871 case E_V2SImode:
41872 /* For the two element vectors, it's just as easy to use
41873 the general case. */
41874 return false;
41875
41876 case E_V4DImode:
41877 /* Use ix86_expand_vector_set in 64bit mode only. */
41878 if (!TARGET_64BIT)
41879 return false;
41880 /* FALLTHRU */
41881 case E_V4DFmode:
41882 case E_V8SFmode:
41883 case E_V8SImode:
41884 case E_V16HImode:
41885 case E_V32QImode:
41886 case E_V4SFmode:
41887 case E_V4SImode:
41888 case E_V8HImode:
41889 case E_V4HImode:
41890 break;
41891
41892 case E_V16QImode:
41893 if (TARGET_SSE4_1)
41894 break;
41895 wmode = V8HImode;
41896 goto widen;
41897 case E_V8QImode:
41898 wmode = V4HImode;
41899 goto widen;
41900 widen:
41901 /* There's no way to set one QImode entry easily. Combine
41902 the variable value with its adjacent constant value, and
41903 promote to an HImode set. */
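      /* E.g. for a V8QImode vector whose only variable element is element 3,
	 elements 2 and 3 are combined into a single HImode value (the
	 constant in the low byte, the variable shifted into the high byte)
	 and stored with one V4HImode vector set at position 1.  */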
41904 x = XVECEXP (vals, 0, one_var ^ 1);
41905 if (one_var & 1)
41906 {
41907 var = convert_modes (HImode, QImode, var, true);
41908 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
41909 NULL_RTX, 1, OPTAB_LIB_WIDEN);
41910 x = GEN_INT (INTVAL (x) & 0xff);
41911 }
41912 else
41913 {
41914 var = convert_modes (HImode, QImode, var, true);
41915 x = gen_int_mode (INTVAL (x) << 8, HImode);
41916 }
41917 if (x != const0_rtx)
41918 var = expand_simple_binop (HImode, IOR, var, x, var,
41919 1, OPTAB_LIB_WIDEN);
41920
41921 x = gen_reg_rtx (wmode);
41922 emit_move_insn (x, gen_lowpart (wmode, const_vec));
41923 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
41924
41925 emit_move_insn (target, gen_lowpart (mode, x));
41926 return true;
41927
41928 default:
41929 return false;
41930 }
41931
41932 emit_move_insn (target, const_vec);
41933 ix86_expand_vector_set (mmx_ok, target, var, one_var);
41934 return true;
41935 }
41936
41937 /* A subroutine of ix86_expand_vector_init_general. Use vector
41938 concatenate to handle the most general case: all values variable,
41939 and none identical. */
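/* The N input scalars are paired into N/2 vectors of half the width and
   those are concatenated recursively; e.g. a V8SImode build from eight
   SImode values goes through four V2SImode builds, then two V4SImode
   concats, then the final V8SImode concat.  */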
41940
41941 static void
41942 ix86_expand_vector_init_concat (machine_mode mode,
41943 rtx target, rtx *ops, int n)
41944 {
41945 machine_mode cmode, hmode = VOIDmode, gmode = VOIDmode;
41946 rtx first[16], second[8], third[4];
41947 rtvec v;
41948 int i, j;
41949
41950 switch (n)
41951 {
41952 case 2:
41953 switch (mode)
41954 {
41955 case E_V16SImode:
41956 cmode = V8SImode;
41957 break;
41958 case E_V16SFmode:
41959 cmode = V8SFmode;
41960 break;
41961 case E_V8DImode:
41962 cmode = V4DImode;
41963 break;
41964 case E_V8DFmode:
41965 cmode = V4DFmode;
41966 break;
41967 case E_V8SImode:
41968 cmode = V4SImode;
41969 break;
41970 case E_V8SFmode:
41971 cmode = V4SFmode;
41972 break;
41973 case E_V4DImode:
41974 cmode = V2DImode;
41975 break;
41976 case E_V4DFmode:
41977 cmode = V2DFmode;
41978 break;
41979 case E_V4SImode:
41980 cmode = V2SImode;
41981 break;
41982 case E_V4SFmode:
41983 cmode = V2SFmode;
41984 break;
41985 case E_V2DImode:
41986 cmode = DImode;
41987 break;
41988 case E_V2SImode:
41989 cmode = SImode;
41990 break;
41991 case E_V2DFmode:
41992 cmode = DFmode;
41993 break;
41994 case E_V2SFmode:
41995 cmode = SFmode;
41996 break;
41997 default:
41998 gcc_unreachable ();
41999 }
42000
42001 if (!register_operand (ops[1], cmode))
42002 ops[1] = force_reg (cmode, ops[1]);
42003 if (!register_operand (ops[0], cmode))
42004 ops[0] = force_reg (cmode, ops[0]);
42005 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
42006 ops[1])));
42007 break;
42008
42009 case 4:
42010 switch (mode)
42011 {
42012 case E_V4DImode:
42013 cmode = V2DImode;
42014 break;
42015 case E_V4DFmode:
42016 cmode = V2DFmode;
42017 break;
42018 case E_V4SImode:
42019 cmode = V2SImode;
42020 break;
42021 case E_V4SFmode:
42022 cmode = V2SFmode;
42023 break;
42024 default:
42025 gcc_unreachable ();
42026 }
42027 goto half;
42028
42029 case 8:
42030 switch (mode)
42031 {
42032 case E_V8DImode:
42033 cmode = V2DImode;
42034 hmode = V4DImode;
42035 break;
42036 case E_V8DFmode:
42037 cmode = V2DFmode;
42038 hmode = V4DFmode;
42039 break;
42040 case E_V8SImode:
42041 cmode = V2SImode;
42042 hmode = V4SImode;
42043 break;
42044 case E_V8SFmode:
42045 cmode = V2SFmode;
42046 hmode = V4SFmode;
42047 break;
42048 default:
42049 gcc_unreachable ();
42050 }
42051 goto half;
42052
42053 case 16:
42054 switch (mode)
42055 {
42056 case E_V16SImode:
42057 cmode = V2SImode;
42058 hmode = V4SImode;
42059 gmode = V8SImode;
42060 break;
42061 case E_V16SFmode:
42062 cmode = V2SFmode;
42063 hmode = V4SFmode;
42064 gmode = V8SFmode;
42065 break;
42066 default:
42067 gcc_unreachable ();
42068 }
42069 goto half;
42070
42071 half:
42072 /* FIXME: We process inputs backward to help RA. PR 36222. */
42073 i = n - 1;
42074 j = (n >> 1) - 1;
42075 for (; i > 0; i -= 2, j--)
42076 {
42077 first[j] = gen_reg_rtx (cmode);
42078 v = gen_rtvec (2, ops[i - 1], ops[i]);
42079 ix86_expand_vector_init (false, first[j],
42080 gen_rtx_PARALLEL (cmode, v));
42081 }
42082
42083 n >>= 1;
42084 if (n > 4)
42085 {
42086 gcc_assert (hmode != VOIDmode);
42087 gcc_assert (gmode != VOIDmode);
42088 for (i = j = 0; i < n; i += 2, j++)
42089 {
42090 second[j] = gen_reg_rtx (hmode);
42091 ix86_expand_vector_init_concat (hmode, second [j],
42092 &first [i], 2);
42093 }
42094 n >>= 1;
42095 for (i = j = 0; i < n; i += 2, j++)
42096 {
42097 third[j] = gen_reg_rtx (gmode);
42098 ix86_expand_vector_init_concat (gmode, third[j],
42099 &second[i], 2);
42100 }
42101 n >>= 1;
42102 ix86_expand_vector_init_concat (mode, target, third, n);
42103 }
42104 else if (n > 2)
42105 {
42106 gcc_assert (hmode != VOIDmode);
42107 for (i = j = 0; i < n; i += 2, j++)
42108 {
42109 second[j] = gen_reg_rtx (hmode);
42110 ix86_expand_vector_init_concat (hmode, second [j],
42111 &first [i], 2);
42112 }
42113 n >>= 1;
42114 ix86_expand_vector_init_concat (mode, target, second, n);
42115 }
42116 else
42117 ix86_expand_vector_init_concat (mode, target, first, n);
42118 break;
42119
42120 default:
42121 gcc_unreachable ();
42122 }
42123 }
42124
42125 /* A subroutine of ix86_expand_vector_init_general. Use vector
42126 interleave to handle the most general case: all values variable,
42127 and none identical. */
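/* Each pair of scalar input elements is first packed into the bottom of a
   fresh vector, and successively wider interleave-low operations (HImode,
   then SImode, then DImode granularity for V16QImode) merge those vectors
   into the final result.  */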
42128
42129 static void
42130 ix86_expand_vector_init_interleave (machine_mode mode,
42131 rtx target, rtx *ops, int n)
42132 {
42133 machine_mode first_imode, second_imode, third_imode, inner_mode;
42134 int i, j;
42135 rtx op0, op1;
42136 rtx (*gen_load_even) (rtx, rtx, rtx);
42137 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
42138 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
42139
42140 switch (mode)
42141 {
42142 case E_V8HImode:
42143 gen_load_even = gen_vec_setv8hi;
42144 gen_interleave_first_low = gen_vec_interleave_lowv4si;
42145 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42146 inner_mode = HImode;
42147 first_imode = V4SImode;
42148 second_imode = V2DImode;
42149 third_imode = VOIDmode;
42150 break;
42151 case E_V16QImode:
42152 gen_load_even = gen_vec_setv16qi;
42153 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
42154 gen_interleave_second_low = gen_vec_interleave_lowv4si;
42155 inner_mode = QImode;
42156 first_imode = V8HImode;
42157 second_imode = V4SImode;
42158 third_imode = V2DImode;
42159 break;
42160 default:
42161 gcc_unreachable ();
42162 }
42163
42164 for (i = 0; i < n; i++)
42165 {
42166 /* Extend the odd element to SImode using a paradoxical SUBREG. */
42167 op0 = gen_reg_rtx (SImode);
42168 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
42169
42170 /* Insert the SImode value as low element of V4SImode vector. */
42171 op1 = gen_reg_rtx (V4SImode);
42172 op0 = gen_rtx_VEC_MERGE (V4SImode,
42173 gen_rtx_VEC_DUPLICATE (V4SImode,
42174 op0),
42175 CONST0_RTX (V4SImode),
42176 const1_rtx);
42177 emit_insn (gen_rtx_SET (op1, op0));
42178
42179 /* Cast the V4SImode vector back to a vector in the original mode. */
42180 op0 = gen_reg_rtx (mode);
42181 emit_move_insn (op0, gen_lowpart (mode, op1));
42182
42183 /* Load even elements into the second position. */
42184 emit_insn (gen_load_even (op0,
42185 force_reg (inner_mode,
42186 ops [i + i + 1]),
42187 const1_rtx));
42188
42189 /* Cast vector to FIRST_IMODE vector. */
42190 ops[i] = gen_reg_rtx (first_imode);
42191 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
42192 }
42193
42194 /* Interleave low FIRST_IMODE vectors. */
42195 for (i = j = 0; i < n; i += 2, j++)
42196 {
42197 op0 = gen_reg_rtx (first_imode);
42198 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
42199
42200 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
42201 ops[j] = gen_reg_rtx (second_imode);
42202 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
42203 }
42204
42205 /* Interleave low SECOND_IMODE vectors. */
42206 switch (second_imode)
42207 {
42208 case E_V4SImode:
42209 for (i = j = 0; i < n / 2; i += 2, j++)
42210 {
42211 op0 = gen_reg_rtx (second_imode);
42212 emit_insn (gen_interleave_second_low (op0, ops[i],
42213 ops[i + 1]));
42214
42215 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
42216 vector. */
42217 ops[j] = gen_reg_rtx (third_imode);
42218 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
42219 }
42220 second_imode = V2DImode;
42221 gen_interleave_second_low = gen_vec_interleave_lowv2di;
42222 /* FALLTHRU */
42223
42224 case E_V2DImode:
42225 op0 = gen_reg_rtx (second_imode);
42226 emit_insn (gen_interleave_second_low (op0, ops[0],
42227 ops[1]));
42228
42229 /* Cast the SECOND_IMODE vector back to a vector in the original
42230 mode. */
42231 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
42232 break;
42233
42234 default:
42235 gcc_unreachable ();
42236 }
42237 }
42238
42239 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
42240 all values variable, and none identical. */
42241
42242 static void
42243 ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
42244 rtx target, rtx vals)
42245 {
42246 rtx ops[64], op0, op1, op2, op3, op4, op5;
42247 machine_mode half_mode = VOIDmode;
42248 machine_mode quarter_mode = VOIDmode;
42249 int n, i;
42250
42251 switch (mode)
42252 {
42253 case E_V2SFmode:
42254 case E_V2SImode:
42255 if (!mmx_ok && !TARGET_SSE)
42256 break;
42257 /* FALLTHRU */
42258
42259 case E_V16SImode:
42260 case E_V16SFmode:
42261 case E_V8DFmode:
42262 case E_V8DImode:
42263 case E_V8SFmode:
42264 case E_V8SImode:
42265 case E_V4DFmode:
42266 case E_V4DImode:
42267 case E_V4SFmode:
42268 case E_V4SImode:
42269 case E_V2DFmode:
42270 case E_V2DImode:
42271 n = GET_MODE_NUNITS (mode);
42272 for (i = 0; i < n; i++)
42273 ops[i] = XVECEXP (vals, 0, i);
42274 ix86_expand_vector_init_concat (mode, target, ops, n);
42275 return;
42276
42277 case E_V2TImode:
42278 for (i = 0; i < 2; i++)
42279 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42280 op0 = gen_reg_rtx (V4DImode);
42281 ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
42282 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42283 return;
42284
42285 case E_V4TImode:
42286 for (i = 0; i < 4; i++)
42287 ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
42288 ops[4] = gen_reg_rtx (V4DImode);
42289 ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
42290 ops[5] = gen_reg_rtx (V4DImode);
42291 ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
42292 op0 = gen_reg_rtx (V8DImode);
42293 ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
42294 emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
42295 return;
42296
42297 case E_V32QImode:
42298 half_mode = V16QImode;
42299 goto half;
42300
42301 case E_V16HImode:
42302 half_mode = V8HImode;
42303 goto half;
42304
42305 half:
42306 n = GET_MODE_NUNITS (mode);
42307 for (i = 0; i < n; i++)
42308 ops[i] = XVECEXP (vals, 0, i);
42309 op0 = gen_reg_rtx (half_mode);
42310 op1 = gen_reg_rtx (half_mode);
42311 ix86_expand_vector_init_interleave (half_mode, op0, ops,
42312 n >> 2);
42313 ix86_expand_vector_init_interleave (half_mode, op1,
42314 &ops [n >> 1], n >> 2);
42315 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
42316 return;
42317
42318 case E_V64QImode:
42319 quarter_mode = V16QImode;
42320 half_mode = V32QImode;
42321 goto quarter;
42322
42323 case E_V32HImode:
42324 quarter_mode = V8HImode;
42325 half_mode = V16HImode;
42326 goto quarter;
42327
42328 quarter:
42329 n = GET_MODE_NUNITS (mode);
42330 for (i = 0; i < n; i++)
42331 ops[i] = XVECEXP (vals, 0, i);
42332 op0 = gen_reg_rtx (quarter_mode);
42333 op1 = gen_reg_rtx (quarter_mode);
42334 op2 = gen_reg_rtx (quarter_mode);
42335 op3 = gen_reg_rtx (quarter_mode);
42336 op4 = gen_reg_rtx (half_mode);
42337 op5 = gen_reg_rtx (half_mode);
42338 ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
42339 n >> 3);
42340 ix86_expand_vector_init_interleave (quarter_mode, op1,
42341 &ops [n >> 2], n >> 3);
42342 ix86_expand_vector_init_interleave (quarter_mode, op2,
42343 &ops [n >> 1], n >> 3);
42344 ix86_expand_vector_init_interleave (quarter_mode, op3,
42345 &ops [(n >> 1) | (n >> 2)], n >> 3);
42346 emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
42347 emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
42348 emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
42349 return;
42350
42351 case E_V16QImode:
42352 if (!TARGET_SSE4_1)
42353 break;
42354 /* FALLTHRU */
42355
42356 case E_V8HImode:
42357 if (!TARGET_SSE2)
42358 break;
42359
42360 /* Don't use ix86_expand_vector_init_interleave if we can't
42361 move from GPR to SSE register directly. */
42362 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
42363 break;
42364
42365 n = GET_MODE_NUNITS (mode);
42366 for (i = 0; i < n; i++)
42367 ops[i] = XVECEXP (vals, 0, i);
42368 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
42369 return;
42370
42371 case E_V4HImode:
42372 case E_V8QImode:
42373 break;
42374
42375 default:
42376 gcc_unreachable ();
42377 }
42378
42379 {
42380 int i, j, n_elts, n_words, n_elt_per_word;
42381 machine_mode inner_mode;
42382 rtx words[4], shift;
42383
42384 inner_mode = GET_MODE_INNER (mode);
42385 n_elts = GET_MODE_NUNITS (mode);
42386 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
42387 n_elt_per_word = n_elts / n_words;
42388 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
42389
42390 for (i = 0; i < n_words; ++i)
42391 {
42392 rtx word = NULL_RTX;
42393
42394 for (j = 0; j < n_elt_per_word; ++j)
42395 {
42396 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
42397 elt = convert_modes (word_mode, inner_mode, elt, true);
42398
42399 if (j == 0)
42400 word = elt;
42401 else
42402 {
42403 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
42404 word, 1, OPTAB_LIB_WIDEN);
42405 word = expand_simple_binop (word_mode, IOR, word, elt,
42406 word, 1, OPTAB_LIB_WIDEN);
42407 }
42408 }
42409
42410 words[i] = word;
42411 }
42412
42413 if (n_words == 1)
42414 emit_move_insn (target, gen_lowpart (mode, words[0]));
42415 else if (n_words == 2)
42416 {
42417 rtx tmp = gen_reg_rtx (mode);
42418 emit_clobber (tmp);
42419 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
42420 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
42421 emit_move_insn (target, tmp);
42422 }
42423 else if (n_words == 4)
42424 {
42425 rtx tmp = gen_reg_rtx (V4SImode);
42426 gcc_assert (word_mode == SImode);
42427 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
42428 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
42429 emit_move_insn (target, gen_lowpart (mode, tmp));
42430 }
42431 else
42432 gcc_unreachable ();
42433 }
42434 }
42435
42436 /* Initialize vector TARGET via VALS. Suppress the use of MMX
42437 instructions unless MMX_OK is true. */
42438
42439 void
42440 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
42441 {
42442 machine_mode mode = GET_MODE (target);
42443 machine_mode inner_mode = GET_MODE_INNER (mode);
42444 int n_elts = GET_MODE_NUNITS (mode);
42445 int n_var = 0, one_var = -1;
42446 bool all_same = true, all_const_zero = true;
42447 int i;
42448 rtx x;
42449
42450 /* First handle initialization from vector elements, i.e. the case where each entry of VALS is itself a vector. */
42451 if (n_elts != XVECLEN (vals, 0))
42452 {
42453 rtx subtarget = target;
42454 x = XVECEXP (vals, 0, 0);
42455 gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
42456 if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
42457 {
42458 rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
42459 if (inner_mode == QImode || inner_mode == HImode)
42460 {
42461 unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
42462 mode = mode_for_vector (SImode, n_bits / 4).require ();
42463 inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
42464 ops[0] = gen_lowpart (inner_mode, ops[0]);
42465 ops[1] = gen_lowpart (inner_mode, ops[1]);
42466 subtarget = gen_reg_rtx (mode);
42467 }
42468 ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
42469 if (subtarget != target)
42470 emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
42471 return;
42472 }
42473 gcc_unreachable ();
42474 }
42475
42476 for (i = 0; i < n_elts; ++i)
42477 {
42478 x = XVECEXP (vals, 0, i);
42479 if (!(CONST_SCALAR_INT_P (x)
42480 || CONST_DOUBLE_P (x)
42481 || CONST_FIXED_P (x)))
42482 n_var++, one_var = i;
42483 else if (x != CONST0_RTX (inner_mode))
42484 all_const_zero = false;
42485 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
42486 all_same = false;
42487 }
42488
42489 /* Constants are best loaded from the constant pool. */
42490 if (n_var == 0)
42491 {
42492 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
42493 return;
42494 }
42495
42496 /* If all values are identical, broadcast the value. */
42497 if (all_same
42498 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
42499 XVECEXP (vals, 0, 0)))
42500 return;
42501
42502 /* Values where only one field is non-constant are best loaded from
42503 the pool and overwritten via move later. */
42504 if (n_var == 1)
42505 {
42506 if (all_const_zero
42507 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
42508 XVECEXP (vals, 0, one_var),
42509 one_var))
42510 return;
42511
42512 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
42513 return;
42514 }
42515
42516 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
42517 }
42518
42519 void
42520 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
42521 {
42522 machine_mode mode = GET_MODE (target);
42523 machine_mode inner_mode = GET_MODE_INNER (mode);
42524 machine_mode half_mode;
42525 bool use_vec_merge = false;
42526 rtx tmp;
42527 static rtx (*gen_extract[6][2]) (rtx, rtx)
42528 = {
42529 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
42530 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
42531 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
42532 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
42533 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
42534 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
42535 };
42536 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
42537 = {
42538 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
42539 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
42540 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
42541 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
42542 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
42543 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
42544 };
42545 int i, j, n;
42546 machine_mode mmode = VOIDmode;
42547 rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
42548
42549 switch (mode)
42550 {
42551 case E_V2SFmode:
42552 case E_V2SImode:
42553 if (mmx_ok)
42554 {
42555 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42556 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
42557 if (elt == 0)
42558 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42559 else
42560 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42561 emit_insn (gen_rtx_SET (target, tmp));
42562 return;
42563 }
42564 break;
42565
42566 case E_V2DImode:
42567 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
42568 if (use_vec_merge)
42569 break;
42570
42571 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
42572 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
42573 if (elt == 0)
42574 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
42575 else
42576 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
42577 emit_insn (gen_rtx_SET (target, tmp));
42578 return;
42579
42580 case E_V2DFmode:
42581 {
42582 rtx op0, op1;
42583
42584 /* For the two element vectors, we implement a VEC_CONCAT with
42585 the extraction of the other element. */
42586
42587 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
42588 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
42589
42590 if (elt == 0)
42591 op0 = val, op1 = tmp;
42592 else
42593 op0 = tmp, op1 = val;
42594
42595 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
42596 emit_insn (gen_rtx_SET (target, tmp));
42597 }
42598 return;
42599
42600 case E_V4SFmode:
42601 use_vec_merge = TARGET_SSE4_1;
42602 if (use_vec_merge)
42603 break;
42604
42605 switch (elt)
42606 {
42607 case 0:
42608 use_vec_merge = true;
42609 break;
42610
42611 case 1:
42612 /* tmp = target = A B C D */
42613 tmp = copy_to_reg (target);
42614 /* target = A A B B */
42615 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
42616 /* target = X A B B */
42617 ix86_expand_vector_set (false, target, val, 0);
42618 /* target = A X C D */
42619 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42620 const1_rtx, const0_rtx,
42621 GEN_INT (2+4), GEN_INT (3+4)));
42622 return;
42623
42624 case 2:
42625 /* tmp = target = A B C D */
42626 tmp = copy_to_reg (target);
42627 /* tmp = X B C D */
42628 ix86_expand_vector_set (false, tmp, val, 0);
42629 /* target = A B X D */
42630 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42631 const0_rtx, const1_rtx,
42632 GEN_INT (0+4), GEN_INT (3+4)));
42633 return;
42634
42635 case 3:
42636 /* tmp = target = A B C D */
42637 tmp = copy_to_reg (target);
42638 /* tmp = X B C D */
42639 ix86_expand_vector_set (false, tmp, val, 0);
42640 /* target = A B C X */
42641 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
42642 const0_rtx, const1_rtx,
42643 GEN_INT (2+4), GEN_INT (0+4)));
42644 return;
42645
42646 default:
42647 gcc_unreachable ();
42648 }
42649 break;
42650
42651 case E_V4SImode:
42652 use_vec_merge = TARGET_SSE4_1;
42653 if (use_vec_merge)
42654 break;
42655
42656 /* Element 0 handled by vec_merge below. */
42657 if (elt == 0)
42658 {
42659 use_vec_merge = true;
42660 break;
42661 }
42662
42663 if (TARGET_SSE2)
42664 {
42665 /* With SSE2, use integer shuffles to swap element 0 and ELT,
42666 store into element 0, then shuffle them back. */
42667
42668 rtx order[4];
42669
42670 order[0] = GEN_INT (elt);
42671 order[1] = const1_rtx;
42672 order[2] = const2_rtx;
42673 order[3] = GEN_INT (3);
42674 order[elt] = const0_rtx;
42675
42676 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42677 order[1], order[2], order[3]));
42678
42679 ix86_expand_vector_set (false, target, val, 0);
42680
42681 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
42682 order[1], order[2], order[3]));
42683 }
42684 else
42685 {
42686 /* For SSE1, we have to reuse the V4SF code. */
42687 rtx t = gen_reg_rtx (V4SFmode);
42688 emit_move_insn (t, gen_lowpart (V4SFmode, target));
42689 ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
42690 emit_move_insn (target, gen_lowpart (mode, t));
42691 }
42692 return;
42693
42694 case E_V8HImode:
42695 use_vec_merge = TARGET_SSE2;
42696 break;
42697 case E_V4HImode:
42698 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42699 break;
42700
42701 case E_V16QImode:
42702 use_vec_merge = TARGET_SSE4_1;
42703 break;
42704
42705 case E_V8QImode:
42706 break;
42707
42708 case E_V32QImode:
42709 half_mode = V16QImode;
42710 j = 0;
42711 n = 16;
42712 goto half;
42713
42714 case E_V16HImode:
42715 half_mode = V8HImode;
42716 j = 1;
42717 n = 8;
42718 goto half;
42719
42720 case E_V8SImode:
42721 half_mode = V4SImode;
42722 j = 2;
42723 n = 4;
42724 goto half;
42725
42726 case E_V4DImode:
42727 half_mode = V2DImode;
42728 j = 3;
42729 n = 2;
42730 goto half;
42731
42732 case E_V8SFmode:
42733 half_mode = V4SFmode;
42734 j = 4;
42735 n = 4;
42736 goto half;
42737
42738 case E_V4DFmode:
42739 half_mode = V2DFmode;
42740 j = 5;
42741 n = 2;
42742 goto half;
42743
42744 half:
42745 /* Compute offset. */
42746 i = elt / n;
42747 elt %= n;
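      /* E.g. setting element 5 of a V8SImode vector extracts the high
	 V4SImode half (i == 1), sets element 1 of that half, and re-inserts
	 the half into the full vector.  */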
42748
42749 gcc_assert (i <= 1);
42750
42751 /* Extract the half. */
42752 tmp = gen_reg_rtx (half_mode);
42753 emit_insn (gen_extract[j][i] (tmp, target));
42754
42755 /* Put val in tmp at elt. */
42756 ix86_expand_vector_set (false, tmp, val, elt);
42757
42758 /* Put it back. */
42759 emit_insn (gen_insert[j][i] (target, target, tmp));
42760 return;
42761
42762 case E_V8DFmode:
42763 if (TARGET_AVX512F)
42764 {
42765 mmode = QImode;
42766 gen_blendm = gen_avx512f_blendmv8df;
42767 }
42768 break;
42769
42770 case E_V8DImode:
42771 if (TARGET_AVX512F)
42772 {
42773 mmode = QImode;
42774 gen_blendm = gen_avx512f_blendmv8di;
42775 }
42776 break;
42777
42778 case E_V16SFmode:
42779 if (TARGET_AVX512F)
42780 {
42781 mmode = HImode;
42782 gen_blendm = gen_avx512f_blendmv16sf;
42783 }
42784 break;
42785
42786 case E_V16SImode:
42787 if (TARGET_AVX512F)
42788 {
42789 mmode = HImode;
42790 gen_blendm = gen_avx512f_blendmv16si;
42791 }
42792 break;
42793
42794 case E_V32HImode:
42795 if (TARGET_AVX512F && TARGET_AVX512BW)
42796 {
42797 mmode = SImode;
42798 gen_blendm = gen_avx512bw_blendmv32hi;
42799 }
42800 break;
42801
42802 case E_V64QImode:
42803 if (TARGET_AVX512F && TARGET_AVX512BW)
42804 {
42805 mmode = DImode;
42806 gen_blendm = gen_avx512bw_blendmv64qi;
42807 }
42808 break;
42809
42810 default:
42811 break;
42812 }
42813
42814 if (mmode != VOIDmode)
42815 {
42816 tmp = gen_reg_rtx (mode);
42817 emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
42818 /* The avx512*_blendm<mode> expanders have different operand order
42819 from VEC_MERGE. In VEC_MERGE, the first input operand is used for
42820 elements where the mask is set and second input operand otherwise,
42821 in {sse,avx}*_*blend* the first input operand is used for elements
42822 where the mask is clear and second input operand otherwise. */
42823 emit_insn (gen_blendm (target, target, tmp,
42824 force_reg (mmode,
42825 gen_int_mode (1 << elt, mmode))));
42826 }
42827 else if (use_vec_merge)
42828 {
42829 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
42830 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
42831 emit_insn (gen_rtx_SET (target, tmp));
42832 }
42833 else
42834 {
42835 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
42836
42837 emit_move_insn (mem, target);
42838
42839 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
42840 emit_move_insn (tmp, val);
42841
42842 emit_move_insn (target, mem);
42843 }
42844 }
42845
42846 void
42847 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
42848 {
42849 machine_mode mode = GET_MODE (vec);
42850 machine_mode inner_mode = GET_MODE_INNER (mode);
42851 bool use_vec_extr = false;
42852 rtx tmp;
42853
42854 switch (mode)
42855 {
42856 case E_V2SImode:
42857 case E_V2SFmode:
42858 if (!mmx_ok)
42859 break;
42860 /* FALLTHRU */
42861
42862 case E_V2DFmode:
42863 case E_V2DImode:
42864 case E_V2TImode:
42865 case E_V4TImode:
42866 use_vec_extr = true;
42867 break;
42868
42869 case E_V4SFmode:
42870 use_vec_extr = TARGET_SSE4_1;
42871 if (use_vec_extr)
42872 break;
42873
42874 switch (elt)
42875 {
42876 case 0:
42877 tmp = vec;
42878 break;
42879
42880 case 1:
42881 case 3:
42882 tmp = gen_reg_rtx (mode);
42883 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
42884 GEN_INT (elt), GEN_INT (elt),
42885 GEN_INT (elt+4), GEN_INT (elt+4)));
42886 break;
42887
42888 case 2:
42889 tmp = gen_reg_rtx (mode);
42890 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
42891 break;
42892
42893 default:
42894 gcc_unreachable ();
42895 }
42896 vec = tmp;
42897 use_vec_extr = true;
42898 elt = 0;
42899 break;
42900
42901 case E_V4SImode:
42902 use_vec_extr = TARGET_SSE4_1;
42903 if (use_vec_extr)
42904 break;
42905
42906 if (TARGET_SSE2)
42907 {
42908 switch (elt)
42909 {
42910 case 0:
42911 tmp = vec;
42912 break;
42913
42914 case 1:
42915 case 3:
42916 tmp = gen_reg_rtx (mode);
42917 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
42918 GEN_INT (elt), GEN_INT (elt),
42919 GEN_INT (elt), GEN_INT (elt)));
42920 break;
42921
42922 case 2:
42923 tmp = gen_reg_rtx (mode);
42924 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
42925 break;
42926
42927 default:
42928 gcc_unreachable ();
42929 }
42930 vec = tmp;
42931 use_vec_extr = true;
42932 elt = 0;
42933 }
42934 else
42935 {
42936 /* For SSE1, we have to reuse the V4SF code. */
42937 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
42938 gen_lowpart (V4SFmode, vec), elt);
42939 return;
42940 }
42941 break;
42942
42943 case E_V8HImode:
42944 use_vec_extr = TARGET_SSE2;
42945 break;
42946 case E_V4HImode:
42947 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
42948 break;
42949
42950 case E_V16QImode:
42951 use_vec_extr = TARGET_SSE4_1;
42952 break;
42953
42954 case E_V8SFmode:
42955 if (TARGET_AVX)
42956 {
42957 tmp = gen_reg_rtx (V4SFmode);
42958 if (elt < 4)
42959 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
42960 else
42961 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
42962 ix86_expand_vector_extract (false, target, tmp, elt & 3);
42963 return;
42964 }
42965 break;
42966
42967 case E_V4DFmode:
42968 if (TARGET_AVX)
42969 {
42970 tmp = gen_reg_rtx (V2DFmode);
42971 if (elt < 2)
42972 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
42973 else
42974 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
42975 ix86_expand_vector_extract (false, target, tmp, elt & 1);
42976 return;
42977 }
42978 break;
42979
42980 case E_V32QImode:
42981 if (TARGET_AVX)
42982 {
42983 tmp = gen_reg_rtx (V16QImode);
42984 if (elt < 16)
42985 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
42986 else
42987 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
42988 ix86_expand_vector_extract (false, target, tmp, elt & 15);
42989 return;
42990 }
42991 break;
42992
42993 case E_V16HImode:
42994 if (TARGET_AVX)
42995 {
42996 tmp = gen_reg_rtx (V8HImode);
42997 if (elt < 8)
42998 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
42999 else
43000 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
43001 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43002 return;
43003 }
43004 break;
43005
43006 case E_V8SImode:
43007 if (TARGET_AVX)
43008 {
43009 tmp = gen_reg_rtx (V4SImode);
43010 if (elt < 4)
43011 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
43012 else
43013 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
43014 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43015 return;
43016 }
43017 break;
43018
43019 case E_V4DImode:
43020 if (TARGET_AVX)
43021 {
43022 tmp = gen_reg_rtx (V2DImode);
43023 if (elt < 2)
43024 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
43025 else
43026 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
43027 ix86_expand_vector_extract (false, target, tmp, elt & 1);
43028 return;
43029 }
43030 break;
43031
43032 case E_V32HImode:
43033 if (TARGET_AVX512BW)
43034 {
43035 tmp = gen_reg_rtx (V16HImode);
43036 if (elt < 16)
43037 emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
43038 else
43039 emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
43040 ix86_expand_vector_extract (false, target, tmp, elt & 15);
43041 return;
43042 }
43043 break;
43044
43045 case E_V64QImode:
43046 if (TARGET_AVX512BW)
43047 {
43048 tmp = gen_reg_rtx (V32QImode);
43049 if (elt < 32)
43050 emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
43051 else
43052 emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
43053 ix86_expand_vector_extract (false, target, tmp, elt & 31);
43054 return;
43055 }
43056 break;
43057
43058 case E_V16SFmode:
43059 tmp = gen_reg_rtx (V8SFmode);
43060 if (elt < 8)
43061 emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
43062 else
43063 emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
43064 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43065 return;
43066
43067 case E_V8DFmode:
43068 tmp = gen_reg_rtx (V4DFmode);
43069 if (elt < 4)
43070 emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
43071 else
43072 emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
43073 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43074 return;
43075
43076 case E_V16SImode:
43077 tmp = gen_reg_rtx (V8SImode);
43078 if (elt < 8)
43079 emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
43080 else
43081 emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
43082 ix86_expand_vector_extract (false, target, tmp, elt & 7);
43083 return;
43084
43085 case E_V8DImode:
43086 tmp = gen_reg_rtx (V4DImode);
43087 if (elt < 4)
43088 emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
43089 else
43090 emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
43091 ix86_expand_vector_extract (false, target, tmp, elt & 3);
43092 return;
43093
43094 case E_V8QImode:
43095 /* ??? Could extract the appropriate HImode element and shift. */
43096 default:
43097 break;
43098 }
43099
43100 if (use_vec_extr)
43101 {
43102 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
43103 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
43104
43105 /* Let the rtl optimizers know about the zero extension performed. */
43106 if (inner_mode == QImode || inner_mode == HImode)
43107 {
43108 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
43109 target = gen_lowpart (SImode, target);
43110 }
43111
43112 emit_insn (gen_rtx_SET (target, tmp));
43113 }
43114 else
43115 {
43116 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
43117
43118 emit_move_insn (mem, vec);
43119
43120 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
43121 emit_move_insn (target, tmp);
43122 }
43123 }
43124
43125 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
43126 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
43127 The upper bits of DEST are undefined, though they shouldn't cause
43128 exceptions (some bits from src or all zeros are ok). */
43129
43130 static void
43131 emit_reduc_half (rtx dest, rtx src, int i)
43132 {
43133 rtx tem, d = dest;
43134 switch (GET_MODE (src))
43135 {
43136 case E_V4SFmode:
43137 if (i == 128)
43138 tem = gen_sse_movhlps (dest, src, src);
43139 else
43140 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
43141 GEN_INT (1 + 4), GEN_INT (1 + 4));
43142 break;
43143 case E_V2DFmode:
43144 tem = gen_vec_interleave_highv2df (dest, src, src);
43145 break;
43146 case E_V16QImode:
43147 case E_V8HImode:
43148 case E_V4SImode:
43149 case E_V2DImode:
43150 d = gen_reg_rtx (V1TImode);
43151 tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
43152 GEN_INT (i / 2));
43153 break;
43154 case E_V8SFmode:
43155 if (i == 256)
43156 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
43157 else
43158 tem = gen_avx_shufps256 (dest, src, src,
43159 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
43160 break;
43161 case E_V4DFmode:
43162 if (i == 256)
43163 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
43164 else
43165 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
43166 break;
43167 case E_V32QImode:
43168 case E_V16HImode:
43169 case E_V8SImode:
43170 case E_V4DImode:
43171 if (i == 256)
43172 {
43173 if (GET_MODE (dest) != V4DImode)
43174 d = gen_reg_rtx (V4DImode);
43175 tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
43176 gen_lowpart (V4DImode, src),
43177 const1_rtx);
43178 }
43179 else
43180 {
43181 d = gen_reg_rtx (V2TImode);
43182 tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
43183 GEN_INT (i / 2));
43184 }
43185 break;
43186 case E_V64QImode:
43187 case E_V32HImode:
43188 case E_V16SImode:
43189 case E_V16SFmode:
43190 case E_V8DImode:
43191 case E_V8DFmode:
43192 if (i > 128)
43193 tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
43194 gen_lowpart (V16SImode, src),
43195 gen_lowpart (V16SImode, src),
43196 GEN_INT (0x4 + (i == 512 ? 4 : 0)),
43197 GEN_INT (0x5 + (i == 512 ? 4 : 0)),
43198 GEN_INT (0x6 + (i == 512 ? 4 : 0)),
43199 GEN_INT (0x7 + (i == 512 ? 4 : 0)),
43200 GEN_INT (0xC), GEN_INT (0xD),
43201 GEN_INT (0xE), GEN_INT (0xF),
43202 GEN_INT (0x10), GEN_INT (0x11),
43203 GEN_INT (0x12), GEN_INT (0x13),
43204 GEN_INT (0x14), GEN_INT (0x15),
43205 GEN_INT (0x16), GEN_INT (0x17));
43206 else
43207 tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
43208 gen_lowpart (V16SImode, src),
43209 GEN_INT (i == 128 ? 0x2 : 0x1),
43210 GEN_INT (0x3),
43211 GEN_INT (0x3),
43212 GEN_INT (0x3),
43213 GEN_INT (i == 128 ? 0x6 : 0x5),
43214 GEN_INT (0x7),
43215 GEN_INT (0x7),
43216 GEN_INT (0x7),
43217 GEN_INT (i == 128 ? 0xA : 0x9),
43218 GEN_INT (0xB),
43219 GEN_INT (0xB),
43220 GEN_INT (0xB),
43221 GEN_INT (i == 128 ? 0xE : 0xD),
43222 GEN_INT (0xF),
43223 GEN_INT (0xF),
43224 GEN_INT (0xF));
43225 break;
43226 default:
43227 gcc_unreachable ();
43228 }
43229 emit_insn (tem);
43230 if (d != dest)
43231 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
43232 }
43233
43234 /* Expand a vector reduction. FN is the binary pattern to reduce;
43235 DEST is the destination; IN is the input vector. */
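/* The reduction halves the active width on every step: for a V8SImode input
   the loop runs with i = 256, 128 and 64, each time moving the upper half of
   the live lanes down with emit_reduc_half and combining it with the lower
   half via FN, so only log2(nelts) combine operations are emitted.  */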
43236
43237 void
43238 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
43239 {
43240 rtx half, dst, vec = in;
43241 machine_mode mode = GET_MODE (in);
43242 int i;
43243
43244 /* SSE4.1 has a special instruction (phminposuw) for V8HImode UMIN reduction. */
43245 if (TARGET_SSE4_1
43246 && mode == V8HImode
43247 && fn == gen_uminv8hi3)
43248 {
43249 emit_insn (gen_sse4_1_phminposuw (dest, in));
43250 return;
43251 }
43252
43253 for (i = GET_MODE_BITSIZE (mode);
43254 i > GET_MODE_UNIT_BITSIZE (mode);
43255 i >>= 1)
43256 {
43257 half = gen_reg_rtx (mode);
43258 emit_reduc_half (half, vec, i);
43259 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
43260 dst = dest;
43261 else
43262 dst = gen_reg_rtx (mode);
43263 emit_insn (fn (dst, half, vec));
43264 vec = dst;
43265 }
43266 }
43267 \f
43268 /* Target hook for scalar_mode_supported_p. */
43269 static bool
43270 ix86_scalar_mode_supported_p (scalar_mode mode)
43271 {
43272 if (DECIMAL_FLOAT_MODE_P (mode))
43273 return default_decimal_float_supported_p ();
43274 else if (mode == TFmode)
43275 return true;
43276 else
43277 return default_scalar_mode_supported_p (mode);
43278 }
43279
43280 /* Implements target hook vector_mode_supported_p. */
43281 static bool
43282 ix86_vector_mode_supported_p (machine_mode mode)
43283 {
43284 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
43285 return true;
43286 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
43287 return true;
43288 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
43289 return true;
43290 if (TARGET_AVX512F && VALID_AVX512F_REG_MODE (mode))
43291 return true;
43292 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
43293 return true;
43294 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
43295 return true;
43296 return false;
43297 }
43298
43299 /* Target hook for c_mode_for_suffix. */
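/* This is what lets constants such as 1.0q (__float128, TFmode) and
   1.0w (__float80, XFmode) be written in C on x86 targets.  */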
43300 static machine_mode
43301 ix86_c_mode_for_suffix (char suffix)
43302 {
43303 if (suffix == 'q')
43304 return TFmode;
43305 if (suffix == 'w')
43306 return XFmode;
43307
43308 return VOIDmode;
43309 }
43310
43311 /* Worker function for TARGET_MD_ASM_ADJUST.
43312
43313 We implement asm flag outputs, and maintain source compatibility
43314 with the old cc0-based compiler. */
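/* An asm flag output uses an "=@cc<cond>" constraint; for illustration,
   with A and B being any int variables,

     bool z;
     asm ("cmpl %2, %1" : "=@ccz" (z) : "r" (a), "r" (b));

   makes Z receive the ZF result of the comparison without any explicit
   setcc in the asm template.  */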
43315
43316 static rtx_insn *
43317 ix86_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &/*inputs*/,
43318 vec<const char *> &constraints,
43319 vec<rtx> &clobbers, HARD_REG_SET &clobbered_regs)
43320 {
43321 clobbers.safe_push (gen_rtx_REG (CCFPmode, FPSR_REG));
43322 SET_HARD_REG_BIT (clobbered_regs, FPSR_REG);
43323
43324 bool saw_asm_flag = false;
43325
43326 start_sequence ();
43327 for (unsigned i = 0, n = outputs.length (); i < n; ++i)
43328 {
43329 const char *con = constraints[i];
43330 if (strncmp (con, "=@cc", 4) != 0)
43331 continue;
43332 con += 4;
43333 if (strchr (con, ',') != NULL)
43334 {
43335 error ("alternatives not allowed in asm flag output");
43336 continue;
43337 }
43338
43339 bool invert = false;
43340 if (con[0] == 'n')
43341 invert = true, con++;
43342
43343 machine_mode mode = CCmode;
43344 rtx_code code = UNKNOWN;
43345
43346 switch (con[0])
43347 {
43348 case 'a':
43349 if (con[1] == 0)
43350 mode = CCAmode, code = EQ;
43351 else if (con[1] == 'e' && con[2] == 0)
43352 mode = CCCmode, code = NE;
43353 break;
43354 case 'b':
43355 if (con[1] == 0)
43356 mode = CCCmode, code = EQ;
43357 else if (con[1] == 'e' && con[2] == 0)
43358 mode = CCAmode, code = NE;
43359 break;
43360 case 'c':
43361 if (con[1] == 0)
43362 mode = CCCmode, code = EQ;
43363 break;
43364 case 'e':
43365 if (con[1] == 0)
43366 mode = CCZmode, code = EQ;
43367 break;
43368 case 'g':
43369 if (con[1] == 0)
43370 mode = CCGCmode, code = GT;
43371 else if (con[1] == 'e' && con[2] == 0)
43372 mode = CCGCmode, code = GE;
43373 break;
43374 case 'l':
43375 if (con[1] == 0)
43376 mode = CCGCmode, code = LT;
43377 else if (con[1] == 'e' && con[2] == 0)
43378 mode = CCGCmode, code = LE;
43379 break;
43380 case 'o':
43381 if (con[1] == 0)
43382 mode = CCOmode, code = EQ;
43383 break;
43384 case 'p':
43385 if (con[1] == 0)
43386 mode = CCPmode, code = EQ;
43387 break;
43388 case 's':
43389 if (con[1] == 0)
43390 mode = CCSmode, code = EQ;
43391 break;
43392 case 'z':
43393 if (con[1] == 0)
43394 mode = CCZmode, code = EQ;
43395 break;
43396 }
43397 if (code == UNKNOWN)
43398 {
43399 error ("unknown asm flag output %qs", constraints[i]);
43400 continue;
43401 }
43402 if (invert)
43403 code = reverse_condition (code);
43404
43405 rtx dest = outputs[i];
43406 if (!saw_asm_flag)
43407 {
43408 /* This is the first asm flag output. Here we put the flags
43409 register in as the real output and adjust the condition to
43410 allow it. */
43411 constraints[i] = "=Bf";
43412 outputs[i] = gen_rtx_REG (CCmode, FLAGS_REG);
43413 saw_asm_flag = true;
43414 }
43415 else
43416 {
43417 /* We don't need the flags register as output twice. */
43418 constraints[i] = "=X";
43419 outputs[i] = gen_rtx_SCRATCH (SImode);
43420 }
43421
43422 rtx x = gen_rtx_REG (mode, FLAGS_REG);
43423 x = gen_rtx_fmt_ee (code, QImode, x, const0_rtx);
43424
43425 machine_mode dest_mode = GET_MODE (dest);
43426 if (!SCALAR_INT_MODE_P (dest_mode))
43427 {
43428 error ("invalid type for asm flag output");
43429 continue;
43430 }
43431
43432 if (dest_mode == DImode && !TARGET_64BIT)
43433 dest_mode = SImode;
43434
43435 if (dest_mode != QImode)
43436 {
43437 rtx destqi = gen_reg_rtx (QImode);
43438 emit_insn (gen_rtx_SET (destqi, x));
43439
43440 if (TARGET_ZERO_EXTEND_WITH_AND
43441 && optimize_function_for_speed_p (cfun))
43442 {
43443 x = force_reg (dest_mode, const0_rtx);
43444
43445 emit_insn (gen_movstrictqi
43446 (gen_lowpart (QImode, x), destqi));
43447 }
43448 else
43449 x = gen_rtx_ZERO_EXTEND (dest_mode, destqi);
43450 }
43451
43452 if (dest_mode != GET_MODE (dest))
43453 {
43454 rtx tmp = gen_reg_rtx (SImode);
43455
43456 emit_insn (gen_rtx_SET (tmp, x));
43457 emit_insn (gen_zero_extendsidi2 (dest, tmp));
43458 }
43459 else
43460 emit_insn (gen_rtx_SET (dest, x));
43461 }
43462 rtx_insn *seq = get_insns ();
43463 end_sequence ();
43464
43465 if (saw_asm_flag)
43466 return seq;
43467 else
43468 {
43469 /* If we had no asm flag outputs, clobber the flags. */
43470 clobbers.safe_push (gen_rtx_REG (CCmode, FLAGS_REG));
43471 SET_HARD_REG_BIT (clobbered_regs, FLAGS_REG);
43472 return NULL;
43473 }
43474 }
43475
43476 /* Implements target vector targetm.asm.encode_section_info. */
43477
43478 static void ATTRIBUTE_UNUSED
43479 ix86_encode_section_info (tree decl, rtx rtl, int first)
43480 {
43481 default_encode_section_info (decl, rtl, first);
43482
43483 if (ix86_in_large_data_p (decl))
43484 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
43485 }
43486
43487 /* Worker function for REVERSE_CONDITION. */
43488
43489 enum rtx_code
43490 ix86_reverse_condition (enum rtx_code code, machine_mode mode)
43491 {
43492 return (mode == CCFPmode
43493 ? reverse_condition_maybe_unordered (code)
43494 : reverse_condition (code));
43495 }
43496
43497 /* Output code to perform an x87 FP register move, from OPERANDS[1]
43498 to OPERANDS[0]. */
43499
43500 const char *
43501 output_387_reg_move (rtx_insn *insn, rtx *operands)
43502 {
43503 if (REG_P (operands[0]))
43504 {
43505 if (REG_P (operands[1])
43506 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43507 {
43508 if (REGNO (operands[0]) == FIRST_STACK_REG)
43509 return output_387_ffreep (operands, 0);
43510 return "fstp\t%y0";
43511 }
43512 if (STACK_TOP_P (operands[0]))
43513 return "fld%Z1\t%y1";
43514 return "fst\t%y0";
43515 }
43516 else if (MEM_P (operands[0]))
43517 {
43518 gcc_assert (REG_P (operands[1]));
43519 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
43520 return "fstp%Z0\t%y0";
43521 else
43522 {
43523 /* There is no non-popping store to memory for XFmode.
43524 So if we need one, follow the store with a load. */
43525 if (GET_MODE (operands[0]) == XFmode)
43526 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
43527 else
43528 return "fst%Z0\t%y0";
43529 }
43530 }
43531 else
43532 gcc_unreachable ();
43533 }
43534
43535 /* Output code to perform a conditional jump to LABEL, if C2 flag in
43536 FP status register is set. */
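/* After fnstsw, C2 sits in bit 10 of the status word.  With SAHF the FPU
   condition bits land in EFLAGS (C2 becomes PF), so an UNORDERED test of the
   flags suffices; otherwise a TEST of 0x04 against the high byte of the
   status word checks the same bit.  */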
43537
43538 void
43539 ix86_emit_fp_unordered_jump (rtx label)
43540 {
43541 rtx reg = gen_reg_rtx (HImode);
43542 rtx temp;
43543
43544 emit_insn (gen_x86_fnstsw_1 (reg));
43545
43546 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
43547 {
43548 emit_insn (gen_x86_sahf_1 (reg));
43549
43550 temp = gen_rtx_REG (CCmode, FLAGS_REG);
43551 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
43552 }
43553 else
43554 {
43555 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
43556
43557 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
43558 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
43559 }
43560
43561 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
43562 gen_rtx_LABEL_REF (VOIDmode, label),
43563 pc_rtx);
43564 temp = gen_rtx_SET (pc_rtx, temp);
43565
43566 emit_jump_insn (temp);
43567 predict_jump (REG_BR_PROB_BASE * 10 / 100);
43568 }
43569
43570 /* Output code to perform a log1p XFmode calculation. */
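/* A sketch of the identity used below, assuming the usual x87 constants:
   fyl2xp1 computes y * log2 (x + 1) but is only specified for
   |x| < 1 - sqrt(2)/2 ~= 0.29289, which is exactly the threshold tested
   below.  With y = ln (2) (the fldln2 constant) this gives log1p (x)
   directly; for larger |x| the code falls back to fyl2x on 1 + x, i.e.
   ln (2) * log2 (1 + x) = log (1 + x).  */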
43571
43572 void ix86_emit_i387_log1p (rtx op0, rtx op1)
43573 {
43574 rtx_code_label *label1 = gen_label_rtx ();
43575 rtx_code_label *label2 = gen_label_rtx ();
43576
43577 rtx tmp = gen_reg_rtx (XFmode);
43578 rtx tmp2 = gen_reg_rtx (XFmode);
43579 rtx test;
43580
43581 emit_insn (gen_absxf2 (tmp, op1));
43582 test = gen_rtx_GE (VOIDmode, tmp,
43583 const_double_from_real_value (
43584 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
43585 XFmode));
43586 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
43587
43588 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43589 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
43590 emit_jump (label2);
43591
43592 emit_label (label1);
43593 emit_move_insn (tmp, CONST1_RTX (XFmode));
43594 emit_insn (gen_addxf3 (tmp, op1, tmp));
43595 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
43596 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
43597
43598 emit_label (label2);
43599 }
43600
43601 /* Output code to perform an i387 round calculation: round OP1 to the nearest integer, with halfway cases rounded away from zero, storing the result into OP0. */
43602 void ix86_emit_i387_round (rtx op0, rtx op1)
43603 {
43604 machine_mode inmode = GET_MODE (op1);
43605 machine_mode outmode = GET_MODE (op0);
43606 rtx e1, e2, res, tmp, tmp1, half;
43607 rtx scratch = gen_reg_rtx (HImode);
43608 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
43609 rtx_code_label *jump_label = gen_label_rtx ();
43610 rtx insn;
43611 rtx (*gen_abs) (rtx, rtx);
43612 rtx (*gen_neg) (rtx, rtx);
43613
43614 switch (inmode)
43615 {
43616 case E_SFmode:
43617 gen_abs = gen_abssf2;
43618 break;
43619 case E_DFmode:
43620 gen_abs = gen_absdf2;
43621 break;
43622 case E_XFmode:
43623 gen_abs = gen_absxf2;
43624 break;
43625 default:
43626 gcc_unreachable ();
43627 }
43628
43629 switch (outmode)
43630 {
43631 case E_SFmode:
43632 gen_neg = gen_negsf2;
43633 break;
43634 case E_DFmode:
43635 gen_neg = gen_negdf2;
43636 break;
43637 case E_XFmode:
43638 gen_neg = gen_negxf2;
43639 break;
43640 case E_HImode:
43641 gen_neg = gen_neghi2;
43642 break;
43643 case E_SImode:
43644 gen_neg = gen_negsi2;
43645 break;
43646 case E_DImode:
43647 gen_neg = gen_negdi2;
43648 break;
43649 default:
43650 gcc_unreachable ();
43651 }
43652
43653 e1 = gen_reg_rtx (inmode);
43654 e2 = gen_reg_rtx (inmode);
43655 res = gen_reg_rtx (outmode);
43656
43657 half = const_double_from_real_value (dconsthalf, inmode);
43658
43659 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
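/* Worked example (illustrative, assuming round-to-nearest in the FPU):
   for op1 = -2.5 we get fabs = 2.5, + 0.5 = 3.0, floor = 3.0, and the
   fxam sign bit tested further below then negates the result to -3.0,
   i.e. halfway cases round away from zero, unlike frndint-based rint.  */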
43660
43661 /* scratch = fxam(op1) */
43662 emit_insn (gen_rtx_SET (scratch,
43663 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
43664 UNSPEC_FXAM)));
43665 /* e1 = fabs(op1) */
43666 emit_insn (gen_abs (e1, op1));
43667
43668 /* e2 = e1 + 0.5 */
43669 half = force_reg (inmode, half);
43670 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (inmode, e1, half)));
43671
43672 /* res = floor(e2) */
43673 if (inmode != XFmode)
43674 {
43675 tmp1 = gen_reg_rtx (XFmode);
43676
43677 emit_insn (gen_rtx_SET (tmp1, gen_rtx_FLOAT_EXTEND (XFmode, e2)));
43678 }
43679 else
43680 tmp1 = e2;
43681
43682 switch (outmode)
43683 {
43684 case E_SFmode:
43685 case E_DFmode:
43686 {
43687 rtx tmp0 = gen_reg_rtx (XFmode);
43688
43689 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
43690
43691 emit_insn (gen_rtx_SET (res,
43692 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
43693 UNSPEC_TRUNC_NOOP)));
43694 }
43695 break;
43696 case E_XFmode:
43697 emit_insn (gen_frndintxf2_floor (res, tmp1));
43698 break;
43699 case E_HImode:
43700 emit_insn (gen_lfloorxfhi2 (res, tmp1));
43701 break;
43702 case E_SImode:
43703 emit_insn (gen_lfloorxfsi2 (res, tmp1));
43704 break;
43705 case E_DImode:
43706 emit_insn (gen_lfloorxfdi2 (res, tmp1));
43707 break;
43708 default:
43709 gcc_unreachable ();
43710 }
43711
43712 /* flags = signbit(a) */
43713 emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
43714
43715 /* if (flags) then res = -res */
43716 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
43717 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
43718 gen_rtx_LABEL_REF (VOIDmode, jump_label),
43719 pc_rtx);
43720 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
43721 predict_jump (REG_BR_PROB_BASE * 50 / 100);
43722 JUMP_LABEL (insn) = jump_label;
43723
43724 emit_insn (gen_neg (res, res));
43725
43726 emit_label (jump_label);
43727 LABEL_NUSES (jump_label) = 1;
43728
43729 emit_move_insn (op0, res);
43730 }
43731
43732 /* Output code to perform a Newton-Raphson approximation of a single precision
43733 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
43734
43735 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
43736 {
43737 rtx x0, x1, e0, e1;
43738
43739 x0 = gen_reg_rtx (mode);
43740 e0 = gen_reg_rtx (mode);
43741 e1 = gen_reg_rtx (mode);
43742 x1 = gen_reg_rtx (mode);
43743
43744 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
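/* Illustrative derivation (not used by the expansion itself): with
   x0 = rcp (b) = (1 - e) / b for some small relative error e, the step
     x1 = 2 * x0 - b * x0 * x0 = (1 - e*e) / b
   squares the error.  The hardware estimate is typically good to about
   12 bits (rcpps) or 14 bits (rcp14), so one step gives roughly 22-28
   correct bits, adequate for single precision after the final multiply
   by a.  */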
43745
43746 b = force_reg (mode, b);
43747
43748 /* x0 = rcp(b) estimate */
43749 if (mode == V16SFmode || mode == V8DFmode)
43750 {
43751 if (TARGET_AVX512ER)
43752 {
43753 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43754 UNSPEC_RCP28)));
43755 /* res = a * x0 */
43756 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
43757 return;
43758 }
43759 else
43760 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43761 UNSPEC_RCP14)));
43762 }
43763 else
43764 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
43765 UNSPEC_RCP)));
43766
43767 /* e0 = x0 * b */
43768 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
43769
43770 /* e0 = x0 * e0 */
43771 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
43772
43773 /* e1 = x0 + x0 */
43774 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
43775
43776 /* x1 = e1 - e0 */
43777 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
43778
43779 /* res = a * x1 */
43780 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
43781 }
43782
43783 /* Output code to perform a Newton-Raphson approximation of a
43784 single precision floating point [reciprocal] square root. */
43785
43786 void ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
43787 {
43788 rtx x0, e0, e1, e2, e3, mthree, mhalf;
43789 REAL_VALUE_TYPE r;
43790 int unspec;
43791
43792 x0 = gen_reg_rtx (mode);
43793 e0 = gen_reg_rtx (mode);
43794 e1 = gen_reg_rtx (mode);
43795 e2 = gen_reg_rtx (mode);
43796 e3 = gen_reg_rtx (mode);
43797
43798 if (TARGET_AVX512ER && mode == V16SFmode)
43799 {
43800 if (recip)
43801 /* res = rsqrt28(a) estimate */
43802 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43803 UNSPEC_RSQRT28)));
43804 else
43805 {
43806 /* x0 = rsqrt28(a) estimate */
43807 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43808 UNSPEC_RSQRT28)));
43809 /* res = rcp28(x0) estimate */
43810 emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
43811 UNSPEC_RCP28)));
43812 }
43813 return;
43814 }
43815
43816 real_from_integer (&r, VOIDmode, -3, SIGNED);
43817 mthree = const_double_from_real_value (r, SFmode);
43818
43819 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
43820 mhalf = const_double_from_real_value (r, SFmode);
43821 unspec = UNSPEC_RSQRT;
43822
43823 if (VECTOR_MODE_P (mode))
43824 {
43825 mthree = ix86_build_const_vector (mode, true, mthree);
43826 mhalf = ix86_build_const_vector (mode, true, mhalf);
43827 /* There is no 512-bit rsqrt. There is however rsqrt14. */
43828 if (GET_MODE_SIZE (mode) == 64)
43829 unspec = UNSPEC_RSQRT14;
43830 }
43831
43832 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
43833 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
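/* Illustrative derivation: the Newton-Raphson step for 1/sqrt(a) is
     x1 = x0 * (3 - a * x0 * x0) / 2
   which the formulas above rewrite as -0.5 * x0 * (a * x0 * x0 - 3),
   presumably so the negated constants -3 and -0.5 built below can be
   shared; multiplying the rsqrt refinement by a once more gives the
   sqrt variant.  */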
43834
43835 a = force_reg (mode, a);
43836
43837 /* x0 = rsqrt(a) estimate */
43838 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
43839 unspec)));
43840
43841 /* If (a == 0.0), filter out the infinite rsqrt estimate to prevent a NaN result for sqrt (0.0). */
43842 if (!recip)
43843 {
43844 rtx zero = force_reg (mode, CONST0_RTX(mode));
43845 rtx mask;
43846
43847 /* Handle masked compare. */
43848 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
43849 {
43850 mask = gen_reg_rtx (HImode);
43851 /* Imm value 0x4 corresponds to not-equal comparison. */
43852 emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
43853 emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
43854 }
43855 else
43856 {
43857 mask = gen_reg_rtx (mode);
43858 emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
43859 emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
43860 }
43861 }
43862
43863 /* e0 = x0 * a */
43864 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
43865 /* e1 = e0 * x0 */
43866 emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
43867
43868 /* e2 = e1 - 3. */
43869 mthree = force_reg (mode, mthree);
43870 emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
43871
43872 mhalf = force_reg (mode, mhalf);
43873 if (recip)
43874 /* e3 = -.5 * x0 */
43875 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
43876 else
43877 /* e3 = -.5 * e0 */
43878 emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
43879 /* ret = e2 * e3 */
43880 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
43881 }
43882
43883 #ifdef TARGET_SOLARIS
43884 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
43885
43886 static void
43887 i386_solaris_elf_named_section (const char *name, unsigned int flags,
43888 tree decl)
43889 {
43890 /* With Binutils 2.15, the "@unwind" marker must be specified on
43891 every occurrence of the ".eh_frame" section, not just the first
43892 one. */
43893 if (TARGET_64BIT
43894 && strcmp (name, ".eh_frame") == 0)
43895 {
43896 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
43897 flags & SECTION_WRITE ? "aw" : "a");
43898 return;
43899 }
43900
43901 #ifndef USE_GAS
43902 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
43903 {
43904 solaris_elf_asm_comdat_section (name, flags, decl);
43905 return;
43906 }
43907 #endif
43908
43909 default_elf_asm_named_section (name, flags, decl);
43910 }
43911 #endif /* TARGET_SOLARIS */
43912
43913 /* Return the mangling of TYPE if it is an extended fundamental type. */
43914
43915 static const char *
43916 ix86_mangle_type (const_tree type)
43917 {
43918 type = TYPE_MAIN_VARIANT (type);
43919
43920 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
43921 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
43922 return NULL;
43923
43924 switch (TYPE_MODE (type))
43925 {
43926 case E_TFmode:
43927 /* __float128 is "g". */
43928 return "g";
43929 case E_XFmode:
43930 /* "long double" or __float80 is "e". */
43931 return "e";
43932 default:
43933 return NULL;
43934 }
43935 }
43936
43937 static GTY(()) tree ix86_tls_stack_chk_guard_decl;
43938
43939 static tree
43940 ix86_stack_protect_guard (void)
43941 {
43942 if (TARGET_SSP_TLS_GUARD)
43943 {
43944 tree type_node = lang_hooks.types.type_for_mode (ptr_mode, 1);
43945 int qual = ENCODE_QUAL_ADDR_SPACE (ix86_stack_protector_guard_reg);
43946 tree type = build_qualified_type (type_node, qual);
43947 tree t;
43948
43949 if (global_options_set.x_ix86_stack_protector_guard_symbol_str)
43950 {
43951 t = ix86_tls_stack_chk_guard_decl;
43952
43953 if (t == NULL)
43954 {
43955 rtx x;
43956
43957 t = build_decl
43958 (UNKNOWN_LOCATION, VAR_DECL,
43959 get_identifier (ix86_stack_protector_guard_symbol_str),
43960 type);
43961 TREE_STATIC (t) = 1;
43962 TREE_PUBLIC (t) = 1;
43963 DECL_EXTERNAL (t) = 1;
43964 TREE_USED (t) = 1;
43965 TREE_THIS_VOLATILE (t) = 1;
43966 DECL_ARTIFICIAL (t) = 1;
43967 DECL_IGNORED_P (t) = 1;
43968
43969 /* Do not share RTL as the declaration is visible outside of
43970 the current function. */
43971 x = DECL_RTL (t);
43972 RTX_FLAG (x, used) = 1;
43973
43974 ix86_tls_stack_chk_guard_decl = t;
43975 }
43976 }
43977 else
43978 {
43979 tree asptrtype = build_pointer_type (type);
43980
43981 t = build_int_cst (asptrtype, ix86_stack_protector_guard_offset);
43982 t = build2 (MEM_REF, asptrtype, t,
43983 build_int_cst (asptrtype, 0));
43984 }
43985
43986 return t;
43987 }
43988
43989 return default_stack_protect_guard ();
43990 }
43991
43992 /* For 32-bit code we can save PIC register setup by using
43993 __stack_chk_fail_local hidden function instead of calling
43994 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
43995 register, so it is better to call __stack_chk_fail directly. */
43996
43997 static tree ATTRIBUTE_UNUSED
43998 ix86_stack_protect_fail (void)
43999 {
44000 return TARGET_64BIT
44001 ? default_external_stack_protect_fail ()
44002 : default_hidden_stack_protect_fail ();
44003 }
44004
44005 /* Select a format to encode pointers in exception handling data. CODE
44006 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
44007 true if the symbol may be affected by dynamic relocations.
44008
44009 ??? All x86 object file formats are capable of representing this.
44010 After all, the relocation needed is the same as for the call insn.
44011 Whether or not a particular assembler allows us to enter such, I
44012 guess we'll have to see. */
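/* Example of the encodings chosen below (an illustrative sketch, not
   exhaustive): for 64-bit small-model PIC code a global symbol gets
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4 (0x9b), the
   value commonly seen in readelf output for personality and LSDA
   encodings, while non-PIC small-model code simply uses
   DW_EH_PE_udata4.  */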
44013 int
44014 asm_preferred_eh_data_format (int code, int global)
44015 {
44016 if (flag_pic)
44017 {
44018 int type = DW_EH_PE_sdata8;
44019 if (!TARGET_64BIT
44020 || ix86_cmodel == CM_SMALL_PIC
44021 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
44022 type = DW_EH_PE_sdata4;
44023 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
44024 }
44025 if (ix86_cmodel == CM_SMALL
44026 || (ix86_cmodel == CM_MEDIUM && code))
44027 return DW_EH_PE_udata4;
44028 return DW_EH_PE_absptr;
44029 }
44030 \f
44031 /* Expand copysign from SIGN to the positive value ABS_VALUE
44032 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
44033 the sign-bit. */
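/* In effect RESULT = ABS_VALUE | (SIGN & sign-bit-mask); as an
   illustrative example, ABS_VALUE = 2.5 with SIGN = -0.0 yields -2.5
   (assuming ABS_VALUE really is non-negative, as the name implies).  */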
44034 static void
44035 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
44036 {
44037 machine_mode mode = GET_MODE (sign);
44038 rtx sgn = gen_reg_rtx (mode);
44039 if (mask == NULL_RTX)
44040 {
44041 machine_mode vmode;
44042
44043 if (mode == SFmode)
44044 vmode = V4SFmode;
44045 else if (mode == DFmode)
44046 vmode = V2DFmode;
44047 else
44048 vmode = mode;
44049
44050 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
44051 if (!VECTOR_MODE_P (mode))
44052 {
44053 /* We need to generate a scalar mode mask in this case. */
44054 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44055 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44056 mask = gen_reg_rtx (mode);
44057 emit_insn (gen_rtx_SET (mask, tmp));
44058 }
44059 }
44060 else
44061 mask = gen_rtx_NOT (mode, mask);
44062 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
44063 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
44064 }
44065
44066 /* Expand fabs (OP0) and return a new rtx that holds the result. The
44067 mask for masking out the sign-bit is stored in *SMASK, if that is
44068 non-null. */
44069 static rtx
44070 ix86_expand_sse_fabs (rtx op0, rtx *smask)
44071 {
44072 machine_mode vmode, mode = GET_MODE (op0);
44073 rtx xa, mask;
44074
44075 xa = gen_reg_rtx (mode);
44076 if (mode == SFmode)
44077 vmode = V4SFmode;
44078 else if (mode == DFmode)
44079 vmode = V2DFmode;
44080 else
44081 vmode = mode;
44082 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
44083 if (!VECTOR_MODE_P (mode))
44084 {
44085 /* We need to generate a scalar mode mask in this case. */
44086 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
44087 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
44088 mask = gen_reg_rtx (mode);
44089 emit_insn (gen_rtx_SET (mask, tmp));
44090 }
44091 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
44092
44093 if (smask)
44094 *smask = mask;
44095
44096 return xa;
44097 }
44098
44099 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
44100 swapping the operands if SWAP_OPERANDS is true. The expanded
44101 code is a forward jump to a newly created label in case the
44102 comparison is true. The generated label rtx is returned. */
44103 static rtx_code_label *
44104 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
44105 bool swap_operands)
44106 {
44107 bool unordered_compare = ix86_unordered_fp_compare (code);
44108 rtx_code_label *label;
44109 rtx tmp, reg;
44110
44111 if (swap_operands)
44112 std::swap (op0, op1);
44113
44114 label = gen_label_rtx ();
44115 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
44116 if (unordered_compare)
44117 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
44118 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
44119 emit_insn (gen_rtx_SET (reg, tmp));
44120 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
44121 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
44122 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
44123 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
44124 JUMP_LABEL (tmp) = label;
44125
44126 return label;
44127 }
44128
44129 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
44130 using comparison code CODE. Operands are swapped for the comparison if
44131 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
44132 static rtx
44133 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
44134 bool swap_operands)
44135 {
44136 rtx (*insn)(rtx, rtx, rtx, rtx);
44137 machine_mode mode = GET_MODE (op0);
44138 rtx mask = gen_reg_rtx (mode);
44139
44140 if (swap_operands)
44141 std::swap (op0, op1);
44142
44143 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
44144
44145 emit_insn (insn (mask, op0, op1,
44146 gen_rtx_fmt_ee (code, mode, op0, op1)));
44147 return mask;
44148 }
44149
44150 /* Generate and return a rtx of mode MODE for 2**n where n is the number
44151 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
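/* This constant is the point at which the mantissa can hold no fraction
   bits, so for |x| below it, (x + TWO52) - TWO52 rounds x to an integer
   in the current rounding mode.  Illustrative example with the default
   round-to-nearest-even: 2.5 + 2**52 rounds to 2**52 + 2, and subtracting
   2**52 leaves 2.0.  */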
44152 static rtx
44153 ix86_gen_TWO52 (machine_mode mode)
44154 {
44155 REAL_VALUE_TYPE TWO52r;
44156 rtx TWO52;
44157
44158 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
44159 TWO52 = const_double_from_real_value (TWO52r, mode);
44160 TWO52 = force_reg (mode, TWO52);
44161
44162 return TWO52;
44163 }
44164
44165 /* Expand SSE sequence for computing lround from OP1 storing
44166 into OP0. */
44167 void
44168 ix86_expand_lround (rtx op0, rtx op1)
44169 {
44170 /* C code for the stuff we're doing below:
44171 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
44172 return (long)tmp;
44173 */
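/* Why nextafter (0.5, 0.0) rather than 0.5 (illustrative reasoning):
   for the largest double just below 0.5, adding an exact 0.5 would round
   up to 1.0 and lround would wrongly return 1; adding the predecessor of
   0.5 keeps such values below 1.0.  The same trick is reused by
   ix86_expand_round below.  */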
44174 machine_mode mode = GET_MODE (op1);
44175 const struct real_format *fmt;
44176 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44177 rtx adj;
44178
44179 /* load nextafter (0.5, 0.0) */
44180 fmt = REAL_MODE_FORMAT (mode);
44181 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44182 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44183
44184 /* adj = copysign (0.5, op1) */
44185 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
44186 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
44187
44188 /* adj = op1 + adj */
44189 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
44190
44191 /* op0 = (imode)adj */
44192 expand_fix (op0, adj, 0);
44193 }
44194
44195 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
44196 into OPERAND0. */
44197 void
44198 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
44199 {
44200 /* C code for the stuff we're doing below (for do_floor):
44201 xi = (long)op1;
44202 xi -= (double)xi > op1 ? 1 : 0;
44203 return xi;
44204 */
44205 machine_mode fmode = GET_MODE (op1);
44206 machine_mode imode = GET_MODE (op0);
44207 rtx ireg, freg, tmp;
44208 rtx_code_label *label;
44209
44210 /* reg = (long)op1 */
44211 ireg = gen_reg_rtx (imode);
44212 expand_fix (ireg, op1, 0);
44213
44214 /* freg = (double)reg */
44215 freg = gen_reg_rtx (fmode);
44216 expand_float (freg, ireg, 0);
44217
44218 /* ireg = (freg > op1) ? ireg - 1 : ireg */
44219 label = ix86_expand_sse_compare_and_jump (UNLE,
44220 freg, op1, !do_floor);
44221 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
44222 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
44223 emit_move_insn (ireg, tmp);
44224
44225 emit_label (label);
44226 LABEL_NUSES (label) = 1;
44227
44228 emit_move_insn (op0, ireg);
44229 }
44230
44231 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
44232 result in OPERAND0. */
44233 void
44234 ix86_expand_rint (rtx operand0, rtx operand1)
44235 {
44236 /* C code for the stuff we're doing below:
44237 xa = fabs (operand1);
44238 if (!isless (xa, 2**52))
44239 return operand1;
44240 xa = xa + 2**52 - 2**52;
44241 return copysign (xa, operand1);
44242 */
44243 machine_mode mode = GET_MODE (operand0);
44244 rtx res, xa, TWO52, mask;
44245 rtx_code_label *label;
44246
44247 res = gen_reg_rtx (mode);
44248 emit_move_insn (res, operand1);
44249
44250 /* xa = abs (operand1) */
44251 xa = ix86_expand_sse_fabs (res, &mask);
44252
44253 /* if (!isless (xa, TWO52)) goto label; */
44254 TWO52 = ix86_gen_TWO52 (mode);
44255 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44256
44257 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44258 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44259
44260 ix86_sse_copysign_to_positive (res, xa, res, mask);
44261
44262 emit_label (label);
44263 LABEL_NUSES (label) = 1;
44264
44265 emit_move_insn (operand0, res);
44266 }
44267
44268 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44269 into OPERAND0. */
44270 void
44271 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
44272 {
44273 /* C code for the stuff we expand below.
44274 double xa = fabs (x), x2;
44275 if (!isless (xa, TWO52))
44276 return x;
44277 xa = xa + TWO52 - TWO52;
44278 x2 = copysign (xa, x);
44279 Compensate. Floor:
44280 if (x2 > x)
44281 x2 -= 1;
44282 Compensate. Ceil:
44283 if (x2 < x)
44284 x2 -= -1;
44285 return x2;
44286 */
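/* Illustrative run for do_floor and x = 2.7, assuming round-to-nearest:
   xa = 2.7, xa + TWO52 - TWO52 = 3.0, copysign gives x2 = 3.0, and since
   3.0 > 2.7 the final subtraction of 1.0 yields 2.0 = floor (2.7).  For
   ceil the constant below is -1.0, so the subtraction adds 1 when
   x2 < x.  */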
44287 machine_mode mode = GET_MODE (operand0);
44288 rtx xa, TWO52, tmp, one, res, mask;
44289 rtx_code_label *label;
44290
44291 TWO52 = ix86_gen_TWO52 (mode);
44292
44293 /* Temporary for holding the result, initialized to the input
44294 operand to ease control flow. */
44295 res = gen_reg_rtx (mode);
44296 emit_move_insn (res, operand1);
44297
44298 /* xa = abs (operand1) */
44299 xa = ix86_expand_sse_fabs (res, &mask);
44300
44301 /* if (!isless (xa, TWO52)) goto label; */
44302 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44303
44304 /* xa = xa + TWO52 - TWO52; */
44305 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44306 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
44307
44308 /* xa = copysign (xa, operand1) */
44309 ix86_sse_copysign_to_positive (xa, xa, res, mask);
44310
44311 /* generate 1.0 or -1.0 */
44312 one = force_reg (mode,
44313 const_double_from_real_value (do_floor
44314 ? dconst1 : dconstm1, mode));
44315
44316 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44317 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44318 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44319 /* We always need to subtract here to preserve signed zero. */
44320 tmp = expand_simple_binop (mode, MINUS,
44321 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44322 emit_move_insn (res, tmp);
44323
44324 emit_label (label);
44325 LABEL_NUSES (label) = 1;
44326
44327 emit_move_insn (operand0, res);
44328 }
44329
44330 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
44331 into OPERAND0. */
44332 void
44333 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
44334 {
44335 /* C code for the stuff we expand below.
44336 double xa = fabs (x), x2;
44337 if (!isless (xa, TWO52))
44338 return x;
44339 x2 = (double)(long)x;
44340 Compensate. Floor:
44341 if (x2 > x)
44342 x2 -= 1;
44343 Compensate. Ceil:
44344 if (x2 < x)
44345 x2 += 1;
44346 if (HONOR_SIGNED_ZEROS (mode))
44347 return copysign (x2, x);
44348 return x2;
44349 */
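/* Illustrative run for do_floor and x = -2.3: (long) x truncates toward
   zero to -2, so x2 = -2.0; since -2.0 > -2.3 the compensation subtracts
   1.0, giving -3.0 = floor (-2.3).  Non-negative inputs need no floor
   adjustment because truncation toward zero already matches floor
   there.  */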
44350 machine_mode mode = GET_MODE (operand0);
44351 rtx xa, xi, TWO52, tmp, one, res, mask;
44352 rtx_code_label *label;
44353
44354 TWO52 = ix86_gen_TWO52 (mode);
44355
44356 /* Temporary for holding the result, initialized to the input
44357 operand to ease control flow. */
44358 res = gen_reg_rtx (mode);
44359 emit_move_insn (res, operand1);
44360
44361 /* xa = abs (operand1) */
44362 xa = ix86_expand_sse_fabs (res, &mask);
44363
44364 /* if (!isless (xa, TWO52)) goto label; */
44365 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44366
44367 /* xa = (double)(long)x */
44368 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44369 expand_fix (xi, res, 0);
44370 expand_float (xa, xi, 0);
44371
44372 /* generate 1.0 */
44373 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44374
44375 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
44376 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
44377 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44378 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
44379 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44380 emit_move_insn (res, tmp);
44381
44382 if (HONOR_SIGNED_ZEROS (mode))
44383 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44384
44385 emit_label (label);
44386 LABEL_NUSES (label) = 1;
44387
44388 emit_move_insn (operand0, res);
44389 }
44390
44391 /* Expand SSE sequence for computing round from OPERAND1 storing
44392 into OPERAND0. Sequence that works without relying on DImode truncation
44393 via cvttsd2siq that is only available on 64bit targets. */
44394 void
44395 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
44396 {
44397 /* C code for the stuff we expand below.
44398 double xa = fabs (x), xa2, x2;
44399 if (!isless (xa, TWO52))
44400 return x;
44401 Using the absolute value and copying back sign makes
44402 -0.0 -> -0.0 correct.
44403 xa2 = xa + TWO52 - TWO52;
44404 Compensate.
44405 dxa = xa2 - xa;
44406 if (dxa <= -0.5)
44407 xa2 += 1;
44408 else if (dxa > 0.5)
44409 xa2 -= 1;
44410 x2 = copysign (xa2, x);
44411 return x2;
44412 */
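/* Why the extra dxa compensation is needed (illustrative, assuming
   round-to-nearest-even): for x = 2.5 the TWO52 trick yields xa2 = 2.0,
   so dxa = -0.5 and the "dxa <= -0.5" branch restores 3.0, matching
   round's halfway-away-from-zero semantics.  */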
44413 machine_mode mode = GET_MODE (operand0);
44414 rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
44415 rtx_code_label *label;
44416
44417 TWO52 = ix86_gen_TWO52 (mode);
44418
44419 /* Temporary for holding the result, initialized to the input
44420 operand to ease control flow. */
44421 res = gen_reg_rtx (mode);
44422 emit_move_insn (res, operand1);
44423
44424 /* xa = abs (operand1) */
44425 xa = ix86_expand_sse_fabs (res, &mask);
44426
44427 /* if (!isless (xa, TWO52)) goto label; */
44428 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44429
44430 /* xa2 = xa + TWO52 - TWO52; */
44431 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44432 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
44433
44434 /* dxa = xa2 - xa; */
44435 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
44436
44437 /* generate 0.5, 1.0 and -0.5 */
44438 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
44439 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
44440 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
44441 0, OPTAB_DIRECT);
44442
44443 /* Compensate. */
44444 tmp = gen_reg_rtx (mode);
44445 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
44446 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
44447 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44448 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44449 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
44450 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
44451 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
44452 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
44453
44454 /* res = copysign (xa2, operand1) */
44455 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
44456
44457 emit_label (label);
44458 LABEL_NUSES (label) = 1;
44459
44460 emit_move_insn (operand0, res);
44461 }
44462
44463 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44464 into OPERAND0. */
44465 void
44466 ix86_expand_trunc (rtx operand0, rtx operand1)
44467 {
44468 /* C code for SSE variant we expand below.
44469 double xa = fabs (x), x2;
44470 if (!isless (xa, TWO52))
44471 return x;
44472 x2 = (double)(long)x;
44473 if (HONOR_SIGNED_ZEROS (mode))
44474 return copysign (x2, x);
44475 return x2;
44476 */
44477 machine_mode mode = GET_MODE (operand0);
44478 rtx xa, xi, TWO52, res, mask;
44479 rtx_code_label *label;
44480
44481 TWO52 = ix86_gen_TWO52 (mode);
44482
44483 /* Temporary for holding the result, initialized to the input
44484 operand to ease control flow. */
44485 res = gen_reg_rtx (mode);
44486 emit_move_insn (res, operand1);
44487
44488 /* xa = abs (operand1) */
44489 xa = ix86_expand_sse_fabs (res, &mask);
44490
44491 /* if (!isless (xa, TWO52)) goto label; */
44492 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44493
44494 /* x = (double)(long)x */
44495 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44496 expand_fix (xi, res, 0);
44497 expand_float (res, xi, 0);
44498
44499 if (HONOR_SIGNED_ZEROS (mode))
44500 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
44501
44502 emit_label (label);
44503 LABEL_NUSES (label) = 1;
44504
44505 emit_move_insn (operand0, res);
44506 }
44507
44508 /* Expand SSE sequence for computing trunc from OPERAND1 storing
44509 into OPERAND0. */
44510 void
44511 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
44512 {
44513 machine_mode mode = GET_MODE (operand0);
44514 rtx xa, mask, TWO52, one, res, smask, tmp;
44515 rtx_code_label *label;
44516
44517 /* C code for SSE variant we expand below.
44518 double xa = fabs (x), x2;
44519 if (!isless (xa, TWO52))
44520 return x;
44521 xa2 = xa + TWO52 - TWO52;
44522 Compensate:
44523 if (xa2 > xa)
44524 xa2 -= 1.0;
44525 x2 = copysign (xa2, x);
44526 return x2;
44527 */
44528
44529 TWO52 = ix86_gen_TWO52 (mode);
44530
44531 /* Temporary for holding the result, initialized to the input
44532 operand to ease control flow. */
44533 res = gen_reg_rtx (mode);
44534 emit_move_insn (res, operand1);
44535
44536 /* xa = abs (operand1) */
44537 xa = ix86_expand_sse_fabs (res, &smask);
44538
44539 /* if (!isless (xa, TWO52)) goto label; */
44540 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44541
44542 /* res = xa + TWO52 - TWO52; */
44543 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
44544 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
44545 emit_move_insn (res, tmp);
44546
44547 /* generate 1.0 */
44548 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
44549
44550 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
44551 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
44552 emit_insn (gen_rtx_SET (mask, gen_rtx_AND (mode, mask, one)));
44553 tmp = expand_simple_binop (mode, MINUS,
44554 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
44555 emit_move_insn (res, tmp);
44556
44557 /* res = copysign (res, operand1) */
44558 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
44559
44560 emit_label (label);
44561 LABEL_NUSES (label) = 1;
44562
44563 emit_move_insn (operand0, res);
44564 }
44565
44566 /* Expand SSE sequence for computing round from OPERAND1 storing
44567 into OPERAND0. */
44568 void
44569 ix86_expand_round (rtx operand0, rtx operand1)
44570 {
44571 /* C code for the stuff we're doing below:
44572 double xa = fabs (x);
44573 if (!isless (xa, TWO52))
44574 return x;
44575 xa = (double)(long)(xa + nextafter (0.5, 0.0));
44576 return copysign (xa, x);
44577 */
44578 machine_mode mode = GET_MODE (operand0);
44579 rtx res, TWO52, xa, xi, half, mask;
44580 rtx_code_label *label;
44581 const struct real_format *fmt;
44582 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44583
44584 /* Temporary for holding the result, initialized to the input
44585 operand to ease control flow. */
44586 res = gen_reg_rtx (mode);
44587 emit_move_insn (res, operand1);
44588
44589 TWO52 = ix86_gen_TWO52 (mode);
44590 xa = ix86_expand_sse_fabs (res, &mask);
44591 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
44592
44593 /* load nextafter (0.5, 0.0) */
44594 fmt = REAL_MODE_FORMAT (mode);
44595 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44596 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44597
44598 /* xa = xa + 0.5 */
44599 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
44600 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
44601
44602 /* xa = (double)(int64_t)xa */
44603 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
44604 expand_fix (xi, xa, 0);
44605 expand_float (xa, xi, 0);
44606
44607 /* res = copysign (xa, operand1) */
44608 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
44609
44610 emit_label (label);
44611 LABEL_NUSES (label) = 1;
44612
44613 emit_move_insn (operand0, res);
44614 }
44615
44616 /* Expand SSE sequence for computing round
44617 from OP1 storing into OP0 using sse4 round insn. */
44618 void
44619 ix86_expand_round_sse4 (rtx op0, rtx op1)
44620 {
44621 machine_mode mode = GET_MODE (op0);
44622 rtx e1, e2, res, half;
44623 const struct real_format *fmt;
44624 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
44625 rtx (*gen_copysign) (rtx, rtx, rtx);
44626 rtx (*gen_round) (rtx, rtx, rtx);
44627
44628 switch (mode)
44629 {
44630 case E_SFmode:
44631 gen_copysign = gen_copysignsf3;
44632 gen_round = gen_sse4_1_roundsf2;
44633 break;
44634 case E_DFmode:
44635 gen_copysign = gen_copysigndf3;
44636 gen_round = gen_sse4_1_rounddf2;
44637 break;
44638 default:
44639 gcc_unreachable ();
44640 }
44641
44642 /* round (a) = trunc (a + copysign (0.5, a)) */
44643
44644 /* load nextafter (0.5, 0.0) */
44645 fmt = REAL_MODE_FORMAT (mode);
44646 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
44647 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
44648 half = const_double_from_real_value (pred_half, mode);
44649
44650 /* e1 = copysign (0.5, op1) */
44651 e1 = gen_reg_rtx (mode);
44652 emit_insn (gen_copysign (e1, half, op1));
44653
44654 /* e2 = op1 + e1 */
44655 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
44656
44657 /* res = trunc (e2) */
44658 res = gen_reg_rtx (mode);
44659 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
44660
44661 emit_move_insn (op0, res);
44662 }
44663 \f
44664
44665 /* Table of valid machine attributes. */
44666 static const struct attribute_spec ix86_attribute_table[] =
44667 {
44668 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
44669 affects_type_identity } */
44670 /* Stdcall attribute says callee is responsible for popping arguments
44671 if they are not variable. */
44672 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44673 true },
44674 /* Fastcall attribute says callee is responsible for popping arguments
44675 if they are not variable. */
44676 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44677 true },
44678 /* Thiscall attribute says callee is responsible for popping arguments
44679 if they are not variable. */
44680 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44681 true },
44682 /* Cdecl attribute says the callee is a normal C declaration. */
44683 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44684 true },
44685 /* Regparm attribute specifies how many integer arguments are to be
44686 passed in registers. */
44687 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
44688 true },
44689 /* Sseregparm attribute says we are using x86_64 calling conventions
44690 for FP arguments. */
44691 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
44692 true },
44693 /* The transactional memory builtins are implicitly regparm or fastcall
44694 depending on the ABI. Override the generic do-nothing attribute that
44695 these builtins were declared with. */
44696 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
44697 true },
44698 /* force_align_arg_pointer says this function realigns the stack at entry. */
44699 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
44700 false, true, true, ix86_handle_force_align_arg_pointer_attribute, false },
44701 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
44702 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
44703 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
44704 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
44705 false },
44706 #endif
44707 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44708 false },
44709 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
44710 false },
44711 #ifdef SUBTARGET_ATTRIBUTE_TABLE
44712 SUBTARGET_ATTRIBUTE_TABLE,
44713 #endif
44714 /* ms_abi and sysv_abi calling convention function attributes. */
44715 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44716 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
44717 { "ms_abi va_list", 0, 0, false, false, false, NULL, false },
44718 { "sysv_abi va_list", 0, 0, false, false, false, NULL, false },
44719 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
44720 false },
44721 { "callee_pop_aggregate_return", 1, 1, false, true, true,
44722 ix86_handle_callee_pop_aggregate_return, true },
44723 { "interrupt", 0, 0, false, true, true,
44724 ix86_handle_interrupt_attribute, false },
44725 { "no_caller_saved_registers", 0, 0, false, true, true,
44726 ix86_handle_no_caller_saved_registers_attribute, false },
44727 { "naked", 0, 0, true, false, false,
44728 ix86_handle_fndecl_attribute, false },
44729
44730 /* End element. */
44731 { NULL, 0, 0, false, false, false, NULL, false }
44732 };
44733
44734 /* Implement targetm.vectorize.builtin_vectorization_cost. */
44735 static int
44736 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
44737 tree vectype, int)
44738 {
44739 bool fp = false;
44740 machine_mode mode = TImode;
44741 int index;
44742 if (vectype != NULL)
44743 {
44744 fp = FLOAT_TYPE_P (vectype);
44745 mode = TYPE_MODE (vectype);
44746 }
44747
44748 switch (type_of_cost)
44749 {
44750 case scalar_stmt:
44751 return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
44752
44753 case scalar_load:
44754 /* Load/store costs are relative to a register move, which costs 2.
44755 Recompute them in terms of COSTS_N_INSNS so everything has the same base. */
44756 return COSTS_N_INSNS (fp ? ix86_cost->sse_load[0]
44757 : ix86_cost->int_load [2]) / 2;
44758
44759 case scalar_store:
44760 return COSTS_N_INSNS (fp ? ix86_cost->sse_store[0]
44761 : ix86_cost->int_store [2]) / 2;
44762
44763 case vector_stmt:
44764 return ix86_vec_cost (mode,
44765 fp ? ix86_cost->addss : ix86_cost->sse_op,
44766 true);
44767
44768 case vector_load:
44769 index = sse_store_index (mode);
44770 /* See PR82713 - we may end up being called on non-vector type. */
44771 if (index < 0)
44772 index = 2;
44773 return ix86_vec_cost (mode,
44774 COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
44775 true);
44776
44777 case vector_store:
44778 index = sse_store_index (mode);
44779 /* See PR82713 - we may end up being called on non-vector type. */
44780 if (index < 0)
44781 index = 2;
44782 return ix86_vec_cost (mode,
44783 COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
44784 true);
44785
44786 case vec_to_scalar:
44787 case scalar_to_vec:
44788 return ix86_vec_cost (mode, ix86_cost->sse_op, true);
44789
44790 /* We should have separate costs for unaligned loads and gather/scatter.
44791 Do that incrementally. */
44792 case unaligned_load:
44793 index = sse_store_index (mode);
44794 /* See PR82713 - we may end up being called on non-vector type. */
44795 if (index < 0)
44796 index = 2;
44797 return ix86_vec_cost (mode,
44798 COSTS_N_INSNS
44799 (ix86_cost->sse_unaligned_load[index]) / 2,
44800 true);
44801
44802 case unaligned_store:
44803 index = sse_store_index (mode);
44804 /* See PR82713 - we may end up being called on non-vector type. */
44805 if (index < 0)
44806 index = 2;
44807 return ix86_vec_cost (mode,
44808 COSTS_N_INSNS
44809 (ix86_cost->sse_unaligned_store[index]) / 2,
44810 true);
44811
44812 case vector_gather_load:
44813 return ix86_vec_cost (mode,
44814 COSTS_N_INSNS
44815 (ix86_cost->gather_static
44816 + ix86_cost->gather_per_elt
44817 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44818 true);
44819
44820 case vector_scatter_store:
44821 return ix86_vec_cost (mode,
44822 COSTS_N_INSNS
44823 (ix86_cost->scatter_static
44824 + ix86_cost->scatter_per_elt
44825 * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
44826 true);
44827
44828 case cond_branch_taken:
44829 return ix86_cost->cond_taken_branch_cost;
44830
44831 case cond_branch_not_taken:
44832 return ix86_cost->cond_not_taken_branch_cost;
44833
44834 case vec_perm:
44835 case vec_promote_demote:
44836 return ix86_vec_cost (mode,
44837 ix86_cost->sse_op, true);
44838
44839 case vec_construct:
44840 return ix86_vec_cost (mode, ix86_cost->sse_op, false);
44841
44842 default:
44843 gcc_unreachable ();
44844 }
44845 }
44846
44847 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
44848 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
44849 insn every time. */
44850
44851 static GTY(()) rtx_insn *vselect_insn;
44852
44853 /* Initialize vselect_insn. */
44854
44855 static void
44856 init_vselect_insn (void)
44857 {
44858 unsigned i;
44859 rtx x;
44860
44861 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
44862 for (i = 0; i < MAX_VECT_LEN; ++i)
44863 XVECEXP (x, 0, i) = const0_rtx;
44864 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
44865 const0_rtx), x);
44866 x = gen_rtx_SET (const0_rtx, x);
44867 start_sequence ();
44868 vselect_insn = emit_insn (x);
44869 end_sequence ();
44870 }
44871
44872 /* Construct (set target (vec_select op0 (parallel perm))) and
44873 return true if that's a valid instruction in the active ISA. */
44874
44875 static bool
44876 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
44877 unsigned nelt, bool testing_p)
44878 {
44879 unsigned int i;
44880 rtx x, save_vconcat;
44881 int icode;
44882
44883 if (vselect_insn == NULL_RTX)
44884 init_vselect_insn ();
44885
44886 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
44887 PUT_NUM_ELEM (XVEC (x, 0), nelt);
44888 for (i = 0; i < nelt; ++i)
44889 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
44890 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44891 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
44892 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
44893 SET_DEST (PATTERN (vselect_insn)) = target;
44894 icode = recog_memoized (vselect_insn);
44895
44896 if (icode >= 0 && !testing_p)
44897 emit_insn (copy_rtx (PATTERN (vselect_insn)));
44898
44899 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
44900 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
44901 INSN_CODE (vselect_insn) = -1;
44902
44903 return icode >= 0;
44904 }
44905
44906 /* Similar, but generate a vec_concat from op0 and op1 as well. */
44907
44908 static bool
44909 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
44910 const unsigned char *perm, unsigned nelt,
44911 bool testing_p)
44912 {
44913 machine_mode v2mode;
44914 rtx x;
44915 bool ok;
44916
44917 if (vselect_insn == NULL_RTX)
44918 init_vselect_insn ();
44919
44920 if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
44921 return false;
44922 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
44923 PUT_MODE (x, v2mode);
44924 XEXP (x, 0) = op0;
44925 XEXP (x, 1) = op1;
44926 ok = expand_vselect (target, x, perm, nelt, testing_p);
44927 XEXP (x, 0) = const0_rtx;
44928 XEXP (x, 1) = const0_rtx;
44929 return ok;
44930 }
44931
44932 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
44933 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
44934
44935 static bool
44936 expand_vec_perm_blend (struct expand_vec_perm_d *d)
44937 {
44938 machine_mode mmode, vmode = d->vmode;
44939 unsigned i, mask, nelt = d->nelt;
44940 rtx target, op0, op1, maskop, x;
44941 rtx rperm[32], vperm;
44942
44943 if (d->one_operand_p)
44944 return false;
44945 if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
44946 && (TARGET_AVX512BW
44947 || GET_MODE_UNIT_SIZE (vmode) >= 4))
44948 ;
44949 else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
44950 ;
44951 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
44952 ;
44953 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
44954 ;
44955 else
44956 return false;
44957
44958 /* This is a blend, not a permute. Elements must stay in their
44959 respective lanes. */
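/* Worked example (illustrative): for V4SFmode with d->perm = {0, 5, 2, 7},
   element i is taken from op1 exactly when perm[i] >= nelt, so the
   V4SFmode case of the switch below builds mask = 0b1010, which should
   correspond to a blendps immediate of 0xA.  */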
44960 for (i = 0; i < nelt; ++i)
44961 {
44962 unsigned e = d->perm[i];
44963 if (!(e == i || e == i + nelt))
44964 return false;
44965 }
44966
44967 if (d->testing_p)
44968 return true;
44969
44970 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
44971 decision should be extracted elsewhere, so that we only try that
44972 sequence once all budget==3 options have been tried. */
44973 target = d->target;
44974 op0 = d->op0;
44975 op1 = d->op1;
44976 mask = 0;
44977
44978 switch (vmode)
44979 {
44980 case E_V8DFmode:
44981 case E_V16SFmode:
44982 case E_V4DFmode:
44983 case E_V8SFmode:
44984 case E_V2DFmode:
44985 case E_V4SFmode:
44986 case E_V8HImode:
44987 case E_V8SImode:
44988 case E_V32HImode:
44989 case E_V64QImode:
44990 case E_V16SImode:
44991 case E_V8DImode:
44992 for (i = 0; i < nelt; ++i)
44993 mask |= (d->perm[i] >= nelt) << i;
44994 break;
44995
44996 case E_V2DImode:
44997 for (i = 0; i < 2; ++i)
44998 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
44999 vmode = V8HImode;
45000 goto do_subreg;
45001
45002 case E_V4SImode:
45003 for (i = 0; i < 4; ++i)
45004 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45005 vmode = V8HImode;
45006 goto do_subreg;
45007
45008 case E_V16QImode:
45009 /* See if bytes move in pairs so we can use pblendw with
45010 an immediate argument, rather than pblendvb with a vector
45011 argument. */
45012 for (i = 0; i < 16; i += 2)
45013 if (d->perm[i] + 1 != d->perm[i + 1])
45014 {
45015 use_pblendvb:
45016 for (i = 0; i < nelt; ++i)
45017 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
45018
45019 finish_pblendvb:
45020 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
45021 vperm = force_reg (vmode, vperm);
45022
45023 if (GET_MODE_SIZE (vmode) == 16)
45024 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
45025 else
45026 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
45027 if (target != d->target)
45028 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45029 return true;
45030 }
45031
45032 for (i = 0; i < 8; ++i)
45033 mask |= (d->perm[i * 2] >= 16) << i;
45034 vmode = V8HImode;
45035 /* FALLTHRU */
45036
45037 do_subreg:
45038 target = gen_reg_rtx (vmode);
45039 op0 = gen_lowpart (vmode, op0);
45040 op1 = gen_lowpart (vmode, op1);
45041 break;
45042
45043 case E_V32QImode:
45044 /* See if bytes move in pairs. If not, vpblendvb must be used. */
45045 for (i = 0; i < 32; i += 2)
45046 if (d->perm[i] + 1 != d->perm[i + 1])
45047 goto use_pblendvb;
45048 /* See if bytes move in quadruplets. If yes, vpblendd
45049 with immediate can be used. */
45050 for (i = 0; i < 32; i += 4)
45051 if (d->perm[i] + 2 != d->perm[i + 2])
45052 break;
45053 if (i < 32)
45054 {
45055 /* See if bytes move the same in both lanes. If yes,
45056 vpblendw with immediate can be used. */
45057 for (i = 0; i < 16; i += 2)
45058 if (d->perm[i] + 16 != d->perm[i + 16])
45059 goto use_pblendvb;
45060
45061 /* Use vpblendw. */
45062 for (i = 0; i < 16; ++i)
45063 mask |= (d->perm[i * 2] >= 32) << i;
45064 vmode = V16HImode;
45065 goto do_subreg;
45066 }
45067
45068 /* Use vpblendd. */
45069 for (i = 0; i < 8; ++i)
45070 mask |= (d->perm[i * 4] >= 32) << i;
45071 vmode = V8SImode;
45072 goto do_subreg;
45073
45074 case E_V16HImode:
45075 /* See if words move in pairs. If yes, vpblendd can be used. */
45076 for (i = 0; i < 16; i += 2)
45077 if (d->perm[i] + 1 != d->perm[i + 1])
45078 break;
45079 if (i < 16)
45080 {
45081 /* See if words move the same in both lanes. If not,
45082 vpblendvb must be used. */
45083 for (i = 0; i < 8; i++)
45084 if (d->perm[i] + 8 != d->perm[i + 8])
45085 {
45086 /* Use vpblendvb. */
45087 for (i = 0; i < 32; ++i)
45088 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
45089
45090 vmode = V32QImode;
45091 nelt = 32;
45092 target = gen_reg_rtx (vmode);
45093 op0 = gen_lowpart (vmode, op0);
45094 op1 = gen_lowpart (vmode, op1);
45095 goto finish_pblendvb;
45096 }
45097
45098 /* Use vpblendw. */
45099 for (i = 0; i < 16; ++i)
45100 mask |= (d->perm[i] >= 16) << i;
45101 break;
45102 }
45103
45104 /* Use vpblendd. */
45105 for (i = 0; i < 8; ++i)
45106 mask |= (d->perm[i * 2] >= 16) << i;
45107 vmode = V8SImode;
45108 goto do_subreg;
45109
45110 case E_V4DImode:
45111 /* Use vpblendd. */
45112 for (i = 0; i < 4; ++i)
45113 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
45114 vmode = V8SImode;
45115 goto do_subreg;
45116
45117 default:
45118 gcc_unreachable ();
45119 }
45120
45121 switch (vmode)
45122 {
45123 case E_V8DFmode:
45124 case E_V8DImode:
45125 mmode = QImode;
45126 break;
45127 case E_V16SFmode:
45128 case E_V16SImode:
45129 mmode = HImode;
45130 break;
45131 case E_V32HImode:
45132 mmode = SImode;
45133 break;
45134 case E_V64QImode:
45135 mmode = DImode;
45136 break;
45137 default:
45138 mmode = VOIDmode;
45139 }
45140
45141 if (mmode != VOIDmode)
45142 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
45143 else
45144 maskop = GEN_INT (mask);
45145
45146 /* This matches five different patterns with the different modes. */
45147 x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
45148 x = gen_rtx_SET (target, x);
45149 emit_insn (x);
45150 if (target != d->target)
45151 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45152
45153 return true;
45154 }
45155
45156 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
45157 in terms of the variable form of vpermilps.
45158
45159 Note that we will have already failed the immediate input vpermilps,
45160 which requires that the high and low part shuffle be identical; the
45161 variable form doesn't require that. */
45162
45163 static bool
45164 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
45165 {
45166 rtx rperm[8], vperm;
45167 unsigned i;
45168
45169 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
45170 return false;
45171
45172 /* We can only permute within the 128-bit lane. */
45173 for (i = 0; i < 8; ++i)
45174 {
45175 unsigned e = d->perm[i];
45176 if (i < 4 ? e >= 4 : e < 4)
45177 return false;
45178 }
45179
45180 if (d->testing_p)
45181 return true;
45182
45183 for (i = 0; i < 8; ++i)
45184 {
45185 unsigned e = d->perm[i];
45186
45187 /* Within each 128-bit lane, the elements of op0 are numbered
45188 from 0 and the elements of op1 are numbered from 4. */
45189 if (e >= 8 + 4)
45190 e -= 8;
45191 else if (e >= 4)
45192 e -= 4;
45193
45194 rperm[i] = GEN_INT (e);
45195 }
45196
45197 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
45198 vperm = force_reg (V8SImode, vperm);
45199 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
45200
45201 return true;
45202 }
45203
45204 /* Return true if permutation D can be performed as VMODE permutation
45205 instead. */
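/* Illustrative example: the V16QImode permutation {2, 3, 0, 1, 6, 7, 4, 5,
   10, 11, 8, 9, 14, 15, 12, 13} moves bytes only in aligned pairs, so it
   is also expressible as the V8HImode permutation {1, 0, 3, 2, 5, 4, 7, 6}
   and this function would accept VMODE == V8HImode for it.  */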
45206
45207 static bool
45208 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
45209 {
45210 unsigned int i, j, chunk;
45211
45212 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
45213 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
45214 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
45215 return false;
45216
45217 if (GET_MODE_NUNITS (vmode) >= d->nelt)
45218 return true;
45219
45220 chunk = d->nelt / GET_MODE_NUNITS (vmode);
45221 for (i = 0; i < d->nelt; i += chunk)
45222 if (d->perm[i] & (chunk - 1))
45223 return false;
45224 else
45225 for (j = 1; j < chunk; ++j)
45226 if (d->perm[i] + j != d->perm[i + j])
45227 return false;
45228
45229 return true;
45230 }
45231
45232 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
45233 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
45234
45235 static bool
45236 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
45237 {
45238 unsigned i, nelt, eltsz, mask;
45239 unsigned char perm[64];
45240 machine_mode vmode = V16QImode;
45241 rtx rperm[64], vperm, target, op0, op1;
45242
45243 nelt = d->nelt;
45244
45245 if (!d->one_operand_p)
45246 {
45247 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
45248 {
45249 if (TARGET_AVX2
45250 && valid_perm_using_mode_p (V2TImode, d))
45251 {
45252 if (d->testing_p)
45253 return true;
45254
45255 /* Use vperm2i128 insn. The pattern uses
45256 V4DImode instead of V2TImode. */
45257 target = d->target;
45258 if (d->vmode != V4DImode)
45259 target = gen_reg_rtx (V4DImode);
45260 op0 = gen_lowpart (V4DImode, d->op0);
45261 op1 = gen_lowpart (V4DImode, d->op1);
45262 rperm[0]
45263 = GEN_INT ((d->perm[0] / (nelt / 2))
45264 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
45265 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
45266 if (target != d->target)
45267 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45268 return true;
45269 }
45270 return false;
45271 }
45272 }
45273 else
45274 {
45275 if (GET_MODE_SIZE (d->vmode) == 16)
45276 {
45277 if (!TARGET_SSSE3)
45278 return false;
45279 }
45280 else if (GET_MODE_SIZE (d->vmode) == 32)
45281 {
45282 if (!TARGET_AVX2)
45283 return false;
45284
45285 /* V4DImode should be already handled through
45286 expand_vselect by vpermq instruction. */
45287 gcc_assert (d->vmode != V4DImode);
45288
45289 vmode = V32QImode;
45290 if (d->vmode == V8SImode
45291 || d->vmode == V16HImode
45292 || d->vmode == V32QImode)
45293 {
45294 /* First see if vpermq can be used for
45295 V8SImode/V16HImode/V32QImode. */
45296 if (valid_perm_using_mode_p (V4DImode, d))
45297 {
45298 for (i = 0; i < 4; i++)
45299 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
45300 if (d->testing_p)
45301 return true;
45302 target = gen_reg_rtx (V4DImode);
45303 if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
45304 perm, 4, false))
45305 {
45306 emit_move_insn (d->target,
45307 gen_lowpart (d->vmode, target));
45308 return true;
45309 }
45310 return false;
45311 }
45312
45313 /* Next see if vpermd can be used. */
45314 if (valid_perm_using_mode_p (V8SImode, d))
45315 vmode = V8SImode;
45316 }
45317 /* Or if vpermps can be used. */
45318 else if (d->vmode == V8SFmode)
45319 vmode = V8SImode;
45320
45321 if (vmode == V32QImode)
45322 {
45323 /* vpshufb only works within a lane; it is not
45324 possible to shuffle bytes between the lanes. */
45325 for (i = 0; i < nelt; ++i)
45326 if ((d->perm[i] ^ i) & (nelt / 2))
45327 return false;
45328 }
45329 }
45330 else if (GET_MODE_SIZE (d->vmode) == 64)
45331 {
45332 if (!TARGET_AVX512BW)
45333 return false;
45334
45335 /* If vpermq didn't work, vpshufb won't work either. */
45336 if (d->vmode == V8DFmode || d->vmode == V8DImode)
45337 return false;
45338
45339 vmode = V64QImode;
45340 if (d->vmode == V16SImode
45341 || d->vmode == V32HImode
45342 || d->vmode == V64QImode)
45343 {
45344 /* First see if vpermq can be used for
45345 V16SImode/V32HImode/V64QImode. */
45346 if (valid_perm_using_mode_p (V8DImode, d))
45347 {
45348 for (i = 0; i < 8; i++)
45349 perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
45350 if (d->testing_p)
45351 return true;
45352 target = gen_reg_rtx (V8DImode);
45353 if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
45354 perm, 8, false))
45355 {
45356 emit_move_insn (d->target,
45357 gen_lowpart (d->vmode, target));
45358 return true;
45359 }
45360 return false;
45361 }
45362
45363 /* Next see if vpermd can be used. */
45364 if (valid_perm_using_mode_p (V16SImode, d))
45365 vmode = V16SImode;
45366 }
45367 /* Or if vpermps can be used. */
45368 else if (d->vmode == V16SFmode)
45369 vmode = V16SImode;
45370 if (vmode == V64QImode)
45371 {
45372 	      /* vpshufb only works within 128-bit lanes; it is not
45373 		 possible to shuffle bytes between the lanes.  */
45374 for (i = 0; i < nelt; ++i)
45375 if ((d->perm[i] ^ i) & (nelt / 4))
45376 return false;
45377 }
45378 }
45379 else
45380 return false;
45381 }
45382
45383 if (d->testing_p)
45384 return true;
45385
45386 if (vmode == V8SImode)
45387 for (i = 0; i < 8; ++i)
45388 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
45389 else if (vmode == V16SImode)
45390 for (i = 0; i < 16; ++i)
45391 rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
45392 else
45393 {
45394 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
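      /* The pshufb/vpshufb byte selector only addresses bytes within its own
	 128-bit lane, so for the 256-bit and 512-bit one-operand cases
	 restrict E to a lane-local element index; the two-operand XOP vpperm
	 case can address all 2 * nelt elements directly.  */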
45395 if (!d->one_operand_p)
45396 mask = 2 * nelt - 1;
45397 else if (vmode == V16QImode)
45398 mask = nelt - 1;
45399 else if (vmode == V64QImode)
45400 mask = nelt / 4 - 1;
45401 else
45402 mask = nelt / 2 - 1;
45403
45404 for (i = 0; i < nelt; ++i)
45405 {
45406 unsigned j, e = d->perm[i] & mask;
45407 for (j = 0; j < eltsz; ++j)
45408 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
45409 }
45410 }
45411
45412 vperm = gen_rtx_CONST_VECTOR (vmode,
45413 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
45414 vperm = force_reg (vmode, vperm);
45415
45416 target = d->target;
45417 if (d->vmode != vmode)
45418 target = gen_reg_rtx (vmode);
45419 op0 = gen_lowpart (vmode, d->op0);
45420 if (d->one_operand_p)
45421 {
45422 if (vmode == V16QImode)
45423 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
45424 else if (vmode == V32QImode)
45425 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
45426 else if (vmode == V64QImode)
45427 emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
45428 else if (vmode == V8SFmode)
45429 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
45430 else if (vmode == V8SImode)
45431 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
45432 else if (vmode == V16SFmode)
45433 emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
45434 else if (vmode == V16SImode)
45435 emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
45436 else
45437 gcc_unreachable ();
45438 }
45439 else
45440 {
45441 op1 = gen_lowpart (vmode, d->op1);
45442 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
45443 }
45444 if (target != d->target)
45445 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
45446
45447 return true;
45448 }
45449
45450 /* For V*[QHS]Imode permutations, check whether the same permutation can
45451    be performed in a 2x, 4x or 8x wider inner mode; if so, describe it in *ND.  */
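/* For example, the V16QImode permutation
   { 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13 } pairs up into the V8HImode
   permutation { 1 0 3 2 5 4 7 6 }; the recursion stops there because that
   permutation no longer pairs up into adjacent even/odd elements.  */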
45452
45453 static bool
45454 canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
45455 struct expand_vec_perm_d *nd)
45456 {
45457 int i;
45458 machine_mode mode = VOIDmode;
45459
45460 switch (d->vmode)
45461 {
45462 case E_V16QImode: mode = V8HImode; break;
45463 case E_V32QImode: mode = V16HImode; break;
45464 case E_V64QImode: mode = V32HImode; break;
45465 case E_V8HImode: mode = V4SImode; break;
45466 case E_V16HImode: mode = V8SImode; break;
45467 case E_V32HImode: mode = V16SImode; break;
45468 case E_V4SImode: mode = V2DImode; break;
45469 case E_V8SImode: mode = V4DImode; break;
45470 case E_V16SImode: mode = V8DImode; break;
45471 default: return false;
45472 }
45473 for (i = 0; i < d->nelt; i += 2)
45474 if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
45475 return false;
45476 nd->vmode = mode;
45477 nd->nelt = d->nelt / 2;
45478 for (i = 0; i < nd->nelt; i++)
45479 nd->perm[i] = d->perm[2 * i] / 2;
45480 if (GET_MODE_INNER (mode) != DImode)
45481 canonicalize_vector_int_perm (nd, nd);
45482 if (nd != d)
45483 {
45484 nd->one_operand_p = d->one_operand_p;
45485 nd->testing_p = d->testing_p;
45486 if (d->op0 == d->op1)
45487 nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
45488 else
45489 {
45490 nd->op0 = gen_lowpart (nd->vmode, d->op0);
45491 nd->op1 = gen_lowpart (nd->vmode, d->op1);
45492 }
45493 if (d->testing_p)
45494 nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
45495 else
45496 nd->target = gen_reg_rtx (nd->vmode);
45497 }
45498 return true;
45499 }
45500
45501 /* Try to expand a one-operand permutation with a constant mask.  */
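/* For instance, a one-operand V16SImode permutation with an arbitrary
   selector becomes a single vpermd, with the selector materialized as a
   constant V16SImode vector forced into a register.  */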
45502
45503 static bool
45504 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
45505 {
45506 machine_mode mode = GET_MODE (d->op0);
45507 machine_mode maskmode = mode;
45508 rtx (*gen) (rtx, rtx, rtx) = NULL;
45509 rtx target, op0, mask;
45510 rtx vec[64];
45511
45512 if (!rtx_equal_p (d->op0, d->op1))
45513 return false;
45514
45515 if (!TARGET_AVX512F)
45516 return false;
45517
45518 switch (mode)
45519 {
45520 case E_V16SImode:
45521 gen = gen_avx512f_permvarv16si;
45522 break;
45523 case E_V16SFmode:
45524 gen = gen_avx512f_permvarv16sf;
45525 maskmode = V16SImode;
45526 break;
45527 case E_V8DImode:
45528 gen = gen_avx512f_permvarv8di;
45529 break;
45530 case E_V8DFmode:
45531 gen = gen_avx512f_permvarv8df;
45532 maskmode = V8DImode;
45533 break;
45534 default:
45535 return false;
45536 }
45537
45538 target = d->target;
45539 op0 = d->op0;
45540 for (int i = 0; i < d->nelt; ++i)
45541 vec[i] = GEN_INT (d->perm[i]);
45542 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
45543 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
45544 return true;
45545 }
45546
45547 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
45548 in a single instruction. */
45549
45550 static bool
45551 expand_vec_perm_1 (struct expand_vec_perm_d *d)
45552 {
45553 unsigned i, nelt = d->nelt;
45554 struct expand_vec_perm_d nd;
45555
45556 /* Check plain VEC_SELECT first, because AVX has instructions that could
45557 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
45558 input where SEL+CONCAT may not. */
45559 if (d->one_operand_p)
45560 {
45561 int mask = nelt - 1;
45562 bool identity_perm = true;
45563 bool broadcast_perm = true;
45564
45565 for (i = 0; i < nelt; i++)
45566 {
45567 nd.perm[i] = d->perm[i] & mask;
45568 if (nd.perm[i] != i)
45569 identity_perm = false;
45570 if (nd.perm[i])
45571 broadcast_perm = false;
45572 }
45573
45574 if (identity_perm)
45575 {
45576 if (!d->testing_p)
45577 emit_move_insn (d->target, d->op0);
45578 return true;
45579 }
45580 else if (broadcast_perm && TARGET_AVX2)
45581 {
45582 /* Use vpbroadcast{b,w,d}. */
45583 rtx (*gen) (rtx, rtx) = NULL;
45584 switch (d->vmode)
45585 {
45586 case E_V64QImode:
45587 if (TARGET_AVX512BW)
45588 gen = gen_avx512bw_vec_dupv64qi_1;
45589 break;
45590 case E_V32QImode:
45591 gen = gen_avx2_pbroadcastv32qi_1;
45592 break;
45593 case E_V32HImode:
45594 if (TARGET_AVX512BW)
45595 gen = gen_avx512bw_vec_dupv32hi_1;
45596 break;
45597 case E_V16HImode:
45598 gen = gen_avx2_pbroadcastv16hi_1;
45599 break;
45600 case E_V16SImode:
45601 if (TARGET_AVX512F)
45602 gen = gen_avx512f_vec_dupv16si_1;
45603 break;
45604 case E_V8SImode:
45605 gen = gen_avx2_pbroadcastv8si_1;
45606 break;
45607 case E_V16QImode:
45608 gen = gen_avx2_pbroadcastv16qi;
45609 break;
45610 case E_V8HImode:
45611 gen = gen_avx2_pbroadcastv8hi;
45612 break;
45613 case E_V16SFmode:
45614 if (TARGET_AVX512F)
45615 gen = gen_avx512f_vec_dupv16sf_1;
45616 break;
45617 case E_V8SFmode:
45618 gen = gen_avx2_vec_dupv8sf_1;
45619 break;
45620 case E_V8DFmode:
45621 if (TARGET_AVX512F)
45622 gen = gen_avx512f_vec_dupv8df_1;
45623 break;
45624 case E_V8DImode:
45625 if (TARGET_AVX512F)
45626 gen = gen_avx512f_vec_dupv8di_1;
45627 break;
45628 	    /* For other modes, prefer the other shuffles this function creates.  */
45629 default: break;
45630 }
45631 if (gen != NULL)
45632 {
45633 if (!d->testing_p)
45634 emit_insn (gen (d->target, d->op0));
45635 return true;
45636 }
45637 }
45638
45639 if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
45640 return true;
45641
45642 /* There are plenty of patterns in sse.md that are written for
45643 SEL+CONCAT and are not replicated for a single op. Perhaps
45644 that should be changed, to avoid the nastiness here. */
45645
45646 /* Recognize interleave style patterns, which means incrementing
45647 every other permutation operand. */
45648 for (i = 0; i < nelt; i += 2)
45649 {
45650 nd.perm[i] = d->perm[i] & mask;
45651 nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
45652 }
45653 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45654 d->testing_p))
45655 return true;
45656
45657 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
45658 if (nelt >= 4)
45659 {
45660 for (i = 0; i < nelt; i += 4)
45661 {
45662 nd.perm[i + 0] = d->perm[i + 0] & mask;
45663 nd.perm[i + 1] = d->perm[i + 1] & mask;
45664 nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
45665 nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
45666 }
45667
45668 if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
45669 d->testing_p))
45670 return true;
45671 }
45672 }
45673
45674 /* Finally, try the fully general two operand permute. */
45675 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
45676 d->testing_p))
45677 return true;
45678
45679 /* Recognize interleave style patterns with reversed operands. */
45680 if (!d->one_operand_p)
45681 {
45682 for (i = 0; i < nelt; ++i)
45683 {
45684 unsigned e = d->perm[i];
45685 if (e >= nelt)
45686 e -= nelt;
45687 else
45688 e += nelt;
45689 nd.perm[i] = e;
45690 }
45691
45692 if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
45693 d->testing_p))
45694 return true;
45695 }
45696
45697 /* Try the SSE4.1 blend variable merge instructions. */
45698 if (expand_vec_perm_blend (d))
45699 return true;
45700
45701 /* Try one of the AVX vpermil variable permutations. */
45702 if (expand_vec_perm_vpermil (d))
45703 return true;
45704
45705 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
45706 vpshufb, vpermd, vpermps or vpermq variable permutation. */
45707 if (expand_vec_perm_pshufb (d))
45708 return true;
45709
45710 /* Try the AVX2 vpalignr instruction. */
45711 if (expand_vec_perm_palignr (d, true))
45712 return true;
45713
45714 /* Try the AVX512F vperm{s,d} instructions. */
45715 if (ix86_expand_vec_one_operand_perm_avx512 (d))
45716 return true;
45717
45718 /* Try the AVX512F vpermt2/vpermi2 instructions. */
45719 if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
45720 return true;
45721
45722 /* See if we can get the same permutation in different vector integer
45723 mode. */
45724 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
45725 {
45726 if (!d->testing_p)
45727 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
45728 return true;
45729 }
45730 return false;
45731 }
45732
45733 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
45734 in terms of a pair of pshuflw + pshufhw instructions. */
45735
45736 static bool
45737 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
45738 {
45739 unsigned char perm2[MAX_VECT_LEN];
45740 unsigned i;
45741 bool ok;
45742
45743 if (d->vmode != V8HImode || !d->one_operand_p)
45744 return false;
45745
45746 /* The two permutations only operate in 64-bit lanes. */
45747 for (i = 0; i < 4; ++i)
45748 if (d->perm[i] >= 4)
45749 return false;
45750 for (i = 4; i < 8; ++i)
45751 if (d->perm[i] < 4)
45752 return false;
45753
45754 if (d->testing_p)
45755 return true;
45756
45757 /* Emit the pshuflw. */
45758 memcpy (perm2, d->perm, 4);
45759 for (i = 4; i < 8; ++i)
45760 perm2[i] = i;
45761 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
45762 gcc_assert (ok);
45763
45764 /* Emit the pshufhw. */
45765 memcpy (perm2 + 4, d->perm + 4, 4);
45766 for (i = 0; i < 4; ++i)
45767 perm2[i] = i;
45768 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
45769 gcc_assert (ok);
45770
45771 return true;
45772 }
45773
45774 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45775 the permutation using the SSSE3 palignr instruction. This succeeds
45776 when all of the elements in PERM fit within one vector and we merely
45777 need to shift them down so that a single vector permutation has a
45778    chance to succeed.  If SINGLE_INSN_ONLY_P, succeed only if
45779 the vpalignr instruction itself can perform the requested permutation. */
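/* For example, the two-operand V8HImode permutation { 3 4 5 6 7 8 9 10 }
   selects a contiguous window of the op1:op0 concatenation, so a single
   palignr shifting by 3 halfwords produces it and the residual permutation
   is the identity.  */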
45780
45781 static bool
45782 expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
45783 {
45784 unsigned i, nelt = d->nelt;
45785 unsigned min, max, minswap, maxswap;
45786 bool in_order, ok, swap = false;
45787 rtx shift, target;
45788 struct expand_vec_perm_d dcopy;
45789
45790   /* Even with AVX, palignr only operates on 128-bit vectors;
45791      with AVX2, vpalignr operates on both 128-bit lanes independently.  */
45792 if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
45793 && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
45794 return false;
45795
45796 min = 2 * nelt;
45797 max = 0;
45798 minswap = 2 * nelt;
45799 maxswap = 0;
45800 for (i = 0; i < nelt; ++i)
45801 {
45802 unsigned e = d->perm[i];
45803 unsigned eswap = d->perm[i] ^ nelt;
45804 if (GET_MODE_SIZE (d->vmode) == 32)
45805 {
45806 e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
45807 eswap = e ^ (nelt / 2);
45808 }
45809 if (e < min)
45810 min = e;
45811 if (e > max)
45812 max = e;
45813 if (eswap < minswap)
45814 minswap = eswap;
45815 if (eswap > maxswap)
45816 maxswap = eswap;
45817 }
45818 if (min == 0
45819 || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
45820 {
45821 if (d->one_operand_p
45822 || minswap == 0
45823 || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
45824 ? nelt / 2 : nelt))
45825 return false;
45826 swap = true;
45827 min = minswap;
45828 max = maxswap;
45829 }
45830
45831 /* Given that we have SSSE3, we know we'll be able to implement the
45832 single operand permutation after the palignr with pshufb for
45833 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
45834 first. */
45835 if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
45836 return true;
45837
45838 dcopy = *d;
45839 if (swap)
45840 {
45841 dcopy.op0 = d->op1;
45842 dcopy.op1 = d->op0;
45843 for (i = 0; i < nelt; ++i)
45844 dcopy.perm[i] ^= nelt;
45845 }
45846
45847 in_order = true;
45848 for (i = 0; i < nelt; ++i)
45849 {
45850 unsigned e = dcopy.perm[i];
45851 if (GET_MODE_SIZE (d->vmode) == 32
45852 && e >= nelt
45853 && (e & (nelt / 2 - 1)) < min)
45854 e = e - min - (nelt / 2);
45855 else
45856 e = e - min;
45857 if (e != i)
45858 in_order = false;
45859 dcopy.perm[i] = e;
45860 }
45861 dcopy.one_operand_p = true;
45862
45863 if (single_insn_only_p && !in_order)
45864 return false;
45865
45866 /* For AVX2, test whether we can permute the result in one instruction. */
45867 if (d->testing_p)
45868 {
45869 if (in_order)
45870 return true;
45871 dcopy.op1 = dcopy.op0;
45872 return expand_vec_perm_1 (&dcopy);
45873 }
45874
45875 shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
45876 if (GET_MODE_SIZE (d->vmode) == 16)
45877 {
45878 target = gen_reg_rtx (TImode);
45879 emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
45880 gen_lowpart (TImode, dcopy.op0), shift));
45881 }
45882 else
45883 {
45884 target = gen_reg_rtx (V2TImode);
45885 emit_insn (gen_avx2_palignrv2ti (target,
45886 gen_lowpart (V2TImode, dcopy.op1),
45887 gen_lowpart (V2TImode, dcopy.op0),
45888 shift));
45889 }
45890
45891 dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
45892
45893 /* Test for the degenerate case where the alignment by itself
45894 produces the desired permutation. */
45895 if (in_order)
45896 {
45897 emit_move_insn (d->target, dcopy.op0);
45898 return true;
45899 }
45900
45901 ok = expand_vec_perm_1 (&dcopy);
45902 gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
45903
45904 return ok;
45905 }
45906
45907 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
45908 the permutation using the SSE4_1 pblendv instruction. Potentially
45909    reduces the permutation from 2 pshufb and an ior to 1 pshufb and a pblendv.  */
45910
45911 static bool
45912 expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
45913 {
45914 unsigned i, which, nelt = d->nelt;
45915 struct expand_vec_perm_d dcopy, dcopy1;
45916 machine_mode vmode = d->vmode;
45917 bool ok;
45918
45919 /* Use the same checks as in expand_vec_perm_blend. */
45920 if (d->one_operand_p)
45921 return false;
45922 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
45923 ;
45924 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
45925 ;
45926 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
45927 ;
45928 else
45929 return false;
45930
45931   /* Figure out which permutation elements are out of place, and whether
45932      they come from the first or the second operand.  */
45933 for (i = 0, which = 0; i < nelt; ++i)
45934 {
45935 unsigned e = d->perm[i];
45936 if (e != i)
45937 which |= (e < nelt ? 1 : 2);
45938 }
45939   /* We can pblend the out-of-place part only when all of these
45940      elements come from the same operand, i.e. are all in one
45941      half of the permutation.
45942      {0 1 8 3 4 5 9 7} is ok: 8 and 9 are out of place,
45943      but both 8 and 9 are >= 8.
45944      {0 1 8 3 4 5 2 7} is not ok: 2 and 8 are out of place,
45945      and 8 >= 8 but 2 is not.  */
45946 if (which != 1 && which != 2)
45947 return false;
45948 if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
45949 return true;
45950
45951   /* First apply a one-operand permutation to the operand that
45952      supplies the out-of-place elements.  */
45953 dcopy = *d;
45954 if (which == 2)
45955 dcopy.op0 = dcopy.op1 = d->op1;
45956 else
45957 dcopy.op0 = dcopy.op1 = d->op0;
45958 if (!d->testing_p)
45959 dcopy.target = gen_reg_rtx (vmode);
45960 dcopy.one_operand_p = true;
45961
45962 for (i = 0; i < nelt; ++i)
45963 dcopy.perm[i] = d->perm[i] & (nelt - 1);
45964
45965 ok = expand_vec_perm_1 (&dcopy);
45966 if (GET_MODE_SIZE (vmode) != 16 && !ok)
45967 return false;
45968 else
45969 gcc_assert (ok);
45970 if (d->testing_p)
45971 return true;
45972
45973 /* Next we put permuted elements into their positions. */
45974 dcopy1 = *d;
45975 if (which == 2)
45976 dcopy1.op1 = dcopy.target;
45977 else
45978 dcopy1.op0 = dcopy.target;
45979
45980 for (i = 0; i < nelt; ++i)
45981 dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
45982
45983 ok = expand_vec_perm_blend (&dcopy1);
45984 gcc_assert (ok);
45985
45986 return true;
45987 }
45988
45989 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
45990
45991 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
45992 a two vector permutation into a single vector permutation by using
45993 an interleave operation to merge the vectors. */
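/* For example, the V4SImode permutation { 0 5 1 4 } uses only elements
   from the low halves of both operands, so a punpckldq merging them into
   { 0 4 1 5 } leaves just the single-operand shuffle { 0 3 2 1 } to do.  */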
45994
45995 static bool
45996 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
45997 {
45998 struct expand_vec_perm_d dremap, dfinal;
45999 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
46000 unsigned HOST_WIDE_INT contents;
46001 unsigned char remap[2 * MAX_VECT_LEN];
46002 rtx_insn *seq;
46003 bool ok, same_halves = false;
46004
46005 if (GET_MODE_SIZE (d->vmode) == 16)
46006 {
46007 if (d->one_operand_p)
46008 return false;
46009 }
46010 else if (GET_MODE_SIZE (d->vmode) == 32)
46011 {
46012 if (!TARGET_AVX)
46013 return false;
46014 /* For 32-byte modes allow even d->one_operand_p.
46015 The lack of cross-lane shuffling in some instructions
46016 might prevent a single insn shuffle. */
46017 dfinal = *d;
46018 dfinal.testing_p = true;
46019       /* If expand_vec_perm_interleave3 can expand this into
46020 	 a 3 insn sequence, give up and let it be expanded as
46021 	 a 3 insn sequence.  While that is one insn longer, it
46022 	 doesn't need a memory operand, and in the common case
46023 	 where the interleave-low and interleave-high permutations
46024 	 with the same operands are adjacent, only 4 insns are
46025 	 needed for both after CSE.  */
46026 if (expand_vec_perm_interleave3 (&dfinal))
46027 return false;
46028 }
46029 else
46030 return false;
46031
46032 /* Examine from whence the elements come. */
46033 contents = 0;
46034 for (i = 0; i < nelt; ++i)
46035 contents |= HOST_WIDE_INT_1U << d->perm[i];
46036
46037 memset (remap, 0xff, sizeof (remap));
46038 dremap = *d;
46039
46040 if (GET_MODE_SIZE (d->vmode) == 16)
46041 {
46042 unsigned HOST_WIDE_INT h1, h2, h3, h4;
46043
46044 /* Split the two input vectors into 4 halves. */
46045 h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
46046 h2 = h1 << nelt2;
46047 h3 = h2 << nelt2;
46048 h4 = h3 << nelt2;
46049
46050       /* If the elements are all from the low halves, use interleave low; similarly,
46051 	 use interleave high for the high halves.  If the elements are from mismatched
46052 	 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle.  */
46053 if ((contents & (h1 | h3)) == contents)
46054 {
46055 /* punpckl* */
46056 for (i = 0; i < nelt2; ++i)
46057 {
46058 remap[i] = i * 2;
46059 remap[i + nelt] = i * 2 + 1;
46060 dremap.perm[i * 2] = i;
46061 dremap.perm[i * 2 + 1] = i + nelt;
46062 }
46063 if (!TARGET_SSE2 && d->vmode == V4SImode)
46064 dremap.vmode = V4SFmode;
46065 }
46066 else if ((contents & (h2 | h4)) == contents)
46067 {
46068 /* punpckh* */
46069 for (i = 0; i < nelt2; ++i)
46070 {
46071 remap[i + nelt2] = i * 2;
46072 remap[i + nelt + nelt2] = i * 2 + 1;
46073 dremap.perm[i * 2] = i + nelt2;
46074 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
46075 }
46076 if (!TARGET_SSE2 && d->vmode == V4SImode)
46077 dremap.vmode = V4SFmode;
46078 }
46079 else if ((contents & (h1 | h4)) == contents)
46080 {
46081 /* shufps */
46082 for (i = 0; i < nelt2; ++i)
46083 {
46084 remap[i] = i;
46085 remap[i + nelt + nelt2] = i + nelt2;
46086 dremap.perm[i] = i;
46087 dremap.perm[i + nelt2] = i + nelt + nelt2;
46088 }
46089 if (nelt != 4)
46090 {
46091 /* shufpd */
46092 dremap.vmode = V2DImode;
46093 dremap.nelt = 2;
46094 dremap.perm[0] = 0;
46095 dremap.perm[1] = 3;
46096 }
46097 }
46098 else if ((contents & (h2 | h3)) == contents)
46099 {
46100 /* shufps */
46101 for (i = 0; i < nelt2; ++i)
46102 {
46103 remap[i + nelt2] = i;
46104 remap[i + nelt] = i + nelt2;
46105 dremap.perm[i] = i + nelt2;
46106 dremap.perm[i + nelt2] = i + nelt;
46107 }
46108 if (nelt != 4)
46109 {
46110 /* shufpd */
46111 dremap.vmode = V2DImode;
46112 dremap.nelt = 2;
46113 dremap.perm[0] = 1;
46114 dremap.perm[1] = 2;
46115 }
46116 }
46117 else
46118 return false;
46119 }
46120 else
46121 {
46122 unsigned int nelt4 = nelt / 4, nzcnt = 0;
46123 unsigned HOST_WIDE_INT q[8];
46124 unsigned int nonzero_halves[4];
46125
46126 /* Split the two input vectors into 8 quarters. */
46127 q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
46128 for (i = 1; i < 8; ++i)
46129 q[i] = q[0] << (nelt4 * i);
46130 for (i = 0; i < 4; ++i)
46131 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
46132 {
46133 nonzero_halves[nzcnt] = i;
46134 ++nzcnt;
46135 }
46136
46137 if (nzcnt == 1)
46138 {
46139 gcc_assert (d->one_operand_p);
46140 nonzero_halves[1] = nonzero_halves[0];
46141 same_halves = true;
46142 }
46143 else if (d->one_operand_p)
46144 {
46145 gcc_assert (nonzero_halves[0] == 0);
46146 gcc_assert (nonzero_halves[1] == 1);
46147 }
46148
46149 if (nzcnt <= 2)
46150 {
46151 if (d->perm[0] / nelt2 == nonzero_halves[1])
46152 {
46153 /* Attempt to increase the likelihood that dfinal
46154 shuffle will be intra-lane. */
46155 std::swap (nonzero_halves[0], nonzero_halves[1]);
46156 }
46157
46158 /* vperm2f128 or vperm2i128. */
46159 for (i = 0; i < nelt2; ++i)
46160 {
46161 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
46162 remap[i + nonzero_halves[0] * nelt2] = i;
46163 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
46164 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
46165 }
46166
46167 if (d->vmode != V8SFmode
46168 && d->vmode != V4DFmode
46169 && d->vmode != V8SImode)
46170 {
46171 dremap.vmode = V8SImode;
46172 dremap.nelt = 8;
46173 for (i = 0; i < 4; ++i)
46174 {
46175 dremap.perm[i] = i + nonzero_halves[0] * 4;
46176 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
46177 }
46178 }
46179 }
46180 else if (d->one_operand_p)
46181 return false;
46182 else if (TARGET_AVX2
46183 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
46184 {
46185 /* vpunpckl* */
46186 for (i = 0; i < nelt4; ++i)
46187 {
46188 remap[i] = i * 2;
46189 remap[i + nelt] = i * 2 + 1;
46190 remap[i + nelt2] = i * 2 + nelt2;
46191 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
46192 dremap.perm[i * 2] = i;
46193 dremap.perm[i * 2 + 1] = i + nelt;
46194 dremap.perm[i * 2 + nelt2] = i + nelt2;
46195 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
46196 }
46197 }
46198 else if (TARGET_AVX2
46199 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
46200 {
46201 /* vpunpckh* */
46202 for (i = 0; i < nelt4; ++i)
46203 {
46204 remap[i + nelt4] = i * 2;
46205 remap[i + nelt + nelt4] = i * 2 + 1;
46206 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
46207 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
46208 dremap.perm[i * 2] = i + nelt4;
46209 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
46210 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
46211 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
46212 }
46213 }
46214 else
46215 return false;
46216 }
46217
46218 /* Use the remapping array set up above to move the elements from their
46219 swizzled locations into their final destinations. */
46220 dfinal = *d;
46221 for (i = 0; i < nelt; ++i)
46222 {
46223 unsigned e = remap[d->perm[i]];
46224 gcc_assert (e < nelt);
46225 /* If same_halves is true, both halves of the remapped vector are the
46226 same. Avoid cross-lane accesses if possible. */
46227 if (same_halves && i >= nelt2)
46228 {
46229 gcc_assert (e < nelt2);
46230 dfinal.perm[i] = e + nelt2;
46231 }
46232 else
46233 dfinal.perm[i] = e;
46234 }
46235 if (!d->testing_p)
46236 {
46237 dremap.target = gen_reg_rtx (dremap.vmode);
46238 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46239 }
46240 dfinal.op1 = dfinal.op0;
46241 dfinal.one_operand_p = true;
46242
46243 /* Test if the final remap can be done with a single insn. For V4SFmode or
46244 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
46245 start_sequence ();
46246 ok = expand_vec_perm_1 (&dfinal);
46247 seq = get_insns ();
46248 end_sequence ();
46249
46250 if (!ok)
46251 return false;
46252
46253 if (d->testing_p)
46254 return true;
46255
46256 if (dremap.vmode != dfinal.vmode)
46257 {
46258 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
46259 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
46260 }
46261
46262 ok = expand_vec_perm_1 (&dremap);
46263 gcc_assert (ok);
46264
46265 emit_insn (seq);
46266 return true;
46267 }
46268
46269 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46270 a single vector cross-lane permutation into vpermq followed
46271 by any of the single insn permutations. */
46272
46273 static bool
46274 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
46275 {
46276 struct expand_vec_perm_d dremap, dfinal;
46277 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
46278 unsigned contents[2];
46279 bool ok;
46280
46281 if (!(TARGET_AVX2
46282 && (d->vmode == V32QImode || d->vmode == V16HImode)
46283 && d->one_operand_p))
46284 return false;
46285
46286 contents[0] = 0;
46287 contents[1] = 0;
46288 for (i = 0; i < nelt2; ++i)
46289 {
46290 contents[0] |= 1u << (d->perm[i] / nelt4);
46291 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
46292 }
46293
46294 for (i = 0; i < 2; ++i)
46295 {
46296 unsigned int cnt = 0;
46297 for (j = 0; j < 4; ++j)
46298 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
46299 return false;
46300 }
46301
46302 if (d->testing_p)
46303 return true;
46304
46305 dremap = *d;
46306 dremap.vmode = V4DImode;
46307 dremap.nelt = 4;
46308 dremap.target = gen_reg_rtx (V4DImode);
46309 dremap.op0 = gen_lowpart (V4DImode, d->op0);
46310 dremap.op1 = dremap.op0;
46311 dremap.one_operand_p = true;
46312 for (i = 0; i < 2; ++i)
46313 {
46314 unsigned int cnt = 0;
46315 for (j = 0; j < 4; ++j)
46316 if ((contents[i] & (1u << j)) != 0)
46317 dremap.perm[2 * i + cnt++] = j;
46318 for (; cnt < 2; ++cnt)
46319 dremap.perm[2 * i + cnt] = 0;
46320 }
46321
46322 dfinal = *d;
46323 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
46324 dfinal.op1 = dfinal.op0;
46325 dfinal.one_operand_p = true;
46326 for (i = 0, j = 0; i < nelt; ++i)
46327 {
46328 if (i == nelt2)
46329 j = 2;
46330 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
46331 if ((d->perm[i] / nelt4) == dremap.perm[j])
46332 ;
46333 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
46334 dfinal.perm[i] |= nelt4;
46335 else
46336 gcc_unreachable ();
46337 }
46338
46339 ok = expand_vec_perm_1 (&dremap);
46340 gcc_assert (ok);
46341
46342 ok = expand_vec_perm_1 (&dfinal);
46343 gcc_assert (ok);
46344
46345 return true;
46346 }
46347
46348 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
46349    a vector permutation using two instructions: vperm2f128 (or
46350    vperm2i128) followed by any single in-lane permutation.  */
46351
46352 static bool
46353 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
46354 {
46355 struct expand_vec_perm_d dfirst, dsecond;
46356 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
46357 bool ok;
46358
46359 if (!TARGET_AVX
46360 || GET_MODE_SIZE (d->vmode) != 32
46361 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
46362 return false;
46363
46364 dsecond = *d;
46365 dsecond.one_operand_p = false;
46366 dsecond.testing_p = true;
46367
46368 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
46369 immediate. For perm < 16 the second permutation uses
46370 d->op0 as first operand, for perm >= 16 it uses d->op1
46371 as first operand. The second operand is the result of
46372 vperm2[fi]128. */
46373 for (perm = 0; perm < 32; perm++)
46374 {
46375 /* Ignore permutations which do not move anything cross-lane. */
46376 if (perm < 16)
46377 {
46378 /* The second shuffle for e.g. V4DFmode has
46379 0123 and ABCD operands.
46380 Ignore AB23, as 23 is already in the second lane
46381 of the first operand. */
46382 if ((perm & 0xc) == (1 << 2)) continue;
46383 /* And 01CD, as 01 is in the first lane of the first
46384 operand. */
46385 if ((perm & 3) == 0) continue;
46386 /* And 4567, as then the vperm2[fi]128 doesn't change
46387 anything on the original 4567 second operand. */
46388 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
46389 }
46390 else
46391 {
46392 /* The second shuffle for e.g. V4DFmode has
46393 4567 and ABCD operands.
46394 Ignore AB67, as 67 is already in the second lane
46395 of the first operand. */
46396 if ((perm & 0xc) == (3 << 2)) continue;
46397 /* And 45CD, as 45 is in the first lane of the first
46398 operand. */
46399 if ((perm & 3) == 2) continue;
46400 /* And 0123, as then the vperm2[fi]128 doesn't change
46401 anything on the original 0123 first operand. */
46402 if ((perm & 0xf) == (1 << 2)) continue;
46403 }
46404
46405 for (i = 0; i < nelt; i++)
46406 {
46407 j = d->perm[i] / nelt2;
46408 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
46409 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
46410 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
46411 dsecond.perm[i] = d->perm[i] & (nelt - 1);
46412 else
46413 break;
46414 }
46415
46416 if (i == nelt)
46417 {
46418 start_sequence ();
46419 ok = expand_vec_perm_1 (&dsecond);
46420 end_sequence ();
46421 }
46422 else
46423 ok = false;
46424
46425 if (ok)
46426 {
46427 if (d->testing_p)
46428 return true;
46429
46430 /* Found a usable second shuffle. dfirst will be
46431 vperm2f128 on d->op0 and d->op1. */
46432 dsecond.testing_p = false;
46433 dfirst = *d;
46434 dfirst.target = gen_reg_rtx (d->vmode);
46435 for (i = 0; i < nelt; i++)
46436 dfirst.perm[i] = (i & (nelt2 - 1))
46437 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
46438
46439 canonicalize_perm (&dfirst);
46440 ok = expand_vec_perm_1 (&dfirst);
46441 gcc_assert (ok);
46442
46443 /* And dsecond is some single insn shuffle, taking
46444 d->op0 and result of vperm2f128 (if perm < 16) or
46445 d->op1 and result of vperm2f128 (otherwise). */
46446 if (perm >= 16)
46447 dsecond.op0 = dsecond.op1;
46448 dsecond.op1 = dfirst.target;
46449
46450 ok = expand_vec_perm_1 (&dsecond);
46451 gcc_assert (ok);
46452
46453 return true;
46454 }
46455
46456 /* For one operand, the only useful vperm2f128 permutation is 0x01
46457 aka lanes swap. */
46458 if (d->one_operand_p)
46459 return false;
46460 }
46461
46462 return false;
46463 }
46464
46465 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
46466 a two vector permutation using 2 intra-lane interleave insns
46467 and cross-lane shuffle for 32-byte vectors. */
46468
46469 static bool
46470 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
46471 {
46472 unsigned i, nelt;
46473 rtx (*gen) (rtx, rtx, rtx);
46474
46475 if (d->one_operand_p)
46476 return false;
46477 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
46478 ;
46479 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
46480 ;
46481 else
46482 return false;
46483
46484 nelt = d->nelt;
46485 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
46486 return false;
46487 for (i = 0; i < nelt; i += 2)
46488 if (d->perm[i] != d->perm[0] + i / 2
46489 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
46490 return false;
46491
46492 if (d->testing_p)
46493 return true;
46494
46495 switch (d->vmode)
46496 {
46497 case E_V32QImode:
46498 if (d->perm[0])
46499 gen = gen_vec_interleave_highv32qi;
46500 else
46501 gen = gen_vec_interleave_lowv32qi;
46502 break;
46503 case E_V16HImode:
46504 if (d->perm[0])
46505 gen = gen_vec_interleave_highv16hi;
46506 else
46507 gen = gen_vec_interleave_lowv16hi;
46508 break;
46509 case E_V8SImode:
46510 if (d->perm[0])
46511 gen = gen_vec_interleave_highv8si;
46512 else
46513 gen = gen_vec_interleave_lowv8si;
46514 break;
46515 case E_V4DImode:
46516 if (d->perm[0])
46517 gen = gen_vec_interleave_highv4di;
46518 else
46519 gen = gen_vec_interleave_lowv4di;
46520 break;
46521 case E_V8SFmode:
46522 if (d->perm[0])
46523 gen = gen_vec_interleave_highv8sf;
46524 else
46525 gen = gen_vec_interleave_lowv8sf;
46526 break;
46527 case E_V4DFmode:
46528 if (d->perm[0])
46529 gen = gen_vec_interleave_highv4df;
46530 else
46531 gen = gen_vec_interleave_lowv4df;
46532 break;
46533 default:
46534 gcc_unreachable ();
46535 }
46536
46537 emit_insn (gen (d->target, d->op0, d->op1));
46538 return true;
46539 }
46540
46541 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
46542 a single vector permutation using a single intra-lane vector
46543 permutation, vperm2f128 swapping the lanes and vblend* insn blending
46544 the non-swapped and swapped vectors together. */
46545
46546 static bool
46547 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
46548 {
46549 struct expand_vec_perm_d dfirst, dsecond;
46550 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
46551 rtx_insn *seq;
46552 bool ok;
46553 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
46554
46555 if (!TARGET_AVX
46556 || TARGET_AVX2
46557 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
46558 || !d->one_operand_p)
46559 return false;
46560
46561 dfirst = *d;
46562 for (i = 0; i < nelt; i++)
46563 dfirst.perm[i] = 0xff;
46564 for (i = 0, msk = 0; i < nelt; i++)
46565 {
46566 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
46567 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
46568 return false;
46569 dfirst.perm[j] = d->perm[i];
46570 if (j != i)
46571 msk |= (1 << i);
46572 }
46573 for (i = 0; i < nelt; i++)
46574 if (dfirst.perm[i] == 0xff)
46575 dfirst.perm[i] = i;
46576
46577 if (!d->testing_p)
46578 dfirst.target = gen_reg_rtx (dfirst.vmode);
46579
46580 start_sequence ();
46581 ok = expand_vec_perm_1 (&dfirst);
46582 seq = get_insns ();
46583 end_sequence ();
46584
46585 if (!ok)
46586 return false;
46587
46588 if (d->testing_p)
46589 return true;
46590
46591 emit_insn (seq);
46592
46593 dsecond = *d;
46594 dsecond.op0 = dfirst.target;
46595 dsecond.op1 = dfirst.target;
46596 dsecond.one_operand_p = true;
46597 dsecond.target = gen_reg_rtx (dsecond.vmode);
46598 for (i = 0; i < nelt; i++)
46599 dsecond.perm[i] = i ^ nelt2;
46600
46601 ok = expand_vec_perm_1 (&dsecond);
46602 gcc_assert (ok);
46603
46604 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
46605 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
46606 return true;
46607 }
46608
46609 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
46610 permutation using two vperm2f128, followed by a vshufpd insn blending
46611 the two vectors together. */
46612
46613 static bool
46614 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
46615 {
46616 struct expand_vec_perm_d dfirst, dsecond, dthird;
46617 bool ok;
46618
46619 if (!TARGET_AVX || (d->vmode != V4DFmode))
46620 return false;
46621
46622 if (d->testing_p)
46623 return true;
46624
46625 dfirst = *d;
46626 dsecond = *d;
46627 dthird = *d;
46628
46629 dfirst.perm[0] = (d->perm[0] & ~1);
46630 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
46631 dfirst.perm[2] = (d->perm[2] & ~1);
46632 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
46633 dsecond.perm[0] = (d->perm[1] & ~1);
46634 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
46635 dsecond.perm[2] = (d->perm[3] & ~1);
46636 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
46637 dthird.perm[0] = (d->perm[0] % 2);
46638 dthird.perm[1] = (d->perm[1] % 2) + 4;
46639 dthird.perm[2] = (d->perm[2] % 2) + 2;
46640 dthird.perm[3] = (d->perm[3] % 2) + 6;
46641
46642 dfirst.target = gen_reg_rtx (dfirst.vmode);
46643 dsecond.target = gen_reg_rtx (dsecond.vmode);
46644 dthird.op0 = dfirst.target;
46645 dthird.op1 = dsecond.target;
46646 dthird.one_operand_p = false;
46647
46648 canonicalize_perm (&dfirst);
46649 canonicalize_perm (&dsecond);
46650
46651 ok = expand_vec_perm_1 (&dfirst)
46652 && expand_vec_perm_1 (&dsecond)
46653 && expand_vec_perm_1 (&dthird);
46654
46655 gcc_assert (ok);
46656
46657 return true;
46658 }
46659
46660 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
46661 permutation with two pshufb insns and an ior. We should have already
46662 failed all two instruction sequences. */
46663
46664 static bool
46665 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
46666 {
46667 rtx rperm[2][16], vperm, l, h, op, m128;
46668 unsigned int i, nelt, eltsz;
46669
46670 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
46671 return false;
46672 gcc_assert (!d->one_operand_p);
46673
46674 if (d->testing_p)
46675 return true;
46676
46677 nelt = d->nelt;
46678 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46679
46680 /* Generate two permutation masks. If the required element is within
46681 the given vector it is shuffled into the proper lane. If the required
46682 element is in the other vector, force a zero into the lane by setting
46683 bit 7 in the permutation mask. */
46684 m128 = GEN_INT (-128);
46685 for (i = 0; i < nelt; ++i)
46686 {
46687 unsigned j, e = d->perm[i];
46688 unsigned which = (e >= nelt);
46689 if (e >= nelt)
46690 e -= nelt;
46691
46692 for (j = 0; j < eltsz; ++j)
46693 {
46694 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
46695 rperm[1-which][i*eltsz + j] = m128;
46696 }
46697 }
46698
46699 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
46700 vperm = force_reg (V16QImode, vperm);
46701
46702 l = gen_reg_rtx (V16QImode);
46703 op = gen_lowpart (V16QImode, d->op0);
46704 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
46705
46706 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
46707 vperm = force_reg (V16QImode, vperm);
46708
46709 h = gen_reg_rtx (V16QImode);
46710 op = gen_lowpart (V16QImode, d->op1);
46711 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
46712
46713 op = d->target;
46714 if (d->vmode != V16QImode)
46715 op = gen_reg_rtx (V16QImode);
46716 emit_insn (gen_iorv16qi3 (op, l, h));
46717 if (op != d->target)
46718 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46719
46720 return true;
46721 }
46722
46723 /* Implement an arbitrary permutation of one V32QImode or V16HImode operand
46724 with two vpshufb insns, vpermq and vpor. We should have already failed
46725 all two or three instruction sequences. */
46726
46727 static bool
46728 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
46729 {
46730 rtx rperm[2][32], vperm, l, h, hp, op, m128;
46731 unsigned int i, nelt, eltsz;
46732
46733 if (!TARGET_AVX2
46734 || !d->one_operand_p
46735 || (d->vmode != V32QImode && d->vmode != V16HImode))
46736 return false;
46737
46738 if (d->testing_p)
46739 return true;
46740
46741 nelt = d->nelt;
46742 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46743
46744   /* Generate two permutation masks.  If the required element is within
46745      the same lane, it is shuffled in.  If the required element is from the
46746      other lane, force a zero by setting bit 7 in the permutation mask.
46747      The other mask has a non-negative element whenever the element is
46748      requested from the other lane; that element is also moved to the other
46749      lane, so that after swapping the two V2TImode halves of the vpshufb
46750      result the element ends up in the right place.  */
46751 m128 = GEN_INT (-128);
46752 for (i = 0; i < nelt; ++i)
46753 {
46754 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46755 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
46756
46757 for (j = 0; j < eltsz; ++j)
46758 {
46759 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
46760 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
46761 }
46762 }
46763
46764 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46765 vperm = force_reg (V32QImode, vperm);
46766
46767 h = gen_reg_rtx (V32QImode);
46768 op = gen_lowpart (V32QImode, d->op0);
46769 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46770
46771   /* Swap the 128-bit lanes of h into hp.  */
46772 hp = gen_reg_rtx (V4DImode);
46773 op = gen_lowpart (V4DImode, h);
46774 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
46775 const1_rtx));
46776
46777 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46778 vperm = force_reg (V32QImode, vperm);
46779
46780 l = gen_reg_rtx (V32QImode);
46781 op = gen_lowpart (V32QImode, d->op0);
46782 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46783
46784 op = d->target;
46785 if (d->vmode != V32QImode)
46786 op = gen_reg_rtx (V32QImode);
46787 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
46788 if (op != d->target)
46789 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46790
46791 return true;
46792 }
46793
46794 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46795    and extract-odd permutations of two V32QImode or V16HImode operands
46796 with two vpshufb insns, vpor and vpermq. We should have already
46797 failed all two or three instruction sequences. */
46798
46799 static bool
46800 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
46801 {
46802 rtx rperm[2][32], vperm, l, h, ior, op, m128;
46803 unsigned int i, nelt, eltsz;
46804
46805 if (!TARGET_AVX2
46806 || d->one_operand_p
46807 || (d->vmode != V32QImode && d->vmode != V16HImode))
46808 return false;
46809
46810 for (i = 0; i < d->nelt; ++i)
46811 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
46812 return false;
46813
46814 if (d->testing_p)
46815 return true;
46816
46817 nelt = d->nelt;
46818 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
46819
46820 /* Generate two permutation masks. In the first permutation mask
46821 the first quarter will contain indexes for the first half
46822 of the op0, the second quarter will contain bit 7 set, third quarter
46823 will contain indexes for the second half of the op0 and the
46824 last quarter bit 7 set. In the second permutation mask
46825 the first quarter will contain bit 7 set, the second quarter
46826 indexes for the first half of the op1, the third quarter bit 7 set
46827 and last quarter indexes for the second half of the op1.
46828 I.e. the first mask e.g. for V32QImode extract even will be:
46829 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
46830 (all values masked with 0xf except for -128) and second mask
46831 for extract even will be
46832 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
46833 m128 = GEN_INT (-128);
46834 for (i = 0; i < nelt; ++i)
46835 {
46836 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
46837 unsigned which = d->perm[i] >= nelt;
46838 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
46839
46840 for (j = 0; j < eltsz; ++j)
46841 {
46842 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
46843 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
46844 }
46845 }
46846
46847 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
46848 vperm = force_reg (V32QImode, vperm);
46849
46850 l = gen_reg_rtx (V32QImode);
46851 op = gen_lowpart (V32QImode, d->op0);
46852 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
46853
46854 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
46855 vperm = force_reg (V32QImode, vperm);
46856
46857 h = gen_reg_rtx (V32QImode);
46858 op = gen_lowpart (V32QImode, d->op1);
46859 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
46860
46861 ior = gen_reg_rtx (V32QImode);
46862 emit_insn (gen_iorv32qi3 (ior, l, h));
46863
46864 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
46865 op = gen_reg_rtx (V4DImode);
46866 ior = gen_lowpart (V4DImode, ior);
46867 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
46868 const1_rtx, GEN_INT (3)));
46869 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
46870
46871 return true;
46872 }
46873
46874 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46875 and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
46876 with two "and" and "pack" or two "shift" and "pack" insns. We should
46877 have already failed all two instruction sequences. */
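/* For V8HImode extract-even, for instance, each operand is viewed as
   V4SImode and masked with 0xffff to keep the even halfwords, and a
   packusdw packs the two partial results into the destination; for
   extract-odd a 16-bit logical right shift replaces the mask.  */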
46878
46879 static bool
46880 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
46881 {
46882 rtx op, dop0, dop1, t;
46883 unsigned i, odd, c, s, nelt = d->nelt;
46884 bool end_perm = false;
46885 machine_mode half_mode;
46886 rtx (*gen_and) (rtx, rtx, rtx);
46887 rtx (*gen_pack) (rtx, rtx, rtx);
46888 rtx (*gen_shift) (rtx, rtx, rtx);
46889
46890 if (d->one_operand_p)
46891 return false;
46892
46893 switch (d->vmode)
46894 {
46895 case E_V8HImode:
46896 /* Required for "pack". */
46897 if (!TARGET_SSE4_1)
46898 return false;
46899 c = 0xffff;
46900 s = 16;
46901 half_mode = V4SImode;
46902 gen_and = gen_andv4si3;
46903 gen_pack = gen_sse4_1_packusdw;
46904 gen_shift = gen_lshrv4si3;
46905 break;
46906 case E_V16QImode:
46907 /* No check as all instructions are SSE2. */
46908 c = 0xff;
46909 s = 8;
46910 half_mode = V8HImode;
46911 gen_and = gen_andv8hi3;
46912 gen_pack = gen_sse2_packuswb;
46913 gen_shift = gen_lshrv8hi3;
46914 break;
46915 case E_V16HImode:
46916 if (!TARGET_AVX2)
46917 return false;
46918 c = 0xffff;
46919 s = 16;
46920 half_mode = V8SImode;
46921 gen_and = gen_andv8si3;
46922 gen_pack = gen_avx2_packusdw;
46923 gen_shift = gen_lshrv8si3;
46924 end_perm = true;
46925 break;
46926 case E_V32QImode:
46927 if (!TARGET_AVX2)
46928 return false;
46929 c = 0xff;
46930 s = 8;
46931 half_mode = V16HImode;
46932 gen_and = gen_andv16hi3;
46933 gen_pack = gen_avx2_packuswb;
46934 gen_shift = gen_lshrv16hi3;
46935 end_perm = true;
46936 break;
46937 default:
46938 /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
46939 general shuffles. */
46940 return false;
46941 }
46942
46943 /* Check that permutation is even or odd. */
46944 odd = d->perm[0];
46945 if (odd > 1)
46946 return false;
46947
46948 for (i = 1; i < nelt; ++i)
46949 if (d->perm[i] != 2 * i + odd)
46950 return false;
46951
46952 if (d->testing_p)
46953 return true;
46954
46955 dop0 = gen_reg_rtx (half_mode);
46956 dop1 = gen_reg_rtx (half_mode);
46957 if (odd == 0)
46958 {
46959 t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
46960 t = force_reg (half_mode, t);
46961 emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
46962 emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
46963 }
46964 else
46965 {
46966 emit_insn (gen_shift (dop0,
46967 gen_lowpart (half_mode, d->op0),
46968 GEN_INT (s)));
46969 emit_insn (gen_shift (dop1,
46970 gen_lowpart (half_mode, d->op1),
46971 GEN_INT (s)));
46972 }
46973   /* For the AVX2 256-bit case we need to permute the pack result.  */
46974 if (TARGET_AVX2 && end_perm)
46975 {
46976 op = gen_reg_rtx (d->vmode);
46977 t = gen_reg_rtx (V4DImode);
46978 emit_insn (gen_pack (op, dop0, dop1));
46979 emit_insn (gen_avx2_permv4di_1 (t,
46980 gen_lowpart (V4DImode, op),
46981 const0_rtx,
46982 const2_rtx,
46983 const1_rtx,
46984 GEN_INT (3)));
46985 emit_move_insn (d->target, gen_lowpart (d->vmode, t));
46986 }
46987 else
46988 emit_insn (gen_pack (d->target, dop0, dop1));
46989
46990 return true;
46991 }
46992
46993 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
46994 and extract-odd permutations of two V64QI operands
46995 with two "shifts", two "truncs" and one "concat" insns for "odd"
46996 and two "truncs" and one concat insn for "even."
46997 Have already failed all two instruction sequences. */
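/* For extract-odd, for instance, both operands are viewed as V32HImode
   and shifted right by 8 bits so that the odd bytes land in the low byte
   of each halfword; each half is then truncated to V32QImode (vpmovwb)
   and the two results are concatenated.  Extract-even skips the shift.  */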
46998
46999 static bool
47000 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
47001 {
47002 rtx t1, t2, t3, t4;
47003 unsigned i, odd, nelt = d->nelt;
47004
47005 if (!TARGET_AVX512BW
47006 || d->one_operand_p
47007 || d->vmode != V64QImode)
47008 return false;
47009
47010 /* Check that permutation is even or odd. */
47011 odd = d->perm[0];
47012 if (odd > 1)
47013 return false;
47014
47015 for (i = 1; i < nelt; ++i)
47016 if (d->perm[i] != 2 * i + odd)
47017 return false;
47018
47019 if (d->testing_p)
47020 return true;
47021
47022
47023 if (odd)
47024 {
47025 t1 = gen_reg_rtx (V32HImode);
47026 t2 = gen_reg_rtx (V32HImode);
47027 emit_insn (gen_lshrv32hi3 (t1,
47028 gen_lowpart (V32HImode, d->op0),
47029 GEN_INT (8)));
47030 emit_insn (gen_lshrv32hi3 (t2,
47031 gen_lowpart (V32HImode, d->op1),
47032 GEN_INT (8)));
47033 }
47034 else
47035 {
47036 t1 = gen_lowpart (V32HImode, d->op0);
47037 t2 = gen_lowpart (V32HImode, d->op1);
47038 }
47039
47040 t3 = gen_reg_rtx (V32QImode);
47041 t4 = gen_reg_rtx (V32QImode);
47042 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
47043 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
47044 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
47045
47046 return true;
47047 }
47048
47049 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
47050 and extract-odd permutations. */
47051
47052 static bool
47053 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
47054 {
47055 rtx t1, t2, t3, t4, t5;
47056
47057 switch (d->vmode)
47058 {
47059 case E_V4DFmode:
47060 if (d->testing_p)
47061 break;
47062 t1 = gen_reg_rtx (V4DFmode);
47063 t2 = gen_reg_rtx (V4DFmode);
47064
47065 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47066 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
47067 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
47068
47069 /* Now an unpck[lh]pd will produce the result required. */
47070 if (odd)
47071 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
47072 else
47073 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
47074 emit_insn (t3);
47075 break;
47076
47077 case E_V8SFmode:
47078 {
47079 int mask = odd ? 0xdd : 0x88;
47080
47081 if (d->testing_p)
47082 break;
47083 t1 = gen_reg_rtx (V8SFmode);
47084 t2 = gen_reg_rtx (V8SFmode);
47085 t3 = gen_reg_rtx (V8SFmode);
47086
47087 /* Shuffle within the 128-bit lanes to produce:
47088 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
47089 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
47090 GEN_INT (mask)));
47091
47092 /* Shuffle the lanes around to produce:
47093 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
47094 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
47095 GEN_INT (0x3)));
47096
47097 /* Shuffle within the 128-bit lanes to produce:
47098 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
47099 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
47100
47101 /* Shuffle within the 128-bit lanes to produce:
47102 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
47103 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
47104
47105 /* Shuffle the lanes around to produce:
47106 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
47107 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
47108 GEN_INT (0x20)));
47109 }
47110 break;
47111
47112 case E_V2DFmode:
47113 case E_V4SFmode:
47114 case E_V2DImode:
47115 case E_V4SImode:
47116 /* These are always directly implementable by expand_vec_perm_1. */
47117 gcc_unreachable ();
47118
47119 case E_V8HImode:
47120 if (TARGET_SSE4_1)
47121 return expand_vec_perm_even_odd_pack (d);
47122 else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
47123 return expand_vec_perm_pshufb2 (d);
47124 else
47125 {
47126 if (d->testing_p)
47127 break;
47128 /* We need 2*log2(N)-1 operations to achieve odd/even
47129 with interleave. */
47130 t1 = gen_reg_rtx (V8HImode);
47131 t2 = gen_reg_rtx (V8HImode);
47132 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
47133 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
47134 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
47135 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
47136 if (odd)
47137 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
47138 else
47139 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
47140 emit_insn (t3);
47141 }
47142 break;
47143
47144 case E_V16QImode:
47145 return expand_vec_perm_even_odd_pack (d);
47146
47147 case E_V16HImode:
47148 case E_V32QImode:
47149 return expand_vec_perm_even_odd_pack (d);
47150
47151 case E_V64QImode:
47152 return expand_vec_perm_even_odd_trunc (d);
47153
47154 case E_V4DImode:
47155 if (!TARGET_AVX2)
47156 {
47157 struct expand_vec_perm_d d_copy = *d;
47158 d_copy.vmode = V4DFmode;
47159 if (d->testing_p)
47160 d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
47161 else
47162 d_copy.target = gen_reg_rtx (V4DFmode);
47163 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
47164 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
47165 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47166 {
47167 if (!d->testing_p)
47168 emit_move_insn (d->target,
47169 gen_lowpart (V4DImode, d_copy.target));
47170 return true;
47171 }
47172 return false;
47173 }
47174
47175 if (d->testing_p)
47176 break;
47177
47178 t1 = gen_reg_rtx (V4DImode);
47179 t2 = gen_reg_rtx (V4DImode);
47180
47181 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
47182 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
47183 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
47184
47185       /* Now a vpunpck[lh]qdq will produce the result required.  */
47186 if (odd)
47187 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
47188 else
47189 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
47190 emit_insn (t3);
47191 break;
47192
47193 case E_V8SImode:
47194 if (!TARGET_AVX2)
47195 {
47196 struct expand_vec_perm_d d_copy = *d;
47197 d_copy.vmode = V8SFmode;
47198 if (d->testing_p)
47199 d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
47200 else
47201 d_copy.target = gen_reg_rtx (V8SFmode);
47202 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
47203 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
47204 if (expand_vec_perm_even_odd_1 (&d_copy, odd))
47205 {
47206 if (!d->testing_p)
47207 emit_move_insn (d->target,
47208 gen_lowpart (V8SImode, d_copy.target));
47209 return true;
47210 }
47211 return false;
47212 }
47213
47214 if (d->testing_p)
47215 break;
47216
47217 t1 = gen_reg_rtx (V8SImode);
47218 t2 = gen_reg_rtx (V8SImode);
47219 t3 = gen_reg_rtx (V4DImode);
47220 t4 = gen_reg_rtx (V4DImode);
47221 t5 = gen_reg_rtx (V4DImode);
47222
47223 /* Shuffle the lanes around into
47224 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
47225 emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
47226 gen_lowpart (V4DImode, d->op1),
47227 GEN_INT (0x20)));
47228 emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
47229 gen_lowpart (V4DImode, d->op1),
47230 GEN_INT (0x31)));
47231
47232 /* Swap the 2nd and 3rd position in each lane into
47233 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
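	  /* The immediate 2*4 + 1*16 + 3*64 == 0xD8 encodes the in-lane
	     element order { 0, 2, 1, 3 }.  */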
47234 emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
47235 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47236 emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
47237 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
47238
47239 /* Now a vpunpck[lh]qdq will produce
47240 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
47241 if (odd)
47242 t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
47243 gen_lowpart (V4DImode, t2));
47244 else
47245 t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
47246 gen_lowpart (V4DImode, t2));
47247 emit_insn (t3);
47248 emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
47249 break;
47250
47251 default:
47252 gcc_unreachable ();
47253 }
47254
47255 return true;
47256 }
47257
47258 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47259 extract-even and extract-odd permutations. */
47260
47261 static bool
47262 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
47263 {
47264 unsigned i, odd, nelt = d->nelt;
47265
47266 odd = d->perm[0];
47267 if (odd != 0 && odd != 1)
47268 return false;
47269
47270 for (i = 1; i < nelt; ++i)
47271 if (d->perm[i] != 2 * i + odd)
47272 return false;
47273
47274 return expand_vec_perm_even_odd_1 (d, odd);
47275 }
47276
47277 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
47278 permutations. We assume that expand_vec_perm_1 has already failed. */
47279
47280 static bool
47281 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
47282 {
47283 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
47284 machine_mode vmode = d->vmode;
47285 unsigned char perm2[4];
47286 rtx op0 = d->op0, dest;
47287 bool ok;
47288
47289 switch (vmode)
47290 {
47291 case E_V4DFmode:
47292 case E_V8SFmode:
47293 /* These are special-cased in sse.md so that we can optionally
47294 use the vbroadcast instruction. They expand to two insns
47295 if the input happens to be in a register. */
47296 gcc_unreachable ();
47297
47298 case E_V2DFmode:
47299 case E_V2DImode:
47300 case E_V4SFmode:
47301 case E_V4SImode:
47302 /* These are always implementable using standard shuffle patterns. */
47303 gcc_unreachable ();
47304
47305 case E_V8HImode:
47306 case E_V16QImode:
47307 /* These can be implemented via interleave. We save one insn by
47308 stopping once we have promoted to V4SImode and then use pshufd. */
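	  /* E.g. to broadcast element 5 of a V8HImode vector: one punpckhwd
	     of the input with itself gives { a4 a4 a5 a5 a6 a6 a7 a7 }, whose
	     V4SImode element 1 holds the two copies of a5, and the final
	     pshufd replicates that element.  */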
47309 if (d->testing_p)
47310 return true;
47311 do
47312 {
47313 rtx dest;
47314 rtx (*gen) (rtx, rtx, rtx)
47315 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
47316 : gen_vec_interleave_lowv8hi;
47317
47318 if (elt >= nelt2)
47319 {
47320 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
47321 : gen_vec_interleave_highv8hi;
47322 elt -= nelt2;
47323 }
47324 nelt2 /= 2;
47325
47326 dest = gen_reg_rtx (vmode);
47327 emit_insn (gen (dest, op0, op0));
47328 vmode = get_mode_wider_vector (vmode);
47329 op0 = gen_lowpart (vmode, dest);
47330 }
47331 while (vmode != V4SImode);
47332
47333 memset (perm2, elt, 4);
47334 dest = gen_reg_rtx (V4SImode);
47335 ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
47336 gcc_assert (ok);
47337 if (!d->testing_p)
47338 emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
47339 return true;
47340
47341 case E_V64QImode:
47342 case E_V32QImode:
47343 case E_V16HImode:
47344 case E_V8SImode:
47345 case E_V4DImode:
47346 /* For AVX2 broadcasts of the first element vpbroadcast* or
47347 vpermq should be used by expand_vec_perm_1. */
47348 gcc_assert (!TARGET_AVX2 || d->perm[0]);
47349 return false;
47350
47351 default:
47352 gcc_unreachable ();
47353 }
47354 }
47355
47356 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
47357 broadcast permutations. */
47358
47359 static bool
47360 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
47361 {
47362 unsigned i, elt, nelt = d->nelt;
47363
47364 if (!d->one_operand_p)
47365 return false;
47366
47367 elt = d->perm[0];
47368 for (i = 1; i < nelt; ++i)
47369 if (d->perm[i] != elt)
47370 return false;
47371
47372 return expand_vec_perm_broadcast_1 (d);
47373 }
47374
47375 /* Implement arbitrary permutations of two V64QImode operands
47376 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
47377 static bool
47378 expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
47379 {
47380 if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
47381 return false;
47382
47383 if (d->testing_p)
47384 return true;
47385
47386 struct expand_vec_perm_d ds[2];
47387 rtx rperm[128], vperm, target0, target1;
47388 unsigned int i, nelt;
47389 machine_mode vmode;
47390
47391 nelt = d->nelt;
47392 vmode = V64QImode;
47393
47394 for (i = 0; i < 2; i++)
47395 {
47396 ds[i] = *d;
47397 ds[i].vmode = V32HImode;
47398 ds[i].nelt = 32;
47399 ds[i].target = gen_reg_rtx (V32HImode);
47400 ds[i].op0 = gen_lowpart (V32HImode, d->op0);
47401 ds[i].op1 = gen_lowpart (V32HImode, d->op1);
47402 }
47403
47404 /* Prepare permutations such that the first one takes care of
47405 putting the even bytes into their final positions or one position
47406 higher (ds[0]) and the second one takes care of putting the odd
47407 bytes into their final positions or one position lower
47408 (ds[1]). */
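	  /* For instance, if output byte 5 should be input byte 23, then
	     ds[1].perm[2] = 11 moves input word 11 (bytes 22/23) into word
	     position 2 (bytes 4/5), and rperm[5 + 64] = (5 & 14) + (23 & 1) = 5
	     later picks the high byte of that word in the second vpshufb.  */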
47409
47410 for (i = 0; i < nelt; i++)
47411 {
47412 ds[i & 1].perm[i / 2] = d->perm[i] / 2;
47413 if (i & 1)
47414 {
47415 rperm[i] = constm1_rtx;
47416 rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47417 }
47418 else
47419 {
47420 rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
47421 rperm[i + 64] = constm1_rtx;
47422 }
47423 }
47424
47425 bool ok = expand_vec_perm_1 (&ds[0]);
47426 gcc_assert (ok);
47427 ds[0].target = gen_lowpart (V64QImode, ds[0].target);
47428
47429 ok = expand_vec_perm_1 (&ds[1]);
47430 gcc_assert (ok);
47431 ds[1].target = gen_lowpart (V64QImode, ds[1].target);
47432
47433 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
47434 vperm = force_reg (vmode, vperm);
47435 target0 = gen_reg_rtx (V64QImode);
47436 emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
47437
47438 vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
47439 vperm = force_reg (vmode, vperm);
47440 target1 = gen_reg_rtx (V64QImode);
47441 emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
47442
47443 emit_insn (gen_iorv64qi3 (d->target, target0, target1));
47444 return true;
47445 }
47446
47447 /* Implement arbitrary permutation of two V32QImode and V16HImode operands
47448 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
47449 all the shorter instruction sequences. */
47450
47451 static bool
47452 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
47453 {
47454 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
47455 unsigned int i, nelt, eltsz;
47456 bool used[4];
47457
47458 if (!TARGET_AVX2
47459 || d->one_operand_p
47460 || (d->vmode != V32QImode && d->vmode != V16HImode))
47461 return false;
47462
47463 if (d->testing_p)
47464 return true;
47465
47466 nelt = d->nelt;
47467 eltsz = GET_MODE_UNIT_SIZE (d->vmode);
47468
47469 /* Generate 4 permutation masks. If the required element is within
47470 the same lane, it is shuffled in. If the required element is from
47471 the other lane, force a zero by setting bit 7 in the permutation mask.
47472 In the other mask the entries are non-negative whenever the element
47473 is requested from the other lane, but they are also moved to the other
47474 lane, so that the result of vpshufb can have its two V2TImode halves
47475 swapped. */
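	  /* Below, mask WHICH is 0 or 1 for elements taken from d->op0 and
	     2 or 3 for elements taken from d->op1; the odd-numbered masks
	     handle the cross-lane (XLANE) cases.  */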
47476 m128 = GEN_INT (-128);
47477 for (i = 0; i < 32; ++i)
47478 {
47479 rperm[0][i] = m128;
47480 rperm[1][i] = m128;
47481 rperm[2][i] = m128;
47482 rperm[3][i] = m128;
47483 }
47484 used[0] = false;
47485 used[1] = false;
47486 used[2] = false;
47487 used[3] = false;
47488 for (i = 0; i < nelt; ++i)
47489 {
47490 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
47491 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
47492 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
47493
47494 for (j = 0; j < eltsz; ++j)
47495 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
47496 used[which] = true;
47497 }
47498
47499 for (i = 0; i < 2; ++i)
47500 {
47501 if (!used[2 * i + 1])
47502 {
47503 h[i] = NULL_RTX;
47504 continue;
47505 }
47506 vperm = gen_rtx_CONST_VECTOR (V32QImode,
47507 gen_rtvec_v (32, rperm[2 * i + 1]));
47508 vperm = force_reg (V32QImode, vperm);
47509 h[i] = gen_reg_rtx (V32QImode);
47510 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47511 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
47512 }
47513
47514 /* Swap the 128-bit lanes of h[X]. */
47515 for (i = 0; i < 2; ++i)
47516 {
47517 if (h[i] == NULL_RTX)
47518 continue;
47519 op = gen_reg_rtx (V4DImode);
47520 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
47521 const2_rtx, GEN_INT (3), const0_rtx,
47522 const1_rtx));
47523 h[i] = gen_lowpart (V32QImode, op);
47524 }
47525
47526 for (i = 0; i < 2; ++i)
47527 {
47528 if (!used[2 * i])
47529 {
47530 l[i] = NULL_RTX;
47531 continue;
47532 }
47533 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
47534 vperm = force_reg (V32QImode, vperm);
47535 l[i] = gen_reg_rtx (V32QImode);
47536 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
47537 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
47538 }
47539
47540 for (i = 0; i < 2; ++i)
47541 {
47542 if (h[i] && l[i])
47543 {
47544 op = gen_reg_rtx (V32QImode);
47545 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
47546 l[i] = op;
47547 }
47548 else if (h[i])
47549 l[i] = h[i];
47550 }
47551
47552 gcc_assert (l[0] && l[1]);
47553 op = d->target;
47554 if (d->vmode != V32QImode)
47555 op = gen_reg_rtx (V32QImode);
47556 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
47557 if (op != d->target)
47558 emit_move_insn (d->target, gen_lowpart (d->vmode, op));
47559 return true;
47560 }
47561
47562 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
47563 With all of the interface bits taken care of, perform the expansion
47564 in D and return true on success. */
47565
47566 static bool
47567 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
47568 {
47569 /* Try a single instruction expansion. */
47570 if (expand_vec_perm_1 (d))
47571 return true;
47572
47573 /* Try sequences of two instructions. */
47574
47575 if (expand_vec_perm_pshuflw_pshufhw (d))
47576 return true;
47577
47578 if (expand_vec_perm_palignr (d, false))
47579 return true;
47580
47581 if (expand_vec_perm_interleave2 (d))
47582 return true;
47583
47584 if (expand_vec_perm_broadcast (d))
47585 return true;
47586
47587 if (expand_vec_perm_vpermq_perm_1 (d))
47588 return true;
47589
47590 if (expand_vec_perm_vperm2f128 (d))
47591 return true;
47592
47593 if (expand_vec_perm_pblendv (d))
47594 return true;
47595
47596 /* Try sequences of three instructions. */
47597
47598 if (expand_vec_perm_even_odd_pack (d))
47599 return true;
47600
47601 if (expand_vec_perm_2vperm2f128_vshuf (d))
47602 return true;
47603
47604 if (expand_vec_perm_pshufb2 (d))
47605 return true;
47606
47607 if (expand_vec_perm_interleave3 (d))
47608 return true;
47609
47610 if (expand_vec_perm_vperm2f128_vblend (d))
47611 return true;
47612
47613 /* Try sequences of four instructions. */
47614
47615 if (expand_vec_perm_even_odd_trunc (d))
47616 return true;
47617 if (expand_vec_perm_vpshufb2_vpermq (d))
47618 return true;
47619
47620 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
47621 return true;
47622
47623 if (expand_vec_perm_vpermt2_vpshub2 (d))
47624 return true;
47625
47626 /* ??? Look for narrow permutations whose element orderings would
47627 allow the promotion to a wider mode. */
47628
47629 /* ??? Look for sequences of interleave or a wider permute that place
47630 the data into the correct lanes for a half-vector shuffle like
47631 pshuf[lh]w or vpermilps. */
47632
47633 /* ??? Look for sequences of interleave that produce the desired results.
47634 The combinatorics of punpck[lh] get pretty ugly... */
47635
47636 if (expand_vec_perm_even_odd (d))
47637 return true;
47638
47639 /* Even longer sequences. */
47640 if (expand_vec_perm_vpshufb4_vpermq2 (d))
47641 return true;
47642
47643 /* See if we can get the same permutation in different vector integer
47644 mode. */
47645 struct expand_vec_perm_d nd;
47646 if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
47647 {
47648 if (!d->testing_p)
47649 emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
47650 return true;
47651 }
47652
47653 return false;
47654 }
47655
47656 /* If a permutation only uses one operand, make it clear. Returns true
47657 if the permutation references both operands. */
47658
47659 static bool
47660 canonicalize_perm (struct expand_vec_perm_d *d)
47661 {
47662 int i, which, nelt = d->nelt;
47663
47664 for (i = which = 0; i < nelt; ++i)
47665 which |= (d->perm[i] < nelt ? 1 : 2);
47666
47667 d->one_operand_p = true;
47668 switch (which)
47669 {
47670 default:
47671 gcc_unreachable ();
47672
47673 case 3:
47674 if (!rtx_equal_p (d->op0, d->op1))
47675 {
47676 d->one_operand_p = false;
47677 break;
47678 }
47679 /* The elements of PERM do not suggest that only the first operand
47680 is used, but both operands are identical. Allow easier matching
47681 of the permutation by folding the permutation into the single
47682 input vector. */
47683 /* FALLTHRU */
47684
47685 case 2:
47686 for (i = 0; i < nelt; ++i)
47687 d->perm[i] &= nelt - 1;
47688 d->op0 = d->op1;
47689 break;
47690
47691 case 1:
47692 d->op1 = d->op0;
47693 break;
47694 }
47695
47696 return (which == 3);
47697 }
47698
47699 bool
47700 ix86_expand_vec_perm_const (rtx operands[4])
47701 {
47702 struct expand_vec_perm_d d;
47703 unsigned char perm[MAX_VECT_LEN];
47704 int i, nelt;
47705 bool two_args;
47706 rtx sel;
47707
47708 d.target = operands[0];
47709 d.op0 = operands[1];
47710 d.op1 = operands[2];
47711 sel = operands[3];
47712
47713 d.vmode = GET_MODE (d.target);
47714 gcc_assert (VECTOR_MODE_P (d.vmode));
47715 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47716 d.testing_p = false;
47717
47718 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
47719 gcc_assert (XVECLEN (sel, 0) == nelt);
47720 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
47721
47722 for (i = 0; i < nelt; ++i)
47723 {
47724 rtx e = XVECEXP (sel, 0, i);
47725 int ei = INTVAL (e) & (2 * nelt - 1);
47726 d.perm[i] = ei;
47727 perm[i] = ei;
47728 }
47729
47730 two_args = canonicalize_perm (&d);
47731
47732 if (ix86_expand_vec_perm_const_1 (&d))
47733 return true;
47734
47735 /* If the selector says both arguments are needed, but the operands are the
47736 same, the above tried to expand with one_operand_p and flattened selector.
47737 If that didn't work, retry without one_operand_p; we succeeded with that
47738 during testing. */
47739 if (two_args && d.one_operand_p)
47740 {
47741 d.one_operand_p = false;
47742 memcpy (d.perm, perm, sizeof (perm));
47743 return ix86_expand_vec_perm_const_1 (&d);
47744 }
47745
47746 return false;
47747 }
47748
47749 /* Implement targetm.vectorize.vec_perm_const_ok. */
47750
47751 static bool
47752 ix86_vectorize_vec_perm_const_ok (machine_mode vmode, vec_perm_indices sel)
47753 {
47754 struct expand_vec_perm_d d;
47755 unsigned int i, nelt, which;
47756 bool ret;
47757
47758 d.vmode = vmode;
47759 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47760 d.testing_p = true;
47761
47762 /* Given sufficient ISA support we can just return true here
47763 for selected vector modes. */
47764 switch (d.vmode)
47765 {
47766 case E_V16SFmode:
47767 case E_V16SImode:
47768 case E_V8DImode:
47769 case E_V8DFmode:
47770 if (TARGET_AVX512F)
47771 /* All implementable with a single vperm[it]2 insn. */
47772 return true;
47773 break;
47774 case E_V32HImode:
47775 if (TARGET_AVX512BW)
47776 /* All implementable with a single vperm[it]2 insn. */
47777 return true;
47778 break;
47779 case E_V64QImode:
47780 if (TARGET_AVX512BW)
47781 /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
47782 return true;
47783 break;
47784 case E_V8SImode:
47785 case E_V8SFmode:
47786 case E_V4DFmode:
47787 case E_V4DImode:
47788 if (TARGET_AVX512VL)
47789 /* All implementable with a single vperm[it]2 insn. */
47790 return true;
47791 break;
47792 case E_V16HImode:
47793 if (TARGET_AVX2)
47794 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47795 return true;
47796 break;
47797 case E_V32QImode:
47798 if (TARGET_AVX2)
47799 /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
47800 return true;
47801 break;
47802 case E_V4SImode:
47803 case E_V4SFmode:
47804 case E_V8HImode:
47805 case E_V16QImode:
47806 /* All implementable with a single vpperm insn. */
47807 if (TARGET_XOP)
47808 return true;
47809 /* All implementable with 2 pshufb + 1 ior. */
47810 if (TARGET_SSSE3)
47811 return true;
47812 break;
47813 case E_V2DImode:
47814 case E_V2DFmode:
47815 /* All implementable with shufpd or unpck[lh]pd. */
47816 return true;
47817 default:
47818 return false;
47819 }
47820
47821 /* Extract the values from the vector CST into the permutation
47822 array in D. */
47823 for (i = which = 0; i < nelt; ++i)
47824 {
47825 unsigned char e = sel[i];
47826 gcc_assert (e < 2 * nelt);
47827 d.perm[i] = e;
47828 which |= (e < nelt ? 1 : 2);
47829 }
47830
47831 /* For all elements from second vector, fold the elements to first. */
47832 if (which == 2)
47833 for (i = 0; i < nelt; ++i)
47834 d.perm[i] -= nelt;
47835
47836 /* Check whether the mask can be applied to the vector type. */
47837 d.one_operand_p = (which != 3);
47838
47839 /* Implementable with shufps or pshufd. */
47840 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
47841 return true;
47842
47843 /* Otherwise we have to go through the motions and see if we can
47844 figure out how to generate the requested permutation. */
47845 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
47846 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
47847 if (!d.one_operand_p)
47848 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
47849
47850 start_sequence ();
47851 ret = ix86_expand_vec_perm_const_1 (&d);
47852 end_sequence ();
47853
47854 return ret;
47855 }
47856
47857 void
47858 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
47859 {
47860 struct expand_vec_perm_d d;
47861 unsigned i, nelt;
47862
47863 d.target = targ;
47864 d.op0 = op0;
47865 d.op1 = op1;
47866 d.vmode = GET_MODE (targ);
47867 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47868 d.one_operand_p = false;
47869 d.testing_p = false;
47870
47871 for (i = 0; i < nelt; ++i)
47872 d.perm[i] = i * 2 + odd;
47873
47874 /* We'll either be able to implement the permutation directly... */
47875 if (expand_vec_perm_1 (&d))
47876 return;
47877
47878 /* ... or we use the special-case patterns. */
47879 expand_vec_perm_even_odd_1 (&d, odd);
47880 }
47881
47882 static void
47883 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
47884 {
47885 struct expand_vec_perm_d d;
47886 unsigned i, nelt, base;
47887 bool ok;
47888
47889 d.target = targ;
47890 d.op0 = op0;
47891 d.op1 = op1;
47892 d.vmode = GET_MODE (targ);
47893 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
47894 d.one_operand_p = false;
47895 d.testing_p = false;
47896
47897 base = high_p ? nelt / 2 : 0;
47898 for (i = 0; i < nelt / 2; ++i)
47899 {
47900 d.perm[i * 2] = i + base;
47901 d.perm[i * 2 + 1] = i + base + nelt;
47902 }
47903
47904 /* Note that for AVX this isn't one instruction. */
47905 ok = ix86_expand_vec_perm_const_1 (&d);
47906 gcc_assert (ok);
47907 }
47908
47909
47910 /* Expand a vector operation CODE for a V*QImode in terms of the
47911 same operation on V*HImode. */
47912
47913 void
47914 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
47915 {
47916 machine_mode qimode = GET_MODE (dest);
47917 machine_mode himode;
47918 rtx (*gen_il) (rtx, rtx, rtx);
47919 rtx (*gen_ih) (rtx, rtx, rtx);
47920 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
47921 struct expand_vec_perm_d d;
47922 bool ok, full_interleave;
47923 bool uns_p = false;
47924 int i;
47925
47926 switch (qimode)
47927 {
47928 case E_V16QImode:
47929 himode = V8HImode;
47930 gen_il = gen_vec_interleave_lowv16qi;
47931 gen_ih = gen_vec_interleave_highv16qi;
47932 break;
47933 case E_V32QImode:
47934 himode = V16HImode;
47935 gen_il = gen_avx2_interleave_lowv32qi;
47936 gen_ih = gen_avx2_interleave_highv32qi;
47937 break;
47938 case E_V64QImode:
47939 himode = V32HImode;
47940 gen_il = gen_avx512bw_interleave_lowv64qi;
47941 gen_ih = gen_avx512bw_interleave_highv64qi;
47942 break;
47943 default:
47944 gcc_unreachable ();
47945 }
47946
47947 op2_l = op2_h = op2;
47948 switch (code)
47949 {
47950 case MULT:
47951 /* Unpack data such that we've got a source byte in each low byte of
47952 each word. We don't care what goes into the high byte of each word.
47953 Rather than trying to get zero in there, most convenient is to let
47954 it be a copy of the low byte. */
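	  /* The low byte of each 16-bit product depends only on the low
	     bytes of the factors, and only those low (even-positioned) bytes
	     are extracted afterwards, so whatever lands in the high bytes is
	     harmless.  */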
47955 op2_l = gen_reg_rtx (qimode);
47956 op2_h = gen_reg_rtx (qimode);
47957 emit_insn (gen_il (op2_l, op2, op2));
47958 emit_insn (gen_ih (op2_h, op2, op2));
47959
47960 op1_l = gen_reg_rtx (qimode);
47961 op1_h = gen_reg_rtx (qimode);
47962 emit_insn (gen_il (op1_l, op1, op1));
47963 emit_insn (gen_ih (op1_h, op1, op1));
47964 full_interleave = qimode == V16QImode;
47965 break;
47966
47967 case ASHIFT:
47968 case LSHIFTRT:
47969 uns_p = true;
47970 /* FALLTHRU */
47971 case ASHIFTRT:
47972 op1_l = gen_reg_rtx (himode);
47973 op1_h = gen_reg_rtx (himode);
47974 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
47975 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
47976 full_interleave = true;
47977 break;
47978 default:
47979 gcc_unreachable ();
47980 }
47981
47982 /* Perform the operation. */
47983 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
47984 1, OPTAB_DIRECT);
47985 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
47986 1, OPTAB_DIRECT);
47987 gcc_assert (res_l && res_h);
47988
47989 /* Merge the data back into the right place. */
47990 d.target = dest;
47991 d.op0 = gen_lowpart (qimode, res_l);
47992 d.op1 = gen_lowpart (qimode, res_h);
47993 d.vmode = qimode;
47994 d.nelt = GET_MODE_NUNITS (qimode);
47995 d.one_operand_p = false;
47996 d.testing_p = false;
47997
47998 if (full_interleave)
47999 {
48000 /* For SSE2, we used a full interleave, so the desired
48001 results are in the even elements. */
48002 for (i = 0; i < d.nelt; ++i)
48003 d.perm[i] = i * 2;
48004 }
48005 else
48006 {
48007 /* For AVX, the interleave used above was not cross-lane. So the
48008 extraction is evens but with the second and third quarters swapped.
48009 Happily, that is even one insn shorter than even extraction.
48010 For AVX512BW we have 4 lanes. We extract evens from within a lane,
48011 always first from the first and then from the second source operand,
48012 the index bits above the low 4 bits remain the same.
48013 Thus, for d.nelt == 32 we want permutation
48014 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
48015 and for d.nelt == 64 we want permutation
48016 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
48017 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
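      /* As a check of the formula below: with d.nelt == 32 and i == 8,
	 ((i * 2) & 14) + d.nelt + (i & ~15) = 0 + 32 + 0 = 32, which is
	 element 8 of the first permutation listed above.  */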
48018 for (i = 0; i < d.nelt; ++i)
48019 d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
48020 }
48021
48022 ok = ix86_expand_vec_perm_const_1 (&d);
48023 gcc_assert (ok);
48024
48025 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48026 gen_rtx_fmt_ee (code, qimode, op1, op2));
48027 }
48028
48029 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
48030 if op is CONST_VECTOR with all odd elements equal to their
48031 preceding element. */
48032
48033 static bool
48034 const_vector_equal_evenodd_p (rtx op)
48035 {
48036 machine_mode mode = GET_MODE (op);
48037 int i, nunits = GET_MODE_NUNITS (mode);
48038 if (GET_CODE (op) != CONST_VECTOR
48039 || nunits != CONST_VECTOR_NUNITS (op))
48040 return false;
48041 for (i = 0; i < nunits; i += 2)
48042 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
48043 return false;
48044 return true;
48045 }
48046
48047 void
48048 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
48049 bool uns_p, bool odd_p)
48050 {
48051 machine_mode mode = GET_MODE (op1);
48052 machine_mode wmode = GET_MODE (dest);
48053 rtx x;
48054 rtx orig_op1 = op1, orig_op2 = op2;
48055
48056 if (!nonimmediate_operand (op1, mode))
48057 op1 = force_reg (mode, op1);
48058 if (!nonimmediate_operand (op2, mode))
48059 op2 = force_reg (mode, op2);
48060
48061 /* We only play even/odd games with vectors of SImode. */
48062 gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
48063
48064 /* If we're looking for the odd results, shift those members down to
48065 the even slots. For some cpus this is faster than a PSHUFD. */
48066 if (odd_p)
48067 {
48068 /* For XOP use vpmacsdqh, but only for smult, as it is only
48069 signed. */
48070 if (TARGET_XOP && mode == V4SImode && !uns_p)
48071 {
48072 x = force_reg (wmode, CONST0_RTX (wmode));
48073 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
48074 return;
48075 }
48076
48077 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
48078 if (!const_vector_equal_evenodd_p (orig_op1))
48079 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
48080 x, NULL, 1, OPTAB_DIRECT);
48081 if (!const_vector_equal_evenodd_p (orig_op2))
48082 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
48083 x, NULL, 1, OPTAB_DIRECT);
48084 op1 = gen_lowpart (mode, op1);
48085 op2 = gen_lowpart (mode, op2);
48086 }
48087
48088 if (mode == V16SImode)
48089 {
48090 if (uns_p)
48091 x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
48092 else
48093 x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
48094 }
48095 else if (mode == V8SImode)
48096 {
48097 if (uns_p)
48098 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
48099 else
48100 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
48101 }
48102 else if (uns_p)
48103 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
48104 else if (TARGET_SSE4_1)
48105 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
48106 else
48107 {
48108 rtx s1, s2, t0, t1, t2;
48109
48110 /* The easiest way to implement this without PMULDQ is to go through
48111 the motions as if we are performing a full 64-bit multiply, with
48112 the exception that we need to do less shuffling of the elements. */
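      /* With S1 = (OP1 < 0 ? -1 : 0) and S2 likewise, S1 and S2 are exactly
	 the sign-extended high halves of the two operands, so modulo 2^64
	 the signed product is
	   LO(OP1)*LO(OP2) + ((S1*LO(OP2) + S2*LO(OP1)) << 32),
	 which is what the code below computes with unsigned 32x32->64
	 multiplies.  */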
48113
48114 /* Compute the sign-extension, aka highparts, of the two operands. */
48115 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48116 op1, pc_rtx, pc_rtx);
48117 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
48118 op2, pc_rtx, pc_rtx);
48119
48120 /* Multiply LO(A) * HI(B), and vice-versa. */
48121 t1 = gen_reg_rtx (wmode);
48122 t2 = gen_reg_rtx (wmode);
48123 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
48124 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
48125
48126 /* Multiply LO(A) * LO(B). */
48127 t0 = gen_reg_rtx (wmode);
48128 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
48129
48130 /* Combine and shift the highparts into place. */
48131 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
48132 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
48133 1, OPTAB_DIRECT);
48134
48135 /* Combine high and low parts. */
48136 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
48137 return;
48138 }
48139 emit_insn (x);
48140 }
48141
48142 void
48143 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
48144 bool uns_p, bool high_p)
48145 {
48146 machine_mode wmode = GET_MODE (dest);
48147 machine_mode mode = GET_MODE (op1);
48148 rtx t1, t2, t3, t4, mask;
48149
48150 switch (mode)
48151 {
48152 case E_V4SImode:
48153 t1 = gen_reg_rtx (mode);
48154 t2 = gen_reg_rtx (mode);
48155 if (TARGET_XOP && !uns_p)
48156 {
48157 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
48158 shuffle the elements once so that all elements are in the right
48159 place for immediate use: { A C B D }. */
48160 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
48161 const1_rtx, GEN_INT (3)));
48162 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
48163 const1_rtx, GEN_INT (3)));
48164 }
48165 else
48166 {
48167 /* Put the elements into place for the multiply. */
48168 ix86_expand_vec_interleave (t1, op1, op1, high_p);
48169 ix86_expand_vec_interleave (t2, op2, op2, high_p);
48170 high_p = false;
48171 }
48172 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
48173 break;
48174
48175 case E_V8SImode:
48176 /* Shuffle the elements between the lanes. After this we
48177 have { A B E F | C D G H } for each operand. */
48178 t1 = gen_reg_rtx (V4DImode);
48179 t2 = gen_reg_rtx (V4DImode);
48180 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
48181 const0_rtx, const2_rtx,
48182 const1_rtx, GEN_INT (3)));
48183 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
48184 const0_rtx, const2_rtx,
48185 const1_rtx, GEN_INT (3)));
48186
48187 /* Shuffle the elements within the lanes. After this we
48188 have { A A B B | C C D D } or { E E F F | G G H H }. */
48189 t3 = gen_reg_rtx (V8SImode);
48190 t4 = gen_reg_rtx (V8SImode);
48191 mask = GEN_INT (high_p
48192 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
48193 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
48194 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
48195 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
48196
48197 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
48198 break;
48199
48200 case E_V8HImode:
48201 case E_V16HImode:
48202 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
48203 uns_p, OPTAB_DIRECT);
48204 t2 = expand_binop (mode,
48205 uns_p ? umul_highpart_optab : smul_highpart_optab,
48206 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
48207 gcc_assert (t1 && t2);
48208
48209 t3 = gen_reg_rtx (mode);
48210 ix86_expand_vec_interleave (t3, t1, t2, high_p);
48211 emit_move_insn (dest, gen_lowpart (wmode, t3));
48212 break;
48213
48214 case E_V16QImode:
48215 case E_V32QImode:
48216 case E_V32HImode:
48217 case E_V16SImode:
48218 case E_V64QImode:
48219 t1 = gen_reg_rtx (wmode);
48220 t2 = gen_reg_rtx (wmode);
48221 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
48222 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
48223
48224 emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
48225 break;
48226
48227 default:
48228 gcc_unreachable ();
48229 }
48230 }
48231
48232 void
48233 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
48234 {
48235 rtx res_1, res_2, res_3, res_4;
48236
48237 res_1 = gen_reg_rtx (V4SImode);
48238 res_2 = gen_reg_rtx (V4SImode);
48239 res_3 = gen_reg_rtx (V2DImode);
48240 res_4 = gen_reg_rtx (V2DImode);
48241 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
48242 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
48243
48244 /* Move the results in element 2 down to element 1; we don't care
48245 what goes in elements 2 and 3. Then we can merge the parts
48246 back together with an interleave.
48247
48248 Note that two other sequences were tried:
48249 (1) Use interleaves at the start instead of psrldq, which allows
48250 us to use a single shufps to merge things back at the end.
48251 (2) Use shufps here to combine the two vectors, then pshufd to
48252 put the elements in the correct order.
48253 In both cases the cost of the reformatting stall was too high
48254 and the overall sequence slower. */
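  /* Concretely, with op1 = { A0 A1 A2 A3 } and op2 = { B0 B1 B2 B3 }:
     res_3 = { A0*B0, A2*B2 } and res_4 = { A1*B1, A3*B3 } as V2DImode.
     The pshufd's below move the low 32-bit halves of those products into
     elements 0 and 1, and the final punpckldq interleaves them into
     { A0*B0, A1*B1, A2*B2, A3*B3 } truncated to 32 bits each.  */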
48255
48256 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
48257 const0_rtx, const2_rtx,
48258 const0_rtx, const0_rtx));
48259 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
48260 const0_rtx, const2_rtx,
48261 const0_rtx, const0_rtx));
48262 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
48263
48264 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
48265 }
48266
48267 void
48268 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
48269 {
48270 machine_mode mode = GET_MODE (op0);
48271 rtx t1, t2, t3, t4, t5, t6;
48272
48273 if (TARGET_AVX512DQ && mode == V8DImode)
48274 emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
48275 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
48276 emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
48277 else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
48278 emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
48279 else if (TARGET_XOP && mode == V2DImode)
48280 {
48281 /* op1: A,B,C,D, op2: E,F,G,H */
48282 op1 = gen_lowpart (V4SImode, op1);
48283 op2 = gen_lowpart (V4SImode, op2);
48284
48285 t1 = gen_reg_rtx (V4SImode);
48286 t2 = gen_reg_rtx (V4SImode);
48287 t3 = gen_reg_rtx (V2DImode);
48288 t4 = gen_reg_rtx (V2DImode);
48289
48290 /* t1: B,A,D,C */
48291 emit_insn (gen_sse2_pshufd_1 (t1, op1,
48292 GEN_INT (1),
48293 GEN_INT (0),
48294 GEN_INT (3),
48295 GEN_INT (2)));
48296
48297 /* t2: (B*E),(A*F),(D*G),(C*H) */
48298 emit_insn (gen_mulv4si3 (t2, t1, op2));
48299
48300 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
48301 emit_insn (gen_xop_phadddq (t3, t2));
48302
48303 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
48304 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
48305
48306 /* Multiply lower parts and add all. */
48307 t5 = gen_reg_rtx (V2DImode);
48308 emit_insn (gen_vec_widen_umult_even_v4si (t5,
48309 gen_lowpart (V4SImode, op1),
48310 gen_lowpart (V4SImode, op2)));
48311 op0 = expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
48312
48313 }
48314 else
48315 {
48316 machine_mode nmode;
48317 rtx (*umul) (rtx, rtx, rtx);
48318
48319 if (mode == V2DImode)
48320 {
48321 umul = gen_vec_widen_umult_even_v4si;
48322 nmode = V4SImode;
48323 }
48324 else if (mode == V4DImode)
48325 {
48326 umul = gen_vec_widen_umult_even_v8si;
48327 nmode = V8SImode;
48328 }
48329 else if (mode == V8DImode)
48330 {
48331 umul = gen_vec_widen_umult_even_v16si;
48332 nmode = V16SImode;
48333 }
48334 else
48335 gcc_unreachable ();
48336
48337
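      /* Splitting each 64-bit element into hi:lo 32-bit halves,
	   op1 * op2 mod 2^64 = lo1*lo2 + ((hi1*lo2 + hi2*lo1) << 32),
	 computed below with the widening unsigned 32x32->64 multiply.  */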
48338 /* Multiply low parts. */
48339 t1 = gen_reg_rtx (mode);
48340 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
48341
48342 /* Shift input vectors right 32 bits so we can multiply high parts. */
48343 t6 = GEN_INT (32);
48344 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
48345 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
48346
48347 /* Multiply high parts by low parts. */
48348 t4 = gen_reg_rtx (mode);
48349 t5 = gen_reg_rtx (mode);
48350 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
48351 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
48352
48353 /* Combine and shift the highparts back. */
48354 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
48355 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
48356
48357 /* Combine high and low parts. */
48358 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
48359 }
48360
48361 set_unique_reg_note (get_last_insn (), REG_EQUAL,
48362 gen_rtx_MULT (mode, op1, op2));
48363 }
48364
48365 /* Return 1 if control transfer instruction INSN
48366 should be encoded with bnd prefix.
48367 If insn is NULL then return 1 when control
48368 transfer instructions should be prefixed with
48369 bnd by default for current function. */
48370
48371 bool
48372 ix86_bnd_prefixed_insn_p (rtx insn)
48373 {
48374 /* For call insns check special flag. */
48375 if (insn && CALL_P (insn))
48376 {
48377 rtx call = get_call_rtx_from (insn);
48378 if (call)
48379 return CALL_EXPR_WITH_BOUNDS_P (call);
48380 }
48381
48382 /* All other insns are prefixed only if function is instrumented. */
48383 return chkp_function_instrumented_p (current_function_decl);
48384 }
48385
48386 /* Return 1 if control transfer instruction INSN
48387 should be encoded with notrack prefix. */
48388
48389 static bool
48390 ix86_notrack_prefixed_insn_p (rtx insn)
48391 {
48392 if (!insn || !((flag_cf_protection & CF_BRANCH) && TARGET_IBT))
48393 return false;
48394
48395 if (CALL_P (insn))
48396 {
48397 rtx call = get_call_rtx_from (insn);
48398 gcc_assert (call != NULL_RTX);
48399 rtx addr = XEXP (call, 0);
48400
48401 /* Do not emit 'notrack' if it's not an indirect call. */
48402 if (MEM_P (addr)
48403 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
48404 return false;
48405 else
48406 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
48407 }
48408
48409 if (JUMP_P (insn) && !flag_cet_switch)
48410 {
48411 rtx target = JUMP_LABEL (insn);
48412 if (target == NULL_RTX || ANY_RETURN_P (target))
48413 return false;
48414
48415 /* Check the jump is a switch table. */
48416 rtx_insn *label = as_a<rtx_insn *> (target);
48417 rtx_insn *table = next_insn (label);
48418 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
48419 return false;
48420 else
48421 return true;
48422 }
48423 return false;
48424 }
48425
48426 /* Calculate integer abs() using only SSE2 instructions. */
48427
48428 void
48429 ix86_expand_sse2_abs (rtx target, rtx input)
48430 {
48431 machine_mode mode = GET_MODE (target);
48432 rtx tmp0, tmp1, x;
48433
48434 switch (mode)
48435 {
48436 /* For 32-bit signed integer X, the best way to calculate the absolute
48437 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
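    /* E.g. for X = -5 the shift gives -1, and (-5 ^ -1) - (-1) = 4 + 1 = 5.  */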
48438 case E_V4SImode:
48439 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
48440 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
48441 NULL, 0, OPTAB_DIRECT);
48442 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
48443 NULL, 0, OPTAB_DIRECT);
48444 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
48445 target, 0, OPTAB_DIRECT);
48446 break;
48447
48448 /* For 16-bit signed integer X, the best way to calculate the absolute
48449 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
48450 case E_V8HImode:
48451 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48452
48453 x = expand_simple_binop (mode, SMAX, tmp0, input,
48454 target, 0, OPTAB_DIRECT);
48455 break;
48456
48457 /* For 8-bit signed integer X, the best way to calculate the absolute
48458 value of X is min ((unsigned char) X, (unsigned char) (-X)),
48459 as SSE2 provides the PMINUB insn. */
48460 case E_V16QImode:
48461 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
48462
48463 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
48464 target, 0, OPTAB_DIRECT);
48465 break;
48466
48467 default:
48468 gcc_unreachable ();
48469 }
48470
48471 if (x != target)
48472 emit_move_insn (target, x);
48473 }
48474
48475 /* Expand an extract from a vector register through pextr insn.
48476 Return true if successful. */
48477
48478 bool
48479 ix86_expand_pextr (rtx *operands)
48480 {
48481 rtx dst = operands[0];
48482 rtx src = operands[1];
48483
48484 unsigned int size = INTVAL (operands[2]);
48485 unsigned int pos = INTVAL (operands[3]);
48486
48487 if (SUBREG_P (dst))
48488 {
48489 /* Reject non-lowpart subregs. */
48490 if (SUBREG_BYTE (dst) > 0)
48491 return false;
48492 dst = SUBREG_REG (dst);
48493 }
48494
48495 if (SUBREG_P (src))
48496 {
48497 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
48498 src = SUBREG_REG (src);
48499 }
48500
48501 switch (GET_MODE (src))
48502 {
48503 case E_V16QImode:
48504 case E_V8HImode:
48505 case E_V4SImode:
48506 case E_V2DImode:
48507 case E_V1TImode:
48508 case E_TImode:
48509 {
48510 machine_mode srcmode, dstmode;
48511 rtx d, pat;
48512
48513 if (!int_mode_for_size (size, 0).exists (&dstmode))
48514 return false;
48515
48516 switch (dstmode)
48517 {
48518 case E_QImode:
48519 if (!TARGET_SSE4_1)
48520 return false;
48521 srcmode = V16QImode;
48522 break;
48523
48524 case E_HImode:
48525 if (!TARGET_SSE2)
48526 return false;
48527 srcmode = V8HImode;
48528 break;
48529
48530 case E_SImode:
48531 if (!TARGET_SSE4_1)
48532 return false;
48533 srcmode = V4SImode;
48534 break;
48535
48536 case E_DImode:
48537 gcc_assert (TARGET_64BIT);
48538 if (!TARGET_SSE4_1)
48539 return false;
48540 srcmode = V2DImode;
48541 break;
48542
48543 default:
48544 return false;
48545 }
48546
48547 /* Reject extractions from misaligned positions. */
48548 if (pos & (size-1))
48549 return false;
48550
48551 if (GET_MODE (dst) == dstmode)
48552 d = dst;
48553 else
48554 d = gen_reg_rtx (dstmode);
48555
48556 /* Construct insn pattern. */
48557 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
48558 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
48559
48560 /* Let the rtl optimizers know about the zero extension performed. */
48561 if (dstmode == QImode || dstmode == HImode)
48562 {
48563 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
48564 d = gen_lowpart (SImode, d);
48565 }
48566
48567 emit_insn (gen_rtx_SET (d, pat));
48568
48569 if (d != dst)
48570 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48571 return true;
48572 }
48573
48574 default:
48575 return false;
48576 }
48577 }
48578
48579 /* Expand an insert into a vector register through pinsr insn.
48580 Return true if successful. */
48581
48582 bool
48583 ix86_expand_pinsr (rtx *operands)
48584 {
48585 rtx dst = operands[0];
48586 rtx src = operands[3];
48587
48588 unsigned int size = INTVAL (operands[1]);
48589 unsigned int pos = INTVAL (operands[2]);
48590
48591 if (SUBREG_P (dst))
48592 {
48593 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
48594 dst = SUBREG_REG (dst);
48595 }
48596
48597 switch (GET_MODE (dst))
48598 {
48599 case E_V16QImode:
48600 case E_V8HImode:
48601 case E_V4SImode:
48602 case E_V2DImode:
48603 case E_V1TImode:
48604 case E_TImode:
48605 {
48606 machine_mode srcmode, dstmode;
48607 rtx (*pinsr)(rtx, rtx, rtx, rtx);
48608 rtx d;
48609
48610 if (!int_mode_for_size (size, 0).exists (&srcmode))
48611 return false;
48612
48613 switch (srcmode)
48614 {
48615 case E_QImode:
48616 if (!TARGET_SSE4_1)
48617 return false;
48618 dstmode = V16QImode;
48619 pinsr = gen_sse4_1_pinsrb;
48620 break;
48621
48622 case E_HImode:
48623 if (!TARGET_SSE2)
48624 return false;
48625 dstmode = V8HImode;
48626 pinsr = gen_sse2_pinsrw;
48627 break;
48628
48629 case E_SImode:
48630 if (!TARGET_SSE4_1)
48631 return false;
48632 dstmode = V4SImode;
48633 pinsr = gen_sse4_1_pinsrd;
48634 break;
48635
48636 case E_DImode:
48637 gcc_assert (TARGET_64BIT);
48638 if (!TARGET_SSE4_1)
48639 return false;
48640 dstmode = V2DImode;
48641 pinsr = gen_sse4_1_pinsrq;
48642 break;
48643
48644 default:
48645 return false;
48646 }
48647
48648 /* Reject insertions to misaligned positions. */
48649 if (pos & (size-1))
48650 return false;
48651
48652 if (SUBREG_P (src))
48653 {
48654 unsigned int srcpos = SUBREG_BYTE (src);
48655
48656 if (srcpos > 0)
48657 {
48658 rtx extr_ops[4];
48659
48660 extr_ops[0] = gen_reg_rtx (srcmode);
48661 extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
48662 extr_ops[2] = GEN_INT (size);
48663 extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
48664
48665 if (!ix86_expand_pextr (extr_ops))
48666 return false;
48667
48668 src = extr_ops[0];
48669 }
48670 else
48671 src = gen_lowpart (srcmode, SUBREG_REG (src));
48672 }
48673
48674 if (GET_MODE (dst) == dstmode)
48675 d = dst;
48676 else
48677 d = gen_reg_rtx (dstmode);
48678
48679 emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
48680 gen_lowpart (srcmode, src),
48681 GEN_INT (1 << (pos / size))));
48682 if (d != dst)
48683 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
48684 return true;
48685 }
48686
48687 default:
48688 return false;
48689 }
48690 }
48691 \f
48692 /* This function returns the calling abi specific va_list type node.
48693 It returns the FNDECL specific va_list type. */
48694
48695 static tree
48696 ix86_fn_abi_va_list (tree fndecl)
48697 {
48698 if (!TARGET_64BIT)
48699 return va_list_type_node;
48700 gcc_assert (fndecl != NULL_TREE);
48701
48702 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
48703 return ms_va_list_type_node;
48704 else
48705 return sysv_va_list_type_node;
48706 }
48707
48708 /* Returns the canonical va_list type specified by TYPE. If there
48709 is no valid TYPE provided, it returns NULL_TREE. */
48710
48711 static tree
48712 ix86_canonical_va_list_type (tree type)
48713 {
48714 if (TARGET_64BIT)
48715 {
48716 if (lookup_attribute ("ms_abi va_list", TYPE_ATTRIBUTES (type)))
48717 return ms_va_list_type_node;
48718
48719 if ((TREE_CODE (type) == ARRAY_TYPE
48720 && integer_zerop (array_type_nelts (type)))
48721 || POINTER_TYPE_P (type))
48722 {
48723 tree elem_type = TREE_TYPE (type);
48724 if (TREE_CODE (elem_type) == RECORD_TYPE
48725 && lookup_attribute ("sysv_abi va_list",
48726 TYPE_ATTRIBUTES (elem_type)))
48727 return sysv_va_list_type_node;
48728 }
48729
48730 return NULL_TREE;
48731 }
48732
48733 return std_canonical_va_list_type (type);
48734 }
48735
48736 /* Iterate through the target-specific builtin types for va_list.
48737 IDX denotes the iterator, *PTREE is set to the result type of
48738 the va_list builtin, and *PNAME to its internal type.
48739 Returns zero if there is no element for this index, otherwise
48740 IDX should be increased upon the next call.
48741 Note, do not iterate a base builtin's name like __builtin_va_list.
48742 Used from c_common_nodes_and_builtins. */
48743
48744 static int
48745 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
48746 {
48747 if (TARGET_64BIT)
48748 {
48749 switch (idx)
48750 {
48751 default:
48752 break;
48753
48754 case 0:
48755 *ptree = ms_va_list_type_node;
48756 *pname = "__builtin_ms_va_list";
48757 return 1;
48758
48759 case 1:
48760 *ptree = sysv_va_list_type_node;
48761 *pname = "__builtin_sysv_va_list";
48762 return 1;
48763 }
48764 }
48765
48766 return 0;
48767 }
48768
48769 #undef TARGET_SCHED_DISPATCH
48770 #define TARGET_SCHED_DISPATCH ix86_bd_has_dispatch
48771 #undef TARGET_SCHED_DISPATCH_DO
48772 #define TARGET_SCHED_DISPATCH_DO ix86_bd_do_dispatch
48773 #undef TARGET_SCHED_REASSOCIATION_WIDTH
48774 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
48775 #undef TARGET_SCHED_REORDER
48776 #define TARGET_SCHED_REORDER ix86_atom_sched_reorder
48777 #undef TARGET_SCHED_ADJUST_PRIORITY
48778 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
48779 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
48780 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
48781 ix86_dependencies_evaluation_hook
48782
48783
48784 /* Implementation of reassociation_width target hook used by
48785 reassoc phase to identify parallelism level in reassociated
48786 tree. The statement's tree code is passed in OP and the arguments'
48787 mode is passed in MODE. */
48788
48789 static int
48790 ix86_reassociation_width (unsigned int op, machine_mode mode)
48791 {
48792 int width = 1;
48793 /* Vector part. */
48794 if (VECTOR_MODE_P (mode))
48795 {
48796 int div = 1;
48797 if (INTEGRAL_MODE_P (mode))
48798 width = ix86_cost->reassoc_vec_int;
48799 else if (FLOAT_MODE_P (mode))
48800 width = ix86_cost->reassoc_vec_fp;
48801
48802 if (width == 1)
48803 return 1;
48804
48805 /* Integer vector instructions execute in FP unit
48806 and can execute 3 additions and one multiplication per cycle. */
48807 if (ix86_tune == PROCESSOR_ZNVER1 && INTEGRAL_MODE_P (mode)
48808 && op != PLUS && op != MINUS)
48809 return 1;
48810
48811 /* Account for targets that split wide vectors into multiple parts. */
48812 if (TARGET_AVX128_OPTIMAL && GET_MODE_BITSIZE (mode) > 128)
48813 div = GET_MODE_BITSIZE (mode) / 128;
48814 else if (TARGET_SSE_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 64)
48815 div = GET_MODE_BITSIZE (mode) / 64;
48816 width = (width + div - 1) / div;
48817 }
48818 /* Scalar part. */
48819 else if (INTEGRAL_MODE_P (mode))
48820 width = ix86_cost->reassoc_int;
48821 else if (FLOAT_MODE_P (mode))
48822 width = ix86_cost->reassoc_fp;
48823
48824 /* Avoid using too many registers in 32-bit mode. */
48825 if (!TARGET_64BIT && width > 2)
48826 width = 2;
48827 return width;
48828 }
48829
48830 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
48831 place emms and femms instructions. */
48832
48833 static machine_mode
48834 ix86_preferred_simd_mode (scalar_mode mode)
48835 {
48836 if (!TARGET_SSE)
48837 return word_mode;
48838
48839 switch (mode)
48840 {
48841 case E_QImode:
48842 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48843 return V64QImode;
48844 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48845 return V32QImode;
48846 else
48847 return V16QImode;
48848
48849 case E_HImode:
48850 if (TARGET_AVX512BW && !TARGET_PREFER_AVX256)
48851 return V32HImode;
48852 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48853 return V16HImode;
48854 else
48855 return V8HImode;
48856
48857 case E_SImode:
48858 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48859 return V16SImode;
48860 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48861 return V8SImode;
48862 else
48863 return V4SImode;
48864
48865 case E_DImode:
48866 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48867 return V8DImode;
48868 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48869 return V4DImode;
48870 else
48871 return V2DImode;
48872
48873 case E_SFmode:
48874 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48875 return V16SFmode;
48876 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48877 return V8SFmode;
48878 else
48879 return V4SFmode;
48880
48881 case E_DFmode:
48882 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48883 return V8DFmode;
48884 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48885 return V4DFmode;
48886 else if (TARGET_SSE2)
48887 return V2DFmode;
48888 /* FALLTHRU */
48889
48890 default:
48891 return word_mode;
48892 }
48893 }
48894
48895 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
48896 vectors. If AVX512F is enabled then try vectorizing with 512-bit,
48897 256-bit and 128-bit vectors. */
48898
48899 static unsigned int
48900 ix86_autovectorize_vector_sizes (void)
48901 {
48902 unsigned int bytesizes = 0;
48903
48904 if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
48905 bytesizes |= (64 | 32 | 16);
48906 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
48907 bytesizes |= (32 | 16);
48908
48909 return bytesizes;
48910 }
48911
48912 /* Implementation of targetm.vectorize.get_mask_mode. */
48913
48914 static opt_machine_mode
48915 ix86_get_mask_mode (unsigned nunits, unsigned vector_size)
48916 {
48917 unsigned elem_size = vector_size / nunits;
48918
48919 /* Scalar mask case. */
48920 if ((TARGET_AVX512F && vector_size == 64)
48921 || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
48922 {
48923 if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
48924 return smallest_int_mode_for_size (nunits);
48925 }
48926
48927 scalar_int_mode elem_mode
48928 = smallest_int_mode_for_size (elem_size * BITS_PER_UNIT);
48929
48930 gcc_assert (elem_size * nunits == vector_size);
48931
48932 return mode_for_vector (elem_mode, nunits);
48933 }
48934
48935 \f
48936
48937 /* Return class of registers which could be used for pseudo of MODE
48938 and of class RCLASS for spilling instead of memory. Return NO_REGS
48939 if it is not possible or not profitable. */
48940
48941 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
48942
48943 static reg_class_t
48944 ix86_spill_class (reg_class_t rclass, machine_mode mode)
48945 {
48946 if (0 && TARGET_GENERAL_REGS_SSE_SPILL
48947 && TARGET_SSE2
48948 && TARGET_INTER_UNIT_MOVES_TO_VEC
48949 && TARGET_INTER_UNIT_MOVES_FROM_VEC
48950 && (mode == SImode || (TARGET_64BIT && mode == DImode))
48951 && INTEGER_CLASS_P (rclass))
48952 return ALL_SSE_REGS;
48953 return NO_REGS;
48954 }
48955
48956 /* Implement TARGET_MAX_NOCE_IFCVT_SEQ_COST. Like the default implementation,
48957 but returns a lower bound. */
48958
48959 static unsigned int
48960 ix86_max_noce_ifcvt_seq_cost (edge e)
48961 {
48962 bool predictable_p = predictable_edge_p (e);
48963
48964 enum compiler_param param
48965 = (predictable_p
48966 ? PARAM_MAX_RTL_IF_CONVERSION_PREDICTABLE_COST
48967 : PARAM_MAX_RTL_IF_CONVERSION_UNPREDICTABLE_COST);
48968
48969 /* If we have a parameter set, use that, otherwise take a guess using
48970 BRANCH_COST. */
48971 if (global_options_set.x_param_values[param])
48972 return PARAM_VALUE (param);
48973 else
48974 return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2);
48975 }
48976
48977 /* Return true if SEQ is a good candidate as a replacement for the
48978 if-convertible sequence described in IF_INFO. */
48979
48980 static bool
48981 ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
48982 {
48983 if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p)
48984 {
48985 int cmov_cnt = 0;
48986 /* Punt if SEQ contains more than one CMOV or FCMOV instruction.
48987 Maybe we should allow even more conditional moves as long as they
48988 are used far enough not to stall the CPU, or also consider
48989 IF_INFO->TEST_BB succ edge probabilities. */
48990 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
48991 {
48992 rtx set = single_set (insn);
48993 if (!set)
48994 continue;
48995 if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE)
48996 continue;
48997 rtx src = SET_SRC (set);
48998 machine_mode mode = GET_MODE (src);
48999 if (GET_MODE_CLASS (mode) != MODE_INT
49000 && GET_MODE_CLASS (mode) != MODE_FLOAT)
49001 continue;
49002 if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1)))
49003 || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2))))
49004 continue;
49005 /* insn is CMOV or FCMOV. */
49006 if (++cmov_cnt > 1)
49007 return false;
49008 }
49009 }
49010 return default_noce_conversion_profitable_p (seq, if_info);
49011 }
49012
49013 /* Implement targetm.vectorize.init_cost. */
49014
49015 static void *
49016 ix86_init_cost (struct loop *)
49017 {
49018 unsigned *cost = XNEWVEC (unsigned, 3);
49019 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
49020 return cost;
49021 }
49022
49023 /* Implement targetm.vectorize.add_stmt_cost. */
49024
49025 static unsigned
49026 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
49027 struct _stmt_vec_info *stmt_info, int misalign,
49028 enum vect_cost_model_location where)
49029 {
49030 unsigned *cost = (unsigned *) data;
49031 unsigned retval = 0;
49032
49033 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
49034 int stmt_cost = -1;
49035
49036 if ((kind == vector_stmt || kind == scalar_stmt)
49037 && stmt_info
49038 && stmt_info->stmt && gimple_code (stmt_info->stmt) == GIMPLE_ASSIGN)
49039 {
49040 tree_code subcode = gimple_assign_rhs_code (stmt_info->stmt);
49041 bool fp = false;
49042 machine_mode mode = TImode;
49043
49044 if (vectype != NULL)
49045 {
49046 fp = FLOAT_TYPE_P (vectype);
49047 mode = TYPE_MODE (vectype);
49048 }
49049 /*machine_mode inner_mode = mode;
49050 if (VECTOR_MODE_P (mode))
49051 inner_mode = GET_MODE_INNER (mode);*/
49052
49053 switch (subcode)
49054 {
49055 case PLUS_EXPR:
49056 case POINTER_PLUS_EXPR:
49057 case MINUS_EXPR:
49058 if (kind == scalar_stmt)
49059 {
49060 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49061 stmt_cost = ix86_cost->addss;
49062 else if (X87_FLOAT_MODE_P (mode))
49063 stmt_cost = ix86_cost->fadd;
49064 else
49065 stmt_cost = ix86_cost->add;
49066 }
49067 else
49068 stmt_cost = ix86_vec_cost (mode,
49069 fp ? ix86_cost->addss
49070 : ix86_cost->sse_op,
49071 true);
49072 break;
49073
49074 case MULT_EXPR:
49075 case WIDEN_MULT_EXPR:
49076 case MULT_HIGHPART_EXPR:
49077 stmt_cost = ix86_multiplication_cost (ix86_cost, mode);
49078 break;
49079 case FMA_EXPR:
49080 stmt_cost = ix86_vec_cost (mode,
49081 mode == SFmode ? ix86_cost->fmass
49082 : ix86_cost->fmasd,
49083 true);
49084 break;
49085 case NEGATE_EXPR:
49086 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49087 stmt_cost = ix86_cost->sse_op;
49088 else if (X87_FLOAT_MODE_P (mode))
49089 stmt_cost = ix86_cost->fchs;
49090 else if (VECTOR_MODE_P (mode))
49091 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49092 else
49093 stmt_cost = ix86_cost->add;
49094 break;
49095 case TRUNC_DIV_EXPR:
49096 case CEIL_DIV_EXPR:
49097 case FLOOR_DIV_EXPR:
49098 case ROUND_DIV_EXPR:
49099 case TRUNC_MOD_EXPR:
49100 case CEIL_MOD_EXPR:
49101 case FLOOR_MOD_EXPR:
49102 case RDIV_EXPR:
49103 case ROUND_MOD_EXPR:
49104 case EXACT_DIV_EXPR:
49105 stmt_cost = ix86_division_cost (ix86_cost, mode);
49106 break;
49107
49108 case RSHIFT_EXPR:
49109 case LSHIFT_EXPR:
49110 case LROTATE_EXPR:
49111 case RROTATE_EXPR:
49112 {
49113 tree op2 = gimple_assign_rhs2 (stmt_info->stmt);
49114 stmt_cost = ix86_shift_rotate_cost
49115 (ix86_cost, mode,
49116 TREE_CODE (op2) == INTEGER_CST,
49117 cst_and_fits_in_hwi (op2) ? int_cst_value (op2) : -1,
49118 true, false, false, NULL, NULL);
49119 }
49120 break;
49121 case NOP_EXPR:
49122 stmt_cost = 0;
49123 break;
49124
49125 case BIT_IOR_EXPR:
49126 case ABS_EXPR:
49127 case MIN_EXPR:
49128 case MAX_EXPR:
49129 case BIT_XOR_EXPR:
49130 case BIT_AND_EXPR:
49131 case BIT_NOT_EXPR:
49132 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
49133 stmt_cost = ix86_cost->sse_op;
49134 else if (VECTOR_MODE_P (mode))
49135 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op, true);
49136 else
49137 stmt_cost = ix86_cost->add;
49138 break;
49139 default:
49140 break;
49141 }
49142 }
49143 if (stmt_cost == -1)
49144 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
49145
49146 /* Penalize DFmode vector operations for Bonnell. */
49147 if (TARGET_BONNELL && kind == vector_stmt
49148 && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
49149 stmt_cost *= 5; /* FIXME: The value here is arbitrary. */
49150
49151 /* Statements in an inner loop relative to the loop being
49152 vectorized are weighted more heavily. The value here is
49153 arbitrary and could potentially be improved with analysis. */
49154 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
49155 count *= 50; /* FIXME. */
49156
49157 retval = (unsigned) (count * stmt_cost);
49158
49159 /* We need to multiply the cost of all vector stmts by 1.7 (estimated cost)
49160 for Silvermont, as it has an out-of-order integer pipeline and can execute
49161 2 scalar instructions per tick, but has an in-order SIMD pipeline. */
49162 if ((TARGET_SILVERMONT || TARGET_INTEL)
49163 && stmt_info && stmt_info->stmt)
49164 {
49165 tree lhs_op = gimple_get_lhs (stmt_info->stmt);
49166 if (lhs_op && TREE_CODE (TREE_TYPE (lhs_op)) == INTEGER_TYPE)
49167 retval = (retval * 17) / 10;
49168 }
49169
49170 cost[where] += retval;
49171
49172 return retval;
49173 }
49174
49175 /* Implement targetm.vectorize.finish_cost. */
49176
49177 static void
49178 ix86_finish_cost (void *data, unsigned *prologue_cost,
49179 unsigned *body_cost, unsigned *epilogue_cost)
49180 {
49181 unsigned *cost = (unsigned *) data;
49182 *prologue_cost = cost[vect_prologue];
49183 *body_cost = cost[vect_body];
49184 *epilogue_cost = cost[vect_epilogue];
49185 }
49186
49187 /* Implement targetm.vectorize.destroy_cost_data. */
49188
49189 static void
49190 ix86_destroy_cost_data (void *data)
49191 {
49192 free (data);
49193 }
49194
49195 /* Validate target specific memory model bits in VAL. */
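/* For example, a call such as
     __atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE)
   reaches this hook with the IX86_HLE_ACQUIRE bit set on top of the base
   acquire model; inconsistent combinations are diagnosed below and demoted
   to MEMMODEL_SEQ_CST.  */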
49196
49197 static unsigned HOST_WIDE_INT
49198 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
49199 {
49200 enum memmodel model = memmodel_from_int (val);
49201 bool strong;
49202
49203 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
49204 |MEMMODEL_MASK)
49205 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
49206 {
49207 warning (OPT_Winvalid_memory_model,
49208 "unknown architecture specific memory model");
49209 return MEMMODEL_SEQ_CST;
49210 }
49211 strong = (is_mm_acq_rel (model) || is_mm_seq_cst (model));
49212 if (val & IX86_HLE_ACQUIRE && !(is_mm_acquire (model) || strong))
49213 {
49214 warning (OPT_Winvalid_memory_model,
49215 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
49216 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
49217 }
49218 if (val & IX86_HLE_RELEASE && !(is_mm_release (model) || strong))
49219 {
49220 warning (OPT_Winvalid_memory_model,
49221 "HLE_RELEASE not used with RELEASE or stronger memory model");
49222 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
49223 }
49224 return val;
49225 }
49226
49227 /* Set CLONEI->vecsize_mangle, CLONEI->mask_mode, CLONEI->vecsize_int,
49228 CLONEI->vecsize_float, and, if CLONEI->simdlen is 0, also
49229 CLONEI->simdlen. Return 0 if SIMD clones shouldn't be emitted,
49230 or the number of vecsize_mangle variants that should be emitted. */
49231
49232 static int
49233 ix86_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
49234 struct cgraph_simd_clone *clonei,
49235 tree base_type, int num)
49236 {
49237 int ret = 1;
49238
49239 if (clonei->simdlen
49240 && (clonei->simdlen < 2
49241 || clonei->simdlen > 1024
49242 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
49243 {
49244 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49245 "unsupported simdlen %d", clonei->simdlen);
49246 return 0;
49247 }
49248
49249 tree ret_type = TREE_TYPE (TREE_TYPE (node->decl));
49250 if (TREE_CODE (ret_type) != VOID_TYPE)
49251 switch (TYPE_MODE (ret_type))
49252 {
49253 case E_QImode:
49254 case E_HImode:
49255 case E_SImode:
49256 case E_DImode:
49257 case E_SFmode:
49258 case E_DFmode:
49259 /* case E_SCmode: */
49260 /* case E_DCmode: */
49261 break;
49262 default:
49263 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49264 "unsupported return type %qT for simd\n", ret_type);
49265 return 0;
49266 }
49267
49268 tree t;
49269 int i;
49270
49271 for (t = DECL_ARGUMENTS (node->decl), i = 0; t; t = DECL_CHAIN (t), i++)
49272 /* FIXME: Shouldn't we allow such arguments if they are uniform? */
49273 switch (TYPE_MODE (TREE_TYPE (t)))
49274 {
49275 case E_QImode:
49276 case E_HImode:
49277 case E_SImode:
49278 case E_DImode:
49279 case E_SFmode:
49280 case E_DFmode:
49281 /* case E_SCmode: */
49282 /* case E_DCmode: */
49283 break;
49284 default:
49285 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49286 "unsupported argument type %qT for simd\n", TREE_TYPE (t));
49287 return 0;
49288 }
49289
49290 if (clonei->cilk_elemental)
49291 {
49292 /* Parse the processor clause here. If not present, default to 'b'. */
49293 clonei->vecsize_mangle = 'b';
49294 }
49295 else if (!TREE_PUBLIC (node->decl))
49296 {
49297 /* If the function isn't exported, we can pick just one ISA
49298 for the clones. */
49299 if (TARGET_AVX512F)
49300 clonei->vecsize_mangle = 'e';
49301 else if (TARGET_AVX2)
49302 clonei->vecsize_mangle = 'd';
49303 else if (TARGET_AVX)
49304 clonei->vecsize_mangle = 'c';
49305 else
49306 clonei->vecsize_mangle = 'b';
49307 ret = 1;
49308 }
49309 else
49310 {
49311 clonei->vecsize_mangle = "bcde"[num];
49312 ret = 4;
49313 }
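  /* The mangle letters correspond to ISA levels: 'b' is SSE2, 'c' is AVX,
     'd' is AVX2 and 'e' is AVX512F, matching ix86_simd_clone_adjust and
     ix86_simd_clone_usable below.  */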
49314 clonei->mask_mode = VOIDmode;
49315 switch (clonei->vecsize_mangle)
49316 {
49317 case 'b':
49318 clonei->vecsize_int = 128;
49319 clonei->vecsize_float = 128;
49320 break;
49321 case 'c':
49322 clonei->vecsize_int = 128;
49323 clonei->vecsize_float = 256;
49324 break;
49325 case 'd':
49326 clonei->vecsize_int = 256;
49327 clonei->vecsize_float = 256;
49328 break;
49329 case 'e':
49330 clonei->vecsize_int = 512;
49331 clonei->vecsize_float = 512;
49332 if (TYPE_MODE (base_type) == QImode)
49333 clonei->mask_mode = DImode;
49334 else
49335 clonei->mask_mode = SImode;
49336 break;
49337 }
49338 if (clonei->simdlen == 0)
49339 {
49340 if (SCALAR_INT_MODE_P (TYPE_MODE (base_type)))
49341 clonei->simdlen = clonei->vecsize_int;
49342 else
49343 clonei->simdlen = clonei->vecsize_float;
49344 clonei->simdlen /= GET_MODE_BITSIZE (TYPE_MODE (base_type));
49345 }
49346 else if (clonei->simdlen > 16)
49347 {
49348 /* For compatibility with ICC, use the same upper bounds
49349 for simdlen. In particular, for CTYPE below, use the return type,
49350 unless the function returns void, in which case use the characteristic
49351 type. If it is possible for the given SIMDLEN to pass a CTYPE value
49352 in registers (8 [XYZ]MM* regs for 32-bit code, 16 [XYZ]MM* regs
49353 for 64-bit code), accept that SIMDLEN, otherwise warn and don't
49354 emit the corresponding clone. */
49355 tree ctype = ret_type;
49356 if (TREE_CODE (ret_type) == VOID_TYPE)
49357 ctype = base_type;
49358 int cnt = GET_MODE_BITSIZE (TYPE_MODE (ctype)) * clonei->simdlen;
49359 if (SCALAR_INT_MODE_P (TYPE_MODE (ctype)))
49360 cnt /= clonei->vecsize_int;
49361 else
49362 cnt /= clonei->vecsize_float;
49363 if (cnt > (TARGET_64BIT ? 16 : 8))
49364 {
49365 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
49366 "unsupported simdlen %d", clonei->simdlen);
49367 return 0;
49368 }
49369 }
49370 return ret;
49371 }
49372
49373 /* Add target attribute to SIMD clone NODE if needed. */
49374
49375 static void
49376 ix86_simd_clone_adjust (struct cgraph_node *node)
49377 {
49378 const char *str = NULL;
49379 gcc_assert (node->decl == cfun->decl);
49380 switch (node->simdclone->vecsize_mangle)
49381 {
49382 case 'b':
49383 if (!TARGET_SSE2)
49384 str = "sse2";
49385 break;
49386 case 'c':
49387 if (!TARGET_AVX)
49388 str = "avx";
49389 break;
49390 case 'd':
49391 if (!TARGET_AVX2)
49392 str = "avx2";
49393 break;
49394 case 'e':
49395 if (!TARGET_AVX512F)
49396 str = "avx512f";
49397 break;
49398 default:
49399 gcc_unreachable ();
49400 }
49401 if (str == NULL)
49402 return;
49403 push_cfun (NULL);
49404 tree args = build_tree_list (NULL_TREE, build_string (strlen (str), str));
49405 bool ok = ix86_valid_target_attribute_p (node->decl, NULL, args, 0);
49406 gcc_assert (ok);
49407 pop_cfun ();
49408 ix86_reset_previous_fndecl ();
49409 ix86_set_current_function (node->decl);
49410 }
49411
49412 /* If SIMD clone NODE can't be used in a vectorized loop
49413 in the current function, return -1, otherwise return the badness of using it
49414 (0 if it is the most desirable from the vecsize_mangle point of view, 1
49415 slightly less desirable, etc.). */
49416
49417 static int
49418 ix86_simd_clone_usable (struct cgraph_node *node)
49419 {
49420 switch (node->simdclone->vecsize_mangle)
49421 {
49422 case 'b':
49423 if (!TARGET_SSE2)
49424 return -1;
49425 if (!TARGET_AVX)
49426 return 0;
49427 return TARGET_AVX2 ? 2 : 1;
49428 case 'c':
49429 if (!TARGET_AVX)
49430 return -1;
49431 return TARGET_AVX2 ? 1 : 0;
49432 case 'd':
49433 if (!TARGET_AVX2)
49434 return -1;
49435 return 0;
49436 case 'e':
49437 if (!TARGET_AVX512F)
49438 return -1;
49439 return 0;
49440 default:
49441 gcc_unreachable ();
49442 }
49443 }
49444
49445 /* This function adjusts the unroll factor based on
49446 the hardware capabilities. For example, bdver3 has
49447 a loop buffer which makes unrolling of smaller
49448 loops less important. This function decides the
49449 unroll factor using the number of memory references
49450 (the value 32 is used as the cap) as a heuristic. */
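/* For example, a loop body containing 8 word-sized memory references would
   get an unroll factor of 32 / 8 = 4, while a body with more than 32
   references keeps the unroll factor chosen by the generic code.  */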
49451
49452 static unsigned
49453 ix86_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
49454 {
49455 basic_block *bbs;
49456 rtx_insn *insn;
49457 unsigned i;
49458 unsigned mem_count = 0;
49459
49460 if (!TARGET_ADJUST_UNROLL)
49461 return nunroll;
49462
49463 /* Count the number of memory references within the loop body.
49464 This value determines the unrolling factor for bdver3 and bdver4
49465 architectures. */
49466 subrtx_iterator::array_type array;
49467 bbs = get_loop_body (loop);
49468 for (i = 0; i < loop->num_nodes; i++)
49469 FOR_BB_INSNS (bbs[i], insn)
49470 if (NONDEBUG_INSN_P (insn))
49471 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
49472 if (const_rtx x = *iter)
49473 if (MEM_P (x))
49474 {
49475 machine_mode mode = GET_MODE (x);
49476 unsigned int n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
49477 if (n_words > 4)
49478 mem_count += 2;
49479 else
49480 mem_count += 1;
49481 }
49482 free (bbs);
49483
49484 if (mem_count && mem_count <= 32)
49485 return 32 / mem_count;
49486
49487 return nunroll;
49488 }
49489
49490
49491 /* Implement TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P. */
49492
49493 static bool
49494 ix86_float_exceptions_rounding_supported_p (void)
49495 {
49496 /* For x87 floating point with standard excess precision handling,
49497 there is no adddf3 pattern (since x87 floating point only has
49498 XFmode operations) so the default hook implementation gets this
49499 wrong. */
49500 return TARGET_80387 || TARGET_SSE_MATH;
49501 }
49502
49503 /* Implement TARGET_ATOMIC_ASSIGN_EXPAND_FENV. */
49504
49505 static void
49506 ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
49507 {
49508 if (!TARGET_80387 && !TARGET_SSE_MATH)
49509 return;
49510 tree exceptions_var = create_tmp_var_raw (integer_type_node);
49511 if (TARGET_80387)
49512 {
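      /* fnstenv with a 32-bit operand size stores a 28-byte environment,
	 i.e. seven 32-bit words, hence the 0..6 index type below.  */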
49513 tree fenv_index_type = build_index_type (size_int (6));
49514 tree fenv_type = build_array_type (unsigned_type_node, fenv_index_type);
49515 tree fenv_var = create_tmp_var_raw (fenv_type);
49516 TREE_ADDRESSABLE (fenv_var) = 1;
49517 tree fenv_ptr = build_pointer_type (fenv_type);
49518 tree fenv_addr = build1 (ADDR_EXPR, fenv_ptr, fenv_var);
49519 fenv_addr = fold_convert (ptr_type_node, fenv_addr);
49520 tree fnstenv = ix86_builtins[IX86_BUILTIN_FNSTENV];
49521 tree fldenv = ix86_builtins[IX86_BUILTIN_FLDENV];
49522 tree fnstsw = ix86_builtins[IX86_BUILTIN_FNSTSW];
49523 tree fnclex = ix86_builtins[IX86_BUILTIN_FNCLEX];
49524 tree hold_fnstenv = build_call_expr (fnstenv, 1, fenv_addr);
49525 tree hold_fnclex = build_call_expr (fnclex, 0);
49526 fenv_var = build4 (TARGET_EXPR, fenv_type, fenv_var, hold_fnstenv,
49527 NULL_TREE, NULL_TREE);
49528 *hold = build2 (COMPOUND_EXPR, void_type_node, fenv_var,
49529 hold_fnclex);
49530 *clear = build_call_expr (fnclex, 0);
49531 tree sw_var = create_tmp_var_raw (short_unsigned_type_node);
49532 tree fnstsw_call = build_call_expr (fnstsw, 0);
49533 tree sw_mod = build2 (MODIFY_EXPR, short_unsigned_type_node,
49534 sw_var, fnstsw_call);
49535 tree exceptions_x87 = fold_convert (integer_type_node, sw_var);
49536 tree update_mod = build2 (MODIFY_EXPR, integer_type_node,
49537 exceptions_var, exceptions_x87);
49538 *update = build2 (COMPOUND_EXPR, integer_type_node,
49539 sw_mod, update_mod);
49540 tree update_fldenv = build_call_expr (fldenv, 1, fenv_addr);
49541 *update = build2 (COMPOUND_EXPR, void_type_node, *update, update_fldenv);
49542 }
49543 if (TARGET_SSE_MATH)
49544 {
49545 tree mxcsr_orig_var = create_tmp_var_raw (unsigned_type_node);
49546 tree mxcsr_mod_var = create_tmp_var_raw (unsigned_type_node);
49547 tree stmxcsr = ix86_builtins[IX86_BUILTIN_STMXCSR];
49548 tree ldmxcsr = ix86_builtins[IX86_BUILTIN_LDMXCSR];
49549 tree stmxcsr_hold_call = build_call_expr (stmxcsr, 0);
49550 tree hold_assign_orig = build2 (MODIFY_EXPR, unsigned_type_node,
49551 mxcsr_orig_var, stmxcsr_hold_call);
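      /* Bits 7-12 of MXCSR are the exception mask bits (0x1f80) and bits 0-5
	 are the exception flag bits, so the held value masks all exceptions
	 and clears any pending flags.  */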
49552 tree hold_mod_val = build2 (BIT_IOR_EXPR, unsigned_type_node,
49553 mxcsr_orig_var,
49554 build_int_cst (unsigned_type_node, 0x1f80));
49555 hold_mod_val = build2 (BIT_AND_EXPR, unsigned_type_node, hold_mod_val,
49556 build_int_cst (unsigned_type_node, 0xffffffc0));
49557 tree hold_assign_mod = build2 (MODIFY_EXPR, unsigned_type_node,
49558 mxcsr_mod_var, hold_mod_val);
49559 tree ldmxcsr_hold_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49560 tree hold_all = build2 (COMPOUND_EXPR, unsigned_type_node,
49561 hold_assign_orig, hold_assign_mod);
49562 hold_all = build2 (COMPOUND_EXPR, void_type_node, hold_all,
49563 ldmxcsr_hold_call);
49564 if (*hold)
49565 *hold = build2 (COMPOUND_EXPR, void_type_node, *hold, hold_all);
49566 else
49567 *hold = hold_all;
49568 tree ldmxcsr_clear_call = build_call_expr (ldmxcsr, 1, mxcsr_mod_var);
49569 if (*clear)
49570 *clear = build2 (COMPOUND_EXPR, void_type_node, *clear,
49571 ldmxcsr_clear_call);
49572 else
49573 *clear = ldmxcsr_clear_call;
49574 tree stxmcsr_update_call = build_call_expr (stmxcsr, 0);
49575 tree exceptions_sse = fold_convert (integer_type_node,
49576 stxmcsr_update_call);
49577 if (*update)
49578 {
49579 tree exceptions_mod = build2 (BIT_IOR_EXPR, integer_type_node,
49580 exceptions_var, exceptions_sse);
49581 tree exceptions_assign = build2 (MODIFY_EXPR, integer_type_node,
49582 exceptions_var, exceptions_mod);
49583 *update = build2 (COMPOUND_EXPR, integer_type_node, *update,
49584 exceptions_assign);
49585 }
49586 else
49587 *update = build2 (MODIFY_EXPR, integer_type_node,
49588 exceptions_var, exceptions_sse);
49589 tree ldmxcsr_update_call = build_call_expr (ldmxcsr, 1, mxcsr_orig_var);
49590 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49591 ldmxcsr_update_call);
49592 }
49593 tree atomic_feraiseexcept
49594 = builtin_decl_implicit (BUILT_IN_ATOMIC_FERAISEEXCEPT);
49595 tree atomic_feraiseexcept_call = build_call_expr (atomic_feraiseexcept,
49596 1, exceptions_var);
49597 *update = build2 (COMPOUND_EXPR, void_type_node, *update,
49598 atomic_feraiseexcept_call);
49599 }
49600
49601 /* Return the mode to be used for bounds, or VOIDmode
49602 if bounds are not supported. */
49603
49604 static machine_mode
49605 ix86_mpx_bound_mode ()
49606 {
49607 /* Do not support pointer checker if MPX
49608 is not enabled. */
49609 if (!TARGET_MPX)
49610 {
49611 if (flag_check_pointer_bounds)
49612 warning (0, "Pointer Checker requires MPX support on this target."
49613 " Use -mmpx options to enable MPX.");
49614 return VOIDmode;
49615 }
49616
49617 return BNDmode;
49618 }
49619
49620 /* Return a constant used to statically initialize constant bounds.
49621
49622 This function is used to create special bound values. For now
49623 only INIT bounds and NONE bounds are expected. More special
49624 values may be added later. */
49625
49626 static tree
49627 ix86_make_bounds_constant (HOST_WIDE_INT lb, HOST_WIDE_INT ub)
49628 {
49629 tree low = lb ? build_minus_one_cst (pointer_sized_int_node)
49630 : build_zero_cst (pointer_sized_int_node);
49631 tree high = ub ? build_zero_cst (pointer_sized_int_node)
49632 : build_minus_one_cst (pointer_sized_int_node);
49633
49634 /* This function is supposed to be used to create INIT and
49635 NONE bounds only. */
49636 gcc_assert ((lb == 0 && ub == -1)
49637 || (lb == -1 && ub == 0));
49638
49639 return build_complex (NULL, low, high);
49640 }
49641
49642 /* Generate a list of statements STMTS to initialize pointer bounds
49643 variable VAR with bounds LB and UB. Return the number of generated
49644 statements. */
49645
49646 static int
49647 ix86_initialize_bounds (tree var, tree lb, tree ub, tree *stmts)
49648 {
49649 tree bnd_ptr = build_pointer_type (pointer_sized_int_node);
49650 tree lhs, modify, var_p;
49651
49652 ub = build1 (BIT_NOT_EXPR, pointer_sized_int_node, ub);
49653 var_p = fold_convert (bnd_ptr, build_fold_addr_expr (var));
49654
49655 lhs = build1 (INDIRECT_REF, pointer_sized_int_node, var_p);
49656 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, lb);
49657 append_to_statement_list (modify, stmts);
49658
49659 lhs = build1 (INDIRECT_REF, pointer_sized_int_node,
49660 build2 (POINTER_PLUS_EXPR, bnd_ptr, var_p,
49661 TYPE_SIZE_UNIT (pointer_sized_int_node)));
49662 modify = build2 (MODIFY_EXPR, TREE_TYPE (lhs), lhs, ub);
49663 append_to_statement_list (modify, stmts);
49664
49665 return 2;
49666 }
49667
49668 #if !TARGET_MACHO && !TARGET_DLLIMPORT_DECL_ATTRIBUTES
49669 /* For i386, a common symbol is local only for non-PIE binaries. For
49670 x86-64, a common symbol is local only for non-PIE binaries or when the
49671 linker supports copy relocations in PIE binaries. */
49672
49673 static bool
49674 ix86_binds_local_p (const_tree exp)
49675 {
49676 return default_binds_local_p_3 (exp, flag_shlib != 0, true, true,
49677 (!flag_pic
49678 || (TARGET_64BIT
49679 && HAVE_LD_PIE_COPYRELOC != 0)));
49680 }
49681 #endif
49682
49683 /* If MEM is in the form of [base+offset], extract the two parts
49684 of the address into BASE and OFFSET, otherwise return false. */
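/* For example, (mem (plus (reg) (const_int 8))) yields the register as BASE
   and (const_int 8) as OFFSET, while a bare register or symbol_ref address
   yields const0_rtx as OFFSET.  */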
49685
49686 static bool
49687 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
49688 {
49689 rtx addr;
49690
49691 gcc_assert (MEM_P (mem));
49692
49693 addr = XEXP (mem, 0);
49694
49695 if (GET_CODE (addr) == CONST)
49696 addr = XEXP (addr, 0);
49697
49698 if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
49699 {
49700 *base = addr;
49701 *offset = const0_rtx;
49702 return true;
49703 }
49704
49705 if (GET_CODE (addr) == PLUS
49706 && (REG_P (XEXP (addr, 0))
49707 || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
49708 && CONST_INT_P (XEXP (addr, 1)))
49709 {
49710 *base = XEXP (addr, 0);
49711 *offset = XEXP (addr, 1);
49712 return true;
49713 }
49714
49715 return false;
49716 }
49717
49718 /* Given OPERANDS of consecutive load/store instructions, check if we can
49719 merge them into a move-multiple. LOAD is true if they are load instructions.
49720 MODE is the mode of the memory operands. */
49721
49722 bool
49723 ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
49724 machine_mode mode)
49725 {
49726 HOST_WIDE_INT offval_1, offval_2, msize;
49727 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
49728
49729 if (load)
49730 {
49731 mem_1 = operands[1];
49732 mem_2 = operands[3];
49733 reg_1 = operands[0];
49734 reg_2 = operands[2];
49735 }
49736 else
49737 {
49738 mem_1 = operands[0];
49739 mem_2 = operands[2];
49740 reg_1 = operands[1];
49741 reg_2 = operands[3];
49742 }
49743
49744 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
49745
49746 if (REGNO (reg_1) != REGNO (reg_2))
49747 return false;
49748
49749 /* Check if the addresses are in the form of [base+offset]. */
49750 if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
49751 return false;
49752 if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
49753 return false;
49754
49755 /* Check if the bases are the same. */
49756 if (!rtx_equal_p (base_1, base_2))
49757 return false;
49758
49759 offval_1 = INTVAL (offset_1);
49760 offval_2 = INTVAL (offset_2);
49761 msize = GET_MODE_SIZE (mode);
49762 /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address. */
49763 if (offval_1 + msize != offval_2)
49764 return false;
49765
49766 return true;
49767 }
49768
49769 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
49770
49771 static bool
49772 ix86_optab_supported_p (int op, machine_mode mode1, machine_mode,
49773 optimization_type opt_type)
49774 {
49775 switch (op)
49776 {
49777 case asin_optab:
49778 case acos_optab:
49779 case log1p_optab:
49780 case exp_optab:
49781 case exp10_optab:
49782 case exp2_optab:
49783 case expm1_optab:
49784 case ldexp_optab:
49785 case scalb_optab:
49786 case round_optab:
49787 return opt_type == OPTIMIZE_FOR_SPEED;
49788
49789 case rint_optab:
49790 if (SSE_FLOAT_MODE_P (mode1)
49791 && TARGET_SSE_MATH
49792 && !flag_trapping_math
49793 && !TARGET_SSE4_1)
49794 return opt_type == OPTIMIZE_FOR_SPEED;
49795 return true;
49796
49797 case floor_optab:
49798 case ceil_optab:
49799 case btrunc_optab:
49800 if (SSE_FLOAT_MODE_P (mode1)
49801 && TARGET_SSE_MATH
49802 && !flag_trapping_math
49803 && TARGET_SSE4_1)
49804 return true;
49805 return opt_type == OPTIMIZE_FOR_SPEED;
49806
49807 case rsqrt_optab:
49808 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
49809
49810 default:
49811 return true;
49812 }
49813 }
49814
49815 /* Address space support.
49816
49817 This is not "far pointers" in the 16-bit sense, but an easy way
49818 to use %fs and %gs segment prefixes. Therefore:
49819
49820 (a) All address spaces have the same modes,
49821 (b) All address spaces have the same address forms,
49822 (c) While %fs and %gs are technically subsets of the generic
49823 address space, they are probably not subsets of each other.
49824 (d) Since we have no access to the segment base register values
49825 without resorting to a system call, we cannot convert a
49826 non-default address space to a default address space.
49827 Therefore we do not claim %fs or %gs are subsets of generic.
49828
49829 Therefore we can (mostly) use the default hooks. */
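/* In C sources these address spaces are exposed through the __seg_fs and
   __seg_gs named address space qualifiers documented in the GCC manual.  */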
49830
49831 /* All use of segmentation is assumed to make address 0 valid. */
49832
49833 static bool
49834 ix86_addr_space_zero_address_valid (addr_space_t as)
49835 {
49836 return as != ADDR_SPACE_GENERIC;
49837 }
49838
49839 static void
49840 ix86_init_libfuncs (void)
49841 {
49842 if (TARGET_64BIT)
49843 {
49844 set_optab_libfunc (sdivmod_optab, TImode, "__divmodti4");
49845 set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
49846 }
49847 else
49848 {
49849 set_optab_libfunc (sdivmod_optab, DImode, "__divmoddi4");
49850 set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
49851 }
49852
49853 #if TARGET_MACHO
49854 darwin_rename_builtins ();
49855 #endif
49856 }
49857
49858 /* Generate a call to a divmod libfunc such as __divmoddi4; return the quotient in *QUOT_P and the remainder in *REM_P. */
49859
49860 static void
49861 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
49862 rtx op0, rtx op1,
49863 rtx *quot_p, rtx *rem_p)
49864 {
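  /* The libfunc returns the quotient as its value and stores the remainder
     through the pointer passed as the last argument, so hand it the address
     of a stack temporary.  */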
49865 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
49866
49867 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
49868 mode,
49869 op0, GET_MODE (op0),
49870 op1, GET_MODE (op1),
49871 XEXP (rem, 0), Pmode);
49872 *quot_p = quot;
49873 *rem_p = rem;
49874 }
49875
49876 /* Set the value of FLT_EVAL_METHOD in float.h. When using only the
49877 FPU, assume that the fpcw is set to extended precision; when using
49878 only SSE, rounding is correct; when using both SSE and the FPU,
49879 the rounding precision is indeterminate, since either may be chosen
49880 apparently at random. */
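/* For example, under this scheme x87-only math in a standards-compliant mode
   yields FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE, whereas SSE2 math introduces
   no excess precision and yields FLT_EVAL_METHOD_PROMOTE_TO_FLOAT.  */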
49881
49882 static enum flt_eval_method
49883 ix86_excess_precision (enum excess_precision_type type)
49884 {
49885 switch (type)
49886 {
49887 case EXCESS_PRECISION_TYPE_FAST:
49888 /* The fastest type to promote to will always be the native type,
49889 whether that occurs with implicit excess precision or
49890 otherwise. */
49891 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49892 case EXCESS_PRECISION_TYPE_STANDARD:
49893 case EXCESS_PRECISION_TYPE_IMPLICIT:
49894 /* Otherwise, the excess precision we want when we are
49895 in a standards-compliant mode and the implicit precision we
49896 provide would be identical, were it not for the unpredictable
49897 cases. */
49898 if (!TARGET_80387)
49899 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49900 else if (!TARGET_MIX_SSE_I387)
49901 {
49902 if (!TARGET_SSE_MATH)
49903 return FLT_EVAL_METHOD_PROMOTE_TO_LONG_DOUBLE;
49904 else if (TARGET_SSE2)
49905 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT;
49906 }
49907
49908 /* If we are in standards compliant mode, but we know we will
49909 calculate in unpredictable precision, return
49910 FLT_EVAL_METHOD_FLOAT. There is no reason to introduce explicit
49911 excess precision if the target can't guarantee it will honor
49912 it. */
49913 return (type == EXCESS_PRECISION_TYPE_STANDARD
49914 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT
49915 : FLT_EVAL_METHOD_UNPREDICTABLE);
49916 default:
49917 gcc_unreachable ();
49918 }
49919
49920 return FLT_EVAL_METHOD_UNPREDICTABLE;
49921 }
49922
49923 /* Target-specific selftests. */
49924
49925 #if CHECKING_P
49926
49927 namespace selftest {
49928
49929 /* Verify that hard regs are dumped as expected (in compact mode). */
49930
49931 static void
49932 ix86_test_dumping_hard_regs ()
49933 {
49934 ASSERT_RTL_DUMP_EQ ("(reg:SI ax)", gen_raw_REG (SImode, 0));
49935 ASSERT_RTL_DUMP_EQ ("(reg:SI dx)", gen_raw_REG (SImode, 1));
49936 }
49937
49938 /* Test dumping an insn with repeated references to the same SCRATCH,
49939 to verify the rtx_reuse code. */
49940
49941 static void
49942 ix86_test_dumping_memory_blockage ()
49943 {
49944 set_new_first_and_last_insn (NULL, NULL);
49945
49946 rtx pat = gen_memory_blockage ();
49947 rtx_reuse_manager r;
49948 r.preprocess (pat);
49949
49950 /* Verify that the repeated references to the SCRATCH are dumped using
49951 reuse IDs. The first should be prefixed with a reuse ID,
49952 and the second should be dumped as a "reuse_rtx" of that ID.
49953 The expected string assumes Pmode == DImode. */
49954 if (Pmode == DImode)
49955 ASSERT_RTL_DUMP_EQ_WITH_REUSE
49956 ("(cinsn 1 (set (mem/v:BLK (0|scratch:DI) [0 A8])\n"
49957 " (unspec:BLK [\n"
49958 " (mem/v:BLK (reuse_rtx 0) [0 A8])\n"
49959 " ] UNSPEC_MEMORY_BLOCKAGE)))\n", pat, &r);
49960 }
49961
49962 /* Verify loading an RTL dump; specifically a dump of copying
49963 a param on x86_64 from a hard reg into the frame.
49964 This test is target-specific since the dump contains target-specific
49965 hard reg names. */
49966
49967 static void
49968 ix86_test_loading_dump_fragment_1 ()
49969 {
49970 rtl_dump_test t (SELFTEST_LOCATION,
49971 locate_file ("x86_64/copy-hard-reg-into-frame.rtl"));
49972
49973 rtx_insn *insn = get_insn_by_uid (1);
49974
49975 /* The block structure and indentation here is purely for
49976 readability; it mirrors the structure of the rtx. */
49977 tree mem_expr;
49978 {
49979 rtx pat = PATTERN (insn);
49980 ASSERT_EQ (SET, GET_CODE (pat));
49981 {
49982 rtx dest = SET_DEST (pat);
49983 ASSERT_EQ (MEM, GET_CODE (dest));
49984 /* Verify the "/c" was parsed. */
49985 ASSERT_TRUE (RTX_FLAG (dest, call));
49986 ASSERT_EQ (SImode, GET_MODE (dest));
49987 {
49988 rtx addr = XEXP (dest, 0);
49989 ASSERT_EQ (PLUS, GET_CODE (addr));
49990 ASSERT_EQ (DImode, GET_MODE (addr));
49991 {
49992 rtx lhs = XEXP (addr, 0);
49993 /* Verify that the "frame" REG was consolidated. */
49994 ASSERT_RTX_PTR_EQ (frame_pointer_rtx, lhs);
49995 }
49996 {
49997 rtx rhs = XEXP (addr, 1);
49998 ASSERT_EQ (CONST_INT, GET_CODE (rhs));
49999 ASSERT_EQ (-4, INTVAL (rhs));
50000 }
50001 }
50002 /* Verify the "[1 i+0 S4 A32]" was parsed. */
50003 ASSERT_EQ (1, MEM_ALIAS_SET (dest));
50004 /* "i" should have been handled by synthesizing a global int
50005 variable named "i". */
50006 mem_expr = MEM_EXPR (dest);
50007 ASSERT_NE (mem_expr, NULL);
50008 ASSERT_EQ (VAR_DECL, TREE_CODE (mem_expr));
50009 ASSERT_EQ (integer_type_node, TREE_TYPE (mem_expr));
50010 ASSERT_EQ (IDENTIFIER_NODE, TREE_CODE (DECL_NAME (mem_expr)));
50011 ASSERT_STREQ ("i", IDENTIFIER_POINTER (DECL_NAME (mem_expr)));
50012 /* "+0". */
50013 ASSERT_TRUE (MEM_OFFSET_KNOWN_P (dest));
50014 ASSERT_EQ (0, MEM_OFFSET (dest));
50015 /* "S4". */
50016 ASSERT_EQ (4, MEM_SIZE (dest));
50017 /* "A32. */
50018 ASSERT_EQ (32, MEM_ALIGN (dest));
50019 }
50020 {
50021 rtx src = SET_SRC (pat);
50022 ASSERT_EQ (REG, GET_CODE (src));
50023 ASSERT_EQ (SImode, GET_MODE (src));
50024 ASSERT_EQ (5, REGNO (src));
50025 tree reg_expr = REG_EXPR (src);
50026 /* "i" here should point to the same var as for the MEM_EXPR. */
50027 ASSERT_EQ (reg_expr, mem_expr);
50028 }
50029 }
50030 }
50031
50032 /* Verify that the RTL loader copes with a call_insn dump.
50033 This test is target-specific since the dump contains a target-specific
50034 hard reg name. */
50035
50036 static void
50037 ix86_test_loading_call_insn ()
50038 {
50039 /* The test dump includes register "xmm0", which requires TARGET_SSE
50040 to exist. */
50041 if (!TARGET_SSE)
50042 return;
50043
50044 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/call-insn.rtl"));
50045
50046 rtx_insn *insn = get_insns ();
50047 ASSERT_EQ (CALL_INSN, GET_CODE (insn));
50048
50049 /* "/j". */
50050 ASSERT_TRUE (RTX_FLAG (insn, jump));
50051
50052 rtx pat = PATTERN (insn);
50053 ASSERT_EQ (CALL, GET_CODE (SET_SRC (pat)));
50054
50055 /* Verify REG_NOTES. */
50056 {
50057 /* "(expr_list:REG_CALL_DECL". */
50058 ASSERT_EQ (EXPR_LIST, GET_CODE (REG_NOTES (insn)));
50059 rtx_expr_list *note0 = as_a <rtx_expr_list *> (REG_NOTES (insn));
50060 ASSERT_EQ (REG_CALL_DECL, REG_NOTE_KIND (note0));
50061
50062 /* "(expr_list:REG_EH_REGION (const_int 0 [0])". */
50063 rtx_expr_list *note1 = note0->next ();
50064 ASSERT_EQ (REG_EH_REGION, REG_NOTE_KIND (note1));
50065
50066 ASSERT_EQ (NULL, note1->next ());
50067 }
50068
50069 /* Verify CALL_INSN_FUNCTION_USAGE. */
50070 {
50071 /* "(expr_list:DF (use (reg:DF 21 xmm0))". */
50072 rtx_expr_list *usage
50073 = as_a <rtx_expr_list *> (CALL_INSN_FUNCTION_USAGE (insn));
50074 ASSERT_EQ (EXPR_LIST, GET_CODE (usage));
50075 ASSERT_EQ (DFmode, GET_MODE (usage));
50076 ASSERT_EQ (USE, GET_CODE (usage->element ()));
50077 ASSERT_EQ (NULL, usage->next ());
50078 }
50079 }
50080
50081 /* Verify that the RTL loader copes with a dump from print_rtx_function.
50082 This test is target-specific since the dump contains target-specific
50083 hard reg names. */
50084
50085 static void
50086 ix86_test_loading_full_dump ()
50087 {
50088 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/times-two.rtl"));
50089
50090 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50091
50092 rtx_insn *insn_1 = get_insn_by_uid (1);
50093 ASSERT_EQ (NOTE, GET_CODE (insn_1));
50094
50095 rtx_insn *insn_7 = get_insn_by_uid (7);
50096 ASSERT_EQ (INSN, GET_CODE (insn_7));
50097 ASSERT_EQ (PARALLEL, GET_CODE (PATTERN (insn_7)));
50098
50099 rtx_insn *insn_15 = get_insn_by_uid (15);
50100 ASSERT_EQ (INSN, GET_CODE (insn_15));
50101 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
50102
50103 /* Verify crtl->return_rtx. */
50104 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
50105 ASSERT_EQ (0, REGNO (crtl->return_rtx));
50106 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
50107 }
50108
50109 /* Verify that the RTL loader copes with UNSPEC and UNSPEC_VOLATILE insns.
50110 In particular, verify that it correctly loads the 2nd operand.
50111 This test is target-specific since these are machine-specific
50112 operands (and enums). */
50113
50114 static void
50115 ix86_test_loading_unspec ()
50116 {
50117 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("x86_64/unspec.rtl"));
50118
50119 ASSERT_STREQ ("test_unspec", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
50120
50121 ASSERT_TRUE (cfun);
50122
50123 /* Test of an UNSPEC. */
50124 rtx_insn *insn = get_insns ();
50125 ASSERT_EQ (INSN, GET_CODE (insn));
50126 rtx set = single_set (insn);
50127 ASSERT_NE (NULL, set);
50128 rtx dst = SET_DEST (set);
50129 ASSERT_EQ (MEM, GET_CODE (dst));
50130 rtx src = SET_SRC (set);
50131 ASSERT_EQ (UNSPEC, GET_CODE (src));
50132 ASSERT_EQ (BLKmode, GET_MODE (src));
50133 ASSERT_EQ (UNSPEC_MEMORY_BLOCKAGE, XINT (src, 1));
50134
50135 rtx v0 = XVECEXP (src, 0, 0);
50136
50137 /* Verify that the two uses of the first SCRATCH have pointer
50138 equality. */
50139 rtx scratch_a = XEXP (dst, 0);
50140 ASSERT_EQ (SCRATCH, GET_CODE (scratch_a));
50141
50142 rtx scratch_b = XEXP (v0, 0);
50143 ASSERT_EQ (SCRATCH, GET_CODE (scratch_b));
50144
50145 ASSERT_EQ (scratch_a, scratch_b);
50146
50147 /* Verify that the two mems are thus treated as equal. */
50148 ASSERT_TRUE (rtx_equal_p (dst, v0));
50149
50150 /* Verify that the insn is recognized. */
50151 ASSERT_NE (-1, recog_memoized (insn));
50152
50153 /* Test of an UNSPEC_VOLATILE, which has its own enum values. */
50154 insn = NEXT_INSN (insn);
50155 ASSERT_EQ (INSN, GET_CODE (insn));
50156
50157 set = single_set (insn);
50158 ASSERT_NE (NULL, set);
50159
50160 src = SET_SRC (set);
50161 ASSERT_EQ (UNSPEC_VOLATILE, GET_CODE (src));
50162 ASSERT_EQ (UNSPECV_RDTSCP, XINT (src, 1));
50163 }
50164
50165 /* Run all target-specific selftests. */
50166
50167 static void
50168 ix86_run_selftests (void)
50169 {
50170 ix86_test_dumping_hard_regs ();
50171 ix86_test_dumping_memory_blockage ();
50172
50173 /* Various tests of loading RTL dumps, here because they contain
50174 ix86-isms (e.g. names of hard regs). */
50175 ix86_test_loading_dump_fragment_1 ();
50176 ix86_test_loading_call_insn ();
50177 ix86_test_loading_full_dump ();
50178 ix86_test_loading_unspec ();
50179 }
50180
50181 } // namespace selftest
50182
50183 #endif /* CHECKING_P */
50184
50185 /* Initialize the GCC target structure. */
50186 #undef TARGET_RETURN_IN_MEMORY
50187 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
50188
50189 #undef TARGET_LEGITIMIZE_ADDRESS
50190 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
50191
50192 #undef TARGET_ATTRIBUTE_TABLE
50193 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
50194 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
50195 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
50196 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50197 # undef TARGET_MERGE_DECL_ATTRIBUTES
50198 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
50199 #endif
50200
50201 #undef TARGET_COMP_TYPE_ATTRIBUTES
50202 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
50203
50204 #undef TARGET_INIT_BUILTINS
50205 #define TARGET_INIT_BUILTINS ix86_init_builtins
50206 #undef TARGET_BUILTIN_DECL
50207 #define TARGET_BUILTIN_DECL ix86_builtin_decl
50208 #undef TARGET_EXPAND_BUILTIN
50209 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
50210
50211 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
50212 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
50213 ix86_builtin_vectorized_function
50214
50215 #undef TARGET_VECTORIZE_BUILTIN_GATHER
50216 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
50217
50218 #undef TARGET_VECTORIZE_BUILTIN_SCATTER
50219 #define TARGET_VECTORIZE_BUILTIN_SCATTER ix86_vectorize_builtin_scatter
50220
50221 #undef TARGET_BUILTIN_RECIPROCAL
50222 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
50223
50224 #undef TARGET_ASM_FUNCTION_EPILOGUE
50225 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
50226
50227 #undef TARGET_ENCODE_SECTION_INFO
50228 #ifndef SUBTARGET_ENCODE_SECTION_INFO
50229 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
50230 #else
50231 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
50232 #endif
50233
50234 #undef TARGET_ASM_OPEN_PAREN
50235 #define TARGET_ASM_OPEN_PAREN ""
50236 #undef TARGET_ASM_CLOSE_PAREN
50237 #define TARGET_ASM_CLOSE_PAREN ""
50238
50239 #undef TARGET_ASM_BYTE_OP
50240 #define TARGET_ASM_BYTE_OP ASM_BYTE
50241
50242 #undef TARGET_ASM_ALIGNED_HI_OP
50243 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
50244 #undef TARGET_ASM_ALIGNED_SI_OP
50245 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
50246 #ifdef ASM_QUAD
50247 #undef TARGET_ASM_ALIGNED_DI_OP
50248 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
50249 #endif
50250
50251 #undef TARGET_PROFILE_BEFORE_PROLOGUE
50252 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
50253
50254 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
50255 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
50256
50257 #undef TARGET_ASM_UNALIGNED_HI_OP
50258 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
50259 #undef TARGET_ASM_UNALIGNED_SI_OP
50260 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
50261 #undef TARGET_ASM_UNALIGNED_DI_OP
50262 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
50263
50264 #undef TARGET_PRINT_OPERAND
50265 #define TARGET_PRINT_OPERAND ix86_print_operand
50266 #undef TARGET_PRINT_OPERAND_ADDRESS
50267 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
50268 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
50269 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
50270 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
50271 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
50272
50273 #undef TARGET_SCHED_INIT_GLOBAL
50274 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
50275 #undef TARGET_SCHED_ADJUST_COST
50276 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
50277 #undef TARGET_SCHED_ISSUE_RATE
50278 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
50279 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
50280 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
50281 ia32_multipass_dfa_lookahead
50282 #undef TARGET_SCHED_MACRO_FUSION_P
50283 #define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
50284 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
50285 #define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
50286
50287 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
50288 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
50289
50290 #undef TARGET_MEMMODEL_CHECK
50291 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
50292
50293 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
50294 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV ix86_atomic_assign_expand_fenv
50295
50296 #ifdef HAVE_AS_TLS
50297 #undef TARGET_HAVE_TLS
50298 #define TARGET_HAVE_TLS true
50299 #endif
50300 #undef TARGET_CANNOT_FORCE_CONST_MEM
50301 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
50302 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
50303 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
50304
50305 #undef TARGET_DELEGITIMIZE_ADDRESS
50306 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
50307
50308 #undef TARGET_CONST_NOT_OK_FOR_DEBUG_P
50309 #define TARGET_CONST_NOT_OK_FOR_DEBUG_P ix86_const_not_ok_for_debug_p
50310
50311 #undef TARGET_MS_BITFIELD_LAYOUT_P
50312 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
50313
50314 #if TARGET_MACHO
50315 #undef TARGET_BINDS_LOCAL_P
50316 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
50317 #else
50318 #undef TARGET_BINDS_LOCAL_P
50319 #define TARGET_BINDS_LOCAL_P ix86_binds_local_p
50320 #endif
50321 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
50322 #undef TARGET_BINDS_LOCAL_P
50323 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
50324 #endif
50325
50326 #undef TARGET_ASM_OUTPUT_MI_THUNK
50327 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
50328 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
50329 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
50330
50331 #undef TARGET_ASM_FILE_START
50332 #define TARGET_ASM_FILE_START x86_file_start
50333
50334 #undef TARGET_OPTION_OVERRIDE
50335 #define TARGET_OPTION_OVERRIDE ix86_option_override
50336
50337 #undef TARGET_REGISTER_MOVE_COST
50338 #define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
50339 #undef TARGET_MEMORY_MOVE_COST
50340 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
50341 #undef TARGET_RTX_COSTS
50342 #define TARGET_RTX_COSTS ix86_rtx_costs
50343 #undef TARGET_ADDRESS_COST
50344 #define TARGET_ADDRESS_COST ix86_address_cost
50345
50346 #undef TARGET_FLAGS_REGNUM
50347 #define TARGET_FLAGS_REGNUM FLAGS_REG
50348 #undef TARGET_FIXED_CONDITION_CODE_REGS
50349 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
50350 #undef TARGET_CC_MODES_COMPATIBLE
50351 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
50352
50353 #undef TARGET_MACHINE_DEPENDENT_REORG
50354 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
50355
50356 #undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
50357 #define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
50358
50359 #undef TARGET_BUILD_BUILTIN_VA_LIST
50360 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
50361
50362 #undef TARGET_FOLD_BUILTIN
50363 #define TARGET_FOLD_BUILTIN ix86_fold_builtin
50364
50365 #undef TARGET_GIMPLE_FOLD_BUILTIN
50366 #define TARGET_GIMPLE_FOLD_BUILTIN ix86_gimple_fold_builtin
50367
50368 #undef TARGET_COMPARE_VERSION_PRIORITY
50369 #define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority
50370
50371 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
50372 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
50373 ix86_generate_version_dispatcher_body
50374
50375 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
50376 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
50377 ix86_get_function_versions_dispatcher
50378
50379 #undef TARGET_ENUM_VA_LIST_P
50380 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
50381
50382 #undef TARGET_FN_ABI_VA_LIST
50383 #define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list
50384
50385 #undef TARGET_CANONICAL_VA_LIST_TYPE
50386 #define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type
50387
50388 #undef TARGET_EXPAND_BUILTIN_VA_START
50389 #define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
50390
50391 #undef TARGET_MD_ASM_ADJUST
50392 #define TARGET_MD_ASM_ADJUST ix86_md_asm_adjust
50393
50394 #undef TARGET_C_EXCESS_PRECISION
50395 #define TARGET_C_EXCESS_PRECISION ix86_excess_precision
50396 #undef TARGET_PROMOTE_PROTOTYPES
50397 #define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
50398 #undef TARGET_SETUP_INCOMING_VARARGS
50399 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
50400 #undef TARGET_MUST_PASS_IN_STACK
50401 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
50402 #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
50403 #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS ix86_allocate_stack_slots_for_args
50404 #undef TARGET_FUNCTION_ARG_ADVANCE
50405 #define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
50406 #undef TARGET_FUNCTION_ARG
50407 #define TARGET_FUNCTION_ARG ix86_function_arg
50408 #undef TARGET_INIT_PIC_REG
50409 #define TARGET_INIT_PIC_REG ix86_init_pic_reg
50410 #undef TARGET_USE_PSEUDO_PIC_REG
50411 #define TARGET_USE_PSEUDO_PIC_REG ix86_use_pseudo_pic_reg
50412 #undef TARGET_FUNCTION_ARG_BOUNDARY
50413 #define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
50414 #undef TARGET_PASS_BY_REFERENCE
50415 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
50416 #undef TARGET_INTERNAL_ARG_POINTER
50417 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
50418 #undef TARGET_UPDATE_STACK_BOUNDARY
50419 #define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
50420 #undef TARGET_GET_DRAP_RTX
50421 #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
50422 #undef TARGET_STRICT_ARGUMENT_NAMING
50423 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
50424 #undef TARGET_STATIC_CHAIN
50425 #define TARGET_STATIC_CHAIN ix86_static_chain
50426 #undef TARGET_TRAMPOLINE_INIT
50427 #define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
50428 #undef TARGET_RETURN_POPS_ARGS
50429 #define TARGET_RETURN_POPS_ARGS ix86_return_pops_args
50430
50431 #undef TARGET_WARN_FUNC_RETURN
50432 #define TARGET_WARN_FUNC_RETURN ix86_warn_func_return
50433
50434 #undef TARGET_LEGITIMATE_COMBINED_INSN
50435 #define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn
50436
50437 #undef TARGET_ASAN_SHADOW_OFFSET
50438 #define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset
50439
50440 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
50441 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
50442
50443 #undef TARGET_SCALAR_MODE_SUPPORTED_P
50444 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
50445
50446 #undef TARGET_VECTOR_MODE_SUPPORTED_P
50447 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
50448
50449 #undef TARGET_C_MODE_FOR_SUFFIX
50450 #define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
50451
50452 #ifdef HAVE_AS_TLS
50453 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
50454 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
50455 #endif
50456
50457 #ifdef SUBTARGET_INSERT_ATTRIBUTES
50458 #undef TARGET_INSERT_ATTRIBUTES
50459 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
50460 #endif
50461
50462 #undef TARGET_MANGLE_TYPE
50463 #define TARGET_MANGLE_TYPE ix86_mangle_type
50464
50465 #undef TARGET_STACK_PROTECT_GUARD
50466 #define TARGET_STACK_PROTECT_GUARD ix86_stack_protect_guard
50467
50468 #if !TARGET_MACHO
50469 #undef TARGET_STACK_PROTECT_FAIL
50470 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
50471 #endif
50472
50473 #undef TARGET_FUNCTION_VALUE
50474 #define TARGET_FUNCTION_VALUE ix86_function_value
50475
50476 #undef TARGET_FUNCTION_VALUE_REGNO_P
50477 #define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
50478
50479 #undef TARGET_PROMOTE_FUNCTION_MODE
50480 #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode
50481
50482 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
50483 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE ix86_override_options_after_change
50484
50485 #undef TARGET_MEMBER_TYPE_FORCES_BLK
50486 #define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk
50487
50488 #undef TARGET_INSTANTIATE_DECLS
50489 #define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
50490
50491 #undef TARGET_SECONDARY_RELOAD
50492 #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
50493 #undef TARGET_SECONDARY_MEMORY_NEEDED
50494 #define TARGET_SECONDARY_MEMORY_NEEDED ix86_secondary_memory_needed
50495 #undef TARGET_SECONDARY_MEMORY_NEEDED_MODE
50496 #define TARGET_SECONDARY_MEMORY_NEEDED_MODE ix86_secondary_memory_needed_mode
50497
50498 #undef TARGET_CLASS_MAX_NREGS
50499 #define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs
50500
50501 #undef TARGET_PREFERRED_RELOAD_CLASS
50502 #define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
50503 #undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
50504 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
50505 #undef TARGET_CLASS_LIKELY_SPILLED_P
50506 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
50507
50508 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
50509 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
50510 ix86_builtin_vectorization_cost
50511 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
50512 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
50513 ix86_vectorize_vec_perm_const_ok
50514 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
50515 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
50516 ix86_preferred_simd_mode
50517 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
50518 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
50519 ix86_autovectorize_vector_sizes
50520 #undef TARGET_VECTORIZE_GET_MASK_MODE
50521 #define TARGET_VECTORIZE_GET_MASK_MODE ix86_get_mask_mode
50522 #undef TARGET_VECTORIZE_INIT_COST
50523 #define TARGET_VECTORIZE_INIT_COST ix86_init_cost
50524 #undef TARGET_VECTORIZE_ADD_STMT_COST
50525 #define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
50526 #undef TARGET_VECTORIZE_FINISH_COST
50527 #define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
50528 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
50529 #define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
50530
50531 #undef TARGET_SET_CURRENT_FUNCTION
50532 #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
50533
50534 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
50535 #define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p
50536
50537 #undef TARGET_OPTION_SAVE
50538 #define TARGET_OPTION_SAVE ix86_function_specific_save
50539
50540 #undef TARGET_OPTION_RESTORE
50541 #define TARGET_OPTION_RESTORE ix86_function_specific_restore
50542
50543 #undef TARGET_OPTION_POST_STREAM_IN
50544 #define TARGET_OPTION_POST_STREAM_IN ix86_function_specific_post_stream_in
50545
50546 #undef TARGET_OPTION_PRINT
50547 #define TARGET_OPTION_PRINT ix86_function_specific_print
50548
50549 #undef TARGET_OPTION_FUNCTION_VERSIONS
50550 #define TARGET_OPTION_FUNCTION_VERSIONS common_function_versions
50551
50552 #undef TARGET_CAN_INLINE_P
50553 #define TARGET_CAN_INLINE_P ix86_can_inline_p
50554
50555 #undef TARGET_LEGITIMATE_ADDRESS_P
50556 #define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p
50557
50558 #undef TARGET_REGISTER_PRIORITY
50559 #define TARGET_REGISTER_PRIORITY ix86_register_priority
50560
50561 #undef TARGET_REGISTER_USAGE_LEVELING_P
50562 #define TARGET_REGISTER_USAGE_LEVELING_P hook_bool_void_true
50563
50564 #undef TARGET_LEGITIMATE_CONSTANT_P
50565 #define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p
50566
50567 #undef TARGET_COMPUTE_FRAME_LAYOUT
50568 #define TARGET_COMPUTE_FRAME_LAYOUT ix86_compute_frame_layout
50569
50570 #undef TARGET_FRAME_POINTER_REQUIRED
50571 #define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required
50572
50573 #undef TARGET_CAN_ELIMINATE
50574 #define TARGET_CAN_ELIMINATE ix86_can_eliminate
50575
50576 #undef TARGET_EXTRA_LIVE_ON_ENTRY
50577 #define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry
50578
50579 #undef TARGET_ASM_CODE_END
50580 #define TARGET_ASM_CODE_END ix86_code_end
50581
50582 #undef TARGET_CONDITIONAL_REGISTER_USAGE
50583 #define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
50584
50585 #undef TARGET_CANONICALIZE_COMPARISON
50586 #define TARGET_CANONICALIZE_COMPARISON ix86_canonicalize_comparison
50587
50588 #undef TARGET_LOOP_UNROLL_ADJUST
50589 #define TARGET_LOOP_UNROLL_ADJUST ix86_loop_unroll_adjust
50590
50591 /* Disabled due to PRs 70902, 71453, 71555, 71596 and 71657. */
50592 #undef TARGET_SPILL_CLASS
50593 #define TARGET_SPILL_CLASS ix86_spill_class
50594
50595 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
50596 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
50597 ix86_simd_clone_compute_vecsize_and_simdlen
50598
50599 #undef TARGET_SIMD_CLONE_ADJUST
50600 #define TARGET_SIMD_CLONE_ADJUST \
50601 ix86_simd_clone_adjust
50602
50603 #undef TARGET_SIMD_CLONE_USABLE
50604 #define TARGET_SIMD_CLONE_USABLE \
50605 ix86_simd_clone_usable
50606
50607 #undef TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P
50608 #define TARGET_FLOAT_EXCEPTIONS_ROUNDING_SUPPORTED_P \
50609 ix86_float_exceptions_rounding_supported_p
50610
50611 #undef TARGET_MODE_EMIT
50612 #define TARGET_MODE_EMIT ix86_emit_mode_set
50613
50614 #undef TARGET_MODE_NEEDED
50615 #define TARGET_MODE_NEEDED ix86_mode_needed
50616
50617 #undef TARGET_MODE_AFTER
50618 #define TARGET_MODE_AFTER ix86_mode_after
50619
50620 #undef TARGET_MODE_ENTRY
50621 #define TARGET_MODE_ENTRY ix86_mode_entry
50622
50623 #undef TARGET_MODE_EXIT
50624 #define TARGET_MODE_EXIT ix86_mode_exit
50625
50626 #undef TARGET_MODE_PRIORITY
50627 #define TARGET_MODE_PRIORITY ix86_mode_priority
50628
50629 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
50630 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
50631
50632 #undef TARGET_LOAD_BOUNDS_FOR_ARG
50633 #define TARGET_LOAD_BOUNDS_FOR_ARG ix86_load_bounds
50634
50635 #undef TARGET_STORE_BOUNDS_FOR_ARG
50636 #define TARGET_STORE_BOUNDS_FOR_ARG ix86_store_bounds
50637
50638 #undef TARGET_LOAD_RETURNED_BOUNDS
50639 #define TARGET_LOAD_RETURNED_BOUNDS ix86_load_returned_bounds
50640
50641 #undef TARGET_STORE_RETURNED_BOUNDS
50642 #define TARGET_STORE_RETURNED_BOUNDS ix86_store_returned_bounds
50643
50644 #undef TARGET_CHKP_BOUND_MODE
50645 #define TARGET_CHKP_BOUND_MODE ix86_mpx_bound_mode
50646
50647 #undef TARGET_BUILTIN_CHKP_FUNCTION
50648 #define TARGET_BUILTIN_CHKP_FUNCTION ix86_builtin_mpx_function
50649
50650 #undef TARGET_CHKP_FUNCTION_VALUE_BOUNDS
50651 #define TARGET_CHKP_FUNCTION_VALUE_BOUNDS ix86_function_value_bounds
50652
50653 #undef TARGET_CHKP_MAKE_BOUNDS_CONSTANT
50654 #define TARGET_CHKP_MAKE_BOUNDS_CONSTANT ix86_make_bounds_constant
50655
50656 #undef TARGET_CHKP_INITIALIZE_BOUNDS
50657 #define TARGET_CHKP_INITIALIZE_BOUNDS ix86_initialize_bounds
50658
50659 #undef TARGET_SETUP_INCOMING_VARARG_BOUNDS
50660 #define TARGET_SETUP_INCOMING_VARARG_BOUNDS ix86_setup_incoming_vararg_bounds
50661
50662 #undef TARGET_OFFLOAD_OPTIONS
50663 #define TARGET_OFFLOAD_OPTIONS \
50664 ix86_offload_options
50665
50666 #undef TARGET_ABSOLUTE_BIGGEST_ALIGNMENT
50667 #define TARGET_ABSOLUTE_BIGGEST_ALIGNMENT 512
50668
50669 #undef TARGET_OPTAB_SUPPORTED_P
50670 #define TARGET_OPTAB_SUPPORTED_P ix86_optab_supported_p
50671
50672 #undef TARGET_HARD_REGNO_SCRATCH_OK
50673 #define TARGET_HARD_REGNO_SCRATCH_OK ix86_hard_regno_scratch_ok
50674
50675 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
50676 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 1
50677
50678 #undef TARGET_ADDITIONAL_ALLOCNO_CLASS_P
50679 #define TARGET_ADDITIONAL_ALLOCNO_CLASS_P ix86_additional_allocno_class_p
50680
50681 #undef TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID
50682 #define TARGET_ADDR_SPACE_ZERO_ADDRESS_VALID ix86_addr_space_zero_address_valid
50683
50684 #undef TARGET_INIT_LIBFUNCS
50685 #define TARGET_INIT_LIBFUNCS ix86_init_libfuncs
50686
50687 #undef TARGET_EXPAND_DIVMOD_LIBFUNC
50688 #define TARGET_EXPAND_DIVMOD_LIBFUNC ix86_expand_divmod_libfunc
50689
50690 #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST
50691 #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost
50692
50693 #undef TARGET_NOCE_CONVERSION_PROFITABLE_P
50694 #define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p
50695
50696 #undef TARGET_HARD_REGNO_NREGS
50697 #define TARGET_HARD_REGNO_NREGS ix86_hard_regno_nregs
50698 #undef TARGET_HARD_REGNO_MODE_OK
50699 #define TARGET_HARD_REGNO_MODE_OK ix86_hard_regno_mode_ok
50700
50701 #undef TARGET_MODES_TIEABLE_P
50702 #define TARGET_MODES_TIEABLE_P ix86_modes_tieable_p
50703
50704 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
50705 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
50706 ix86_hard_regno_call_part_clobbered
50707
50708 #undef TARGET_CAN_CHANGE_MODE_CLASS
50709 #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
50710
50711 #undef TARGET_STATIC_RTX_ALIGNMENT
50712 #define TARGET_STATIC_RTX_ALIGNMENT ix86_static_rtx_alignment
50713 #undef TARGET_CONSTANT_ALIGNMENT
50714 #define TARGET_CONSTANT_ALIGNMENT ix86_constant_alignment
50715
50716 #undef TARGET_EMPTY_RECORD_P
50717 #define TARGET_EMPTY_RECORD_P ix86_is_empty_record
50718
50719 #undef TARGET_WARN_PARAMETER_PASSING_ABI
50720 #define TARGET_WARN_PARAMETER_PASSING_ABI ix86_warn_parameter_passing_abi
50721
50722 #if CHECKING_P
50723 #undef TARGET_RUN_TARGET_SELFTESTS
50724 #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests
50725 #endif /* #if CHECKING_P */
50726
50727 struct gcc_target targetm = TARGET_INITIALIZER;
50728 \f
50729 #include "gt-i386.h"