[AArch64] Fix ICEs with -mgeneral-regs-only / -march=...+nofp
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "alias.h"
29 #include "symtab.h"
30 #include "tree.h"
31 #include "fold-const.h"
32 #include "stringpool.h"
33 #include "stor-layout.h"
34 #include "calls.h"
35 #include "varasm.h"
36 #include "regs.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "predict.h"
45 #include "basic-block.h"
46 #include "df.h"
47 #include "hard-reg-set.h"
48 #include "output.h"
49 #include "function.h"
50 #include "flags.h"
51 #include "insn-config.h"
52 #include "expmed.h"
53 #include "dojump.h"
54 #include "explow.h"
55 #include "emit-rtl.h"
56 #include "stmt.h"
57 #include "expr.h"
58 #include "reload.h"
59 #include "toplev.h"
60 #include "target.h"
61 #include "target-def.h"
62 #include "targhooks.h"
63 #include "tm_p.h"
64 #include "recog.h"
65 #include "langhooks.h"
66 #include "diagnostic-core.h"
67 #include "tree-ssa-alias.h"
68 #include "internal-fn.h"
69 #include "gimple-fold.h"
70 #include "tree-eh.h"
71 #include "gimple-expr.h"
72 #include "gimple.h"
73 #include "gimplify.h"
74 #include "optabs.h"
75 #include "dwarf2.h"
76 #include "cfgloop.h"
77 #include "tree-vectorizer.h"
78 #include "aarch64-cost-tables.h"
79 #include "dumpfile.h"
80 #include "builtins.h"
81 #include "rtl-iter.h"
82 #include "tm-constrs.h"
83 #include "sched-int.h"
84 #include "cortex-a57-fma-steering.h"
85
86 /* Defined for convenience. */
87 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
88
89 /* Classifies an address.
90
91 ADDRESS_REG_IMM
92 A simple base register plus immediate offset.
93
94 ADDRESS_REG_WB
95 A base register indexed by immediate offset with writeback.
96
97 ADDRESS_REG_REG
98 A base register indexed by (optionally scaled) register.
99
100 ADDRESS_REG_UXTW
101 A base register indexed by (optionally scaled) zero-extended register.
102
103 ADDRESS_REG_SXTW
104 A base register indexed by (optionally scaled) sign-extended register.
105
106 ADDRESS_LO_SUM
107 A LO_SUM rtx with a base register and "LO12" symbol relocation.
108
109      ADDRESS_SYMBOLIC
110 A constant symbolic address, in pc-relative literal pool. */
111
112 enum aarch64_address_type {
113 ADDRESS_REG_IMM,
114 ADDRESS_REG_WB,
115 ADDRESS_REG_REG,
116 ADDRESS_REG_UXTW,
117 ADDRESS_REG_SXTW,
118 ADDRESS_LO_SUM,
119 ADDRESS_SYMBOLIC
120 };
121
122 struct aarch64_address_info {
123 enum aarch64_address_type type;
124 rtx base;
125 rtx offset;
126 int shift;
127 enum aarch64_symbol_type symbol_type;
128 };
129
130 struct simd_immediate_info
131 {
132 rtx value;
133 int shift;
134 int element_width;
135 bool mvn;
136 bool msl;
137 };
138
139 /* The current code model. */
140 enum aarch64_code_model aarch64_cmodel;
141
142 #ifdef HAVE_AS_TLS
143 #undef TARGET_HAVE_TLS
144 #define TARGET_HAVE_TLS 1
145 #endif
146
147 static bool aarch64_composite_type_p (const_tree, machine_mode);
148 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
149 const_tree,
150 machine_mode *, int *,
151 bool *);
152 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
153 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
154 static void aarch64_override_options_after_change (void);
155 static bool aarch64_vector_mode_supported_p (machine_mode);
156 static unsigned bit_count (unsigned HOST_WIDE_INT);
157 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
158 const unsigned char *sel);
159 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
160
161 /* Major revision number of the ARM Architecture implemented by the target. */
162 unsigned aarch64_architecture_version;
163
164 /* The processor for which instructions should be scheduled. */
165 enum aarch64_processor aarch64_tune = cortexa53;
166
167 /* The current tuning set. */
168 const struct tune_params *aarch64_tune_params;
169
170 /* Mask to specify which instructions we are allowed to generate. */
171 unsigned long aarch64_isa_flags = 0;
172
173 /* Mask to specify which instruction scheduling options should be used. */
174 unsigned long aarch64_tune_flags = 0;
175
176 /* Tuning parameters. */
177
178 static const struct cpu_addrcost_table generic_addrcost_table =
179 {
180 {
181 0, /* hi */
182 0, /* si */
183 0, /* di */
184 0, /* ti */
185 },
186 0, /* pre_modify */
187 0, /* post_modify */
188 0, /* register_offset */
189 0, /* register_extend */
190 0 /* imm_offset */
191 };
192
193 static const struct cpu_addrcost_table cortexa57_addrcost_table =
194 {
195 {
196 1, /* hi */
197 0, /* si */
198 0, /* di */
199 1, /* ti */
200 },
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_extend */
205 0, /* imm_offset */
206 };
207
208 static const struct cpu_addrcost_table xgene1_addrcost_table =
209 {
210 {
211 1, /* hi */
212 0, /* si */
213 0, /* di */
214 1, /* ti */
215 },
216 1, /* pre_modify */
217 0, /* post_modify */
218 0, /* register_offset */
219 1, /* register_extend */
220 0, /* imm_offset */
221 };
222
223 static const struct cpu_regmove_cost generic_regmove_cost =
224 {
225 1, /* GP2GP */
226 /* Avoid the use of slow int<->fp moves for spilling by setting
227 their cost higher than memmov_cost. */
228 5, /* GP2FP */
229 5, /* FP2GP */
230 2 /* FP2FP */
231 };
232
233 static const struct cpu_regmove_cost cortexa57_regmove_cost =
234 {
235 1, /* GP2GP */
236 /* Avoid the use of slow int<->fp moves for spilling by setting
237 their cost higher than memmov_cost. */
238 5, /* GP2FP */
239 5, /* FP2GP */
240 2 /* FP2FP */
241 };
242
243 static const struct cpu_regmove_cost cortexa53_regmove_cost =
244 {
245 1, /* GP2GP */
246 /* Avoid the use of slow int<->fp moves for spilling by setting
247 their cost higher than memmov_cost. */
248 5, /* GP2FP */
249 5, /* FP2GP */
250 2 /* FP2FP */
251 };
252
253 static const struct cpu_regmove_cost thunderx_regmove_cost =
254 {
255 2, /* GP2GP */
256 2, /* GP2FP */
257 6, /* FP2GP */
258 4 /* FP2FP */
259 };
260
261 static const struct cpu_regmove_cost xgene1_regmove_cost =
262 {
263 1, /* GP2GP */
264 /* Avoid the use of slow int<->fp moves for spilling by setting
265 their cost higher than memmov_cost. */
266 8, /* GP2FP */
267 8, /* FP2GP */
268 2 /* FP2FP */
269 };
270
271 /* Generic costs for vector insn classes. */
272 static const struct cpu_vector_cost generic_vector_cost =
273 {
274 1, /* scalar_stmt_cost */
275 1, /* scalar_load_cost */
276 1, /* scalar_store_cost */
277 1, /* vec_stmt_cost */
278 1, /* vec_to_scalar_cost */
279 1, /* scalar_to_vec_cost */
280 1, /* vec_align_load_cost */
281 1, /* vec_unalign_load_cost */
282 1, /* vec_unalign_store_cost */
283 1, /* vec_store_cost */
284 3, /* cond_taken_branch_cost */
285 1 /* cond_not_taken_branch_cost */
286 };
287
288 /* Costs for Cortex-A57 vector insn classes. */
289 static const struct cpu_vector_cost cortexa57_vector_cost =
290 {
291 1, /* scalar_stmt_cost */
292 4, /* scalar_load_cost */
293 1, /* scalar_store_cost */
294 3, /* vec_stmt_cost */
295 8, /* vec_to_scalar_cost */
296 8, /* scalar_to_vec_cost */
297 5, /* vec_align_load_cost */
298 5, /* vec_unalign_load_cost */
299 1, /* vec_unalign_store_cost */
300 1, /* vec_store_cost */
301 1, /* cond_taken_branch_cost */
302 1 /* cond_not_taken_branch_cost */
303 };
304
305 /* Costs for X-Gene 1 vector insn classes. */
306 static const struct cpu_vector_cost xgene1_vector_cost =
307 {
308 1, /* scalar_stmt_cost */
309 5, /* scalar_load_cost */
310 1, /* scalar_store_cost */
311 2, /* vec_stmt_cost */
312 4, /* vec_to_scalar_cost */
313 4, /* scalar_to_vec_cost */
314 10, /* vec_align_load_cost */
315 10, /* vec_unalign_load_cost */
316 2, /* vec_unalign_store_cost */
317 2, /* vec_store_cost */
318 2, /* cond_taken_branch_cost */
319 1 /* cond_not_taken_branch_cost */
320 };
321
322 #define AARCH64_FUSE_NOTHING (0)
323 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
324 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
325 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
326 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
327 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
328
329 /* Generic costs for branch instructions. */
330 static const struct cpu_branch_cost generic_branch_cost =
331 {
332 2, /* Predictable. */
333 2 /* Unpredictable. */
334 };
335
336 static const struct tune_params generic_tunings =
337 {
338 &cortexa57_extra_costs,
339 &generic_addrcost_table,
340 &generic_regmove_cost,
341 &generic_vector_cost,
342 &generic_branch_cost,
343 4, /* memmov_cost */
344 2, /* issue_rate */
345 AARCH64_FUSE_NOTHING, /* fusible_ops */
346 8, /* function_align. */
347 8, /* jump_align. */
348 4, /* loop_align. */
349 2, /* int_reassoc_width. */
350 4, /* fp_reassoc_width. */
351 1, /* vec_reassoc_width. */
352 2, /* min_div_recip_mul_sf. */
353 2 /* min_div_recip_mul_df. */
354 };
355
356 static const struct tune_params cortexa53_tunings =
357 {
358 &cortexa53_extra_costs,
359 &generic_addrcost_table,
360 &cortexa53_regmove_cost,
361 &generic_vector_cost,
362 &generic_branch_cost,
363 4, /* memmov_cost */
364 2, /* issue_rate */
365 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
366 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
367 8, /* function_align. */
368 8, /* jump_align. */
369 4, /* loop_align. */
370 2, /* int_reassoc_width. */
371 4, /* fp_reassoc_width. */
372 1, /* vec_reassoc_width. */
373 2, /* min_div_recip_mul_sf. */
374 2 /* min_div_recip_mul_df. */
375 };
376
377 static const struct tune_params cortexa57_tunings =
378 {
379 &cortexa57_extra_costs,
380 &cortexa57_addrcost_table,
381 &cortexa57_regmove_cost,
382 &cortexa57_vector_cost,
383 &generic_branch_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1, /* vec_reassoc_width. */
394 2, /* min_div_recip_mul_sf. */
395 2 /* min_div_recip_mul_df. */
396 };
397
398 static const struct tune_params thunderx_tunings =
399 {
400 &thunderx_extra_costs,
401 &generic_addrcost_table,
402 &thunderx_regmove_cost,
403 &generic_vector_cost,
404 &generic_branch_cost,
405 6, /* memmov_cost */
406 2, /* issue_rate */
407 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
408 8, /* function_align. */
409 8, /* jump_align. */
410 8, /* loop_align. */
411 2, /* int_reassoc_width. */
412 4, /* fp_reassoc_width. */
413 1, /* vec_reassoc_width. */
414 2, /* min_div_recip_mul_sf. */
415 2 /* min_div_recip_mul_df. */
416 };
417
418 static const struct tune_params xgene1_tunings =
419 {
420 &xgene1_extra_costs,
421 &xgene1_addrcost_table,
422 &xgene1_regmove_cost,
423 &xgene1_vector_cost,
424 &generic_branch_cost,
425 6, /* memmov_cost */
426 4, /* issue_rate */
427 AARCH64_FUSE_NOTHING, /* fusible_ops */
428 16, /* function_align. */
429 8, /* jump_align. */
430 16, /* loop_align. */
431 2, /* int_reassoc_width. */
432 4, /* fp_reassoc_width. */
433 1, /* vec_reassoc_width. */
434 2, /* min_div_recip_mul_sf. */
435 2 /* min_div_recip_mul_df. */
436 };
437
438 /* A processor implementing AArch64. */
439 struct processor
440 {
441 const char *const name;
442 enum aarch64_processor core;
443 const char *arch;
444 unsigned architecture_version;
445 const unsigned long flags;
446 const struct tune_params *const tune;
447 };
448
449 /* Processor cores implementing AArch64. */
450 static const struct processor all_cores[] =
451 {
452 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
453 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
454 #include "aarch64-cores.def"
455 #undef AARCH64_CORE
456 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
457 {NULL, aarch64_none, NULL, 0, 0, NULL}
458 };
459
460 /* Architectures implementing AArch64. */
461 static const struct processor all_architectures[] =
462 {
463 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
464 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
465 #include "aarch64-arches.def"
466 #undef AARCH64_ARCH
467 {NULL, aarch64_none, NULL, 0, 0, NULL}
468 };
469
470 /* Target specification.  These are populated as command-line arguments
471 are processed, or NULL if not specified. */
472 static const struct processor *selected_arch;
473 static const struct processor *selected_cpu;
474 static const struct processor *selected_tune;
475
476 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
477
478 /* An ISA extension in the co-processor and main instruction set space. */
479 struct aarch64_option_extension
480 {
481 const char *const name;
482 const unsigned long flags_on;
483 const unsigned long flags_off;
484 };
485
486 /* ISA extensions in AArch64. */
487 static const struct aarch64_option_extension all_extensions[] =
488 {
489 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
490 {NAME, FLAGS_ON, FLAGS_OFF},
491 #include "aarch64-option-extensions.def"
492 #undef AARCH64_OPT_EXTENSION
493 {NULL, 0, 0}
494 };
495
496 /* Used to track the size of an address when generating a pre/post
497 increment address. */
498 static machine_mode aarch64_memory_reference_mode;
499
500 /* A table of valid AArch64 "bitmask immediate" values for
501 logical instructions. */
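/* Such an immediate consists of a contiguous run of ones within a 2, 4, 8,
   16, 32 or 64-bit element, optionally rotated within the element, and
   replicated across the 64-bit value; 0x00ff00ff00ff00ff is one example.  */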
502
503 #define AARCH64_NUM_BITMASKS 5334
504 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
505
506 typedef enum aarch64_cond_code
507 {
508 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
509 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
510 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
511 }
512 aarch64_cc;
513
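/* The condition codes are laid out in inverse pairs (EQ/NE, CS/CC, MI/PL,
   ...), so flipping the low bit of a code yields its inverse.  */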
514 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
515
516 /* The condition codes of the processor, and the inverse function. */
517 static const char * const aarch64_condition_codes[] =
518 {
519 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
520 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
521 };
522
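/* Report an error: MODE (a floating-point or vector mode) needs the FP/SIMD
   register file, which is unavailable because of -mgeneral-regs-only or the
   +nofp feature modifier.  MSG describes the offending construct.  */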
523 void
524 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
525 {
526 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
527 if (TARGET_GENERAL_REGS_ONLY)
528 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
529 else
530 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
531 }
532
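/* Return the minimum number of divisions by the same divisor that makes it
   worthwhile to replace them with a reciprocal multiplication, taken from
   the current tuning parameters.  */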
533 static unsigned int
534 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
535 {
536 if (GET_MODE_UNIT_SIZE (mode) == 4)
537 return aarch64_tune_params->min_div_recip_mul_sf;
538 return aarch64_tune_params->min_div_recip_mul_df;
539 }
540
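/* Return the reassociation width the current tuning parameters prefer for
   operations in MODE.  */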
541 static int
542 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
543 enum machine_mode mode)
544 {
545 if (VECTOR_MODE_P (mode))
546 return aarch64_tune_params->vec_reassoc_width;
547 if (INTEGRAL_MODE_P (mode))
548 return aarch64_tune_params->int_reassoc_width;
549 if (FLOAT_MODE_P (mode))
550 return aarch64_tune_params->fp_reassoc_width;
551 return 1;
552 }
553
554 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
555 unsigned
556 aarch64_dbx_register_number (unsigned regno)
557 {
558 if (GP_REGNUM_P (regno))
559 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
560 else if (regno == SP_REGNUM)
561 return AARCH64_DWARF_SP;
562 else if (FP_REGNUM_P (regno))
563 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
564
565 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
566 equivalent DWARF register. */
567 return DWARF_FRAME_REGISTERS;
568 }
569
570 /* Return TRUE if MODE is any of the large INT modes. */
571 static bool
572 aarch64_vect_struct_mode_p (machine_mode mode)
573 {
574 return mode == OImode || mode == CImode || mode == XImode;
575 }
576
577 /* Return TRUE if MODE is any of the vector modes. */
578 static bool
579 aarch64_vector_mode_p (machine_mode mode)
580 {
581 return aarch64_vector_mode_supported_p (mode)
582 || aarch64_vect_struct_mode_p (mode);
583 }
584
585 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
586 static bool
587 aarch64_array_mode_supported_p (machine_mode mode,
588 unsigned HOST_WIDE_INT nelems)
589 {
590 if (TARGET_SIMD
591 && AARCH64_VALID_SIMD_QREG_MODE (mode)
592 && (nelems >= 2 && nelems <= 4))
593 return true;
594
595 return false;
596 }
597
598 /* Implement HARD_REGNO_NREGS. */
599
600 int
601 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
602 {
603 switch (aarch64_regno_regclass (regno))
604 {
605 case FP_REGS:
606 case FP_LO_REGS:
607 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
608 default:
609 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
610 }
611 gcc_unreachable ();
612 }
613
614 /* Implement HARD_REGNO_MODE_OK. */
615
616 int
617 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
618 {
619 if (GET_MODE_CLASS (mode) == MODE_CC)
620 return regno == CC_REGNUM;
621
622 if (regno == SP_REGNUM)
623 /* The purpose of comparing with ptr_mode is to support the
624 global register variable associated with the stack pointer
625 register via the syntax of asm ("wsp") in ILP32. */
626 return mode == Pmode || mode == ptr_mode;
627
628 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
629 return mode == Pmode;
630
631 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
632 return 1;
633
634 if (FP_REGNUM_P (regno))
635 {
636 if (aarch64_vect_struct_mode_p (mode))
637 return
638 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
639 else
640 return 1;
641 }
642
643 return 0;
644 }
645
646 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
647 machine_mode
648 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
649 machine_mode mode)
650 {
651 /* Handle modes that fit within single registers. */
652 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
653 {
654 if (GET_MODE_SIZE (mode) >= 4)
655 return mode;
656 else
657 return SImode;
658 }
659 /* Fall back to generic for multi-reg and very large modes. */
660 else
661 return choose_hard_reg_mode (regno, nregs, false);
662 }
663
664 /* Return true if calls to DECL should be treated as
665    long-calls (i.e. called via a register). */
666 static bool
667 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
668 {
669 return false;
670 }
671
672 /* Return true if calls to symbol-ref SYM should be treated as
673    long-calls (i.e. called via a register). */
674 bool
675 aarch64_is_long_call_p (rtx sym)
676 {
677 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
678 }
679
680 /* Return true if the offsets to a zero/sign-extract operation
681 represent an expression that matches an extend operation. The
682    operands represent the parameters from
683
684 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
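/* For example, with MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode, the
   extract is equivalent to zero-extending the low 32 bits of the register
   and shifting the result left by 2, i.e. an extended-register UXTW #2
   operand.  */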
685 bool
686 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
687 rtx extract_imm)
688 {
689 HOST_WIDE_INT mult_val, extract_val;
690
691 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
692 return false;
693
694 mult_val = INTVAL (mult_imm);
695 extract_val = INTVAL (extract_imm);
696
697 if (extract_val > 8
698 && extract_val < GET_MODE_BITSIZE (mode)
699 && exact_log2 (extract_val & ~7) > 0
700 && (extract_val & 7) <= 4
701 && mult_val == (1 << (extract_val & 7)))
702 return true;
703
704 return false;
705 }
706
707 /* Emit an insn that's a simple single-set. Both the operands must be
708 known to be valid. */
709 inline static rtx
710 emit_set_insn (rtx x, rtx y)
711 {
712 return emit_insn (gen_rtx_SET (x, y));
713 }
714
715 /* X and Y are two things to compare using CODE. Emit the compare insn and
716 return the rtx for register 0 in the proper mode. */
717 rtx
718 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
719 {
720 machine_mode mode = SELECT_CC_MODE (code, x, y);
721 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
722
723 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
724 return cc_reg;
725 }
726
727 /* Build the SYMBOL_REF for __tls_get_addr. */
728
729 static GTY(()) rtx tls_get_addr_libfunc;
730
731 rtx
732 aarch64_tls_get_addr (void)
733 {
734 if (!tls_get_addr_libfunc)
735 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
736 return tls_get_addr_libfunc;
737 }
738
739 /* Return the TLS model to use for ADDR. */
740
741 static enum tls_model
742 tls_symbolic_operand_type (rtx addr)
743 {
744 enum tls_model tls_kind = TLS_MODEL_NONE;
745 rtx sym, addend;
746
747 if (GET_CODE (addr) == CONST)
748 {
749 split_const (addr, &sym, &addend);
750 if (GET_CODE (sym) == SYMBOL_REF)
751 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
752 }
753 else if (GET_CODE (addr) == SYMBOL_REF)
754 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
755
756 return tls_kind;
757 }
758
759 /* We'll allow lo_sum's in our legitimate addresses so that combine
760    can take care of combining addresses where necessary, but for
761    generation purposes, we'll generate the address
762    as:
763 RTL Absolute
764 tmp = hi (symbol_ref); adrp x1, foo
765 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
766 nop
767
768 PIC TLS
769 adrp x1, :got:foo adrp tmp, :tlsgd:foo
770 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
771 bl __tls_get_addr
772 nop
773
774 Load TLS symbol, depending on TLS mechanism and TLS access model.
775
776 Global Dynamic - Traditional TLS:
777 adrp tmp, :tlsgd:imm
778 add dest, tmp, #:tlsgd_lo12:imm
779 bl __tls_get_addr
780
781 Global Dynamic - TLS Descriptors:
782 adrp dest, :tlsdesc:imm
783 ldr tmp, [dest, #:tlsdesc_lo12:imm]
784 add dest, dest, #:tlsdesc_lo12:imm
785 blr tmp
786 mrs tp, tpidr_el0
787 add dest, dest, tp
788
789 Initial Exec:
790 mrs tp, tpidr_el0
791 adrp tmp, :gottprel:imm
792 ldr dest, [tmp, #:gottprel_lo12:imm]
793 add dest, dest, tp
794
795 Local Exec:
796 mrs tp, tpidr_el0
797 add t0, tp, #:tprel_hi12:imm, lsl #12
798 add t0, t0, #:tprel_lo12_nc:imm
799 */
800
801 static void
802 aarch64_load_symref_appropriately (rtx dest, rtx imm,
803 enum aarch64_symbol_type type)
804 {
805 switch (type)
806 {
807 case SYMBOL_SMALL_ABSOLUTE:
808 {
809 /* In ILP32, the mode of dest can be either SImode or DImode. */
810 rtx tmp_reg = dest;
811 machine_mode mode = GET_MODE (dest);
812
813 gcc_assert (mode == Pmode || mode == ptr_mode);
814
815 if (can_create_pseudo_p ())
816 tmp_reg = gen_reg_rtx (mode);
817
818 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
819 emit_insn (gen_add_losym (dest, tmp_reg, imm));
820 return;
821 }
822
823 case SYMBOL_TINY_ABSOLUTE:
824 emit_insn (gen_rtx_SET (dest, imm));
825 return;
826
827 case SYMBOL_SMALL_GOT:
828 {
829 /* In ILP32, the mode of dest can be either SImode or DImode,
830 while the got entry is always of SImode size. The mode of
831 dest depends on how dest is used: if dest is assigned to a
832 pointer (e.g. in the memory), it has SImode; it may have
833        DImode if dest is dereferenced to access the memory.
834 This is why we have to handle three different ldr_got_small
835 patterns here (two patterns for ILP32). */
836 rtx tmp_reg = dest;
837 machine_mode mode = GET_MODE (dest);
838
839 if (can_create_pseudo_p ())
840 tmp_reg = gen_reg_rtx (mode);
841
842 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
843 if (mode == ptr_mode)
844 {
845 if (mode == DImode)
846 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
847 else
848 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
849 }
850 else
851 {
852 gcc_assert (mode == Pmode);
853 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
854 }
855
856 return;
857 }
858
859 case SYMBOL_SMALL_TLSGD:
860 {
861 rtx_insn *insns;
862 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
863
864 start_sequence ();
865 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
866 insns = get_insns ();
867 end_sequence ();
868
869 RTL_CONST_CALL_P (insns) = 1;
870 emit_libcall_block (insns, dest, result, imm);
871 return;
872 }
873
874 case SYMBOL_SMALL_TLSDESC:
875 {
876 machine_mode mode = GET_MODE (dest);
877 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
878 rtx tp;
879
880 gcc_assert (mode == Pmode || mode == ptr_mode);
881
882 /* In ILP32, the got entry is always of SImode size. Unlike
883 small GOT, the dest is fixed at reg 0. */
884 if (TARGET_ILP32)
885 emit_insn (gen_tlsdesc_small_si (imm));
886 else
887 emit_insn (gen_tlsdesc_small_di (imm));
888 tp = aarch64_load_tp (NULL);
889
890 if (mode != Pmode)
891 tp = gen_lowpart (mode, tp);
892
893 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
894 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
895 return;
896 }
897
898 case SYMBOL_SMALL_GOTTPREL:
899 {
900 /* In ILP32, the mode of dest can be either SImode or DImode,
901 while the got entry is always of SImode size. The mode of
902 dest depends on how dest is used: if dest is assigned to a
903 pointer (e.g. in the memory), it has SImode; it may have
904        DImode if dest is dereferenced to access the memory.
905 This is why we have to handle three different tlsie_small
906 patterns here (two patterns for ILP32). */
907 machine_mode mode = GET_MODE (dest);
908 rtx tmp_reg = gen_reg_rtx (mode);
909 rtx tp = aarch64_load_tp (NULL);
910
911 if (mode == ptr_mode)
912 {
913 if (mode == DImode)
914 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
915 else
916 {
917 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
918 tp = gen_lowpart (mode, tp);
919 }
920 }
921 else
922 {
923 gcc_assert (mode == Pmode);
924 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
925 }
926
927 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
928 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
929 return;
930 }
931
932 case SYMBOL_SMALL_TPREL:
933 {
934 rtx tp = aarch64_load_tp (NULL);
935
936 if (GET_MODE (dest) != Pmode)
937 tp = gen_lowpart (GET_MODE (dest), tp);
938
939 emit_insn (gen_tlsle_small (dest, tp, imm));
940 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
941 return;
942 }
943
944 case SYMBOL_TINY_GOT:
945 emit_insn (gen_ldr_got_tiny (dest, imm));
946 return;
947
948 default:
949 gcc_unreachable ();
950 }
951 }
952
953 /* Emit a move from SRC to DEST. Assume that the move expanders can
954 handle all moves if !can_create_pseudo_p (). The distinction is
955 important because, unlike emit_move_insn, the move expanders know
956 how to force Pmode objects into the constant pool even when the
957 constant pool address is not itself legitimate. */
958 static rtx
959 aarch64_emit_move (rtx dest, rtx src)
960 {
961 return (can_create_pseudo_p ()
962 ? emit_move_insn (dest, src)
963 : emit_move_insn_1 (dest, src));
964 }
965
966 /* Split a 128-bit move operation into two 64-bit move operations,
967 taking care to handle partial overlap of register to register
968 copies. Special cases are needed when moving between GP regs and
969 FP regs. SRC can be a register, constant or memory; DST a register
970 or memory. If either operand is memory it must not have any side
971 effects. */
972 void
973 aarch64_split_128bit_move (rtx dst, rtx src)
974 {
975 rtx dst_lo, dst_hi;
976 rtx src_lo, src_hi;
977
978 machine_mode mode = GET_MODE (dst);
979
980 gcc_assert (mode == TImode || mode == TFmode);
981 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
982 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
983
984 if (REG_P (dst) && REG_P (src))
985 {
986 int src_regno = REGNO (src);
987 int dst_regno = REGNO (dst);
988
989 /* Handle FP <-> GP regs. */
990 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
991 {
992 src_lo = gen_lowpart (word_mode, src);
993 src_hi = gen_highpart (word_mode, src);
994
995 if (mode == TImode)
996 {
997 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
998 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
999 }
1000 else
1001 {
1002 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1003 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1004 }
1005 return;
1006 }
1007 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1008 {
1009 dst_lo = gen_lowpart (word_mode, dst);
1010 dst_hi = gen_highpart (word_mode, dst);
1011
1012 if (mode == TImode)
1013 {
1014 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1015 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1016 }
1017 else
1018 {
1019 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1020 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1021 }
1022 return;
1023 }
1024 }
1025
1026 dst_lo = gen_lowpart (word_mode, dst);
1027 dst_hi = gen_highpart (word_mode, dst);
1028 src_lo = gen_lowpart (word_mode, src);
1029 src_hi = gen_highpart_mode (word_mode, mode, src);
1030
1031 /* At most one pairing may overlap. */
1032 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1033 {
1034 aarch64_emit_move (dst_hi, src_hi);
1035 aarch64_emit_move (dst_lo, src_lo);
1036 }
1037 else
1038 {
1039 aarch64_emit_move (dst_lo, src_lo);
1040 aarch64_emit_move (dst_hi, src_hi);
1041 }
1042 }
1043
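/* Return true if a 128-bit move from SRC to DST should be split into two
   64-bit moves; only a copy between two FP/SIMD registers can be done with
   a single instruction.  */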
1044 bool
1045 aarch64_split_128bit_move_p (rtx dst, rtx src)
1046 {
1047 return (! REG_P (src)
1048 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1049 }
1050
1051 /* Split a complex SIMD combine. */
1052
1053 void
1054 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1055 {
1056 machine_mode src_mode = GET_MODE (src1);
1057 machine_mode dst_mode = GET_MODE (dst);
1058
1059 gcc_assert (VECTOR_MODE_P (dst_mode));
1060
1061 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1062 {
1063 rtx (*gen) (rtx, rtx, rtx);
1064
1065 switch (src_mode)
1066 {
1067 case V8QImode:
1068 gen = gen_aarch64_simd_combinev8qi;
1069 break;
1070 case V4HImode:
1071 gen = gen_aarch64_simd_combinev4hi;
1072 break;
1073 case V2SImode:
1074 gen = gen_aarch64_simd_combinev2si;
1075 break;
1076 case V2SFmode:
1077 gen = gen_aarch64_simd_combinev2sf;
1078 break;
1079 case DImode:
1080 gen = gen_aarch64_simd_combinedi;
1081 break;
1082 case DFmode:
1083 gen = gen_aarch64_simd_combinedf;
1084 break;
1085 default:
1086 gcc_unreachable ();
1087 }
1088
1089 emit_insn (gen (dst, src1, src2));
1090 return;
1091 }
1092 }
1093
1094 /* Split a complex SIMD move. */
1095
1096 void
1097 aarch64_split_simd_move (rtx dst, rtx src)
1098 {
1099 machine_mode src_mode = GET_MODE (src);
1100 machine_mode dst_mode = GET_MODE (dst);
1101
1102 gcc_assert (VECTOR_MODE_P (dst_mode));
1103
1104 if (REG_P (dst) && REG_P (src))
1105 {
1106 rtx (*gen) (rtx, rtx);
1107
1108 gcc_assert (VECTOR_MODE_P (src_mode));
1109
1110 switch (src_mode)
1111 {
1112 case V16QImode:
1113 gen = gen_aarch64_split_simd_movv16qi;
1114 break;
1115 case V8HImode:
1116 gen = gen_aarch64_split_simd_movv8hi;
1117 break;
1118 case V4SImode:
1119 gen = gen_aarch64_split_simd_movv4si;
1120 break;
1121 case V2DImode:
1122 gen = gen_aarch64_split_simd_movv2di;
1123 break;
1124 case V4SFmode:
1125 gen = gen_aarch64_split_simd_movv4sf;
1126 break;
1127 case V2DFmode:
1128 gen = gen_aarch64_split_simd_movv2df;
1129 break;
1130 default:
1131 gcc_unreachable ();
1132 }
1133
1134 emit_insn (gen (dst, src));
1135 return;
1136 }
1137 }
1138
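/* Return VALUE in a register of mode MODE, creating a new pseudo when
   possible; otherwise move VALUE into the existing register X and
   return X.  */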
1139 static rtx
1140 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1141 {
1142 if (can_create_pseudo_p ())
1143 return force_reg (mode, value);
1144 else
1145 {
1146 x = aarch64_emit_move (x, value);
1147 return x;
1148 }
1149 }
1150
1151
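/* Return an rtx equivalent to REG + OFFSET in mode MODE.  If OFFSET is not
   a legitimate add immediate, load it into the scratch register TEMP
   first.  */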
1152 static rtx
1153 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1154 {
1155 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1156 {
1157 rtx high;
1158 /* Load the full offset into a register. This
1159 might be improvable in the future. */
1160 high = GEN_INT (offset);
1161 offset = 0;
1162 high = aarch64_force_temporary (mode, temp, high);
1163 reg = aarch64_force_temporary (mode, temp,
1164 gen_rtx_PLUS (mode, high, reg));
1165 }
1166 return plus_constant (mode, reg, offset);
1167 }
1168
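/* Build the integer constant IMM of mode MODE into register DEST and return
   the number of instructions needed to do so.  If GENERATE is false, only
   count the instructions without emitting them.  */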
1169 static int
1170 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1171 machine_mode mode)
1172 {
1173 unsigned HOST_WIDE_INT mask;
1174 int i;
1175 bool first;
1176 unsigned HOST_WIDE_INT val;
1177 bool subtargets;
1178 rtx subtarget;
1179 int one_match, zero_match, first_not_ffff_match;
1180 int num_insns = 0;
1181
1182 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1183 {
1184 if (generate)
1185 emit_insn (gen_rtx_SET (dest, imm));
1186 num_insns++;
1187 return num_insns;
1188 }
1189
1190 if (mode == SImode)
1191 {
1192 /* We know we can't do this in 1 insn, and we must be able to do it
1193 in two; so don't mess around looking for sequences that don't buy
1194 us anything. */
1195 if (generate)
1196 {
1197 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1198 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1199 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1200 }
1201 num_insns += 2;
1202 return num_insns;
1203 }
1204
1205 /* Remaining cases are all for DImode. */
1206
1207 val = INTVAL (imm);
1208 subtargets = optimize && can_create_pseudo_p ();
1209
1210 one_match = 0;
1211 zero_match = 0;
1212 mask = 0xffff;
1213 first_not_ffff_match = -1;
1214
1215 for (i = 0; i < 64; i += 16, mask <<= 16)
1216 {
1217 if ((val & mask) == mask)
1218 one_match++;
1219 else
1220 {
1221 if (first_not_ffff_match < 0)
1222 first_not_ffff_match = i;
1223 if ((val & mask) == 0)
1224 zero_match++;
1225 }
1226 }
1227
1228 if (one_match == 2)
1229 {
1230 /* Set one of the quarters and then insert back into result. */
1231 mask = 0xffffll << first_not_ffff_match;
1232 if (generate)
1233 {
1234 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1235 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1236 GEN_INT ((val >> first_not_ffff_match)
1237 & 0xffff)));
1238 }
1239 num_insns += 2;
1240 return num_insns;
1241 }
1242
1243 if (zero_match == 2)
1244 goto simple_sequence;
1245
1246 mask = 0x0ffff0000UL;
1247 for (i = 16; i < 64; i += 16, mask <<= 16)
1248 {
1249 HOST_WIDE_INT comp = mask & ~(mask - 1);
1250
1251 if (aarch64_uimm12_shift (val - (val & mask)))
1252 {
1253 if (generate)
1254 {
1255 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1256 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1257 emit_insn (gen_adddi3 (dest, subtarget,
1258 GEN_INT (val - (val & mask))));
1259 }
1260 num_insns += 2;
1261 return num_insns;
1262 }
1263 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1264 {
1265 if (generate)
1266 {
1267 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1268 emit_insn (gen_rtx_SET (subtarget,
1269 GEN_INT ((val + comp) & mask)));
1270 emit_insn (gen_adddi3 (dest, subtarget,
1271 GEN_INT (val - ((val + comp) & mask))));
1272 }
1273 num_insns += 2;
1274 return num_insns;
1275 }
1276 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1277 {
1278 if (generate)
1279 {
1280 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1281 emit_insn (gen_rtx_SET (subtarget,
1282 GEN_INT ((val - comp) | ~mask)));
1283 emit_insn (gen_adddi3 (dest, subtarget,
1284 GEN_INT (val - ((val - comp) | ~mask))));
1285 }
1286 num_insns += 2;
1287 return num_insns;
1288 }
1289 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1290 {
1291 if (generate)
1292 {
1293 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1294 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1295 emit_insn (gen_adddi3 (dest, subtarget,
1296 GEN_INT (val - (val | ~mask))));
1297 }
1298 num_insns += 2;
1299 return num_insns;
1300 }
1301 }
1302
1303 /* See if we can do it by arithmetically combining two
1304 immediates. */
1305 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1306 {
1307 int j;
1308 mask = 0xffff;
1309
1310 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1311 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1312 {
1313 if (generate)
1314 {
1315 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1316 emit_insn (gen_rtx_SET (subtarget,
1317 GEN_INT (aarch64_bitmasks[i])));
1318 emit_insn (gen_adddi3 (dest, subtarget,
1319 GEN_INT (val - aarch64_bitmasks[i])));
1320 }
1321 num_insns += 2;
1322 return num_insns;
1323 }
1324
1325 for (j = 0; j < 64; j += 16, mask <<= 16)
1326 {
1327 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1328 {
1329 if (generate)
1330 {
1331 emit_insn (gen_rtx_SET (dest,
1332 GEN_INT (aarch64_bitmasks[i])));
1333 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1334 GEN_INT ((val >> j) & 0xffff)));
1335 }
1336 num_insns += 2;
1337 return num_insns;
1338 }
1339 }
1340 }
1341
1342 /* See if we can do it by logically combining two immediates. */
1343 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1344 {
1345 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1346 {
1347 int j;
1348
1349 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1350 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1351 {
1352 if (generate)
1353 {
1354 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1355 emit_insn (gen_rtx_SET (subtarget,
1356 GEN_INT (aarch64_bitmasks[i])));
1357 emit_insn (gen_iordi3 (dest, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 }
1360 num_insns += 2;
1361 return num_insns;
1362 }
1363 }
1364 else if ((val & aarch64_bitmasks[i]) == val)
1365 {
1366 int j;
1367
1368 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1369 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1370 {
1371 if (generate)
1372 {
1373 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1374 emit_insn (gen_rtx_SET (subtarget,
1375 GEN_INT (aarch64_bitmasks[j])));
1376 emit_insn (gen_anddi3 (dest, subtarget,
1377 GEN_INT (aarch64_bitmasks[i])));
1378 }
1379 num_insns += 2;
1380 return num_insns;
1381 }
1382 }
1383 }
1384
1385 if (one_match > zero_match)
1386 {
1387       /* Set either the first three quarters or all but the third.  */
1388 mask = 0xffffll << (16 - first_not_ffff_match);
1389 if (generate)
1390 emit_insn (gen_rtx_SET (dest,
1391 GEN_INT (val | mask | 0xffffffff00000000ull)));
1392 num_insns ++;
1393
1394       /* Now insert the other two quarters.  */
1395 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1396 i < 64; i += 16, mask <<= 16)
1397 {
1398 if ((val & mask) != mask)
1399 {
1400 if (generate)
1401 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1402 GEN_INT ((val >> i) & 0xffff)));
1403 num_insns ++;
1404 }
1405 }
1406 return num_insns;
1407 }
1408
1409 simple_sequence:
1410 first = true;
1411 mask = 0xffff;
1412 for (i = 0; i < 64; i += 16, mask <<= 16)
1413 {
1414 if ((val & mask) != 0)
1415 {
1416 if (first)
1417 {
1418 if (generate)
1419 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1420 num_insns ++;
1421 first = false;
1422 }
1423 else
1424 {
1425 if (generate)
1426 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1427 GEN_INT ((val >> i) & 0xffff)));
1428 num_insns ++;
1429 }
1430 }
1431 }
1432
1433 return num_insns;
1434 }
1435
1436
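/* Expand a move of the integer or symbolic constant IMM into register DEST,
   which must have mode SImode or DImode.  */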
1437 void
1438 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1439 {
1440 machine_mode mode = GET_MODE (dest);
1441
1442 gcc_assert (mode == SImode || mode == DImode);
1443
1444 /* Check on what type of symbol it is. */
1445 if (GET_CODE (imm) == SYMBOL_REF
1446 || GET_CODE (imm) == LABEL_REF
1447 || GET_CODE (imm) == CONST)
1448 {
1449 rtx mem, base, offset;
1450 enum aarch64_symbol_type sty;
1451
1452 /* If we have (const (plus symbol offset)), separate out the offset
1453 before we start classifying the symbol. */
1454 split_const (imm, &base, &offset);
1455
1456 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1457 switch (sty)
1458 {
1459 case SYMBOL_FORCE_TO_MEM:
1460 if (offset != const0_rtx
1461 && targetm.cannot_force_const_mem (mode, imm))
1462 {
1463 gcc_assert (can_create_pseudo_p ());
1464 base = aarch64_force_temporary (mode, dest, base);
1465 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1466 aarch64_emit_move (dest, base);
1467 return;
1468 }
1469 mem = force_const_mem (ptr_mode, imm);
1470 gcc_assert (mem);
1471 if (mode != ptr_mode)
1472 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1473 emit_insn (gen_rtx_SET (dest, mem));
1474 return;
1475
1476 case SYMBOL_SMALL_TLSGD:
1477 case SYMBOL_SMALL_TLSDESC:
1478 case SYMBOL_SMALL_GOTTPREL:
1479 case SYMBOL_SMALL_GOT:
1480 case SYMBOL_TINY_GOT:
1481 if (offset != const0_rtx)
1482 {
1483           gcc_assert (can_create_pseudo_p ());
1484 base = aarch64_force_temporary (mode, dest, base);
1485 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1486 aarch64_emit_move (dest, base);
1487 return;
1488 }
1489 /* FALLTHRU */
1490
1491 case SYMBOL_SMALL_TPREL:
1492 case SYMBOL_SMALL_ABSOLUTE:
1493 case SYMBOL_TINY_ABSOLUTE:
1494 aarch64_load_symref_appropriately (dest, imm, sty);
1495 return;
1496
1497 default:
1498 gcc_unreachable ();
1499 }
1500 }
1501
1502 if (!CONST_INT_P (imm))
1503 {
1504 if (GET_CODE (imm) == HIGH)
1505 emit_insn (gen_rtx_SET (dest, imm));
1506 else
1507 {
1508 rtx mem = force_const_mem (mode, imm);
1509 gcc_assert (mem);
1510 emit_insn (gen_rtx_SET (dest, mem));
1511 }
1512
1513 return;
1514 }
1515
1516 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1517 }
1518
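/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */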
1519 static bool
1520 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1521 tree exp ATTRIBUTE_UNUSED)
1522 {
1523 /* Currently, always true. */
1524 return true;
1525 }
1526
1527 /* Implement TARGET_PASS_BY_REFERENCE. */
1528
1529 static bool
1530 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1531 machine_mode mode,
1532 const_tree type,
1533 bool named ATTRIBUTE_UNUSED)
1534 {
1535 HOST_WIDE_INT size;
1536 machine_mode dummymode;
1537 int nregs;
1538
1539 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1540 size = (mode == BLKmode && type)
1541 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1542
1543 /* Aggregates are passed by reference based on their size. */
1544 if (type && AGGREGATE_TYPE_P (type))
1545 {
1546 size = int_size_in_bytes (type);
1547 }
1548
1549 /* Variable sized arguments are always returned by reference. */
1550 if (size < 0)
1551 return true;
1552
1553 /* Can this be a candidate to be passed in fp/simd register(s)? */
1554 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1555 &dummymode, &nregs,
1556 NULL))
1557 return false;
1558
1559 /* Arguments which are variable sized or larger than 2 registers are
1560      passed by reference unless they are a homogeneous floating-point
1561 aggregate. */
1562 return size > 2 * UNITS_PER_WORD;
1563 }
1564
1565 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1566 static bool
1567 aarch64_return_in_msb (const_tree valtype)
1568 {
1569 machine_mode dummy_mode;
1570 int dummy_int;
1571
1572 /* Never happens in little-endian mode. */
1573 if (!BYTES_BIG_ENDIAN)
1574 return false;
1575
1576 /* Only composite types smaller than or equal to 16 bytes can
1577 be potentially returned in registers. */
1578 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1579 || int_size_in_bytes (valtype) <= 0
1580 || int_size_in_bytes (valtype) > 16)
1581 return false;
1582
1583 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1584 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1585 is always passed/returned in the least significant bits of fp/simd
1586 register(s). */
1587 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1588 &dummy_mode, &dummy_int, NULL))
1589 return false;
1590
1591 return true;
1592 }
1593
1594 /* Implement TARGET_FUNCTION_VALUE.
1595 Define how to find the value returned by a function. */
1596
1597 static rtx
1598 aarch64_function_value (const_tree type, const_tree func,
1599 bool outgoing ATTRIBUTE_UNUSED)
1600 {
1601 machine_mode mode;
1602 int unsignedp;
1603 int count;
1604 machine_mode ag_mode;
1605
1606 mode = TYPE_MODE (type);
1607 if (INTEGRAL_TYPE_P (type))
1608 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1609
1610 if (aarch64_return_in_msb (type))
1611 {
1612 HOST_WIDE_INT size = int_size_in_bytes (type);
1613
1614 if (size % UNITS_PER_WORD != 0)
1615 {
1616 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1617 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1618 }
1619 }
1620
1621 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1622 &ag_mode, &count, NULL))
1623 {
1624 if (!aarch64_composite_type_p (type, mode))
1625 {
1626 gcc_assert (count == 1 && mode == ag_mode);
1627 return gen_rtx_REG (mode, V0_REGNUM);
1628 }
1629 else
1630 {
1631 int i;
1632 rtx par;
1633
1634 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1635 for (i = 0; i < count; i++)
1636 {
1637 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1638 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1639 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1640 XVECEXP (par, 0, i) = tmp;
1641 }
1642 return par;
1643 }
1644 }
1645 else
1646 return gen_rtx_REG (mode, R0_REGNUM);
1647 }
1648
1649 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1650 Return true if REGNO is the number of a hard register in which the values
1651 of called function may come back. */
1652
1653 static bool
1654 aarch64_function_value_regno_p (const unsigned int regno)
1655 {
1656   /* A maximum of 16 bytes can be returned in the general registers.  Examples
1657 of 16-byte return values are: 128-bit integers and 16-byte small
1658 structures (excluding homogeneous floating-point aggregates). */
1659 if (regno == R0_REGNUM || regno == R1_REGNUM)
1660 return true;
1661
1662 /* Up to four fp/simd registers can return a function value, e.g. a
1663 homogeneous floating-point aggregate having four members. */
1664 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1665 return TARGET_FLOAT;
1666
1667 return false;
1668 }
1669
1670 /* Implement TARGET_RETURN_IN_MEMORY.
1671
1672 If the type T of the result of a function is such that
1673 void func (T arg)
1674 would require that arg be passed as a value in a register (or set of
1675 registers) according to the parameter passing rules, then the result
1676 is returned in the same registers as would be used for such an
1677 argument. */
1678
1679 static bool
1680 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1681 {
1682 HOST_WIDE_INT size;
1683 machine_mode ag_mode;
1684 int count;
1685
1686 if (!AGGREGATE_TYPE_P (type)
1687 && TREE_CODE (type) != COMPLEX_TYPE
1688 && TREE_CODE (type) != VECTOR_TYPE)
1689     /* Simple scalar types are always returned in registers.  */
1690 return false;
1691
1692 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1693 type,
1694 &ag_mode,
1695 &count,
1696 NULL))
1697 return false;
1698
1699   /* Types larger than 2 registers are returned in memory.  */
1700 size = int_size_in_bytes (type);
1701 return (size < 0 || size > 2 * UNITS_PER_WORD);
1702 }
1703
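/* Return true if an argument of mode MODE and type TYPE would be passed in
   SIMD/FP registers, setting *NREGS to the number of registers required and
   recording the element mode in the cumulative argument state.  */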
1704 static bool
1705 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1706 const_tree type, int *nregs)
1707 {
1708 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1709 return aarch64_vfp_is_call_or_return_candidate (mode,
1710 type,
1711 &pcum->aapcs_vfp_rmode,
1712 nregs,
1713 NULL);
1714 }
1715
1716 /* Given MODE and TYPE of a function argument, return the alignment in
1717 bits. The idea is to suppress any stronger alignment requested by
1718 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1719 This is a helper function for local use only. */
1720
1721 static unsigned int
1722 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1723 {
1724 unsigned int alignment;
1725
1726 if (type)
1727 {
1728 if (!integer_zerop (TYPE_SIZE (type)))
1729 {
1730 if (TYPE_MODE (type) == mode)
1731 alignment = TYPE_ALIGN (type);
1732 else
1733 alignment = GET_MODE_ALIGNMENT (mode);
1734 }
1735 else
1736 alignment = 0;
1737 }
1738 else
1739 alignment = GET_MODE_ALIGNMENT (mode);
1740
1741 return alignment;
1742 }
1743
1744 /* Layout a function argument according to the AAPCS64 rules. The rule
1745 numbers refer to the rule numbers in the AAPCS64. */
1746
1747 static void
1748 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1749 const_tree type,
1750 bool named ATTRIBUTE_UNUSED)
1751 {
1752 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1753 int ncrn, nvrn, nregs;
1754 bool allocate_ncrn, allocate_nvrn;
1755 HOST_WIDE_INT size;
1756
1757 /* We need to do this once per argument. */
1758 if (pcum->aapcs_arg_processed)
1759 return;
1760
1761 pcum->aapcs_arg_processed = true;
1762
1763   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
1764 size
1765 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1766 UNITS_PER_WORD);
1767
1768 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1769 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1770 mode,
1771 type,
1772 &nregs);
1773
1774   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1775 The following code thus handles passing by SIMD/FP registers first. */
1776
1777 nvrn = pcum->aapcs_nvrn;
1778
1779   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1780      and homogeneous short-vector aggregates (HVA).  */
1781 if (allocate_nvrn)
1782 {
1783 if (!TARGET_FLOAT)
1784 aarch64_err_no_fpadvsimd (mode, "argument");
1785
1786 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1787 {
1788 pcum->aapcs_nextnvrn = nvrn + nregs;
1789 if (!aarch64_composite_type_p (type, mode))
1790 {
1791 gcc_assert (nregs == 1);
1792 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1793 }
1794 else
1795 {
1796 rtx par;
1797 int i;
1798 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1799 for (i = 0; i < nregs; i++)
1800 {
1801 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1802 V0_REGNUM + nvrn + i);
1803 tmp = gen_rtx_EXPR_LIST
1804 (VOIDmode, tmp,
1805 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1806 XVECEXP (par, 0, i) = tmp;
1807 }
1808 pcum->aapcs_reg = par;
1809 }
1810 return;
1811 }
1812 else
1813 {
1814 /* C.3 NSRN is set to 8. */
1815 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1816 goto on_stack;
1817 }
1818 }
1819
1820 ncrn = pcum->aapcs_ncrn;
1821 nregs = size / UNITS_PER_WORD;
1822
1823   /* C6 - C9, though the sign and zero extension semantics are
1824      handled elsewhere.  This is the case where the argument fits
1825      entirely in general registers.  */
1826 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1827 {
1828 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1829
1830 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1831
1832 /* C.8 if the argument has an alignment of 16 then the NGRN is
1833 rounded up to the next even number. */
1834 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1835 {
1836 ++ncrn;
1837 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1838 }
1839 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1840 A reg is still generated for it, but the caller should be smart
1841 enough not to use it. */
1842 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1843 {
1844 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1845 }
1846 else
1847 {
1848 rtx par;
1849 int i;
1850
1851 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1852 for (i = 0; i < nregs; i++)
1853 {
1854 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1855 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1856 GEN_INT (i * UNITS_PER_WORD));
1857 XVECEXP (par, 0, i) = tmp;
1858 }
1859 pcum->aapcs_reg = par;
1860 }
1861
1862 pcum->aapcs_nextncrn = ncrn + nregs;
1863 return;
1864 }
1865
1866 /* C.11 */
1867 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1868
1869 /* The argument is passed on stack; record the needed number of words for
1870 this argument and align the total size if necessary. */
1871 on_stack:
1872 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1873 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1874 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1875 16 / UNITS_PER_WORD);
1876 return;
1877 }
1878
1879 /* Implement TARGET_FUNCTION_ARG. */
1880
1881 static rtx
1882 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1883 const_tree type, bool named)
1884 {
1885 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1886 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1887
1888 if (mode == VOIDmode)
1889 return NULL_RTX;
1890
1891 aarch64_layout_arg (pcum_v, mode, type, named);
1892 return pcum->aapcs_reg;
1893 }
1894
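/* Initialize the cumulative argument state PCUM for a call.  When the
   FP/SIMD registers are unavailable, diagnose a public function whose
   return value would require them.  */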
1895 void
1896 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1897 const_tree fntype ATTRIBUTE_UNUSED,
1898 rtx libname ATTRIBUTE_UNUSED,
1899 const_tree fndecl ATTRIBUTE_UNUSED,
1900 unsigned n_named ATTRIBUTE_UNUSED)
1901 {
1902 pcum->aapcs_ncrn = 0;
1903 pcum->aapcs_nvrn = 0;
1904 pcum->aapcs_nextncrn = 0;
1905 pcum->aapcs_nextnvrn = 0;
1906 pcum->pcs_variant = ARM_PCS_AAPCS64;
1907 pcum->aapcs_reg = NULL_RTX;
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_stack_words = 0;
1910 pcum->aapcs_stack_size = 0;
1911
1912 if (!TARGET_FLOAT
1913 && fndecl && TREE_PUBLIC (fndecl)
1914 && fntype && fntype != error_mark_node)
1915 {
1916 const_tree type = TREE_TYPE (fntype);
1917 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
1918 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
1919 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
1920 &mode, &nregs, NULL))
1921 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
1922 }
1923 return;
1924 }
1925
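/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */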
1926 static void
1927 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1928 machine_mode mode,
1929 const_tree type,
1930 bool named)
1931 {
1932 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1933 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1934 {
1935 aarch64_layout_arg (pcum_v, mode, type, named);
1936 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1937 != (pcum->aapcs_stack_words != 0));
1938 pcum->aapcs_arg_processed = false;
1939 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1940 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1941 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1942 pcum->aapcs_stack_words = 0;
1943 pcum->aapcs_reg = NULL_RTX;
1944 }
1945 }
1946
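/* Implement FUNCTION_ARG_REGNO_P: return true if REGNO can be used to pass
   a function argument.  */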
1947 bool
1948 aarch64_function_arg_regno_p (unsigned regno)
1949 {
1950 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1951 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1952 }
1953
1954 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1955 PARM_BOUNDARY bits of alignment, but will be given anything up
1956 to STACK_BOUNDARY bits if the type requires it. This makes sure
1957 that both before and after the layout of each argument, the Next
1958 Stacked Argument Address (NSAA) will have a minimum alignment of
1959 8 bytes. */
1960
1961 static unsigned int
1962 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1963 {
1964 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1965
1966 if (alignment < PARM_BOUNDARY)
1967 alignment = PARM_BOUNDARY;
1968 if (alignment > STACK_BOUNDARY)
1969 alignment = STACK_BOUNDARY;
1970 return alignment;
1971 }
1972
1973 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1974
1975 Return true if an argument passed on the stack should be padded upwards,
1976 i.e. if the least-significant byte of the stack slot has useful data.
1977
1978 Small aggregate types are placed in the lowest memory address.
1979
1980 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1981
1982 bool
1983 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1984 {
1985 /* On little-endian targets, the least significant byte of every stack
1986 argument is passed at the lowest byte address of the stack slot. */
1987 if (!BYTES_BIG_ENDIAN)
1988 return true;
1989
1990 /* Otherwise, integral, floating-point and pointer types are padded downward:
1991 the least significant byte of a stack argument is passed at the highest
1992 byte address of the stack slot. */
1993 if (type
1994 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1995 || POINTER_TYPE_P (type))
1996 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1997 return false;
1998
1999 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2000 return true;
2001 }
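
/* As an illustration (example types chosen arbitrarily): on a
   big-endian target an int occupying an 8-byte stack slot is padded
   downward, so its value sits in the high-addressed bytes of the
   slot, whereas a 3-byte struct is padded upward and starts at the
   lowest address; on little-endian targets both start at the lowest
   address.  */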
2002
2003 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2004
2005 It specifies padding for the last (possibly the only)
2006 element of a block move between registers and memory. Viewing
2007 the block as if it were in memory, padding upward means that
2008 the last element is padded after its most significant byte,
2009 while with downward padding the last element is padded on its
2010 least significant byte side.
2011
2012 Small aggregates and small complex types are always padded
2013 upwards.
2014
2015 We don't need to worry about homogeneous floating-point or
2016 short-vector aggregates; their move is not affected by the
2017 padding direction determined here. Regardless of endianness,
2018 each element of such an aggregate is put in the least
2019 significant bits of a fp/simd register.
2020
2021 Return true if the least significant byte of the register holds
2022 useful data (i.e. the element should be padded upward), and false
2023 if the most significant byte does. */
2024
2025 bool
2026 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2027 bool first ATTRIBUTE_UNUSED)
2028 {
2029
2030 /* Small composite types are always padded upward. */
2031 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2032 {
2033 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2034 : GET_MODE_SIZE (mode));
2035 if (size < 2 * UNITS_PER_WORD)
2036 return true;
2037 }
2038
2039 /* Otherwise, use the default padding. */
2040 return !BYTES_BIG_ENDIAN;
2041 }
2042
2043 static machine_mode
2044 aarch64_libgcc_cmp_return_mode (void)
2045 {
2046 return SImode;
2047 }
2048
2049 static bool
2050 aarch64_frame_pointer_required (void)
2051 {
2052 /* In aarch64_override_options_after_change
2053 flag_omit_leaf_frame_pointer turns off the frame pointer by
2054 default. Turn it back on now if the function is not a leaf or
2055 if LR is ever live. */
2056 if (flag_omit_leaf_frame_pointer
2057 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2058 return true;
2059
2060 return false;
2061 }
2062
2063 /* Mark the registers that need to be saved by the callee and calculate
2064 the size of the callee-saved registers area and frame record (both FP
2065 and LR may be omitted). */
2066 static void
2067 aarch64_layout_frame (void)
2068 {
2069 HOST_WIDE_INT offset = 0;
2070 int regno;
2071
2072 if (reload_completed && cfun->machine->frame.laid_out)
2073 return;
2074
2075 #define SLOT_NOT_REQUIRED (-2)
2076 #define SLOT_REQUIRED (-1)
2077
2078 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2079 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2080
2081 /* First mark all the registers that really need to be saved... */
2082 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2083 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2084
2085 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2086 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2087
2088 /* ... that includes the eh data registers (if needed)... */
2089 if (crtl->calls_eh_return)
2090 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2091 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2092 = SLOT_REQUIRED;
2093
2094 /* ... and any callee saved register that dataflow says is live. */
2095 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2096 if (df_regs_ever_live_p (regno)
2097 && (regno == R30_REGNUM
2098 || !call_used_regs[regno]))
2099 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2100
2101 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2102 if (df_regs_ever_live_p (regno)
2103 && !call_used_regs[regno])
2104 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2105
2106 if (frame_pointer_needed)
2107 {
2108 /* FP and LR are placed in the linkage record. */
2109 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2110 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2111 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2112 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2113 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2114 offset += 2 * UNITS_PER_WORD;
2115 }
2116
2117 /* Now assign stack slots for them. */
2118 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2119 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2120 {
2121 cfun->machine->frame.reg_offset[regno] = offset;
2122 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2123 cfun->machine->frame.wb_candidate1 = regno;
2124 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2125 cfun->machine->frame.wb_candidate2 = regno;
2126 offset += UNITS_PER_WORD;
2127 }
2128
2129 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2130 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2131 {
2132 cfun->machine->frame.reg_offset[regno] = offset;
2133 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2134 cfun->machine->frame.wb_candidate1 = regno;
2135 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2136 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2137 cfun->machine->frame.wb_candidate2 = regno;
2138 offset += UNITS_PER_WORD;
2139 }
2140
2141 cfun->machine->frame.padding0 =
2142 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2143 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2144
2145 cfun->machine->frame.saved_regs_size = offset;
2146
2147 cfun->machine->frame.hard_fp_offset
2148 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2149 + get_frame_size ()
2150 + cfun->machine->frame.saved_regs_size,
2151 STACK_BOUNDARY / BITS_PER_UNIT);
2152
2153 cfun->machine->frame.frame_size
2154 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2155 + crtl->outgoing_args_size,
2156 STACK_BOUNDARY / BITS_PER_UNIT);
2157
2158 cfun->machine->frame.laid_out = true;
2159 }
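
/* Worked example (register choice arbitrary): a function that needs
   a frame pointer, saves x19 and x20 and uses d8 gets
       reg_offset[R29] = 0,  reg_offset[R30] = 8,
       reg_offset[R19] = 16, reg_offset[R20] = 24,
       reg_offset[V8]  = 32,
   giving offset = 40, padding0 = 8 and saved_regs_size = 48 once
   rounded up to STACK_BOUNDARY / BITS_PER_UNIT (16 bytes);
   hard_fp_offset and frame_size then add the saved varargs area,
   the local variable area and the outgoing argument area, each
   likewise rounded to 16 bytes.  */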
2160
2161 static bool
2162 aarch64_register_saved_on_entry (int regno)
2163 {
2164 return cfun->machine->frame.reg_offset[regno] >= 0;
2165 }
2166
2167 static unsigned
2168 aarch64_next_callee_save (unsigned regno, unsigned limit)
2169 {
2170 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2171 regno ++;
2172 return regno;
2173 }
2174
2175 static void
2176 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2177 HOST_WIDE_INT adjustment)
2178 {
2179 rtx base_rtx = stack_pointer_rtx;
2180 rtx insn, reg, mem;
2181
2182 reg = gen_rtx_REG (mode, regno);
2183 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2184 plus_constant (Pmode, base_rtx, -adjustment));
2185 mem = gen_rtx_MEM (mode, mem);
2186
2187 insn = emit_move_insn (mem, reg);
2188 RTX_FRAME_RELATED_P (insn) = 1;
2189 }
2190
2191 static rtx
2192 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2193 HOST_WIDE_INT adjustment)
2194 {
2195 switch (mode)
2196 {
2197 case DImode:
2198 return gen_storewb_pairdi_di (base, base, reg, reg2,
2199 GEN_INT (-adjustment),
2200 GEN_INT (UNITS_PER_WORD - adjustment));
2201 case DFmode:
2202 return gen_storewb_pairdf_di (base, base, reg, reg2,
2203 GEN_INT (-adjustment),
2204 GEN_INT (UNITS_PER_WORD - adjustment));
2205 default:
2206 gcc_unreachable ();
2207 }
2208 }
2209
2210 static void
2211 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2212 unsigned regno2, HOST_WIDE_INT adjustment)
2213 {
2214 rtx_insn *insn;
2215 rtx reg1 = gen_rtx_REG (mode, regno1);
2216 rtx reg2 = gen_rtx_REG (mode, regno2);
2217
2218 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2219 reg2, adjustment));
2220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2221 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2222 RTX_FRAME_RELATED_P (insn) = 1;
2223 }
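
/* For example (register numbers arbitrary), a call such as
   aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, 16)
   emits the writeback store pair
       stp  x29, x30, [sp, -16]!
   i.e. the stack pointer is pre-decremented by ADJUSTMENT and both
   registers are stored at the new top of the stack.  */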
2224
2225 static rtx
2226 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2227 HOST_WIDE_INT adjustment)
2228 {
2229 switch (mode)
2230 {
2231 case DImode:
2232 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2233 GEN_INT (UNITS_PER_WORD));
2234 case DFmode:
2235 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2236 GEN_INT (UNITS_PER_WORD));
2237 default:
2238 gcc_unreachable ();
2239 }
2240 }
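
/* The matching epilogue form (example registers arbitrary) is a
   post-indexed load pair such as
       ldp  x29, x30, [sp], 16
   which reloads both registers and then increments the stack
   pointer by ADJUSTMENT.  */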
2241
2242 static rtx
2243 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2244 rtx reg2)
2245 {
2246 switch (mode)
2247 {
2248 case DImode:
2249 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2250
2251 case DFmode:
2252 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2253
2254 default:
2255 gcc_unreachable ();
2256 }
2257 }
2258
2259 static rtx
2260 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2261 rtx mem2)
2262 {
2263 switch (mode)
2264 {
2265 case DImode:
2266 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2267
2268 case DFmode:
2269 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2270
2271 default:
2272 gcc_unreachable ();
2273 }
2274 }
2275
2276
2277 static void
2278 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2279 unsigned start, unsigned limit, bool skip_wb)
2280 {
2281 rtx_insn *insn;
2282 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2283 ? gen_frame_mem : gen_rtx_MEM);
2284 unsigned regno;
2285 unsigned regno2;
2286
2287 for (regno = aarch64_next_callee_save (start, limit);
2288 regno <= limit;
2289 regno = aarch64_next_callee_save (regno + 1, limit))
2290 {
2291 rtx reg, mem;
2292 HOST_WIDE_INT offset;
2293
2294 if (skip_wb
2295 && (regno == cfun->machine->frame.wb_candidate1
2296 || regno == cfun->machine->frame.wb_candidate2))
2297 continue;
2298
2299 reg = gen_rtx_REG (mode, regno);
2300 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2301 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2302 offset));
2303
2304 regno2 = aarch64_next_callee_save (regno + 1, limit);
2305
2306 if (regno2 <= limit
2307 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2308 == cfun->machine->frame.reg_offset[regno2]))
2310 {
2311 rtx reg2 = gen_rtx_REG (mode, regno2);
2312 rtx mem2;
2313
2314 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2315 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2316 offset));
2317 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2318 reg2));
2319
2320 /* The first part of a frame-related parallel insn is
2321 always assumed to be relevant to the frame
2322 calculations; subsequent parts are only
2323 frame-related if explicitly marked. */
2324 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2325 regno = regno2;
2326 }
2327 else
2328 insn = emit_move_insn (mem, reg);
2329
2330 RTX_FRAME_RELATED_P (insn) = 1;
2331 }
2332 }
2333
2334 static void
2335 aarch64_restore_callee_saves (machine_mode mode,
2336 HOST_WIDE_INT start_offset, unsigned start,
2337 unsigned limit, bool skip_wb, rtx *cfi_ops)
2338 {
2339 rtx base_rtx = stack_pointer_rtx;
2340 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2341 ? gen_frame_mem : gen_rtx_MEM);
2342 unsigned regno;
2343 unsigned regno2;
2344 HOST_WIDE_INT offset;
2345
2346 for (regno = aarch64_next_callee_save (start, limit);
2347 regno <= limit;
2348 regno = aarch64_next_callee_save (regno + 1, limit))
2349 {
2350 rtx reg, mem;
2351
2352 if (skip_wb
2353 && (regno == cfun->machine->frame.wb_candidate1
2354 || regno == cfun->machine->frame.wb_candidate2))
2355 continue;
2356
2357 reg = gen_rtx_REG (mode, regno);
2358 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2359 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2360
2361 regno2 = aarch64_next_callee_save (regno + 1, limit);
2362
2363 if (regno2 <= limit
2364 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2365 == cfun->machine->frame.reg_offset[regno2]))
2366 {
2367 rtx reg2 = gen_rtx_REG (mode, regno2);
2368 rtx mem2;
2369
2370 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2371 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2372 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2373
2374 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2375 regno = regno2;
2376 }
2377 else
2378 emit_move_insn (reg, mem);
2379 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2380 }
2381 }
2382
2383 /* AArch64 stack frames generated by this compiler look like:
2384
2385 +-------------------------------+
2386 | |
2387 | incoming stack arguments |
2388 | |
2389 +-------------------------------+
2390 | | <-- incoming stack pointer (aligned)
2391 | callee-allocated save area |
2392 | for register varargs |
2393 | |
2394 +-------------------------------+
2395 | local variables | <-- frame_pointer_rtx
2396 | |
2397 +-------------------------------+
2398 | padding0 | \
2399 +-------------------------------+ |
2400 | callee-saved registers | | frame.saved_regs_size
2401 +-------------------------------+ |
2402 | LR' | |
2403 +-------------------------------+ |
2404 | FP' | / <- hard_frame_pointer_rtx (aligned)
2405 +-------------------------------+
2406 | dynamic allocation |
2407 +-------------------------------+
2408 | padding |
2409 +-------------------------------+
2410 | outgoing stack arguments | <-- arg_pointer
2411 | |
2412 +-------------------------------+
2413 | | <-- stack_pointer_rtx (aligned)
2414
2415 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2416 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2417 unchanged. */
2418
2419 /* Generate the prologue instructions for entry into a function.
2420 Establish the stack frame by decreasing the stack pointer with a
2421 properly calculated size and, if necessary, create a frame record
2422 filled with the values of LR and previous frame pointer. The
2423 current FP is also set up if it is in use. */
2424
2425 void
2426 aarch64_expand_prologue (void)
2427 {
2428 /* sub sp, sp, #<frame_size>
2429 stp {fp, lr}, [sp, #<frame_size> - 16]
2430 add fp, sp, #<frame_size> - hardfp_offset
2431 stp {cs_reg}, [fp, #-16] etc.
2432
2433 sub sp, sp, <final_adjustment_if_any>
2434 */
2435 HOST_WIDE_INT frame_size, offset;
2436 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2437 HOST_WIDE_INT hard_fp_offset;
2438 rtx_insn *insn;
2439
2440 aarch64_layout_frame ();
2441
2442 offset = frame_size = cfun->machine->frame.frame_size;
2443 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2444 fp_offset = frame_size - hard_fp_offset;
2445
2446 if (flag_stack_usage_info)
2447 current_function_static_stack_size = frame_size;
2448
2449 /* Store pairs and load pairs have a range of only -512 to 504. */
2450 if (offset >= 512)
2451 {
2452 /* When the frame has a large size, an initial decrease is done on
2453 the stack pointer to jump over the callee-allocated save area for
2454 register varargs, the local variable area and/or the callee-saved
2455 register area. This will allow the pre-index write-back
2456 store pair instructions to be used for setting up the stack frame
2457 efficiently. */
2458 offset = hard_fp_offset;
2459 if (offset >= 512)
2460 offset = cfun->machine->frame.saved_regs_size;
2461
2462 frame_size -= (offset + crtl->outgoing_args_size);
2463 fp_offset = 0;
2464
2465 if (frame_size >= 0x1000000)
2466 {
2467 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2468 emit_move_insn (op0, GEN_INT (-frame_size));
2469 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2470
2471 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2472 gen_rtx_SET (stack_pointer_rtx,
2473 plus_constant (Pmode, stack_pointer_rtx,
2474 -frame_size)));
2475 RTX_FRAME_RELATED_P (insn) = 1;
2476 }
2477 else if (frame_size > 0)
2478 {
2479 int hi_ofs = frame_size & 0xfff000;
2480 int lo_ofs = frame_size & 0x000fff;
2481
2482 if (hi_ofs)
2483 {
2484 insn = emit_insn (gen_add2_insn
2485 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2486 RTX_FRAME_RELATED_P (insn) = 1;
2487 }
2488 if (lo_ofs)
2489 {
2490 insn = emit_insn (gen_add2_insn
2491 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2492 RTX_FRAME_RELATED_P (insn) = 1;
2493 }
2494 }
2495 }
2496 else
2497 frame_size = -1;
2498
2499 if (offset > 0)
2500 {
2501 bool skip_wb = false;
2502
2503 if (frame_pointer_needed)
2504 {
2505 skip_wb = true;
2506
2507 if (fp_offset)
2508 {
2509 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2510 GEN_INT (-offset)));
2511 RTX_FRAME_RELATED_P (insn) = 1;
2512
2513 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2514 R30_REGNUM, false);
2515 }
2516 else
2517 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2518
2519 /* Set up frame pointer to point to the location of the
2520 previous frame pointer on the stack. */
2521 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2522 stack_pointer_rtx,
2523 GEN_INT (fp_offset)));
2524 RTX_FRAME_RELATED_P (insn) = 1;
2525 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2526 }
2527 else
2528 {
2529 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2530 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2531
2532 if (fp_offset
2533 || reg1 == FIRST_PSEUDO_REGISTER
2534 || (reg2 == FIRST_PSEUDO_REGISTER
2535 && offset >= 256))
2536 {
2537 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2538 GEN_INT (-offset)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
2540 }
2541 else
2542 {
2543 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2544
2545 skip_wb = true;
2546
2547 if (reg2 == FIRST_PSEUDO_REGISTER)
2548 aarch64_pushwb_single_reg (mode1, reg1, offset);
2549 else
2550 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2551 }
2552 }
2553
2554 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2555 skip_wb);
2556 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2557 skip_wb);
2558 }
2559
2560 /* When offset >= 512,
2561 sub sp, sp, #<outgoing_args_size> */
2562 if (frame_size > -1)
2563 {
2564 if (crtl->outgoing_args_size > 0)
2565 {
2566 insn = emit_insn (gen_add2_insn
2567 (stack_pointer_rtx,
2568 GEN_INT (- crtl->outgoing_args_size)));
2569 RTX_FRAME_RELATED_P (insn) = 1;
2570 }
2571 }
2572 }
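
/* As an illustration (sizes and registers chosen arbitrarily): for a
   48-byte frame with a frame pointer and x19 as the only other
   callee save, the expansion above typically produces something like
       stp  x29, x30, [sp, -48]!
       add  x29, sp, 0
       str  x19, [sp, 16]
   while frames of 512 bytes or more use the initial
   "sub sp, sp, #<frame_size>" form sketched in the comment at the
   top of this function.  */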
2573
2574 /* Return TRUE if we can use a simple_return insn.
2575
2576 This function checks whether the callee-saved stack is empty, which
2577 means no restore actions are needed. The pro_and_epilogue pass will
2578 use this to check whether the shrink-wrapping optimization is feasible.
2579
2580 bool
2581 aarch64_use_return_insn_p (void)
2582 {
2583 if (!reload_completed)
2584 return false;
2585
2586 if (crtl->profile)
2587 return false;
2588
2589 aarch64_layout_frame ();
2590
2591 return cfun->machine->frame.frame_size == 0;
2592 }
2593
2594 /* Generate the epilogue instructions for returning from a function. */
2595 void
2596 aarch64_expand_epilogue (bool for_sibcall)
2597 {
2598 HOST_WIDE_INT frame_size, offset;
2599 HOST_WIDE_INT fp_offset;
2600 HOST_WIDE_INT hard_fp_offset;
2601 rtx_insn *insn;
2602 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2603 bool need_barrier_p = (get_frame_size () != 0
2604 || cfun->machine->frame.saved_varargs_size);
2605
2606 aarch64_layout_frame ();
2607
2608 offset = frame_size = cfun->machine->frame.frame_size;
2609 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2610 fp_offset = frame_size - hard_fp_offset;
2611
2612 /* Store pairs and load pairs have a range of only -512 to 504. */
2613 if (offset >= 512)
2614 {
2615 offset = hard_fp_offset;
2616 if (offset >= 512)
2617 offset = cfun->machine->frame.saved_regs_size;
2618
2619 frame_size -= (offset + crtl->outgoing_args_size);
2620 fp_offset = 0;
2621 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2622 {
2623 insn = emit_insn (gen_add2_insn
2624 (stack_pointer_rtx,
2625 GEN_INT (crtl->outgoing_args_size)));
2626 RTX_FRAME_RELATED_P (insn) = 1;
2627 }
2628 }
2629 else
2630 frame_size = -1;
2631
2632 /* If there were outgoing arguments or we've done dynamic stack
2633 allocation, then restore the stack pointer from the frame
2634 pointer. This is at most one insn and more efficient than using
2635 GCC's internal mechanism. */
2636 if (frame_pointer_needed
2637 && (crtl->outgoing_args_size || cfun->calls_alloca))
2638 {
2639 if (cfun->calls_alloca)
2640 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641
2642 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2643 hard_frame_pointer_rtx,
2644 GEN_INT (0)));
2645 offset = offset - fp_offset;
2646 }
2647
2648 if (offset > 0)
2649 {
2650 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2651 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2652 bool skip_wb = true;
2653 rtx cfi_ops = NULL;
2654
2655 if (frame_pointer_needed)
2656 fp_offset = 0;
2657 else if (fp_offset
2658 || reg1 == FIRST_PSEUDO_REGISTER
2659 || (reg2 == FIRST_PSEUDO_REGISTER
2660 && offset >= 256))
2661 skip_wb = false;
2662
2663 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2664 skip_wb, &cfi_ops);
2665 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2666 skip_wb, &cfi_ops);
2667
2668 if (need_barrier_p)
2669 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2670
2671 if (skip_wb)
2672 {
2673 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2674 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2675
2676 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2677 if (reg2 == FIRST_PSEUDO_REGISTER)
2678 {
2679 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2680 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2681 mem = gen_rtx_MEM (mode1, mem);
2682 insn = emit_move_insn (rreg1, mem);
2683 }
2684 else
2685 {
2686 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2687
2688 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2689 insn = emit_insn (aarch64_gen_loadwb_pair
2690 (mode1, stack_pointer_rtx, rreg1,
2691 rreg2, offset));
2692 }
2693 }
2694 else
2695 {
2696 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2697 GEN_INT (offset)));
2698 }
2699
2700 /* Reset the CFA to be SP + FRAME_SIZE. */
2701 rtx new_cfa = stack_pointer_rtx;
2702 if (frame_size > 0)
2703 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2704 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2705 REG_NOTES (insn) = cfi_ops;
2706 RTX_FRAME_RELATED_P (insn) = 1;
2707 }
2708
2709 if (frame_size > 0)
2710 {
2711 if (need_barrier_p)
2712 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2713
2714 if (frame_size >= 0x1000000)
2715 {
2716 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2717 emit_move_insn (op0, GEN_INT (frame_size));
2718 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2719 }
2720 else
2721 {
2722 int hi_ofs = frame_size & 0xfff000;
2723 int lo_ofs = frame_size & 0x000fff;
2724
2725 if (hi_ofs && lo_ofs)
2726 {
2727 insn = emit_insn (gen_add2_insn
2728 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2729 RTX_FRAME_RELATED_P (insn) = 1;
2730 frame_size = lo_ofs;
2731 }
2732 insn = emit_insn (gen_add2_insn
2733 (stack_pointer_rtx, GEN_INT (frame_size)));
2734 }
2735
2736 /* Reset the CFA to be SP + 0. */
2737 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2738 RTX_FRAME_RELATED_P (insn) = 1;
2739 }
2740
2741 /* Stack adjustment for exception handler. */
2742 if (crtl->calls_eh_return)
2743 {
2744 /* We need to unwind the stack by the offset computed by
2745 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2746 to be SP; letting the CFA move during this adjustment
2747 is just as correct as retaining the CFA from the body
2748 of the function. Therefore, do nothing special. */
2749 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2750 }
2751
2752 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2753 if (!for_sibcall)
2754 emit_jump_insn (ret_rtx);
2755 }
2756
2757 /* Return the place to copy the exception unwinding return address to.
2758 This will probably be a stack slot, but could (in theory) be the
2759 return register. */
2760 rtx
2761 aarch64_final_eh_return_addr (void)
2762 {
2763 HOST_WIDE_INT fp_offset;
2764
2765 aarch64_layout_frame ();
2766
2767 fp_offset = cfun->machine->frame.frame_size
2768 - cfun->machine->frame.hard_fp_offset;
2769
2770 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2771 return gen_rtx_REG (DImode, LR_REGNUM);
2772
2773 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2774 result in a store to save LR introduced by builtin_eh_return () being
2775 incorrectly deleted because the alias is not detected.
2776 So in the calculation of the address to copy the exception unwinding
2777 return address to, we distinguish two cases.
2778 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2779 we return a SP-relative location since all the addresses are SP-relative
2780 in this case. This prevents the store from being optimized away.
2781 If the fp_offset is not 0, then the addresses will be FP-relative and
2782 therefore we return a FP-relative location. */
2783
2784 if (frame_pointer_needed)
2785 {
2786 if (fp_offset)
2787 return gen_frame_mem (DImode,
2788 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2789 else
2790 return gen_frame_mem (DImode,
2791 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2792 }
2793
2794 /* If FP is not needed, we calculate the location of LR, which would be
2795 at the top of the saved registers block. */
2796
2797 return gen_frame_mem (DImode,
2798 plus_constant (Pmode,
2799 stack_pointer_rtx,
2800 fp_offset
2801 + cfun->machine->frame.saved_regs_size
2802 - 2 * UNITS_PER_WORD));
2803 }
2804
2805 /* Possibly output code to build up a constant in a register. For
2806 the benefit of the costs infrastructure, returns the number of
2807 instructions which would be emitted. GENERATE inhibits or
2808 enables code generation. */
2809
2810 static int
2811 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2812 {
2813 int insns = 0;
2814
2815 if (aarch64_bitmask_imm (val, DImode))
2816 {
2817 if (generate)
2818 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2819 insns = 1;
2820 }
2821 else
2822 {
2823 int i;
2824 int ncount = 0;
2825 int zcount = 0;
2826 HOST_WIDE_INT valp = val >> 16;
2827 HOST_WIDE_INT valm;
2828 HOST_WIDE_INT tval;
2829
2830 for (i = 16; i < 64; i += 16)
2831 {
2832 valm = (valp & 0xffff);
2833
2834 if (valm != 0)
2835 ++ zcount;
2836
2837 if (valm != 0xffff)
2838 ++ ncount;
2839
2840 valp >>= 16;
2841 }
2842
2843 /* zcount contains the number of additional MOVK instructions
2844 required if the constant is built up with an initial MOVZ instruction,
2845 while ncount is the number of MOVK instructions required if starting
2846 with a MOVN instruction. Choose whichever sequence requires fewer
2847 instructions, preferring the MOVZ-based sequence when both require
2848 the same number. */
2849 if (ncount < zcount)
2850 {
2851 if (generate)
2852 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2853 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2854 tval = 0xffff;
2855 insns++;
2856 }
2857 else
2858 {
2859 if (generate)
2860 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2861 GEN_INT (val & 0xffff));
2862 tval = 0;
2863 insns++;
2864 }
2865
2866 val >>= 16;
2867
2868 for (i = 16; i < 64; i += 16)
2869 {
2870 if ((val & 0xffff) != tval)
2871 {
2872 if (generate)
2873 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2874 GEN_INT (i),
2875 GEN_INT (val & 0xffff)));
2876 insns++;
2877 }
2878 val >>= 16;
2879 }
2880 }
2881 return insns;
2882 }
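
/* Worked example (destination register arbitrary): building
   0x123456789abcdef0 takes four instructions,
       movz x0, 0xdef0
       movk x0, 0x9abc, lsl 16
       movk x0, 0x5678, lsl 32
       movk x0, 0x1234, lsl 48
   whereas a mostly-ones value such as 0xffffffffffff1234 is cheaper
   to start from MOVN ("movn x0, 0xedcb" is a single instruction,
   since ~0xedcb == 0xffffffffffff1234).  */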
2883
2884 static void
2885 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2886 {
2887 HOST_WIDE_INT mdelta = delta;
2888 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2889 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2890
2891 if (mdelta < 0)
2892 mdelta = -mdelta;
2893
2894 if (mdelta >= 4096 * 4096)
2895 {
2896 (void) aarch64_build_constant (scratchreg, delta, true);
2897 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2898 }
2899 else if (mdelta > 0)
2900 {
2901 if (mdelta >= 4096)
2902 {
2903 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2904 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2905 if (delta < 0)
2906 emit_insn (gen_rtx_SET (this_rtx,
2907 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2908 else
2909 emit_insn (gen_rtx_SET (this_rtx,
2910 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2911 }
2912 if (mdelta % 4096 != 0)
2913 {
2914 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2915 emit_insn (gen_rtx_SET (this_rtx,
2916 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2917 }
2918 }
2919 }
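
/* As an illustration (register numbers arbitrary): for
   DELTA == 0x12345 the addition is split into
       mov  x1, 18                  // 0x12345 / 4096
       add  x0, x0, x1, lsl 12
       add  x0, x0, 0x345           // 0x12345 % 4096
   while deltas of 4096 * 4096 or more are first materialised in the
   scratch register via aarch64_build_constant and added with a
   single register-register ADD.  */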
2920
2921 /* Output code to add DELTA to the first argument, and then jump
2922 to FUNCTION. Used for C++ multiple inheritance. */
2923 static void
2924 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2925 HOST_WIDE_INT delta,
2926 HOST_WIDE_INT vcall_offset,
2927 tree function)
2928 {
2929 /* The this pointer is always in x0. Note that this differs from
2930 Arm where the this pointer may be bumped to r1 if r0 is required
2931 to return a pointer to an aggregate. On AArch64 a result value
2932 pointer will be in x8. */
2933 int this_regno = R0_REGNUM;
2934 rtx this_rtx, temp0, temp1, addr, funexp;
2935 rtx_insn *insn;
2936
2937 reload_completed = 1;
2938 emit_note (NOTE_INSN_PROLOGUE_END);
2939
2940 if (vcall_offset == 0)
2941 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2942 else
2943 {
2944 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2945
2946 this_rtx = gen_rtx_REG (Pmode, this_regno);
2947 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2948 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2949
2950 addr = this_rtx;
2951 if (delta != 0)
2952 {
2953 if (delta >= -256 && delta < 256)
2954 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2955 plus_constant (Pmode, this_rtx, delta));
2956 else
2957 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2958 }
2959
2960 if (Pmode == ptr_mode)
2961 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2962 else
2963 aarch64_emit_move (temp0,
2964 gen_rtx_ZERO_EXTEND (Pmode,
2965 gen_rtx_MEM (ptr_mode, addr)));
2966
2967 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2968 addr = plus_constant (Pmode, temp0, vcall_offset);
2969 else
2970 {
2971 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2972 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2973 }
2974
2975 if (Pmode == ptr_mode)
2976 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2977 else
2978 aarch64_emit_move (temp1,
2979 gen_rtx_SIGN_EXTEND (Pmode,
2980 gen_rtx_MEM (ptr_mode, addr)));
2981
2982 emit_insn (gen_add2_insn (this_rtx, temp1));
2983 }
2984
2985 /* Generate a tail call to the target function. */
2986 if (!TREE_USED (function))
2987 {
2988 assemble_external (function);
2989 TREE_USED (function) = 1;
2990 }
2991 funexp = XEXP (DECL_RTL (function), 0);
2992 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2993 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2994 SIBLING_CALL_P (insn) = 1;
2995
2996 insn = get_insns ();
2997 shorten_branches (insn);
2998 final_start_function (insn, file, 1);
2999 final (insn, file, 1);
3000 final_end_function ();
3001
3002 /* Stop pretending to be a post-reload pass. */
3003 reload_completed = 0;
3004 }
3005
3006 static bool
3007 aarch64_tls_referenced_p (rtx x)
3008 {
3009 if (!TARGET_HAVE_TLS)
3010 return false;
3011 subrtx_iterator::array_type array;
3012 FOR_EACH_SUBRTX (iter, array, x, ALL)
3013 {
3014 const_rtx x = *iter;
3015 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3016 return true;
3017 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3018 TLS offsets, not real symbol references. */
3019 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3020 iter.skip_subrtxes ();
3021 }
3022 return false;
3023 }
3024
3025
3026 static int
3027 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3028 {
3029 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3030 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3031
3032 if (*imm1 < *imm2)
3033 return -1;
3034 if (*imm1 > *imm2)
3035 return +1;
3036 return 0;
3037 }
3038
3039
3040 static void
3041 aarch64_build_bitmask_table (void)
3042 {
3043 unsigned HOST_WIDE_INT mask, imm;
3044 unsigned int log_e, e, s, r;
3045 unsigned int nimms = 0;
3046
3047 for (log_e = 1; log_e <= 6; log_e++)
3048 {
3049 e = 1 << log_e;
3050 if (e == 64)
3051 mask = ~(HOST_WIDE_INT) 0;
3052 else
3053 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3054 for (s = 1; s < e; s++)
3055 {
3056 for (r = 0; r < e; r++)
3057 {
3058 /* Set S consecutive bits to 1 (S < 64). */
3059 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3060 /* Rotate right by R. */
3061 if (r != 0)
3062 imm = ((imm >> r) | (imm << (e - r))) & mask;
3063 /* Replicate the constant depending on SIMD size. */
3064 switch (log_e) {
3065 case 1: imm |= (imm << 2);
3066 case 2: imm |= (imm << 4);
3067 case 3: imm |= (imm << 8);
3068 case 4: imm |= (imm << 16);
3069 case 5: imm |= (imm << 32);
3070 case 6:
3071 break;
3072 default:
3073 gcc_unreachable ();
3074 }
3075 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3076 aarch64_bitmasks[nimms++] = imm;
3077 }
3078 }
3079 }
3080
3081 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3082 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3083 aarch64_bitmasks_cmp);
3084 }
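
/* As an illustration: with element size e == 16, a run of s == 8
   ones and rotation r == 0 the loop above produces the 16-bit
   pattern 0x00ff, which replication turns into the bitmask
   immediate 0x00ff00ff00ff00ff; rotating the same run by r == 4
   gives 0xf00f and hence 0xf00ff00ff00ff00f.  */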
3085
3086
3087 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3088 a left shift of 0 or 12 bits. */
3089 bool
3090 aarch64_uimm12_shift (HOST_WIDE_INT val)
3091 {
3092 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3093 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3094 );
3095 }
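
/* For example, 0xabc (shift 0) and 0xabc000 (0xabc << 12) are both
   accepted, but 0xabc0 is rejected because its set bits do not fit
   entirely within either 12-bit field.  */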
3096
3097
3098 /* Return true if val is an immediate that can be loaded into a
3099 register by a MOVZ instruction. */
3100 static bool
3101 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3102 {
3103 if (GET_MODE_SIZE (mode) > 4)
3104 {
3105 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3106 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3107 return 1;
3108 }
3109 else
3110 {
3111 /* Ignore sign extension. */
3112 val &= (HOST_WIDE_INT) 0xffffffff;
3113 }
3114 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3115 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3116 }
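
/* As an illustration (register numbers arbitrary): 0x12340000 can be
   loaded with "movz w0, 0x1234, lsl 16", and for DImode
   0x0000567800000000 with "movz x0, 0x5678, lsl 32", but 0x12345678
   is rejected because its set bits span two 16-bit chunks.  */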
3117
3118
3119 /* Return true if val is a valid bitmask immediate. */
3120 bool
3121 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3122 {
3123 if (GET_MODE_SIZE (mode) < 8)
3124 {
3125 /* Replicate bit pattern. */
3126 val &= (HOST_WIDE_INT) 0xffffffff;
3127 val |= val << 32;
3128 }
3129 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3130 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3131 }
3132
3133
3134 /* Return true if val is an immediate that can be loaded into a
3135 register in a single instruction. */
3136 bool
3137 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3138 {
3139 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3140 return 1;
3141 return aarch64_bitmask_imm (val, mode);
3142 }
3143
3144 static bool
3145 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3146 {
3147 rtx base, offset;
3148
3149 if (GET_CODE (x) == HIGH)
3150 return true;
3151
3152 split_const (x, &base, &offset);
3153 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3154 {
3155 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3156 != SYMBOL_FORCE_TO_MEM)
3157 return true;
3158 else
3159 /* Avoid generating a 64-bit relocation in ILP32; leave it
3160 to aarch64_expand_mov_immediate to handle properly. */
3161 return mode != ptr_mode;
3162 }
3163
3164 return aarch64_tls_referenced_p (x);
3165 }
3166
3167 /* Return true if register REGNO is a valid index register.
3168 STRICT_P is true if REG_OK_STRICT is in effect. */
3169
3170 bool
3171 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3172 {
3173 if (!HARD_REGISTER_NUM_P (regno))
3174 {
3175 if (!strict_p)
3176 return true;
3177
3178 if (!reg_renumber)
3179 return false;
3180
3181 regno = reg_renumber[regno];
3182 }
3183 return GP_REGNUM_P (regno);
3184 }
3185
3186 /* Return true if register REGNO is a valid base register for mode MODE.
3187 STRICT_P is true if REG_OK_STRICT is in effect. */
3188
3189 bool
3190 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3191 {
3192 if (!HARD_REGISTER_NUM_P (regno))
3193 {
3194 if (!strict_p)
3195 return true;
3196
3197 if (!reg_renumber)
3198 return false;
3199
3200 regno = reg_renumber[regno];
3201 }
3202
3203 /* The fake registers will be eliminated to either the stack or
3204 hard frame pointer, both of which are usually valid base registers.
3205 Reload deals with the cases where the eliminated form isn't valid. */
3206 return (GP_REGNUM_P (regno)
3207 || regno == SP_REGNUM
3208 || regno == FRAME_POINTER_REGNUM
3209 || regno == ARG_POINTER_REGNUM);
3210 }
3211
3212 /* Return true if X is a valid base register for mode MODE.
3213 STRICT_P is true if REG_OK_STRICT is in effect. */
3214
3215 static bool
3216 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3217 {
3218 if (!strict_p && GET_CODE (x) == SUBREG)
3219 x = SUBREG_REG (x);
3220
3221 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3222 }
3223
3224 /* Return true if address offset is a valid index. If it is, fill in INFO
3225 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3226
3227 static bool
3228 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3229 machine_mode mode, bool strict_p)
3230 {
3231 enum aarch64_address_type type;
3232 rtx index;
3233 int shift;
3234
3235 /* (reg:P) */
3236 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3237 && GET_MODE (x) == Pmode)
3238 {
3239 type = ADDRESS_REG_REG;
3240 index = x;
3241 shift = 0;
3242 }
3243 /* (sign_extend:DI (reg:SI)) */
3244 else if ((GET_CODE (x) == SIGN_EXTEND
3245 || GET_CODE (x) == ZERO_EXTEND)
3246 && GET_MODE (x) == DImode
3247 && GET_MODE (XEXP (x, 0)) == SImode)
3248 {
3249 type = (GET_CODE (x) == SIGN_EXTEND)
3250 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3251 index = XEXP (x, 0);
3252 shift = 0;
3253 }
3254 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3255 else if (GET_CODE (x) == MULT
3256 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3257 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3258 && GET_MODE (XEXP (x, 0)) == DImode
3259 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3260 && CONST_INT_P (XEXP (x, 1)))
3261 {
3262 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3263 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3264 index = XEXP (XEXP (x, 0), 0);
3265 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3266 }
3267 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3268 else if (GET_CODE (x) == ASHIFT
3269 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3270 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3271 && GET_MODE (XEXP (x, 0)) == DImode
3272 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3273 && CONST_INT_P (XEXP (x, 1)))
3274 {
3275 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3276 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3277 index = XEXP (XEXP (x, 0), 0);
3278 shift = INTVAL (XEXP (x, 1));
3279 }
3280 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3281 else if ((GET_CODE (x) == SIGN_EXTRACT
3282 || GET_CODE (x) == ZERO_EXTRACT)
3283 && GET_MODE (x) == DImode
3284 && GET_CODE (XEXP (x, 0)) == MULT
3285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3286 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3287 {
3288 type = (GET_CODE (x) == SIGN_EXTRACT)
3289 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3290 index = XEXP (XEXP (x, 0), 0);
3291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3292 if (INTVAL (XEXP (x, 1)) != 32 + shift
3293 || INTVAL (XEXP (x, 2)) != 0)
3294 shift = -1;
3295 }
3296 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3297 (const_int 0xffffffff<<shift)) */
3298 else if (GET_CODE (x) == AND
3299 && GET_MODE (x) == DImode
3300 && GET_CODE (XEXP (x, 0)) == MULT
3301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3302 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3303 && CONST_INT_P (XEXP (x, 1)))
3304 {
3305 type = ADDRESS_REG_UXTW;
3306 index = XEXP (XEXP (x, 0), 0);
3307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3308 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3309 shift = -1;
3310 }
3311 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3312 else if ((GET_CODE (x) == SIGN_EXTRACT
3313 || GET_CODE (x) == ZERO_EXTRACT)
3314 && GET_MODE (x) == DImode
3315 && GET_CODE (XEXP (x, 0)) == ASHIFT
3316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3317 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3318 {
3319 type = (GET_CODE (x) == SIGN_EXTRACT)
3320 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3321 index = XEXP (XEXP (x, 0), 0);
3322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3323 if (INTVAL (XEXP (x, 1)) != 32 + shift
3324 || INTVAL (XEXP (x, 2)) != 0)
3325 shift = -1;
3326 }
3327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3328 (const_int 0xffffffff<<shift)) */
3329 else if (GET_CODE (x) == AND
3330 && GET_MODE (x) == DImode
3331 && GET_CODE (XEXP (x, 0)) == ASHIFT
3332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3334 && CONST_INT_P (XEXP (x, 1)))
3335 {
3336 type = ADDRESS_REG_UXTW;
3337 index = XEXP (XEXP (x, 0), 0);
3338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3340 shift = -1;
3341 }
3342 /* (mult:P (reg:P) (const_int scale)) */
3343 else if (GET_CODE (x) == MULT
3344 && GET_MODE (x) == Pmode
3345 && GET_MODE (XEXP (x, 0)) == Pmode
3346 && CONST_INT_P (XEXP (x, 1)))
3347 {
3348 type = ADDRESS_REG_REG;
3349 index = XEXP (x, 0);
3350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3351 }
3352 /* (ashift:P (reg:P) (const_int shift)) */
3353 else if (GET_CODE (x) == ASHIFT
3354 && GET_MODE (x) == Pmode
3355 && GET_MODE (XEXP (x, 0)) == Pmode
3356 && CONST_INT_P (XEXP (x, 1)))
3357 {
3358 type = ADDRESS_REG_REG;
3359 index = XEXP (x, 0);
3360 shift = INTVAL (XEXP (x, 1));
3361 }
3362 else
3363 return false;
3364
3365 if (GET_CODE (index) == SUBREG)
3366 index = SUBREG_REG (index);
3367
3368 if ((shift == 0 ||
3369 (shift > 0 && shift <= 3
3370 && (1 << shift) == GET_MODE_SIZE (mode)))
3371 && REG_P (index)
3372 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3373 {
3374 info->type = type;
3375 info->offset = index;
3376 info->shift = shift;
3377 return true;
3378 }
3379
3380 return false;
3381 }
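
/* As an illustration (register numbers arbitrary): for a DImode
   access, the index (mult:DI (reg:DI x1) (const_int 8)) is
   classified as ADDRESS_REG_REG with shift == 3, matching an address
   such as [x0, x1, lsl 3], while (sign_extend:DI (reg:SI w1)) gives
   ADDRESS_REG_SXTW with shift == 0, i.e. [x0, w1, sxtw].  */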
3382
3383 bool
3384 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3385 {
3386 return (offset >= -64 * GET_MODE_SIZE (mode)
3387 && offset < 64 * GET_MODE_SIZE (mode)
3388 && offset % GET_MODE_SIZE (mode) == 0);
3389 }
3390
3391 static inline bool
3392 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3393 HOST_WIDE_INT offset)
3394 {
3395 return offset >= -256 && offset < 256;
3396 }
3397
3398 static inline bool
3399 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3400 {
3401 return (offset >= 0
3402 && offset < 4096 * GET_MODE_SIZE (mode)
3403 && offset % GET_MODE_SIZE (mode) == 0);
3404 }
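
/* Illustrative ranges for DImode (8-byte) accesses: the three
   predicates above accept, respectively, multiples of 8 in
   [-512, 504], any offset in [-256, 255], and multiples of 8 in
   [0, 32760] -- corresponding to the LDP/STP, LDUR/STUR and
   unsigned-offset LDR/STR addressing forms.  */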
3405
3406 /* Return true if X is a valid address for machine mode MODE. If it is,
3407 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3408 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3409
3410 static bool
3411 aarch64_classify_address (struct aarch64_address_info *info,
3412 rtx x, machine_mode mode,
3413 RTX_CODE outer_code, bool strict_p)
3414 {
3415 enum rtx_code code = GET_CODE (x);
3416 rtx op0, op1;
3417
3418 /* On BE, we use load/store pair for all large int mode load/stores. */
3419 bool load_store_pair_p = (outer_code == PARALLEL
3420 || (BYTES_BIG_ENDIAN
3421 && aarch64_vect_struct_mode_p (mode)));
3422
3423 bool allow_reg_index_p =
3424 !load_store_pair_p
3425 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3426 && !aarch64_vect_struct_mode_p (mode);
3427
3428 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3429 REG addressing. */
3430 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3431 && (code != POST_INC && code != REG))
3432 return false;
3433
3434 switch (code)
3435 {
3436 case REG:
3437 case SUBREG:
3438 info->type = ADDRESS_REG_IMM;
3439 info->base = x;
3440 info->offset = const0_rtx;
3441 return aarch64_base_register_rtx_p (x, strict_p);
3442
3443 case PLUS:
3444 op0 = XEXP (x, 0);
3445 op1 = XEXP (x, 1);
3446
3447 if (! strict_p
3448 && REG_P (op0)
3449 && (op0 == virtual_stack_vars_rtx
3450 || op0 == frame_pointer_rtx
3451 || op0 == arg_pointer_rtx)
3452 && CONST_INT_P (op1))
3453 {
3454 info->type = ADDRESS_REG_IMM;
3455 info->base = op0;
3456 info->offset = op1;
3457
3458 return true;
3459 }
3460
3461 if (GET_MODE_SIZE (mode) != 0
3462 && CONST_INT_P (op1)
3463 && aarch64_base_register_rtx_p (op0, strict_p))
3464 {
3465 HOST_WIDE_INT offset = INTVAL (op1);
3466
3467 info->type = ADDRESS_REG_IMM;
3468 info->base = op0;
3469 info->offset = op1;
3470
3471 /* TImode and TFmode values are allowed in both pairs of X
3472 registers and individual Q registers. The available
3473 address modes are:
3474 X,X: 7-bit signed scaled offset
3475 Q: 9-bit signed offset
3476 We conservatively require an offset representable in either mode.
3477 */
3478 if (mode == TImode || mode == TFmode)
3479 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3480 && offset_9bit_signed_unscaled_p (mode, offset));
3481
3482 /* A 7-bit offset check because OImode will emit an ldp/stp
3483 instruction (only big endian will get here).
3484 For ldp/stp instructions, the offset is scaled for the size of a
3485 single element of the pair. */
3486 if (mode == OImode)
3487 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3488
3489 /* Three 9/12-bit offset checks because CImode will emit three
3490 ldr/str instructions (only big endian will get here). */
3491 if (mode == CImode)
3492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3493 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3494 || offset_12bit_unsigned_scaled_p (V16QImode,
3495 offset + 32)));
3496
3497 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3498 instructions (only big endian will get here). */
3499 if (mode == XImode)
3500 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3501 && aarch64_offset_7bit_signed_scaled_p (TImode,
3502 offset + 32));
3503
3504 if (load_store_pair_p)
3505 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3506 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3507 else
3508 return (offset_9bit_signed_unscaled_p (mode, offset)
3509 || offset_12bit_unsigned_scaled_p (mode, offset));
3510 }
3511
3512 if (allow_reg_index_p)
3513 {
3514 /* Look for base + (scaled/extended) index register. */
3515 if (aarch64_base_register_rtx_p (op0, strict_p)
3516 && aarch64_classify_index (info, op1, mode, strict_p))
3517 {
3518 info->base = op0;
3519 return true;
3520 }
3521 if (aarch64_base_register_rtx_p (op1, strict_p)
3522 && aarch64_classify_index (info, op0, mode, strict_p))
3523 {
3524 info->base = op1;
3525 return true;
3526 }
3527 }
3528
3529 return false;
3530
3531 case POST_INC:
3532 case POST_DEC:
3533 case PRE_INC:
3534 case PRE_DEC:
3535 info->type = ADDRESS_REG_WB;
3536 info->base = XEXP (x, 0);
3537 info->offset = NULL_RTX;
3538 return aarch64_base_register_rtx_p (info->base, strict_p);
3539
3540 case POST_MODIFY:
3541 case PRE_MODIFY:
3542 info->type = ADDRESS_REG_WB;
3543 info->base = XEXP (x, 0);
3544 if (GET_CODE (XEXP (x, 1)) == PLUS
3545 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3546 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3547 && aarch64_base_register_rtx_p (info->base, strict_p))
3548 {
3549 HOST_WIDE_INT offset;
3550 info->offset = XEXP (XEXP (x, 1), 1);
3551 offset = INTVAL (info->offset);
3552
3553 /* TImode and TFmode values are allowed in both pairs of X
3554 registers and individual Q registers. The available
3555 address modes are:
3556 X,X: 7-bit signed scaled offset
3557 Q: 9-bit signed offset
3558 We conservatively require an offset representable in either mode.
3559 */
3560 if (mode == TImode || mode == TFmode)
3561 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3562 && offset_9bit_signed_unscaled_p (mode, offset));
3563
3564 if (load_store_pair_p)
3565 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3566 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3567 else
3568 return offset_9bit_signed_unscaled_p (mode, offset);
3569 }
3570 return false;
3571
3572 case CONST:
3573 case SYMBOL_REF:
3574 case LABEL_REF:
3575 /* load literal: pc-relative constant pool entry. Only supported
3576 for SI mode or larger. */
3577 info->type = ADDRESS_SYMBOLIC;
3578
3579 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3580 {
3581 rtx sym, addend;
3582
3583 split_const (x, &sym, &addend);
3584 return (GET_CODE (sym) == LABEL_REF
3585 || (GET_CODE (sym) == SYMBOL_REF
3586 && CONSTANT_POOL_ADDRESS_P (sym)));
3587 }
3588 return false;
3589
3590 case LO_SUM:
3591 info->type = ADDRESS_LO_SUM;
3592 info->base = XEXP (x, 0);
3593 info->offset = XEXP (x, 1);
3594 if (allow_reg_index_p
3595 && aarch64_base_register_rtx_p (info->base, strict_p))
3596 {
3597 rtx sym, offs;
3598 split_const (info->offset, &sym, &offs);
3599 if (GET_CODE (sym) == SYMBOL_REF
3600 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3601 == SYMBOL_SMALL_ABSOLUTE))
3602 {
3603 /* The symbol and offset must be aligned to the access size. */
3604 unsigned int align;
3605 unsigned int ref_size;
3606
3607 if (CONSTANT_POOL_ADDRESS_P (sym))
3608 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3609 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3610 {
3611 tree exp = SYMBOL_REF_DECL (sym);
3612 align = TYPE_ALIGN (TREE_TYPE (exp));
3613 align = CONSTANT_ALIGNMENT (exp, align);
3614 }
3615 else if (SYMBOL_REF_DECL (sym))
3616 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3617 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3618 && SYMBOL_REF_BLOCK (sym) != NULL)
3619 align = SYMBOL_REF_BLOCK (sym)->alignment;
3620 else
3621 align = BITS_PER_UNIT;
3622
3623 ref_size = GET_MODE_SIZE (mode);
3624 if (ref_size == 0)
3625 ref_size = GET_MODE_SIZE (DImode);
3626
3627 return ((INTVAL (offs) & (ref_size - 1)) == 0
3628 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3629 }
3630 }
3631 return false;
3632
3633 default:
3634 return false;
3635 }
3636 }
3637
3638 bool
3639 aarch64_symbolic_address_p (rtx x)
3640 {
3641 rtx offset;
3642
3643 split_const (x, &x, &offset);
3644 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3645 }
3646
3647 /* Classify the base of symbolic expression X, given that X appears in
3648 context CONTEXT. */
3649
3650 enum aarch64_symbol_type
3651 aarch64_classify_symbolic_expression (rtx x,
3652 enum aarch64_symbol_context context)
3653 {
3654 rtx offset;
3655
3656 split_const (x, &x, &offset);
3657 return aarch64_classify_symbol (x, offset, context);
3658 }
3659
3660
3661 /* Return TRUE if X is a legitimate address for accessing memory in
3662 mode MODE. */
3663 static bool
3664 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3665 {
3666 struct aarch64_address_info addr;
3667
3668 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3669 }
3670
3671 /* Return TRUE if X is a legitimate address for accessing memory in
3672 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3673 pair operation. */
3674 bool
3675 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3676 RTX_CODE outer_code, bool strict_p)
3677 {
3678 struct aarch64_address_info addr;
3679
3680 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3681 }
3682
3683 /* Return TRUE if rtx X is the immediate constant 0.0. */
3684 bool
3685 aarch64_float_const_zero_rtx_p (rtx x)
3686 {
3687 REAL_VALUE_TYPE r;
3688
3689 if (GET_MODE (x) == VOIDmode)
3690 return false;
3691
3692 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3693 if (REAL_VALUE_MINUS_ZERO (r))
3694 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3695 return REAL_VALUES_EQUAL (r, dconst0);
3696 }
3697
3698 /* Return the fixed registers used for condition codes. */
3699
3700 static bool
3701 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3702 {
3703 *p1 = CC_REGNUM;
3704 *p2 = INVALID_REGNUM;
3705 return true;
3706 }
3707
3708 /* Emit call insn with PAT and do aarch64-specific handling. */
3709
3710 void
3711 aarch64_emit_call_insn (rtx pat)
3712 {
3713 rtx insn = emit_call_insn (pat);
3714
3715 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3716 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3717 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3718 }
3719
3720 machine_mode
3721 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3722 {
3723 /* All floating point compares return CCFP if it is an equality
3724 comparison, and CCFPE otherwise. */
3725 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3726 {
3727 switch (code)
3728 {
3729 case EQ:
3730 case NE:
3731 case UNORDERED:
3732 case ORDERED:
3733 case UNLT:
3734 case UNLE:
3735 case UNGT:
3736 case UNGE:
3737 case UNEQ:
3738 case LTGT:
3739 return CCFPmode;
3740
3741 case LT:
3742 case LE:
3743 case GT:
3744 case GE:
3745 return CCFPEmode;
3746
3747 default:
3748 gcc_unreachable ();
3749 }
3750 }
3751
3752 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3753 && y == const0_rtx
3754 && (code == EQ || code == NE || code == LT || code == GE)
3755 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3756 || GET_CODE (x) == NEG))
3757 return CC_NZmode;
3758
3759 /* A compare with a shifted operand. Because of canonicalization,
3760 the comparison will have to be swapped when we emit the assembly
3761 code. */
3762 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3763 && (REG_P (y) || GET_CODE (y) == SUBREG)
3764 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3765 || GET_CODE (x) == LSHIFTRT
3766 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3767 return CC_SWPmode;
3768
3769 /* Similarly for a negated operand, but we can only do this for
3770 equalities. */
3771 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3772 && (REG_P (y) || GET_CODE (y) == SUBREG)
3773 && (code == EQ || code == NE)
3774 && GET_CODE (x) == NEG)
3775 return CC_Zmode;
3776
3777 /* A compare of a mode narrower than SI mode against zero can be done
3778 by extending the value in the comparison. */
3779 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3780 && y == const0_rtx)
3781 /* Only use sign-extension if we really need it. */
3782 return ((code == GT || code == GE || code == LE || code == LT)
3783 ? CC_SESWPmode : CC_ZESWPmode);
3784
3785 /* For everything else, return CCmode. */
3786 return CCmode;
3787 }
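
/* As an illustration: comparing (plus:DI x y) against zero with EQ
   selects CC_NZmode, since an ADDS/SUBS already sets the N and Z
   flags, while a comparison whose first operand is a shifted value,
   e.g. (compare (ashift:DI x (const_int 2)) (reg:DI y)), selects
   CC_SWPmode because the operands must be swapped when the actual
   compare instruction is emitted.  */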
3788
3789 static int
3790 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3791
3792 int
3793 aarch64_get_condition_code (rtx x)
3794 {
3795 machine_mode mode = GET_MODE (XEXP (x, 0));
3796 enum rtx_code comp_code = GET_CODE (x);
3797
3798 if (GET_MODE_CLASS (mode) != MODE_CC)
3799 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3800 return aarch64_get_condition_code_1 (mode, comp_code);
3801 }
3802
3803 static int
3804 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3805 {
3806 int ne = -1, eq = -1;
3807 switch (mode)
3808 {
3809 case CCFPmode:
3810 case CCFPEmode:
3811 switch (comp_code)
3812 {
3813 case GE: return AARCH64_GE;
3814 case GT: return AARCH64_GT;
3815 case LE: return AARCH64_LS;
3816 case LT: return AARCH64_MI;
3817 case NE: return AARCH64_NE;
3818 case EQ: return AARCH64_EQ;
3819 case ORDERED: return AARCH64_VC;
3820 case UNORDERED: return AARCH64_VS;
3821 case UNLT: return AARCH64_LT;
3822 case UNLE: return AARCH64_LE;
3823 case UNGT: return AARCH64_HI;
3824 case UNGE: return AARCH64_PL;
3825 default: return -1;
3826 }
3827 break;
3828
3829 case CC_DNEmode:
3830 ne = AARCH64_NE;
3831 eq = AARCH64_EQ;
3832 break;
3833
3834 case CC_DEQmode:
3835 ne = AARCH64_EQ;
3836 eq = AARCH64_NE;
3837 break;
3838
3839 case CC_DGEmode:
3840 ne = AARCH64_GE;
3841 eq = AARCH64_LT;
3842 break;
3843
3844 case CC_DLTmode:
3845 ne = AARCH64_LT;
3846 eq = AARCH64_GE;
3847 break;
3848
3849 case CC_DGTmode:
3850 ne = AARCH64_GT;
3851 eq = AARCH64_LE;
3852 break;
3853
3854 case CC_DLEmode:
3855 ne = AARCH64_LE;
3856 eq = AARCH64_GT;
3857 break;
3858
3859 case CC_DGEUmode:
3860 ne = AARCH64_CS;
3861 eq = AARCH64_CC;
3862 break;
3863
3864 case CC_DLTUmode:
3865 ne = AARCH64_CC;
3866 eq = AARCH64_CS;
3867 break;
3868
3869 case CC_DGTUmode:
3870 ne = AARCH64_HI;
3871 eq = AARCH64_LS;
3872 break;
3873
3874 case CC_DLEUmode:
3875 ne = AARCH64_LS;
3876 eq = AARCH64_HI;
3877 break;
3878
3879 case CCmode:
3880 switch (comp_code)
3881 {
3882 case NE: return AARCH64_NE;
3883 case EQ: return AARCH64_EQ;
3884 case GE: return AARCH64_GE;
3885 case GT: return AARCH64_GT;
3886 case LE: return AARCH64_LE;
3887 case LT: return AARCH64_LT;
3888 case GEU: return AARCH64_CS;
3889 case GTU: return AARCH64_HI;
3890 case LEU: return AARCH64_LS;
3891 case LTU: return AARCH64_CC;
3892 default: return -1;
3893 }
3894 break;
3895
3896 case CC_SWPmode:
3897 case CC_ZESWPmode:
3898 case CC_SESWPmode:
3899 switch (comp_code)
3900 {
3901 case NE: return AARCH64_NE;
3902 case EQ: return AARCH64_EQ;
3903 case GE: return AARCH64_LE;
3904 case GT: return AARCH64_LT;
3905 case LE: return AARCH64_GE;
3906 case LT: return AARCH64_GT;
3907 case GEU: return AARCH64_LS;
3908 case GTU: return AARCH64_CC;
3909 case LEU: return AARCH64_CS;
3910 case LTU: return AARCH64_HI;
3911 default: return -1;
3912 }
3913 break;
3914
3915 case CC_NZmode:
3916 switch (comp_code)
3917 {
3918 case NE: return AARCH64_NE;
3919 case EQ: return AARCH64_EQ;
3920 case GE: return AARCH64_PL;
3921 case LT: return AARCH64_MI;
3922 default: return -1;
3923 }
3924 break;
3925
3926 case CC_Zmode:
3927 switch (comp_code)
3928 {
3929 case NE: return AARCH64_NE;
3930 case EQ: return AARCH64_EQ;
3931 default: return -1;
3932 }
3933 break;
3934
3935 default:
3936 return -1;
3937 break;
3938 }
3939
3940 if (comp_code == NE)
3941 return ne;
3942
3943 if (comp_code == EQ)
3944 return eq;
3945
3946 return -1;
3947 }
3948
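/* Return true if X is a CONST_VECTOR of integer mode whose elements are all
   equal and lie within [MINVAL, MAXVAL].  */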
3949 bool
3950 aarch64_const_vec_all_same_in_range_p (rtx x,
3951 HOST_WIDE_INT minval,
3952 HOST_WIDE_INT maxval)
3953 {
3954 HOST_WIDE_INT firstval;
3955 int count, i;
3956
3957 if (GET_CODE (x) != CONST_VECTOR
3958 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3959 return false;
3960
3961 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3962 if (firstval < minval || firstval > maxval)
3963 return false;
3964
3965 count = CONST_VECTOR_NUNITS (x);
3966 for (i = 1; i < count; i++)
3967 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3968 return false;
3969
3970 return true;
3971 }
3972
3973 bool
3974 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3975 {
3976 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3977 }
3978
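/* Return the number of bits set in VALUE.  Each iteration of
   value &= value - 1 clears the lowest set bit, so e.g. 0xf00f takes eight
   iterations.  */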
3979 static unsigned
3980 bit_count (unsigned HOST_WIDE_INT value)
3981 {
3982 unsigned count = 0;
3983
3984 while (value)
3985 {
3986 count++;
3987 value &= value - 1;
3988 }
3989
3990 return count;
3991 }
3992
3993 /* N Z C V. */
3994 #define AARCH64_CC_V 1
3995 #define AARCH64_CC_C (1 << 1)
3996 #define AARCH64_CC_Z (1 << 2)
3997 #define AARCH64_CC_N (1 << 3)
3998
3999 /* N Z C V flags for ccmp. The first value of each pair is used when the
4000 combining op is AND, the second when it is IOR. Indexed by AARCH64_COND_CODE. */
4001 static const int aarch64_nzcv_codes[][2] =
4002 {
4003 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4004 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4005 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4006 {0, AARCH64_CC_C}, /* CC, C == 0. */
4007 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4008 {0, AARCH64_CC_N}, /* PL, N == 0. */
4009 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4010 {0, AARCH64_CC_V}, /* VC, V == 0. */
4011 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4012 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4013 {0, AARCH64_CC_V}, /* GE, N == V. */
4014 {AARCH64_CC_V, 0}, /* LT, N != V. */
4015 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4016 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4017 {0, 0}, /* AL, Any. */
4018 {0, 0}, /* NV, Any. */
4019 };
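/* The 'K' and 'k' operand modifiers below print column 0 (the AND entry) and
   column 1 (the IOR entry) of this table respectively, giving the NZCV
   immediate for a conditional compare, i.e. the flag values that take effect
   when the CCMP condition does not hold.  */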
4020
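/* Return the rtx comparison code tested by conditional-compare mode MODE.  */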
4021 int
4022 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4023 {
4024 switch (mode)
4025 {
4026 case CC_DNEmode:
4027 return NE;
4028
4029 case CC_DEQmode:
4030 return EQ;
4031
4032 case CC_DLEmode:
4033 return LE;
4034
4035 case CC_DGTmode:
4036 return GT;
4037
4038 case CC_DLTmode:
4039 return LT;
4040
4041 case CC_DGEmode:
4042 return GE;
4043
4044 case CC_DLEUmode:
4045 return LEU;
4046
4047 case CC_DGTUmode:
4048 return GTU;
4049
4050 case CC_DLTUmode:
4051 return LTU;
4052
4053 case CC_DGEUmode:
4054 return GEU;
4055
4056 default:
4057 gcc_unreachable ();
4058 }
4059 }
4060
4061
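/* Print operand X to file F, applying the operand modifier CODE (the
   character following '%' in an output template).  CODE == 0 prints the
   operand in its default form.  */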
4062 void
4063 aarch64_print_operand (FILE *f, rtx x, char code)
4064 {
4065 switch (code)
4066 {
4067 /* An integer or symbol address without a preceding # sign. */
4068 case 'c':
4069 switch (GET_CODE (x))
4070 {
4071 case CONST_INT:
4072 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4073 break;
4074
4075 case SYMBOL_REF:
4076 output_addr_const (f, x);
4077 break;
4078
4079 case CONST:
4080 if (GET_CODE (XEXP (x, 0)) == PLUS
4081 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4082 {
4083 output_addr_const (f, x);
4084 break;
4085 }
4086 /* Fall through. */
4087
4088 default:
4089 output_operand_lossage ("unsupported operand for code '%c'", code);
4090 }
4091 break;
4092
4093 case 'e':
4094 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4095 {
4096 int n;
4097
4098 if (!CONST_INT_P (x)
4099 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4100 {
4101 output_operand_lossage ("invalid operand for '%%%c'", code);
4102 return;
4103 }
4104
4105 switch (n)
4106 {
4107 case 3:
4108 fputc ('b', f);
4109 break;
4110 case 4:
4111 fputc ('h', f);
4112 break;
4113 case 5:
4114 fputc ('w', f);
4115 break;
4116 default:
4117 output_operand_lossage ("invalid operand for '%%%c'", code);
4118 return;
4119 }
4120 }
4121 break;
4122
4123 case 'p':
4124 {
4125 int n;
4126
4127 /* Print N such that 2^N == X. */
4128 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4129 {
4130 output_operand_lossage ("invalid operand for '%%%c'", code);
4131 return;
4132 }
4133
4134 asm_fprintf (f, "%d", n);
4135 }
4136 break;
4137
4138 case 'P':
4139 /* Print the number of non-zero bits in X (a const_int). */
4140 if (!CONST_INT_P (x))
4141 {
4142 output_operand_lossage ("invalid operand for '%%%c'", code);
4143 return;
4144 }
4145
4146 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4147 break;
4148
4149 case 'H':
4150 /* Print the higher numbered register of a pair (TImode) of regs. */
4151 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4152 {
4153 output_operand_lossage ("invalid operand for '%%%c'", code);
4154 return;
4155 }
4156
4157 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4158 break;
4159
4160 case 'm':
4161 {
4162 int cond_code;
4163 /* Print a condition (eq, ne, etc). */
4164
4165 /* CONST_TRUE_RTX means always -- that's the default. */
4166 if (x == const_true_rtx)
4167 return;
4168
4169 if (!COMPARISON_P (x))
4170 {
4171 output_operand_lossage ("invalid operand for '%%%c'", code);
4172 return;
4173 }
4174
4175 cond_code = aarch64_get_condition_code (x);
4176 gcc_assert (cond_code >= 0);
4177 fputs (aarch64_condition_codes[cond_code], f);
4178 }
4179 break;
4180
4181 case 'M':
4182 {
4183 int cond_code;
4184 /* Print the inverse of a condition (eq <-> ne, etc). */
4185
4186 /* CONST_TRUE_RTX means never -- that's the default. */
4187 if (x == const_true_rtx)
4188 {
4189 fputs ("nv", f);
4190 return;
4191 }
4192
4193 if (!COMPARISON_P (x))
4194 {
4195 output_operand_lossage ("invalid operand for '%%%c'", code);
4196 return;
4197 }
4198 cond_code = aarch64_get_condition_code (x);
4199 gcc_assert (cond_code >= 0);
4200 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4201 (cond_code)], f);
4202 }
4203 break;
4204
4205 case 'b':
4206 case 'h':
4207 case 's':
4208 case 'd':
4209 case 'q':
4210 /* Print a scalar FP/SIMD register name. */
4211 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4212 {
4213 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4214 return;
4215 }
4216 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4217 break;
4218
4219 case 'S':
4220 case 'T':
4221 case 'U':
4222 case 'V':
4223 /* Print the first FP/SIMD register name in a list. */
4224 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4225 {
4226 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4227 return;
4228 }
4229 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4230 break;
4231
4232 case 'R':
4233 /* Print a scalar FP/SIMD register name + 1. */
4234 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4235 {
4236 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4237 return;
4238 }
4239 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4240 break;
4241
4242 case 'X':
4243 /* Print bottom 16 bits of integer constant in hex. */
4244 if (!CONST_INT_P (x))
4245 {
4246 output_operand_lossage ("invalid operand for '%%%c'", code);
4247 return;
4248 }
4249 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4250 break;
4251
4252 case 'w':
4253 case 'x':
4254 /* Print a general register name or the zero register (32-bit or
4255 64-bit). */
4256 if (x == const0_rtx
4257 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4258 {
4259 asm_fprintf (f, "%czr", code);
4260 break;
4261 }
4262
4263 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4264 {
4265 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4266 break;
4267 }
4268
4269 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4270 {
4271 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4272 break;
4273 }
4274
4275 /* Fall through */
4276
4277 case 0:
4278 /* Print a normal operand, if it's a general register, then we
4279 assume DImode. */
4280 if (x == NULL)
4281 {
4282 output_operand_lossage ("missing operand");
4283 return;
4284 }
4285
4286 switch (GET_CODE (x))
4287 {
4288 case REG:
4289 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4290 break;
4291
4292 case MEM:
4293 aarch64_memory_reference_mode = GET_MODE (x);
4294 output_address (XEXP (x, 0));
4295 break;
4296
4297 case LABEL_REF:
4298 case SYMBOL_REF:
4299 output_addr_const (asm_out_file, x);
4300 break;
4301
4302 case CONST_INT:
4303 asm_fprintf (f, "%wd", INTVAL (x));
4304 break;
4305
4306 case CONST_VECTOR:
4307 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4308 {
4309 gcc_assert (
4310 aarch64_const_vec_all_same_in_range_p (x,
4311 HOST_WIDE_INT_MIN,
4312 HOST_WIDE_INT_MAX));
4313 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4314 }
4315 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4316 {
4317 fputc ('0', f);
4318 }
4319 else
4320 gcc_unreachable ();
4321 break;
4322
4323 case CONST_DOUBLE:
4324 /* CONST_DOUBLE can represent a double-width integer.
4325 In this case, the mode of x is VOIDmode. */
4326 if (GET_MODE (x) == VOIDmode)
4327 ; /* Do Nothing. */
4328 else if (aarch64_float_const_zero_rtx_p (x))
4329 {
4330 fputc ('0', f);
4331 break;
4332 }
4333 else if (aarch64_float_const_representable_p (x))
4334 {
4335 #define buf_size 20
4336 char float_buf[buf_size] = {'\0'};
4337 REAL_VALUE_TYPE r;
4338 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4339 real_to_decimal_for_mode (float_buf, &r,
4340 buf_size, buf_size,
4341 1, GET_MODE (x));
4342 asm_fprintf (asm_out_file, "%s", float_buf);
4343 break;
4344 #undef buf_size
4345 }
4346 output_operand_lossage ("invalid constant");
4347 return;
4348 default:
4349 output_operand_lossage ("invalid operand");
4350 return;
4351 }
4352 break;
4353
4354 case 'A':
4355 if (GET_CODE (x) == HIGH)
4356 x = XEXP (x, 0);
4357
4358 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4359 {
4360 case SYMBOL_SMALL_GOT:
4361 asm_fprintf (asm_out_file, ":got:");
4362 break;
4363
4364 case SYMBOL_SMALL_TLSGD:
4365 asm_fprintf (asm_out_file, ":tlsgd:");
4366 break;
4367
4368 case SYMBOL_SMALL_TLSDESC:
4369 asm_fprintf (asm_out_file, ":tlsdesc:");
4370 break;
4371
4372 case SYMBOL_SMALL_GOTTPREL:
4373 asm_fprintf (asm_out_file, ":gottprel:");
4374 break;
4375
4376 case SYMBOL_SMALL_TPREL:
4377 asm_fprintf (asm_out_file, ":tprel:");
4378 break;
4379
4380 case SYMBOL_TINY_GOT:
4381 gcc_unreachable ();
4382 break;
4383
4384 default:
4385 break;
4386 }
4387 output_addr_const (asm_out_file, x);
4388 break;
4389
4390 case 'L':
4391 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4392 {
4393 case SYMBOL_SMALL_GOT:
4394 asm_fprintf (asm_out_file, ":lo12:");
4395 break;
4396
4397 case SYMBOL_SMALL_TLSGD:
4398 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4399 break;
4400
4401 case SYMBOL_SMALL_TLSDESC:
4402 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4403 break;
4404
4405 case SYMBOL_SMALL_GOTTPREL:
4406 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4407 break;
4408
4409 case SYMBOL_SMALL_TPREL:
4410 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4411 break;
4412
4413 case SYMBOL_TINY_GOT:
4414 asm_fprintf (asm_out_file, ":got:");
4415 break;
4416
4417 default:
4418 break;
4419 }
4420 output_addr_const (asm_out_file, x);
4421 break;
4422
4423 case 'G':
4424
4425 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4426 {
4427 case SYMBOL_SMALL_TPREL:
4428 asm_fprintf (asm_out_file, ":tprel_hi12:");
4429 break;
4430 default:
4431 break;
4432 }
4433 output_addr_const (asm_out_file, x);
4434 break;
4435
4436 case 'K':
4437 {
4438 int cond_code;
4439 /* Print nzcv. */
4440
4441 if (!COMPARISON_P (x))
4442 {
4443 output_operand_lossage ("invalid operand for '%%%c'", code);
4444 return;
4445 }
4446
4447 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4448 gcc_assert (cond_code >= 0);
4449 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4450 }
4451 break;
4452
4453 case 'k':
4454 {
4455 int cond_code;
4456 /* Print nzcv. */
4457
4458 if (!COMPARISON_P (x))
4459 {
4460 output_operand_lossage ("invalid operand for '%%%c'", code);
4461 return;
4462 }
4463
4464 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4465 gcc_assert (cond_code >= 0);
4466 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4467 }
4468 break;
4469
4470 default:
4471 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4472 return;
4473 }
4474 }
4475
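/* Print to F the address rtx X of a memory operand.  For the writeback
   forms the access size comes from aarch64_memory_reference_mode, which was
   recorded when the enclosing MEM was printed.  */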
4476 void
4477 aarch64_print_operand_address (FILE *f, rtx x)
4478 {
4479 struct aarch64_address_info addr;
4480
4481 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4482 MEM, true))
4483 switch (addr.type)
4484 {
4485 case ADDRESS_REG_IMM:
4486 if (addr.offset == const0_rtx)
4487 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4488 else
4489 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4490 INTVAL (addr.offset));
4491 return;
4492
4493 case ADDRESS_REG_REG:
4494 if (addr.shift == 0)
4495 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4496 reg_names [REGNO (addr.offset)]);
4497 else
4498 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4499 reg_names [REGNO (addr.offset)], addr.shift);
4500 return;
4501
4502 case ADDRESS_REG_UXTW:
4503 if (addr.shift == 0)
4504 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4505 REGNO (addr.offset) - R0_REGNUM);
4506 else
4507 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4508 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4509 return;
4510
4511 case ADDRESS_REG_SXTW:
4512 if (addr.shift == 0)
4513 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4514 REGNO (addr.offset) - R0_REGNUM);
4515 else
4516 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4517 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4518 return;
4519
4520 case ADDRESS_REG_WB:
4521 switch (GET_CODE (x))
4522 {
4523 case PRE_INC:
4524 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4525 GET_MODE_SIZE (aarch64_memory_reference_mode));
4526 return;
4527 case POST_INC:
4528 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4529 GET_MODE_SIZE (aarch64_memory_reference_mode));
4530 return;
4531 case PRE_DEC:
4532 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4533 GET_MODE_SIZE (aarch64_memory_reference_mode));
4534 return;
4535 case POST_DEC:
4536 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4537 GET_MODE_SIZE (aarch64_memory_reference_mode));
4538 return;
4539 case PRE_MODIFY:
4540 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4541 INTVAL (addr.offset));
4542 return;
4543 case POST_MODIFY:
4544 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4545 INTVAL (addr.offset));
4546 return;
4547 default:
4548 break;
4549 }
4550 break;
4551
4552 case ADDRESS_LO_SUM:
4553 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4554 output_addr_const (f, addr.offset);
4555 asm_fprintf (f, "]");
4556 return;
4557
4558 case ADDRESS_SYMBOLIC:
4559 break;
4560 }
4561
4562 output_addr_const (f, x);
4563 }
4564
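/* Return true if X or any of its sub-rtxes contains a LABEL_REF, ignoring
   the LABEL_REFs inside UNSPEC_TLS, which are really constant offsets.  */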
4565 bool
4566 aarch64_label_mentioned_p (rtx x)
4567 {
4568 const char *fmt;
4569 int i;
4570
4571 if (GET_CODE (x) == LABEL_REF)
4572 return true;
4573
4574 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4575 referencing instruction, but they are constant offsets, not
4576 symbols. */
4577 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4578 return false;
4579
4580 fmt = GET_RTX_FORMAT (GET_CODE (x));
4581 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4582 {
4583 if (fmt[i] == 'E')
4584 {
4585 int j;
4586
4587 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4588 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4589 return 1;
4590 }
4591 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4592 return 1;
4593 }
4594
4595 return 0;
4596 }
4597
4598 /* Implement REGNO_REG_CLASS. */
4599
4600 enum reg_class
4601 aarch64_regno_regclass (unsigned regno)
4602 {
4603 if (GP_REGNUM_P (regno))
4604 return GENERAL_REGS;
4605
4606 if (regno == SP_REGNUM)
4607 return STACK_REG;
4608
4609 if (regno == FRAME_POINTER_REGNUM
4610 || regno == ARG_POINTER_REGNUM)
4611 return POINTER_REGS;
4612
4613 if (FP_REGNUM_P (regno))
4614 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4615
4616 return NO_REGS;
4617 }
4618
4619 static rtx
4620 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4621 {
4622 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4623 where mask is selected by alignment and size of the offset.
4624 We try to pick as large a range for the offset as possible to
4625 maximize the chance of a CSE. However, for aligned addresses
4626 we limit the range to 4k so that structures with different sized
4627 elements are likely to use the same base. */
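/* For example, an SImode access at X + 0x3458 is rewritten below as
   (X + 0x3000) + 0x458, so that other accesses near the same base can reuse
   the X + 0x3000 temporary.  */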
4628
4629 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4630 {
4631 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4632 HOST_WIDE_INT base_offset;
4633
4634 /* Does it look like we'll need a load/store-pair operation? */
4635 if (GET_MODE_SIZE (mode) > 16
4636 || mode == TImode)
4637 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4638 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4639 /* For offsets that aren't a multiple of the access size, the limit is
4640 -256...255. */
4641 else if (offset & (GET_MODE_SIZE (mode) - 1))
4642 base_offset = (offset + 0x100) & ~0x1ff;
4643 else
4644 base_offset = offset & ~0xfff;
4645
4646 if (base_offset == 0)
4647 return x;
4648
4649 offset -= base_offset;
4650 rtx base_reg = gen_reg_rtx (Pmode);
4651 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4652 NULL_RTX);
4653 emit_move_insn (base_reg, val);
4654 x = plus_constant (Pmode, base_reg, offset);
4655 }
4656
4657 return x;
4658 }
4659
4660 /* Try a machine-dependent way of reloading an illegitimate address
4661 operand. If we find one, push the reload and return the new rtx. */
4662
4663 rtx
4664 aarch64_legitimize_reload_address (rtx *x_p,
4665 machine_mode mode,
4666 int opnum, int type,
4667 int ind_levels ATTRIBUTE_UNUSED)
4668 {
4669 rtx x = *x_p;
4670
4671 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4672 if (aarch64_vect_struct_mode_p (mode)
4673 && GET_CODE (x) == PLUS
4674 && REG_P (XEXP (x, 0))
4675 && CONST_INT_P (XEXP (x, 1)))
4676 {
4677 rtx orig_rtx = x;
4678 x = copy_rtx (x);
4679 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4680 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4681 opnum, (enum reload_type) type);
4682 return x;
4683 }
4684
4685 /* We must recognize output that we have already generated ourselves. */
4686 if (GET_CODE (x) == PLUS
4687 && GET_CODE (XEXP (x, 0)) == PLUS
4688 && REG_P (XEXP (XEXP (x, 0), 0))
4689 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4690 && CONST_INT_P (XEXP (x, 1)))
4691 {
4692 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4693 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4694 opnum, (enum reload_type) type);
4695 return x;
4696 }
4697
4698 /* We wish to handle large displacements off a base register by splitting
4699 the addend across an add and the mem insn. This can cut the number of
4700 extra insns needed from 3 to 1. It is only useful for load/store of a
4701 single register with 12 bit offset field. */
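/* For example, an SImode access at base + 0x12344 is split into
   (base + 0x12000) + 0x344: the 0x12000 part is reloaded into a base
   register (a single ADD, since it fits a shifted 12-bit immediate) while
   0x344 stays in the instruction's offset field.  */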
4702 if (GET_CODE (x) == PLUS
4703 && REG_P (XEXP (x, 0))
4704 && CONST_INT_P (XEXP (x, 1))
4705 && HARD_REGISTER_P (XEXP (x, 0))
4706 && mode != TImode
4707 && mode != TFmode
4708 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4709 {
4710 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4711 HOST_WIDE_INT low = val & 0xfff;
4712 HOST_WIDE_INT high = val - low;
4713 HOST_WIDE_INT offs;
4714 rtx cst;
4715 machine_mode xmode = GET_MODE (x);
4716
4717 /* In ILP32, xmode can be either DImode or SImode. */
4718 gcc_assert (xmode == DImode || xmode == SImode);
4719
4720 /* Punt on BLKmode (zero-size) accesses and leave the offset to the generic
4721 reload code, since we cannot ascertain BLKmode alignment. */
4722 if (GET_MODE_SIZE (mode) == 0)
4723 return NULL_RTX;
4724
4725 offs = low % GET_MODE_SIZE (mode);
4726
4727 /* Align misaligned offset by adjusting high part to compensate. */
4728 if (offs != 0)
4729 {
4730 if (aarch64_uimm12_shift (high + offs))
4731 {
4732 /* Align down. */
4733 low = low - offs;
4734 high = high + offs;
4735 }
4736 else
4737 {
4738 /* Align up. */
4739 offs = GET_MODE_SIZE (mode) - offs;
4740 low = low + offs;
4741 high = high + (low & 0x1000) - offs;
4742 low &= 0xfff;
4743 }
4744 }
4745
4746 /* Check for overflow. */
4747 if (high + low != val)
4748 return NULL_RTX;
4749
4750 cst = GEN_INT (high);
4751 if (!aarch64_uimm12_shift (high))
4752 cst = force_const_mem (xmode, cst);
4753
4754 /* Reload high part into base reg, leaving the low part
4755 in the mem instruction.
4756 Note that replacing this gen_rtx_PLUS with plus_constant is
4757 wrong in this case because we rely on the
4758 (plus (plus reg c1) c2) structure being preserved so that
4759 XEXP (*p, 0) in push_reload below uses the correct term. */
4760 x = gen_rtx_PLUS (xmode,
4761 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4762 GEN_INT (low));
4763
4764 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4765 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4766 opnum, (enum reload_type) type);
4767 return x;
4768 }
4769
4770 return NULL_RTX;
4771 }
4772
4773
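/* Return an intermediate register class required to copy X (of mode MODE)
   into or out of a register of class RCLASS, or NO_REGS if none is needed.
   For the Q-register copies handled below no class is returned; instead
   SRI->icode names a reload pattern that provides the required scratch.  */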
4774 static reg_class_t
4775 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4776 reg_class_t rclass,
4777 machine_mode mode,
4778 secondary_reload_info *sri)
4779 {
4780 /* Without the TARGET_SIMD instructions we cannot move a Q register
4781 to a Q register directly. We need a scratch. */
4782 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4783 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4784 && reg_class_subset_p (rclass, FP_REGS))
4785 {
4786 if (mode == TFmode)
4787 sri->icode = CODE_FOR_aarch64_reload_movtf;
4788 else if (mode == TImode)
4789 sri->icode = CODE_FOR_aarch64_reload_movti;
4790 return NO_REGS;
4791 }
4792
4793 /* A TFmode or TImode memory access should be handled via an FP_REGS
4794 because AArch64 has richer addressing modes for LDR/STR instructions
4795 than LDP/STP instructions. */
4796 if (TARGET_FLOAT && rclass == GENERAL_REGS
4797 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4798 return FP_REGS;
4799
4800 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4801 return GENERAL_REGS;
4802
4803 return NO_REGS;
4804 }
4805
4806 static bool
4807 aarch64_can_eliminate (const int from, const int to)
4808 {
4809 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4810 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4811
4812 if (frame_pointer_needed)
4813 {
4814 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4815 return true;
4816 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4817 return false;
4818 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4819 && !cfun->calls_alloca)
4820 return true;
4821 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4822 return true;
4823
4824 return false;
4825 }
4826 else
4827 {
4828 /* If we decided that we didn't need a leaf frame pointer but then used
4829 LR in the function, then we'll want a frame pointer after all, so
4830 prevent this elimination to ensure a frame pointer is used. */
4831 if (to == STACK_POINTER_REGNUM
4832 && flag_omit_leaf_frame_pointer
4833 && df_regs_ever_live_p (LR_REGNUM))
4834 return false;
4835 }
4836
4837 return true;
4838 }
4839
4840 HOST_WIDE_INT
4841 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4842 {
4843 aarch64_layout_frame ();
4844
4845 if (to == HARD_FRAME_POINTER_REGNUM)
4846 {
4847 if (from == ARG_POINTER_REGNUM)
4848 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4849
4850 if (from == FRAME_POINTER_REGNUM)
4851 return (cfun->machine->frame.hard_fp_offset
4852 - cfun->machine->frame.saved_varargs_size);
4853 }
4854
4855 if (to == STACK_POINTER_REGNUM)
4856 {
4857 if (from == FRAME_POINTER_REGNUM)
4858 return (cfun->machine->frame.frame_size
4859 - cfun->machine->frame.saved_varargs_size);
4860 }
4861
4862 return cfun->machine->frame.frame_size;
4863 }
4864
4865 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4866 previous frame. */
4867
4868 rtx
4869 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4870 {
4871 if (count != 0)
4872 return const0_rtx;
4873 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4874 }
4875
4876
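/* The trampoline laid out below is: two literal loads (the target address
   into IP1 and the static chain value into STATIC_CHAIN_REGNUM), a BR through
   IP1, 4 bytes of zero padding so the code portion is tramp_code_sz (16)
   bytes, and then two pointer-sized literal slots which
   aarch64_trampoline_init fills in.  */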
4877 static void
4878 aarch64_asm_trampoline_template (FILE *f)
4879 {
4880 if (TARGET_ILP32)
4881 {
4882 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4883 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4884 }
4885 else
4886 {
4887 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4888 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4889 }
4890 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4891 assemble_aligned_integer (4, const0_rtx);
4892 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4893 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4894 }
4895
4896 static void
4897 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4898 {
4899 rtx fnaddr, mem, a_tramp;
4900 const int tramp_code_sz = 16;
4901
4902 /* Don't need to copy the trailing D-words; we fill those in below. */
4903 emit_block_move (m_tramp, assemble_trampoline_template (),
4904 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4905 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4906 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4907 if (GET_MODE (fnaddr) != ptr_mode)
4908 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4909 emit_move_insn (mem, fnaddr);
4910
4911 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4912 emit_move_insn (mem, chain_value);
4913
4914 /* XXX We should really define a "clear_cache" pattern and use
4915 gen_clear_cache(). */
4916 a_tramp = XEXP (m_tramp, 0);
4917 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4918 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4919 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4920 ptr_mode);
4921 }
4922
4923 static unsigned char
4924 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4925 {
4926 switch (regclass)
4927 {
4928 case CALLER_SAVE_REGS:
4929 case POINTER_REGS:
4930 case GENERAL_REGS:
4931 case ALL_REGS:
4932 case FP_REGS:
4933 case FP_LO_REGS:
4934 return
4935 aarch64_vector_mode_p (mode)
4936 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
4937 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4938 case STACK_REG:
4939 return 1;
4940
4941 case NO_REGS:
4942 return 0;
4943
4944 default:
4945 break;
4946 }
4947 gcc_unreachable ();
4948 }
4949
4950 static reg_class_t
4951 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4952 {
4953 if (regclass == POINTER_REGS)
4954 return GENERAL_REGS;
4955
4956 if (regclass == STACK_REG)
4957 {
4958 if (REG_P (x)
4959 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4960 return regclass;
4961
4962 return NO_REGS;
4963 }
4964
4965 /* If it's an integer immediate that MOVI can't handle, then
4966 FP_REGS is not an option, so we return NO_REGS instead. */
4967 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4968 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4969 return NO_REGS;
4970
4971 /* Register elimination can result in a request for
4972 SP+constant->FP_REGS. We cannot support such operations, which
4973 use SP as source and an FP_REG as destination, so reject such
4974 requests outright. */
4975 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4976 {
4977 rtx lhs = XEXP (x, 0);
4978
4979 /* Look through a possible SUBREG introduced by ILP32. */
4980 if (GET_CODE (lhs) == SUBREG)
4981 lhs = SUBREG_REG (lhs);
4982
4983 gcc_assert (REG_P (lhs));
4984 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4985 POINTER_REGS));
4986 return NO_REGS;
4987 }
4988
4989 return regclass;
4990 }
4991
4992 void
4993 aarch64_asm_output_labelref (FILE* f, const char *name)
4994 {
4995 asm_fprintf (f, "%U%s", name);
4996 }
4997
4998 static void
4999 aarch64_elf_asm_constructor (rtx symbol, int priority)
5000 {
5001 if (priority == DEFAULT_INIT_PRIORITY)
5002 default_ctor_section_asm_out_constructor (symbol, priority);
5003 else
5004 {
5005 section *s;
5006 char buf[18];
5007 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5008 s = get_section (buf, SECTION_WRITE, NULL);
5009 switch_to_section (s);
5010 assemble_align (POINTER_SIZE);
5011 assemble_aligned_integer (POINTER_BYTES, symbol);
5012 }
5013 }
5014
5015 static void
5016 aarch64_elf_asm_destructor (rtx symbol, int priority)
5017 {
5018 if (priority == DEFAULT_INIT_PRIORITY)
5019 default_dtor_section_asm_out_destructor (symbol, priority);
5020 else
5021 {
5022 section *s;
5023 char buf[18];
5024 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5025 s = get_section (buf, SECTION_WRITE, NULL);
5026 switch_to_section (s);
5027 assemble_align (POINTER_SIZE);
5028 assemble_aligned_integer (POINTER_BYTES, symbol);
5029 }
5030 }
5031
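/* Output the dispatch sequence for a casesi jump table: load the table entry
   for index %1 from the table at %0, materialize the address of the label
   that follows the branch into %4, add the entry scaled by 4, and branch to
   the result.  */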
5032 const char*
5033 aarch64_output_casesi (rtx *operands)
5034 {
5035 char buf[100];
5036 char label[100];
5037 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5038 int index;
5039 static const char *const patterns[4][2] =
5040 {
5041 {
5042 "ldrb\t%w3, [%0,%w1,uxtw]",
5043 "add\t%3, %4, %w3, sxtb #2"
5044 },
5045 {
5046 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5047 "add\t%3, %4, %w3, sxth #2"
5048 },
5049 {
5050 "ldr\t%w3, [%0,%w1,uxtw #2]",
5051 "add\t%3, %4, %w3, sxtw #2"
5052 },
5053 /* We assume that DImode is only generated when not optimizing and
5054 that we don't really need 64-bit address offsets. That would
5055 imply an object file with 8GB of code in a single function! */
5056 {
5057 "ldr\t%w3, [%0,%w1,uxtw #2]",
5058 "add\t%3, %4, %w3, sxtw #2"
5059 }
5060 };
5061
5062 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5063
5064 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5065
5066 gcc_assert (index >= 0 && index <= 3);
5067
5068 /* Need to implement table size reduction, by changing the code below. */
5069 output_asm_insn (patterns[index][0], operands);
5070 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5071 snprintf (buf, sizeof (buf),
5072 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5073 output_asm_insn (buf, operands);
5074 output_asm_insn (patterns[index][1], operands);
5075 output_asm_insn ("br\t%3", operands);
5076 assemble_label (asm_out_file, label);
5077 return "";
5078 }
5079
5080
5081 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5082 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5083 operator. */
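/* For example, aarch64_uxt_size (2, 0x3fc) is 8, since 0x3fc == 0xff << 2
   and so matches a UXTB scaled by 4; a mask that is not an 8/16/32-bit
   all-ones value shifted left by 0..3 yields 0.  */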
5084
5085 int
5086 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5087 {
5088 if (shift >= 0 && shift <= 3)
5089 {
5090 int size;
5091 for (size = 8; size <= 32; size *= 2)
5092 {
5093 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5094 if (mask == bits << shift)
5095 return size;
5096 }
5097 }
5098 return 0;
5099 }
5100
5101 static bool
5102 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5103 const_rtx x ATTRIBUTE_UNUSED)
5104 {
5105 /* We can't use blocks for constants when we're using a per-function
5106 constant pool. */
5107 return false;
5108 }
5109
5110 static section *
5111 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5112 rtx x ATTRIBUTE_UNUSED,
5113 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5114 {
5115 /* Force all constant pool entries into the current function section. */
5116 return function_section (current_function_decl);
5117 }
5118
5119
5120 /* Costs. */
5121
5122 /* Helper function for rtx cost calculation. Strip a shift expression
5123 from X. Returns the inner operand if successful, or the original
5124 expression on failure. */
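/* E.g. both (ashift (reg X) (const_int 3)) and the equivalent canonical form
   (mult (reg X) (const_int 8)) strip to (reg X).  */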
5125 static rtx
5126 aarch64_strip_shift (rtx x)
5127 {
5128 rtx op = x;
5129
5130 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5131 we can convert both to ROR during final output. */
5132 if ((GET_CODE (op) == ASHIFT
5133 || GET_CODE (op) == ASHIFTRT
5134 || GET_CODE (op) == LSHIFTRT
5135 || GET_CODE (op) == ROTATERT
5136 || GET_CODE (op) == ROTATE)
5137 && CONST_INT_P (XEXP (op, 1)))
5138 return XEXP (op, 0);
5139
5140 if (GET_CODE (op) == MULT
5141 && CONST_INT_P (XEXP (op, 1))
5142 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5143 return XEXP (op, 0);
5144
5145 return x;
5146 }
5147
5148 /* Helper function for rtx cost calculation. Strip an extend
5149 expression from X. Returns the inner operand if successful, or the
5150 original expression on failure. We deal with a number of possible
5151 canonicalization variations here. */
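/* E.g. (zero_extend (reg X)) strips to (reg X), as does
   (ashift (sign_extend (reg X)) (const_int 2)), since the optional left shift
   by 1..4 is removed as well.  */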
5152 static rtx
5153 aarch64_strip_extend (rtx x)
5154 {
5155 rtx op = x;
5156
5157 /* Zero and sign extraction of a widened value. */
5158 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5159 && XEXP (op, 2) == const0_rtx
5160 && GET_CODE (XEXP (op, 0)) == MULT
5161 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5162 XEXP (op, 1)))
5163 return XEXP (XEXP (op, 0), 0);
5164
5165 /* It can also be represented (for zero-extend) as an AND with an
5166 immediate. */
5167 if (GET_CODE (op) == AND
5168 && GET_CODE (XEXP (op, 0)) == MULT
5169 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5170 && CONST_INT_P (XEXP (op, 1))
5171 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5172 INTVAL (XEXP (op, 1))) != 0)
5173 return XEXP (XEXP (op, 0), 0);
5174
5175 /* Now handle extended register, as this may also have an optional
5176 left shift by 1..4. */
5177 if (GET_CODE (op) == ASHIFT
5178 && CONST_INT_P (XEXP (op, 1))
5179 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5180 op = XEXP (op, 0);
5181
5182 if (GET_CODE (op) == ZERO_EXTEND
5183 || GET_CODE (op) == SIGN_EXTEND)
5184 op = XEXP (op, 0);
5185
5186 if (op != x)
5187 return op;
5188
5189 return x;
5190 }
5191
5192 /* Return true iff CODE is a shift supported in combination
5193 with arithmetic instructions. */
5194
5195 static bool
5196 aarch64_shift_p (enum rtx_code code)
5197 {
5198 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5199 }
5200
5201 /* Helper function for rtx cost calculation. Calculate the cost of
5202 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5203 Return the calculated cost of the expression, recursing manually in to
5204 operands where needed. */
5205
5206 static int
5207 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5208 {
5209 rtx op0, op1;
5210 const struct cpu_cost_table *extra_cost
5211 = aarch64_tune_params->insn_extra_cost;
5212 int cost = 0;
5213 bool compound_p = (outer == PLUS || outer == MINUS);
5214 machine_mode mode = GET_MODE (x);
5215
5216 gcc_checking_assert (code == MULT);
5217
5218 op0 = XEXP (x, 0);
5219 op1 = XEXP (x, 1);
5220
5221 if (VECTOR_MODE_P (mode))
5222 mode = GET_MODE_INNER (mode);
5223
5224 /* Integer multiply/fma. */
5225 if (GET_MODE_CLASS (mode) == MODE_INT)
5226 {
5227 /* The multiply will be canonicalized as a shift, cost it as such. */
5228 if (aarch64_shift_p (GET_CODE (x))
5229 || (CONST_INT_P (op1)
5230 && exact_log2 (INTVAL (op1)) > 0))
5231 {
5232 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5233 || GET_CODE (op0) == SIGN_EXTEND;
5234 if (speed)
5235 {
5236 if (compound_p)
5237 {
5238 if (REG_P (op1))
5239 /* ARITH + shift-by-register. */
5240 cost += extra_cost->alu.arith_shift_reg;
5241 else if (is_extend)
5242 /* ARITH + extended register. We don't have a cost field
5243 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5244 cost += extra_cost->alu.extend_arith;
5245 else
5246 /* ARITH + shift-by-immediate. */
5247 cost += extra_cost->alu.arith_shift;
5248 }
5249 else
5250 /* LSL (immediate). */
5251 cost += extra_cost->alu.shift;
5252
5253 }
5254 /* Strip extends as we will have costed them in the case above. */
5255 if (is_extend)
5256 op0 = aarch64_strip_extend (op0);
5257
5258 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5259
5260 return cost;
5261 }
5262
5263 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5264 compound and let the below cases handle it. After all, MNEG is a
5265 special-case alias of MSUB. */
5266 if (GET_CODE (op0) == NEG)
5267 {
5268 op0 = XEXP (op0, 0);
5269 compound_p = true;
5270 }
5271
5272 /* Integer multiplies or FMAs have zero/sign extending variants. */
5273 if ((GET_CODE (op0) == ZERO_EXTEND
5274 && GET_CODE (op1) == ZERO_EXTEND)
5275 || (GET_CODE (op0) == SIGN_EXTEND
5276 && GET_CODE (op1) == SIGN_EXTEND))
5277 {
5278 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5279 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5280
5281 if (speed)
5282 {
5283 if (compound_p)
5284 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5285 cost += extra_cost->mult[0].extend_add;
5286 else
5287 /* MUL/SMULL/UMULL. */
5288 cost += extra_cost->mult[0].extend;
5289 }
5290
5291 return cost;
5292 }
5293
5294 /* This is either an integer multiply or a MADD. In both cases
5295 we want to recurse and cost the operands. */
5296 cost += rtx_cost (op0, MULT, 0, speed)
5297 + rtx_cost (op1, MULT, 1, speed);
5298
5299 if (speed)
5300 {
5301 if (compound_p)
5302 /* MADD/MSUB. */
5303 cost += extra_cost->mult[mode == DImode].add;
5304 else
5305 /* MUL. */
5306 cost += extra_cost->mult[mode == DImode].simple;
5307 }
5308
5309 return cost;
5310 }
5311 else
5312 {
5313 if (speed)
5314 {
5315 /* Floating-point FMA/FMUL can also support negations of the
5316 operands. */
5317 if (GET_CODE (op0) == NEG)
5318 op0 = XEXP (op0, 0);
5319 if (GET_CODE (op1) == NEG)
5320 op1 = XEXP (op1, 0);
5321
5322 if (compound_p)
5323 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5324 cost += extra_cost->fp[mode == DFmode].fma;
5325 else
5326 /* FMUL/FNMUL. */
5327 cost += extra_cost->fp[mode == DFmode].mult;
5328 }
5329
5330 cost += rtx_cost (op0, MULT, 0, speed)
5331 + rtx_cost (op1, MULT, 1, speed);
5332 return cost;
5333 }
5334 }
5335
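/* Compute the cost of memory address X when used to access MODE, in the
   units of the per-CPU addr_cost tables.  */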
5336 static int
5337 aarch64_address_cost (rtx x,
5338 machine_mode mode,
5339 addr_space_t as ATTRIBUTE_UNUSED,
5340 bool speed)
5341 {
5342 enum rtx_code c = GET_CODE (x);
5343 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5344 struct aarch64_address_info info;
5345 int cost = 0;
5346 info.shift = 0;
5347
5348 if (!aarch64_classify_address (&info, x, mode, c, false))
5349 {
5350 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5351 {
5352 /* This is a CONST or SYMBOL ref which will be split
5353 in a different way depending on the code model in use.
5354 Cost it through the generic infrastructure. */
5355 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5356 /* Divide through by the cost of one instruction to
5357 bring it to the same units as the address costs. */
5358 cost_symbol_ref /= COSTS_N_INSNS (1);
5359 /* The cost is then the cost of preparing the address,
5360 followed by an immediate (possibly 0) offset. */
5361 return cost_symbol_ref + addr_cost->imm_offset;
5362 }
5363 else
5364 {
5365 /* This is most likely a jump table from a case
5366 statement. */
5367 return addr_cost->register_offset;
5368 }
5369 }
5370
5371 switch (info.type)
5372 {
5373 case ADDRESS_LO_SUM:
5374 case ADDRESS_SYMBOLIC:
5375 case ADDRESS_REG_IMM:
5376 cost += addr_cost->imm_offset;
5377 break;
5378
5379 case ADDRESS_REG_WB:
5380 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5381 cost += addr_cost->pre_modify;
5382 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5383 cost += addr_cost->post_modify;
5384 else
5385 gcc_unreachable ();
5386
5387 break;
5388
5389 case ADDRESS_REG_REG:
5390 cost += addr_cost->register_offset;
5391 break;
5392
5393 case ADDRESS_REG_UXTW:
5394 case ADDRESS_REG_SXTW:
5395 cost += addr_cost->register_extend;
5396 break;
5397
5398 default:
5399 gcc_unreachable ();
5400 }
5401
5402
5403 if (info.shift > 0)
5404 {
5405 /* For the sake of calculating the cost of the shifted register
5406 component, we can treat same sized modes in the same way. */
5407 switch (GET_MODE_BITSIZE (mode))
5408 {
5409 case 16:
5410 cost += addr_cost->addr_scale_costs.hi;
5411 break;
5412
5413 case 32:
5414 cost += addr_cost->addr_scale_costs.si;
5415 break;
5416
5417 case 64:
5418 cost += addr_cost->addr_scale_costs.di;
5419 break;
5420
5421 /* We can't tell, or this is a 128-bit vector. */
5422 default:
5423 cost += addr_cost->addr_scale_costs.ti;
5424 break;
5425 }
5426 }
5427
5428 return cost;
5429 }
5430
5431 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5432 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5433 to be taken. */
5434
5435 int
5436 aarch64_branch_cost (bool speed_p, bool predictable_p)
5437 {
5438 /* Use the unpredictable-branch cost only when optimizing for speed and the branch is not predicted to be taken. */
5439 const struct cpu_branch_cost *branch_costs =
5440 aarch64_tune_params->branch_costs;
5441
5442 if (!speed_p || predictable_p)
5443 return branch_costs->predictable;
5444 else
5445 return branch_costs->unpredictable;
5446 }
5447
5448 /* Return true if the RTX X in mode MODE is a zero or sign extract
5449 usable in an ADD or SUB (extended register) instruction. */
5450 static bool
5451 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5452 {
5453 /* Catch add with a sign extract.
5454 This is add_<optab><mode>_multp2. */
5455 if (GET_CODE (x) == SIGN_EXTRACT
5456 || GET_CODE (x) == ZERO_EXTRACT)
5457 {
5458 rtx op0 = XEXP (x, 0);
5459 rtx op1 = XEXP (x, 1);
5460 rtx op2 = XEXP (x, 2);
5461
5462 if (GET_CODE (op0) == MULT
5463 && CONST_INT_P (op1)
5464 && op2 == const0_rtx
5465 && CONST_INT_P (XEXP (op0, 1))
5466 && aarch64_is_extend_from_extract (mode,
5467 XEXP (op0, 1),
5468 op1))
5469 {
5470 return true;
5471 }
5472 }
5473
5474 return false;
5475 }
5476
5477 static bool
5478 aarch64_frint_unspec_p (unsigned int u)
5479 {
5480 switch (u)
5481 {
5482 case UNSPEC_FRINTZ:
5483 case UNSPEC_FRINTP:
5484 case UNSPEC_FRINTM:
5485 case UNSPEC_FRINTA:
5486 case UNSPEC_FRINTN:
5487 case UNSPEC_FRINTX:
5488 case UNSPEC_FRINTI:
5489 return true;
5490
5491 default:
5492 return false;
5493 }
5494 }
5495
5496 /* Return true iff X is an rtx that will match an extr instruction
5497 i.e. as described in the *extr<mode>5_insn family of patterns.
5498 OP0 and OP1 will be set to the operands of the shifts involved
5499 on success and will be NULL_RTX otherwise. */
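/* For example, in DImode
   (ior (ashift (reg A) (const_int 48)) (lshiftrt (reg B) (const_int 16)))
   matches, setting *RES_OP0 to A and *RES_OP1 to B, because 48 + 16 == 64.  */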
5500
5501 static bool
5502 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5503 {
5504 rtx op0, op1;
5505 machine_mode mode = GET_MODE (x);
5506
5507 *res_op0 = NULL_RTX;
5508 *res_op1 = NULL_RTX;
5509
5510 if (GET_CODE (x) != IOR)
5511 return false;
5512
5513 op0 = XEXP (x, 0);
5514 op1 = XEXP (x, 1);
5515
5516 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5517 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5518 {
5519 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5520 if (GET_CODE (op1) == ASHIFT)
5521 std::swap (op0, op1);
5522
5523 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5524 return false;
5525
5526 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5527 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5528
5529 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5530 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5531 {
5532 *res_op0 = XEXP (op0, 0);
5533 *res_op1 = XEXP (op1, 0);
5534 return true;
5535 }
5536 }
5537
5538 return false;
5539 }
5540
5541 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5542 storing it in *COST. Result is true if the total cost of the operation
5543 has now been calculated. */
5544 static bool
5545 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5546 {
5547 rtx inner;
5548 rtx comparator;
5549 enum rtx_code cmpcode;
5550
5551 if (COMPARISON_P (op0))
5552 {
5553 inner = XEXP (op0, 0);
5554 comparator = XEXP (op0, 1);
5555 cmpcode = GET_CODE (op0);
5556 }
5557 else
5558 {
5559 inner = op0;
5560 comparator = const0_rtx;
5561 cmpcode = NE;
5562 }
5563
5564 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5565 {
5566 /* Conditional branch. */
5567 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5568 return true;
5569 else
5570 {
5571 if (cmpcode == NE || cmpcode == EQ)
5572 {
5573 if (comparator == const0_rtx)
5574 {
5575 /* TBZ/TBNZ/CBZ/CBNZ. */
5576 if (GET_CODE (inner) == ZERO_EXTRACT)
5577 /* TBZ/TBNZ. */
5578 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5579 0, speed);
5580 else
5581 /* CBZ/CBNZ. */
5582 *cost += rtx_cost (inner, cmpcode, 0, speed);
5583
5584 return true;
5585 }
5586 }
5587 else if (cmpcode == LT || cmpcode == GE)
5588 {
5589 /* TBZ/TBNZ. */
5590 if (comparator == const0_rtx)
5591 return true;
5592 }
5593 }
5594 }
5595 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5596 {
5597 /* It's a conditional operation based on the status flags,
5598 so it must be some flavor of CSEL. */
5599
5600 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5601 if (GET_CODE (op1) == NEG
5602 || GET_CODE (op1) == NOT
5603 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5604 op1 = XEXP (op1, 0);
5605
5606 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5607 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5608 return true;
5609 }
5610
5611 /* We don't know what this is, cost all operands. */
5612 return false;
5613 }
5614
5615 /* Calculate the cost of calculating X, storing it in *COST. Result
5616 is true if the total cost of the operation has now been calculated. */
5617 static bool
5618 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5619 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5620 {
5621 rtx op0, op1, op2;
5622 const struct cpu_cost_table *extra_cost
5623 = aarch64_tune_params->insn_extra_cost;
5624 machine_mode mode = GET_MODE (x);
5625
5626 /* By default, assume that everything has equivalent cost to the
5627 cheapest instruction. Any additional costs are applied as a delta
5628 above this default. */
5629 *cost = COSTS_N_INSNS (1);
5630
5631 switch (code)
5632 {
5633 case SET:
5634 /* The cost depends entirely on the operands to SET. */
5635 *cost = 0;
5636 op0 = SET_DEST (x);
5637 op1 = SET_SRC (x);
5638
5639 switch (GET_CODE (op0))
5640 {
5641 case MEM:
5642 if (speed)
5643 {
5644 rtx address = XEXP (op0, 0);
5645 if (VECTOR_MODE_P (mode))
5646 *cost += extra_cost->ldst.storev;
5647 else if (GET_MODE_CLASS (mode) == MODE_INT)
5648 *cost += extra_cost->ldst.store;
5649 else if (mode == SFmode)
5650 *cost += extra_cost->ldst.storef;
5651 else if (mode == DFmode)
5652 *cost += extra_cost->ldst.stored;
5653
5654 *cost +=
5655 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5656 0, speed));
5657 }
5658
5659 *cost += rtx_cost (op1, SET, 1, speed);
5660 return true;
5661
5662 case SUBREG:
5663 if (! REG_P (SUBREG_REG (op0)))
5664 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5665
5666 /* Fall through. */
5667 case REG:
5668 /* The cost is one per vector-register copied. */
5669 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5670 {
5671 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5672 / GET_MODE_SIZE (V4SImode);
5673 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5674 }
5675 /* const0_rtx is in general free, but we will use an
5676 instruction to set a register to 0. */
5677 else if (REG_P (op1) || op1 == const0_rtx)
5678 {
5679 /* The cost is 1 per register copied. */
5680 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5681 / UNITS_PER_WORD;
5682 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5683 }
5684 else
5685 /* Cost is just the cost of the RHS of the set. */
5686 *cost += rtx_cost (op1, SET, 1, speed);
5687 return true;
5688
5689 case ZERO_EXTRACT:
5690 case SIGN_EXTRACT:
5691 /* Bit-field insertion. Strip any redundant widening of
5692 the RHS to meet the width of the target. */
5693 if (GET_CODE (op1) == SUBREG)
5694 op1 = SUBREG_REG (op1);
5695 if ((GET_CODE (op1) == ZERO_EXTEND
5696 || GET_CODE (op1) == SIGN_EXTEND)
5697 && CONST_INT_P (XEXP (op0, 1))
5698 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5699 >= INTVAL (XEXP (op0, 1))))
5700 op1 = XEXP (op1, 0);
5701
5702 if (CONST_INT_P (op1))
5703 {
5704 /* MOV immediate is assumed to always be cheap. */
5705 *cost = COSTS_N_INSNS (1);
5706 }
5707 else
5708 {
5709 /* BFM. */
5710 if (speed)
5711 *cost += extra_cost->alu.bfi;
5712 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5713 }
5714
5715 return true;
5716
5717 default:
5718 /* We can't make sense of this, assume default cost. */
5719 *cost = COSTS_N_INSNS (1);
5720 return false;
5721 }
5722 return false;
5723
5724 case CONST_INT:
5725 /* If an instruction can incorporate a constant within the
5726 instruction, the instruction's expression avoids calling
5727 rtx_cost() on the constant. If rtx_cost() is called on a
5728 constant, then it is usually because the constant must be
5729 moved into a register by one or more instructions.
5730
5731 The exception is constant 0, which can be expressed
5732 as XZR/WZR and is therefore free. The caveat is that a
5733 (set (reg) (const0_rtx)) still costs a move, but we catch
5734 that when we cost the SET, so we don't need to consider
5735 it here. */
5736 if (x == const0_rtx)
5737 *cost = 0;
5738 else
5739 {
5740 /* To an approximation, building any other constant is
5741 proportionally expensive to the number of instructions
5742 required to build that constant. This is true whether we
5743 are compiling for SPEED or otherwise. */
5744 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5745 (NULL_RTX, x, false, mode));
5746 }
5747 return true;
5748
5749 case CONST_DOUBLE:
5750 if (speed)
5751 {
5752 /* mov[df,sf]_aarch64. */
5753 if (aarch64_float_const_representable_p (x))
5754 /* FMOV (scalar immediate). */
5755 *cost += extra_cost->fp[mode == DFmode].fpconst;
5756 else if (!aarch64_float_const_zero_rtx_p (x))
5757 {
5758 /* This will be a load from memory. */
5759 if (mode == DFmode)
5760 *cost += extra_cost->ldst.loadd;
5761 else
5762 *cost += extra_cost->ldst.loadf;
5763 }
5764 else
5765 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5766 or MOV v0.s[0], wzr - neither of which are modeled by the
5767 cost tables. Just use the default cost. */
5768 {
5769 }
5770 }
5771
5772 return true;
5773
5774 case MEM:
5775 if (speed)
5776 {
5777 /* For loads we want the base cost of a load, plus an
5778 approximation for the additional cost of the addressing
5779 mode. */
5780 rtx address = XEXP (x, 0);
5781 if (VECTOR_MODE_P (mode))
5782 *cost += extra_cost->ldst.loadv;
5783 else if (GET_MODE_CLASS (mode) == MODE_INT)
5784 *cost += extra_cost->ldst.load;
5785 else if (mode == SFmode)
5786 *cost += extra_cost->ldst.loadf;
5787 else if (mode == DFmode)
5788 *cost += extra_cost->ldst.loadd;
5789
5790 *cost +=
5791 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5792 0, speed));
5793 }
5794
5795 return true;
5796
5797 case NEG:
5798 op0 = XEXP (x, 0);
5799
5800 if (VECTOR_MODE_P (mode))
5801 {
5802 if (speed)
5803 {
5804 /* FNEG. */
5805 *cost += extra_cost->vect.alu;
5806 }
5807 return false;
5808 }
5809
5810 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5811 {
5812 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5813 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5814 {
5815 /* CSETM. */
5816 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5817 return true;
5818 }
5819
5820 /* Cost this as SUB wzr, X. */
5821 op0 = CONST0_RTX (GET_MODE (x));
5822 op1 = XEXP (x, 0);
5823 goto cost_minus;
5824 }
5825
5826 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5827 {
5828 /* Support (neg(fma...)) as a single instruction only if
5829 sign of zeros is unimportant. This matches the decision
5830 making in aarch64.md. */
5831 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5832 {
5833 /* FNMADD. */
5834 *cost = rtx_cost (op0, NEG, 0, speed);
5835 return true;
5836 }
5837 if (speed)
5838 /* FNEG. */
5839 *cost += extra_cost->fp[mode == DFmode].neg;
5840 return false;
5841 }
5842
5843 return false;
5844
5845 case CLRSB:
5846 case CLZ:
5847 if (speed)
5848 {
5849 if (VECTOR_MODE_P (mode))
5850 *cost += extra_cost->vect.alu;
5851 else
5852 *cost += extra_cost->alu.clz;
5853 }
5854
5855 return false;
5856
5857 case COMPARE:
5858 op0 = XEXP (x, 0);
5859 op1 = XEXP (x, 1);
5860
5861 if (op1 == const0_rtx
5862 && GET_CODE (op0) == AND)
5863 {
5864 x = op0;
5865 goto cost_logic;
5866 }
5867
5868 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5869 {
5870 /* TODO: A write to the CC flags possibly costs extra; this
5871 needs encoding in the cost tables. */
5872
5873 /* CC_ZESWPmode supports zero extend for free. */
5874 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5875 op0 = XEXP (op0, 0);
5876
5877 /* ANDS. */
5878 if (GET_CODE (op0) == AND)
5879 {
5880 x = op0;
5881 goto cost_logic;
5882 }
5883
5884 if (GET_CODE (op0) == PLUS)
5885 {
5886 /* ADDS (and CMN alias). */
5887 x = op0;
5888 goto cost_plus;
5889 }
5890
5891 if (GET_CODE (op0) == MINUS)
5892 {
5893 /* SUBS. */
5894 x = op0;
5895 goto cost_minus;
5896 }
5897
5898 if (GET_CODE (op1) == NEG)
5899 {
5900 /* CMN. */
5901 if (speed)
5902 *cost += extra_cost->alu.arith;
5903
5904 *cost += rtx_cost (op0, COMPARE, 0, speed);
5905 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5906 return true;
5907 }
5908
5909 /* CMP.
5910
5911 Compare can freely swap the order of operands, and
5912 canonicalization puts the more complex operation first.
5913 But the integer MINUS logic expects the shift/extend
5914 operation in op1. */
5915 if (! (REG_P (op0)
5916 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5917 {
5918 op0 = XEXP (x, 1);
5919 op1 = XEXP (x, 0);
5920 }
5921 goto cost_minus;
5922 }
5923
5924 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5925 {
5926 /* FCMP. */
5927 if (speed)
5928 *cost += extra_cost->fp[mode == DFmode].compare;
5929
5930 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5931 {
5932 *cost += rtx_cost (op0, COMPARE, 0, speed);
5933 /* FCMP supports constant 0.0 at no extra cost. */
5934 return true;
5935 }
5936 return false;
5937 }
5938
5939 if (VECTOR_MODE_P (mode))
5940 {
5941 /* Vector compare. */
5942 if (speed)
5943 *cost += extra_cost->vect.alu;
5944
5945 if (aarch64_float_const_zero_rtx_p (op1))
5946 {
5947 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 at no extra
5948 cost. */
5949 return true;
5950 }
5951 return false;
5952 }
5953 return false;
5954
5955 case MINUS:
5956 {
5957 op0 = XEXP (x, 0);
5958 op1 = XEXP (x, 1);
5959
5960 cost_minus:
5961 *cost += rtx_cost (op0, MINUS, 0, speed);
5962
5963 /* Detect valid immediates. */
5964 if ((GET_MODE_CLASS (mode) == MODE_INT
5965 || (GET_MODE_CLASS (mode) == MODE_CC
5966 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5967 && CONST_INT_P (op1)
5968 && aarch64_uimm12_shift (INTVAL (op1)))
5969 {
5970 if (speed)
5971 /* SUB(S) (immediate). */
5972 *cost += extra_cost->alu.arith;
5973 return true;
5974 }
5975
5976 /* Look for SUB (extended register). */
5977 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5978 {
5979 if (speed)
5980 *cost += extra_cost->alu.extend_arith;
5981
5982 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5983 (enum rtx_code) GET_CODE (op1),
5984 0, speed);
5985 return true;
5986 }
5987
5988 rtx new_op1 = aarch64_strip_extend (op1);
5989
5990 /* Cost this as an FMA-alike operation. */
5991 if ((GET_CODE (new_op1) == MULT
5992 || aarch64_shift_p (GET_CODE (new_op1)))
5993 && code != COMPARE)
5994 {
5995 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5996 (enum rtx_code) code,
5997 speed);
5998 return true;
5999 }
6000
6001 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6002
6003 if (speed)
6004 {
6005 if (VECTOR_MODE_P (mode))
6006 {
6007 /* Vector SUB. */
6008 *cost += extra_cost->vect.alu;
6009 }
6010 else if (GET_MODE_CLASS (mode) == MODE_INT)
6011 {
6012 /* SUB(S). */
6013 *cost += extra_cost->alu.arith;
6014 }
6015 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6016 {
6017 /* FSUB. */
6018 *cost += extra_cost->fp[mode == DFmode].addsub;
6019 }
6020 }
6021 return true;
6022 }
6023
6024 case PLUS:
6025 {
6026 rtx new_op0;
6027
6028 op0 = XEXP (x, 0);
6029 op1 = XEXP (x, 1);
6030
6031 cost_plus:
6032 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6033 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6034 {
6035 /* CSINC. */
6036 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6037 *cost += rtx_cost (op1, PLUS, 1, speed);
6038 return true;
6039 }
6040
6041 if (GET_MODE_CLASS (mode) == MODE_INT
6042 && CONST_INT_P (op1)
6043 && aarch64_uimm12_shift (INTVAL (op1)))
6044 {
6045 *cost += rtx_cost (op0, PLUS, 0, speed);
6046
6047 if (speed)
6048 /* ADD (immediate). */
6049 *cost += extra_cost->alu.arith;
6050 return true;
6051 }
6052
6053 *cost += rtx_cost (op1, PLUS, 1, speed);
6054
6055 /* Look for ADD (extended register). */
6056 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6057 {
6058 if (speed)
6059 *cost += extra_cost->alu.extend_arith;
6060
6061 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6062 (enum rtx_code) GET_CODE (op0),
6063 0, speed);
6064 return true;
6065 }
6066
6067 /* Strip any extend, leave shifts behind as we will
6068 cost them through mult_cost. */
6069 new_op0 = aarch64_strip_extend (op0);
6070
6071 if (GET_CODE (new_op0) == MULT
6072 || aarch64_shift_p (GET_CODE (new_op0)))
6073 {
6074 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6075 speed);
6076 return true;
6077 }
6078
6079 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6080
6081 if (speed)
6082 {
6083 if (VECTOR_MODE_P (mode))
6084 {
6085 /* Vector ADD. */
6086 *cost += extra_cost->vect.alu;
6087 }
6088 else if (GET_MODE_CLASS (mode) == MODE_INT)
6089 {
6090 /* ADD. */
6091 *cost += extra_cost->alu.arith;
6092 }
6093 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6094 {
6095 /* FADD. */
6096 *cost += extra_cost->fp[mode == DFmode].addsub;
6097 }
6098 }
6099 return true;
6100 }
6101
6102 case BSWAP:
6103 *cost = COSTS_N_INSNS (1);
6104
6105 if (speed)
6106 {
6107 if (VECTOR_MODE_P (mode))
6108 *cost += extra_cost->vect.alu;
6109 else
6110 *cost += extra_cost->alu.rev;
6111 }
6112 return false;
6113
6114 case IOR:
6115 if (aarch_rev16_p (x))
6116 {
6117 *cost = COSTS_N_INSNS (1);
6118
6119 if (speed)
6120 {
6121 if (VECTOR_MODE_P (mode))
6122 *cost += extra_cost->vect.alu;
6123 else
6124 *cost += extra_cost->alu.rev;
6125 }
6126 return true;
6127 }
6128
6129 if (aarch64_extr_rtx_p (x, &op0, &op1))
6130 {
6131 *cost += rtx_cost (op0, IOR, 0, speed)
6132 + rtx_cost (op1, IOR, 1, speed);
6133 if (speed)
6134 *cost += extra_cost->alu.shift;
6135
6136 return true;
6137 }
6138 /* Fall through. */
6139 case XOR:
6140 case AND:
6141 cost_logic:
6142 op0 = XEXP (x, 0);
6143 op1 = XEXP (x, 1);
6144
6145 if (VECTOR_MODE_P (mode))
6146 {
6147 if (speed)
6148 *cost += extra_cost->vect.alu;
6149 return true;
6150 }
6151
6152 if (code == AND
6153 && GET_CODE (op0) == MULT
6154 && CONST_INT_P (XEXP (op0, 1))
6155 && CONST_INT_P (op1)
6156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6157 INTVAL (op1)) != 0)
6158 {
6159 /* This is a UBFM/SBFM. */
6160 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6161 if (speed)
6162 *cost += extra_cost->alu.bfx;
6163 return true;
6164 }
6165
6166 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6167 {
6168 /* We possibly get the immediate for free; this is not
6169 modelled. */
6170 if (CONST_INT_P (op1)
6171 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6172 {
6173 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6174
6175 if (speed)
6176 *cost += extra_cost->alu.logical;
6177
6178 return true;
6179 }
6180 else
6181 {
6182 rtx new_op0 = op0;
6183
6184 /* Handle ORN, EON, or BIC. */
6185 if (GET_CODE (op0) == NOT)
6186 op0 = XEXP (op0, 0);
6187
6188 new_op0 = aarch64_strip_shift (op0);
6189
6190 /* If we had a shift on op0 then this is a logical-shift-
6191 by-register/immediate operation. Otherwise, this is just
6192 a logical operation. */
6193 if (speed)
6194 {
6195 if (new_op0 != op0)
6196 {
6197 /* Shift by immediate. */
6198 if (CONST_INT_P (XEXP (op0, 1)))
6199 *cost += extra_cost->alu.log_shift;
6200 else
6201 *cost += extra_cost->alu.log_shift_reg;
6202 }
6203 else
6204 *cost += extra_cost->alu.logical;
6205 }
6206
6207 /* In both cases we want to cost both operands. */
6208 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6209 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6210
6211 return true;
6212 }
6213 }
6214 return false;
6215
6216 case NOT:
6217 x = XEXP (x, 0);
6218 op0 = aarch64_strip_shift (x);
6219
6220 if (VECTOR_MODE_P (mode))
6221 {
6222 /* Vector NOT. */
6223 *cost += extra_cost->vect.alu;
6224 return false;
6225 }
6226
6227 /* MVN-shifted-reg. */
6228 if (op0 != x)
6229 {
6230 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6231
6232 if (speed)
6233 *cost += extra_cost->alu.log_shift;
6234
6235 return true;
6236 }
6237 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6238 Handle the second form here taking care that 'a' in the above can
6239 be a shift. */
6240 else if (GET_CODE (op0) == XOR)
6241 {
6242 rtx newop0 = XEXP (op0, 0);
6243 rtx newop1 = XEXP (op0, 1);
6244 rtx op0_stripped = aarch64_strip_shift (newop0);
6245
6246 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6247 + rtx_cost (op0_stripped, XOR, 0, speed);
6248
6249 if (speed)
6250 {
6251 if (op0_stripped != newop0)
6252 *cost += extra_cost->alu.log_shift;
6253 else
6254 *cost += extra_cost->alu.logical;
6255 }
6256
6257 return true;
6258 }
6259 /* MVN. */
6260 if (speed)
6261 *cost += extra_cost->alu.logical;
6262
6263 return false;
6264
6265 case ZERO_EXTEND:
6266
6267 op0 = XEXP (x, 0);
6268 /* If a value is written in SI mode, then zero extended to DI
6269 mode, the operation will in general be free as a write to
6270 a 'w' register implicitly zeroes the upper bits of an 'x'
6271 register. However, if this is
6272
6273 (set (reg) (zero_extend (reg)))
6274
6275 we must cost the explicit register move. */
6276 if (mode == DImode
6277 && GET_MODE (op0) == SImode
6278 && outer == SET)
6279 {
6280 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6281
6282 if (!op_cost && speed)
6283 /* MOV. */
6284 *cost += extra_cost->alu.extend;
6285 else
6286 /* Free; the cost is that of the SI mode operation. */
6287 *cost = op_cost;
6288
6289 return true;
6290 }
6291 else if (MEM_P (XEXP (x, 0)))
6292 {
6293 /* All loads can zero extend to any size for free. */
6294 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6295 return true;
6296 }
6297
6298 if (speed)
6299 {
6300 if (VECTOR_MODE_P (mode))
6301 {
6302 /* UMOV. */
6303 *cost += extra_cost->vect.alu;
6304 }
6305 else
6306 {
6307 /* UXTB/UXTH. */
6308 *cost += extra_cost->alu.extend;
6309 }
6310 }
6311 return false;
6312
6313 case SIGN_EXTEND:
6314 if (MEM_P (XEXP (x, 0)))
6315 {
6316 /* LDRSH. */
6317 if (speed)
6318 {
6319 rtx address = XEXP (XEXP (x, 0), 0);
6320 *cost += extra_cost->ldst.load_sign_extend;
6321
6322 *cost +=
6323 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6324 0, speed));
6325 }
6326 return true;
6327 }
6328
6329 if (speed)
6330 {
6331 if (VECTOR_MODE_P (mode))
6332 *cost += extra_cost->vect.alu;
6333 else
6334 *cost += extra_cost->alu.extend;
6335 }
6336 return false;
6337
6338 case ASHIFT:
6339 op0 = XEXP (x, 0);
6340 op1 = XEXP (x, 1);
6341
6342 if (CONST_INT_P (op1))
6343 {
6344 if (speed)
6345 {
6346 if (VECTOR_MODE_P (mode))
6347 {
6348 /* Vector shift (immediate). */
6349 *cost += extra_cost->vect.alu;
6350 }
6351 else
6352 {
6353 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6354 aliases. */
6355 *cost += extra_cost->alu.shift;
6356 }
6357 }
6358
6359 /* We can incorporate zero/sign extend for free. */
6360 if (GET_CODE (op0) == ZERO_EXTEND
6361 || GET_CODE (op0) == SIGN_EXTEND)
6362 op0 = XEXP (op0, 0);
6363
6364 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6365 return true;
6366 }
6367 else
6368 {
6369 if (speed)
6370 {
6371 if (VECTOR_MODE_P (mode))
6372 {
6373 /* Vector shift (register). */
6374 *cost += extra_cost->vect.alu;
6375 }
6376 else
6377 {
6378 /* LSLV. */
6379 *cost += extra_cost->alu.shift_reg;
6380 }
6381 }
6382 return false; /* All arguments need to be in registers. */
6383 }
6384
6385 case ROTATE:
6386 case ROTATERT:
6387 case LSHIFTRT:
6388 case ASHIFTRT:
6389 op0 = XEXP (x, 0);
6390 op1 = XEXP (x, 1);
6391
6392 if (CONST_INT_P (op1))
6393 {
6394 /* ASR (immediate) and friends. */
6395 if (speed)
6396 {
6397 if (VECTOR_MODE_P (mode))
6398 *cost += extra_cost->vect.alu;
6399 else
6400 *cost += extra_cost->alu.shift;
6401 }
6402
6403 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6404 return true;
6405 }
6406 else
6407 {
6408
6409 /* ASR (register) and friends. */
6410 if (speed)
6411 {
6412 if (VECTOR_MODE_P (mode))
6413 *cost += extra_cost->vect.alu;
6414 else
6415 *cost += extra_cost->alu.shift_reg;
6416 }
6417 return false; /* All arguments need to be in registers. */
6418 }
6419
6420 case SYMBOL_REF:
6421
6422 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6423 {
6424 /* LDR. */
6425 if (speed)
6426 *cost += extra_cost->ldst.load;
6427 }
6428 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6429 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6430 {
6431 /* ADRP, followed by ADD. */
6432 *cost += COSTS_N_INSNS (1);
6433 if (speed)
6434 *cost += 2 * extra_cost->alu.arith;
6435 }
6436 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6437 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6438 {
6439 /* ADR. */
6440 if (speed)
6441 *cost += extra_cost->alu.arith;
6442 }
6443
6444 if (flag_pic)
6445 {
6446 /* One extra load instruction, after accessing the GOT. */
6447 *cost += COSTS_N_INSNS (1);
6448 if (speed)
6449 *cost += extra_cost->ldst.load;
6450 }
6451 return true;
6452
6453 case HIGH:
6454 case LO_SUM:
6455 /* ADRP/ADD (immediate). */
6456 if (speed)
6457 *cost += extra_cost->alu.arith;
6458 return true;
6459
6460 case ZERO_EXTRACT:
6461 case SIGN_EXTRACT:
6462 /* UBFX/SBFX. */
6463 if (speed)
6464 {
6465 if (VECTOR_MODE_P (mode))
6466 *cost += extra_cost->vect.alu;
6467 else
6468 *cost += extra_cost->alu.bfx;
6469 }
6470
6471 /* We can trust that the immediates used will be correct (there
6472 are no by-register forms), so we need only cost op0. */
6473 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6474 return true;
6475
6476 case MULT:
6477 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6478 /* aarch64_rtx_mult_cost always handles recursion to its
6479 operands. */
6480 return true;
6481
6482 case MOD:
6483 case UMOD:
6484 if (speed)
6485 {
6486 if (VECTOR_MODE_P (mode))
6487 *cost += extra_cost->vect.alu;
6488 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6489 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6490 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6491 else if (GET_MODE (x) == DFmode)
6492 *cost += (extra_cost->fp[1].mult
6493 + extra_cost->fp[1].div);
6494 else if (GET_MODE (x) == SFmode)
6495 *cost += (extra_cost->fp[0].mult
6496 + extra_cost->fp[0].div);
6497 }
6498 return false; /* All arguments need to be in registers. */
6499
6500 case DIV:
6501 case UDIV:
6502 case SQRT:
6503 if (speed)
6504 {
6505 if (VECTOR_MODE_P (mode))
6506 *cost += extra_cost->vect.alu;
6507 else if (GET_MODE_CLASS (mode) == MODE_INT)
6508 /* There is no integer SQRT, so only DIV and UDIV can get
6509 here. */
6510 *cost += extra_cost->mult[mode == DImode].idiv;
6511 else
6512 *cost += extra_cost->fp[mode == DFmode].div;
6513 }
6514 return false; /* All arguments need to be in registers. */
6515
6516 case IF_THEN_ELSE:
6517 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6518 XEXP (x, 2), cost, speed);
6519
6520 case EQ:
6521 case NE:
6522 case GT:
6523 case GTU:
6524 case LT:
6525 case LTU:
6526 case GE:
6527 case GEU:
6528 case LE:
6529 case LEU:
6530
6531 return false; /* All arguments must be in registers. */
6532
6533 case FMA:
6534 op0 = XEXP (x, 0);
6535 op1 = XEXP (x, 1);
6536 op2 = XEXP (x, 2);
6537
6538 if (speed)
6539 {
6540 if (VECTOR_MODE_P (mode))
6541 *cost += extra_cost->vect.alu;
6542 else
6543 *cost += extra_cost->fp[mode == DFmode].fma;
6544 }
6545
6546 /* FMSUB, FNMADD, and FNMSUB are free. */
6547 if (GET_CODE (op0) == NEG)
6548 op0 = XEXP (op0, 0);
6549
6550 if (GET_CODE (op2) == NEG)
6551 op2 = XEXP (op2, 0);
6552
6553 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6554 and the by-element operand as operand 0. */
6555 if (GET_CODE (op1) == NEG)
6556 op1 = XEXP (op1, 0);
6557
6558 /* Catch vector-by-element operations. The by-element operand can
6559 either be (vec_duplicate (vec_select (x))) or just
6560 (vec_select (x)), depending on whether we are multiplying by
6561 a vector or a scalar.
6562
6563 Canonicalization is not very good in these cases, FMA4 will put the
6564 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6565 if (GET_CODE (op0) == VEC_DUPLICATE)
6566 op0 = XEXP (op0, 0);
6567 else if (GET_CODE (op1) == VEC_DUPLICATE)
6568 op1 = XEXP (op1, 0);
6569
6570 if (GET_CODE (op0) == VEC_SELECT)
6571 op0 = XEXP (op0, 0);
6572 else if (GET_CODE (op1) == VEC_SELECT)
6573 op1 = XEXP (op1, 0);
6574
6575 /* If the remaining parameters are not registers,
6576 get the cost to put them into registers. */
6577 *cost += rtx_cost (op0, FMA, 0, speed);
6578 *cost += rtx_cost (op1, FMA, 1, speed);
6579 *cost += rtx_cost (op2, FMA, 2, speed);
6580 return true;
6581
6582 case FLOAT:
6583 case UNSIGNED_FLOAT:
6584 if (speed)
6585 *cost += extra_cost->fp[mode == DFmode].fromint;
6586 return false;
6587
6588 case FLOAT_EXTEND:
6589 if (speed)
6590 {
6591 if (VECTOR_MODE_P (mode))
6592 {
6593 /* Vector widening conversion. */
6594 *cost += extra_cost->vect.alu;
6595 }
6596 else
6597 *cost += extra_cost->fp[mode == DFmode].widen;
6598 }
6599 return false;
6600
6601 case FLOAT_TRUNCATE:
6602 if (speed)
6603 {
6604 if (VECTOR_MODE_P (mode))
6605 {
6606 /* Vector narrowing conversion. */
6607 *cost += extra_cost->vect.alu;
6608 }
6609 else
6610 *cost += extra_cost->fp[mode == DFmode].narrow;
6611 }
6612 return false;
6613
6614 case FIX:
6615 case UNSIGNED_FIX:
6616 x = XEXP (x, 0);
6617 /* Strip the rounding part. They will all be implemented
6618 by the fcvt* family of instructions anyway. */
6619 if (GET_CODE (x) == UNSPEC)
6620 {
6621 unsigned int uns_code = XINT (x, 1);
6622
6623 if (uns_code == UNSPEC_FRINTA
6624 || uns_code == UNSPEC_FRINTM
6625 || uns_code == UNSPEC_FRINTN
6626 || uns_code == UNSPEC_FRINTP
6627 || uns_code == UNSPEC_FRINTZ)
6628 x = XVECEXP (x, 0, 0);
6629 }
6630
6631 if (speed)
6632 {
6633 if (VECTOR_MODE_P (mode))
6634 *cost += extra_cost->vect.alu;
6635 else
6636 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6637 }
6638 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6639 return true;
6640
6641 case ABS:
6642 if (VECTOR_MODE_P (mode))
6643 {
6644 /* ABS (vector). */
6645 if (speed)
6646 *cost += extra_cost->vect.alu;
6647 }
6648 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6649 {
6650 op0 = XEXP (x, 0);
6651
6652 /* FABD, which is analogous to FADD. */
6653 if (GET_CODE (op0) == MINUS)
6654 {
6655 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6656 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6657 if (speed)
6658 *cost += extra_cost->fp[mode == DFmode].addsub;
6659
6660 return true;
6661 }
6662 /* Simple FABS is analogous to FNEG. */
6663 if (speed)
6664 *cost += extra_cost->fp[mode == DFmode].neg;
6665 }
6666 else
6667 {
6668 /* Integer ABS will either be split to
6669 two arithmetic instructions, or will be an ABS
6670 (scalar), which we don't model. */
6671 *cost = COSTS_N_INSNS (2);
6672 if (speed)
6673 *cost += 2 * extra_cost->alu.arith;
6674 }
6675 return false;
6676
6677 case SMAX:
6678 case SMIN:
6679 if (speed)
6680 {
6681 if (VECTOR_MODE_P (mode))
6682 *cost += extra_cost->vect.alu;
6683 else
6684 {
6685 /* FMAXNM/FMINNM/FMAX/FMIN.
6686 TODO: This may not be accurate for all implementations, but
6687 we do not model this in the cost tables. */
6688 *cost += extra_cost->fp[mode == DFmode].addsub;
6689 }
6690 }
6691 return false;
6692
6693 case UNSPEC:
6694 /* The floating point round to integer frint* instructions. */
6695 if (aarch64_frint_unspec_p (XINT (x, 1)))
6696 {
6697 if (speed)
6698 *cost += extra_cost->fp[mode == DFmode].roundint;
6699
6700 return false;
6701 }
6702
6703 if (XINT (x, 1) == UNSPEC_RBIT)
6704 {
6705 if (speed)
6706 *cost += extra_cost->alu.rev;
6707
6708 return false;
6709 }
6710 break;
6711
6712 case TRUNCATE:
6713
6714 /* Decompose <su>muldi3_highpart. */
6715 if (/* (truncate:DI */
6716 mode == DImode
6717 /* (lshiftrt:TI */
6718 && GET_MODE (XEXP (x, 0)) == TImode
6719 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6720 /* (mult:TI */
6721 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6722 /* (ANY_EXTEND:TI (reg:DI))
6723 (ANY_EXTEND:TI (reg:DI))) */
6724 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6725 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6726 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6727 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6728 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6729 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6730 /* (const_int 64) */
6731 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6732 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6733 {
6734 /* UMULH/SMULH. */
6735 if (speed)
6736 *cost += extra_cost->mult[mode == DImode].extend;
6737 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6738 MULT, 0, speed);
6739 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6740 MULT, 1, speed);
6741 return true;
6742 }
6743
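/* For reference, this RTL shape typically arises from a C-level high-part
   multiply such as the following (illustrative source, not from this file):

     long long
     mulh (long long a, long long b)
     {
       return (long long) (((__int128) a * b) >> 64);
     }

   which expands through the <su>muldi3_highpart patterns.  */
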
6744 /* Fall through. */
6745 default:
6746 break;
6747 }
6748
6749 if (dump_file && (dump_flags & TDF_DETAILS))
6750 fprintf (dump_file,
6751 "\nFailed to cost RTX. Assuming default cost.\n");
6752
6753 return true;
6754 }
6755
6756 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6757 calculated for X. This cost is stored in *COST. Returns true
6758 if the total cost of X was calculated. */
6759 static bool
6760 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6761 int param, int *cost, bool speed)
6762 {
6763 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6764
6765 if (dump_file && (dump_flags & TDF_DETAILS))
6766 {
6767 print_rtl_single (dump_file, x);
6768 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6769 speed ? "Hot" : "Cold",
6770 *cost, result ? "final" : "partial");
6771 }
6772
6773 return result;
6774 }
6775
6776 static int
6777 aarch64_register_move_cost (machine_mode mode,
6778 reg_class_t from_i, reg_class_t to_i)
6779 {
6780 enum reg_class from = (enum reg_class) from_i;
6781 enum reg_class to = (enum reg_class) to_i;
6782 const struct cpu_regmove_cost *regmove_cost
6783 = aarch64_tune_params->regmove_cost;
6784
6785 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6786 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6787 to = GENERAL_REGS;
6788
6789 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6790 from = GENERAL_REGS;
6791
6792 /* Moving between GPR and stack cost is the same as GP2GP. */
6793 if ((from == GENERAL_REGS && to == STACK_REG)
6794 || (to == GENERAL_REGS && from == STACK_REG))
6795 return regmove_cost->GP2GP;
6796
6797 /* To/From the stack register, we move via the gprs. */
6798 if (to == STACK_REG || from == STACK_REG)
6799 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6800 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6801
6802 if (GET_MODE_SIZE (mode) == 16)
6803 {
6804 /* 128-bit operations on general registers require 2 instructions. */
6805 if (from == GENERAL_REGS && to == GENERAL_REGS)
6806 return regmove_cost->GP2GP * 2;
6807 else if (from == GENERAL_REGS)
6808 return regmove_cost->GP2FP * 2;
6809 else if (to == GENERAL_REGS)
6810 return regmove_cost->FP2GP * 2;
6811
6812 /* When AdvSIMD instructions are disabled it is not possible to move
6813 a 128-bit value directly between Q registers. This is handled in
6814 secondary reload. A general register is used as a scratch to move
6815 the upper DI value and the lower DI value is moved directly,
6816 hence the cost is the sum of three moves. */
6817 if (! TARGET_SIMD)
6818 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6819
6820 return regmove_cost->FP2FP;
6821 }
6822
6823 if (from == GENERAL_REGS && to == GENERAL_REGS)
6824 return regmove_cost->GP2GP;
6825 else if (from == GENERAL_REGS)
6826 return regmove_cost->GP2FP;
6827 else if (to == GENERAL_REGS)
6828 return regmove_cost->FP2GP;
6829
6830 return regmove_cost->FP2FP;
6831 }
6832
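/* Worked example of the hook above (illustrative): for a 128-bit (TImode)
   move between FP/SIMD registers with !TARGET_SIMD, the returned cost is
   GP2FP + FP2GP + FP2FP, matching the scratch-GPR sequence described in the
   comment; with TARGET_SIMD it is simply FP2FP, and a 128-bit GP<->GP move
   costs 2 * GP2GP.  */
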
6833 static int
6834 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6835 reg_class_t rclass ATTRIBUTE_UNUSED,
6836 bool in ATTRIBUTE_UNUSED)
6837 {
6838 return aarch64_tune_params->memmov_cost;
6839 }
6840
6841 /* Return the number of instructions that can be issued per cycle. */
6842 static int
6843 aarch64_sched_issue_rate (void)
6844 {
6845 return aarch64_tune_params->issue_rate;
6846 }
6847
6848 static int
6849 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6850 {
6851 int issue_rate = aarch64_sched_issue_rate ();
6852
6853 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6854 }
6855
6856 /* Vectorizer cost model target hooks. */
6857
6858 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6859 static int
6860 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6861 tree vectype,
6862 int misalign ATTRIBUTE_UNUSED)
6863 {
6864 unsigned elements;
6865
6866 switch (type_of_cost)
6867 {
6868 case scalar_stmt:
6869 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6870
6871 case scalar_load:
6872 return aarch64_tune_params->vec_costs->scalar_load_cost;
6873
6874 case scalar_store:
6875 return aarch64_tune_params->vec_costs->scalar_store_cost;
6876
6877 case vector_stmt:
6878 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6879
6880 case vector_load:
6881 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6882
6883 case vector_store:
6884 return aarch64_tune_params->vec_costs->vec_store_cost;
6885
6886 case vec_to_scalar:
6887 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6888
6889 case scalar_to_vec:
6890 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6891
6892 case unaligned_load:
6893 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6894
6895 case unaligned_store:
6896 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6897
6898 case cond_branch_taken:
6899 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6900
6901 case cond_branch_not_taken:
6902 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6903
6904 case vec_perm:
6905 case vec_promote_demote:
6906 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6907
6908 case vec_construct:
6909 elements = TYPE_VECTOR_SUBPARTS (vectype);
6910 return elements / 2 + 1;
6911
6912 default:
6913 gcc_unreachable ();
6914 }
6915 }
6916
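/* Worked example (illustrative): for a vec_construct of a four-element
   vector type, TYPE_VECTOR_SUBPARTS is 4, so the hook above returns
   4 / 2 + 1 = 3; all other entries simply come from the tuning table.  */
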
6917 /* Implement targetm.vectorize.add_stmt_cost. */
6918 static unsigned
6919 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6920 struct _stmt_vec_info *stmt_info, int misalign,
6921 enum vect_cost_model_location where)
6922 {
6923 unsigned *cost = (unsigned *) data;
6924 unsigned retval = 0;
6925
6926 if (flag_vect_cost_model)
6927 {
6928 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6929 int stmt_cost =
6930 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6931
6932 /* Statements in an inner loop relative to the loop being
6933 vectorized are weighted more heavily. The value here is
6934 a function (linear for now) of the loop nest level. */
6935 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6936 {
6937 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6938 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6939 unsigned nest_level = loop_depth (loop);
6940
6941 count *= nest_level;
6942 }
6943
6944 retval = (unsigned) (count * stmt_cost);
6945 cost[where] += retval;
6946 }
6947
6948 return retval;
6949 }
6950
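/* Worked example (illustrative): with count = 2, a per-statement cost of 3
   from aarch64_builtin_vectorization_cost, and the statement sitting in a
   loop of depth 2 inside the loop being vectorized, count is scaled to 4
   and 4 * 3 = 12 is added to cost[vect_body].  */
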
6951 static void initialize_aarch64_code_model (void);
6952
6953 /* Parse the architecture extension string. */
6954
6955 static void
6956 aarch64_parse_extension (char *str)
6957 {
6958 /* The extension string is parsed left to right. */
6959 const struct aarch64_option_extension *opt = NULL;
6960
6961 /* Flag to say whether we are adding or removing an extension. */
6962 int adding_ext = -1;
6963
6964 while (str != NULL && *str != 0)
6965 {
6966 char *ext;
6967 size_t len;
6968
6969 str++;
6970 ext = strchr (str, '+');
6971
6972 if (ext != NULL)
6973 len = ext - str;
6974 else
6975 len = strlen (str);
6976
6977 if (len >= 2 && strncmp (str, "no", 2) == 0)
6978 {
6979 adding_ext = 0;
6980 len -= 2;
6981 str += 2;
6982 }
6983 else if (len > 0)
6984 adding_ext = 1;
6985
6986 if (len == 0)
6987 {
6988 error ("missing feature modifier after %qs", adding_ext ? "+"
6989 : "+no");
6990 return;
6991 }
6992
6993 /* Scan over the extensions table trying to find an exact match. */
6994 for (opt = all_extensions; opt->name != NULL; opt++)
6995 {
6996 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6997 {
6998 /* Add or remove the extension. */
6999 if (adding_ext)
7000 aarch64_isa_flags |= opt->flags_on;
7001 else
7002 aarch64_isa_flags &= ~(opt->flags_off);
7003 break;
7004 }
7005 }
7006
7007 if (opt->name == NULL)
7008 {
7009 /* Extension not found in list. */
7010 error ("unknown feature modifier %qs", str);
7011 return;
7012 }
7013
7014 str = ext;
7015 }
7016
7017 return;
7018 }
7019
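/* Illustrative sketch (not compiled): a minimal, self-contained model of the
   "+ext"/"+noext" parsing loop above, using a hypothetical two-entry feature
   table.  It mirrors the same left-to-right scan: a leading "no" selects
   flags_off, otherwise flags_on is ORed in.  */
#if 0
#include <string.h>

struct ext_entry { const char *name; unsigned flags_on, flags_off; };
static const struct ext_entry ext_table[] =
  { { "fp", 0x1, 0x1 }, { "simd", 0x2, 0x3 }, { NULL, 0, 0 } };

static unsigned
parse_ext_sketch (const char *str, unsigned flags)
{
  /* STR points at the first '+', e.g. "+simd+nofp".  */
  while (str != NULL && *str != 0)
    {
      const char *end;
      size_t len;
      int adding = 1;

      str++;		/* Skip the '+'.  */
      end = strchr (str, '+');
      len = end ? (size_t) (end - str) : strlen (str);

      if (len >= 2 && strncmp (str, "no", 2) == 0)
	{
	  adding = 0;
	  str += 2;
	  len -= 2;
	}

      for (const struct ext_entry *e = ext_table; e->name != NULL; e++)
	if (strlen (e->name) == len && strncmp (e->name, str, len) == 0)
	  {
	    flags = adding ? (flags | e->flags_on) : (flags & ~e->flags_off);
	    break;
	  }

      str = end;
    }
  return flags;
}
#endif
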
7020 /* Parse the ARCH string. */
7021
7022 static void
7023 aarch64_parse_arch (void)
7024 {
7025 char *ext;
7026 const struct processor *arch;
7027 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7028 size_t len;
7029
7030 strcpy (str, aarch64_arch_string);
7031
7032 ext = strchr (str, '+');
7033
7034 if (ext != NULL)
7035 len = ext - str;
7036 else
7037 len = strlen (str);
7038
7039 if (len == 0)
7040 {
7041 error ("missing arch name in -march=%qs", str);
7042 return;
7043 }
7044
7045 /* Loop through the list of supported ARCHs to find a match. */
7046 for (arch = all_architectures; arch->name != NULL; arch++)
7047 {
7048 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7049 {
7050 selected_arch = arch;
7051 aarch64_isa_flags = selected_arch->flags;
7052
7053 if (!selected_cpu)
7054 selected_cpu = &all_cores[selected_arch->core];
7055
7056 if (ext != NULL)
7057 {
7058 /* ARCH string contains at least one extension. */
7059 aarch64_parse_extension (ext);
7060 }
7061
7062 if (strcmp (selected_arch->arch, selected_cpu->arch))
7063 {
7064 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7065 selected_cpu->name, selected_arch->name);
7066 }
7067
7068 return;
7069 }
7070 }
7071
7072 /* ARCH name not found in list. */
7073 error ("unknown value %qs for -march", str);
7074 return;
7075 }
7076
7077 /* Parse the CPU string. */
7078
7079 static void
7080 aarch64_parse_cpu (void)
7081 {
7082 char *ext;
7083 const struct processor *cpu;
7084 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7085 size_t len;
7086
7087 strcpy (str, aarch64_cpu_string);
7088
7089 ext = strchr (str, '+');
7090
7091 if (ext != NULL)
7092 len = ext - str;
7093 else
7094 len = strlen (str);
7095
7096 if (len == 0)
7097 {
7098 error ("missing cpu name in -mcpu=%qs", str);
7099 return;
7100 }
7101
7102 /* Loop through the list of supported CPUs to find a match. */
7103 for (cpu = all_cores; cpu->name != NULL; cpu++)
7104 {
7105 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7106 {
7107 selected_cpu = cpu;
7108 aarch64_isa_flags = selected_cpu->flags;
7109
7110 if (ext != NULL)
7111 {
7112 /* CPU string contains at least one extension. */
7113 aarch64_parse_extension (ext);
7114 }
7115
7116 return;
7117 }
7118 }
7119
7120 /* CPU name not found in list. */
7121 error ("unknown value %qs for -mcpu", str);
7122 return;
7123 }
7124
7125 /* Parse the TUNE string. */
7126
7127 static void
7128 aarch64_parse_tune (void)
7129 {
7130 const struct processor *cpu;
7131 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7132 strcpy (str, aarch64_tune_string);
7133
7134 /* Loop through the list of supported CPUs to find a match. */
7135 for (cpu = all_cores; cpu->name != NULL; cpu++)
7136 {
7137 if (strcmp (cpu->name, str) == 0)
7138 {
7139 selected_tune = cpu;
7140 return;
7141 }
7142 }
7143
7144 /* CPU name not found in list. */
7145 error ("unknown value %qs for -mtune", str);
7146 return;
7147 }
7148
7149
7150 /* Implement TARGET_OPTION_OVERRIDE. */
7151
7152 static void
7153 aarch64_override_options (void)
7154 {
7155 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7156 If either of -march or -mtune is given, they override their
7157 respective component of -mcpu.
7158
7159 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7160 with -march: if -mcpu is not present on the command line, -march
7161 must set a sensible default CPU. */
7162 if (aarch64_cpu_string)
7163 {
7164 aarch64_parse_cpu ();
7165 }
7166
7167 if (aarch64_arch_string)
7168 {
7169 aarch64_parse_arch ();
7170 }
7171
7172 if (aarch64_tune_string)
7173 {
7174 aarch64_parse_tune ();
7175 }
7176
7177 #ifndef HAVE_AS_MABI_OPTION
7178 /* The compiler may have been configured with 2.23.* binutils, which does
7179 not have support for ILP32. */
7180 if (TARGET_ILP32)
7181 error ("Assembler does not support -mabi=ilp32");
7182 #endif
7183
7184 initialize_aarch64_code_model ();
7185
7186 aarch64_build_bitmask_table ();
7187
7188 /* This target defaults to strict volatile bitfields. */
7189 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7190 flag_strict_volatile_bitfields = 1;
7191
7192 /* If the user did not specify a processor, choose the default
7193 one for them. This will be the CPU set during configuration using
7194 --with-cpu, otherwise it is "generic". */
7195 if (!selected_cpu)
7196 {
7197 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7198 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7199 }
7200
7201 gcc_assert (selected_cpu);
7202
7203 if (!selected_tune)
7204 selected_tune = selected_cpu;
7205
7206 aarch64_tune_flags = selected_tune->flags;
7207 aarch64_tune = selected_tune->core;
7208 aarch64_tune_params = selected_tune->tune;
7209 aarch64_architecture_version = selected_cpu->architecture_version;
7210
7211 if (aarch64_fix_a53_err835769 == 2)
7212 {
7213 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7214 aarch64_fix_a53_err835769 = 1;
7215 #else
7216 aarch64_fix_a53_err835769 = 0;
7217 #endif
7218 }
7219
7220 aarch64_register_fma_steering ();
7221
7222 aarch64_override_options_after_change ();
7223 }
7224
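/* Worked example of the precedence above (illustrative): with
   "-mcpu=cortex-a57 -mtune=cortex-a53", aarch64_parse_cpu selects the
   Cortex-A57 core and its ISA flags, after which aarch64_parse_tune
   overrides only the tuning tables with those of the Cortex-A53.  */
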
7225 /* Implement targetm.override_options_after_change. */
7226
7227 static void
7228 aarch64_override_options_after_change (void)
7229 {
7230 if (flag_omit_frame_pointer)
7231 flag_omit_leaf_frame_pointer = false;
7232 else if (flag_omit_leaf_frame_pointer)
7233 flag_omit_frame_pointer = true;
7234
7235 /* If not optimizing for size, set the default
7236 alignment to what the target wants */
7237 if (!optimize_size)
7238 {
7239 if (align_loops <= 0)
7240 align_loops = aarch64_tune_params->loop_align;
7241 if (align_jumps <= 0)
7242 align_jumps = aarch64_tune_params->jump_align;
7243 if (align_functions <= 0)
7244 align_functions = aarch64_tune_params->function_align;
7245 }
7246 }
7247
7248 static struct machine_function *
7249 aarch64_init_machine_status (void)
7250 {
7251 struct machine_function *machine;
7252 machine = ggc_cleared_alloc<machine_function> ();
7253 return machine;
7254 }
7255
7256 void
7257 aarch64_init_expanders (void)
7258 {
7259 init_machine_status = aarch64_init_machine_status;
7260 }
7261
7262 /* Select the code model to use, checking that it is compatible with the PIC options. */
7263 static void
7264 initialize_aarch64_code_model (void)
7265 {
7266 if (flag_pic)
7267 {
7268 switch (aarch64_cmodel_var)
7269 {
7270 case AARCH64_CMODEL_TINY:
7271 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7272 break;
7273 case AARCH64_CMODEL_SMALL:
7274 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7275 break;
7276 case AARCH64_CMODEL_LARGE:
7277 sorry ("code model %qs with -f%s", "large",
7278 flag_pic > 1 ? "PIC" : "pic");
7279 default:
7280 gcc_unreachable ();
7281 }
7282 }
7283 else
7284 aarch64_cmodel = aarch64_cmodel_var;
7285 }
7286
7287 /* Return true if SYMBOL_REF X binds locally. */
7288
7289 static bool
7290 aarch64_symbol_binds_local_p (const_rtx x)
7291 {
7292 return (SYMBOL_REF_DECL (x)
7293 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7294 : SYMBOL_REF_LOCAL_P (x));
7295 }
7296
7297 /* Return true if SYMBOL_REF X is thread local. */
7298 static bool
7299 aarch64_tls_symbol_p (rtx x)
7300 {
7301 if (! TARGET_HAVE_TLS)
7302 return false;
7303
7304 if (GET_CODE (x) != SYMBOL_REF)
7305 return false;
7306
7307 return SYMBOL_REF_TLS_MODEL (x) != 0;
7308 }
7309
7310 /* Classify a TLS symbol into one of the TLS kinds. */
7311 enum aarch64_symbol_type
7312 aarch64_classify_tls_symbol (rtx x)
7313 {
7314 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7315
7316 switch (tls_kind)
7317 {
7318 case TLS_MODEL_GLOBAL_DYNAMIC:
7319 case TLS_MODEL_LOCAL_DYNAMIC:
7320 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7321
7322 case TLS_MODEL_INITIAL_EXEC:
7323 return SYMBOL_SMALL_GOTTPREL;
7324
7325 case TLS_MODEL_LOCAL_EXEC:
7326 return SYMBOL_SMALL_TPREL;
7327
7328 case TLS_MODEL_EMULATED:
7329 case TLS_MODEL_NONE:
7330 return SYMBOL_FORCE_TO_MEM;
7331
7332 default:
7333 gcc_unreachable ();
7334 }
7335 }
7336
7337 /* Return the method that should be used to access SYMBOL_REF or
7338 LABEL_REF X in context CONTEXT. */
7339
7340 enum aarch64_symbol_type
7341 aarch64_classify_symbol (rtx x, rtx offset,
7342 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7343 {
7344 if (GET_CODE (x) == LABEL_REF)
7345 {
7346 switch (aarch64_cmodel)
7347 {
7348 case AARCH64_CMODEL_LARGE:
7349 return SYMBOL_FORCE_TO_MEM;
7350
7351 case AARCH64_CMODEL_TINY_PIC:
7352 case AARCH64_CMODEL_TINY:
7353 return SYMBOL_TINY_ABSOLUTE;
7354
7355 case AARCH64_CMODEL_SMALL_PIC:
7356 case AARCH64_CMODEL_SMALL:
7357 return SYMBOL_SMALL_ABSOLUTE;
7358
7359 default:
7360 gcc_unreachable ();
7361 }
7362 }
7363
7364 if (GET_CODE (x) == SYMBOL_REF)
7365 {
7366 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7367 return SYMBOL_FORCE_TO_MEM;
7368
7369 if (aarch64_tls_symbol_p (x))
7370 return aarch64_classify_tls_symbol (x);
7371
7372 switch (aarch64_cmodel)
7373 {
7374 case AARCH64_CMODEL_TINY:
7375 /* When we retrieve a symbol + offset address, we have to make sure
7376 the offset does not cause overflow of the final address. But
7377 we have no way of knowing the address of the symbol at compile time,
7378 so we can't accurately say if the distance between the PC and
7379 symbol + offset is outside the addressable range of +/-1M in the
7380 TINY code model. So we rely on images not being greater than
7381 1M, cap the offset at 1M, and anything beyond 1M will have to
7382 be loaded using an alternative mechanism. */
7383 if (SYMBOL_REF_WEAK (x)
7384 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7385 return SYMBOL_FORCE_TO_MEM;
7386 return SYMBOL_TINY_ABSOLUTE;
7387
7388 case AARCH64_CMODEL_SMALL:
7389 /* Same reasoning as the tiny code model, but the offset cap here is
7390 4G. */
7391 if (SYMBOL_REF_WEAK (x)
7392 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7393 HOST_WIDE_INT_C (4294967264)))
7394 return SYMBOL_FORCE_TO_MEM;
7395 return SYMBOL_SMALL_ABSOLUTE;
7396
7397 case AARCH64_CMODEL_TINY_PIC:
7398 if (!aarch64_symbol_binds_local_p (x))
7399 return SYMBOL_TINY_GOT;
7400 return SYMBOL_TINY_ABSOLUTE;
7401
7402 case AARCH64_CMODEL_SMALL_PIC:
7403 if (!aarch64_symbol_binds_local_p (x))
7404 return SYMBOL_SMALL_GOT;
7405 return SYMBOL_SMALL_ABSOLUTE;
7406
7407 default:
7408 gcc_unreachable ();
7409 }
7410 }
7411
7412 /* By default push everything into the constant pool. */
7413 return SYMBOL_FORCE_TO_MEM;
7414 }
7415
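/* Worked example of the offset capping above (illustrative): in the tiny
   code model a non-weak reference to sym + 0x1000 stays
   SYMBOL_TINY_ABSOLUTE, whereas sym + 0x200000 (2M) exceeds the +/-1M cap
   and is classified as SYMBOL_FORCE_TO_MEM.  */
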
7416 bool
7417 aarch64_constant_address_p (rtx x)
7418 {
7419 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7420 }
7421
7422 bool
7423 aarch64_legitimate_pic_operand_p (rtx x)
7424 {
7425 if (GET_CODE (x) == SYMBOL_REF
7426 || (GET_CODE (x) == CONST
7427 && GET_CODE (XEXP (x, 0)) == PLUS
7428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7429 return false;
7430
7431 return true;
7432 }
7433
7434 /* Return true if X holds either a constant representable in the
7435 quarter-precision FP immediate encoding or floating-point +0.0. */
7436 static bool
7437 aarch64_valid_floating_const (machine_mode mode, rtx x)
7438 {
7439 if (!CONST_DOUBLE_P (x))
7440 return false;
7441
7442 if (aarch64_float_const_zero_rtx_p (x))
7443 return true;
7444
7445 /* Only the 0.0 case above is handled for TFmode; otherwise require SFmode or DFmode. */
7446 if (!(mode == SFmode || mode == DFmode))
7447 return false;
7448
7449 return aarch64_float_const_representable_p (x);
7450 }
7451
7452 static bool
7453 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7454 {
7455 /* Do not allow vector struct mode constants. We could support
7456 0 and -1 easily, but they need support in aarch64-simd.md. */
7457 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7458 return false;
7459
7460 /* This could probably go away because
7461 we now decompose CONST_INTs according to expand_mov_immediate. */
7462 if ((GET_CODE (x) == CONST_VECTOR
7463 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7464 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7465 return !targetm.cannot_force_const_mem (mode, x);
7466
7467 if (GET_CODE (x) == HIGH
7468 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7469 return true;
7470
7471 return aarch64_constant_address_p (x);
7472 }
7473
7474 rtx
7475 aarch64_load_tp (rtx target)
7476 {
7477 if (!target
7478 || GET_MODE (target) != Pmode
7479 || !register_operand (target, Pmode))
7480 target = gen_reg_rtx (Pmode);
7481
7482 /* Can return in any reg. */
7483 emit_insn (gen_aarch64_load_tp_hard (target));
7484 return target;
7485 }
7486
7487 /* On AAPCS systems, this is the "struct __va_list". */
7488 static GTY(()) tree va_list_type;
7489
7490 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7491 Return the type to use as __builtin_va_list.
7492
7493 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7494
7495 struct __va_list
7496 {
7497 void *__stack;
7498 void *__gr_top;
7499 void *__vr_top;
7500 int __gr_offs;
7501 int __vr_offs;
7502 }; */
7503
7504 static tree
7505 aarch64_build_builtin_va_list (void)
7506 {
7507 tree va_list_name;
7508 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7509
7510 /* Create the type. */
7511 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7512 /* Give it the required name. */
7513 va_list_name = build_decl (BUILTINS_LOCATION,
7514 TYPE_DECL,
7515 get_identifier ("__va_list"),
7516 va_list_type);
7517 DECL_ARTIFICIAL (va_list_name) = 1;
7518 TYPE_NAME (va_list_type) = va_list_name;
7519 TYPE_STUB_DECL (va_list_type) = va_list_name;
7520
7521 /* Create the fields. */
7522 f_stack = build_decl (BUILTINS_LOCATION,
7523 FIELD_DECL, get_identifier ("__stack"),
7524 ptr_type_node);
7525 f_grtop = build_decl (BUILTINS_LOCATION,
7526 FIELD_DECL, get_identifier ("__gr_top"),
7527 ptr_type_node);
7528 f_vrtop = build_decl (BUILTINS_LOCATION,
7529 FIELD_DECL, get_identifier ("__vr_top"),
7530 ptr_type_node);
7531 f_groff = build_decl (BUILTINS_LOCATION,
7532 FIELD_DECL, get_identifier ("__gr_offs"),
7533 integer_type_node);
7534 f_vroff = build_decl (BUILTINS_LOCATION,
7535 FIELD_DECL, get_identifier ("__vr_offs"),
7536 integer_type_node);
7537
7538 DECL_ARTIFICIAL (f_stack) = 1;
7539 DECL_ARTIFICIAL (f_grtop) = 1;
7540 DECL_ARTIFICIAL (f_vrtop) = 1;
7541 DECL_ARTIFICIAL (f_groff) = 1;
7542 DECL_ARTIFICIAL (f_vroff) = 1;
7543
7544 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7545 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7546 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7547 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7548 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7549
7550 TYPE_FIELDS (va_list_type) = f_stack;
7551 DECL_CHAIN (f_stack) = f_grtop;
7552 DECL_CHAIN (f_grtop) = f_vrtop;
7553 DECL_CHAIN (f_vrtop) = f_groff;
7554 DECL_CHAIN (f_groff) = f_vroff;
7555
7556 /* Compute its layout. */
7557 layout_type (va_list_type);
7558
7559 return va_list_type;
7560 }
7561
7562 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7563 static void
7564 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7565 {
7566 const CUMULATIVE_ARGS *cum;
7567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7568 tree stack, grtop, vrtop, groff, vroff;
7569 tree t;
7570 int gr_save_area_size;
7571 int vr_save_area_size;
7572 int vr_offset;
7573
7574 cum = &crtl->args.info;
7575 gr_save_area_size
7576 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7577 vr_save_area_size
7578 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7579
7580 if (!TARGET_FLOAT)
7581 {
7582 gcc_assert (cum->aapcs_nvrn == 0);
7583 vr_save_area_size = 0;
7584 }
7585
7586 f_stack = TYPE_FIELDS (va_list_type_node);
7587 f_grtop = DECL_CHAIN (f_stack);
7588 f_vrtop = DECL_CHAIN (f_grtop);
7589 f_groff = DECL_CHAIN (f_vrtop);
7590 f_vroff = DECL_CHAIN (f_groff);
7591
7592 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7593 NULL_TREE);
7594 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7595 NULL_TREE);
7596 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7597 NULL_TREE);
7598 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7599 NULL_TREE);
7600 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7601 NULL_TREE);
7602
7603 /* Emit code to initialize STACK, which points to the next varargs stack
7604 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7605 by named arguments. STACK is 8-byte aligned. */
7606 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7607 if (cum->aapcs_stack_size > 0)
7608 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7609 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7610 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7611
7612 /* Emit code to initialize GRTOP, the top of the GR save area.
7613 virtual_incoming_args_rtx should have been 16 byte aligned. */
7614 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7615 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7616 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7617
7618 /* Emit code to initialize VRTOP, the top of the VR save area.
7619 This address is gr_save_area_bytes below GRTOP, rounded
7620 down to the next 16-byte boundary. */
7621 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7622 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7623 STACK_BOUNDARY / BITS_PER_UNIT);
7624
7625 if (vr_offset)
7626 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7627 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7628 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7629
7630 /* Emit code to initialize GROFF, the offset from GRTOP of the
7631 next GPR argument. */
7632 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7633 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7634 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7635
7636 /* Likewise emit code to initialize VROFF, the offset from FTOP
7637 of the next VR argument. */
7638 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7639 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7640 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7641 }
7642
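/* Worked example of the initial va_list values set up above (illustrative):
   for a variadic callee with three named integer arguments and one named FP
   argument, gr_save_area_size is (8 - 3) * 8 = 40 and vr_save_area_size is
   (8 - 1) * 16 = 112, so va_start stores __gr_offs = -40, __vr_offs = -112,
   __gr_top at virtual_incoming_args_rtx and __vr_top 48 bytes (40 rounded up
   to 16) below it.  */
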
7643 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7644
7645 static tree
7646 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7647 gimple_seq *post_p ATTRIBUTE_UNUSED)
7648 {
7649 tree addr;
7650 bool indirect_p;
7651 bool is_ha; /* is HFA or HVA. */
7652 bool dw_align; /* double-word align. */
7653 machine_mode ag_mode = VOIDmode;
7654 int nregs;
7655 machine_mode mode;
7656
7657 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7658 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7659 HOST_WIDE_INT size, rsize, adjust, align;
7660 tree t, u, cond1, cond2;
7661
7662 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7663 if (indirect_p)
7664 type = build_pointer_type (type);
7665
7666 mode = TYPE_MODE (type);
7667
7668 f_stack = TYPE_FIELDS (va_list_type_node);
7669 f_grtop = DECL_CHAIN (f_stack);
7670 f_vrtop = DECL_CHAIN (f_grtop);
7671 f_groff = DECL_CHAIN (f_vrtop);
7672 f_vroff = DECL_CHAIN (f_groff);
7673
7674 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7675 f_stack, NULL_TREE);
7676 size = int_size_in_bytes (type);
7677 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7678
7679 dw_align = false;
7680 adjust = 0;
7681 if (aarch64_vfp_is_call_or_return_candidate (mode,
7682 type,
7683 &ag_mode,
7684 &nregs,
7685 &is_ha))
7686 {
7687 /* TYPE passed in fp/simd registers. */
7688 if (!TARGET_FLOAT)
7689 aarch64_err_no_fpadvsimd (mode, "varargs");
7690
7691 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7692 unshare_expr (valist), f_vrtop, NULL_TREE);
7693 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7694 unshare_expr (valist), f_vroff, NULL_TREE);
7695
7696 rsize = nregs * UNITS_PER_VREG;
7697
7698 if (is_ha)
7699 {
7700 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7701 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7702 }
7703 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7704 && size < UNITS_PER_VREG)
7705 {
7706 adjust = UNITS_PER_VREG - size;
7707 }
7708 }
7709 else
7710 {
7711 /* TYPE passed in general registers. */
7712 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7713 unshare_expr (valist), f_grtop, NULL_TREE);
7714 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7715 unshare_expr (valist), f_groff, NULL_TREE);
7716 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7717 nregs = rsize / UNITS_PER_WORD;
7718
7719 if (align > 8)
7720 dw_align = true;
7721
7722 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7723 && size < UNITS_PER_WORD)
7724 {
7725 adjust = UNITS_PER_WORD - size;
7726 }
7727 }
7728
7729 /* Get a local temporary for the field value. */
7730 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7731
7732 /* Emit code to branch if off >= 0. */
7733 t = build2 (GE_EXPR, boolean_type_node, off,
7734 build_int_cst (TREE_TYPE (off), 0));
7735 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7736
7737 if (dw_align)
7738 {
7739 /* Emit: offs = (offs + 15) & -16. */
7740 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7741 build_int_cst (TREE_TYPE (off), 15));
7742 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7743 build_int_cst (TREE_TYPE (off), -16));
7744 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7745 }
7746 else
7747 roundup = NULL;
7748
7749 /* Update ap.__[g|v]r_offs */
7750 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7751 build_int_cst (TREE_TYPE (off), rsize));
7752 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7753
7754 /* String up. */
7755 if (roundup)
7756 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7757
7758 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7759 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7760 build_int_cst (TREE_TYPE (f_off), 0));
7761 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7762
7763 /* String up: make sure the assignment happens before the use. */
7764 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7765 COND_EXPR_ELSE (cond1) = t;
7766
7767 /* Prepare the trees handling the argument that is passed on the stack;
7768 the top-level node will be stored in ON_STACK. */
7769 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7770 if (align > 8)
7771 {
7772 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7773 t = fold_convert (intDI_type_node, arg);
7774 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7775 build_int_cst (TREE_TYPE (t), 15));
7776 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7777 build_int_cst (TREE_TYPE (t), -16));
7778 t = fold_convert (TREE_TYPE (arg), t);
7779 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7780 }
7781 else
7782 roundup = NULL;
7783 /* Advance ap.__stack */
7784 t = fold_convert (intDI_type_node, arg);
7785 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7786 build_int_cst (TREE_TYPE (t), size + 7));
7787 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7788 build_int_cst (TREE_TYPE (t), -8));
7789 t = fold_convert (TREE_TYPE (arg), t);
7790 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7791 /* String up roundup and advance. */
7792 if (roundup)
7793 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7794 /* String up with arg */
7795 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7796 /* Big-endianness related address adjustment. */
7797 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7798 && size < UNITS_PER_WORD)
7799 {
7800 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7801 size_int (UNITS_PER_WORD - size));
7802 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7803 }
7804
7805 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7806 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7807
7808 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7809 t = off;
7810 if (adjust)
7811 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7812 build_int_cst (TREE_TYPE (off), adjust));
7813
7814 t = fold_convert (sizetype, t);
7815 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7816
7817 if (is_ha)
7818 {
7819 /* type ha; // treat as "struct {ftype field[n];}"
7820 ... [computing offs]
7821 for (i = 0; i < nregs; ++i, offs += 16)
7822 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7823 return ha; */
7824 int i;
7825 tree tmp_ha, field_t, field_ptr_t;
7826
7827 /* Declare a local variable. */
7828 tmp_ha = create_tmp_var_raw (type, "ha");
7829 gimple_add_tmp_var (tmp_ha);
7830
7831 /* Establish the base type. */
7832 switch (ag_mode)
7833 {
7834 case SFmode:
7835 field_t = float_type_node;
7836 field_ptr_t = float_ptr_type_node;
7837 break;
7838 case DFmode:
7839 field_t = double_type_node;
7840 field_ptr_t = double_ptr_type_node;
7841 break;
7842 case TFmode:
7843 field_t = long_double_type_node;
7844 field_ptr_t = long_double_ptr_type_node;
7845 break;
7846 /* Half precision and quad precision are not fully supported yet. Enable
7847 the following code once that support is complete; the correct type node
7848 for __fp16 * still needs to be found. */
7849 #if 0
7850 case HFmode:
7851 field_t = float_type_node;
7852 field_ptr_t = float_ptr_type_node;
7853 break;
7854 #endif
7855 case V2SImode:
7856 case V4SImode:
7857 {
7858 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7859 field_t = build_vector_type_for_mode (innertype, ag_mode);
7860 field_ptr_t = build_pointer_type (field_t);
7861 }
7862 break;
7863 default:
7864 gcc_assert (0);
7865 }
7866
7867 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7868 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7869 addr = t;
7870 t = fold_convert (field_ptr_t, addr);
7871 t = build2 (MODIFY_EXPR, field_t,
7872 build1 (INDIRECT_REF, field_t, tmp_ha),
7873 build1 (INDIRECT_REF, field_t, t));
7874
7875 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7876 for (i = 1; i < nregs; ++i)
7877 {
7878 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7879 u = fold_convert (field_ptr_t, addr);
7880 u = build2 (MODIFY_EXPR, field_t,
7881 build2 (MEM_REF, field_t, tmp_ha,
7882 build_int_cst (field_ptr_t,
7883 (i *
7884 int_size_in_bytes (field_t)))),
7885 build1 (INDIRECT_REF, field_t, u));
7886 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7887 }
7888
7889 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7890 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7891 }
7892
7893 COND_EXPR_ELSE (cond2) = t;
7894 addr = fold_convert (build_pointer_type (type), cond1);
7895 addr = build_va_arg_indirect_ref (addr);
7896
7897 if (indirect_p)
7898 addr = build_va_arg_indirect_ref (addr);
7899
7900 return addr;
7901 }
7902
7903 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7904
7905 static void
7906 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7907 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7908 int no_rtl)
7909 {
7910 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7911 CUMULATIVE_ARGS local_cum;
7912 int gr_saved, vr_saved;
7913
7914 /* The caller has advanced CUM up to, but not beyond, the last named
7915 argument. Advance a local copy of CUM past the last "real" named
7916 argument, to find out how many registers are left over. */
7917 local_cum = *cum;
7918 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7919
7920 /* Find out how many registers we need to save. */
7921 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7922 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7923
7924 if (!TARGET_FLOAT)
7925 {
7926 gcc_assert (local_cum.aapcs_nvrn == 0);
7927 vr_saved = 0;
7928 }
7929
7930 if (!no_rtl)
7931 {
7932 if (gr_saved > 0)
7933 {
7934 rtx ptr, mem;
7935
7936 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7937 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7938 - gr_saved * UNITS_PER_WORD);
7939 mem = gen_frame_mem (BLKmode, ptr);
7940 set_mem_alias_set (mem, get_varargs_alias_set ());
7941
7942 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7943 mem, gr_saved);
7944 }
7945 if (vr_saved > 0)
7946 {
7947 /* We can't use move_block_from_reg, because it will use
7948 the wrong mode, storing D regs only. */
7949 machine_mode mode = TImode;
7950 int off, i;
7951
7952 /* Set OFF to the offset from virtual_incoming_args_rtx of
7953 the first vector register. The VR save area lies below
7954 the GR one, and is aligned to 16 bytes. */
7955 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7956 STACK_BOUNDARY / BITS_PER_UNIT);
7957 off -= vr_saved * UNITS_PER_VREG;
7958
7959 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7960 {
7961 rtx ptr, mem;
7962
7963 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7964 mem = gen_frame_mem (mode, ptr);
7965 set_mem_alias_set (mem, get_varargs_alias_set ());
7966 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7967 off += UNITS_PER_VREG;
7968 }
7969 }
7970 }
7971
7972 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7973 any complication of having crtl->args.pretend_args_size changed. */
7974 cfun->machine->frame.saved_varargs_size
7975 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7976 STACK_BOUNDARY / BITS_PER_UNIT)
7977 + vr_saved * UNITS_PER_VREG);
7978 }
7979
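/* Illustrative layout of the register save area built above, with
   virtual_incoming_args_rtx at the top and addresses decreasing downwards:

	virtual_incoming_args_rtx:	incoming stack arguments
	- gr_saved * 8:			GR save area (x registers)
	- round up to 16 bytes
	- vr_saved * 16:		VR save area (q registers, saved
					as TImode)

   saved_varargs_size records the total size of both areas.  */
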
7980 static void
7981 aarch64_conditional_register_usage (void)
7982 {
7983 int i;
7984 if (!TARGET_FLOAT)
7985 {
7986 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7987 {
7988 fixed_regs[i] = 1;
7989 call_used_regs[i] = 1;
7990 }
7991 }
7992 }
7993
7994 /* Walk down the type tree of TYPE counting consecutive base elements.
7995 If *MODEP is VOIDmode, then set it to the first valid floating point
7996 type. If a non-floating point type is found, or if a floating point
7997 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7998 otherwise return the count in the sub-tree. */
7999 static int
8000 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8001 {
8002 machine_mode mode;
8003 HOST_WIDE_INT size;
8004
8005 switch (TREE_CODE (type))
8006 {
8007 case REAL_TYPE:
8008 mode = TYPE_MODE (type);
8009 if (mode != DFmode && mode != SFmode && mode != TFmode)
8010 return -1;
8011
8012 if (*modep == VOIDmode)
8013 *modep = mode;
8014
8015 if (*modep == mode)
8016 return 1;
8017
8018 break;
8019
8020 case COMPLEX_TYPE:
8021 mode = TYPE_MODE (TREE_TYPE (type));
8022 if (mode != DFmode && mode != SFmode && mode != TFmode)
8023 return -1;
8024
8025 if (*modep == VOIDmode)
8026 *modep = mode;
8027
8028 if (*modep == mode)
8029 return 2;
8030
8031 break;
8032
8033 case VECTOR_TYPE:
8034 /* Use V2SImode and V4SImode as representatives of all 64-bit
8035 and 128-bit vector types. */
8036 size = int_size_in_bytes (type);
8037 switch (size)
8038 {
8039 case 8:
8040 mode = V2SImode;
8041 break;
8042 case 16:
8043 mode = V4SImode;
8044 break;
8045 default:
8046 return -1;
8047 }
8048
8049 if (*modep == VOIDmode)
8050 *modep = mode;
8051
8052 /* Vector modes are considered to be opaque: two vectors are
8053 equivalent for the purposes of being homogeneous aggregates
8054 if they are the same size. */
8055 if (*modep == mode)
8056 return 1;
8057
8058 break;
8059
8060 case ARRAY_TYPE:
8061 {
8062 int count;
8063 tree index = TYPE_DOMAIN (type);
8064
8065 /* Can't handle incomplete types nor sizes that are not
8066 fixed. */
8067 if (!COMPLETE_TYPE_P (type)
8068 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8069 return -1;
8070
8071 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8072 if (count == -1
8073 || !index
8074 || !TYPE_MAX_VALUE (index)
8075 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8076 || !TYPE_MIN_VALUE (index)
8077 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8078 || count < 0)
8079 return -1;
8080
8081 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8082 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8083
8084 /* There must be no padding. */
8085 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8086 return -1;
8087
8088 return count;
8089 }
8090
8091 case RECORD_TYPE:
8092 {
8093 int count = 0;
8094 int sub_count;
8095 tree field;
8096
8097 /* Can't handle incomplete types nor sizes that are not
8098 fixed. */
8099 if (!COMPLETE_TYPE_P (type)
8100 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8101 return -1;
8102
8103 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8104 {
8105 if (TREE_CODE (field) != FIELD_DECL)
8106 continue;
8107
8108 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8109 if (sub_count < 0)
8110 return -1;
8111 count += sub_count;
8112 }
8113
8114 /* There must be no padding. */
8115 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8116 return -1;
8117
8118 return count;
8119 }
8120
8121 case UNION_TYPE:
8122 case QUAL_UNION_TYPE:
8123 {
8124 /* These aren't very interesting except in a degenerate case. */
8125 int count = 0;
8126 int sub_count;
8127 tree field;
8128
8129 /* Can't handle incomplete types nor sizes that are not
8130 fixed. */
8131 if (!COMPLETE_TYPE_P (type)
8132 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8133 return -1;
8134
8135 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8136 {
8137 if (TREE_CODE (field) != FIELD_DECL)
8138 continue;
8139
8140 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8141 if (sub_count < 0)
8142 return -1;
8143 count = count > sub_count ? count : sub_count;
8144 }
8145
8146 /* There must be no padding. */
8147 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8148 return -1;
8149
8150 return count;
8151 }
8152
8153 default:
8154 break;
8155 }
8156
8157 return -1;
8158 }
8159
8160 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8161 type as described in AAPCS64 \S 4.1.2.
8162
8163 See the comment above aarch64_composite_type_p for the notes on MODE. */
8164
8165 static bool
8166 aarch64_short_vector_p (const_tree type,
8167 machine_mode mode)
8168 {
8169 HOST_WIDE_INT size = -1;
8170
8171 if (type && TREE_CODE (type) == VECTOR_TYPE)
8172 size = int_size_in_bytes (type);
8173 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8174 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8175 size = GET_MODE_SIZE (mode);
8176
8177 return (size == 8 || size == 16);
8178 }
8179
8180 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8181 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8182 array types. The C99 floating-point complex types are also considered
8183 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8184 types, which are GCC extensions and out of the scope of AAPCS64, are
8185 treated as composite types here as well.
8186
8187 Note that MODE itself is not sufficient in determining whether a type
8188 is such a composite type or not. This is because
8189 stor-layout.c:compute_record_mode may have already changed the MODE
8190 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8191 structure with only one field may have its MODE set to the mode of the
8192 field. Also an integer mode whose size matches the size of the
8193 RECORD_TYPE type may be used to substitute the original mode
8194 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8195 solely relied on. */
8196
8197 static bool
8198 aarch64_composite_type_p (const_tree type,
8199 machine_mode mode)
8200 {
8201 if (aarch64_short_vector_p (type, mode))
8202 return false;
8203
8204 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8205 return true;
8206
8207 if (mode == BLKmode
8208 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8210 return true;
8211
8212 return false;
8213 }
8214
8215 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8216 shall be passed or returned in simd/fp register(s) (providing these
8217 parameter passing registers are available).
8218
8219 Upon successful return, *COUNT returns the number of needed registers,
8220 *BASE_MODE returns the mode of the individual register and when IS_HA
8221 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8222 floating-point aggregate or a homogeneous short-vector aggregate. */
8223
8224 static bool
8225 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8226 const_tree type,
8227 machine_mode *base_mode,
8228 int *count,
8229 bool *is_ha)
8230 {
8231 machine_mode new_mode = VOIDmode;
8232 bool composite_p = aarch64_composite_type_p (type, mode);
8233
8234 if (is_ha != NULL) *is_ha = false;
8235
8236 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8237 || aarch64_short_vector_p (type, mode))
8238 {
8239 *count = 1;
8240 new_mode = mode;
8241 }
8242 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8243 {
8244 if (is_ha != NULL) *is_ha = true;
8245 *count = 2;
8246 new_mode = GET_MODE_INNER (mode);
8247 }
8248 else if (type && composite_p)
8249 {
8250 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8251
8252 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8253 {
8254 if (is_ha != NULL) *is_ha = true;
8255 *count = ag_count;
8256 }
8257 else
8258 return false;
8259 }
8260 else
8261 return false;
8262
8263 *base_mode = new_mode;
8264 return true;
8265 }
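
/* For example (a sketch of the expected results rather than anything
   exercised here): struct { float x, y, z; } yields *count == 3,
   *base_mode == SFmode and *is_ha == true; _Complex double yields
   *count == 2, *base_mode == DFmode and *is_ha == true; a lone 128-bit
   short vector yields *count == 1 with *base_mode set to the vector
   mode itself.  */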
8266
8267 /* Implement TARGET_STRUCT_VALUE_RTX. */
8268
8269 static rtx
8270 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8271 int incoming ATTRIBUTE_UNUSED)
8272 {
8273 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8274 }
8275
8276 /* Implements target hook vector_mode_supported_p. */
8277 static bool
8278 aarch64_vector_mode_supported_p (machine_mode mode)
8279 {
8280 if (TARGET_SIMD
8281 && (mode == V4SImode || mode == V8HImode
8282 || mode == V16QImode || mode == V2DImode
8283 || mode == V2SImode || mode == V4HImode
8284 || mode == V8QImode || mode == V2SFmode
8285 || mode == V4SFmode || mode == V2DFmode
8286 || mode == V1DFmode))
8287 return true;
8288
8289 return false;
8290 }
8291
8292 /* Return appropriate SIMD container
8293 for MODE within a vector of WIDTH bits. */
8294 static machine_mode
8295 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8296 {
8297 gcc_assert (width == 64 || width == 128);
8298 if (TARGET_SIMD)
8299 {
8300 if (width == 128)
8301 switch (mode)
8302 {
8303 case DFmode:
8304 return V2DFmode;
8305 case SFmode:
8306 return V4SFmode;
8307 case SImode:
8308 return V4SImode;
8309 case HImode:
8310 return V8HImode;
8311 case QImode:
8312 return V16QImode;
8313 case DImode:
8314 return V2DImode;
8315 default:
8316 break;
8317 }
8318 else
8319 switch (mode)
8320 {
8321 case SFmode:
8322 return V2SFmode;
8323 case SImode:
8324 return V2SImode;
8325 case HImode:
8326 return V4HImode;
8327 case QImode:
8328 return V8QImode;
8329 default:
8330 break;
8331 }
8332 }
8333 return word_mode;
8334 }
8335
8336 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8337 static machine_mode
8338 aarch64_preferred_simd_mode (machine_mode mode)
8339 {
8340 return aarch64_simd_container_mode (mode, 128);
8341 }
8342
8343 /* Return the bitmask of possible vector sizes for the vectorizer
8344 to iterate over. */
8345 static unsigned int
8346 aarch64_autovectorize_vector_sizes (void)
8347 {
8348 return (16 | 8);
8349 }
8350
8351 /* Implement TARGET_MANGLE_TYPE. */
8352
8353 static const char *
8354 aarch64_mangle_type (const_tree type)
8355 {
8356 /* The AArch64 ABI documents say that "__va_list" has to be
8357 mangled as if it is in the "std" namespace.
8358 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8359 return "St9__va_list";
8360
8361 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8362 builtin types. */
8363 if (TYPE_NAME (type) != NULL)
8364 return aarch64_mangle_builtin_type (type);
8365
8366 /* Use the default mangling. */
8367 return NULL;
8368 }
8369
8370
8371 /* Return true if the rtx_insn contains a MEM RTX somewhere
8372 in it. */
8373
8374 static bool
8375 has_memory_op (rtx_insn *mem_insn)
8376 {
8377 subrtx_iterator::array_type array;
8378 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8379 if (MEM_P (*iter))
8380 return true;
8381
8382 return false;
8383 }
8384
8385 /* Find the first rtx_insn before insn that will generate an assembly
8386 instruction. */
8387
8388 static rtx_insn *
8389 aarch64_prev_real_insn (rtx_insn *insn)
8390 {
8391 if (!insn)
8392 return NULL;
8393
8394 do
8395 {
8396 insn = prev_real_insn (insn);
8397 }
8398 while (insn && recog_memoized (insn) < 0);
8399
8400 return insn;
8401 }
8402
8403 static bool
8404 is_madd_op (enum attr_type t1)
8405 {
8406 unsigned int i;
8407 /* A number of these may be AArch32 only. */
8408 enum attr_type mlatypes[] = {
8409 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8410 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8411 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8412 };
8413
8414 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8415 {
8416 if (t1 == mlatypes[i])
8417 return true;
8418 }
8419
8420 return false;
8421 }
8422
8423 /* Check if there is a register dependency between a load and the insn
8424 for which we hold recog_data. */
8425
8426 static bool
8427 dep_between_memop_and_curr (rtx memop)
8428 {
8429 rtx load_reg;
8430 int opno;
8431
8432 gcc_assert (GET_CODE (memop) == SET);
8433
8434 if (!REG_P (SET_DEST (memop)))
8435 return false;
8436
8437 load_reg = SET_DEST (memop);
8438 for (opno = 1; opno < recog_data.n_operands; opno++)
8439 {
8440 rtx operand = recog_data.operand[opno];
8441 if (REG_P (operand)
8442 && reg_overlap_mentioned_p (load_reg, operand))
8443 return true;
8444
8445 }
8446 return false;
8447 }
8448
8449
8450 /* When working around the Cortex-A53 erratum 835769,
8451 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8452 instruction and has a preceding memory instruction such that a NOP
8453 should be inserted between them. */
8454
8455 bool
8456 aarch64_madd_needs_nop (rtx_insn* insn)
8457 {
8458 enum attr_type attr_type;
8459 rtx_insn *prev;
8460 rtx body;
8461
8462 if (!aarch64_fix_a53_err835769)
8463 return false;
8464
8465 if (recog_memoized (insn) < 0)
8466 return false;
8467
8468 attr_type = get_attr_type (insn);
8469 if (!is_madd_op (attr_type))
8470 return false;
8471
8472 prev = aarch64_prev_real_insn (insn);
8473 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8474 Restore recog state to INSN to avoid state corruption. */
8475 extract_constrain_insn_cached (insn);
8476
8477 if (!prev || !has_memory_op (prev))
8478 return false;
8479
8480 body = single_set (prev);
8481
8482 /* If the previous insn is a memory op and there is no dependency between
8483 it and the DImode madd, emit a NOP between them. If body is NULL then we
8484 have a complex memory operation, probably a load/store pair.
8485 Be conservative for now and emit a NOP. */
8486 if (GET_MODE (recog_data.operand[0]) == DImode
8487 && (!body || !dep_between_memop_and_curr (body)))
8488 return true;
8489
8490 return false;
8491
8492 }
8493
8494
8495 /* Implement FINAL_PRESCAN_INSN. */
8496
8497 void
8498 aarch64_final_prescan_insn (rtx_insn *insn)
8499 {
8500 if (aarch64_madd_needs_nop (insn))
8501 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8502 }
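
/* An illustrative example (register choices are arbitrary): with
   -mfix-cortex-a53-835769 enabled, a sequence such as

	ldr	x2, [x0]
	madd	x3, x4, x5, x6

   is emitted as

	ldr	x2, [x0]
	nop	// between mem op and mult-accumulate
	madd	x3, x4, x5, x6

   so that the 64-bit multiply-accumulate no longer immediately follows
   the memory operation.  */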
8503
8504
8505 /* Return the equivalent letter for size. */
8506 static char
8507 sizetochar (int size)
8508 {
8509 switch (size)
8510 {
8511 case 64: return 'd';
8512 case 32: return 's';
8513 case 16: return 'h';
8514 case 8 : return 'b';
8515 default: gcc_unreachable ();
8516 }
8517 }
8518
8519 /* Return true iff x is a uniform vector of floating-point
8520 constants, and the constant can be represented in
8521 quarter-precision form. Note, as aarch64_float_const_representable_p
8522 rejects both +0.0 and -0.0, we will also reject them here. */
8523 static bool
8524 aarch64_vect_float_const_representable_p (rtx x)
8525 {
8526 int i = 0;
8527 REAL_VALUE_TYPE r0, ri;
8528 rtx x0, xi;
8529
8530 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8531 return false;
8532
8533 x0 = CONST_VECTOR_ELT (x, 0);
8534 if (!CONST_DOUBLE_P (x0))
8535 return false;
8536
8537 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8538
8539 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8540 {
8541 xi = CONST_VECTOR_ELT (x, i);
8542 if (!CONST_DOUBLE_P (xi))
8543 return false;
8544
8545 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8546 if (!REAL_VALUES_EQUAL (r0, ri))
8547 return false;
8548 }
8549
8550 return aarch64_float_const_representable_p (x0);
8551 }
8552
8553 /* Return true for valid and false for invalid. */
8554 bool
8555 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8556 struct simd_immediate_info *info)
8557 {
8558 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8559 matches = 1; \
8560 for (i = 0; i < idx; i += (STRIDE)) \
8561 if (!(TEST)) \
8562 matches = 0; \
8563 if (matches) \
8564 { \
8565 immtype = (CLASS); \
8566 elsize = (ELSIZE); \
8567 eshift = (SHIFT); \
8568 emvn = (NEG); \
8569 break; \
8570 }
8571
8572 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8573 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8574 unsigned char bytes[16];
8575 int immtype = -1, matches;
8576 unsigned int invmask = inverse ? 0xff : 0;
8577 int eshift, emvn;
8578
8579 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8580 {
8581 if (! (aarch64_simd_imm_zero_p (op, mode)
8582 || aarch64_vect_float_const_representable_p (op)))
8583 return false;
8584
8585 if (info)
8586 {
8587 info->value = CONST_VECTOR_ELT (op, 0);
8588 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8589 info->mvn = false;
8590 info->shift = 0;
8591 }
8592
8593 return true;
8594 }
8595
8596 /* Splat vector constant out into a byte vector. */
8597 for (i = 0; i < n_elts; i++)
8598 {
8599 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8600 it must be laid out in the vector register in reverse order. */
8601 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8602 unsigned HOST_WIDE_INT elpart;
8603 unsigned int part, parts;
8604
8605 if (CONST_INT_P (el))
8606 {
8607 elpart = INTVAL (el);
8608 parts = 1;
8609 }
8610 else if (GET_CODE (el) == CONST_DOUBLE)
8611 {
8612 elpart = CONST_DOUBLE_LOW (el);
8613 parts = 2;
8614 }
8615 else
8616 gcc_unreachable ();
8617
8618 for (part = 0; part < parts; part++)
8619 {
8620 unsigned int byte;
8621 for (byte = 0; byte < innersize; byte++)
8622 {
8623 bytes[idx++] = (elpart & 0xff) ^ invmask;
8624 elpart >>= BITS_PER_UNIT;
8625 }
8626 if (GET_CODE (el) == CONST_DOUBLE)
8627 elpart = CONST_DOUBLE_HIGH (el);
8628 }
8629 }
8630
8631 /* Sanity check. */
8632 gcc_assert (idx == GET_MODE_SIZE (mode));
8633
8634 do
8635 {
8636 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8637 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8638
8639 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8640 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8641
8642 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8643 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8644
8645 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8646 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8647
8648 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8649
8650 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8651
8652 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8653 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8654
8655 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8656 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8657
8658 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8659 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8660
8661 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8662 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8663
8664 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8665
8666 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8667
8668 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8669 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8670
8671 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8672 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8673
8674 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8675 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8676
8677 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8678 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8679
8680 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8681
8682 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8683 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8684 }
8685 while (0);
8686
8687 if (immtype == -1)
8688 return false;
8689
8690 if (info)
8691 {
8692 info->element_width = elsize;
8693 info->mvn = emvn != 0;
8694 info->shift = eshift;
8695
8696 unsigned HOST_WIDE_INT imm = 0;
8697
8698 if (immtype >= 12 && immtype <= 15)
8699 info->msl = true;
8700
8701 /* Un-invert bytes of recognized vector, if necessary. */
8702 if (invmask != 0)
8703 for (i = 0; i < idx; i++)
8704 bytes[i] ^= invmask;
8705
8706 if (immtype == 17)
8707 {
8708 /* FIXME: Broken on 32-bit H_W_I hosts. */
8709 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8710
8711 for (i = 0; i < 8; i++)
8712 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8713 << (i * BITS_PER_UNIT);
8714
8715
8716 info->value = GEN_INT (imm);
8717 }
8718 else
8719 {
8720 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8721 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8722
8723 /* Construct 'abcdefgh' because the assembler cannot handle
8724 generic constants. */
8725 if (info->mvn)
8726 imm = ~imm;
8727 imm = (imm >> info->shift) & 0xff;
8728 info->value = GEN_INT (imm);
8729 }
8730 }
8731
8732 return true;
8733 #undef CHECK
8734 }
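
/* A worked example (values chosen for illustration): the V4SImode
   constant { 0x00ab0000, 0x00ab0000, 0x00ab0000, 0x00ab0000 } splats
   each element to the bytes { 0x00, 0x00, 0xab, 0x00 }, which matches
   the immtype 2 CHECK above.  That gives element_width == 32,
   shift == 16, mvn == false and value == 0xab, i.e. the constant can
   be emitted as "movi v0.4s, 0xab, lsl 16".  */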
8735
8736 /* Check if immediate shift constants are within range. */
8737 bool
8738 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8739 {
8740 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8741 if (left)
8742 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8743 else
8744 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8745 }
8746
8747 /* Return true if X is a uniform vector where all elements
8748 are either the floating-point constant 0.0 or the
8749 integer constant 0. */
8750 bool
8751 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8752 {
8753 return x == CONST0_RTX (mode);
8754 }
8755
8756 bool
8757 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8758 {
8759 HOST_WIDE_INT imm = INTVAL (x);
8760 int i;
8761
8762 for (i = 0; i < 8; i++)
8763 {
8764 unsigned int byte = imm & 0xff;
8765 if (byte != 0xff && byte != 0)
8766 return false;
8767 imm >>= 8;
8768 }
8769
8770 return true;
8771 }
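
/* For instance, 0xff00ffff00ff0000 is accepted here because every byte
   is either 0x00 or 0xff, whereas 0x0000000000000012 is rejected
   because of the 0x12 byte.  */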
8772
8773 bool
8774 aarch64_mov_operand_p (rtx x,
8775 enum aarch64_symbol_context context,
8776 machine_mode mode)
8777 {
8778 if (GET_CODE (x) == HIGH
8779 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8780 return true;
8781
8782 if (CONST_INT_P (x))
8783 return true;
8784
8785 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8786 return true;
8787
8788 return aarch64_classify_symbolic_expression (x, context)
8789 == SYMBOL_TINY_ABSOLUTE;
8790 }
8791
8792 /* Return a const_int vector of VAL. */
8793 rtx
8794 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8795 {
8796 int nunits = GET_MODE_NUNITS (mode);
8797 rtvec v = rtvec_alloc (nunits);
8798 int i;
8799
8800 for (i = 0; i < nunits; i++)
8801 RTVEC_ELT (v, i) = GEN_INT (val);
8802
8803 return gen_rtx_CONST_VECTOR (mode, v);
8804 }
8805
8806 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8807
8808 bool
8809 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8810 {
8811 machine_mode vmode;
8812
8813 gcc_assert (!VECTOR_MODE_P (mode));
8814 vmode = aarch64_preferred_simd_mode (mode);
8815 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8816 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8817 }
8818
8819 /* Construct and return a PARALLEL RTX vector with elements numbering the
8820 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8821 the vector - from the perspective of the architecture. This does not
8822 line up with GCC's perspective on lane numbers, so we end up with
8823 different masks depending on our target endian-ness. The diagram
8824 below may help. We must draw the distinction when building masks
8825 which select one half of the vector. An instruction selecting
8826 architectural low-lanes for a big-endian target must be described using
8827 a mask selecting GCC high-lanes.
8828
8829 Big-Endian Little-Endian
8830
8831 GCC 0 1 2 3 3 2 1 0
8832 | x | x | x | x | | x | x | x | x |
8833 Architecture 3 2 1 0 3 2 1 0
8834
8835 Low Mask: { 2, 3 } { 0, 1 }
8836 High Mask: { 0, 1 } { 2, 3 }
8837 */
8838
8839 rtx
8840 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8841 {
8842 int nunits = GET_MODE_NUNITS (mode);
8843 rtvec v = rtvec_alloc (nunits / 2);
8844 int high_base = nunits / 2;
8845 int low_base = 0;
8846 int base;
8847 rtx t1;
8848 int i;
8849
8850 if (BYTES_BIG_ENDIAN)
8851 base = high ? low_base : high_base;
8852 else
8853 base = high ? high_base : low_base;
8854
8855 for (i = 0; i < nunits / 2; i++)
8856 RTVEC_ELT (v, i) = GEN_INT (base + i);
8857
8858 t1 = gen_rtx_PARALLEL (mode, v);
8859 return t1;
8860 }
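
/* For example, for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   diagram above.  */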
8861
8862 /* Check OP for validity as a PARALLEL RTX vector with elements
8863 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8864 from the perspective of the architecture. See the diagram above
8865 aarch64_simd_vect_par_cnst_half for more details. */
8866
8867 bool
8868 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8869 bool high)
8870 {
8871 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8872 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8873 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8874 int i = 0;
8875
8876 if (!VECTOR_MODE_P (mode))
8877 return false;
8878
8879 if (count_op != count_ideal)
8880 return false;
8881
8882 for (i = 0; i < count_ideal; i++)
8883 {
8884 rtx elt_op = XVECEXP (op, 0, i);
8885 rtx elt_ideal = XVECEXP (ideal, 0, i);
8886
8887 if (!CONST_INT_P (elt_op)
8888 || INTVAL (elt_ideal) != INTVAL (elt_op))
8889 return false;
8890 }
8891 return true;
8892 }
8893
8894 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8895 HIGH (exclusive). */
8896 void
8897 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8898 const_tree exp)
8899 {
8900 HOST_WIDE_INT lane;
8901 gcc_assert (CONST_INT_P (operand));
8902 lane = INTVAL (operand);
8903
8904 if (lane < low || lane >= high)
8905 {
8906 if (exp)
8907 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
8908 else
8909 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
8910 }
8911 }
8912
8913 /* Return TRUE if OP is a valid vector addressing mode. */
8914 bool
8915 aarch64_simd_mem_operand_p (rtx op)
8916 {
8917 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8918 || REG_P (XEXP (op, 0)));
8919 }
8920
8921 /* Emit a register copy from operand to operand, taking care not to
8922 early-clobber source registers in the process.
8923
8924 COUNT is the number of components into which the copy needs to be
8925 decomposed. */
8926 void
8927 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8928 unsigned int count)
8929 {
8930 unsigned int i;
8931 int rdest = REGNO (operands[0]);
8932 int rsrc = REGNO (operands[1]);
8933
8934 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8935 || rdest < rsrc)
8936 for (i = 0; i < count; i++)
8937 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8938 gen_rtx_REG (mode, rsrc + i));
8939 else
8940 for (i = 0; i < count; i++)
8941 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8942 gen_rtx_REG (mode, rsrc + count - i - 1));
8943 }
8944
8945 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8946 one of VSTRUCT modes: OI, CI or XI. */
8947 int
8948 aarch64_simd_attr_length_move (rtx_insn *insn)
8949 {
8950 machine_mode mode;
8951
8952 extract_insn_cached (insn);
8953
8954 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8955 {
8956 mode = GET_MODE (recog_data.operand[0]);
8957 switch (mode)
8958 {
8959 case OImode:
8960 return 8;
8961 case CImode:
8962 return 12;
8963 case XImode:
8964 return 16;
8965 default:
8966 gcc_unreachable ();
8967 }
8968 }
8969 return 4;
8970 }
8971
8972 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8973 one of VSTRUCT modes: OI, CI, EI, or XI. */
8974 int
8975 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8976 {
8977 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8978 }
8979
8980 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8981 alignment of a vector to 128 bits. */
8982 static HOST_WIDE_INT
8983 aarch64_simd_vector_alignment (const_tree type)
8984 {
8985 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8986 return MIN (align, 128);
8987 }
8988
8989 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8990 static bool
8991 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8992 {
8993 if (is_packed)
8994 return false;
8995
8996 /* We guarantee alignment for vectors up to 128-bits. */
8997 if (tree_int_cst_compare (TYPE_SIZE (type),
8998 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8999 return false;
9000
9001 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9002 return true;
9003 }
9004
9005 /* If VALS is a vector constant that can be loaded into a register
9006 using DUP, generate instructions to do so and return an RTX to
9007 assign to the register. Otherwise return NULL_RTX. */
9008 static rtx
9009 aarch64_simd_dup_constant (rtx vals)
9010 {
9011 machine_mode mode = GET_MODE (vals);
9012 machine_mode inner_mode = GET_MODE_INNER (mode);
9013 int n_elts = GET_MODE_NUNITS (mode);
9014 bool all_same = true;
9015 rtx x;
9016 int i;
9017
9018 if (GET_CODE (vals) != CONST_VECTOR)
9019 return NULL_RTX;
9020
9021 for (i = 1; i < n_elts; ++i)
9022 {
9023 x = CONST_VECTOR_ELT (vals, i);
9024 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9025 all_same = false;
9026 }
9027
9028 if (!all_same)
9029 return NULL_RTX;
9030
9031 /* We can load this constant by using DUP and a constant in a
9032 single ARM register. This will be cheaper than a vector
9033 load. */
9034 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9035 return gen_rtx_VEC_DUPLICATE (mode, x);
9036 }
9037
9038
9039 /* Generate code to load VALS, which is a PARALLEL containing only
9040 constants (for vec_init) or CONST_VECTOR, efficiently into a
9041 register. Returns an RTX to copy into the register, or NULL_RTX
9042 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9043 static rtx
9044 aarch64_simd_make_constant (rtx vals)
9045 {
9046 machine_mode mode = GET_MODE (vals);
9047 rtx const_dup;
9048 rtx const_vec = NULL_RTX;
9049 int n_elts = GET_MODE_NUNITS (mode);
9050 int n_const = 0;
9051 int i;
9052
9053 if (GET_CODE (vals) == CONST_VECTOR)
9054 const_vec = vals;
9055 else if (GET_CODE (vals) == PARALLEL)
9056 {
9057 /* A CONST_VECTOR must contain only CONST_INTs and
9058 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9059 Only store valid constants in a CONST_VECTOR. */
9060 for (i = 0; i < n_elts; ++i)
9061 {
9062 rtx x = XVECEXP (vals, 0, i);
9063 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9064 n_const++;
9065 }
9066 if (n_const == n_elts)
9067 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9068 }
9069 else
9070 gcc_unreachable ();
9071
9072 if (const_vec != NULL_RTX
9073 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9074 /* Load using MOVI/MVNI. */
9075 return const_vec;
9076 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9077 /* Loaded using DUP. */
9078 return const_dup;
9079 else if (const_vec != NULL_RTX)
9080 /* Load from constant pool. We can not take advantage of single-cycle
9081 LD1 because we need a PC-relative addressing mode. */
9082 return const_vec;
9083 else
9084 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9085 We can not construct an initializer. */
9086 return NULL_RTX;
9087 }
9088
9089 void
9090 aarch64_expand_vector_init (rtx target, rtx vals)
9091 {
9092 machine_mode mode = GET_MODE (target);
9093 machine_mode inner_mode = GET_MODE_INNER (mode);
9094 int n_elts = GET_MODE_NUNITS (mode);
9095 int n_var = 0;
9096 rtx any_const = NULL_RTX;
9097 bool all_same = true;
9098
9099 for (int i = 0; i < n_elts; ++i)
9100 {
9101 rtx x = XVECEXP (vals, 0, i);
9102 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9103 ++n_var;
9104 else
9105 any_const = x;
9106
9107 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9108 all_same = false;
9109 }
9110
9111 if (n_var == 0)
9112 {
9113 rtx constant = aarch64_simd_make_constant (vals);
9114 if (constant != NULL_RTX)
9115 {
9116 emit_move_insn (target, constant);
9117 return;
9118 }
9119 }
9120
9121 /* Splat a single non-constant element if we can. */
9122 if (all_same)
9123 {
9124 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9125 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9126 return;
9127 }
9128
9129 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9130 varying fields. Hope that this is more efficient than using the stack. */
9131 if (n_var <= n_elts/2)
9132 {
9133 rtx copy = copy_rtx (vals);
9134
9135 /* Load constant part of vector. We really don't care what goes into the
9136 parts we will overwrite, but we're more likely to be able to load the
9137 constant efficiently if it has fewer, larger, repeating parts
9138 (see aarch64_simd_valid_immediate). */
9139 for (int i = 0; i < n_elts; i++)
9140 {
9141 rtx x = XVECEXP (vals, 0, i);
9142 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9143 continue;
9144 rtx subst = any_const;
9145 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9146 {
9147 /* Look in the copied vector, as more elements are const. */
9148 rtx test = XVECEXP (copy, 0, i ^ bit);
9149 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9150 {
9151 subst = test;
9152 break;
9153 }
9154 }
9155 XVECEXP (copy, 0, i) = subst;
9156 }
9157 aarch64_expand_vector_init (target, copy);
9158
9159 /* Insert variables. */
9160 enum insn_code icode = optab_handler (vec_set_optab, mode);
9161 gcc_assert (icode != CODE_FOR_nothing);
9162
9163 for (int i = 0; i < n_elts; i++)
9164 {
9165 rtx x = XVECEXP (vals, 0, i);
9166 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9167 continue;
9168 x = copy_to_mode_reg (inner_mode, x);
9169 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9170 }
9171 return;
9172 }
9173
9174 /* Construct the vector in memory one field at a time
9175 and load the whole vector. */
9176 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9177 for (int i = 0; i < n_elts; i++)
9178 emit_move_insn (adjust_address_nv (mem, inner_mode,
9179 i * GET_MODE_SIZE (inner_mode)),
9180 XVECEXP (vals, 0, i));
9181 emit_move_insn (target, mem);
9182
9183 }
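
/* A sketch of the "half constant" path above (values are illustrative
   only): initializing a V4SImode vector with { x, 1, 2, 3 }, where x is
   not constant, first loads the constant vector { 2, 1, 2, 3 } (lane 0
   is filled from another, constant lane of the copy) and then inserts x
   into lane 0 via the vec_set pattern.  */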
9184
9185 static unsigned HOST_WIDE_INT
9186 aarch64_shift_truncation_mask (machine_mode mode)
9187 {
9188 return
9189 (aarch64_vector_mode_supported_p (mode)
9190 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9191 }
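
/* E.g. this returns 31 for SImode and 63 for DImode, but 0 for vector
   modes such as V4SImode, i.e. no shift-amount truncation is assumed
   for the SIMD modes.  */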
9192
9193 #ifndef TLS_SECTION_ASM_FLAG
9194 #define TLS_SECTION_ASM_FLAG 'T'
9195 #endif
9196
9197 void
9198 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9199 tree decl ATTRIBUTE_UNUSED)
9200 {
9201 char flagchars[10], *f = flagchars;
9202
9203 /* If we have already declared this section, we can use an
9204 abbreviated form to switch back to it -- unless this section is
9205 part of a COMDAT group, in which case GAS requires the full
9206 declaration every time. */
9207 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9208 && (flags & SECTION_DECLARED))
9209 {
9210 fprintf (asm_out_file, "\t.section\t%s\n", name);
9211 return;
9212 }
9213
9214 if (!(flags & SECTION_DEBUG))
9215 *f++ = 'a';
9216 if (flags & SECTION_WRITE)
9217 *f++ = 'w';
9218 if (flags & SECTION_CODE)
9219 *f++ = 'x';
9220 if (flags & SECTION_SMALL)
9221 *f++ = 's';
9222 if (flags & SECTION_MERGE)
9223 *f++ = 'M';
9224 if (flags & SECTION_STRINGS)
9225 *f++ = 'S';
9226 if (flags & SECTION_TLS)
9227 *f++ = TLS_SECTION_ASM_FLAG;
9228 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9229 *f++ = 'G';
9230 *f = '\0';
9231
9232 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9233
9234 if (!(flags & SECTION_NOTYPE))
9235 {
9236 const char *type;
9237 const char *format;
9238
9239 if (flags & SECTION_BSS)
9240 type = "nobits";
9241 else
9242 type = "progbits";
9243
9244 #ifdef TYPE_OPERAND_FMT
9245 format = "," TYPE_OPERAND_FMT;
9246 #else
9247 format = ",@%s";
9248 #endif
9249
9250 fprintf (asm_out_file, format, type);
9251
9252 if (flags & SECTION_ENTSIZE)
9253 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9254 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9255 {
9256 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9257 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9258 else
9259 fprintf (asm_out_file, ",%s,comdat",
9260 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9261 }
9262 }
9263
9264 putc ('\n', asm_out_file);
9265 }
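
/* A typical result (illustrative, assuming the default ",@%s" type
   syntax): a not-yet-declared writable data section ".foo" is emitted
   as

	.section	.foo,"aw",@progbits

   while switching back to an already-declared non-COMDAT section only
   prints the short ".section .foo" form.  */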
9266
9267 /* Select a format to encode pointers in exception handling data. */
9268 int
9269 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9270 {
9271 int type;
9272 switch (aarch64_cmodel)
9273 {
9274 case AARCH64_CMODEL_TINY:
9275 case AARCH64_CMODEL_TINY_PIC:
9276 case AARCH64_CMODEL_SMALL:
9277 case AARCH64_CMODEL_SMALL_PIC:
9278 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9279 for everything. */
9280 type = DW_EH_PE_sdata4;
9281 break;
9282 default:
9283 /* No assumptions here. 8-byte relocs required. */
9284 type = DW_EH_PE_sdata8;
9285 break;
9286 }
9287 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9288 }
9289
9290 /* Emit load exclusive. */
9291
9292 static void
9293 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9294 rtx mem, rtx model_rtx)
9295 {
9296 rtx (*gen) (rtx, rtx, rtx);
9297
9298 switch (mode)
9299 {
9300 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9301 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9302 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9303 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9304 default:
9305 gcc_unreachable ();
9306 }
9307
9308 emit_insn (gen (rval, mem, model_rtx));
9309 }
9310
9311 /* Emit store exclusive. */
9312
9313 static void
9314 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9315 rtx rval, rtx mem, rtx model_rtx)
9316 {
9317 rtx (*gen) (rtx, rtx, rtx, rtx);
9318
9319 switch (mode)
9320 {
9321 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9322 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9323 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9324 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9325 default:
9326 gcc_unreachable ();
9327 }
9328
9329 emit_insn (gen (bval, rval, mem, model_rtx));
9330 }
9331
9332 /* Mark the previous jump instruction as unlikely. */
9333
9334 static void
9335 aarch64_emit_unlikely_jump (rtx insn)
9336 {
9337 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9338
9339 insn = emit_jump_insn (insn);
9340 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9341 }
9342
9343 /* Expand a compare and swap pattern. */
9344
9345 void
9346 aarch64_expand_compare_and_swap (rtx operands[])
9347 {
9348 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9349 machine_mode mode, cmp_mode;
9350 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9351
9352 bval = operands[0];
9353 rval = operands[1];
9354 mem = operands[2];
9355 oldval = operands[3];
9356 newval = operands[4];
9357 is_weak = operands[5];
9358 mod_s = operands[6];
9359 mod_f = operands[7];
9360 mode = GET_MODE (mem);
9361 cmp_mode = mode;
9362
9363 /* Normally the succ memory model must be stronger than fail, but in the
9364 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9365 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9366
9367 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9368 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9369 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9370
9371 switch (mode)
9372 {
9373 case QImode:
9374 case HImode:
9375 /* For short modes, we're going to perform the comparison in SImode,
9376 so do the zero-extension now. */
9377 cmp_mode = SImode;
9378 rval = gen_reg_rtx (SImode);
9379 oldval = convert_modes (SImode, mode, oldval, true);
9380 /* Fall through. */
9381
9382 case SImode:
9383 case DImode:
9384 /* Force the value into a register if needed. */
9385 if (!aarch64_plus_operand (oldval, mode))
9386 oldval = force_reg (cmp_mode, oldval);
9387 break;
9388
9389 default:
9390 gcc_unreachable ();
9391 }
9392
9393 switch (mode)
9394 {
9395 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9396 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9397 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9398 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9399 default:
9400 gcc_unreachable ();
9401 }
9402
9403 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9404
9405 if (mode == QImode || mode == HImode)
9406 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9407
9408 x = gen_rtx_REG (CCmode, CC_REGNUM);
9409 x = gen_rtx_EQ (SImode, x, const0_rtx);
9410 emit_insn (gen_rtx_SET (bval, x));
9411 }
9412
9413 /* Emit a barrier appropriate for memory model MODEL at the end of a
9414 sequence implementing an atomic operation. */
9415
9416 static void
9417 aarch64_emit_post_barrier (enum memmodel model)
9418 {
9419 const enum memmodel base_model = memmodel_base (model);
9420
9421 if (is_mm_sync (model)
9422 && (base_model == MEMMODEL_ACQUIRE
9423 || base_model == MEMMODEL_ACQ_REL
9424 || base_model == MEMMODEL_SEQ_CST))
9425 {
9426 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9427 }
9428 }
9429
9430 /* Split a compare and swap pattern. */
9431
9432 void
9433 aarch64_split_compare_and_swap (rtx operands[])
9434 {
9435 rtx rval, mem, oldval, newval, scratch;
9436 machine_mode mode;
9437 bool is_weak;
9438 rtx_code_label *label1, *label2;
9439 rtx x, cond;
9440 enum memmodel model;
9441 rtx model_rtx;
9442
9443 rval = operands[0];
9444 mem = operands[1];
9445 oldval = operands[2];
9446 newval = operands[3];
9447 is_weak = (operands[4] != const0_rtx);
9448 model_rtx = operands[5];
9449 scratch = operands[7];
9450 mode = GET_MODE (mem);
9451 model = memmodel_from_int (INTVAL (model_rtx));
9452
9453 label1 = NULL;
9454 if (!is_weak)
9455 {
9456 label1 = gen_label_rtx ();
9457 emit_label (label1);
9458 }
9459 label2 = gen_label_rtx ();
9460
9461 /* The initial load can be relaxed for a __sync operation since a final
9462 barrier will be emitted to stop code hoisting. */
9463 if (is_mm_sync (model))
9464 aarch64_emit_load_exclusive (mode, rval, mem,
9465 GEN_INT (MEMMODEL_RELAXED));
9466 else
9467 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9468
9469 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9470 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9471 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9472 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9473 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9474
9475 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9476
9477 if (!is_weak)
9478 {
9479 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9480 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9481 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9482 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9483 }
9484 else
9485 {
9486 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9487 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9488 emit_insn (gen_rtx_SET (cond, x));
9489 }
9490
9491 emit_label (label2);
9492
9493 /* Emit any final barrier needed for a __sync operation. */
9494 if (is_mm_sync (model))
9495 aarch64_emit_post_barrier (model);
9496 }
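
/* Roughly speaking (an illustrative sketch, not exact compiler output),
   a strong sequentially-consistent SImode compare-and-swap splits into
   a loop of the form

   1:	ldaxr	w0, [x1]
	cmp	w0, w2
	bne	2f
	stlxr	w4, w3, [x1]
	cbnz	w4, 1b
   2:

   whereas the weak variant drops the backward branch and instead
   compares the store-exclusive status register against zero.  */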
9497
9498 /* Split an atomic operation. */
9499
9500 void
9501 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9502 rtx value, rtx model_rtx, rtx cond)
9503 {
9504 machine_mode mode = GET_MODE (mem);
9505 machine_mode wmode = (mode == DImode ? DImode : SImode);
9506 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9507 const bool is_sync = is_mm_sync (model);
9508 rtx_code_label *label;
9509 rtx x;
9510
9511 label = gen_label_rtx ();
9512 emit_label (label);
9513
9514 if (new_out)
9515 new_out = gen_lowpart (wmode, new_out);
9516 if (old_out)
9517 old_out = gen_lowpart (wmode, old_out);
9518 else
9519 old_out = new_out;
9520 value = simplify_gen_subreg (wmode, value, mode, 0);
9521
9522 /* The initial load can be relaxed for a __sync operation since a final
9523 barrier will be emitted to stop code hoisting. */
9524 if (is_sync)
9525 aarch64_emit_load_exclusive (mode, old_out, mem,
9526 GEN_INT (MEMMODEL_RELAXED));
9527 else
9528 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9529
9530 switch (code)
9531 {
9532 case SET:
9533 new_out = value;
9534 break;
9535
9536 case NOT:
9537 x = gen_rtx_AND (wmode, old_out, value);
9538 emit_insn (gen_rtx_SET (new_out, x));
9539 x = gen_rtx_NOT (wmode, new_out);
9540 emit_insn (gen_rtx_SET (new_out, x));
9541 break;
9542
9543 case MINUS:
9544 if (CONST_INT_P (value))
9545 {
9546 value = GEN_INT (-INTVAL (value));
9547 code = PLUS;
9548 }
9549 /* Fall through. */
9550
9551 default:
9552 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9553 emit_insn (gen_rtx_SET (new_out, x));
9554 break;
9555 }
9556
9557 aarch64_emit_store_exclusive (mode, cond, mem,
9558 gen_lowpart (mode, new_out), model_rtx);
9559
9560 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9561 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9562 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9563 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9564
9565 /* Emit any final barrier needed for a __sync operation. */
9566 if (is_sync)
9567 aarch64_emit_post_barrier (model);
9568 }
9569
9570 static void
9571 aarch64_print_extension (void)
9572 {
9573 const struct aarch64_option_extension *opt = NULL;
9574
9575 for (opt = all_extensions; opt->name != NULL; opt++)
9576 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9577 asm_fprintf (asm_out_file, "+%s", opt->name);
9578
9579 asm_fprintf (asm_out_file, "\n");
9580 }
9581
9582 static void
9583 aarch64_start_file (void)
9584 {
9585 if (selected_arch)
9586 {
9587 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9588 aarch64_print_extension ();
9589 }
9590 else if (selected_cpu)
9591 {
9592 const char *truncated_name
9593 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9594 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9595 aarch64_print_extension ();
9596 }
9597 default_file_start ();
9598 }
9599
9600 /* Target hook for c_mode_for_suffix. */
9601 static machine_mode
9602 aarch64_c_mode_for_suffix (char suffix)
9603 {
9604 if (suffix == 'q')
9605 return TFmode;
9606
9607 return VOIDmode;
9608 }
9609
9610 /* We can only represent floating point constants which will fit in
9611 "quarter-precision" values. These values are characterised by
9612 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
9613 by:
9614
9615 (-1)^s * (n/16) * 2^r
9616
9617 Where:
9618 's' is the sign bit.
9619 'n' is an integer in the range 16 <= n <= 31.
9620 'r' is an integer in the range -3 <= r <= 4. */
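
/* For example, 0.25 is representable as (16/16) * 2^-2 and 1.5 as
   (24/16) * 2^0, while 0.2 is not, since it has no finite binary
   mantissa of the required form.  The representable magnitudes run
   from 0.125 (n == 16, r == -3) up to 31.0 (n == 31, r == 4).  */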
9621
9622 /* Return true iff X can be represented by a quarter-precision
9623 floating point immediate operand. Note, we cannot represent 0.0. */
9624 bool
9625 aarch64_float_const_representable_p (rtx x)
9626 {
9627 /* This represents our current view of how many bits
9628 make up the mantissa. */
9629 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9630 int exponent;
9631 unsigned HOST_WIDE_INT mantissa, mask;
9632 REAL_VALUE_TYPE r, m;
9633 bool fail;
9634
9635 if (!CONST_DOUBLE_P (x))
9636 return false;
9637
9638 if (GET_MODE (x) == VOIDmode)
9639 return false;
9640
9641 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9642
9643 /* We cannot represent infinities, NaNs or +/-zero. We won't
9644 know if we have +zero until we analyse the mantissa, but we
9645 can reject the other invalid values. */
9646 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9647 || REAL_VALUE_MINUS_ZERO (r))
9648 return false;
9649
9650 /* Extract exponent. */
9651 r = real_value_abs (&r);
9652 exponent = REAL_EXP (&r);
9653
9654 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9655 highest (sign) bit, with a fixed binary point at bit point_pos.
9656 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9657 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9658 bits for the mantissa, this can fail (low bits will be lost). */
9659 real_ldexp (&m, &r, point_pos - exponent);
9660 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9661
9662 /* If the low part of the mantissa has bits set we cannot represent
9663 the value. */
9664 if (w.elt (0) != 0)
9665 return false;
9666 /* We have rejected the lower HOST_WIDE_INT, so update our
9667 understanding of how many bits lie in the mantissa and
9668 look only at the high HOST_WIDE_INT. */
9669 mantissa = w.elt (1);
9670 point_pos -= HOST_BITS_PER_WIDE_INT;
9671
9672 /* We can only represent values with a mantissa of the form 1.xxxx. */
9673 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9674 if ((mantissa & mask) != 0)
9675 return false;
9676
9677 /* Having filtered unrepresentable values, we may now remove all
9678 but the highest 5 bits. */
9679 mantissa >>= point_pos - 5;
9680
9681 /* We cannot represent the value 0.0, so reject it. This is handled
9682 elsewhere. */
9683 if (mantissa == 0)
9684 return false;
9685
9686 /* Then, as bit 4 is always set, we can mask it off, leaving
9687 the mantissa in the range [0, 15]. */
9688 mantissa &= ~(1 << 4);
9689 gcc_assert (mantissa <= 15);
9690
9691 /* GCC internally does not use IEEE754-like encoding (where normalized
9692 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9693 Our mantissa values are shifted 4 places to the left relative to
9694 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9695 by 5 places to correct for GCC's representation. */
9696 exponent = 5 - exponent;
9697
9698 return (exponent >= 0 && exponent <= 7);
9699 }
9700
9701 char*
9702 aarch64_output_simd_mov_immediate (rtx const_vector,
9703 machine_mode mode,
9704 unsigned width)
9705 {
9706 bool is_valid;
9707 static char templ[40];
9708 const char *mnemonic;
9709 const char *shift_op;
9710 unsigned int lane_count = 0;
9711 char element_char;
9712
9713 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9714
9715 /* This will return true to show CONST_VECTOR is legal for use as an
9716 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9717 also update INFO to show how the immediate should be generated. */
9718 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9719 gcc_assert (is_valid);
9720
9721 element_char = sizetochar (info.element_width);
9722 lane_count = width / info.element_width;
9723
9724 mode = GET_MODE_INNER (mode);
9725 if (mode == SFmode || mode == DFmode)
9726 {
9727 gcc_assert (info.shift == 0 && ! info.mvn);
9728 if (aarch64_float_const_zero_rtx_p (info.value))
9729 info.value = GEN_INT (0);
9730 else
9731 {
9732 #define buf_size 20
9733 REAL_VALUE_TYPE r;
9734 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9735 char float_buf[buf_size] = {'\0'};
9736 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9737 #undef buf_size
9738
9739 if (lane_count == 1)
9740 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9741 else
9742 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9743 lane_count, element_char, float_buf);
9744 return templ;
9745 }
9746 }
9747
9748 mnemonic = info.mvn ? "mvni" : "movi";
9749 shift_op = info.msl ? "msl" : "lsl";
9750
9751 if (lane_count == 1)
9752 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9753 mnemonic, UINTVAL (info.value));
9754 else if (info.shift)
9755 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9756 ", %s %d", mnemonic, lane_count, element_char,
9757 UINTVAL (info.value), shift_op, info.shift);
9758 else
9759 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9760 mnemonic, lane_count, element_char, UINTVAL (info.value));
9761 return templ;
9762 }
9763
9764 char*
9765 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9766 machine_mode mode)
9767 {
9768 machine_mode vmode;
9769
9770 gcc_assert (!VECTOR_MODE_P (mode));
9771 vmode = aarch64_simd_container_mode (mode, 64);
9772 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9773 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9774 }
9775
9776 /* Split operands into moves from op[1] + op[2] into op[0]. */
9777
9778 void
9779 aarch64_split_combinev16qi (rtx operands[3])
9780 {
9781 unsigned int dest = REGNO (operands[0]);
9782 unsigned int src1 = REGNO (operands[1]);
9783 unsigned int src2 = REGNO (operands[2]);
9784 machine_mode halfmode = GET_MODE (operands[1]);
9785 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9786 rtx destlo, desthi;
9787
9788 gcc_assert (halfmode == V16QImode);
9789
9790 if (src1 == dest && src2 == dest + halfregs)
9791 {
9792 /* No-op move. Can't split to nothing; emit something. */
9793 emit_note (NOTE_INSN_DELETED);
9794 return;
9795 }
9796
9797 /* Preserve register attributes for variable tracking. */
9798 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9799 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9800 GET_MODE_SIZE (halfmode));
9801
9802 /* Special case of reversed high/low parts. */
9803 if (reg_overlap_mentioned_p (operands[2], destlo)
9804 && reg_overlap_mentioned_p (operands[1], desthi))
9805 {
9806 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9807 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9808 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9809 }
9810 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9811 {
9812 /* Try to avoid unnecessary moves if part of the result
9813 is in the right place already. */
9814 if (src1 != dest)
9815 emit_move_insn (destlo, operands[1]);
9816 if (src2 != dest + halfregs)
9817 emit_move_insn (desthi, operands[2]);
9818 }
9819 else
9820 {
9821 if (src2 != dest + halfregs)
9822 emit_move_insn (desthi, operands[2]);
9823 if (src1 != dest)
9824 emit_move_insn (destlo, operands[1]);
9825 }
9826 }
9827
9828 /* vec_perm support. */
9829
9830 #define MAX_VECT_LEN 16
9831
9832 struct expand_vec_perm_d
9833 {
9834 rtx target, op0, op1;
9835 unsigned char perm[MAX_VECT_LEN];
9836 machine_mode vmode;
9837 unsigned char nelt;
9838 bool one_vector_p;
9839 bool testing_p;
9840 };
9841
9842 /* Generate a variable permutation. */
9843
9844 static void
9845 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9846 {
9847 machine_mode vmode = GET_MODE (target);
9848 bool one_vector_p = rtx_equal_p (op0, op1);
9849
9850 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9851 gcc_checking_assert (GET_MODE (op0) == vmode);
9852 gcc_checking_assert (GET_MODE (op1) == vmode);
9853 gcc_checking_assert (GET_MODE (sel) == vmode);
9854 gcc_checking_assert (TARGET_SIMD);
9855
9856 if (one_vector_p)
9857 {
9858 if (vmode == V8QImode)
9859 {
9860 /* Expand the argument to a V16QI mode by duplicating it. */
9861 rtx pair = gen_reg_rtx (V16QImode);
9862 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9863 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9864 }
9865 else
9866 {
9867 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9868 }
9869 }
9870 else
9871 {
9872 rtx pair;
9873
9874 if (vmode == V8QImode)
9875 {
9876 pair = gen_reg_rtx (V16QImode);
9877 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9878 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9879 }
9880 else
9881 {
9882 pair = gen_reg_rtx (OImode);
9883 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9884 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9885 }
9886 }
9887 }
9888
9889 void
9890 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9891 {
9892 machine_mode vmode = GET_MODE (target);
9893 unsigned int nelt = GET_MODE_NUNITS (vmode);
9894 bool one_vector_p = rtx_equal_p (op0, op1);
9895 rtx mask;
9896
9897 /* The TBL instruction does not use a modulo index, so we must take care
9898 of that ourselves. */
9899 mask = aarch64_simd_gen_const_vector_dup (vmode,
9900 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9901 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9902
9903 /* For big-endian, we also need to reverse the index within the vector
9904 (but not which vector). */
9905 if (BYTES_BIG_ENDIAN)
9906 {
9907 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9908 if (!one_vector_p)
9909 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9910 sel = expand_simple_binop (vmode, XOR, sel, mask,
9911 NULL, 0, OPTAB_LIB_WIDEN);
9912 }
9913 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9914 }
9915
9916 /* Recognize patterns suitable for the TRN instructions. */
9917 static bool
9918 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9919 {
9920 unsigned int i, odd, mask, nelt = d->nelt;
9921 rtx out, in0, in1, x;
9922 rtx (*gen) (rtx, rtx, rtx);
9923 machine_mode vmode = d->vmode;
9924
9925 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9926 return false;
9927
9928 /* Note that these are little-endian tests.
9929 We correct for big-endian later. */
9930 if (d->perm[0] == 0)
9931 odd = 0;
9932 else if (d->perm[0] == 1)
9933 odd = 1;
9934 else
9935 return false;
9936 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9937
9938 for (i = 0; i < nelt; i += 2)
9939 {
9940 if (d->perm[i] != i + odd)
9941 return false;
9942 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9943 return false;
9944 }
9945
9946 /* Success! */
9947 if (d->testing_p)
9948 return true;
9949
9950 in0 = d->op0;
9951 in1 = d->op1;
9952 if (BYTES_BIG_ENDIAN)
9953 {
9954 x = in0, in0 = in1, in1 = x;
9955 odd = !odd;
9956 }
9957 out = d->target;
9958
9959 if (odd)
9960 {
9961 switch (vmode)
9962 {
9963 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9964 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9965 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9966 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9967 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9968 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9969 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9970 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9971 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9972 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9973 default:
9974 return false;
9975 }
9976 }
9977 else
9978 {
9979 switch (vmode)
9980 {
9981 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9982 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9983 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9984 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9985 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9986 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9987 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9988 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9989 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9990 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9991 default:
9992 return false;
9993 }
9994 }
9995
9996 emit_insn (gen (out, in0, in1));
9997 return true;
9998 }
9999
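/* Illustrative TRN example: for V4SImode with inputs a = {a0,a1,a2,a3}
   and b = {b0,b1,b2,b3}, the index vector {0,4,2,6} matches odd == 0
   and emits TRN1, giving {a0,b0,a2,b2}; {1,5,3,7} matches odd == 1
   and emits TRN2, giving {a1,b1,a3,b3}.  */
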
10000 /* Recognize patterns suitable for the UZP instructions. */
10001 static bool
10002 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10003 {
10004 unsigned int i, odd, mask, nelt = d->nelt;
10005 rtx out, in0, in1, x;
10006 rtx (*gen) (rtx, rtx, rtx);
10007 machine_mode vmode = d->vmode;
10008
10009 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10010 return false;
10011
10012 /* Note that these are little-endian tests.
10013 We correct for big-endian later. */
10014 if (d->perm[0] == 0)
10015 odd = 0;
10016 else if (d->perm[0] == 1)
10017 odd = 1;
10018 else
10019 return false;
10020 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10021
10022 for (i = 0; i < nelt; i++)
10023 {
10024 unsigned elt = (i * 2 + odd) & mask;
10025 if (d->perm[i] != elt)
10026 return false;
10027 }
10028
10029 /* Success! */
10030 if (d->testing_p)
10031 return true;
10032
10033 in0 = d->op0;
10034 in1 = d->op1;
10035 if (BYTES_BIG_ENDIAN)
10036 {
10037 x = in0, in0 = in1, in1 = x;
10038 odd = !odd;
10039 }
10040 out = d->target;
10041
10042 if (odd)
10043 {
10044 switch (vmode)
10045 {
10046 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10047 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10048 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10049 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10050 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10051 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10052 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10053 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10054 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10055 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10056 default:
10057 return false;
10058 }
10059 }
10060 else
10061 {
10062 switch (vmode)
10063 {
10064 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10065 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10066 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10067 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10068 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10069 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10070 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10071 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10072 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10073 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10074 default:
10075 return false;
10076 }
10077 }
10078
10079 emit_insn (gen (out, in0, in1));
10080 return true;
10081 }
10082
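/* Illustrative UZP example: for V4SImode inputs a and b, the index
   vector {0,2,4,6} matches odd == 0 and emits UZP1, giving
   {a0,a2,b0,b2}; {1,3,5,7} matches odd == 1 and emits UZP2, giving
   {a1,a3,b1,b3}.  */
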
10083 /* Recognize patterns suitable for the ZIP instructions. */
10084 static bool
10085 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10086 {
10087 unsigned int i, high, mask, nelt = d->nelt;
10088 rtx out, in0, in1, x;
10089 rtx (*gen) (rtx, rtx, rtx);
10090 machine_mode vmode = d->vmode;
10091
10092 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10093 return false;
10094
10095 /* Note that these are little-endian tests.
10096 We correct for big-endian later. */
10097 high = nelt / 2;
10098 if (d->perm[0] == high)
10099 /* Do Nothing. */
10100 ;
10101 else if (d->perm[0] == 0)
10102 high = 0;
10103 else
10104 return false;
10105 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10106
10107 for (i = 0; i < nelt / 2; i++)
10108 {
10109 unsigned elt = (i + high) & mask;
10110 if (d->perm[i * 2] != elt)
10111 return false;
10112 elt = (elt + nelt) & mask;
10113 if (d->perm[i * 2 + 1] != elt)
10114 return false;
10115 }
10116
10117 /* Success! */
10118 if (d->testing_p)
10119 return true;
10120
10121 in0 = d->op0;
10122 in1 = d->op1;
10123 if (BYTES_BIG_ENDIAN)
10124 {
10125 x = in0, in0 = in1, in1 = x;
10126 high = !high;
10127 }
10128 out = d->target;
10129
10130 if (high)
10131 {
10132 switch (vmode)
10133 {
10134 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10135 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10136 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10137 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10138 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10139 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10140 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10141 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10142 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10143 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10144 default:
10145 return false;
10146 }
10147 }
10148 else
10149 {
10150 switch (vmode)
10151 {
10152 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10153 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10154 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10155 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10156 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10157 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10158 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10159 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10160 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10161 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10162 default:
10163 return false;
10164 }
10165 }
10166
10167 emit_insn (gen (out, in0, in1));
10168 return true;
10169 }
10170
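/* Illustrative ZIP example: for V4SImode inputs a and b, the index
   vector {0,4,1,5} matches high == 0 and emits ZIP1, giving
   {a0,b0,a1,b1}; {2,6,3,7} matches high == nelt / 2 and emits ZIP2,
   giving {a2,b2,a3,b3}.  */
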
10171 /* Recognize patterns for the EXT insn. */
10172
10173 static bool
10174 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10175 {
10176 unsigned int i, nelt = d->nelt;
10177 rtx (*gen) (rtx, rtx, rtx, rtx);
10178 rtx offset;
10179
10180 unsigned int location = d->perm[0]; /* Always < nelt. */
10181
10182 /* Check if the extracted indices are increasing by one. */
10183 for (i = 1; i < nelt; i++)
10184 {
10185 unsigned int required = location + i;
10186 if (d->one_vector_p)
10187 {
10188 /* We'll pass the same vector in twice, so allow indices to wrap. */
10189 required &= (nelt - 1);
10190 }
10191 if (d->perm[i] != required)
10192 return false;
10193 }
10194
10195 switch (d->vmode)
10196 {
10197 case V16QImode: gen = gen_aarch64_extv16qi; break;
10198 case V8QImode: gen = gen_aarch64_extv8qi; break;
10199 case V4HImode: gen = gen_aarch64_extv4hi; break;
10200 case V8HImode: gen = gen_aarch64_extv8hi; break;
10201 case V2SImode: gen = gen_aarch64_extv2si; break;
10202 case V4SImode: gen = gen_aarch64_extv4si; break;
10203 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10204 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10205 case V2DImode: gen = gen_aarch64_extv2di; break;
10206 case V2DFmode: gen = gen_aarch64_extv2df; break;
10207 default:
10208 return false;
10209 }
10210
10211 /* Success! */
10212 if (d->testing_p)
10213 return true;
10214
10215 /* The case where (location == 0) is a no-op for both big- and little-endian,
10216 and is removed by the mid-end at optimization levels -O1 and higher. */
10217
10218 if (BYTES_BIG_ENDIAN && (location != 0))
10219 {
10220 /* After setup, we want the high elements of the first vector (stored
10221 at the LSB end of the register), and the low elements of the second
10222 vector (stored at the MSB end of the register). So swap. */
10223 std::swap (d->op0, d->op1);
10224 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10225 location = nelt - location;
10226 }
10227
10228 offset = GEN_INT (location);
10229 emit_insn (gen (d->target, d->op0, d->op1, offset));
10230 return true;
10231 }
10232
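/* Illustrative EXT example: for V4SImode with two distinct inputs a
   and b, the index vector {1,2,3,4} has location == 1 and strictly
   consecutive indices, so it emits EXT with an element offset of 1,
   giving {a1,a2,a3,b0}.  On big-endian the operands are swapped and
   the offset becomes nelt - location == 3, as in the code above.  */
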
10233 /* Recognize patterns for the REV insns. */
10234
10235 static bool
10236 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10237 {
10238 unsigned int i, j, diff, nelt = d->nelt;
10239 rtx (*gen) (rtx, rtx);
10240
10241 if (!d->one_vector_p)
10242 return false;
10243
10244 diff = d->perm[0];
10245 switch (diff)
10246 {
10247 case 7:
10248 switch (d->vmode)
10249 {
10250 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10251 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10252 default:
10253 return false;
10254 }
10255 break;
10256 case 3:
10257 switch (d->vmode)
10258 {
10259 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10260 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10261 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10262 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10263 default:
10264 return false;
10265 }
10266 break;
10267 case 1:
10268 switch (d->vmode)
10269 {
10270 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10271 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10272 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10273 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10274 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10275 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10276 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10277 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10278 default:
10279 return false;
10280 }
10281 break;
10282 default:
10283 return false;
10284 }
10285
10286 for (i = 0; i < nelt ; i += diff + 1)
10287 for (j = 0; j <= diff; j += 1)
10288 {
10289 /* This is guaranteed to be true as the value of diff
10290 is 7, 3 or 1, and we should have enough elements in the
10291 queue to generate this. Getting a vector mask with a
10292 value of diff other than these values implies that
10293 something is wrong by the time we get here. */
10294 gcc_assert (i + j < nelt);
10295 if (d->perm[i + j] != i + diff - j)
10296 return false;
10297 }
10298
10299 /* Success! */
10300 if (d->testing_p)
10301 return true;
10302
10303 emit_insn (gen (d->target, d->op0));
10304 return true;
10305 }
10306
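/* Illustrative REV example: for V4SImode the single-vector index
   vector {1,0,3,2} has diff == 1 and emits REV64, swapping adjacent
   32-bit elements within each 64-bit chunk; for V16QImode,
   {7,6,...,0,15,14,...,8} has diff == 7 and likewise maps to REV64.  */
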
10307 static bool
10308 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10309 {
10310 rtx (*gen) (rtx, rtx, rtx);
10311 rtx out = d->target;
10312 rtx in0;
10313 machine_mode vmode = d->vmode;
10314 unsigned int i, elt, nelt = d->nelt;
10315 rtx lane;
10316
10317 elt = d->perm[0];
10318 for (i = 1; i < nelt; i++)
10319 {
10320 if (elt != d->perm[i])
10321 return false;
10322 }
10323
10324 /* The generic preparation in aarch64_expand_vec_perm_const_1
10325 swaps the operand order and the permute indices if it finds
10326 d->perm[0] to be in the second operand. Thus, we can always
10327 use d->op0 and need not do any extra arithmetic to get the
10328 correct lane number. */
10329 in0 = d->op0;
10330 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10331
10332 switch (vmode)
10333 {
10334 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10335 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10336 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10337 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10338 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10339 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10340 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10341 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10342 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10343 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10344 default:
10345 return false;
10346 }
10347
10348 emit_insn (gen (out, in0, lane));
10349 return true;
10350 }
10351
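/* Illustrative DUP example: for V4SImode the index vector {2,2,2,2}
   broadcasts lane 2 of OP0 into every element of the result.  */
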
10352 static bool
10353 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10354 {
10355 rtx rperm[MAX_VECT_LEN], sel;
10356 machine_mode vmode = d->vmode;
10357 unsigned int i, nelt = d->nelt;
10358
10359 if (d->testing_p)
10360 return true;
10361
10362 /* Generic code will try constant permutation twice: once with the
10363 original mode and again with the elements lowered to QImode.
10364 So wait and don't do the selector expansion ourselves. */
10365 if (vmode != V8QImode && vmode != V16QImode)
10366 return false;
10367
10368 for (i = 0; i < nelt; ++i)
10369 {
10370 int nunits = GET_MODE_NUNITS (vmode);
10371
10372 /* If big-endian and using two vectors, we end up with a weird mixed-endian
10373 mode on NEON. Reverse the index within each word but not the word
10374 itself. */
10375 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10376 : d->perm[i]);
10377 }
10378 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10379 sel = force_reg (vmode, sel);
10380
10381 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10382 return true;
10383 }
10384
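/* Illustrative TBL example: for V16QImode (nunits == 16) on big-endian,
   each selector byte i is rewritten as i ^ 15 before the constant
   vector is forced into a register, so an index of 0 becomes 15, 1
   becomes 14, and so on, while the choice of input vector is kept.  */
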
10385 static bool
10386 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10387 {
10388 /* The pattern matching functions above are written to look for a small
10389 number to begin the sequence (0, 1, N/2). If we begin with an index
10390 from the second operand, we can swap the operands. */
10391 if (d->perm[0] >= d->nelt)
10392 {
10393 unsigned i, nelt = d->nelt;
10394
10395 gcc_assert (nelt == (nelt & -nelt));
10396 for (i = 0; i < nelt; ++i)
10397 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10398
10399 std::swap (d->op0, d->op1);
10400 }
10401
10402 if (TARGET_SIMD)
10403 {
10404 if (aarch64_evpc_rev (d))
10405 return true;
10406 else if (aarch64_evpc_ext (d))
10407 return true;
10408 else if (aarch64_evpc_dup (d))
10409 return true;
10410 else if (aarch64_evpc_zip (d))
10411 return true;
10412 else if (aarch64_evpc_uzp (d))
10413 return true;
10414 else if (aarch64_evpc_trn (d))
10415 return true;
10416 return aarch64_evpc_tbl (d);
10417 }
10418 return false;
10419 }
10420
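/* Illustrative example of the operand swap above: for V4SImode the
   index vector {5,1,7,3} starts in the second operand, so each index
   is XORed with nelt (4) to give {1,5,3,7} and the operands are
   swapped; the result is then matched as TRN2 on the swapped inputs.  */
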
10421 /* Expand a vec_perm_const pattern. */
10422
10423 bool
10424 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10425 {
10426 struct expand_vec_perm_d d;
10427 int i, nelt, which;
10428
10429 d.target = target;
10430 d.op0 = op0;
10431 d.op1 = op1;
10432
10433 d.vmode = GET_MODE (target);
10434 gcc_assert (VECTOR_MODE_P (d.vmode));
10435 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10436 d.testing_p = false;
10437
10438 for (i = which = 0; i < nelt; ++i)
10439 {
10440 rtx e = XVECEXP (sel, 0, i);
10441 int ei = INTVAL (e) & (2 * nelt - 1);
10442 which |= (ei < nelt ? 1 : 2);
10443 d.perm[i] = ei;
10444 }
10445
10446 switch (which)
10447 {
10448 default:
10449 gcc_unreachable ();
10450
10451 case 3:
10452 d.one_vector_p = false;
10453 if (!rtx_equal_p (op0, op1))
10454 break;
10455
10456 /* The elements of PERM do not suggest that only the first operand
10457 is used, but both operands are identical. Allow easier matching
10458 of the permutation by folding the permutation into the single
10459 input vector. */
10460 /* Fall Through. */
10461 case 2:
10462 for (i = 0; i < nelt; ++i)
10463 d.perm[i] &= nelt - 1;
10464 d.op0 = op1;
10465 d.one_vector_p = true;
10466 break;
10467
10468 case 1:
10469 d.op1 = op0;
10470 d.one_vector_p = true;
10471 break;
10472 }
10473
10474 return aarch64_expand_vec_perm_const_1 (&d);
10475 }
10476
10477 static bool
10478 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10479 const unsigned char *sel)
10480 {
10481 struct expand_vec_perm_d d;
10482 unsigned int i, nelt, which;
10483 bool ret;
10484
10485 d.vmode = vmode;
10486 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10487 d.testing_p = true;
10488 memcpy (d.perm, sel, nelt);
10489
10490 /* Calculate whether all elements are in one vector. */
10491 for (i = which = 0; i < nelt; ++i)
10492 {
10493 unsigned char e = d.perm[i];
10494 gcc_assert (e < 2 * nelt);
10495 which |= (e < nelt ? 1 : 2);
10496 }
10497
10498 /* If all elements are from the second vector, reindex as if from the
10499 first vector. */
10500 if (which == 2)
10501 for (i = 0; i < nelt; ++i)
10502 d.perm[i] -= nelt;
10503
10504 /* Check whether the mask can be applied to a single vector. */
10505 d.one_vector_p = (which != 3);
10506
10507 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10508 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10509 if (!d.one_vector_p)
10510 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10511
10512 start_sequence ();
10513 ret = aarch64_expand_vec_perm_const_1 (&d);
10514 end_sequence ();
10515
10516 return ret;
10517 }
10518
10519 rtx
10520 aarch64_reverse_mask (enum machine_mode mode)
10521 {
10522 /* We have to reverse each vector because we don't have
10523 a permuted load that can reverse-load according to ABI rules. */
10524 rtx mask;
10525 rtvec v = rtvec_alloc (16);
10526 int i, j;
10527 int nunits = GET_MODE_NUNITS (mode);
10528 int usize = GET_MODE_UNIT_SIZE (mode);
10529
10530 gcc_assert (BYTES_BIG_ENDIAN);
10531 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10532
10533 for (i = 0; i < nunits; i++)
10534 for (j = 0; j < usize; j++)
10535 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10536 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10537 return force_reg (V16QImode, mask);
10538 }
10539
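/* Illustrative example: for V4SImode (four 4-byte units) the V16QImode
   mask built above is {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12},
   i.e. the bytes of each element are reversed while the elements
   themselves stay in place.  */
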
10540 /* Implement MODES_TIEABLE_P. */
10541
10542 bool
10543 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10544 {
10545 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10546 return true;
10547
10548 /* We specifically want to allow elements of "structure" modes to
10549 be tieable to the structure. This more general condition allows
10550 other rarer situations too. */
10551 if (TARGET_SIMD
10552 && aarch64_vector_mode_p (mode1)
10553 && aarch64_vector_mode_p (mode2))
10554 return true;
10555
10556 return false;
10557 }
10558
10559 /* Return a new RTX holding the result of moving POINTER forward by
10560 AMOUNT bytes. */
10561
10562 static rtx
10563 aarch64_move_pointer (rtx pointer, int amount)
10564 {
10565 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10566
10567 return adjust_automodify_address (pointer, GET_MODE (pointer),
10568 next, amount);
10569 }
10570
10571 /* Return a new RTX holding the result of moving POINTER forward by the
10572 size of the mode it points to. */
10573
10574 static rtx
10575 aarch64_progress_pointer (rtx pointer)
10576 {
10577 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10578
10579 return aarch64_move_pointer (pointer, amount);
10580 }
10581
10582 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10583 MODE bytes. */
10584
10585 static void
10586 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10587 machine_mode mode)
10588 {
10589 rtx reg = gen_reg_rtx (mode);
10590
10591 /* "Cast" the pointers to the correct mode. */
10592 *src = adjust_address (*src, mode, 0);
10593 *dst = adjust_address (*dst, mode, 0);
10594 /* Emit the memcpy. */
10595 emit_move_insn (reg, *src);
10596 emit_move_insn (*dst, reg);
10597 /* Move the pointers forward. */
10598 *src = aarch64_progress_pointer (*src);
10599 *dst = aarch64_progress_pointer (*dst);
10600 }
10601
10602 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10603 we succeed, otherwise return false. */
10604
10605 bool
10606 aarch64_expand_movmem (rtx *operands)
10607 {
10608 unsigned int n;
10609 rtx dst = operands[0];
10610 rtx src = operands[1];
10611 rtx base;
10612 bool speed_p = !optimize_function_for_size_p (cfun);
10613
10614 /* When optimizing for size, give a better estimate of the length of a
10615 memcpy call, but use the default otherwise. */
10616 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10617
10618 /* We can't do anything smart if the amount to copy is not constant. */
10619 if (!CONST_INT_P (operands[2]))
10620 return false;
10621
10622 n = UINTVAL (operands[2]);
10623
10624 /* Try to keep the number of instructions low. For cases below 16 bytes we
10625 need to make at most two moves. For cases above 16 bytes it will be one
10626 move for each 16 byte chunk, then at most two additional moves. */
10627 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10628 return false;
10629
10630 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10631 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10632
10633 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10634 src = adjust_automodify_address (src, VOIDmode, base, 0);
10635
10636 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10637 1-byte chunk. */
10638 if (n < 4)
10639 {
10640 if (n >= 2)
10641 {
10642 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10643 n -= 2;
10644 }
10645
10646 if (n == 1)
10647 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10648
10649 return true;
10650 }
10651
10652 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10653 4-byte chunk, partially overlapping with the previously copied chunk. */
10654 if (n < 8)
10655 {
10656 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10657 n -= 4;
10658 if (n > 0)
10659 {
10660 int move = n - 4;
10661
10662 src = aarch64_move_pointer (src, move);
10663 dst = aarch64_move_pointer (dst, move);
10664 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10665 }
10666 return true;
10667 }
10668
10669 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10670 them, then (if applicable) an 8-byte chunk. */
10671 while (n >= 8)
10672 {
10673 if (n / 16)
10674 {
10675 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10676 n -= 16;
10677 }
10678 else
10679 {
10680 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10681 n -= 8;
10682 }
10683 }
10684
10685 /* Finish the final bytes of the copy. We can always do this in one
10686 instruction. We either copy the exact amount we need, or partially
10687 overlap with the previous chunk we copied and copy 8 bytes. */
10688 if (n == 0)
10689 return true;
10690 else if (n == 1)
10691 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10692 else if (n == 2)
10693 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10694 else if (n == 4)
10695 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10696 else
10697 {
10698 if (n == 3)
10699 {
10700 src = aarch64_move_pointer (src, -1);
10701 dst = aarch64_move_pointer (dst, -1);
10702 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10703 }
10704 else
10705 {
10706 int move = n - 8;
10707
10708 src = aarch64_move_pointer (src, move);
10709 dst = aarch64_move_pointer (dst, move);
10710 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10711 }
10712 }
10713
10714 return true;
10715 }
10716
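/* Illustrative example of the chunking above: for a constant copy of
   15 bytes, the loop emits one 8-byte (DImode) load/store pair for
   bytes 0-7, leaving n == 7; the tail code then moves both pointers
   back by one byte and emits a second, overlapping 8-byte pair
   covering bytes 7-14, for a total of two load/store pairs.  */
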
10717 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10718
10719 static unsigned HOST_WIDE_INT
10720 aarch64_asan_shadow_offset (void)
10721 {
10722 return (HOST_WIDE_INT_1 << 36);
10723 }
10724
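/* Illustrative note: assuming the default ASan shadow granularity of
   8 bytes (shadow scale 3), this offset corresponds to the mapping
   shadow_addr = (addr >> 3) + (1 << 36).  */
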
10725 static bool
10726 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10727 unsigned int align,
10728 enum by_pieces_operation op,
10729 bool speed_p)
10730 {
10731 /* STORE_BY_PIECES can be used when copying a constant string, but
10732 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10733 For now we always fail this and let the move_by_pieces code copy
10734 the string from read-only memory. */
10735 if (op == STORE_BY_PIECES)
10736 return false;
10737
10738 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10739 }
10740
10741 static enum machine_mode
10742 aarch64_code_to_ccmode (enum rtx_code code)
10743 {
10744 switch (code)
10745 {
10746 case NE:
10747 return CC_DNEmode;
10748
10749 case EQ:
10750 return CC_DEQmode;
10751
10752 case LE:
10753 return CC_DLEmode;
10754
10755 case LT:
10756 return CC_DLTmode;
10757
10758 case GE:
10759 return CC_DGEmode;
10760
10761 case GT:
10762 return CC_DGTmode;
10763
10764 case LEU:
10765 return CC_DLEUmode;
10766
10767 case LTU:
10768 return CC_DLTUmode;
10769
10770 case GEU:
10771 return CC_DGEUmode;
10772
10773 case GTU:
10774 return CC_DGTUmode;
10775
10776 default:
10777 return CCmode;
10778 }
10779 }
10780
10781 static rtx
10782 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10783 int code, tree treeop0, tree treeop1)
10784 {
10785 enum machine_mode op_mode, cmp_mode, cc_mode;
10786 rtx op0, op1, cmp, target;
10787 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10788 enum insn_code icode;
10789 struct expand_operand ops[4];
10790
10791 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10792 if (cc_mode == CCmode)
10793 return NULL_RTX;
10794
10795 start_sequence ();
10796 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10797
10798 op_mode = GET_MODE (op0);
10799 if (op_mode == VOIDmode)
10800 op_mode = GET_MODE (op1);
10801
10802 switch (op_mode)
10803 {
10804 case QImode:
10805 case HImode:
10806 case SImode:
10807 cmp_mode = SImode;
10808 icode = CODE_FOR_cmpsi;
10809 break;
10810
10811 case DImode:
10812 cmp_mode = DImode;
10813 icode = CODE_FOR_cmpdi;
10814 break;
10815
10816 default:
10817 end_sequence ();
10818 return NULL_RTX;
10819 }
10820
10821 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10822 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10823 if (!op0 || !op1)
10824 {
10825 end_sequence ();
10826 return NULL_RTX;
10827 }
10828 *prep_seq = get_insns ();
10829 end_sequence ();
10830
10831 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10832 target = gen_rtx_REG (CCmode, CC_REGNUM);
10833
10834 create_output_operand (&ops[0], target, CCmode);
10835 create_fixed_operand (&ops[1], cmp);
10836 create_fixed_operand (&ops[2], op0);
10837 create_fixed_operand (&ops[3], op1);
10838
10839 start_sequence ();
10840 if (!maybe_expand_insn (icode, 4, ops))
10841 {
10842 end_sequence ();
10843 return NULL_RTX;
10844 }
10845 *gen_seq = get_insns ();
10846 end_sequence ();
10847
10848 return gen_rtx_REG (cc_mode, CC_REGNUM);
10849 }
10850
10851 static rtx
10852 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10853 tree treeop0, tree treeop1, int bit_code)
10854 {
10855 rtx op0, op1, cmp0, cmp1, target;
10856 enum machine_mode op_mode, cmp_mode, cc_mode;
10857 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10858 enum insn_code icode = CODE_FOR_ccmp_andsi;
10859 struct expand_operand ops[6];
10860
10861 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10862 if (cc_mode == CCmode)
10863 return NULL_RTX;
10864
10865 push_to_sequence ((rtx_insn*) *prep_seq);
10866 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10867
10868 op_mode = GET_MODE (op0);
10869 if (op_mode == VOIDmode)
10870 op_mode = GET_MODE (op1);
10871
10872 switch (op_mode)
10873 {
10874 case QImode:
10875 case HImode:
10876 case SImode:
10877 cmp_mode = SImode;
10878 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10879 : CODE_FOR_ccmp_iorsi;
10880 break;
10881
10882 case DImode:
10883 cmp_mode = DImode;
10884 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10885 : CODE_FOR_ccmp_iordi;
10886 break;
10887
10888 default:
10889 end_sequence ();
10890 return NULL_RTX;
10891 }
10892
10893 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10894 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10895 if (!op0 || !op1)
10896 {
10897 end_sequence ();
10898 return NULL_RTX;
10899 }
10900 *prep_seq = get_insns ();
10901 end_sequence ();
10902
10903 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10904 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10905 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10906
10907 create_fixed_operand (&ops[0], prev);
10908 create_fixed_operand (&ops[1], target);
10909 create_fixed_operand (&ops[2], op0);
10910 create_fixed_operand (&ops[3], op1);
10911 create_fixed_operand (&ops[4], cmp0);
10912 create_fixed_operand (&ops[5], cmp1);
10913
10914 push_to_sequence ((rtx_insn*) *gen_seq);
10915 if (!maybe_expand_insn (icode, 6, ops))
10916 {
10917 end_sequence ();
10918 return NULL_RTX;
10919 }
10920
10921 *gen_seq = get_insns ();
10922 end_sequence ();
10923
10924 return target;
10925 }
10926
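/* Illustrative example of the conditional-compare expansion: a test
   such as (a == 0 && b == 3) can be emitted roughly as

     cmp  w0, #0
     ccmp w1, #3, #0, eq
     b.eq <target>

   where the first comparison comes from aarch64_gen_ccmp_first and the
   CCMP from aarch64_gen_ccmp_next; the #0 NZCV immediate makes the
   final EQ test fail when the first comparison already failed.  The
   exact registers and immediates are only for illustration.  */
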
10927 #undef TARGET_GEN_CCMP_FIRST
10928 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10929
10930 #undef TARGET_GEN_CCMP_NEXT
10931 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10932
10933 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10934 instruction fusion of some sort. */
10935
10936 static bool
10937 aarch64_macro_fusion_p (void)
10938 {
10939 return aarch64_tune_params->fusible_ops != AARCH64_FUSE_NOTHING;
10940 }
10941
10942
10943 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10944 should be kept together during scheduling. */
10945
10946 static bool
10947 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10948 {
10949 rtx set_dest;
10950 rtx prev_set = single_set (prev);
10951 rtx curr_set = single_set (curr);
10952 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10953 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10954
10955 if (!aarch64_macro_fusion_p ())
10956 return false;
10957
10958 if (simple_sets_p
10959 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOV_MOVK))
10960 {
10961 /* We are trying to match:
10962 prev (mov) == (set (reg r0) (const_int imm16))
10963 curr (movk) == (set (zero_extract (reg r0)
10964 (const_int 16)
10965 (const_int 16))
10966 (const_int imm16_1)) */
10967
10968 set_dest = SET_DEST (curr_set);
10969
10970 if (GET_CODE (set_dest) == ZERO_EXTRACT
10971 && CONST_INT_P (SET_SRC (curr_set))
10972 && CONST_INT_P (SET_SRC (prev_set))
10973 && CONST_INT_P (XEXP (set_dest, 2))
10974 && INTVAL (XEXP (set_dest, 2)) == 16
10975 && REG_P (XEXP (set_dest, 0))
10976 && REG_P (SET_DEST (prev_set))
10977 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10978 {
10979 return true;
10980 }
10981 }
10982
10983 if (simple_sets_p
10984 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_ADD))
10985 {
10986
10987 /* We're trying to match:
10988 prev (adrp) == (set (reg r1)
10989 (high (symbol_ref ("SYM"))))
10990 curr (add) == (set (reg r0)
10991 (lo_sum (reg r1)
10992 (symbol_ref ("SYM"))))
10993 Note that r0 need not necessarily be the same as r1, especially
10994 during pre-regalloc scheduling. */
10995
10996 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10997 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10998 {
10999 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11000 && REG_P (XEXP (SET_SRC (curr_set), 0))
11001 && REGNO (XEXP (SET_SRC (curr_set), 0))
11002 == REGNO (SET_DEST (prev_set))
11003 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11004 XEXP (SET_SRC (curr_set), 1)))
11005 return true;
11006 }
11007 }
11008
11009 if (simple_sets_p
11010 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11011 {
11012
11013 /* We're trying to match:
11014 prev (movk) == (set (zero_extract (reg r0)
11015 (const_int 16)
11016 (const_int 32))
11017 (const_int imm16_1))
11018 curr (movk) == (set (zero_extract (reg r0)
11019 (const_int 16)
11020 (const_int 48))
11021 (const_int imm16_2)) */
11022
11023 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11024 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11025 && REG_P (XEXP (SET_DEST (prev_set), 0))
11026 && REG_P (XEXP (SET_DEST (curr_set), 0))
11027 && REGNO (XEXP (SET_DEST (prev_set), 0))
11028 == REGNO (XEXP (SET_DEST (curr_set), 0))
11029 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11030 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11031 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11032 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11033 && CONST_INT_P (SET_SRC (prev_set))
11034 && CONST_INT_P (SET_SRC (curr_set)))
11035 return true;
11036
11037 }
11038 if (simple_sets_p
11039 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_LDR))
11040 {
11041 /* We're trying to match:
11042 prev (adrp) == (set (reg r0)
11043 (high (symbol_ref ("SYM"))))
11044 curr (ldr) == (set (reg r1)
11045 (mem (lo_sum (reg r0)
11046 (symbol_ref ("SYM")))))
11047 or
11048 curr (ldr) == (set (reg r1)
11049 (zero_extend (mem
11050 (lo_sum (reg r0)
11051 (symbol_ref ("SYM")))))) */
11052 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11053 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11054 {
11055 rtx curr_src = SET_SRC (curr_set);
11056
11057 if (GET_CODE (curr_src) == ZERO_EXTEND)
11058 curr_src = XEXP (curr_src, 0);
11059
11060 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11061 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11062 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11063 == REGNO (SET_DEST (prev_set))
11064 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11065 XEXP (SET_SRC (prev_set), 0)))
11066 return true;
11067 }
11068 }
11069
11070 if ((aarch64_tune_params->fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11071 && any_condjump_p (curr))
11072 {
11073 enum attr_type prev_type = get_attr_type (prev);
11074
11075 /* FIXME: this misses some instructions which are considered simple
11076 arithmetic by ThunderX. Simple shifts are also missed here. */
11077 if (prev_type == TYPE_ALUS_SREG
11078 || prev_type == TYPE_ALUS_IMM
11079 || prev_type == TYPE_LOGICS_REG
11080 || prev_type == TYPE_LOGICS_IMM)
11081 return true;
11082 }
11083
11084 return false;
11085 }
11086
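/* Illustrative instruction pairs matched above (the assembly is only
   for illustration):

     mov/movk:   mov  x0, #0x1234
                 movk x0, #0x5678, lsl #16
     adrp/add:   adrp x1, sym
                 add  x0, x1, :lo12:sym
     movk/movk:  movk x0, #0x1234, lsl #32
                 movk x0, #0x5678, lsl #48
     adrp/ldr:   adrp x0, sym
                 ldr  w1, [x0, #:lo12:sym]
     cmp/branch: cmp  w0, #0
                 b.ne label  */
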
11087 /* If MEM is in the form of [base+offset], extract the two parts
11088 of the address and store them in BASE and OFFSET; otherwise return false
11089 after clearing BASE and OFFSET. */
11090
11091 bool
11092 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11093 {
11094 rtx addr;
11095
11096 gcc_assert (MEM_P (mem));
11097
11098 addr = XEXP (mem, 0);
11099
11100 if (REG_P (addr))
11101 {
11102 *base = addr;
11103 *offset = const0_rtx;
11104 return true;
11105 }
11106
11107 if (GET_CODE (addr) == PLUS
11108 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11109 {
11110 *base = XEXP (addr, 0);
11111 *offset = XEXP (addr, 1);
11112 return true;
11113 }
11114
11115 *base = NULL_RTX;
11116 *offset = NULL_RTX;
11117
11118 return false;
11119 }
11120
11121 /* Types for scheduling fusion. */
11122 enum sched_fusion_type
11123 {
11124 SCHED_FUSION_NONE = 0,
11125 SCHED_FUSION_LD_SIGN_EXTEND,
11126 SCHED_FUSION_LD_ZERO_EXTEND,
11127 SCHED_FUSION_LD,
11128 SCHED_FUSION_ST,
11129 SCHED_FUSION_NUM
11130 };
11131
11132 /* If INSN is a load or store whose address is of the form [base+offset],
11133 extract the two parts and store them in BASE and OFFSET. Return the
11134 scheduling fusion type of this INSN. */
11135
11136 static enum sched_fusion_type
11137 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11138 {
11139 rtx x, dest, src;
11140 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11141
11142 gcc_assert (INSN_P (insn));
11143 x = PATTERN (insn);
11144 if (GET_CODE (x) != SET)
11145 return SCHED_FUSION_NONE;
11146
11147 src = SET_SRC (x);
11148 dest = SET_DEST (x);
11149
11150 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11151 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11152 return SCHED_FUSION_NONE;
11153
11154 if (GET_CODE (src) == SIGN_EXTEND)
11155 {
11156 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11157 src = XEXP (src, 0);
11158 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11159 return SCHED_FUSION_NONE;
11160 }
11161 else if (GET_CODE (src) == ZERO_EXTEND)
11162 {
11163 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11164 src = XEXP (src, 0);
11165 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11166 return SCHED_FUSION_NONE;
11167 }
11168
11169 if (GET_CODE (src) == MEM && REG_P (dest))
11170 extract_base_offset_in_addr (src, base, offset);
11171 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11172 {
11173 fusion = SCHED_FUSION_ST;
11174 extract_base_offset_in_addr (dest, base, offset);
11175 }
11176 else
11177 return SCHED_FUSION_NONE;
11178
11179 if (*base == NULL_RTX || *offset == NULL_RTX)
11180 fusion = SCHED_FUSION_NONE;
11181
11182 return fusion;
11183 }
11184
11185 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11186
11187 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11188 and PRI are only calculated for these instructions. For other instructions,
11189 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
11190 other instruction types can be added by returning different priorities.
11191
11192 It's important that irrelevant instructions get the largest FUSION_PRI. */
11193
11194 static void
11195 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11196 int *fusion_pri, int *pri)
11197 {
11198 int tmp, off_val;
11199 rtx base, offset;
11200 enum sched_fusion_type fusion;
11201
11202 gcc_assert (INSN_P (insn));
11203
11204 tmp = max_pri - 1;
11205 fusion = fusion_load_store (insn, &base, &offset);
11206 if (fusion == SCHED_FUSION_NONE)
11207 {
11208 *pri = tmp;
11209 *fusion_pri = tmp;
11210 return;
11211 }
11212
11213 /* Set FUSION_PRI according to fusion type and base register. */
11214 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11215
11216 /* Calculate PRI. */
11217 tmp /= 2;
11218
11219 /* INSN with smaller offset goes first. */
11220 off_val = (int)(INTVAL (offset));
11221 if (off_val >= 0)
11222 tmp -= (off_val & 0xfffff);
11223 else
11224 tmp += ((- off_val) & 0xfffff);
11225
11226 *pri = tmp;
11227 return;
11228 }
11229
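/* Illustrative example: two SImode loads from the same base register
   at offsets 4 and 8 get the same FUSION_PRI (same fusion type and
   same base), while the load at offset 4 gets the larger PRI and is
   therefore placed first, which lets the pair be merged into an LDP
   later.  */
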
11230 /* Given OPERANDS of consecutive load/store, check if we can merge
11231 them into ldp/stp. LOAD is true if they are load instructions.
11232 MODE is the mode of memory operands. */
11233
11234 bool
11235 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11236 enum machine_mode mode)
11237 {
11238 HOST_WIDE_INT offval_1, offval_2, msize;
11239 enum reg_class rclass_1, rclass_2;
11240 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11241
11242 if (load)
11243 {
11244 mem_1 = operands[1];
11245 mem_2 = operands[3];
11246 reg_1 = operands[0];
11247 reg_2 = operands[2];
11248 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11249 if (REGNO (reg_1) == REGNO (reg_2))
11250 return false;
11251 }
11252 else
11253 {
11254 mem_1 = operands[0];
11255 mem_2 = operands[2];
11256 reg_1 = operands[1];
11257 reg_2 = operands[3];
11258 }
11259
11260 /* The mems cannot be volatile. */
11261 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11262 return false;
11263
11264 /* Check if the addresses are in the form of [base+offset]. */
11265 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11266 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11267 return false;
11268 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11269 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11270 return false;
11271
11272 /* Check if the bases are the same. */
11273 if (!rtx_equal_p (base_1, base_2))
11274 return false;
11275
11276 offval_1 = INTVAL (offset_1);
11277 offval_2 = INTVAL (offset_2);
11278 msize = GET_MODE_SIZE (mode);
11279 /* Check if the offsets are consecutive. */
11280 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11281 return false;
11282
11283 /* Check if the addresses are clobbered by load. */
11284 if (load)
11285 {
11286 if (reg_mentioned_p (reg_1, mem_1))
11287 return false;
11288
11289 /* In increasing order, the last load can clobber the address. */
11290 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11291 return false;
11292 }
11293
11294 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11295 rclass_1 = FP_REGS;
11296 else
11297 rclass_1 = GENERAL_REGS;
11298
11299 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11300 rclass_2 = FP_REGS;
11301 else
11302 rclass_2 = GENERAL_REGS;
11303
11304 /* Check if the registers are of the same class. */
11305 if (rclass_1 != rclass_2)
11306 return false;
11307
11308 return true;
11309 }
11310
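/* Illustrative example: the consecutive loads

     ldr w0, [x2]
     ldr w1, [x2, #4]

   satisfy the checks above (same base, consecutive SImode offsets,
   same register class) and can be merged into "ldp w0, w1, [x2]".
   They would be rejected if, for example, the first destination also
   appeared in its own address, or if one destination were an FP
   register while the other was a general register.  */
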
11311 /* Given OPERANDS of consecutive load/store, check if we can merge
11312 them into ldp/stp by adjusting the offset. LOAD is true if they
11313 are load instructions. MODE is the mode of memory operands.
11314
11315 Given the following consecutive stores:
11316
11317 str w1, [xb, 0x100]
11318 str w1, [xb, 0x104]
11319 str w1, [xb, 0x108]
11320 str w1, [xb, 0x10c]
11321
11322 Though the offsets are out of the range supported by stp, we can
11323 still pair them after adjusting the offset, like:
11324
11325 add scratch, xb, 0x100
11326 stp w1, w1, [scratch]
11327 stp w1, w1, [scratch, 0x8]
11328
11329 The peephole patterns detecting this opportunity should guarantee
11330 the scratch register is available. */
11331
11332 bool
11333 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11334 enum machine_mode mode)
11335 {
11336 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11337 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11338 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11339 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11340
11341 if (load)
11342 {
11343 reg_1 = operands[0];
11344 mem_1 = operands[1];
11345 reg_2 = operands[2];
11346 mem_2 = operands[3];
11347 reg_3 = operands[4];
11348 mem_3 = operands[5];
11349 reg_4 = operands[6];
11350 mem_4 = operands[7];
11351 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11352 && REG_P (reg_3) && REG_P (reg_4));
11353 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11354 return false;
11355 }
11356 else
11357 {
11358 mem_1 = operands[0];
11359 reg_1 = operands[1];
11360 mem_2 = operands[2];
11361 reg_2 = operands[3];
11362 mem_3 = operands[4];
11363 reg_3 = operands[5];
11364 mem_4 = operands[6];
11365 reg_4 = operands[7];
11366 }
11367 /* Skip if the memory operand is by itself valid for ldp/stp. */
11368 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11369 return false;
11370
11371 /* The mems cannot be volatile. */
11372 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11373 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11374 return false;
11375
11376 /* Check if the addresses are in the form of [base+offset]. */
11377 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11378 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11379 return false;
11380 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11381 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11382 return false;
11383 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11384 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11385 return false;
11386 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11387 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11388 return false;
11389
11390 /* Check if the bases are the same. */
11391 if (!rtx_equal_p (base_1, base_2)
11392 || !rtx_equal_p (base_2, base_3)
11393 || !rtx_equal_p (base_3, base_4))
11394 return false;
11395
11396 offval_1 = INTVAL (offset_1);
11397 offval_2 = INTVAL (offset_2);
11398 offval_3 = INTVAL (offset_3);
11399 offval_4 = INTVAL (offset_4);
11400 msize = GET_MODE_SIZE (mode);
11401 /* Check if the offsets are consecutive. */
11402 if ((offval_1 != (offval_2 + msize)
11403 || offval_1 != (offval_3 + msize * 2)
11404 || offval_1 != (offval_4 + msize * 3))
11405 && (offval_4 != (offval_3 + msize)
11406 || offval_4 != (offval_2 + msize * 2)
11407 || offval_4 != (offval_1 + msize * 3)))
11408 return false;
11409
11410 /* Check if the addresses are clobbered by load. */
11411 if (load)
11412 {
11413 if (reg_mentioned_p (reg_1, mem_1)
11414 || reg_mentioned_p (reg_2, mem_2)
11415 || reg_mentioned_p (reg_3, mem_3))
11416 return false;
11417
11418 /* In increasing order, the last load can clobber the address. */
11419 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11420 return false;
11421 }
11422
11423 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11424 rclass_1 = FP_REGS;
11425 else
11426 rclass_1 = GENERAL_REGS;
11427
11428 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11429 rclass_2 = FP_REGS;
11430 else
11431 rclass_2 = GENERAL_REGS;
11432
11433 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11434 rclass_3 = FP_REGS;
11435 else
11436 rclass_3 = GENERAL_REGS;
11437
11438 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11439 rclass_4 = FP_REGS;
11440 else
11441 rclass_4 = GENERAL_REGS;
11442
11443 /* Check if the registers are of the same class. */
11444 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11445 return false;
11446
11447 return true;
11448 }
11449
11450 /* Given OPERANDS of consecutive load/store, this function pairs them
11451 into ldp/stp after adjusting the offset. It depends on the fact
11452 that addresses of load/store instructions are in increasing order.
11453 MODE is the mode of memory operands. CODE is the rtl operator
11454 which should be applied to all memory operands; it is SIGN_EXTEND,
11455 ZERO_EXTEND or UNKNOWN. */
11456
11457 bool
11458 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11459 enum machine_mode mode, RTX_CODE code)
11460 {
11461 rtx base, offset, t1, t2;
11462 rtx mem_1, mem_2, mem_3, mem_4;
11463 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11464
11465 if (load)
11466 {
11467 mem_1 = operands[1];
11468 mem_2 = operands[3];
11469 mem_3 = operands[5];
11470 mem_4 = operands[7];
11471 }
11472 else
11473 {
11474 mem_1 = operands[0];
11475 mem_2 = operands[2];
11476 mem_3 = operands[4];
11477 mem_4 = operands[6];
11478 gcc_assert (code == UNKNOWN);
11479 }
11480
11481 extract_base_offset_in_addr (mem_1, &base, &offset);
11482 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11483
11484 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11485 msize = GET_MODE_SIZE (mode);
11486 stp_off_limit = msize * 0x40;
11487 off_val = INTVAL (offset);
11488 abs_off = (off_val < 0) ? -off_val : off_val;
11489 new_off = abs_off % stp_off_limit;
11490 adj_off = abs_off - new_off;
11491
11492 /* Further adjust to make sure all offsets are OK. */
11493 if ((new_off + msize * 2) >= stp_off_limit)
11494 {
11495 adj_off += stp_off_limit;
11496 new_off -= stp_off_limit;
11497 }
11498
11499 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11500 if (adj_off >= 0x1000)
11501 return false;
11502
11503 if (off_val < 0)
11504 {
11505 adj_off = -adj_off;
11506 new_off = -new_off;
11507 }
11508
11509 /* Create new memory references. */
11510 mem_1 = change_address (mem_1, VOIDmode,
11511 plus_constant (DImode, operands[8], new_off));
11512
11513 /* Check if the adjusted address is OK for ldp/stp. */
11514 if (!aarch64_mem_pair_operand (mem_1, mode))
11515 return false;
11516
11517 msize = GET_MODE_SIZE (mode);
11518 mem_2 = change_address (mem_2, VOIDmode,
11519 plus_constant (DImode,
11520 operands[8],
11521 new_off + msize));
11522 mem_3 = change_address (mem_3, VOIDmode,
11523 plus_constant (DImode,
11524 operands[8],
11525 new_off + msize * 2));
11526 mem_4 = change_address (mem_4, VOIDmode,
11527 plus_constant (DImode,
11528 operands[8],
11529 new_off + msize * 3));
11530
11531 if (code == ZERO_EXTEND)
11532 {
11533 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11534 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11535 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11536 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11537 }
11538 else if (code == SIGN_EXTEND)
11539 {
11540 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11541 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11542 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11543 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11544 }
11545
11546 if (load)
11547 {
11548 operands[1] = mem_1;
11549 operands[3] = mem_2;
11550 operands[5] = mem_3;
11551 operands[7] = mem_4;
11552 }
11553 else
11554 {
11555 operands[0] = mem_1;
11556 operands[2] = mem_2;
11557 operands[4] = mem_3;
11558 operands[6] = mem_4;
11559 }
11560
11561 /* Emit adjusting instruction. */
11562 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11563 /* Emit ldp/stp instructions. */
11564 t1 = gen_rtx_SET (operands[0], operands[1]);
11565 t2 = gen_rtx_SET (operands[2], operands[3]);
11566 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11567 t1 = gen_rtx_SET (operands[4], operands[5]);
11568 t2 = gen_rtx_SET (operands[6], operands[7]);
11569 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11570 return true;
11571 }
11572
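/* Illustrative example of the offset adjustment above: for SImode
   (msize == 4) the limit is 4 * 0x40 == 256, so four stores at
   base + 0x108 ... base + 0x114 give new_off == 8 and adj_off == 256;
   the emitted sequence is then roughly

     add  scratch, base, #256
     stp  w?, w?, [scratch, #8]
     stp  w?, w?, [scratch, #16]

   with the scratch register supplied by operands[8].  */
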
11573 #undef TARGET_ADDRESS_COST
11574 #define TARGET_ADDRESS_COST aarch64_address_cost
11575
11576 /* This hook determines whether unnamed bitfields affect the alignment
11577 of the containing structure. The hook returns true if the structure
11578 should inherit the alignment requirements of an unnamed bitfield's
11579 type. */
11580 #undef TARGET_ALIGN_ANON_BITFIELD
11581 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11582
11583 #undef TARGET_ASM_ALIGNED_DI_OP
11584 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11585
11586 #undef TARGET_ASM_ALIGNED_HI_OP
11587 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11588
11589 #undef TARGET_ASM_ALIGNED_SI_OP
11590 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11591
11592 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11593 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11594 hook_bool_const_tree_hwi_hwi_const_tree_true
11595
11596 #undef TARGET_ASM_FILE_START
11597 #define TARGET_ASM_FILE_START aarch64_start_file
11598
11599 #undef TARGET_ASM_OUTPUT_MI_THUNK
11600 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11601
11602 #undef TARGET_ASM_SELECT_RTX_SECTION
11603 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11604
11605 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11606 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11607
11608 #undef TARGET_BUILD_BUILTIN_VA_LIST
11609 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11610
11611 #undef TARGET_CALLEE_COPIES
11612 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11613
11614 #undef TARGET_CAN_ELIMINATE
11615 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11616
11617 #undef TARGET_CANNOT_FORCE_CONST_MEM
11618 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11619
11620 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11621 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11622
11623 /* Only the least significant bit is used for initialization guard
11624 variables. */
11625 #undef TARGET_CXX_GUARD_MASK_BIT
11626 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11627
11628 #undef TARGET_C_MODE_FOR_SUFFIX
11629 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11630
11631 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11632 #undef TARGET_DEFAULT_TARGET_FLAGS
11633 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11634 #endif
11635
11636 #undef TARGET_CLASS_MAX_NREGS
11637 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11638
11639 #undef TARGET_BUILTIN_DECL
11640 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11641
11642 #undef TARGET_EXPAND_BUILTIN
11643 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11644
11645 #undef TARGET_EXPAND_BUILTIN_VA_START
11646 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11647
11648 #undef TARGET_FOLD_BUILTIN
11649 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11650
11651 #undef TARGET_FUNCTION_ARG
11652 #define TARGET_FUNCTION_ARG aarch64_function_arg
11653
11654 #undef TARGET_FUNCTION_ARG_ADVANCE
11655 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11656
11657 #undef TARGET_FUNCTION_ARG_BOUNDARY
11658 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11659
11660 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11661 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11662
11663 #undef TARGET_FUNCTION_VALUE
11664 #define TARGET_FUNCTION_VALUE aarch64_function_value
11665
11666 #undef TARGET_FUNCTION_VALUE_REGNO_P
11667 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11668
11669 #undef TARGET_FRAME_POINTER_REQUIRED
11670 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11671
11672 #undef TARGET_GIMPLE_FOLD_BUILTIN
11673 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11674
11675 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11676 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11677
11678 #undef TARGET_INIT_BUILTINS
11679 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11680
11681 #undef TARGET_LEGITIMATE_ADDRESS_P
11682 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11683
11684 #undef TARGET_LEGITIMATE_CONSTANT_P
11685 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11686
11687 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11688 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11689
11690 #undef TARGET_LRA_P
11691 #define TARGET_LRA_P hook_bool_void_true
11692
11693 #undef TARGET_MANGLE_TYPE
11694 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11695
11696 #undef TARGET_MEMORY_MOVE_COST
11697 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11698
11699 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11700 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11701
11702 #undef TARGET_MUST_PASS_IN_STACK
11703 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11704
11705 /* This target hook should return true if accesses to volatile bitfields
11706 should use the narrowest mode possible. It should return false if these
11707 accesses should use the bitfield container type. */
11708 #undef TARGET_NARROW_VOLATILE_BITFIELD
11709 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11710
11711 #undef TARGET_OPTION_OVERRIDE
11712 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11713
11714 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11715 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11716 aarch64_override_options_after_change
11717
11718 #undef TARGET_PASS_BY_REFERENCE
11719 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11720
11721 #undef TARGET_PREFERRED_RELOAD_CLASS
11722 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11723
11724 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11725 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11726
11727 #undef TARGET_SECONDARY_RELOAD
11728 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11729
11730 #undef TARGET_SHIFT_TRUNCATION_MASK
11731 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11732
11733 #undef TARGET_SETUP_INCOMING_VARARGS
11734 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11735
11736 #undef TARGET_STRUCT_VALUE_RTX
11737 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11738
11739 #undef TARGET_REGISTER_MOVE_COST
11740 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11741
11742 #undef TARGET_RETURN_IN_MEMORY
11743 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11744
11745 #undef TARGET_RETURN_IN_MSB
11746 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11747
11748 #undef TARGET_RTX_COSTS
11749 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11750
11751 #undef TARGET_SCHED_ISSUE_RATE
11752 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11753
11754 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11755 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11756 aarch64_sched_first_cycle_multipass_dfa_lookahead
11757
11758 #undef TARGET_TRAMPOLINE_INIT
11759 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11760
11761 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11762 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11763
11764 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11765 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11766
11767 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11768 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11769
11770 #undef TARGET_VECTORIZE_ADD_STMT_COST
11771 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11772
11773 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11774 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11775 aarch64_builtin_vectorization_cost
11776
11777 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11778 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11779
11780 #undef TARGET_VECTORIZE_BUILTINS
11781 #define TARGET_VECTORIZE_BUILTINS
11782
11783 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11784 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11785 aarch64_builtin_vectorized_function
11786
11787 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11788 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11789 aarch64_autovectorize_vector_sizes
11790
11791 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11792 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11793 aarch64_atomic_assign_expand_fenv
11794
11795 /* Section anchor support. */
11796
11797 #undef TARGET_MIN_ANCHOR_OFFSET
11798 #define TARGET_MIN_ANCHOR_OFFSET -256
11799
11800 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11801 byte offset; we can do much more for larger data types, but have no way
11802 to determine the size of the access. We assume accesses are aligned. */
11803 #undef TARGET_MAX_ANCHOR_OFFSET
11804 #define TARGET_MAX_ANCHOR_OFFSET 4095
11805
11806 #undef TARGET_VECTOR_ALIGNMENT
11807 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11808
11809 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11810 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11811 aarch64_simd_vector_alignment_reachable
11812
11813 /* vec_perm support. */
11814
11815 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11816 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11817 aarch64_vectorize_vec_perm_const_ok
11818
11819
11820 #undef TARGET_FIXED_CONDITION_CODE_REGS
11821 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11822
11823 #undef TARGET_FLAGS_REGNUM
11824 #define TARGET_FLAGS_REGNUM CC_REGNUM
11825
11826 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11827 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11828
11829 #undef TARGET_ASAN_SHADOW_OFFSET
11830 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11831
11832 #undef TARGET_LEGITIMIZE_ADDRESS
11833 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11834
11835 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11836 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11837 aarch64_use_by_pieces_infrastructure_p
11838
11839 #undef TARGET_CAN_USE_DOLOOP_P
11840 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11841
11842 #undef TARGET_SCHED_MACRO_FUSION_P
11843 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11844
11845 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11846 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11847
11848 #undef TARGET_SCHED_FUSION_PRIORITY
11849 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11850
11851 struct gcc_target targetm = TARGET_INITIALIZER;
11852
11853 #include "gt-aarch64.h"