[Patch AArch64 4/4] Add -moverride tuning command, and wire it up for
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "alias.h"
29 #include "symtab.h"
30 #include "tree.h"
31 #include "fold-const.h"
32 #include "stringpool.h"
33 #include "stor-layout.h"
34 #include "calls.h"
35 #include "varasm.h"
36 #include "regs.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "predict.h"
45 #include "basic-block.h"
46 #include "df.h"
47 #include "hard-reg-set.h"
48 #include "output.h"
49 #include "function.h"
50 #include "flags.h"
51 #include "insn-config.h"
52 #include "expmed.h"
53 #include "dojump.h"
54 #include "explow.h"
55 #include "emit-rtl.h"
56 #include "stmt.h"
57 #include "expr.h"
58 #include "reload.h"
59 #include "toplev.h"
60 #include "target.h"
61 #include "targhooks.h"
62 #include "tm_p.h"
63 #include "recog.h"
64 #include "langhooks.h"
65 #include "diagnostic-core.h"
66 #include "tree-ssa-alias.h"
67 #include "internal-fn.h"
68 #include "gimple-fold.h"
69 #include "tree-eh.h"
70 #include "gimple-expr.h"
71 #include "gimple.h"
72 #include "gimplify.h"
73 #include "optabs.h"
74 #include "dwarf2.h"
75 #include "cfgloop.h"
76 #include "tree-vectorizer.h"
77 #include "aarch64-cost-tables.h"
78 #include "dumpfile.h"
79 #include "builtins.h"
80 #include "rtl-iter.h"
81 #include "tm-constrs.h"
82 #include "sched-int.h"
83 #include "cortex-a57-fma-steering.h"
84
85 /* This file should be included last. */
86 #include "target-def.h"
87
88 /* Defined for convenience. */
89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
90
91 /* Classifies an address.
92
93 ADDRESS_REG_IMM
94 A simple base register plus immediate offset.
95
96 ADDRESS_REG_WB
97 A base register indexed by immediate offset with writeback.
98
99 ADDRESS_REG_REG
100 A base register indexed by (optionally scaled) register.
101
102 ADDRESS_REG_UXTW
103 A base register indexed by (optionally scaled) zero-extended register.
104
105 ADDRESS_REG_SXTW
106 A base register indexed by (optionally scaled) sign-extended register.
107
108 ADDRESS_LO_SUM
109 A LO_SUM rtx with a base register and "LO12" symbol relocation.
110
111 ADDRESS_SYMBOLIC
112 A constant symbolic address, in the pc-relative literal pool. */
113
114 enum aarch64_address_type {
115 ADDRESS_REG_IMM,
116 ADDRESS_REG_WB,
117 ADDRESS_REG_REG,
118 ADDRESS_REG_UXTW,
119 ADDRESS_REG_SXTW,
120 ADDRESS_LO_SUM,
121 ADDRESS_SYMBOLIC
122 };
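/* As a rough illustration (the exact classification also depends on the
   mode and on legitimization), these assembly forms correspond to the
   classes above:

     ldr x0, [x1, #16]             ADDRESS_REG_IMM
     ldr x0, [x1], #16             ADDRESS_REG_WB (post-index writeback)
     ldr x0, [x1, x2, lsl #3]      ADDRESS_REG_REG
     ldr x0, [x1, w2, uxtw #3]     ADDRESS_REG_UXTW
     ldr x0, [x1, w2, sxtw #3]     ADDRESS_REG_SXTW
     ldr x0, [x1, #:lo12:sym]      ADDRESS_LO_SUM
     ldr x0, <pc-relative literal> ADDRESS_SYMBOLIC  */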
123
124 struct aarch64_address_info {
125 enum aarch64_address_type type;
126 rtx base;
127 rtx offset;
128 int shift;
129 enum aarch64_symbol_type symbol_type;
130 };
131
132 struct simd_immediate_info
133 {
134 rtx value;
135 int shift;
136 int element_width;
137 bool mvn;
138 bool msl;
139 };
140
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
143
144 #ifdef HAVE_AS_TLS
145 #undef TARGET_HAVE_TLS
146 #define TARGET_HAVE_TLS 1
147 #endif
148
149 static bool aarch64_composite_type_p (const_tree, machine_mode);
150 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
151 const_tree,
152 machine_mode *, int *,
153 bool *);
154 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
155 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
156 static void aarch64_override_options_after_change (void);
157 static bool aarch64_vector_mode_supported_p (machine_mode);
158 static unsigned bit_count (unsigned HOST_WIDE_INT);
159 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
160 const unsigned char *sel);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162
163 /* Major revision number of the ARM Architecture implemented by the target. */
164 unsigned aarch64_architecture_version;
165
166 /* The processor for which instructions should be scheduled. */
167 enum aarch64_processor aarch64_tune = cortexa53;
168
169 /* Mask to specify which instructions we are allowed to generate. */
170 unsigned long aarch64_isa_flags = 0;
171
172 /* Mask to specify which instruction scheduling options should be used. */
173 unsigned long aarch64_tune_flags = 0;
174
175 /* Support for command line parsing of boolean flags in the tuning
176 structures. */
177 struct aarch64_flag_desc
178 {
179 const char* name;
180 unsigned int flag;
181 };
182
183 #define AARCH64_FUSION_PAIR(name, internal_name, y) \
184 { name, AARCH64_FUSE_##internal_name },
185 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
186 {
187 { "none", AARCH64_FUSE_NOTHING },
188 #include "aarch64-fusion-pairs.def"
189 { "all", AARCH64_FUSE_ALL },
190 { NULL, AARCH64_FUSE_NOTHING }
191 };
192 #undef AARCH64_FUSION_PAIR
193
194 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name, y) \
195 { name, AARCH64_EXTRA_TUNE_##internal_name },
196 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
197 {
198 { "none", AARCH64_EXTRA_TUNE_NONE },
199 #include "aarch64-tuning-flags.def"
200 { "all", AARCH64_EXTRA_TUNE_ALL },
201 { NULL, AARCH64_EXTRA_TUNE_NONE }
202 };
203 #undef AARCH64_EXTRA_TUNING_OPTION
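/* A minimal sketch of how a table such as aarch64_fusible_pairs or
   aarch64_tuning_flags can be consumed when decoding one name from an
   -moverride value.  The helper name is invented for illustration; the
   real parsers for the "fuse" and "tune" categories appear later in this
   file and handle separators and error reporting differently.

     static unsigned int
     aarch64_lookup_flag (const struct aarch64_flag_desc *table,
                          const char *name)
     {
       for (; table->name != NULL; table++)
         if (strcmp (name, table->name) == 0)
           return table->flag;
       return 0;
     }
*/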
204
205 /* Tuning parameters. */
206
207 static const struct cpu_addrcost_table generic_addrcost_table =
208 {
209 {
210 0, /* hi */
211 0, /* si */
212 0, /* di */
213 0, /* ti */
214 },
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0 /* imm_offset */
220 };
221
222 static const struct cpu_addrcost_table cortexa57_addrcost_table =
223 {
224 {
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
229 },
230 0, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 0, /* register_extend */
234 0, /* imm_offset */
235 };
236
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
238 {
239 {
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
244 },
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_extend */
249 0, /* imm_offset */
250 };
251
252 static const struct cpu_regmove_cost generic_regmove_cost =
253 {
254 1, /* GP2GP */
255 /* Avoid the use of slow int<->fp moves for spilling by setting
256 their cost higher than memmov_cost. */
257 5, /* GP2FP */
258 5, /* FP2GP */
259 2 /* FP2FP */
260 };
261
262 static const struct cpu_regmove_cost cortexa57_regmove_cost =
263 {
264 1, /* GP2GP */
265 /* Avoid the use of slow int<->fp moves for spilling by setting
266 their cost higher than memmov_cost. */
267 5, /* GP2FP */
268 5, /* FP2GP */
269 2 /* FP2FP */
270 };
271
272 static const struct cpu_regmove_cost cortexa53_regmove_cost =
273 {
274 1, /* GP2GP */
275 /* Avoid the use of slow int<->fp moves for spilling by setting
276 their cost higher than memmov_cost. */
277 5, /* GP2FP */
278 5, /* FP2GP */
279 2 /* FP2FP */
280 };
281
282 static const struct cpu_regmove_cost thunderx_regmove_cost =
283 {
284 2, /* GP2GP */
285 2, /* GP2FP */
286 6, /* FP2GP */
287 4 /* FP2FP */
288 };
289
290 static const struct cpu_regmove_cost xgene1_regmove_cost =
291 {
292 1, /* GP2GP */
293 /* Avoid the use of slow int<->fp moves for spilling by setting
294 their cost higher than memmov_cost. */
295 8, /* GP2FP */
296 8, /* FP2GP */
297 2 /* FP2FP */
298 };
299
300 /* Generic costs for vector insn classes. */
301 static const struct cpu_vector_cost generic_vector_cost =
302 {
303 1, /* scalar_stmt_cost */
304 1, /* scalar_load_cost */
305 1, /* scalar_store_cost */
306 1, /* vec_stmt_cost */
307 1, /* vec_to_scalar_cost */
308 1, /* scalar_to_vec_cost */
309 1, /* vec_align_load_cost */
310 1, /* vec_unalign_load_cost */
311 1, /* vec_unalign_store_cost */
312 1, /* vec_store_cost */
313 3, /* cond_taken_branch_cost */
314 1 /* cond_not_taken_branch_cost */
315 };
316
317 /* Costs for vector insn classes for Cortex-A57. */
318 static const struct cpu_vector_cost cortexa57_vector_cost =
319 {
320 1, /* scalar_stmt_cost */
321 4, /* scalar_load_cost */
322 1, /* scalar_store_cost */
323 3, /* vec_stmt_cost */
324 8, /* vec_to_scalar_cost */
325 8, /* scalar_to_vec_cost */
326 5, /* vec_align_load_cost */
327 5, /* vec_unalign_load_cost */
328 1, /* vec_unalign_store_cost */
329 1, /* vec_store_cost */
330 1, /* cond_taken_branch_cost */
331 1 /* cond_not_taken_branch_cost */
332 };
333
334 /* Costs for vector insn classes for X-Gene 1. */
335 static const struct cpu_vector_cost xgene1_vector_cost =
336 {
337 1, /* scalar_stmt_cost */
338 5, /* scalar_load_cost */
339 1, /* scalar_store_cost */
340 2, /* vec_stmt_cost */
341 4, /* vec_to_scalar_cost */
342 4, /* scalar_to_vec_cost */
343 10, /* vec_align_load_cost */
344 10, /* vec_unalign_load_cost */
345 2, /* vec_unalign_store_cost */
346 2, /* vec_store_cost */
347 2, /* cond_taken_branch_cost */
348 1 /* cond_not_taken_branch_cost */
349 };
350
351 /* Generic costs for branch instructions. */
352 static const struct cpu_branch_cost generic_branch_cost =
353 {
354 2, /* Predictable. */
355 2 /* Unpredictable. */
356 };
357
358 static const struct tune_params generic_tunings =
359 {
360 &cortexa57_extra_costs,
361 &generic_addrcost_table,
362 &generic_regmove_cost,
363 &generic_vector_cost,
364 &generic_branch_cost,
365 4, /* memmov_cost */
366 2, /* issue_rate */
367 AARCH64_FUSE_NOTHING, /* fusible_ops */
368 8, /* function_align. */
369 8, /* jump_align. */
370 4, /* loop_align. */
371 2, /* int_reassoc_width. */
372 4, /* fp_reassoc_width. */
373 1, /* vec_reassoc_width. */
374 2, /* min_div_recip_mul_sf. */
375 2, /* min_div_recip_mul_df. */
376 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
377 };
378
379 static const struct tune_params cortexa53_tunings =
380 {
381 &cortexa53_extra_costs,
382 &generic_addrcost_table,
383 &cortexa53_regmove_cost,
384 &generic_vector_cost,
385 &generic_branch_cost,
386 4, /* memmov_cost */
387 2, /* issue_rate */
388 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
389 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
390 8, /* function_align. */
391 8, /* jump_align. */
392 4, /* loop_align. */
393 2, /* int_reassoc_width. */
394 4, /* fp_reassoc_width. */
395 1, /* vec_reassoc_width. */
396 2, /* min_div_recip_mul_sf. */
397 2, /* min_div_recip_mul_df. */
398 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
399 };
400
401 static const struct tune_params cortexa57_tunings =
402 {
403 &cortexa57_extra_costs,
404 &cortexa57_addrcost_table,
405 &cortexa57_regmove_cost,
406 &cortexa57_vector_cost,
407 &generic_branch_cost,
408 4, /* memmov_cost */
409 3, /* issue_rate */
410 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
411 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
412 16, /* function_align. */
413 8, /* jump_align. */
414 4, /* loop_align. */
415 2, /* int_reassoc_width. */
416 4, /* fp_reassoc_width. */
417 1, /* vec_reassoc_width. */
418 2, /* min_div_recip_mul_sf. */
419 2, /* min_div_recip_mul_df. */
420 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
421 };
422
423 static const struct tune_params cortexa72_tunings =
424 {
425 &cortexa57_extra_costs,
426 &cortexa57_addrcost_table,
427 &cortexa57_regmove_cost,
428 &cortexa57_vector_cost,
429 &generic_branch_cost,
430 4, /* memmov_cost */
431 3, /* issue_rate */
432 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
433 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
434 16, /* function_align. */
435 8, /* jump_align. */
436 4, /* loop_align. */
437 2, /* int_reassoc_width. */
438 4, /* fp_reassoc_width. */
439 1, /* vec_reassoc_width. */
440 2, /* min_div_recip_mul_sf. */
441 2, /* min_div_recip_mul_df. */
442 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
443 };
444
445 static const struct tune_params thunderx_tunings =
446 {
447 &thunderx_extra_costs,
448 &generic_addrcost_table,
449 &thunderx_regmove_cost,
450 &generic_vector_cost,
451 &generic_branch_cost,
452 6, /* memmov_cost */
453 2, /* issue_rate */
454 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
455 8, /* function_align. */
456 8, /* jump_align. */
457 8, /* loop_align. */
458 2, /* int_reassoc_width. */
459 4, /* fp_reassoc_width. */
460 1, /* vec_reassoc_width. */
461 2, /* min_div_recip_mul_sf. */
462 2, /* min_div_recip_mul_df. */
463 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
464 };
465
466 static const struct tune_params xgene1_tunings =
467 {
468 &xgene1_extra_costs,
469 &xgene1_addrcost_table,
470 &xgene1_regmove_cost,
471 &xgene1_vector_cost,
472 &generic_branch_cost,
473 6, /* memmov_cost */
474 4, /* issue_rate */
475 AARCH64_FUSE_NOTHING, /* fusible_ops */
476 16, /* function_align. */
477 8, /* jump_align. */
478 16, /* loop_align. */
479 2, /* int_reassoc_width. */
480 4, /* fp_reassoc_width. */
481 1, /* vec_reassoc_width. */
482 2, /* min_div_recip_mul_sf. */
483 2, /* min_div_recip_mul_df. */
484 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
485 };
486
487 /* Support for fine-grained override of the tuning structures. */
488 struct aarch64_tuning_override_function
489 {
490 const char* name;
491 void (*parse_override)(const char*, struct tune_params*);
492 };
493
494 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
495 static void aarch64_parse_tune_string (const char*, struct tune_params*);
496
497 static const struct aarch64_tuning_override_function
498 aarch64_tuning_override_functions[] =
499 {
500 { "fuse", aarch64_parse_fuse_string },
501 { "tune", aarch64_parse_tune_string },
502 { NULL, NULL }
503 };
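/* The table above defines the categories understood by -moverride; each
   category name selects the parser applied to the text that follows it.
   For example (the "all"/"none" values come from the flag tables above,
   while the exact separator syntax is handled by the parsing code added
   with this option):

     gcc -mcpu=cortex-a53 -moverride=fuse=all ...
     gcc -mcpu=cortex-a57 -moverride=tune=none ...  */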
504
505 /* A processor implementing AArch64. */
506 struct processor
507 {
508 const char *const name;
509 enum aarch64_processor core;
510 const char *arch;
511 unsigned architecture_version;
512 const unsigned long flags;
513 const struct tune_params *const tune;
514 };
515
516 /* Processor cores implementing AArch64. */
517 static const struct processor all_cores[] =
518 {
519 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
520 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
521 #include "aarch64-cores.def"
522 #undef AARCH64_CORE
523 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
524 {NULL, aarch64_none, NULL, 0, 0, NULL}
525 };
526
527 /* Architectures implementing AArch64. */
528 static const struct processor all_architectures[] =
529 {
530 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
531 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
532 #include "aarch64-arches.def"
533 #undef AARCH64_ARCH
534 {NULL, aarch64_none, NULL, 0, 0, NULL}
535 };
536
537 /* Target specification. These are populated as command-line arguments
538 are processed, or NULL if not specified. */
539 static const struct processor *selected_arch;
540 static const struct processor *selected_cpu;
541 static const struct processor *selected_tune;
542
543 /* The current tuning set. */
544 struct tune_params aarch64_tune_params = generic_tunings;
545
546 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
547
548 /* An ISA extension in the co-processor and main instruction set space. */
549 struct aarch64_option_extension
550 {
551 const char *const name;
552 const unsigned long flags_on;
553 const unsigned long flags_off;
554 };
555
556 /* ISA extensions in AArch64. */
557 static const struct aarch64_option_extension all_extensions[] =
558 {
559 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
560 {NAME, FLAGS_ON, FLAGS_OFF},
561 #include "aarch64-option-extensions.def"
562 #undef AARCH64_OPT_EXTENSION
563 {NULL, 0, 0}
564 };
565
566 /* Used to track the size of an address when generating a pre/post
567 increment address. */
568 static machine_mode aarch64_memory_reference_mode;
569
570 /* A table of valid AArch64 "bitmask immediate" values for
571 logical instructions. */
572
573 #define AARCH64_NUM_BITMASKS 5334
574 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
575
576 typedef enum aarch64_cond_code
577 {
578 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
579 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
580 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
581 }
582 aarch64_cc;
583
584 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
585
586 /* The condition codes of the processor, and the inverse function. */
587 static const char * const aarch64_condition_codes[] =
588 {
589 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
590 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
591 };
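/* Because the table is laid out in inverse pairs, flipping the low bit of
   the encoding inverts the condition, for example:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   ("eq" <-> "ne")
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT   ("ge" <-> "lt")
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS   ("hi" <-> "ls")  */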
592
593 void
594 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
595 {
596 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
597 if (TARGET_GENERAL_REGS_ONLY)
598 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
599 else
600 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
601 }
602
603 static unsigned int
604 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
605 {
606 if (GET_MODE_UNIT_SIZE (mode) == 4)
607 return aarch64_tune_params.min_div_recip_mul_sf;
608 return aarch64_tune_params.min_div_recip_mul_df;
609 }
610
611 static int
612 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
613 enum machine_mode mode)
614 {
615 if (VECTOR_MODE_P (mode))
616 return aarch64_tune_params.vec_reassoc_width;
617 if (INTEGRAL_MODE_P (mode))
618 return aarch64_tune_params.int_reassoc_width;
619 if (FLOAT_MODE_P (mode))
620 return aarch64_tune_params.fp_reassoc_width;
621 return 1;
622 }
623
624 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
625 unsigned
626 aarch64_dbx_register_number (unsigned regno)
627 {
628 if (GP_REGNUM_P (regno))
629 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
630 else if (regno == SP_REGNUM)
631 return AARCH64_DWARF_SP;
632 else if (FP_REGNUM_P (regno))
633 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
634
635 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
636 equivalent DWARF register. */
637 return DWARF_FRAME_REGISTERS;
638 }
639
640 /* Return TRUE if MODE is any of the large INT modes. */
641 static bool
642 aarch64_vect_struct_mode_p (machine_mode mode)
643 {
644 return mode == OImode || mode == CImode || mode == XImode;
645 }
646
647 /* Return TRUE if MODE is any of the vector modes. */
648 static bool
649 aarch64_vector_mode_p (machine_mode mode)
650 {
651 return aarch64_vector_mode_supported_p (mode)
652 || aarch64_vect_struct_mode_p (mode);
653 }
654
655 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
656 static bool
657 aarch64_array_mode_supported_p (machine_mode mode,
658 unsigned HOST_WIDE_INT nelems)
659 {
660 if (TARGET_SIMD
661 && AARCH64_VALID_SIMD_QREG_MODE (mode)
662 && (nelems >= 2 && nelems <= 4))
663 return true;
664
665 return false;
666 }
667
668 /* Implement HARD_REGNO_NREGS. */
669
670 int
671 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
672 {
673 switch (aarch64_regno_regclass (regno))
674 {
675 case FP_REGS:
676 case FP_LO_REGS:
677 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
678 default:
679 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
680 }
681 gcc_unreachable ();
682 }
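/* For example, with 128-bit Advanced SIMD registers (UNITS_PER_VREG of 16)
   and 64-bit general registers (UNITS_PER_WORD of 8):

     V4SImode (16 bytes) in an FP register      -> 1 register
     OImode   (32 bytes) in FP registers        -> 2 registers
     TImode   (16 bytes) in general registers   -> 2 registers  */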
683
684 /* Implement HARD_REGNO_MODE_OK. */
685
686 int
687 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
688 {
689 if (GET_MODE_CLASS (mode) == MODE_CC)
690 return regno == CC_REGNUM;
691
692 if (regno == SP_REGNUM)
693 /* The purpose of comparing with ptr_mode is to support the
694 global register variable associated with the stack pointer
695 register via the syntax of asm ("wsp") in ILP32. */
696 return mode == Pmode || mode == ptr_mode;
697
698 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
699 return mode == Pmode;
700
701 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
702 return 1;
703
704 if (FP_REGNUM_P (regno))
705 {
706 if (aarch64_vect_struct_mode_p (mode))
707 return
708 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
709 else
710 return 1;
711 }
712
713 return 0;
714 }
715
716 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
717 machine_mode
718 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
719 machine_mode mode)
720 {
721 /* Handle modes that fit within single registers. */
722 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
723 {
724 if (GET_MODE_SIZE (mode) >= 4)
725 return mode;
726 else
727 return SImode;
728 }
729 /* Fall back to generic for multi-reg and very large modes. */
730 else
731 return choose_hard_reg_mode (regno, nregs, false);
732 }
733
734 /* Return true if calls to DECL should be treated as
735 long-calls (ie called via a register). */
736 static bool
737 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
738 {
739 return false;
740 }
741
742 /* Return true if calls to symbol-ref SYM should be treated as
743 long-calls (ie called via a register). */
744 bool
745 aarch64_is_long_call_p (rtx sym)
746 {
747 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
748 }
749
750 /* Return true if the offsets to a zero/sign-extract operation
751 represent an expression that matches an extend operation. The
752 operands represent the parameters from
753
754 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
755 bool
756 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
757 rtx extract_imm)
758 {
759 HOST_WIDE_INT mult_val, extract_val;
760
761 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
762 return false;
763
764 mult_val = INTVAL (mult_imm);
765 extract_val = INTVAL (extract_imm);
766
767 if (extract_val > 8
768 && extract_val < GET_MODE_BITSIZE (mode)
769 && exact_log2 (extract_val & ~7) > 0
770 && (extract_val & 7) <= 4
771 && mult_val == (1 << (extract_val & 7)))
772 return true;
773
774 return false;
775 }
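/* A worked example: in DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 pass the
   checks above (34 & ~7 == 32, a power of two; 34 & 7 == 2, which is <= 4;
   and 4 == 1 << 2).  Taking the low 34 bits of (reg * 4) is equivalent to
   extending the low 32 bits of REG and shifting left by 2, i.e. a
   UXTW/SXTW #2 style extended operand.  */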
776
777 /* Emit an insn that's a simple single-set. Both the operands must be
778 known to be valid. */
779 inline static rtx
780 emit_set_insn (rtx x, rtx y)
781 {
782 return emit_insn (gen_rtx_SET (x, y));
783 }
784
785 /* X and Y are two things to compare using CODE. Emit the compare insn and
786 return the rtx for register 0 in the proper mode. */
787 rtx
788 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
789 {
790 machine_mode mode = SELECT_CC_MODE (code, x, y);
791 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
792
793 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
794 return cc_reg;
795 }
796
797 /* Build the SYMBOL_REF for __tls_get_addr. */
798
799 static GTY(()) rtx tls_get_addr_libfunc;
800
801 rtx
802 aarch64_tls_get_addr (void)
803 {
804 if (!tls_get_addr_libfunc)
805 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
806 return tls_get_addr_libfunc;
807 }
808
809 /* Return the TLS model to use for ADDR. */
810
811 static enum tls_model
812 tls_symbolic_operand_type (rtx addr)
813 {
814 enum tls_model tls_kind = TLS_MODEL_NONE;
815 rtx sym, addend;
816
817 if (GET_CODE (addr) == CONST)
818 {
819 split_const (addr, &sym, &addend);
820 if (GET_CODE (sym) == SYMBOL_REF)
821 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
822 }
823 else if (GET_CODE (addr) == SYMBOL_REF)
824 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
825
826 return tls_kind;
827 }
828
829 /* We accept LO_SUMs in our legitimate addresses so that combine can
830 take care of combining addresses where necessary, but for generation
831 purposes, we generate the address as
832 follows:
833 RTL Absolute
834 tmp = hi (symbol_ref); adrp x1, foo
835 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
836 nop
837
838 PIC TLS
839 adrp x1, :got:foo adrp tmp, :tlsgd:foo
840 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
841 bl __tls_get_addr
842 nop
843
844 Load TLS symbol, depending on TLS mechanism and TLS access model.
845
846 Global Dynamic - Traditional TLS:
847 adrp tmp, :tlsgd:imm
848 add dest, tmp, #:tlsgd_lo12:imm
849 bl __tls_get_addr
850
851 Global Dynamic - TLS Descriptors:
852 adrp dest, :tlsdesc:imm
853 ldr tmp, [dest, #:tlsdesc_lo12:imm]
854 add dest, dest, #:tlsdesc_lo12:imm
855 blr tmp
856 mrs tp, tpidr_el0
857 add dest, dest, tp
858
859 Initial Exec:
860 mrs tp, tpidr_el0
861 adrp tmp, :gottprel:imm
862 ldr dest, [tmp, #:gottprel_lo12:imm]
863 add dest, dest, tp
864
865 Local Exec:
866 mrs tp, tpidr_el0
867 add t0, tp, #:tprel_hi12:imm, lsl #12
868 add t0, t0, #:tprel_lo12_nc:imm
869 */
870
871 static void
872 aarch64_load_symref_appropriately (rtx dest, rtx imm,
873 enum aarch64_symbol_type type)
874 {
875 switch (type)
876 {
877 case SYMBOL_SMALL_ABSOLUTE:
878 {
879 /* In ILP32, the mode of dest can be either SImode or DImode. */
880 rtx tmp_reg = dest;
881 machine_mode mode = GET_MODE (dest);
882
883 gcc_assert (mode == Pmode || mode == ptr_mode);
884
885 if (can_create_pseudo_p ())
886 tmp_reg = gen_reg_rtx (mode);
887
888 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
889 emit_insn (gen_add_losym (dest, tmp_reg, imm));
890 return;
891 }
892
893 case SYMBOL_TINY_ABSOLUTE:
894 emit_insn (gen_rtx_SET (dest, imm));
895 return;
896
897 case SYMBOL_SMALL_GOT_28K:
898 {
899 machine_mode mode = GET_MODE (dest);
900 rtx gp_rtx = pic_offset_table_rtx;
901
902 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
903 here before RTL expansion. Tree IVOPTS will generate RTL patterns
904 to decide rtx costs, in which case pic_offset_table_rtx is not
905 initialized. In that case there is no need to generate the first
906 adrp instruction, as the final cost for global variable access is
907 one instruction. */
908 if (gp_rtx != NULL)
909 {
910 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
911 use the page base as the GOT base, the first page may be wasted;
912 in the worst case there is only 28K of space for the GOT).
913
914 The generated instruction sequence for accessing a global variable
915 is:
916
917 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
918
919 Only one instruction is needed, but we must initialize
920 pic_offset_table_rtx properly. We generate an initialization insn
921 for every global access, and allow CSE to remove the redundant ones.
922
923 The final instruction sequence will look like the following
924 for multiple global variable accesses.
925
926 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
927
928 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
929 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
930 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
931 ... */
932
933 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
934 crtl->uses_pic_offset_table = 1;
935 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
936
937 if (mode != GET_MODE (gp_rtx))
938 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
939 }
940
941 if (mode == ptr_mode)
942 {
943 if (mode == DImode)
944 emit_insn (gen_ldr_got_small_28k_di (dest, gp_rtx, imm));
945 else
946 emit_insn (gen_ldr_got_small_28k_si (dest, gp_rtx, imm));
947 }
948 else
949 {
950 gcc_assert (mode == Pmode);
951 emit_insn (gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm));
952 }
953
954 return;
955 }
956
957 case SYMBOL_SMALL_GOT_4G:
958 {
959 /* In ILP32, the mode of dest can be either SImode or DImode,
960 while the got entry is always of SImode size. The mode of
961 dest depends on how dest is used: if dest is assigned to a
962 pointer (e.g. in memory), it has SImode; it may have
963 DImode if dest is dereferenced to access the memory.
964 This is why we have to handle three different ldr_got_small
965 patterns here (two patterns for ILP32). */
966 rtx tmp_reg = dest;
967 machine_mode mode = GET_MODE (dest);
968
969 if (can_create_pseudo_p ())
970 tmp_reg = gen_reg_rtx (mode);
971
972 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
973 if (mode == ptr_mode)
974 {
975 if (mode == DImode)
976 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
977 else
978 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
979 }
980 else
981 {
982 gcc_assert (mode == Pmode);
983 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
984 }
985
986 return;
987 }
988
989 case SYMBOL_SMALL_TLSGD:
990 {
991 rtx_insn *insns;
992 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
993
994 start_sequence ();
995 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
996 insns = get_insns ();
997 end_sequence ();
998
999 RTL_CONST_CALL_P (insns) = 1;
1000 emit_libcall_block (insns, dest, result, imm);
1001 return;
1002 }
1003
1004 case SYMBOL_SMALL_TLSDESC:
1005 {
1006 machine_mode mode = GET_MODE (dest);
1007 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1008 rtx tp;
1009
1010 gcc_assert (mode == Pmode || mode == ptr_mode);
1011
1012 /* In ILP32, the got entry is always of SImode size. Unlike
1013 small GOT, the dest is fixed at reg 0. */
1014 if (TARGET_ILP32)
1015 emit_insn (gen_tlsdesc_small_si (imm));
1016 else
1017 emit_insn (gen_tlsdesc_small_di (imm));
1018 tp = aarch64_load_tp (NULL);
1019
1020 if (mode != Pmode)
1021 tp = gen_lowpart (mode, tp);
1022
1023 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1024 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1025 return;
1026 }
1027
1028 case SYMBOL_SMALL_GOTTPREL:
1029 {
1030 /* In ILP32, the mode of dest can be either SImode or DImode,
1031 while the got entry is always of SImode size. The mode of
1032 dest depends on how dest is used: if dest is assigned to a
1033 pointer (e.g. in memory), it has SImode; it may have
1034 DImode if dest is dereferenced to access the memory.
1035 This is why we have to handle three different tlsie_small
1036 patterns here (two patterns for ILP32). */
1037 machine_mode mode = GET_MODE (dest);
1038 rtx tmp_reg = gen_reg_rtx (mode);
1039 rtx tp = aarch64_load_tp (NULL);
1040
1041 if (mode == ptr_mode)
1042 {
1043 if (mode == DImode)
1044 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1045 else
1046 {
1047 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1048 tp = gen_lowpart (mode, tp);
1049 }
1050 }
1051 else
1052 {
1053 gcc_assert (mode == Pmode);
1054 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1055 }
1056
1057 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1058 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1059 return;
1060 }
1061
1062 case SYMBOL_SMALL_TPREL:
1063 {
1064 rtx tp = aarch64_load_tp (NULL);
1065
1066 if (GET_MODE (dest) != Pmode)
1067 tp = gen_lowpart (GET_MODE (dest), tp);
1068
1069 emit_insn (gen_tlsle_small (dest, tp, imm));
1070 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1071 return;
1072 }
1073
1074 case SYMBOL_TINY_GOT:
1075 emit_insn (gen_ldr_got_tiny (dest, imm));
1076 return;
1077
1078 default:
1079 gcc_unreachable ();
1080 }
1081 }
1082
1083 /* Emit a move from SRC to DEST. Assume that the move expanders can
1084 handle all moves if !can_create_pseudo_p (). The distinction is
1085 important because, unlike emit_move_insn, the move expanders know
1086 how to force Pmode objects into the constant pool even when the
1087 constant pool address is not itself legitimate. */
1088 static rtx
1089 aarch64_emit_move (rtx dest, rtx src)
1090 {
1091 return (can_create_pseudo_p ()
1092 ? emit_move_insn (dest, src)
1093 : emit_move_insn_1 (dest, src));
1094 }
1095
1096 /* Split a 128-bit move operation into two 64-bit move operations,
1097 taking care to handle partial overlap of register to register
1098 copies. Special cases are needed when moving between GP regs and
1099 FP regs. SRC can be a register, constant or memory; DST a register
1100 or memory. If either operand is memory it must not have any side
1101 effects. */
1102 void
1103 aarch64_split_128bit_move (rtx dst, rtx src)
1104 {
1105 rtx dst_lo, dst_hi;
1106 rtx src_lo, src_hi;
1107
1108 machine_mode mode = GET_MODE (dst);
1109
1110 gcc_assert (mode == TImode || mode == TFmode);
1111 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1112 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1113
1114 if (REG_P (dst) && REG_P (src))
1115 {
1116 int src_regno = REGNO (src);
1117 int dst_regno = REGNO (dst);
1118
1119 /* Handle FP <-> GP regs. */
1120 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1121 {
1122 src_lo = gen_lowpart (word_mode, src);
1123 src_hi = gen_highpart (word_mode, src);
1124
1125 if (mode == TImode)
1126 {
1127 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1128 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1129 }
1130 else
1131 {
1132 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1133 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1134 }
1135 return;
1136 }
1137 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1138 {
1139 dst_lo = gen_lowpart (word_mode, dst);
1140 dst_hi = gen_highpart (word_mode, dst);
1141
1142 if (mode == TImode)
1143 {
1144 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1145 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1146 }
1147 else
1148 {
1149 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1150 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1151 }
1152 return;
1153 }
1154 }
1155
1156 dst_lo = gen_lowpart (word_mode, dst);
1157 dst_hi = gen_highpart (word_mode, dst);
1158 src_lo = gen_lowpart (word_mode, src);
1159 src_hi = gen_highpart_mode (word_mode, mode, src);
1160
1161 /* At most one pairing may overlap. */
1162 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1163 {
1164 aarch64_emit_move (dst_hi, src_hi);
1165 aarch64_emit_move (dst_lo, src_lo);
1166 }
1167 else
1168 {
1169 aarch64_emit_move (dst_lo, src_lo);
1170 aarch64_emit_move (dst_hi, src_hi);
1171 }
1172 }
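/* For example, on a little-endian target, splitting a TImode copy from the
   pair {x0,x1} into {x1,x2} finds that the low destination word (x1) is
   also the high source word, so the overlap check above copies the high
   halves first (x2 <- x1, then x1 <- x0).  */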
1173
1174 bool
1175 aarch64_split_128bit_move_p (rtx dst, rtx src)
1176 {
1177 return (! REG_P (src)
1178 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1179 }
1180
1181 /* Split a complex SIMD combine. */
1182
1183 void
1184 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1185 {
1186 machine_mode src_mode = GET_MODE (src1);
1187 machine_mode dst_mode = GET_MODE (dst);
1188
1189 gcc_assert (VECTOR_MODE_P (dst_mode));
1190
1191 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1192 {
1193 rtx (*gen) (rtx, rtx, rtx);
1194
1195 switch (src_mode)
1196 {
1197 case V8QImode:
1198 gen = gen_aarch64_simd_combinev8qi;
1199 break;
1200 case V4HImode:
1201 gen = gen_aarch64_simd_combinev4hi;
1202 break;
1203 case V2SImode:
1204 gen = gen_aarch64_simd_combinev2si;
1205 break;
1206 case V2SFmode:
1207 gen = gen_aarch64_simd_combinev2sf;
1208 break;
1209 case DImode:
1210 gen = gen_aarch64_simd_combinedi;
1211 break;
1212 case DFmode:
1213 gen = gen_aarch64_simd_combinedf;
1214 break;
1215 default:
1216 gcc_unreachable ();
1217 }
1218
1219 emit_insn (gen (dst, src1, src2));
1220 return;
1221 }
1222 }
1223
1224 /* Split a complex SIMD move. */
1225
1226 void
1227 aarch64_split_simd_move (rtx dst, rtx src)
1228 {
1229 machine_mode src_mode = GET_MODE (src);
1230 machine_mode dst_mode = GET_MODE (dst);
1231
1232 gcc_assert (VECTOR_MODE_P (dst_mode));
1233
1234 if (REG_P (dst) && REG_P (src))
1235 {
1236 rtx (*gen) (rtx, rtx);
1237
1238 gcc_assert (VECTOR_MODE_P (src_mode));
1239
1240 switch (src_mode)
1241 {
1242 case V16QImode:
1243 gen = gen_aarch64_split_simd_movv16qi;
1244 break;
1245 case V8HImode:
1246 gen = gen_aarch64_split_simd_movv8hi;
1247 break;
1248 case V4SImode:
1249 gen = gen_aarch64_split_simd_movv4si;
1250 break;
1251 case V2DImode:
1252 gen = gen_aarch64_split_simd_movv2di;
1253 break;
1254 case V4SFmode:
1255 gen = gen_aarch64_split_simd_movv4sf;
1256 break;
1257 case V2DFmode:
1258 gen = gen_aarch64_split_simd_movv2df;
1259 break;
1260 default:
1261 gcc_unreachable ();
1262 }
1263
1264 emit_insn (gen (dst, src));
1265 return;
1266 }
1267 }
1268
1269 static rtx
1270 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1271 {
1272 if (can_create_pseudo_p ())
1273 return force_reg (mode, value);
1274 else
1275 {
1276 x = aarch64_emit_move (x, value);
1277 return x;
1278 }
1279 }
1280
1281
1282 static rtx
1283 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1284 {
1285 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1286 {
1287 rtx high;
1288 /* Load the full offset into a register. This
1289 might be improvable in the future. */
1290 high = GEN_INT (offset);
1291 offset = 0;
1292 high = aarch64_force_temporary (mode, temp, high);
1293 reg = aarch64_force_temporary (mode, temp,
1294 gen_rtx_PLUS (mode, high, reg));
1295 }
1296 return plus_constant (mode, reg, offset);
1297 }
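/* For example, an offset such as 0x123456 cannot be encoded in a single
   add immediate (a 12-bit value, optionally shifted left by 12), so it is
   first materialized in TEMP and then added as a register operand, whereas
   a small offset such as 16 simply becomes (plus REG 16).  */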
1298
1299 static int
1300 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1301 machine_mode mode)
1302 {
1303 unsigned HOST_WIDE_INT mask;
1304 int i;
1305 bool first;
1306 unsigned HOST_WIDE_INT val;
1307 bool subtargets;
1308 rtx subtarget;
1309 int one_match, zero_match, first_not_ffff_match;
1310 int num_insns = 0;
1311
1312 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1313 {
1314 if (generate)
1315 emit_insn (gen_rtx_SET (dest, imm));
1316 num_insns++;
1317 return num_insns;
1318 }
1319
1320 if (mode == SImode)
1321 {
1322 /* We know we can't do this in 1 insn, and we must be able to do it
1323 in two; so don't mess around looking for sequences that don't buy
1324 us anything. */
1325 if (generate)
1326 {
1327 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1328 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1329 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1330 }
1331 num_insns += 2;
1332 return num_insns;
1333 }
1334
1335 /* Remaining cases are all for DImode. */
1336
1337 val = INTVAL (imm);
1338 subtargets = optimize && can_create_pseudo_p ();
1339
1340 one_match = 0;
1341 zero_match = 0;
1342 mask = 0xffff;
1343 first_not_ffff_match = -1;
1344
1345 for (i = 0; i < 64; i += 16, mask <<= 16)
1346 {
1347 if ((val & mask) == mask)
1348 one_match++;
1349 else
1350 {
1351 if (first_not_ffff_match < 0)
1352 first_not_ffff_match = i;
1353 if ((val & mask) == 0)
1354 zero_match++;
1355 }
1356 }
1357
1358 if (one_match == 2)
1359 {
1360 /* Set one of the quarters and then insert back into result. */
1361 mask = 0xffffll << first_not_ffff_match;
1362 if (generate)
1363 {
1364 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1365 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1366 GEN_INT ((val >> first_not_ffff_match)
1367 & 0xffff)));
1368 }
1369 num_insns += 2;
1370 return num_insns;
1371 }
1372
1373 if (zero_match == 2)
1374 goto simple_sequence;
1375
1376 mask = 0x0ffff0000UL;
1377 for (i = 16; i < 64; i += 16, mask <<= 16)
1378 {
1379 HOST_WIDE_INT comp = mask & ~(mask - 1);
1380
1381 if (aarch64_uimm12_shift (val - (val & mask)))
1382 {
1383 if (generate)
1384 {
1385 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1386 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1387 emit_insn (gen_adddi3 (dest, subtarget,
1388 GEN_INT (val - (val & mask))));
1389 }
1390 num_insns += 2;
1391 return num_insns;
1392 }
1393 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1394 {
1395 if (generate)
1396 {
1397 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1398 emit_insn (gen_rtx_SET (subtarget,
1399 GEN_INT ((val + comp) & mask)));
1400 emit_insn (gen_adddi3 (dest, subtarget,
1401 GEN_INT (val - ((val + comp) & mask))));
1402 }
1403 num_insns += 2;
1404 return num_insns;
1405 }
1406 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1407 {
1408 if (generate)
1409 {
1410 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1411 emit_insn (gen_rtx_SET (subtarget,
1412 GEN_INT ((val - comp) | ~mask)));
1413 emit_insn (gen_adddi3 (dest, subtarget,
1414 GEN_INT (val - ((val - comp) | ~mask))));
1415 }
1416 num_insns += 2;
1417 return num_insns;
1418 }
1419 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1420 {
1421 if (generate)
1422 {
1423 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1424 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1425 emit_insn (gen_adddi3 (dest, subtarget,
1426 GEN_INT (val - (val | ~mask))));
1427 }
1428 num_insns += 2;
1429 return num_insns;
1430 }
1431 }
1432
1433 /* See if we can do it by arithmetically combining two
1434 immediates. */
1435 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1436 {
1437 int j;
1438 mask = 0xffff;
1439
1440 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1441 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1442 {
1443 if (generate)
1444 {
1445 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1446 emit_insn (gen_rtx_SET (subtarget,
1447 GEN_INT (aarch64_bitmasks[i])));
1448 emit_insn (gen_adddi3 (dest, subtarget,
1449 GEN_INT (val - aarch64_bitmasks[i])));
1450 }
1451 num_insns += 2;
1452 return num_insns;
1453 }
1454
1455 for (j = 0; j < 64; j += 16, mask <<= 16)
1456 {
1457 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1458 {
1459 if (generate)
1460 {
1461 emit_insn (gen_rtx_SET (dest,
1462 GEN_INT (aarch64_bitmasks[i])));
1463 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1464 GEN_INT ((val >> j) & 0xffff)));
1465 }
1466 num_insns += 2;
1467 return num_insns;
1468 }
1469 }
1470 }
1471
1472 /* See if we can do it by logically combining two immediates. */
1473 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1474 {
1475 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1476 {
1477 int j;
1478
1479 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1480 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1481 {
1482 if (generate)
1483 {
1484 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1485 emit_insn (gen_rtx_SET (subtarget,
1486 GEN_INT (aarch64_bitmasks[i])));
1487 emit_insn (gen_iordi3 (dest, subtarget,
1488 GEN_INT (aarch64_bitmasks[j])));
1489 }
1490 num_insns += 2;
1491 return num_insns;
1492 }
1493 }
1494 else if ((val & aarch64_bitmasks[i]) == val)
1495 {
1496 int j;
1497
1498 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1499 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1500 {
1501 if (generate)
1502 {
1503 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1504 emit_insn (gen_rtx_SET (subtarget,
1505 GEN_INT (aarch64_bitmasks[j])));
1506 emit_insn (gen_anddi3 (dest, subtarget,
1507 GEN_INT (aarch64_bitmasks[i])));
1508 }
1509 num_insns += 2;
1510 return num_insns;
1511 }
1512 }
1513 }
1514
1515 if (one_match > zero_match)
1516 {
1517 /* Set either first three quarters or all but the third. */
1518 mask = 0xffffll << (16 - first_not_ffff_match);
1519 if (generate)
1520 emit_insn (gen_rtx_SET (dest,
1521 GEN_INT (val | mask | 0xffffffff00000000ull)));
1522 num_insns ++;
1523
1524 /* Now insert other two quarters. */
1525 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1526 i < 64; i += 16, mask <<= 16)
1527 {
1528 if ((val & mask) != mask)
1529 {
1530 if (generate)
1531 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1532 GEN_INT ((val >> i) & 0xffff)));
1533 num_insns ++;
1534 }
1535 }
1536 return num_insns;
1537 }
1538
1539 simple_sequence:
1540 first = true;
1541 mask = 0xffff;
1542 for (i = 0; i < 64; i += 16, mask <<= 16)
1543 {
1544 if ((val & mask) != 0)
1545 {
1546 if (first)
1547 {
1548 if (generate)
1549 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1550 num_insns ++;
1551 first = false;
1552 }
1553 else
1554 {
1555 if (generate)
1556 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1557 GEN_INT ((val >> i) & 0xffff)));
1558 num_insns ++;
1559 }
1560 }
1561 }
1562
1563 return num_insns;
1564 }
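/* Some illustrative instruction counts for the logic above:

     0x0000000000001234   -> 1 insn (a single mov/movz)
     0x0000000012340000   -> 1 insn (movz of a shifted 16-bit chunk)

   In the worst case, a DImode value whose four 16-bit chunks are all
   different, non-zero and non-0xffff needs four instructions: a movz plus
   three movk, via the simple_sequence fallback.  */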
1565
1566
1567 void
1568 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1569 {
1570 machine_mode mode = GET_MODE (dest);
1571
1572 gcc_assert (mode == SImode || mode == DImode);
1573
1574 /* Check on what type of symbol it is. */
1575 if (GET_CODE (imm) == SYMBOL_REF
1576 || GET_CODE (imm) == LABEL_REF
1577 || GET_CODE (imm) == CONST)
1578 {
1579 rtx mem, base, offset;
1580 enum aarch64_symbol_type sty;
1581
1582 /* If we have (const (plus symbol offset)), separate out the offset
1583 before we start classifying the symbol. */
1584 split_const (imm, &base, &offset);
1585
1586 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1587 switch (sty)
1588 {
1589 case SYMBOL_FORCE_TO_MEM:
1590 if (offset != const0_rtx
1591 && targetm.cannot_force_const_mem (mode, imm))
1592 {
1593 gcc_assert (can_create_pseudo_p ());
1594 base = aarch64_force_temporary (mode, dest, base);
1595 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1596 aarch64_emit_move (dest, base);
1597 return;
1598 }
1599 mem = force_const_mem (ptr_mode, imm);
1600 gcc_assert (mem);
1601 if (mode != ptr_mode)
1602 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1603 emit_insn (gen_rtx_SET (dest, mem));
1604 return;
1605
1606 case SYMBOL_SMALL_TLSGD:
1607 case SYMBOL_SMALL_TLSDESC:
1608 case SYMBOL_SMALL_GOTTPREL:
1609 case SYMBOL_SMALL_GOT_28K:
1610 case SYMBOL_SMALL_GOT_4G:
1611 case SYMBOL_TINY_GOT:
1612 if (offset != const0_rtx)
1613 {
1614 gcc_assert(can_create_pseudo_p ());
1615 base = aarch64_force_temporary (mode, dest, base);
1616 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1617 aarch64_emit_move (dest, base);
1618 return;
1619 }
1620 /* FALLTHRU */
1621
1622 case SYMBOL_SMALL_TPREL:
1623 case SYMBOL_SMALL_ABSOLUTE:
1624 case SYMBOL_TINY_ABSOLUTE:
1625 aarch64_load_symref_appropriately (dest, imm, sty);
1626 return;
1627
1628 default:
1629 gcc_unreachable ();
1630 }
1631 }
1632
1633 if (!CONST_INT_P (imm))
1634 {
1635 if (GET_CODE (imm) == HIGH)
1636 emit_insn (gen_rtx_SET (dest, imm));
1637 else
1638 {
1639 rtx mem = force_const_mem (mode, imm);
1640 gcc_assert (mem);
1641 emit_insn (gen_rtx_SET (dest, mem));
1642 }
1643
1644 return;
1645 }
1646
1647 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1648 }
1649
1650 static bool
1651 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1652 tree exp ATTRIBUTE_UNUSED)
1653 {
1654 /* Currently, always true. */
1655 return true;
1656 }
1657
1658 /* Implement TARGET_PASS_BY_REFERENCE. */
1659
1660 static bool
1661 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1662 machine_mode mode,
1663 const_tree type,
1664 bool named ATTRIBUTE_UNUSED)
1665 {
1666 HOST_WIDE_INT size;
1667 machine_mode dummymode;
1668 int nregs;
1669
1670 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1671 size = (mode == BLKmode && type)
1672 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1673
1674 /* Aggregates are passed by reference based on their size. */
1675 if (type && AGGREGATE_TYPE_P (type))
1676 {
1677 size = int_size_in_bytes (type);
1678 }
1679
1680 /* Variable-sized arguments are always passed by reference. */
1681 if (size < 0)
1682 return true;
1683
1684 /* Can this be a candidate to be passed in fp/simd register(s)? */
1685 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1686 &dummymode, &nregs,
1687 NULL))
1688 return false;
1689
1690 /* Arguments which are variable sized or larger than 2 registers are
1691 passed by reference unless they are a homogeneous floating-point
1692 aggregate. */
1693 return size > 2 * UNITS_PER_WORD;
1694 }
1695
1696 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1697 static bool
1698 aarch64_return_in_msb (const_tree valtype)
1699 {
1700 machine_mode dummy_mode;
1701 int dummy_int;
1702
1703 /* Never happens in little-endian mode. */
1704 if (!BYTES_BIG_ENDIAN)
1705 return false;
1706
1707 /* Only composite types smaller than or equal to 16 bytes can
1708 be potentially returned in registers. */
1709 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1710 || int_size_in_bytes (valtype) <= 0
1711 || int_size_in_bytes (valtype) > 16)
1712 return false;
1713
1714 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1715 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1716 is always passed/returned in the least significant bits of fp/simd
1717 register(s). */
1718 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1719 &dummy_mode, &dummy_int, NULL))
1720 return false;
1721
1722 return true;
1723 }
1724
1725 /* Implement TARGET_FUNCTION_VALUE.
1726 Define how to find the value returned by a function. */
1727
1728 static rtx
1729 aarch64_function_value (const_tree type, const_tree func,
1730 bool outgoing ATTRIBUTE_UNUSED)
1731 {
1732 machine_mode mode;
1733 int unsignedp;
1734 int count;
1735 machine_mode ag_mode;
1736
1737 mode = TYPE_MODE (type);
1738 if (INTEGRAL_TYPE_P (type))
1739 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1740
1741 if (aarch64_return_in_msb (type))
1742 {
1743 HOST_WIDE_INT size = int_size_in_bytes (type);
1744
1745 if (size % UNITS_PER_WORD != 0)
1746 {
1747 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1748 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1749 }
1750 }
1751
1752 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1753 &ag_mode, &count, NULL))
1754 {
1755 if (!aarch64_composite_type_p (type, mode))
1756 {
1757 gcc_assert (count == 1 && mode == ag_mode);
1758 return gen_rtx_REG (mode, V0_REGNUM);
1759 }
1760 else
1761 {
1762 int i;
1763 rtx par;
1764
1765 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1766 for (i = 0; i < count; i++)
1767 {
1768 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1769 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1770 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1771 XVECEXP (par, 0, i) = tmp;
1772 }
1773 return par;
1774 }
1775 }
1776 else
1777 return gen_rtx_REG (mode, R0_REGNUM);
1778 }
1779
1780 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1781 Return true if REGNO is the number of a hard register in which the value
1782 of a called function may come back.
1783
1784 static bool
1785 aarch64_function_value_regno_p (const unsigned int regno)
1786 {
1787 /* Maximum of 16 bytes can be returned in the general registers. Examples
1788 of 16-byte return values are: 128-bit integers and 16-byte small
1789 structures (excluding homogeneous floating-point aggregates). */
1790 if (regno == R0_REGNUM || regno == R1_REGNUM)
1791 return true;
1792
1793 /* Up to four fp/simd registers can return a function value, e.g. a
1794 homogeneous floating-point aggregate having four members. */
1795 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1796 return TARGET_FLOAT;
1797
1798 return false;
1799 }
1800
1801 /* Implement TARGET_RETURN_IN_MEMORY.
1802
1803 If the type T of the result of a function is such that
1804 void func (T arg)
1805 would require that arg be passed as a value in a register (or set of
1806 registers) according to the parameter passing rules, then the result
1807 is returned in the same registers as would be used for such an
1808 argument. */
1809
1810 static bool
1811 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1812 {
1813 HOST_WIDE_INT size;
1814 machine_mode ag_mode;
1815 int count;
1816
1817 if (!AGGREGATE_TYPE_P (type)
1818 && TREE_CODE (type) != COMPLEX_TYPE
1819 && TREE_CODE (type) != VECTOR_TYPE)
1820 /* Simple scalar types always returned in registers. */
1821 return false;
1822
1823 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1824 type,
1825 &ag_mode,
1826 &count,
1827 NULL))
1828 return false;
1829
1830 /* Types larger than 2 registers returned in memory. */
1831 size = int_size_in_bytes (type);
1832 return (size < 0 || size > 2 * UNITS_PER_WORD);
1833 }
1834
1835 static bool
1836 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1837 const_tree type, int *nregs)
1838 {
1839 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1840 return aarch64_vfp_is_call_or_return_candidate (mode,
1841 type,
1842 &pcum->aapcs_vfp_rmode,
1843 nregs,
1844 NULL);
1845 }
1846
1847 /* Given MODE and TYPE of a function argument, return the alignment in
1848 bits. The idea is to suppress any stronger alignment requested by
1849 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1850 This is a helper function for local use only. */
1851
1852 static unsigned int
1853 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1854 {
1855 unsigned int alignment;
1856
1857 if (type)
1858 {
1859 if (!integer_zerop (TYPE_SIZE (type)))
1860 {
1861 if (TYPE_MODE (type) == mode)
1862 alignment = TYPE_ALIGN (type);
1863 else
1864 alignment = GET_MODE_ALIGNMENT (mode);
1865 }
1866 else
1867 alignment = 0;
1868 }
1869 else
1870 alignment = GET_MODE_ALIGNMENT (mode);
1871
1872 return alignment;
1873 }
1874
1875 /* Layout a function argument according to the AAPCS64 rules. The rule
1876 numbers refer to the rule numbers in the AAPCS64. */
1877
1878 static void
1879 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1880 const_tree type,
1881 bool named ATTRIBUTE_UNUSED)
1882 {
1883 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1884 int ncrn, nvrn, nregs;
1885 bool allocate_ncrn, allocate_nvrn;
1886 HOST_WIDE_INT size;
1887
1888 /* We need to do this once per argument. */
1889 if (pcum->aapcs_arg_processed)
1890 return;
1891
1892 pcum->aapcs_arg_processed = true;
1893
1894 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1895 size
1896 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1897 UNITS_PER_WORD);
1898
1899 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1900 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1901 mode,
1902 type,
1903 &nregs);
1904
1905 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1906 The following code thus handles passing by SIMD/FP registers first. */
1907
1908 nvrn = pcum->aapcs_nvrn;
1909
1910 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1911 and homogeneous short-vector aggregates (HVA). */
1912 if (allocate_nvrn)
1913 {
1914 if (!TARGET_FLOAT)
1915 aarch64_err_no_fpadvsimd (mode, "argument");
1916
1917 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1918 {
1919 pcum->aapcs_nextnvrn = nvrn + nregs;
1920 if (!aarch64_composite_type_p (type, mode))
1921 {
1922 gcc_assert (nregs == 1);
1923 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1924 }
1925 else
1926 {
1927 rtx par;
1928 int i;
1929 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1930 for (i = 0; i < nregs; i++)
1931 {
1932 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1933 V0_REGNUM + nvrn + i);
1934 tmp = gen_rtx_EXPR_LIST
1935 (VOIDmode, tmp,
1936 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1937 XVECEXP (par, 0, i) = tmp;
1938 }
1939 pcum->aapcs_reg = par;
1940 }
1941 return;
1942 }
1943 else
1944 {
1945 /* C.3 NSRN is set to 8. */
1946 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1947 goto on_stack;
1948 }
1949 }
1950
1951 ncrn = pcum->aapcs_ncrn;
1952 nregs = size / UNITS_PER_WORD;
1953
1954 /* C6 - C9, though the sign and zero extension semantics are
1955 handled elsewhere. This is the case where the argument fits
1956 entirely in general registers. */
1957 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1958 {
1959 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1960
1961 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1962
1963 /* C.8 if the argument has an alignment of 16 then the NGRN is
1964 rounded up to the next even number. */
1965 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1966 {
1967 ++ncrn;
1968 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1969 }
1970 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1971 A reg is still generated for it, but the caller should be smart
1972 enough not to use it. */
1973 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1974 {
1975 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1976 }
1977 else
1978 {
1979 rtx par;
1980 int i;
1981
1982 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1983 for (i = 0; i < nregs; i++)
1984 {
1985 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1986 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1987 GEN_INT (i * UNITS_PER_WORD));
1988 XVECEXP (par, 0, i) = tmp;
1989 }
1990 pcum->aapcs_reg = par;
1991 }
1992
1993 pcum->aapcs_nextncrn = ncrn + nregs;
1994 return;
1995 }
1996
1997 /* C.11 */
1998 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1999
2000 /* The argument is passed on the stack; record the needed number of words for
2001 this argument and align the total size if necessary. */
2002 on_stack:
2003 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2004 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2005 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
2006 16 / UNITS_PER_WORD);
2007 return;
2008 }
2009
2010 /* Implement TARGET_FUNCTION_ARG. */
2011
2012 static rtx
2013 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2014 const_tree type, bool named)
2015 {
2016 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2017 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2018
2019 if (mode == VOIDmode)
2020 return NULL_RTX;
2021
2022 aarch64_layout_arg (pcum_v, mode, type, named);
2023 return pcum->aapcs_reg;
2024 }
2025
2026 void
2027 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2028 const_tree fntype ATTRIBUTE_UNUSED,
2029 rtx libname ATTRIBUTE_UNUSED,
2030 const_tree fndecl ATTRIBUTE_UNUSED,
2031 unsigned n_named ATTRIBUTE_UNUSED)
2032 {
2033 pcum->aapcs_ncrn = 0;
2034 pcum->aapcs_nvrn = 0;
2035 pcum->aapcs_nextncrn = 0;
2036 pcum->aapcs_nextnvrn = 0;
2037 pcum->pcs_variant = ARM_PCS_AAPCS64;
2038 pcum->aapcs_reg = NULL_RTX;
2039 pcum->aapcs_arg_processed = false;
2040 pcum->aapcs_stack_words = 0;
2041 pcum->aapcs_stack_size = 0;
2042
2043 if (!TARGET_FLOAT
2044 && fndecl && TREE_PUBLIC (fndecl)
2045 && fntype && fntype != error_mark_node)
2046 {
2047 const_tree type = TREE_TYPE (fntype);
2048 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2049 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2050 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2051 &mode, &nregs, NULL))
2052 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2053 }
2054 return;
2055 }
2056
2057 static void
2058 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2059 machine_mode mode,
2060 const_tree type,
2061 bool named)
2062 {
2063 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2064 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2065 {
2066 aarch64_layout_arg (pcum_v, mode, type, named);
2067 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2068 != (pcum->aapcs_stack_words != 0));
2069 pcum->aapcs_arg_processed = false;
2070 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2071 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2072 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2073 pcum->aapcs_stack_words = 0;
2074 pcum->aapcs_reg = NULL_RTX;
2075 }
2076 }
2077
2078 bool
2079 aarch64_function_arg_regno_p (unsigned regno)
2080 {
2081 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2082 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2083 }
2084
2085 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2086 PARM_BOUNDARY bits of alignment, but will be given anything up
2087 to STACK_BOUNDARY bits if the type requires it. This makes sure
2088 that both before and after the layout of each argument, the Next
2089 Stacked Argument Address (NSAA) will have a minimum alignment of
2090 8 bytes. */
2091
2092 static unsigned int
2093 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2094 {
2095 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2096
2097 if (alignment < PARM_BOUNDARY)
2098 alignment = PARM_BOUNDARY;
2099 if (alignment > STACK_BOUNDARY)
2100 alignment = STACK_BOUNDARY;
2101 return alignment;
2102 }
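/* For example (illustrative, assuming the usual PARM_BOUNDARY of 64 bits
   and STACK_BOUNDARY of 128 bits): a char or int argument gets the minimum
   64-bit boundary, a 16-byte-aligned aggregate gets 128 bits, and any
   larger alignment request is capped at STACK_BOUNDARY.  */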
2103
2104 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2105
2106 Return true if an argument passed on the stack should be padded upwards,
2107 i.e. if the least-significant byte of the stack slot has useful data.
2108
2109    Small aggregate types are placed at the lowest memory address.
2110
2111 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2112
2113 bool
2114 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2115 {
2116 /* On little-endian targets, the least significant byte of every stack
2117 argument is passed at the lowest byte address of the stack slot. */
2118 if (!BYTES_BIG_ENDIAN)
2119 return true;
2120
2121 /* Otherwise, integral, floating-point and pointer types are padded downward:
2122 the least significant byte of a stack argument is passed at the highest
2123 byte address of the stack slot. */
2124 if (type
2125 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2126 || POINTER_TYPE_P (type))
2127 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2128 return false;
2129
2130 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2131 return true;
2132 }
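/* Example (illustrative): on a big-endian target a 4-byte int in an 8-byte
   stack slot is padded downward, so the value occupies the four
   highest-addressed bytes of the slot, whereas a 4-byte struct is padded
   upward and occupies the four lowest-addressed bytes.  */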
2133
2134 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2135
2136    It specifies padding for the last (and possibly the only)
2137    element of a block move between registers and memory.  Assuming
2138    the block is in memory, padding upward means that the last
2139    element is padded after its most significant byte, while with
2140    downward padding the last element is padded on its least
2141    significant byte side.
2142
2143 Small aggregates and small complex types are always padded
2144 upwards.
2145
2146 We don't need to worry about homogeneous floating-point or
2147 short-vector aggregates; their move is not affected by the
2148 padding direction determined here. Regardless of endianness,
2149 each element of such an aggregate is put in the least
2150 significant bits of a fp/simd register.
2151
2152 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2153 register has useful data, and return the opposite if the most
2154 significant byte does. */
2155
2156 bool
2157 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2158 bool first ATTRIBUTE_UNUSED)
2159 {
2160
2161 /* Small composite types are always padded upward. */
2162 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2163 {
2164 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2165 : GET_MODE_SIZE (mode));
2166 if (size < 2 * UNITS_PER_WORD)
2167 return true;
2168 }
2169
2170 /* Otherwise, use the default padding. */
2171 return !BYTES_BIG_ENDIAN;
2172 }
2173
2174 static machine_mode
2175 aarch64_libgcc_cmp_return_mode (void)
2176 {
2177 return SImode;
2178 }
2179
2180 static bool
2181 aarch64_frame_pointer_required (void)
2182 {
2183 /* In aarch64_override_options_after_change
2184 flag_omit_leaf_frame_pointer turns off the frame pointer by
2185 default. Turn it back on now if we've not got a leaf
2186 function. */
2187 if (flag_omit_leaf_frame_pointer
2188 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2189 return true;
2190
2191 return false;
2192 }
2193
2194 /* Mark the registers that need to be saved by the callee and calculate
2195 the size of the callee-saved registers area and frame record (both FP
2196 and LR may be omitted). */
2197 static void
2198 aarch64_layout_frame (void)
2199 {
2200 HOST_WIDE_INT offset = 0;
2201 int regno;
2202
2203 if (reload_completed && cfun->machine->frame.laid_out)
2204 return;
2205
2206 #define SLOT_NOT_REQUIRED (-2)
2207 #define SLOT_REQUIRED (-1)
2208
2209 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2210 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2211
2212 /* First mark all the registers that really need to be saved... */
2213 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2214 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2215
2216 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2217 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2218
2219 /* ... that includes the eh data registers (if needed)... */
2220 if (crtl->calls_eh_return)
2221 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2222 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2223 = SLOT_REQUIRED;
2224
2225 /* ... and any callee saved register that dataflow says is live. */
2226 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2227 if (df_regs_ever_live_p (regno)
2228 && (regno == R30_REGNUM
2229 || !call_used_regs[regno]))
2230 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2231
2232 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2233 if (df_regs_ever_live_p (regno)
2234 && !call_used_regs[regno])
2235 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2236
2237 if (frame_pointer_needed)
2238 {
2239 /* FP and LR are placed in the linkage record. */
2240 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2241 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2242 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2243 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2244 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2245 offset += 2 * UNITS_PER_WORD;
2246 }
2247
2248 /* Now assign stack slots for them. */
2249 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2250 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2251 {
2252 cfun->machine->frame.reg_offset[regno] = offset;
2253 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2254 cfun->machine->frame.wb_candidate1 = regno;
2255 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2256 cfun->machine->frame.wb_candidate2 = regno;
2257 offset += UNITS_PER_WORD;
2258 }
2259
2260 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2261 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2262 {
2263 cfun->machine->frame.reg_offset[regno] = offset;
2264 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2265 cfun->machine->frame.wb_candidate1 = regno;
2266 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2267 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2268 cfun->machine->frame.wb_candidate2 = regno;
2269 offset += UNITS_PER_WORD;
2270 }
2271
2272 cfun->machine->frame.padding0 =
2273 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2274 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2275
2276 cfun->machine->frame.saved_regs_size = offset;
2277
2278 cfun->machine->frame.hard_fp_offset
2279 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2280 + get_frame_size ()
2281 + cfun->machine->frame.saved_regs_size,
2282 STACK_BOUNDARY / BITS_PER_UNIT);
2283
2284 cfun->machine->frame.frame_size
2285 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2286 + crtl->outgoing_args_size,
2287 STACK_BOUNDARY / BITS_PER_UNIT);
2288
2289 cfun->machine->frame.laid_out = true;
2290 }
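/* Worked example (illustrative figures only, assuming UNITS_PER_WORD == 8,
   a 16-byte STACK_BOUNDARY, and that x19 and d8 are the only live
   callee-saved registers): with the frame pointer needed we get
   reg_offset[x29] = 0, reg_offset[x30] = 8, reg_offset[x19] = 16 and
   reg_offset[d8] = 24, giving saved_regs_size = 32 with padding0 = 0.  */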
2291
2292 static bool
2293 aarch64_register_saved_on_entry (int regno)
2294 {
2295 return cfun->machine->frame.reg_offset[regno] >= 0;
2296 }
2297
2298 static unsigned
2299 aarch64_next_callee_save (unsigned regno, unsigned limit)
2300 {
2301 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2302 regno ++;
2303 return regno;
2304 }
2305
2306 static void
2307 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2308 HOST_WIDE_INT adjustment)
2309 {
2310 rtx base_rtx = stack_pointer_rtx;
2311 rtx insn, reg, mem;
2312
2313 reg = gen_rtx_REG (mode, regno);
2314 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2315 plus_constant (Pmode, base_rtx, -adjustment));
2316 mem = gen_rtx_MEM (mode, mem);
2317
2318 insn = emit_move_insn (mem, reg);
2319 RTX_FRAME_RELATED_P (insn) = 1;
2320 }
2321
2322 static rtx
2323 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2324 HOST_WIDE_INT adjustment)
2325 {
2326 switch (mode)
2327 {
2328 case DImode:
2329 return gen_storewb_pairdi_di (base, base, reg, reg2,
2330 GEN_INT (-adjustment),
2331 GEN_INT (UNITS_PER_WORD - adjustment));
2332 case DFmode:
2333 return gen_storewb_pairdf_di (base, base, reg, reg2,
2334 GEN_INT (-adjustment),
2335 GEN_INT (UNITS_PER_WORD - adjustment));
2336 default:
2337 gcc_unreachable ();
2338 }
2339 }
2340
2341 static void
2342 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2343 unsigned regno2, HOST_WIDE_INT adjustment)
2344 {
2345 rtx_insn *insn;
2346 rtx reg1 = gen_rtx_REG (mode, regno1);
2347 rtx reg2 = gen_rtx_REG (mode, regno2);
2348
2349 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2350 reg2, adjustment));
2351 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2352 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2353 RTX_FRAME_RELATED_P (insn) = 1;
2354 }
2355
2356 static rtx
2357 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2358 HOST_WIDE_INT adjustment)
2359 {
2360 switch (mode)
2361 {
2362 case DImode:
2363 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2364 GEN_INT (UNITS_PER_WORD));
2365 case DFmode:
2366 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2367 GEN_INT (UNITS_PER_WORD));
2368 default:
2369 gcc_unreachable ();
2370 }
2371 }
2372
2373 static rtx
2374 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2375 rtx reg2)
2376 {
2377 switch (mode)
2378 {
2379 case DImode:
2380 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2381
2382 case DFmode:
2383 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2384
2385 default:
2386 gcc_unreachable ();
2387 }
2388 }
2389
2390 static rtx
2391 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2392 rtx mem2)
2393 {
2394 switch (mode)
2395 {
2396 case DImode:
2397 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2398
2399 case DFmode:
2400 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2401
2402 default:
2403 gcc_unreachable ();
2404 }
2405 }
2406
2407
2408 static void
2409 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2410 unsigned start, unsigned limit, bool skip_wb)
2411 {
2412 rtx_insn *insn;
2413 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2414 ? gen_frame_mem : gen_rtx_MEM);
2415 unsigned regno;
2416 unsigned regno2;
2417
2418 for (regno = aarch64_next_callee_save (start, limit);
2419 regno <= limit;
2420 regno = aarch64_next_callee_save (regno + 1, limit))
2421 {
2422 rtx reg, mem;
2423 HOST_WIDE_INT offset;
2424
2425 if (skip_wb
2426 && (regno == cfun->machine->frame.wb_candidate1
2427 || regno == cfun->machine->frame.wb_candidate2))
2428 continue;
2429
2430 reg = gen_rtx_REG (mode, regno);
2431 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2432 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2433 offset));
2434
2435 regno2 = aarch64_next_callee_save (regno + 1, limit);
2436
2437 if (regno2 <= limit
2438 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2439 == cfun->machine->frame.reg_offset[regno2]))
2440
2441 {
2442 rtx reg2 = gen_rtx_REG (mode, regno2);
2443 rtx mem2;
2444
2445 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2446 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2447 offset));
2448 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2449 reg2));
2450
2451 /* The first part of a frame-related parallel insn is
2452 always assumed to be relevant to the frame
2453	     calculations; subsequent parts are only
2454 frame-related if explicitly marked. */
2455 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2456 regno = regno2;
2457 }
2458 else
2459 insn = emit_move_insn (mem, reg);
2460
2461 RTX_FRAME_RELATED_P (insn) = 1;
2462 }
2463 }
2464
2465 static void
2466 aarch64_restore_callee_saves (machine_mode mode,
2467 HOST_WIDE_INT start_offset, unsigned start,
2468 unsigned limit, bool skip_wb, rtx *cfi_ops)
2469 {
2470 rtx base_rtx = stack_pointer_rtx;
2471 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2472 ? gen_frame_mem : gen_rtx_MEM);
2473 unsigned regno;
2474 unsigned regno2;
2475 HOST_WIDE_INT offset;
2476
2477 for (regno = aarch64_next_callee_save (start, limit);
2478 regno <= limit;
2479 regno = aarch64_next_callee_save (regno + 1, limit))
2480 {
2481 rtx reg, mem;
2482
2483 if (skip_wb
2484 && (regno == cfun->machine->frame.wb_candidate1
2485 || regno == cfun->machine->frame.wb_candidate2))
2486 continue;
2487
2488 reg = gen_rtx_REG (mode, regno);
2489 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2490 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2491
2492 regno2 = aarch64_next_callee_save (regno + 1, limit);
2493
2494 if (regno2 <= limit
2495 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2496 == cfun->machine->frame.reg_offset[regno2]))
2497 {
2498 rtx reg2 = gen_rtx_REG (mode, regno2);
2499 rtx mem2;
2500
2501 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2502 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2503 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2504
2505 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2506 regno = regno2;
2507 }
2508 else
2509 emit_move_insn (reg, mem);
2510 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2511 }
2512 }
2513
2514 /* AArch64 stack frames generated by this compiler look like:
2515
2516 +-------------------------------+
2517 | |
2518 | incoming stack arguments |
2519 | |
2520 +-------------------------------+
2521 | | <-- incoming stack pointer (aligned)
2522 | callee-allocated save area |
2523 | for register varargs |
2524 | |
2525 +-------------------------------+
2526 | local variables | <-- frame_pointer_rtx
2527 | |
2528 +-------------------------------+
2529 | padding0 | \
2530 +-------------------------------+ |
2531 | callee-saved registers | | frame.saved_regs_size
2532 +-------------------------------+ |
2533 | LR' | |
2534 +-------------------------------+ |
2535 | FP' | / <- hard_frame_pointer_rtx (aligned)
2536 +-------------------------------+
2537 | dynamic allocation |
2538 +-------------------------------+
2539 | padding |
2540 +-------------------------------+
2541 | outgoing stack arguments | <-- arg_pointer
2542 | |
2543 +-------------------------------+
2544 | | <-- stack_pointer_rtx (aligned)
2545
2546 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2547 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2548 unchanged. */
2549
2550 /* Generate the prologue instructions for entry into a function.
2551 Establish the stack frame by decreasing the stack pointer with a
2552 properly calculated size and, if necessary, create a frame record
2553 filled with the values of LR and previous frame pointer. The
2554 current FP is also set up if it is in use. */
2555
2556 void
2557 aarch64_expand_prologue (void)
2558 {
2559 /* sub sp, sp, #<frame_size>
2560 stp {fp, lr}, [sp, #<frame_size> - 16]
2561 add fp, sp, #<frame_size> - hardfp_offset
2562 stp {cs_reg}, [fp, #-16] etc.
2563
2564 sub sp, sp, <final_adjustment_if_any>
2565 */
2566 HOST_WIDE_INT frame_size, offset;
2567 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2568 HOST_WIDE_INT hard_fp_offset;
2569 rtx_insn *insn;
2570
2571 aarch64_layout_frame ();
2572
2573 offset = frame_size = cfun->machine->frame.frame_size;
2574 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2575 fp_offset = frame_size - hard_fp_offset;
2576
2577 if (flag_stack_usage_info)
2578 current_function_static_stack_size = frame_size;
2579
2580   /* Store pairs and load pairs have a range of only -512 to 504.  */
2581 if (offset >= 512)
2582 {
2583       /* When the frame is large, an initial decrease is made to the
2584	 stack pointer to skip over the callee-allocated save area for
2585	 register varargs, the local variable area and/or the callee-saved
2586	 register area.  This allows the pre-indexed write-back store
2587	 pair instructions to be used to set up the stack frame
2588	 efficiently.  */
2589 offset = hard_fp_offset;
2590 if (offset >= 512)
2591 offset = cfun->machine->frame.saved_regs_size;
2592
2593 frame_size -= (offset + crtl->outgoing_args_size);
2594 fp_offset = 0;
2595
2596 if (frame_size >= 0x1000000)
2597 {
2598 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2599 emit_move_insn (op0, GEN_INT (-frame_size));
2600 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2601
2602 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2603 gen_rtx_SET (stack_pointer_rtx,
2604 plus_constant (Pmode, stack_pointer_rtx,
2605 -frame_size)));
2606 RTX_FRAME_RELATED_P (insn) = 1;
2607 }
2608 else if (frame_size > 0)
2609 {
2610 int hi_ofs = frame_size & 0xfff000;
2611 int lo_ofs = frame_size & 0x000fff;
2612
2613 if (hi_ofs)
2614 {
2615 insn = emit_insn (gen_add2_insn
2616 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2617 RTX_FRAME_RELATED_P (insn) = 1;
2618 }
2619 if (lo_ofs)
2620 {
2621 insn = emit_insn (gen_add2_insn
2622 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2623 RTX_FRAME_RELATED_P (insn) = 1;
2624 }
2625 }
2626 }
2627 else
2628 frame_size = -1;
2629
2630 if (offset > 0)
2631 {
2632 bool skip_wb = false;
2633
2634 if (frame_pointer_needed)
2635 {
2636 skip_wb = true;
2637
2638 if (fp_offset)
2639 {
2640 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2641 GEN_INT (-offset)));
2642 RTX_FRAME_RELATED_P (insn) = 1;
2643
2644 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2645 R30_REGNUM, false);
2646 }
2647 else
2648 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2649
2650 /* Set up frame pointer to point to the location of the
2651 previous frame pointer on the stack. */
2652 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2653 stack_pointer_rtx,
2654 GEN_INT (fp_offset)));
2655 RTX_FRAME_RELATED_P (insn) = 1;
2656 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2657 }
2658 else
2659 {
2660 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2661 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2662
2663 if (fp_offset
2664 || reg1 == FIRST_PSEUDO_REGISTER
2665 || (reg2 == FIRST_PSEUDO_REGISTER
2666 && offset >= 256))
2667 {
2668 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2669 GEN_INT (-offset)));
2670 RTX_FRAME_RELATED_P (insn) = 1;
2671 }
2672 else
2673 {
2674 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2675
2676 skip_wb = true;
2677
2678 if (reg2 == FIRST_PSEUDO_REGISTER)
2679 aarch64_pushwb_single_reg (mode1, reg1, offset);
2680 else
2681 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2682 }
2683 }
2684
2685 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2686 skip_wb);
2687 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2688 skip_wb);
2689 }
2690
2691   /* When offset >= 512:
2692      sub sp, sp, #<outgoing_args_size>  */
2693 if (frame_size > -1)
2694 {
2695 if (crtl->outgoing_args_size > 0)
2696 {
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx,
2699 GEN_INT (- crtl->outgoing_args_size)));
2700 RTX_FRAME_RELATED_P (insn) = 1;
2701 }
2702 }
2703 }
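/* For a small frame the sequence above typically reduces to something like
   (illustrative only; exact offsets and registers depend on the function):

	stp	x29, x30, [sp, -32]!
	add	x29, sp, 0
	str	x19, [sp, 16]

   i.e. one pre-indexed store pair that both allocates the frame and saves
   the frame record, followed by the remaining callee saves.  */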
2704
2705 /* Return TRUE if we can use a simple_return insn.
2706
2707    This function checks whether the callee-saved stack is empty, which
2708    means no restore actions are needed.  The pro_and_epilogue pass will
2709    use this to check whether the shrink-wrapping optimization is feasible.  */
2710
2711 bool
2712 aarch64_use_return_insn_p (void)
2713 {
2714 if (!reload_completed)
2715 return false;
2716
2717 if (crtl->profile)
2718 return false;
2719
2720 aarch64_layout_frame ();
2721
2722 return cfun->machine->frame.frame_size == 0;
2723 }
2724
2725 /* Generate the epilogue instructions for returning from a function. */
2726 void
2727 aarch64_expand_epilogue (bool for_sibcall)
2728 {
2729 HOST_WIDE_INT frame_size, offset;
2730 HOST_WIDE_INT fp_offset;
2731 HOST_WIDE_INT hard_fp_offset;
2732 rtx_insn *insn;
2733   /* We need a memory barrier to prevent reads from the deallocated stack.  */
2734 bool need_barrier_p = (get_frame_size () != 0
2735 || cfun->machine->frame.saved_varargs_size);
2736
2737 aarch64_layout_frame ();
2738
2739 offset = frame_size = cfun->machine->frame.frame_size;
2740 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2741 fp_offset = frame_size - hard_fp_offset;
2742
2743   /* Store pairs and load pairs have a range of only -512 to 504.  */
2744 if (offset >= 512)
2745 {
2746 offset = hard_fp_offset;
2747 if (offset >= 512)
2748 offset = cfun->machine->frame.saved_regs_size;
2749
2750 frame_size -= (offset + crtl->outgoing_args_size);
2751 fp_offset = 0;
2752 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2753 {
2754 insn = emit_insn (gen_add2_insn
2755 (stack_pointer_rtx,
2756 GEN_INT (crtl->outgoing_args_size)));
2757 RTX_FRAME_RELATED_P (insn) = 1;
2758 }
2759 }
2760 else
2761 frame_size = -1;
2762
2763 /* If there were outgoing arguments or we've done dynamic stack
2764 allocation, then restore the stack pointer from the frame
2765 pointer. This is at most one insn and more efficient than using
2766 GCC's internal mechanism. */
2767 if (frame_pointer_needed
2768 && (crtl->outgoing_args_size || cfun->calls_alloca))
2769 {
2770 if (cfun->calls_alloca)
2771 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2772
2773 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2774 hard_frame_pointer_rtx,
2775 GEN_INT (0)));
2776 offset = offset - fp_offset;
2777 }
2778
2779 if (offset > 0)
2780 {
2781 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2782 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2783 bool skip_wb = true;
2784 rtx cfi_ops = NULL;
2785
2786 if (frame_pointer_needed)
2787 fp_offset = 0;
2788 else if (fp_offset
2789 || reg1 == FIRST_PSEUDO_REGISTER
2790 || (reg2 == FIRST_PSEUDO_REGISTER
2791 && offset >= 256))
2792 skip_wb = false;
2793
2794 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2795 skip_wb, &cfi_ops);
2796 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2797 skip_wb, &cfi_ops);
2798
2799 if (need_barrier_p)
2800 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2801
2802 if (skip_wb)
2803 {
2804 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2805 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2806
2807 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2808 if (reg2 == FIRST_PSEUDO_REGISTER)
2809 {
2810 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2811 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2812 mem = gen_rtx_MEM (mode1, mem);
2813 insn = emit_move_insn (rreg1, mem);
2814 }
2815 else
2816 {
2817 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2818
2819 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2820 insn = emit_insn (aarch64_gen_loadwb_pair
2821 (mode1, stack_pointer_rtx, rreg1,
2822 rreg2, offset));
2823 }
2824 }
2825 else
2826 {
2827 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2828 GEN_INT (offset)));
2829 }
2830
2831 /* Reset the CFA to be SP + FRAME_SIZE. */
2832 rtx new_cfa = stack_pointer_rtx;
2833 if (frame_size > 0)
2834 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2835 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2836 REG_NOTES (insn) = cfi_ops;
2837 RTX_FRAME_RELATED_P (insn) = 1;
2838 }
2839
2840 if (frame_size > 0)
2841 {
2842 if (need_barrier_p)
2843 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2844
2845 if (frame_size >= 0x1000000)
2846 {
2847 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2848 emit_move_insn (op0, GEN_INT (frame_size));
2849 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2850 }
2851 else
2852 {
2853 int hi_ofs = frame_size & 0xfff000;
2854 int lo_ofs = frame_size & 0x000fff;
2855
2856 if (hi_ofs && lo_ofs)
2857 {
2858 insn = emit_insn (gen_add2_insn
2859 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2860 RTX_FRAME_RELATED_P (insn) = 1;
2861 frame_size = lo_ofs;
2862 }
2863 insn = emit_insn (gen_add2_insn
2864 (stack_pointer_rtx, GEN_INT (frame_size)));
2865 }
2866
2867 /* Reset the CFA to be SP + 0. */
2868 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2869 RTX_FRAME_RELATED_P (insn) = 1;
2870 }
2871
2872 /* Stack adjustment for exception handler. */
2873 if (crtl->calls_eh_return)
2874 {
2875 /* We need to unwind the stack by the offset computed by
2876 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2877 to be SP; letting the CFA move during this adjustment
2878 is just as correct as retaining the CFA from the body
2879 of the function. Therefore, do nothing special. */
2880 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2881 }
2882
2883 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2884 if (!for_sibcall)
2885 emit_jump_insn (ret_rtx);
2886 }
2887
2888 /* Return the place to copy the exception unwinding return address to.
2889    This will probably be a stack slot, but could (in theory) be the
2890    return register.  */
2891 rtx
2892 aarch64_final_eh_return_addr (void)
2893 {
2894 HOST_WIDE_INT fp_offset;
2895
2896 aarch64_layout_frame ();
2897
2898 fp_offset = cfun->machine->frame.frame_size
2899 - cfun->machine->frame.hard_fp_offset;
2900
2901 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2902 return gen_rtx_REG (DImode, LR_REGNUM);
2903
2904 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2905 result in a store to save LR introduced by builtin_eh_return () being
2906 incorrectly deleted because the alias is not detected.
2907 So in the calculation of the address to copy the exception unwinding
2908 return address to, we note 2 cases.
2909    return address to, we distinguish two cases.
2910 we return a SP-relative location since all the addresses are SP-relative
2911 in this case. This prevents the store from being optimized away.
2912 If the fp_offset is not 0, then the addresses will be FP-relative and
2913 therefore we return a FP-relative location. */
2914
2915 if (frame_pointer_needed)
2916 {
2917 if (fp_offset)
2918 return gen_frame_mem (DImode,
2919 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2920 else
2921 return gen_frame_mem (DImode,
2922 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2923 }
2924
2925 /* If FP is not needed, we calculate the location of LR, which would be
2926 at the top of the saved registers block. */
2927
2928 return gen_frame_mem (DImode,
2929 plus_constant (Pmode,
2930 stack_pointer_rtx,
2931 fp_offset
2932 + cfun->machine->frame.saved_regs_size
2933 - 2 * UNITS_PER_WORD));
2934 }
2935
2936 /* Possibly output code to build up a constant in a register. For
2937 the benefit of the costs infrastructure, returns the number of
2938 instructions which would be emitted. GENERATE inhibits or
2939 enables code generation. */
2940
2941 static int
2942 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2943 {
2944 int insns = 0;
2945
2946 if (aarch64_bitmask_imm (val, DImode))
2947 {
2948 if (generate)
2949 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2950 insns = 1;
2951 }
2952 else
2953 {
2954 int i;
2955 int ncount = 0;
2956 int zcount = 0;
2957 HOST_WIDE_INT valp = val >> 16;
2958 HOST_WIDE_INT valm;
2959 HOST_WIDE_INT tval;
2960
2961 for (i = 16; i < 64; i += 16)
2962 {
2963 valm = (valp & 0xffff);
2964
2965 if (valm != 0)
2966 ++ zcount;
2967
2968 if (valm != 0xffff)
2969 ++ ncount;
2970
2971 valp >>= 16;
2972 }
2973
2974 /* zcount contains the number of additional MOVK instructions
2975 required if the constant is built up with an initial MOVZ instruction,
2976 while ncount is the number of MOVK instructions required if starting
2977	 with a MOVN instruction.  Choose the sequence that yields the
2978	 fewest instructions, preferring MOVZ instructions when the two
2979	 counts are equal.  */
2980 if (ncount < zcount)
2981 {
2982 if (generate)
2983 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2984 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2985 tval = 0xffff;
2986 insns++;
2987 }
2988 else
2989 {
2990 if (generate)
2991 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2992 GEN_INT (val & 0xffff));
2993 tval = 0;
2994 insns++;
2995 }
2996
2997 val >>= 16;
2998
2999 for (i = 16; i < 64; i += 16)
3000 {
3001 if ((val & 0xffff) != tval)
3002 {
3003 if (generate)
3004 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3005 GEN_INT (i),
3006 GEN_INT (val & 0xffff)));
3007 insns++;
3008 }
3009 val >>= 16;
3010 }
3011 }
3012 return insns;
3013 }
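/* Example (illustrative; the register name is arbitrary): for
   VAL == 0x123400005678 only one 16-bit chunk besides the bottom one is
   non-zero, so the MOVZ path wins and the sequence is roughly

	mov	xN, 0x5678
	movk	xN, 0x1234, lsl 32

   and the function returns 2.  */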
3014
3015 static void
3016 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3017 {
3018 HOST_WIDE_INT mdelta = delta;
3019 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3020 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3021
3022 if (mdelta < 0)
3023 mdelta = -mdelta;
3024
3025 if (mdelta >= 4096 * 4096)
3026 {
3027 (void) aarch64_build_constant (scratchreg, delta, true);
3028 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3029 }
3030 else if (mdelta > 0)
3031 {
3032 if (mdelta >= 4096)
3033 {
3034 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3035 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3036 if (delta < 0)
3037 emit_insn (gen_rtx_SET (this_rtx,
3038 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3039 else
3040 emit_insn (gen_rtx_SET (this_rtx,
3041 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3042 }
3043 if (mdelta % 4096 != 0)
3044 {
3045 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3046 emit_insn (gen_rtx_SET (this_rtx,
3047 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3048 }
3049 }
3050 }
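/* Example (illustrative; xDST and xSCRATCH stand for the REGNUM and
   SCRATCHREG arguments): for DELTA == 5000 the code emits roughly

	mov	xSCRATCH, 1
	add	xDST, xDST, xSCRATCH, lsl 12
	add	xDST, xDST, 904

   i.e. the multiple-of-4096 part is added via the scratch register and the
   remainder (5000 % 4096 == 904) as a plain immediate.  */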
3051
3052 /* Output code to add DELTA to the first argument, and then jump
3053 to FUNCTION. Used for C++ multiple inheritance. */
3054 static void
3055 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3056 HOST_WIDE_INT delta,
3057 HOST_WIDE_INT vcall_offset,
3058 tree function)
3059 {
3060 /* The this pointer is always in x0. Note that this differs from
3061      Arm where the this pointer may be bumped to r1 if r0 is required
3062 to return a pointer to an aggregate. On AArch64 a result value
3063 pointer will be in x8. */
3064 int this_regno = R0_REGNUM;
3065 rtx this_rtx, temp0, temp1, addr, funexp;
3066 rtx_insn *insn;
3067
3068 reload_completed = 1;
3069 emit_note (NOTE_INSN_PROLOGUE_END);
3070
3071 if (vcall_offset == 0)
3072 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3073 else
3074 {
3075 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3076
3077 this_rtx = gen_rtx_REG (Pmode, this_regno);
3078 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3079 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3080
3081 addr = this_rtx;
3082 if (delta != 0)
3083 {
3084 if (delta >= -256 && delta < 256)
3085 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3086 plus_constant (Pmode, this_rtx, delta));
3087 else
3088 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3089 }
3090
3091 if (Pmode == ptr_mode)
3092 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3093 else
3094 aarch64_emit_move (temp0,
3095 gen_rtx_ZERO_EXTEND (Pmode,
3096 gen_rtx_MEM (ptr_mode, addr)));
3097
3098 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3099 addr = plus_constant (Pmode, temp0, vcall_offset);
3100 else
3101 {
3102 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3103 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3104 }
3105
3106 if (Pmode == ptr_mode)
3107 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3108 else
3109 aarch64_emit_move (temp1,
3110 gen_rtx_SIGN_EXTEND (Pmode,
3111 gen_rtx_MEM (ptr_mode, addr)));
3112
3113 emit_insn (gen_add2_insn (this_rtx, temp1));
3114 }
3115
3116 /* Generate a tail call to the target function. */
3117 if (!TREE_USED (function))
3118 {
3119 assemble_external (function);
3120 TREE_USED (function) = 1;
3121 }
3122 funexp = XEXP (DECL_RTL (function), 0);
3123 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3124 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3125 SIBLING_CALL_P (insn) = 1;
3126
3127 insn = get_insns ();
3128 shorten_branches (insn);
3129 final_start_function (insn, file, 1);
3130 final (insn, file, 1);
3131 final_end_function ();
3132
3133 /* Stop pretending to be a post-reload pass. */
3134 reload_completed = 0;
3135 }
3136
3137 static bool
3138 aarch64_tls_referenced_p (rtx x)
3139 {
3140 if (!TARGET_HAVE_TLS)
3141 return false;
3142 subrtx_iterator::array_type array;
3143 FOR_EACH_SUBRTX (iter, array, x, ALL)
3144 {
3145 const_rtx x = *iter;
3146 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3147 return true;
3148 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3149 TLS offsets, not real symbol references. */
3150 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3151 iter.skip_subrtxes ();
3152 }
3153 return false;
3154 }
3155
3156
3157 static int
3158 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3159 {
3160 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3161 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3162
3163 if (*imm1 < *imm2)
3164 return -1;
3165 if (*imm1 > *imm2)
3166 return +1;
3167 return 0;
3168 }
3169
3170
3171 static void
3172 aarch64_build_bitmask_table (void)
3173 {
3174 unsigned HOST_WIDE_INT mask, imm;
3175 unsigned int log_e, e, s, r;
3176 unsigned int nimms = 0;
3177
3178 for (log_e = 1; log_e <= 6; log_e++)
3179 {
3180 e = 1 << log_e;
3181 if (e == 64)
3182 mask = ~(HOST_WIDE_INT) 0;
3183 else
3184 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3185 for (s = 1; s < e; s++)
3186 {
3187 for (r = 0; r < e; r++)
3188 {
3189 /* set s consecutive bits to 1 (s < 64) */
3190 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3191 /* rotate right by r */
3192 if (r != 0)
3193 imm = ((imm >> r) | (imm << (e - r))) & mask;
3194 /* replicate the constant depending on SIMD size */
3195 switch (log_e) {
3196 case 1: imm |= (imm << 2);
3197 case 2: imm |= (imm << 4);
3198 case 3: imm |= (imm << 8);
3199 case 4: imm |= (imm << 16);
3200 case 5: imm |= (imm << 32);
3201 case 6:
3202 break;
3203 default:
3204 gcc_unreachable ();
3205 }
3206 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3207 aarch64_bitmasks[nimms++] = imm;
3208 }
3209 }
3210 }
3211
3212 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3213 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3214 aarch64_bitmasks_cmp);
3215 }
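/* Example of one table entry (illustrative): e == 8, s == 3, r == 1 gives
   the 8-bit element 0b10000011 (0x83), which the fall-through replication
   above widens to the 64-bit bitmask immediate 0x8383838383838383.  */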
3216
3217
3218 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3219 a left shift of 0 or 12 bits. */
3220 bool
3221 aarch64_uimm12_shift (HOST_WIDE_INT val)
3222 {
3223 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3224 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3225 );
3226 }
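/* For example (illustrative): 0xfff and 0xabc000 (0xabc << 12) are both
   encodable, while 0x1001 is not, since it has non-zero bits both inside
   and above the low 12 bits at the same time.  */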
3227
3228
3229 /* Return true if val is an immediate that can be loaded into a
3230 register by a MOVZ instruction. */
3231 static bool
3232 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3233 {
3234 if (GET_MODE_SIZE (mode) > 4)
3235 {
3236 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3237 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3238 return 1;
3239 }
3240 else
3241 {
3242 /* Ignore sign extension. */
3243 val &= (HOST_WIDE_INT) 0xffffffff;
3244 }
3245 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3246 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3247 }
3248
3249
3250 /* Return true if val is a valid bitmask immediate. */
3251 bool
3252 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3253 {
3254 if (GET_MODE_SIZE (mode) < 8)
3255 {
3256 /* Replicate bit pattern. */
3257 val &= (HOST_WIDE_INT) 0xffffffff;
3258 val |= val << 32;
3259 }
3260 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3261 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3262 }
3263
3264
3265 /* Return true if val is an immediate that can be loaded into a
3266 register in a single instruction. */
3267 bool
3268 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3269 {
3270 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3271 return 1;
3272 return aarch64_bitmask_imm (val, mode);
3273 }
3274
3275 static bool
3276 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3277 {
3278 rtx base, offset;
3279
3280 if (GET_CODE (x) == HIGH)
3281 return true;
3282
3283 split_const (x, &base, &offset);
3284 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3285 {
3286 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3287 != SYMBOL_FORCE_TO_MEM)
3288 return true;
3289 else
3290 /* Avoid generating a 64-bit relocation in ILP32; leave
3291	   it to aarch64_expand_mov_immediate to handle properly.  */
3292 return mode != ptr_mode;
3293 }
3294
3295 return aarch64_tls_referenced_p (x);
3296 }
3297
3298 /* Return true if register REGNO is a valid index register.
3299 STRICT_P is true if REG_OK_STRICT is in effect. */
3300
3301 bool
3302 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3303 {
3304 if (!HARD_REGISTER_NUM_P (regno))
3305 {
3306 if (!strict_p)
3307 return true;
3308
3309 if (!reg_renumber)
3310 return false;
3311
3312 regno = reg_renumber[regno];
3313 }
3314 return GP_REGNUM_P (regno);
3315 }
3316
3317 /* Return true if register REGNO is a valid base register.
3318    STRICT_P is true if REG_OK_STRICT is in effect.  */
3319
3320 bool
3321 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3322 {
3323 if (!HARD_REGISTER_NUM_P (regno))
3324 {
3325 if (!strict_p)
3326 return true;
3327
3328 if (!reg_renumber)
3329 return false;
3330
3331 regno = reg_renumber[regno];
3332 }
3333
3334 /* The fake registers will be eliminated to either the stack or
3335 hard frame pointer, both of which are usually valid base registers.
3336 Reload deals with the cases where the eliminated form isn't valid. */
3337 return (GP_REGNUM_P (regno)
3338 || regno == SP_REGNUM
3339 || regno == FRAME_POINTER_REGNUM
3340 || regno == ARG_POINTER_REGNUM);
3341 }
3342
3343 /* Return true if X is a valid base register.
3344    STRICT_P is true if REG_OK_STRICT is in effect.  */
3345
3346 static bool
3347 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3348 {
3349 if (!strict_p && GET_CODE (x) == SUBREG)
3350 x = SUBREG_REG (x);
3351
3352 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3353 }
3354
3355 /* Return true if address offset is a valid index. If it is, fill in INFO
3356 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3357
3358 static bool
3359 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3360 machine_mode mode, bool strict_p)
3361 {
3362 enum aarch64_address_type type;
3363 rtx index;
3364 int shift;
3365
3366 /* (reg:P) */
3367 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3368 && GET_MODE (x) == Pmode)
3369 {
3370 type = ADDRESS_REG_REG;
3371 index = x;
3372 shift = 0;
3373 }
3374 /* (sign_extend:DI (reg:SI)) */
3375 else if ((GET_CODE (x) == SIGN_EXTEND
3376 || GET_CODE (x) == ZERO_EXTEND)
3377 && GET_MODE (x) == DImode
3378 && GET_MODE (XEXP (x, 0)) == SImode)
3379 {
3380 type = (GET_CODE (x) == SIGN_EXTEND)
3381 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3382 index = XEXP (x, 0);
3383 shift = 0;
3384 }
3385 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3386 else if (GET_CODE (x) == MULT
3387 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3388 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3389 && GET_MODE (XEXP (x, 0)) == DImode
3390 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3391 && CONST_INT_P (XEXP (x, 1)))
3392 {
3393 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3394 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3395 index = XEXP (XEXP (x, 0), 0);
3396 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3397 }
3398 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3399 else if (GET_CODE (x) == ASHIFT
3400 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3401 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3402 && GET_MODE (XEXP (x, 0)) == DImode
3403 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3404 && CONST_INT_P (XEXP (x, 1)))
3405 {
3406 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3407 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3408 index = XEXP (XEXP (x, 0), 0);
3409 shift = INTVAL (XEXP (x, 1));
3410 }
3411 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3412 else if ((GET_CODE (x) == SIGN_EXTRACT
3413 || GET_CODE (x) == ZERO_EXTRACT)
3414 && GET_MODE (x) == DImode
3415 && GET_CODE (XEXP (x, 0)) == MULT
3416 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3417 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3418 {
3419 type = (GET_CODE (x) == SIGN_EXTRACT)
3420 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3421 index = XEXP (XEXP (x, 0), 0);
3422 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3423 if (INTVAL (XEXP (x, 1)) != 32 + shift
3424 || INTVAL (XEXP (x, 2)) != 0)
3425 shift = -1;
3426 }
3427 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3428 (const_int 0xffffffff<<shift)) */
3429 else if (GET_CODE (x) == AND
3430 && GET_MODE (x) == DImode
3431 && GET_CODE (XEXP (x, 0)) == MULT
3432 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3433 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3434 && CONST_INT_P (XEXP (x, 1)))
3435 {
3436 type = ADDRESS_REG_UXTW;
3437 index = XEXP (XEXP (x, 0), 0);
3438 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3439 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3440 shift = -1;
3441 }
3442 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3443 else if ((GET_CODE (x) == SIGN_EXTRACT
3444 || GET_CODE (x) == ZERO_EXTRACT)
3445 && GET_MODE (x) == DImode
3446 && GET_CODE (XEXP (x, 0)) == ASHIFT
3447 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3448 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3449 {
3450 type = (GET_CODE (x) == SIGN_EXTRACT)
3451 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3452 index = XEXP (XEXP (x, 0), 0);
3453 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3454 if (INTVAL (XEXP (x, 1)) != 32 + shift
3455 || INTVAL (XEXP (x, 2)) != 0)
3456 shift = -1;
3457 }
3458 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3459 (const_int 0xffffffff<<shift)) */
3460 else if (GET_CODE (x) == AND
3461 && GET_MODE (x) == DImode
3462 && GET_CODE (XEXP (x, 0)) == ASHIFT
3463 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3464 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3465 && CONST_INT_P (XEXP (x, 1)))
3466 {
3467 type = ADDRESS_REG_UXTW;
3468 index = XEXP (XEXP (x, 0), 0);
3469 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3470 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3471 shift = -1;
3472 }
3473 /* (mult:P (reg:P) (const_int scale)) */
3474 else if (GET_CODE (x) == MULT
3475 && GET_MODE (x) == Pmode
3476 && GET_MODE (XEXP (x, 0)) == Pmode
3477 && CONST_INT_P (XEXP (x, 1)))
3478 {
3479 type = ADDRESS_REG_REG;
3480 index = XEXP (x, 0);
3481 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3482 }
3483 /* (ashift:P (reg:P) (const_int shift)) */
3484 else if (GET_CODE (x) == ASHIFT
3485 && GET_MODE (x) == Pmode
3486 && GET_MODE (XEXP (x, 0)) == Pmode
3487 && CONST_INT_P (XEXP (x, 1)))
3488 {
3489 type = ADDRESS_REG_REG;
3490 index = XEXP (x, 0);
3491 shift = INTVAL (XEXP (x, 1));
3492 }
3493 else
3494 return false;
3495
3496 if (GET_CODE (index) == SUBREG)
3497 index = SUBREG_REG (index);
3498
3499 if ((shift == 0 ||
3500 (shift > 0 && shift <= 3
3501 && (1 << shift) == GET_MODE_SIZE (mode)))
3502 && REG_P (index)
3503 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3504 {
3505 info->type = type;
3506 info->offset = index;
3507 info->shift = shift;
3508 return true;
3509 }
3510
3511 return false;
3512 }
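/* Example (illustrative): for an SImode access the index expression
   (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4)) is accepted here
   as ADDRESS_REG_SXTW with shift == 2, i.e. the [base, w1, sxtw 2]
   addressing form.  */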
3513
3514 bool
3515 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3516 {
3517 return (offset >= -64 * GET_MODE_SIZE (mode)
3518 && offset < 64 * GET_MODE_SIZE (mode)
3519 && offset % GET_MODE_SIZE (mode) == 0);
3520 }
3521
3522 static inline bool
3523 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3524 HOST_WIDE_INT offset)
3525 {
3526 return offset >= -256 && offset < 256;
3527 }
3528
3529 static inline bool
3530 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3531 {
3532 return (offset >= 0
3533 && offset < 4096 * GET_MODE_SIZE (mode)
3534 && offset % GET_MODE_SIZE (mode) == 0);
3535 }
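/* Illustrative ranges for a DImode (8-byte) access: the 7-bit signed scaled
   form covers -512 .. 504 in steps of 8, the 9-bit signed unscaled form
   covers -256 .. 255, and the 12-bit unsigned scaled form covers
   0 .. 32760 in steps of 8.  */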
3536
3537 /* Return true if X is a valid address for machine mode MODE. If it is,
3538 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3539 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3540
3541 static bool
3542 aarch64_classify_address (struct aarch64_address_info *info,
3543 rtx x, machine_mode mode,
3544 RTX_CODE outer_code, bool strict_p)
3545 {
3546 enum rtx_code code = GET_CODE (x);
3547 rtx op0, op1;
3548
3549 /* On BE, we use load/store pair for all large int mode load/stores. */
3550 bool load_store_pair_p = (outer_code == PARALLEL
3551 || (BYTES_BIG_ENDIAN
3552 && aarch64_vect_struct_mode_p (mode)));
3553
3554 bool allow_reg_index_p =
3555 !load_store_pair_p
3556 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3557 && !aarch64_vect_struct_mode_p (mode);
3558
3559 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3560 REG addressing. */
3561 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3562 && (code != POST_INC && code != REG))
3563 return false;
3564
3565 switch (code)
3566 {
3567 case REG:
3568 case SUBREG:
3569 info->type = ADDRESS_REG_IMM;
3570 info->base = x;
3571 info->offset = const0_rtx;
3572 return aarch64_base_register_rtx_p (x, strict_p);
3573
3574 case PLUS:
3575 op0 = XEXP (x, 0);
3576 op1 = XEXP (x, 1);
3577
3578 if (! strict_p
3579 && REG_P (op0)
3580 && (op0 == virtual_stack_vars_rtx
3581 || op0 == frame_pointer_rtx
3582 || op0 == arg_pointer_rtx)
3583 && CONST_INT_P (op1))
3584 {
3585 info->type = ADDRESS_REG_IMM;
3586 info->base = op0;
3587 info->offset = op1;
3588
3589 return true;
3590 }
3591
3592 if (GET_MODE_SIZE (mode) != 0
3593 && CONST_INT_P (op1)
3594 && aarch64_base_register_rtx_p (op0, strict_p))
3595 {
3596 HOST_WIDE_INT offset = INTVAL (op1);
3597
3598 info->type = ADDRESS_REG_IMM;
3599 info->base = op0;
3600 info->offset = op1;
3601
3602 /* TImode and TFmode values are allowed in both pairs of X
3603 registers and individual Q registers. The available
3604 address modes are:
3605 X,X: 7-bit signed scaled offset
3606 Q: 9-bit signed offset
3607 We conservatively require an offset representable in either mode.
3608 */
3609 if (mode == TImode || mode == TFmode)
3610 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3611 && offset_9bit_signed_unscaled_p (mode, offset));
3612
3613	    /* A 7-bit offset check because OImode will emit an ldp/stp
3614 instruction (only big endian will get here).
3615 For ldp/stp instructions, the offset is scaled for the size of a
3616 single element of the pair. */
3617 if (mode == OImode)
3618 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3619
3620	    /* Three 9/12-bit offset checks because CImode will emit three
3621	       ldr/str instructions (only big endian will get here).  */
3622 if (mode == CImode)
3623 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3624 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3625 || offset_12bit_unsigned_scaled_p (V16QImode,
3626 offset + 32)));
3627
3628	    /* Two 7-bit offset checks because XImode will emit two ldp/stp
3629	       instructions (only big endian will get here).  */
3630 if (mode == XImode)
3631 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3632 && aarch64_offset_7bit_signed_scaled_p (TImode,
3633 offset + 32));
3634
3635 if (load_store_pair_p)
3636 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3637 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3638 else
3639 return (offset_9bit_signed_unscaled_p (mode, offset)
3640 || offset_12bit_unsigned_scaled_p (mode, offset));
3641 }
3642
3643 if (allow_reg_index_p)
3644 {
3645 /* Look for base + (scaled/extended) index register. */
3646 if (aarch64_base_register_rtx_p (op0, strict_p)
3647 && aarch64_classify_index (info, op1, mode, strict_p))
3648 {
3649 info->base = op0;
3650 return true;
3651 }
3652 if (aarch64_base_register_rtx_p (op1, strict_p)
3653 && aarch64_classify_index (info, op0, mode, strict_p))
3654 {
3655 info->base = op1;
3656 return true;
3657 }
3658 }
3659
3660 return false;
3661
3662 case POST_INC:
3663 case POST_DEC:
3664 case PRE_INC:
3665 case PRE_DEC:
3666 info->type = ADDRESS_REG_WB;
3667 info->base = XEXP (x, 0);
3668 info->offset = NULL_RTX;
3669 return aarch64_base_register_rtx_p (info->base, strict_p);
3670
3671 case POST_MODIFY:
3672 case PRE_MODIFY:
3673 info->type = ADDRESS_REG_WB;
3674 info->base = XEXP (x, 0);
3675 if (GET_CODE (XEXP (x, 1)) == PLUS
3676 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3677 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3678 && aarch64_base_register_rtx_p (info->base, strict_p))
3679 {
3680 HOST_WIDE_INT offset;
3681 info->offset = XEXP (XEXP (x, 1), 1);
3682 offset = INTVAL (info->offset);
3683
3684 /* TImode and TFmode values are allowed in both pairs of X
3685 registers and individual Q registers. The available
3686 address modes are:
3687 X,X: 7-bit signed scaled offset
3688 Q: 9-bit signed offset
3689 We conservatively require an offset representable in either mode.
3690 */
3691 if (mode == TImode || mode == TFmode)
3692 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3693 && offset_9bit_signed_unscaled_p (mode, offset));
3694
3695 if (load_store_pair_p)
3696 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3697 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3698 else
3699 return offset_9bit_signed_unscaled_p (mode, offset);
3700 }
3701 return false;
3702
3703 case CONST:
3704 case SYMBOL_REF:
3705 case LABEL_REF:
3706 /* load literal: pc-relative constant pool entry. Only supported
3707 for SI mode or larger. */
3708 info->type = ADDRESS_SYMBOLIC;
3709
3710 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3711 {
3712 rtx sym, addend;
3713
3714 split_const (x, &sym, &addend);
3715 return (GET_CODE (sym) == LABEL_REF
3716 || (GET_CODE (sym) == SYMBOL_REF
3717 && CONSTANT_POOL_ADDRESS_P (sym)));
3718 }
3719 return false;
3720
3721 case LO_SUM:
3722 info->type = ADDRESS_LO_SUM;
3723 info->base = XEXP (x, 0);
3724 info->offset = XEXP (x, 1);
3725 if (allow_reg_index_p
3726 && aarch64_base_register_rtx_p (info->base, strict_p))
3727 {
3728 rtx sym, offs;
3729 split_const (info->offset, &sym, &offs);
3730 if (GET_CODE (sym) == SYMBOL_REF
3731 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3732 == SYMBOL_SMALL_ABSOLUTE))
3733 {
3734 /* The symbol and offset must be aligned to the access size. */
3735 unsigned int align;
3736 unsigned int ref_size;
3737
3738 if (CONSTANT_POOL_ADDRESS_P (sym))
3739 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3740 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3741 {
3742 tree exp = SYMBOL_REF_DECL (sym);
3743 align = TYPE_ALIGN (TREE_TYPE (exp));
3744 align = CONSTANT_ALIGNMENT (exp, align);
3745 }
3746 else if (SYMBOL_REF_DECL (sym))
3747 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3748 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3749 && SYMBOL_REF_BLOCK (sym) != NULL)
3750 align = SYMBOL_REF_BLOCK (sym)->alignment;
3751 else
3752 align = BITS_PER_UNIT;
3753
3754 ref_size = GET_MODE_SIZE (mode);
3755 if (ref_size == 0)
3756 ref_size = GET_MODE_SIZE (DImode);
3757
3758 return ((INTVAL (offs) & (ref_size - 1)) == 0
3759 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3760 }
3761 }
3762 return false;
3763
3764 default:
3765 return false;
3766 }
3767 }
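/* A few illustrative classifications, assuming a DImode access:
     [x1]            -> ADDRESS_REG_IMM, offset 0
     [x1, 16]        -> ADDRESS_REG_IMM
     [x1, x2, lsl 3] -> ADDRESS_REG_REG with shift 3
     [x1], 8         -> ADDRESS_REG_WB (post-increment)
   Symbolic pc-relative literal-pool references classify as
   ADDRESS_SYMBOLIC.  */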
3768
3769 bool
3770 aarch64_symbolic_address_p (rtx x)
3771 {
3772 rtx offset;
3773
3774 split_const (x, &x, &offset);
3775 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3776 }
3777
3778 /* Classify the base of symbolic expression X, given that X appears in
3779 context CONTEXT. */
3780
3781 enum aarch64_symbol_type
3782 aarch64_classify_symbolic_expression (rtx x,
3783 enum aarch64_symbol_context context)
3784 {
3785 rtx offset;
3786
3787 split_const (x, &x, &offset);
3788 return aarch64_classify_symbol (x, offset, context);
3789 }
3790
3791
3792 /* Return TRUE if X is a legitimate address for accessing memory in
3793 mode MODE. */
3794 static bool
3795 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3796 {
3797 struct aarch64_address_info addr;
3798
3799 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3800 }
3801
3802 /* Return TRUE if X is a legitimate address for accessing memory in
3803 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3804 pair operation. */
3805 bool
3806 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3807 RTX_CODE outer_code, bool strict_p)
3808 {
3809 struct aarch64_address_info addr;
3810
3811 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3812 }
3813
3814 /* Return TRUE if rtx X is the immediate constant 0.0.  */
3815 bool
3816 aarch64_float_const_zero_rtx_p (rtx x)
3817 {
3818 REAL_VALUE_TYPE r;
3819
3820 if (GET_MODE (x) == VOIDmode)
3821 return false;
3822
3823 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3824 if (REAL_VALUE_MINUS_ZERO (r))
3825 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3826 return REAL_VALUES_EQUAL (r, dconst0);
3827 }
3828
3829 /* Return the fixed registers used for condition codes. */
3830
3831 static bool
3832 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3833 {
3834 *p1 = CC_REGNUM;
3835 *p2 = INVALID_REGNUM;
3836 return true;
3837 }
3838
3839 /* Emit call insn with PAT and do aarch64-specific handling. */
3840
3841 void
3842 aarch64_emit_call_insn (rtx pat)
3843 {
3844 rtx insn = emit_call_insn (pat);
3845
3846 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3847 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3848 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3849 }
3850
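/* Select the mode of the CC register to be used when comparing X
   against Y with rtx comparison code CODE. */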
3851 machine_mode
3852 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3853 {
3854 /* All floating point compares return CCFP if it is an equality
3855 comparison, and CCFPE otherwise. */
3856 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3857 {
3858 switch (code)
3859 {
3860 case EQ:
3861 case NE:
3862 case UNORDERED:
3863 case ORDERED:
3864 case UNLT:
3865 case UNLE:
3866 case UNGT:
3867 case UNGE:
3868 case UNEQ:
3869 case LTGT:
3870 return CCFPmode;
3871
3872 case LT:
3873 case LE:
3874 case GT:
3875 case GE:
3876 return CCFPEmode;
3877
3878 default:
3879 gcc_unreachable ();
3880 }
3881 }
3882
3883 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3884 && y == const0_rtx
3885 && (code == EQ || code == NE || code == LT || code == GE)
3886 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3887 || GET_CODE (x) == NEG))
3888 return CC_NZmode;
3889
3890 /* A compare with a shifted operand. Because of canonicalization,
3891 the comparison will have to be swapped when we emit the assembly
3892 code. */
3893 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3894 && (REG_P (y) || GET_CODE (y) == SUBREG)
3895 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3896 || GET_CODE (x) == LSHIFTRT
3897 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3898 return CC_SWPmode;
3899
3900 /* Similarly for a negated operand, but we can only do this for
3901 equalities. */
3902 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3903 && (REG_P (y) || GET_CODE (y) == SUBREG)
3904 && (code == EQ || code == NE)
3905 && GET_CODE (x) == NEG)
3906 return CC_Zmode;
3907
3908 /* A compare of a mode narrower than SI mode against zero can be done
3909 by extending the value in the comparison. */
3910 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3911 && y == const0_rtx)
3912 /* Only use sign-extension if we really need it. */
3913 return ((code == GT || code == GE || code == LE || code == LT)
3914 ? CC_SESWPmode : CC_ZESWPmode);
3915
3916 /* For everything else, return CCmode. */
3917 return CCmode;
3918 }
3919
3920 static int
3921 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3922
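/* Return the AARCH64_* condition code for comparison rtx X, or -1 if it
   cannot be represented as a single AArch64 condition. */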
3923 int
3924 aarch64_get_condition_code (rtx x)
3925 {
3926 machine_mode mode = GET_MODE (XEXP (x, 0));
3927 enum rtx_code comp_code = GET_CODE (x);
3928
3929 if (GET_MODE_CLASS (mode) != MODE_CC)
3930 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3931 return aarch64_get_condition_code_1 (mode, comp_code);
3932 }
3933
3934 static int
3935 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3936 {
3937 int ne = -1, eq = -1;
3938 switch (mode)
3939 {
3940 case CCFPmode:
3941 case CCFPEmode:
3942 switch (comp_code)
3943 {
3944 case GE: return AARCH64_GE;
3945 case GT: return AARCH64_GT;
3946 case LE: return AARCH64_LS;
3947 case LT: return AARCH64_MI;
3948 case NE: return AARCH64_NE;
3949 case EQ: return AARCH64_EQ;
3950 case ORDERED: return AARCH64_VC;
3951 case UNORDERED: return AARCH64_VS;
3952 case UNLT: return AARCH64_LT;
3953 case UNLE: return AARCH64_LE;
3954 case UNGT: return AARCH64_HI;
3955 case UNGE: return AARCH64_PL;
3956 default: return -1;
3957 }
3958 break;
3959
3960 case CC_DNEmode:
3961 ne = AARCH64_NE;
3962 eq = AARCH64_EQ;
3963 break;
3964
3965 case CC_DEQmode:
3966 ne = AARCH64_EQ;
3967 eq = AARCH64_NE;
3968 break;
3969
3970 case CC_DGEmode:
3971 ne = AARCH64_GE;
3972 eq = AARCH64_LT;
3973 break;
3974
3975 case CC_DLTmode:
3976 ne = AARCH64_LT;
3977 eq = AARCH64_GE;
3978 break;
3979
3980 case CC_DGTmode:
3981 ne = AARCH64_GT;
3982 eq = AARCH64_LE;
3983 break;
3984
3985 case CC_DLEmode:
3986 ne = AARCH64_LE;
3987 eq = AARCH64_GT;
3988 break;
3989
3990 case CC_DGEUmode:
3991 ne = AARCH64_CS;
3992 eq = AARCH64_CC;
3993 break;
3994
3995 case CC_DLTUmode:
3996 ne = AARCH64_CC;
3997 eq = AARCH64_CS;
3998 break;
3999
4000 case CC_DGTUmode:
4001 ne = AARCH64_HI;
4002 eq = AARCH64_LS;
4003 break;
4004
4005 case CC_DLEUmode:
4006 ne = AARCH64_LS;
4007 eq = AARCH64_HI;
4008 break;
4009
4010 case CCmode:
4011 switch (comp_code)
4012 {
4013 case NE: return AARCH64_NE;
4014 case EQ: return AARCH64_EQ;
4015 case GE: return AARCH64_GE;
4016 case GT: return AARCH64_GT;
4017 case LE: return AARCH64_LE;
4018 case LT: return AARCH64_LT;
4019 case GEU: return AARCH64_CS;
4020 case GTU: return AARCH64_HI;
4021 case LEU: return AARCH64_LS;
4022 case LTU: return AARCH64_CC;
4023 default: return -1;
4024 }
4025 break;
4026
4027 case CC_SWPmode:
4028 case CC_ZESWPmode:
4029 case CC_SESWPmode:
4030 switch (comp_code)
4031 {
4032 case NE: return AARCH64_NE;
4033 case EQ: return AARCH64_EQ;
4034 case GE: return AARCH64_LE;
4035 case GT: return AARCH64_LT;
4036 case LE: return AARCH64_GE;
4037 case LT: return AARCH64_GT;
4038 case GEU: return AARCH64_LS;
4039 case GTU: return AARCH64_CC;
4040 case LEU: return AARCH64_CS;
4041 case LTU: return AARCH64_HI;
4042 default: return -1;
4043 }
4044 break;
4045
4046 case CC_NZmode:
4047 switch (comp_code)
4048 {
4049 case NE: return AARCH64_NE;
4050 case EQ: return AARCH64_EQ;
4051 case GE: return AARCH64_PL;
4052 case LT: return AARCH64_MI;
4053 default: return -1;
4054 }
4055 break;
4056
4057 case CC_Zmode:
4058 switch (comp_code)
4059 {
4060 case NE: return AARCH64_NE;
4061 case EQ: return AARCH64_EQ;
4062 default: return -1;
4063 }
4064 break;
4065
4066 default:
4067 return -1;
4068 break;
4069 }
4070
4071 if (comp_code == NE)
4072 return ne;
4073
4074 if (comp_code == EQ)
4075 return eq;
4076
4077 return -1;
4078 }
4079
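/* Return true if X is a CONST_VECTOR of integers whose elements are all
   the same value and that value lies within [MINVAL, MAXVAL]. */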
4080 bool
4081 aarch64_const_vec_all_same_in_range_p (rtx x,
4082 HOST_WIDE_INT minval,
4083 HOST_WIDE_INT maxval)
4084 {
4085 HOST_WIDE_INT firstval;
4086 int count, i;
4087
4088 if (GET_CODE (x) != CONST_VECTOR
4089 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4090 return false;
4091
4092 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4093 if (firstval < minval || firstval > maxval)
4094 return false;
4095
4096 count = CONST_VECTOR_NUNITS (x);
4097 for (i = 1; i < count; i++)
4098 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4099 return false;
4100
4101 return true;
4102 }
4103
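/* Return true if X is a CONST_VECTOR whose elements are all equal to VAL. */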
4104 bool
4105 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4106 {
4107 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4108 }
4109
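/* Return the number of set bits in VALUE, clearing the lowest set bit on
   each iteration; e.g. bit_count (0xf0) == 4. */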
4110 static unsigned
4111 bit_count (unsigned HOST_WIDE_INT value)
4112 {
4113 unsigned count = 0;
4114
4115 while (value)
4116 {
4117 count++;
4118 value &= value - 1;
4119 }
4120
4121 return count;
4122 }
4123
4124 /* N Z C V. */
4125 #define AARCH64_CC_V 1
4126 #define AARCH64_CC_C (1 << 1)
4127 #define AARCH64_CC_Z (1 << 2)
4128 #define AARCH64_CC_N (1 << 3)
4129
4130 /* N Z C V flags for ccmp. The first value is used when the combining
4131 operation is AND, the second when it is IOR. Indexed by AARCH64_COND_CODE. */
4132 static const int aarch64_nzcv_codes[][2] =
4133 {
4134 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4135 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4136 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4137 {0, AARCH64_CC_C}, /* CC, C == 0. */
4138 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4139 {0, AARCH64_CC_N}, /* PL, N == 0. */
4140 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4141 {0, AARCH64_CC_V}, /* VC, V == 0. */
4142 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4143 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4144 {0, AARCH64_CC_V}, /* GE, N == V. */
4145 {AARCH64_CC_V, 0}, /* LT, N != V. */
4146 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4147 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4148 {0, 0}, /* AL, Any. */
4149 {0, 0}, /* NV, Any. */
4150 };
4151
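/* Map the CC_D* mode used by a conditional compare back to the rtx
   comparison code it represents. */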
4152 int
4153 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4154 {
4155 switch (mode)
4156 {
4157 case CC_DNEmode:
4158 return NE;
4159
4160 case CC_DEQmode:
4161 return EQ;
4162
4163 case CC_DLEmode:
4164 return LE;
4165
4166 case CC_DGTmode:
4167 return GT;
4168
4169 case CC_DLTmode:
4170 return LT;
4171
4172 case CC_DGEmode:
4173 return GE;
4174
4175 case CC_DLEUmode:
4176 return LEU;
4177
4178 case CC_DGTUmode:
4179 return GTU;
4180
4181 case CC_DLTUmode:
4182 return LTU;
4183
4184 case CC_DGEUmode:
4185 return GEU;
4186
4187 default:
4188 gcc_unreachable ();
4189 }
4190 }
4191
4192
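/* Print operand X to file F, applying the output-modifier character CODE
   (zero for a plain operand). */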
4193 void
4194 aarch64_print_operand (FILE *f, rtx x, char code)
4195 {
4196 switch (code)
4197 {
4198 /* An integer or symbol address without a preceding # sign. */
4199 case 'c':
4200 switch (GET_CODE (x))
4201 {
4202 case CONST_INT:
4203 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4204 break;
4205
4206 case SYMBOL_REF:
4207 output_addr_const (f, x);
4208 break;
4209
4210 case CONST:
4211 if (GET_CODE (XEXP (x, 0)) == PLUS
4212 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4213 {
4214 output_addr_const (f, x);
4215 break;
4216 }
4217 /* Fall through. */
4218
4219 default:
4220 output_operand_lossage ("Unsupported operand for code '%c'", code);
4221 }
4222 break;
4223
4224 case 'e':
4225 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4226 {
4227 int n;
4228
4229 if (!CONST_INT_P (x)
4230 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4231 {
4232 output_operand_lossage ("invalid operand for '%%%c'", code);
4233 return;
4234 }
4235
4236 switch (n)
4237 {
4238 case 3:
4239 fputc ('b', f);
4240 break;
4241 case 4:
4242 fputc ('h', f);
4243 break;
4244 case 5:
4245 fputc ('w', f);
4246 break;
4247 default:
4248 output_operand_lossage ("invalid operand for '%%%c'", code);
4249 return;
4250 }
4251 }
4252 break;
4253
4254 case 'p':
4255 {
4256 int n;
4257
4258 /* Print N such that 2^N == X. */
4259 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4260 {
4261 output_operand_lossage ("invalid operand for '%%%c'", code);
4262 return;
4263 }
4264
4265 asm_fprintf (f, "%d", n);
4266 }
4267 break;
4268
4269 case 'P':
4270 /* Print the number of non-zero bits in X (a const_int). */
4271 if (!CONST_INT_P (x))
4272 {
4273 output_operand_lossage ("invalid operand for '%%%c'", code);
4274 return;
4275 }
4276
4277 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4278 break;
4279
4280 case 'H':
4281 /* Print the higher numbered register of a pair (TImode) of regs. */
4282 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4283 {
4284 output_operand_lossage ("invalid operand for '%%%c'", code);
4285 return;
4286 }
4287
4288 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4289 break;
4290
4291 case 'm':
4292 {
4293 int cond_code;
4294 /* Print a condition (eq, ne, etc). */
4295
4296 /* CONST_TRUE_RTX means always -- that's the default. */
4297 if (x == const_true_rtx)
4298 return;
4299
4300 if (!COMPARISON_P (x))
4301 {
4302 output_operand_lossage ("invalid operand for '%%%c'", code);
4303 return;
4304 }
4305
4306 cond_code = aarch64_get_condition_code (x);
4307 gcc_assert (cond_code >= 0);
4308 fputs (aarch64_condition_codes[cond_code], f);
4309 }
4310 break;
4311
4312 case 'M':
4313 {
4314 int cond_code;
4315 /* Print the inverse of a condition (eq <-> ne, etc). */
4316
4317 /* CONST_TRUE_RTX means never -- that's the default. */
4318 if (x == const_true_rtx)
4319 {
4320 fputs ("nv", f);
4321 return;
4322 }
4323
4324 if (!COMPARISON_P (x))
4325 {
4326 output_operand_lossage ("invalid operand for '%%%c'", code);
4327 return;
4328 }
4329 cond_code = aarch64_get_condition_code (x);
4330 gcc_assert (cond_code >= 0);
4331 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4332 (cond_code)], f);
4333 }
4334 break;
4335
4336 case 'b':
4337 case 'h':
4338 case 's':
4339 case 'd':
4340 case 'q':
4341 /* Print a scalar FP/SIMD register name. */
4342 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4343 {
4344 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4345 return;
4346 }
4347 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4348 break;
4349
4350 case 'S':
4351 case 'T':
4352 case 'U':
4353 case 'V':
4354 /* Print the first FP/SIMD register name in a list. */
4355 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4356 {
4357 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4358 return;
4359 }
4360 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4361 break;
4362
4363 case 'R':
4364 /* Print a scalar FP/SIMD register name + 1. */
4365 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4366 {
4367 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4368 return;
4369 }
4370 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4371 break;
4372
4373 case 'X':
4374 /* Print bottom 16 bits of integer constant in hex. */
4375 if (!CONST_INT_P (x))
4376 {
4377 output_operand_lossage ("invalid operand for '%%%c'", code);
4378 return;
4379 }
4380 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4381 break;
4382
4383 case 'w':
4384 case 'x':
4385 /* Print a general register name or the zero register (32-bit or
4386 64-bit). */
4387 if (x == const0_rtx
4388 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4389 {
4390 asm_fprintf (f, "%czr", code);
4391 break;
4392 }
4393
4394 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4395 {
4396 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4397 break;
4398 }
4399
4400 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4401 {
4402 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4403 break;
4404 }
4405
4406 /* Fall through */
4407
4408 case 0:
4409 /* Print a normal operand. If it's a general register, then we
4410 assume DImode. */
4411 if (x == NULL)
4412 {
4413 output_operand_lossage ("missing operand");
4414 return;
4415 }
4416
4417 switch (GET_CODE (x))
4418 {
4419 case REG:
4420 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4421 break;
4422
4423 case MEM:
4424 aarch64_memory_reference_mode = GET_MODE (x);
4425 output_address (XEXP (x, 0));
4426 break;
4427
4428 case LABEL_REF:
4429 case SYMBOL_REF:
4430 output_addr_const (asm_out_file, x);
4431 break;
4432
4433 case CONST_INT:
4434 asm_fprintf (f, "%wd", INTVAL (x));
4435 break;
4436
4437 case CONST_VECTOR:
4438 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4439 {
4440 gcc_assert (
4441 aarch64_const_vec_all_same_in_range_p (x,
4442 HOST_WIDE_INT_MIN,
4443 HOST_WIDE_INT_MAX));
4444 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4445 }
4446 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4447 {
4448 fputc ('0', f);
4449 }
4450 else
4451 gcc_unreachable ();
4452 break;
4453
4454 case CONST_DOUBLE:
4455 /* CONST_DOUBLE can represent a double-width integer.
4456 In this case, the mode of x is VOIDmode. */
4457 if (GET_MODE (x) == VOIDmode)
4458 ; /* Do Nothing. */
4459 else if (aarch64_float_const_zero_rtx_p (x))
4460 {
4461 fputc ('0', f);
4462 break;
4463 }
4464 else if (aarch64_float_const_representable_p (x))
4465 {
4466 #define buf_size 20
4467 char float_buf[buf_size] = {'\0'};
4468 REAL_VALUE_TYPE r;
4469 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4470 real_to_decimal_for_mode (float_buf, &r,
4471 buf_size, buf_size,
4472 1, GET_MODE (x));
4473 asm_fprintf (asm_out_file, "%s", float_buf);
4474 break;
4475 #undef buf_size
4476 }
4477 output_operand_lossage ("invalid constant");
4478 return;
4479 default:
4480 output_operand_lossage ("invalid operand");
4481 return;
4482 }
4483 break;
4484
4485 case 'A':
4486 if (GET_CODE (x) == HIGH)
4487 x = XEXP (x, 0);
4488
4489 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4490 {
4491 case SYMBOL_SMALL_GOT_4G:
4492 asm_fprintf (asm_out_file, ":got:");
4493 break;
4494
4495 case SYMBOL_SMALL_TLSGD:
4496 asm_fprintf (asm_out_file, ":tlsgd:");
4497 break;
4498
4499 case SYMBOL_SMALL_TLSDESC:
4500 asm_fprintf (asm_out_file, ":tlsdesc:");
4501 break;
4502
4503 case SYMBOL_SMALL_GOTTPREL:
4504 asm_fprintf (asm_out_file, ":gottprel:");
4505 break;
4506
4507 case SYMBOL_SMALL_TPREL:
4508 asm_fprintf (asm_out_file, ":tprel:");
4509 break;
4510
4511 case SYMBOL_TINY_GOT:
4512 gcc_unreachable ();
4513 break;
4514
4515 default:
4516 break;
4517 }
4518 output_addr_const (asm_out_file, x);
4519 break;
4520
4521 case 'L':
4522 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4523 {
4524 case SYMBOL_SMALL_GOT_4G:
4525 asm_fprintf (asm_out_file, ":lo12:");
4526 break;
4527
4528 case SYMBOL_SMALL_TLSGD:
4529 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4530 break;
4531
4532 case SYMBOL_SMALL_TLSDESC:
4533 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4534 break;
4535
4536 case SYMBOL_SMALL_GOTTPREL:
4537 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4538 break;
4539
4540 case SYMBOL_SMALL_TPREL:
4541 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4542 break;
4543
4544 case SYMBOL_TINY_GOT:
4545 asm_fprintf (asm_out_file, ":got:");
4546 break;
4547
4548 default:
4549 break;
4550 }
4551 output_addr_const (asm_out_file, x);
4552 break;
4553
4554 case 'G':
4555
4556 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4557 {
4558 case SYMBOL_SMALL_TPREL:
4559 asm_fprintf (asm_out_file, ":tprel_hi12:");
4560 break;
4561 default:
4562 break;
4563 }
4564 output_addr_const (asm_out_file, x);
4565 break;
4566
4567 case 'K':
4568 {
4569 int cond_code;
4570 /* Print nzcv. */
4571
4572 if (!COMPARISON_P (x))
4573 {
4574 output_operand_lossage ("invalid operand for '%%%c'", code);
4575 return;
4576 }
4577
4578 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4579 gcc_assert (cond_code >= 0);
4580 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4581 }
4582 break;
4583
4584 case 'k':
4585 {
4586 int cond_code;
4587 /* Print nzcv. */
4588
4589 if (!COMPARISON_P (x))
4590 {
4591 output_operand_lossage ("invalid operand for '%%%c'", code);
4592 return;
4593 }
4594
4595 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4596 gcc_assert (cond_code >= 0);
4597 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4598 }
4599 break;
4600
4601 default:
4602 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4603 return;
4604 }
4605 }
4606
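/* Print the memory address X to file F in AArch64 assembly syntax, using
   the access mode recorded in aarch64_memory_reference_mode. */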
4607 void
4608 aarch64_print_operand_address (FILE *f, rtx x)
4609 {
4610 struct aarch64_address_info addr;
4611
4612 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4613 MEM, true))
4614 switch (addr.type)
4615 {
4616 case ADDRESS_REG_IMM:
4617 if (addr.offset == const0_rtx)
4618 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4619 else
4620 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4621 INTVAL (addr.offset));
4622 return;
4623
4624 case ADDRESS_REG_REG:
4625 if (addr.shift == 0)
4626 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4627 reg_names [REGNO (addr.offset)]);
4628 else
4629 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4630 reg_names [REGNO (addr.offset)], addr.shift);
4631 return;
4632
4633 case ADDRESS_REG_UXTW:
4634 if (addr.shift == 0)
4635 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4636 REGNO (addr.offset) - R0_REGNUM);
4637 else
4638 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4639 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4640 return;
4641
4642 case ADDRESS_REG_SXTW:
4643 if (addr.shift == 0)
4644 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4645 REGNO (addr.offset) - R0_REGNUM);
4646 else
4647 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4648 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4649 return;
4650
4651 case ADDRESS_REG_WB:
4652 switch (GET_CODE (x))
4653 {
4654 case PRE_INC:
4655 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4656 GET_MODE_SIZE (aarch64_memory_reference_mode));
4657 return;
4658 case POST_INC:
4659 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4660 GET_MODE_SIZE (aarch64_memory_reference_mode));
4661 return;
4662 case PRE_DEC:
4663 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4664 GET_MODE_SIZE (aarch64_memory_reference_mode));
4665 return;
4666 case POST_DEC:
4667 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4668 GET_MODE_SIZE (aarch64_memory_reference_mode));
4669 return;
4670 case PRE_MODIFY:
4671 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4672 INTVAL (addr.offset));
4673 return;
4674 case POST_MODIFY:
4675 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4676 INTVAL (addr.offset));
4677 return;
4678 default:
4679 break;
4680 }
4681 break;
4682
4683 case ADDRESS_LO_SUM:
4684 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4685 output_addr_const (f, addr.offset);
4686 asm_fprintf (f, "]");
4687 return;
4688
4689 case ADDRESS_SYMBOLIC:
4690 break;
4691 }
4692
4693 output_addr_const (f, x);
4694 }
4695
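/* Return true if rtx X mentions a LABEL_REF, ignoring the LABEL_REF that
   forms part of an UNSPEC_TLS operand. */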
4696 bool
4697 aarch64_label_mentioned_p (rtx x)
4698 {
4699 const char *fmt;
4700 int i;
4701
4702 if (GET_CODE (x) == LABEL_REF)
4703 return true;
4704
4705 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4706 referencing instruction, but they are constant offsets, not
4707 symbols. */
4708 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4709 return false;
4710
4711 fmt = GET_RTX_FORMAT (GET_CODE (x));
4712 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4713 {
4714 if (fmt[i] == 'E')
4715 {
4716 int j;
4717
4718 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4719 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4720 return 1;
4721 }
4722 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4723 return 1;
4724 }
4725
4726 return 0;
4727 }
4728
4729 /* Implement REGNO_REG_CLASS. */
4730
4731 enum reg_class
4732 aarch64_regno_regclass (unsigned regno)
4733 {
4734 if (GP_REGNUM_P (regno))
4735 return GENERAL_REGS;
4736
4737 if (regno == SP_REGNUM)
4738 return STACK_REG;
4739
4740 if (regno == FRAME_POINTER_REGNUM
4741 || regno == ARG_POINTER_REGNUM)
4742 return POINTER_REGS;
4743
4744 if (FP_REGNUM_P (regno))
4745 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4746
4747 return NO_REGS;
4748 }
4749
4750 static rtx
4751 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4752 {
4753 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4754 where mask is selected by alignment and size of the offset.
4755 We try to pick as large a range for the offset as possible to
4756 maximize the chance of a CSE. However, for aligned addresses
4757 we limit the range to 4k so that structures with different sized
4758 elements are likely to use the same base. */
4759
4760 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4761 {
4762 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4763 HOST_WIDE_INT base_offset;
4764
4765 /* Does it look like we'll need a load/store-pair operation? */
4766 if (GET_MODE_SIZE (mode) > 16
4767 || mode == TImode)
4768 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4769 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4770 /* For offsets that aren't a multiple of the access size, the limit is
4771 -256...255. */
4772 else if (offset & (GET_MODE_SIZE (mode) - 1))
4773 base_offset = (offset + 0x100) & ~0x1ff;
4774 else
4775 base_offset = offset & ~0xfff;
4776
4777 if (base_offset == 0)
4778 return x;
4779
4780 offset -= base_offset;
4781 rtx base_reg = gen_reg_rtx (Pmode);
4782 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4783 NULL_RTX);
4784 emit_move_insn (base_reg, val);
4785 x = plus_constant (Pmode, base_reg, offset);
4786 }
4787
4788 return x;
4789 }
4790
4791 /* Try a machine-dependent way of reloading an illegitimate address
4792 operand. If we find one, push the reload and return the new rtx. */
4793
4794 rtx
4795 aarch64_legitimize_reload_address (rtx *x_p,
4796 machine_mode mode,
4797 int opnum, int type,
4798 int ind_levels ATTRIBUTE_UNUSED)
4799 {
4800 rtx x = *x_p;
4801
4802 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4803 if (aarch64_vect_struct_mode_p (mode)
4804 && GET_CODE (x) == PLUS
4805 && REG_P (XEXP (x, 0))
4806 && CONST_INT_P (XEXP (x, 1)))
4807 {
4808 rtx orig_rtx = x;
4809 x = copy_rtx (x);
4810 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4811 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4812 opnum, (enum reload_type) type);
4813 return x;
4814 }
4815
4816 /* We must recognize output that we have already generated ourselves. */
4817 if (GET_CODE (x) == PLUS
4818 && GET_CODE (XEXP (x, 0)) == PLUS
4819 && REG_P (XEXP (XEXP (x, 0), 0))
4820 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4821 && CONST_INT_P (XEXP (x, 1)))
4822 {
4823 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4824 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4825 opnum, (enum reload_type) type);
4826 return x;
4827 }
4828
4829 /* We wish to handle large displacements off a base register by splitting
4830 the addend across an add and the mem insn. This can cut the number of
4831 extra insns needed from 3 to 1. It is only useful for load/store of a
4832 single register with 12 bit offset field. */
4833 if (GET_CODE (x) == PLUS
4834 && REG_P (XEXP (x, 0))
4835 && CONST_INT_P (XEXP (x, 1))
4836 && HARD_REGISTER_P (XEXP (x, 0))
4837 && mode != TImode
4838 && mode != TFmode
4839 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4840 {
4841 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4842 HOST_WIDE_INT low = val & 0xfff;
4843 HOST_WIDE_INT high = val - low;
4844 HOST_WIDE_INT offs;
4845 rtx cst;
4846 machine_mode xmode = GET_MODE (x);
4847
4848 /* In ILP32, xmode can be either DImode or SImode. */
4849 gcc_assert (xmode == DImode || xmode == SImode);
4850
4851 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4852 BLKmode alignment. */
4853 if (GET_MODE_SIZE (mode) == 0)
4854 return NULL_RTX;
4855
4856 offs = low % GET_MODE_SIZE (mode);
4857
4858 /* Align misaligned offset by adjusting high part to compensate. */
4859 if (offs != 0)
4860 {
4861 if (aarch64_uimm12_shift (high + offs))
4862 {
4863 /* Align down. */
4864 low = low - offs;
4865 high = high + offs;
4866 }
4867 else
4868 {
4869 /* Align up. */
4870 offs = GET_MODE_SIZE (mode) - offs;
4871 low = low + offs;
4872 high = high + (low & 0x1000) - offs;
4873 low &= 0xfff;
4874 }
4875 }
4876
4877 /* Check for overflow. */
4878 if (high + low != val)
4879 return NULL_RTX;
4880
4881 cst = GEN_INT (high);
4882 if (!aarch64_uimm12_shift (high))
4883 cst = force_const_mem (xmode, cst);
4884
4885 /* Reload high part into base reg, leaving the low part
4886 in the mem instruction.
4887 Note that replacing this gen_rtx_PLUS with plus_constant is
4888 wrong in this case because we rely on the
4889 (plus (plus reg c1) c2) structure being preserved so that
4890 XEXP (x, 0) in push_reload below uses the correct term. */
4891 x = gen_rtx_PLUS (xmode,
4892 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4893 GEN_INT (low));
4894
4895 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4896 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4897 opnum, (enum reload_type) type);
4898 return x;
4899 }
4900
4901 return NULL_RTX;
4902 }
4903
4904
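/* Decide whether copying X of mode MODE into a register of class RCLASS
   needs help: return the class of an intermediate register, or NO_REGS,
   possibly after recording a secondary-reload pattern in SRI. */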
4905 static reg_class_t
4906 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4907 reg_class_t rclass,
4908 machine_mode mode,
4909 secondary_reload_info *sri)
4910 {
4911 /* Without the TARGET_SIMD instructions we cannot move a Q register
4912 to a Q register directly. We need a scratch. */
4913 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4914 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4915 && reg_class_subset_p (rclass, FP_REGS))
4916 {
4917 if (mode == TFmode)
4918 sri->icode = CODE_FOR_aarch64_reload_movtf;
4919 else if (mode == TImode)
4920 sri->icode = CODE_FOR_aarch64_reload_movti;
4921 return NO_REGS;
4922 }
4923
4924 /* A TFmode or TImode memory access should be handled via an FP register
4925 because AArch64 has richer addressing modes for LDR/STR instructions
4926 than for LDP/STP instructions. */
4927 if (TARGET_FLOAT && rclass == GENERAL_REGS
4928 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4929 return FP_REGS;
4930
4931 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4932 return GENERAL_REGS;
4933
4934 return NO_REGS;
4935 }
4936
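/* Return true if register FROM may be eliminated in favour of register TO,
   given the current need (or not) for a frame pointer. */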
4937 static bool
4938 aarch64_can_eliminate (const int from, const int to)
4939 {
4940 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4941 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4942
4943 if (frame_pointer_needed)
4944 {
4945 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4946 return true;
4947 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4948 return false;
4949 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4950 && !cfun->calls_alloca)
4951 return true;
4952 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4953 return true;
4954
4955 return false;
4956 }
4957 else
4958 {
4959 /* If we decided that we didn't need a leaf frame pointer but then used
4960 LR in the function, then we'll want a frame pointer after all, so
4961 prevent this elimination to ensure a frame pointer is used. */
4962 if (to == STACK_POINTER_REGNUM
4963 && flag_omit_leaf_frame_pointer
4964 && df_regs_ever_live_p (LR_REGNUM))
4965 return false;
4966 }
4967
4968 return true;
4969 }
4970
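/* Return the offset to apply when eliminating register FROM in favour of
   register TO; the frame is laid out first so the cached sizes are valid. */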
4971 HOST_WIDE_INT
4972 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4973 {
4974 aarch64_layout_frame ();
4975
4976 if (to == HARD_FRAME_POINTER_REGNUM)
4977 {
4978 if (from == ARG_POINTER_REGNUM)
4979 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4980
4981 if (from == FRAME_POINTER_REGNUM)
4982 return (cfun->machine->frame.hard_fp_offset
4983 - cfun->machine->frame.saved_varargs_size);
4984 }
4985
4986 if (to == STACK_POINTER_REGNUM)
4987 {
4988 if (from == FRAME_POINTER_REGNUM)
4989 return (cfun->machine->frame.frame_size
4990 - cfun->machine->frame.saved_varargs_size);
4991 }
4992
4993 return cfun->machine->frame.frame_size;
4994 }
4995
4996 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4997 previous frame. */
4998
4999 rtx
5000 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5001 {
5002 if (count != 0)
5003 return const0_rtx;
5004 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5005 }
5006
5007
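/* Output the trampoline code template: load the target address into IP1
   and the static chain value into STATIC_CHAIN_REGNUM from the literal
   words that follow, then branch via IP1. */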
5008 static void
5009 aarch64_asm_trampoline_template (FILE *f)
5010 {
5011 if (TARGET_ILP32)
5012 {
5013 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5014 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5015 }
5016 else
5017 {
5018 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5019 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5020 }
5021 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5022 assemble_aligned_integer (4, const0_rtx);
5023 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5024 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5025 }
5026
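/* Initialise trampoline M_TRAMP: copy in the code template, store FNDECL's
   address and CHAIN_VALUE in the trailing literal slots, then flush the
   region from the instruction cache. */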
5027 static void
5028 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5029 {
5030 rtx fnaddr, mem, a_tramp;
5031 const int tramp_code_sz = 16;
5032
5033 /* Don't need to copy the trailing D-words; we fill those in below. */
5034 emit_block_move (m_tramp, assemble_trampoline_template (),
5035 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5036 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5037 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5038 if (GET_MODE (fnaddr) != ptr_mode)
5039 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5040 emit_move_insn (mem, fnaddr);
5041
5042 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5043 emit_move_insn (mem, chain_value);
5044
5045 /* XXX We should really define a "clear_cache" pattern and use
5046 gen_clear_cache(). */
5047 a_tramp = XEXP (m_tramp, 0);
5048 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5049 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5050 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5051 ptr_mode);
5052 }
5053
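/* Return how many registers of class REGCLASS are needed to hold a value
   of mode MODE. */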
5054 static unsigned char
5055 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5056 {
5057 switch (regclass)
5058 {
5059 case CALLER_SAVE_REGS:
5060 case POINTER_REGS:
5061 case GENERAL_REGS:
5062 case ALL_REGS:
5063 case FP_REGS:
5064 case FP_LO_REGS:
5065 return
5066 aarch64_vector_mode_p (mode)
5067 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5068 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5069 case STACK_REG:
5070 return 1;
5071
5072 case NO_REGS:
5073 return 0;
5074
5075 default:
5076 break;
5077 }
5078 gcc_unreachable ();
5079 }
5080
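/* Return the register class to use when reloading X into a register of
   class REGCLASS, narrowing or rejecting classes that cannot usefully
   hold X. */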
5081 static reg_class_t
5082 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5083 {
5084 if (regclass == POINTER_REGS)
5085 return GENERAL_REGS;
5086
5087 if (regclass == STACK_REG)
5088 {
5089 if (REG_P(x)
5090 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5091 return regclass;
5092
5093 return NO_REGS;
5094 }
5095
5096 /* If it's an integer immediate that MOVI can't handle, then
5097 FP_REGS is not an option, so we return NO_REGS instead. */
5098 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5099 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5100 return NO_REGS;
5101
5102 /* Register elimination can result in a request for
5103 SP+constant->FP_REGS. We cannot support such operations, which
5104 use SP as source and an FP_REG as destination, so reject them
5105 outright. */
5106 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5107 {
5108 rtx lhs = XEXP (x, 0);
5109
5110 /* Look through a possible SUBREG introduced by ILP32. */
5111 if (GET_CODE (lhs) == SUBREG)
5112 lhs = SUBREG_REG (lhs);
5113
5114 gcc_assert (REG_P (lhs));
5115 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5116 POINTER_REGS));
5117 return NO_REGS;
5118 }
5119
5120 return regclass;
5121 }
5122
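/* Output label reference NAME to file F, applying the user label prefix. */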
5123 void
5124 aarch64_asm_output_labelref (FILE* f, const char *name)
5125 {
5126 asm_fprintf (f, "%U%s", name);
5127 }
5128
5129 static void
5130 aarch64_elf_asm_constructor (rtx symbol, int priority)
5131 {
5132 if (priority == DEFAULT_INIT_PRIORITY)
5133 default_ctor_section_asm_out_constructor (symbol, priority);
5134 else
5135 {
5136 section *s;
5137 char buf[18];
5138 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5139 s = get_section (buf, SECTION_WRITE, NULL);
5140 switch_to_section (s);
5141 assemble_align (POINTER_SIZE);
5142 assemble_aligned_integer (POINTER_BYTES, symbol);
5143 }
5144 }
5145
5146 static void
5147 aarch64_elf_asm_destructor (rtx symbol, int priority)
5148 {
5149 if (priority == DEFAULT_INIT_PRIORITY)
5150 default_dtor_section_asm_out_destructor (symbol, priority);
5151 else
5152 {
5153 section *s;
5154 char buf[18];
5155 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5156 s = get_section (buf, SECTION_WRITE, NULL);
5157 switch_to_section (s);
5158 assemble_align (POINTER_SIZE);
5159 assemble_aligned_integer (POINTER_BYTES, symbol);
5160 }
5161 }
5162
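/* Output the code for a casesi dispatch: load the jump-table entry selected
   by the index operand, form the target address relative to the table label
   using ADR and a scaled add, then branch to it. */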
5163 const char*
5164 aarch64_output_casesi (rtx *operands)
5165 {
5166 char buf[100];
5167 char label[100];
5168 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5169 int index;
5170 static const char *const patterns[4][2] =
5171 {
5172 {
5173 "ldrb\t%w3, [%0,%w1,uxtw]",
5174 "add\t%3, %4, %w3, sxtb #2"
5175 },
5176 {
5177 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5178 "add\t%3, %4, %w3, sxth #2"
5179 },
5180 {
5181 "ldr\t%w3, [%0,%w1,uxtw #2]",
5182 "add\t%3, %4, %w3, sxtw #2"
5183 },
5184 /* We assume that DImode is only generated when not optimizing and
5185 that we don't really need 64-bit address offsets. That would
5186 imply an object file with 8GB of code in a single function! */
5187 {
5188 "ldr\t%w3, [%0,%w1,uxtw #2]",
5189 "add\t%3, %4, %w3, sxtw #2"
5190 }
5191 };
5192
5193 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5194
5195 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5196
5197 gcc_assert (index >= 0 && index <= 3);
5198
5199 /* Need to implement table size reduction, by changing the code below. */
5200 output_asm_insn (patterns[index][0], operands);
5201 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5202 snprintf (buf, sizeof (buf),
5203 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5204 output_asm_insn (buf, operands);
5205 output_asm_insn (patterns[index][1], operands);
5206 output_asm_insn ("br\t%3", operands);
5207 assemble_label (asm_out_file, label);
5208 return "";
5209 }
5210
5211
5212 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5213 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5214 operator. */
5215
5216 int
5217 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5218 {
5219 if (shift >= 0 && shift <= 3)
5220 {
5221 int size;
5222 for (size = 8; size <= 32; size *= 2)
5223 {
5224 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5225 if (mask == bits << shift)
5226 return size;
5227 }
5228 }
5229 return 0;
5230 }
5231
5232 static bool
5233 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5234 const_rtx x ATTRIBUTE_UNUSED)
5235 {
5236 /* We can't use blocks for constants when we're using a per-function
5237 constant pool. */
5238 return false;
5239 }
5240
5241 static section *
5242 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5243 rtx x ATTRIBUTE_UNUSED,
5244 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5245 {
5246 /* Force all constant pool entries into the current function section. */
5247 return function_section (current_function_decl);
5248 }
5249
5250
5251 /* Costs. */
5252
5253 /* Helper function for rtx cost calculation. Strip a shift expression
5254 from X. Returns the inner operand if successful, or the original
5255 expression on failure. */
5256 static rtx
5257 aarch64_strip_shift (rtx x)
5258 {
5259 rtx op = x;
5260
5261 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5262 we can convert both to ROR during final output. */
5263 if ((GET_CODE (op) == ASHIFT
5264 || GET_CODE (op) == ASHIFTRT
5265 || GET_CODE (op) == LSHIFTRT
5266 || GET_CODE (op) == ROTATERT
5267 || GET_CODE (op) == ROTATE)
5268 && CONST_INT_P (XEXP (op, 1)))
5269 return XEXP (op, 0);
5270
5271 if (GET_CODE (op) == MULT
5272 && CONST_INT_P (XEXP (op, 1))
5273 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5274 return XEXP (op, 0);
5275
5276 return x;
5277 }
5278
5279 /* Helper function for rtx cost calculation. Strip an extend
5280 expression from X. Returns the inner operand if successful, or the
5281 original expression on failure. We deal with a number of possible
5282 canonicalization variations here. */
5283 static rtx
5284 aarch64_strip_extend (rtx x)
5285 {
5286 rtx op = x;
5287
5288 /* Zero and sign extraction of a widened value. */
5289 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5290 && XEXP (op, 2) == const0_rtx
5291 && GET_CODE (XEXP (op, 0)) == MULT
5292 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5293 XEXP (op, 1)))
5294 return XEXP (XEXP (op, 0), 0);
5295
5296 /* It can also be represented (for zero-extend) as an AND with an
5297 immediate. */
5298 if (GET_CODE (op) == AND
5299 && GET_CODE (XEXP (op, 0)) == MULT
5300 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5301 && CONST_INT_P (XEXP (op, 1))
5302 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5303 INTVAL (XEXP (op, 1))) != 0)
5304 return XEXP (XEXP (op, 0), 0);
5305
5306 /* Now handle extended register, as this may also have an optional
5307 left shift by 1..4. */
5308 if (GET_CODE (op) == ASHIFT
5309 && CONST_INT_P (XEXP (op, 1))
5310 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5311 op = XEXP (op, 0);
5312
5313 if (GET_CODE (op) == ZERO_EXTEND
5314 || GET_CODE (op) == SIGN_EXTEND)
5315 op = XEXP (op, 0);
5316
5317 if (op != x)
5318 return op;
5319
5320 return x;
5321 }
5322
5323 /* Return true iff CODE is a shift supported in combination
5324 with arithmetic instructions. */
5325
5326 static bool
5327 aarch64_shift_p (enum rtx_code code)
5328 {
5329 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5330 }
5331
5332 /* Helper function for rtx cost calculation. Calculate the cost of
5333 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5334 Return the calculated cost of the expression, recursing manually in to
5335 operands where needed. */
5336
5337 static int
5338 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5339 {
5340 rtx op0, op1;
5341 const struct cpu_cost_table *extra_cost
5342 = aarch64_tune_params.insn_extra_cost;
5343 int cost = 0;
5344 bool compound_p = (outer == PLUS || outer == MINUS);
5345 machine_mode mode = GET_MODE (x);
5346
5347 gcc_checking_assert (code == MULT);
5348
5349 op0 = XEXP (x, 0);
5350 op1 = XEXP (x, 1);
5351
5352 if (VECTOR_MODE_P (mode))
5353 mode = GET_MODE_INNER (mode);
5354
5355 /* Integer multiply/fma. */
5356 if (GET_MODE_CLASS (mode) == MODE_INT)
5357 {
5358 /* The multiply will be canonicalized as a shift; cost it as such. */
5359 if (aarch64_shift_p (GET_CODE (x))
5360 || (CONST_INT_P (op1)
5361 && exact_log2 (INTVAL (op1)) > 0))
5362 {
5363 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5364 || GET_CODE (op0) == SIGN_EXTEND;
5365 if (speed)
5366 {
5367 if (compound_p)
5368 {
5369 if (REG_P (op1))
5370 /* ARITH + shift-by-register. */
5371 cost += extra_cost->alu.arith_shift_reg;
5372 else if (is_extend)
5373 /* ARITH + extended register. We don't have a cost field
5374 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5375 cost += extra_cost->alu.extend_arith;
5376 else
5377 /* ARITH + shift-by-immediate. */
5378 cost += extra_cost->alu.arith_shift;
5379 }
5380 else
5381 /* LSL (immediate). */
5382 cost += extra_cost->alu.shift;
5383
5384 }
5385 /* Strip extends as we will have costed them in the case above. */
5386 if (is_extend)
5387 op0 = aarch64_strip_extend (op0);
5388
5389 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5390
5391 return cost;
5392 }
5393
5394 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5395 compound and let the below cases handle it. After all, MNEG is a
5396 special-case alias of MSUB. */
5397 if (GET_CODE (op0) == NEG)
5398 {
5399 op0 = XEXP (op0, 0);
5400 compound_p = true;
5401 }
5402
5403 /* Integer multiplies or FMAs have zero/sign extending variants. */
5404 if ((GET_CODE (op0) == ZERO_EXTEND
5405 && GET_CODE (op1) == ZERO_EXTEND)
5406 || (GET_CODE (op0) == SIGN_EXTEND
5407 && GET_CODE (op1) == SIGN_EXTEND))
5408 {
5409 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5410 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5411
5412 if (speed)
5413 {
5414 if (compound_p)
5415 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5416 cost += extra_cost->mult[0].extend_add;
5417 else
5418 /* MUL/SMULL/UMULL. */
5419 cost += extra_cost->mult[0].extend;
5420 }
5421
5422 return cost;
5423 }
5424
5425 /* This is either an integer multiply or a MADD. In both cases
5426 we want to recurse and cost the operands. */
5427 cost += rtx_cost (op0, MULT, 0, speed)
5428 + rtx_cost (op1, MULT, 1, speed);
5429
5430 if (speed)
5431 {
5432 if (compound_p)
5433 /* MADD/MSUB. */
5434 cost += extra_cost->mult[mode == DImode].add;
5435 else
5436 /* MUL. */
5437 cost += extra_cost->mult[mode == DImode].simple;
5438 }
5439
5440 return cost;
5441 }
5442 else
5443 {
5444 if (speed)
5445 {
5446 /* Floating-point FMA/FMUL can also support negations of the
5447 operands. */
5448 if (GET_CODE (op0) == NEG)
5449 op0 = XEXP (op0, 0);
5450 if (GET_CODE (op1) == NEG)
5451 op1 = XEXP (op1, 0);
5452
5453 if (compound_p)
5454 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5455 cost += extra_cost->fp[mode == DFmode].fma;
5456 else
5457 /* FMUL/FNMUL. */
5458 cost += extra_cost->fp[mode == DFmode].mult;
5459 }
5460
5461 cost += rtx_cost (op0, MULT, 0, speed)
5462 + rtx_cost (op1, MULT, 1, speed);
5463 return cost;
5464 }
5465 }
5466
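/* Return the cost of addressing mode X used to access memory of mode MODE,
   based on the tuning-specific address cost tables. */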
5467 static int
5468 aarch64_address_cost (rtx x,
5469 machine_mode mode,
5470 addr_space_t as ATTRIBUTE_UNUSED,
5471 bool speed)
5472 {
5473 enum rtx_code c = GET_CODE (x);
5474 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5475 struct aarch64_address_info info;
5476 int cost = 0;
5477 info.shift = 0;
5478
5479 if (!aarch64_classify_address (&info, x, mode, c, false))
5480 {
5481 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5482 {
5483 /* This is a CONST or SYMBOL ref which will be split
5484 in a different way depending on the code model in use.
5485 Cost it through the generic infrastructure. */
5486 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5487 /* Divide through by the cost of one instruction to
5488 bring it to the same units as the address costs. */
5489 cost_symbol_ref /= COSTS_N_INSNS (1);
5490 /* The cost is then the cost of preparing the address,
5491 followed by an immediate (possibly 0) offset. */
5492 return cost_symbol_ref + addr_cost->imm_offset;
5493 }
5494 else
5495 {
5496 /* This is most likely a jump table from a case
5497 statement. */
5498 return addr_cost->register_offset;
5499 }
5500 }
5501
5502 switch (info.type)
5503 {
5504 case ADDRESS_LO_SUM:
5505 case ADDRESS_SYMBOLIC:
5506 case ADDRESS_REG_IMM:
5507 cost += addr_cost->imm_offset;
5508 break;
5509
5510 case ADDRESS_REG_WB:
5511 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5512 cost += addr_cost->pre_modify;
5513 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5514 cost += addr_cost->post_modify;
5515 else
5516 gcc_unreachable ();
5517
5518 break;
5519
5520 case ADDRESS_REG_REG:
5521 cost += addr_cost->register_offset;
5522 break;
5523
5524 case ADDRESS_REG_UXTW:
5525 case ADDRESS_REG_SXTW:
5526 cost += addr_cost->register_extend;
5527 break;
5528
5529 default:
5530 gcc_unreachable ();
5531 }
5532
5533
5534 if (info.shift > 0)
5535 {
5536 /* For the sake of calculating the cost of the shifted register
5537 component, we can treat same sized modes in the same way. */
5538 switch (GET_MODE_BITSIZE (mode))
5539 {
5540 case 16:
5541 cost += addr_cost->addr_scale_costs.hi;
5542 break;
5543
5544 case 32:
5545 cost += addr_cost->addr_scale_costs.si;
5546 break;
5547
5548 case 64:
5549 cost += addr_cost->addr_scale_costs.di;
5550 break;
5551
5552 /* We can't tell, or this is a 128-bit vector. */
5553 default:
5554 cost += addr_cost->addr_scale_costs.ti;
5555 break;
5556 }
5557 }
5558
5559 return cost;
5560 }
5561
5562 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5563 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5564 to be taken. */
5565
5566 int
5567 aarch64_branch_cost (bool speed_p, bool predictable_p)
5568 {
5569 /* When optimizing for speed, use the cost of unpredictable branches. */
5570 const struct cpu_branch_cost *branch_costs =
5571 aarch64_tune_params.branch_costs;
5572
5573 if (!speed_p || predictable_p)
5574 return branch_costs->predictable;
5575 else
5576 return branch_costs->unpredictable;
5577 }
5578
5579 /* Return true if the RTX X in mode MODE is a zero or sign extract
5580 usable in an ADD or SUB (extended register) instruction. */
5581 static bool
5582 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5583 {
5584 /* Catch add with a sign extract.
5585 This is add_<optab><mode>_multp2. */
5586 if (GET_CODE (x) == SIGN_EXTRACT
5587 || GET_CODE (x) == ZERO_EXTRACT)
5588 {
5589 rtx op0 = XEXP (x, 0);
5590 rtx op1 = XEXP (x, 1);
5591 rtx op2 = XEXP (x, 2);
5592
5593 if (GET_CODE (op0) == MULT
5594 && CONST_INT_P (op1)
5595 && op2 == const0_rtx
5596 && CONST_INT_P (XEXP (op0, 1))
5597 && aarch64_is_extend_from_extract (mode,
5598 XEXP (op0, 1),
5599 op1))
5600 {
5601 return true;
5602 }
5603 }
5604
5605 return false;
5606 }
5607
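/* Return true if unspec code U corresponds to one of the FRINT*
   floating-point rounding instructions. */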
5608 static bool
5609 aarch64_frint_unspec_p (unsigned int u)
5610 {
5611 switch (u)
5612 {
5613 case UNSPEC_FRINTZ:
5614 case UNSPEC_FRINTP:
5615 case UNSPEC_FRINTM:
5616 case UNSPEC_FRINTA:
5617 case UNSPEC_FRINTN:
5618 case UNSPEC_FRINTX:
5619 case UNSPEC_FRINTI:
5620 return true;
5621
5622 default:
5623 return false;
5624 }
5625 }
5626
5627 /* Return true iff X is an rtx that will match an extr instruction
5628 i.e. as described in the *extr<mode>5_insn family of patterns.
5629 OP0 and OP1 will be set to the operands of the shifts involved
5630 on success and will be NULL_RTX otherwise. */
5631
5632 static bool
5633 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5634 {
5635 rtx op0, op1;
5636 machine_mode mode = GET_MODE (x);
5637
5638 *res_op0 = NULL_RTX;
5639 *res_op1 = NULL_RTX;
5640
5641 if (GET_CODE (x) != IOR)
5642 return false;
5643
5644 op0 = XEXP (x, 0);
5645 op1 = XEXP (x, 1);
5646
5647 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5648 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5649 {
5650 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5651 if (GET_CODE (op1) == ASHIFT)
5652 std::swap (op0, op1);
5653
5654 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5655 return false;
5656
5657 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5658 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5659
5660 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5661 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5662 {
5663 *res_op0 = XEXP (op0, 0);
5664 *res_op1 = XEXP (op1, 0);
5665 return true;
5666 }
5667 }
5668
5669 return false;
5670 }
5671
5672 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5673 storing it in *COST. Result is true if the total cost of the operation
5674 has now been calculated. */
5675 static bool
5676 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5677 {
5678 rtx inner;
5679 rtx comparator;
5680 enum rtx_code cmpcode;
5681
5682 if (COMPARISON_P (op0))
5683 {
5684 inner = XEXP (op0, 0);
5685 comparator = XEXP (op0, 1);
5686 cmpcode = GET_CODE (op0);
5687 }
5688 else
5689 {
5690 inner = op0;
5691 comparator = const0_rtx;
5692 cmpcode = NE;
5693 }
5694
5695 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5696 {
5697 /* Conditional branch. */
5698 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5699 return true;
5700 else
5701 {
5702 if (cmpcode == NE || cmpcode == EQ)
5703 {
5704 if (comparator == const0_rtx)
5705 {
5706 /* TBZ/TBNZ/CBZ/CBNZ. */
5707 if (GET_CODE (inner) == ZERO_EXTRACT)
5708 /* TBZ/TBNZ. */
5709 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5710 0, speed);
5711 else
5712 /* CBZ/CBNZ. */
5713 *cost += rtx_cost (inner, cmpcode, 0, speed);
5714
5715 return true;
5716 }
5717 }
5718 else if (cmpcode == LT || cmpcode == GE)
5719 {
5720 /* TBZ/TBNZ. */
5721 if (comparator == const0_rtx)
5722 return true;
5723 }
5724 }
5725 }
5726 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5727 {
5728 /* It's a conditional operation based on the status flags,
5729 so it must be some flavor of CSEL. */
5730
5731 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5732 if (GET_CODE (op1) == NEG
5733 || GET_CODE (op1) == NOT
5734 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5735 op1 = XEXP (op1, 0);
5736
5737 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5738 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5739 return true;
5740 }
5741
5742 /* We don't know what this is, cost all operands. */
5743 return false;
5744 }
5745
5746 /* Calculate the cost of calculating X, storing it in *COST. Result
5747 is true if the total cost of the operation has now been calculated. */
5748 static bool
5749 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5750 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5751 {
5752 rtx op0, op1, op2;
5753 const struct cpu_cost_table *extra_cost
5754 = aarch64_tune_params.insn_extra_cost;
5755 machine_mode mode = GET_MODE (x);
5756
5757 /* By default, assume that everything has equivalent cost to the
5758 cheapest instruction. Any additional costs are applied as a delta
5759 above this default. */
5760 *cost = COSTS_N_INSNS (1);
5761
5762 switch (code)
5763 {
5764 case SET:
5765 /* The cost depends entirely on the operands to SET. */
5766 *cost = 0;
5767 op0 = SET_DEST (x);
5768 op1 = SET_SRC (x);
5769
5770 switch (GET_CODE (op0))
5771 {
5772 case MEM:
5773 if (speed)
5774 {
5775 rtx address = XEXP (op0, 0);
5776 if (VECTOR_MODE_P (mode))
5777 *cost += extra_cost->ldst.storev;
5778 else if (GET_MODE_CLASS (mode) == MODE_INT)
5779 *cost += extra_cost->ldst.store;
5780 else if (mode == SFmode)
5781 *cost += extra_cost->ldst.storef;
5782 else if (mode == DFmode)
5783 *cost += extra_cost->ldst.stored;
5784
5785 *cost +=
5786 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5787 0, speed));
5788 }
5789
5790 *cost += rtx_cost (op1, SET, 1, speed);
5791 return true;
5792
5793 case SUBREG:
5794 if (! REG_P (SUBREG_REG (op0)))
5795 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5796
5797 /* Fall through. */
5798 case REG:
5799 /* The cost is one per vector-register copied. */
5800 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5801 {
5802 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5803 / GET_MODE_SIZE (V4SImode);
5804 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5805 }
5806 /* const0_rtx is in general free, but we will use an
5807 instruction to set a register to 0. */
5808 else if (REG_P (op1) || op1 == const0_rtx)
5809 {
5810 /* The cost is 1 per register copied. */
5811 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5812 / UNITS_PER_WORD;
5813 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5814 }
5815 else
5816 /* Cost is just the cost of the RHS of the set. */
5817 *cost += rtx_cost (op1, SET, 1, speed);
5818 return true;
5819
5820 case ZERO_EXTRACT:
5821 case SIGN_EXTRACT:
5822 /* Bit-field insertion. Strip any redundant widening of
5823 the RHS to meet the width of the target. */
5824 if (GET_CODE (op1) == SUBREG)
5825 op1 = SUBREG_REG (op1);
5826 if ((GET_CODE (op1) == ZERO_EXTEND
5827 || GET_CODE (op1) == SIGN_EXTEND)
5828 && CONST_INT_P (XEXP (op0, 1))
5829 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5830 >= INTVAL (XEXP (op0, 1))))
5831 op1 = XEXP (op1, 0);
5832
5833 if (CONST_INT_P (op1))
5834 {
5835 /* MOV immediate is assumed to always be cheap. */
5836 *cost = COSTS_N_INSNS (1);
5837 }
5838 else
5839 {
5840 /* BFM. */
5841 if (speed)
5842 *cost += extra_cost->alu.bfi;
5843 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5844 }
5845
5846 return true;
5847
5848 default:
5849 /* We can't make sense of this, assume default cost. */
5850 *cost = COSTS_N_INSNS (1);
5851 return false;
5852 }
5853 return false;
5854
5855 case CONST_INT:
5856 /* If an instruction can incorporate a constant within the
5857 instruction, the instruction's expression avoids calling
5858 rtx_cost() on the constant. If rtx_cost() is called on a
5859 constant, then it is usually because the constant must be
5860 moved into a register by one or more instructions.
5861
5862 The exception is constant 0, which can be expressed
5863 as XZR/WZR and is therefore free. The one case where this
5864 does not hold is (set (reg) (const0_rtx)), where we must cost
5865 the move. However, we can catch that when we cost the SET, so
5866 we don't need to consider that here. */
5867 if (x == const0_rtx)
5868 *cost = 0;
5869 else
5870 {
5871 /* To an approximation, building any other constant is
5872 proportionally expensive to the number of instructions
5873 required to build that constant. This is true whether we
5874 are compiling for SPEED or otherwise. */
5875 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5876 (NULL_RTX, x, false, mode));
5877 }
5878 return true;
5879
5880 case CONST_DOUBLE:
5881 if (speed)
5882 {
5883 /* mov[df,sf]_aarch64. */
5884 if (aarch64_float_const_representable_p (x))
5885 /* FMOV (scalar immediate). */
5886 *cost += extra_cost->fp[mode == DFmode].fpconst;
5887 else if (!aarch64_float_const_zero_rtx_p (x))
5888 {
5889 /* This will be a load from memory. */
5890 if (mode == DFmode)
5891 *cost += extra_cost->ldst.loadd;
5892 else
5893 *cost += extra_cost->ldst.loadf;
5894 }
5895 else
5896 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5897 or MOV v0.s[0], wzr - neither of which are modeled by the
5898 cost tables. Just use the default cost. */
5899 {
5900 }
5901 }
5902
5903 return true;
5904
5905 case MEM:
5906 if (speed)
5907 {
5908 /* For loads we want the base cost of a load, plus an
5909 approximation for the additional cost of the addressing
5910 mode. */
5911 rtx address = XEXP (x, 0);
5912 if (VECTOR_MODE_P (mode))
5913 *cost += extra_cost->ldst.loadv;
5914 else if (GET_MODE_CLASS (mode) == MODE_INT)
5915 *cost += extra_cost->ldst.load;
5916 else if (mode == SFmode)
5917 *cost += extra_cost->ldst.loadf;
5918 else if (mode == DFmode)
5919 *cost += extra_cost->ldst.loadd;
5920
5921 *cost +=
5922 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5923 0, speed));
5924 }
5925
5926 return true;
5927
5928 case NEG:
5929 op0 = XEXP (x, 0);
5930
5931 if (VECTOR_MODE_P (mode))
5932 {
5933 if (speed)
5934 {
5935 /* FNEG. */
5936 *cost += extra_cost->vect.alu;
5937 }
5938 return false;
5939 }
5940
5941 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5942 {
5943 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5944 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5945 {
5946 /* CSETM. */
5947 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5948 return true;
5949 }
5950
5951 /* Cost this as SUB wzr, X. */
5952 op0 = CONST0_RTX (GET_MODE (x));
5953 op1 = XEXP (x, 0);
5954 goto cost_minus;
5955 }
5956
5957 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5958 {
5959 /* Support (neg(fma...)) as a single instruction only if
5960 sign of zeros is unimportant. This matches the decision
5961 making in aarch64.md. */
5962 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5963 {
5964 /* FNMADD. */
5965 *cost = rtx_cost (op0, NEG, 0, speed);
5966 return true;
5967 }
5968 if (speed)
5969 /* FNEG. */
5970 *cost += extra_cost->fp[mode == DFmode].neg;
5971 return false;
5972 }
5973
5974 return false;
5975
5976 case CLRSB:
5977 case CLZ:
5978 if (speed)
5979 {
5980 if (VECTOR_MODE_P (mode))
5981 *cost += extra_cost->vect.alu;
5982 else
5983 *cost += extra_cost->alu.clz;
5984 }
5985
5986 return false;
5987
5988 case COMPARE:
5989 op0 = XEXP (x, 0);
5990 op1 = XEXP (x, 1);
5991
5992 if (op1 == const0_rtx
5993 && GET_CODE (op0) == AND)
5994 {
5995 x = op0;
5996 goto cost_logic;
5997 }
5998
5999 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6000 {
6001 /* TODO: A write to the CC flags possibly costs extra, this
6002 needs encoding in the cost tables. */
6003
6004 /* CC_ZESWPmode supports zero extend for free. */
6005 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6006 op0 = XEXP (op0, 0);
6007
6008 /* ANDS. */
6009 if (GET_CODE (op0) == AND)
6010 {
6011 x = op0;
6012 goto cost_logic;
6013 }
6014
6015 if (GET_CODE (op0) == PLUS)
6016 {
6017 /* ADDS (and CMN alias). */
6018 x = op0;
6019 goto cost_plus;
6020 }
6021
6022 if (GET_CODE (op0) == MINUS)
6023 {
6024 /* SUBS. */
6025 x = op0;
6026 goto cost_minus;
6027 }
6028
6029 if (GET_CODE (op1) == NEG)
6030 {
6031 /* CMN. */
6032 if (speed)
6033 *cost += extra_cost->alu.arith;
6034
6035 *cost += rtx_cost (op0, COMPARE, 0, speed);
6036 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
6037 return true;
6038 }
6039
6040 /* CMP.
6041
6042 Compare can freely swap the order of operands, and
6043 canonicalization puts the more complex operation first.
6044 But the integer MINUS logic expects the shift/extend
6045 operation in op1. */
6046 if (! (REG_P (op0)
6047 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6048 {
6049 op0 = XEXP (x, 1);
6050 op1 = XEXP (x, 0);
6051 }
6052 goto cost_minus;
6053 }
6054
6055 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6056 {
6057 /* FCMP. */
6058 if (speed)
6059 *cost += extra_cost->fp[mode == DFmode].compare;
6060
6061 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6062 {
6063 *cost += rtx_cost (op0, COMPARE, 0, speed);
6064 /* FCMP supports constant 0.0 for no extra cost. */
6065 return true;
6066 }
6067 return false;
6068 }
6069
6070 if (VECTOR_MODE_P (mode))
6071 {
6072 /* Vector compare. */
6073 if (speed)
6074 *cost += extra_cost->vect.alu;
6075
6076 if (aarch64_float_const_zero_rtx_p (op1))
6077 {
6078 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6079 cost. */
6080 return true;
6081 }
6082 return false;
6083 }
6084 return false;
6085
6086 case MINUS:
6087 {
6088 op0 = XEXP (x, 0);
6089 op1 = XEXP (x, 1);
6090
6091 cost_minus:
6092 *cost += rtx_cost (op0, MINUS, 0, speed);
6093
6094 /* Detect valid immediates. */
6095 if ((GET_MODE_CLASS (mode) == MODE_INT
6096 || (GET_MODE_CLASS (mode) == MODE_CC
6097 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6098 && CONST_INT_P (op1)
6099 && aarch64_uimm12_shift (INTVAL (op1)))
6100 {
6101 if (speed)
6102 /* SUB(S) (immediate). */
6103 *cost += extra_cost->alu.arith;
6104 return true;
6105 }
6106
6107 /* Look for SUB (extended register). */
6108 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6109 {
6110 if (speed)
6111 *cost += extra_cost->alu.extend_arith;
6112
6113 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
6114 (enum rtx_code) GET_CODE (op1),
6115 0, speed);
6116 return true;
6117 }
6118
6119 rtx new_op1 = aarch64_strip_extend (op1);
6120
6121 /* Cost this as an FMA-alike operation. */
6122 if ((GET_CODE (new_op1) == MULT
6123 || aarch64_shift_p (GET_CODE (new_op1)))
6124 && code != COMPARE)
6125 {
6126 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6127 (enum rtx_code) code,
6128 speed);
6129 return true;
6130 }
6131
6132 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6133
6134 if (speed)
6135 {
6136 if (VECTOR_MODE_P (mode))
6137 {
6138 /* Vector SUB. */
6139 *cost += extra_cost->vect.alu;
6140 }
6141 else if (GET_MODE_CLASS (mode) == MODE_INT)
6142 {
6143 /* SUB(S). */
6144 *cost += extra_cost->alu.arith;
6145 }
6146 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6147 {
6148 /* FSUB. */
6149 *cost += extra_cost->fp[mode == DFmode].addsub;
6150 }
6151 }
6152 return true;
6153 }
6154
6155 case PLUS:
6156 {
6157 rtx new_op0;
6158
6159 op0 = XEXP (x, 0);
6160 op1 = XEXP (x, 1);
6161
6162 cost_plus:
6163 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6164 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6165 {
6166 /* CSINC. */
6167 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6168 *cost += rtx_cost (op1, PLUS, 1, speed);
6169 return true;
6170 }
6171
6172 if (GET_MODE_CLASS (mode) == MODE_INT
6173 && CONST_INT_P (op1)
6174 && aarch64_uimm12_shift (INTVAL (op1)))
6175 {
6176 *cost += rtx_cost (op0, PLUS, 0, speed);
6177
6178 if (speed)
6179 /* ADD (immediate). */
6180 *cost += extra_cost->alu.arith;
6181 return true;
6182 }
6183
6184 *cost += rtx_cost (op1, PLUS, 1, speed);
6185
6186 /* Look for ADD (extended register). */
6187 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6188 {
6189 if (speed)
6190 *cost += extra_cost->alu.extend_arith;
6191
6192 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6193 (enum rtx_code) GET_CODE (op0),
6194 0, speed);
6195 return true;
6196 }
6197
6198 /* Strip any extend; leave shifts behind, as we will
6199 cost them through mult_cost. */
6200 new_op0 = aarch64_strip_extend (op0);
6201
6202 if (GET_CODE (new_op0) == MULT
6203 || aarch64_shift_p (GET_CODE (new_op0)))
6204 {
6205 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6206 speed);
6207 return true;
6208 }
6209
6210 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6211
6212 if (speed)
6213 {
6214 if (VECTOR_MODE_P (mode))
6215 {
6216 /* Vector ADD. */
6217 *cost += extra_cost->vect.alu;
6218 }
6219 else if (GET_MODE_CLASS (mode) == MODE_INT)
6220 {
6221 /* ADD. */
6222 *cost += extra_cost->alu.arith;
6223 }
6224 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6225 {
6226 /* FADD. */
6227 *cost += extra_cost->fp[mode == DFmode].addsub;
6228 }
6229 }
6230 return true;
6231 }
6232
6233 case BSWAP:
6234 *cost = COSTS_N_INSNS (1);
6235
6236 if (speed)
6237 {
6238 if (VECTOR_MODE_P (mode))
6239 *cost += extra_cost->vect.alu;
6240 else
6241 *cost += extra_cost->alu.rev;
6242 }
6243 return false;
6244
6245 case IOR:
6246 if (aarch_rev16_p (x))
6247 {
6248 *cost = COSTS_N_INSNS (1);
6249
6250 if (speed)
6251 {
6252 if (VECTOR_MODE_P (mode))
6253 *cost += extra_cost->vect.alu;
6254 else
6255 *cost += extra_cost->alu.rev;
6256 }
6257 return true;
6258 }
6259
6260 if (aarch64_extr_rtx_p (x, &op0, &op1))
6261 {
6262 *cost += rtx_cost (op0, IOR, 0, speed)
6263 + rtx_cost (op1, IOR, 1, speed);
6264 if (speed)
6265 *cost += extra_cost->alu.shift;
6266
6267 return true;
6268 }
6269 /* Fall through. */
6270 case XOR:
6271 case AND:
6272 cost_logic:
6273 op0 = XEXP (x, 0);
6274 op1 = XEXP (x, 1);
6275
6276 if (VECTOR_MODE_P (mode))
6277 {
6278 if (speed)
6279 *cost += extra_cost->vect.alu;
6280 return true;
6281 }
6282
6283 if (code == AND
6284 && GET_CODE (op0) == MULT
6285 && CONST_INT_P (XEXP (op0, 1))
6286 && CONST_INT_P (op1)
6287 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6288 INTVAL (op1)) != 0)
6289 {
6290 /* This is a UBFM/SBFM. */
6291 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6292 if (speed)
6293 *cost += extra_cost->alu.bfx;
6294 return true;
6295 }
6296
6297 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6298 {
6299 /* We possibly get the immediate for free; this is not
6300 modelled. */
6301 if (CONST_INT_P (op1)
6302 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6303 {
6304 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6305
6306 if (speed)
6307 *cost += extra_cost->alu.logical;
6308
6309 return true;
6310 }
6311 else
6312 {
6313 rtx new_op0 = op0;
6314
6315 /* Handle ORN, EON, or BIC. */
6316 if (GET_CODE (op0) == NOT)
6317 op0 = XEXP (op0, 0);
6318
6319 new_op0 = aarch64_strip_shift (op0);
6320
6321 /* If we had a shift on op0 then this is a logical-shift-
6322 by-register/immediate operation. Otherwise, this is just
6323 a logical operation. */
6324 if (speed)
6325 {
6326 if (new_op0 != op0)
6327 {
6328 /* Shift by immediate. */
6329 if (CONST_INT_P (XEXP (op0, 1)))
6330 *cost += extra_cost->alu.log_shift;
6331 else
6332 *cost += extra_cost->alu.log_shift_reg;
6333 }
6334 else
6335 *cost += extra_cost->alu.logical;
6336 }
6337
6338 /* In both cases we want to cost both operands. */
6339 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6340 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6341
6342 return true;
6343 }
6344 }
6345 return false;
6346
6347 case NOT:
6348 x = XEXP (x, 0);
6349 op0 = aarch64_strip_shift (x);
6350
6351 if (VECTOR_MODE_P (mode))
6352 {
6353 /* Vector NOT. */
6354 *cost += extra_cost->vect.alu;
6355 return false;
6356 }
6357
6358 /* MVN-shifted-reg. */
6359 if (op0 != x)
6360 {
6361 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6362
6363 if (speed)
6364 *cost += extra_cost->alu.log_shift;
6365
6366 return true;
6367 }
6368 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6369 Handle the second form here, taking care that 'a' in the above can
6370 be a shift. */
6371 else if (GET_CODE (op0) == XOR)
6372 {
6373 rtx newop0 = XEXP (op0, 0);
6374 rtx newop1 = XEXP (op0, 1);
6375 rtx op0_stripped = aarch64_strip_shift (newop0);
6376
6377 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6378 + rtx_cost (op0_stripped, XOR, 0, speed);
6379
6380 if (speed)
6381 {
6382 if (op0_stripped != newop0)
6383 *cost += extra_cost->alu.log_shift;
6384 else
6385 *cost += extra_cost->alu.logical;
6386 }
6387
6388 return true;
6389 }
6390 /* MVN. */
6391 if (speed)
6392 *cost += extra_cost->alu.logical;
6393
6394 return false;
6395
6396 case ZERO_EXTEND:
6397
6398 op0 = XEXP (x, 0);
6399 /* If a value is written in SI mode, then zero extended to DI
6400 mode, the operation will in general be free as a write to
6401 a 'w' register implicitly zeroes the upper bits of an 'x'
6402 register. However, if this is
6403
6404 (set (reg) (zero_extend (reg)))
6405
6406 we must cost the explicit register move. */
6407 if (mode == DImode
6408 && GET_MODE (op0) == SImode
6409 && outer == SET)
6410 {
6411 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6412
6413 if (!op_cost && speed)
6414 /* MOV. */
6415 *cost += extra_cost->alu.extend;
6416 else
6417 /* Free; the cost is that of the SI mode operation. */
6418 *cost = op_cost;
6419
6420 return true;
6421 }
6422 else if (MEM_P (XEXP (x, 0)))
6423 {
6424 /* All loads can zero extend to any size for free. */
6425 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6426 return true;
6427 }
6428
6429 if (speed)
6430 {
6431 if (VECTOR_MODE_P (mode))
6432 {
6433 /* UMOV. */
6434 *cost += extra_cost->vect.alu;
6435 }
6436 else
6437 {
6438 /* UXTB/UXTH. */
6439 *cost += extra_cost->alu.extend;
6440 }
6441 }
6442 return false;
6443
6444 case SIGN_EXTEND:
6445 if (MEM_P (XEXP (x, 0)))
6446 {
6447 /* LDRSH. */
6448 if (speed)
6449 {
6450 rtx address = XEXP (XEXP (x, 0), 0);
6451 *cost += extra_cost->ldst.load_sign_extend;
6452
6453 *cost +=
6454 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6455 0, speed));
6456 }
6457 return true;
6458 }
6459
6460 if (speed)
6461 {
6462 if (VECTOR_MODE_P (mode))
6463 *cost += extra_cost->vect.alu;
6464 else
6465 *cost += extra_cost->alu.extend;
6466 }
6467 return false;
6468
6469 case ASHIFT:
6470 op0 = XEXP (x, 0);
6471 op1 = XEXP (x, 1);
6472
6473 if (CONST_INT_P (op1))
6474 {
6475 if (speed)
6476 {
6477 if (VECTOR_MODE_P (mode))
6478 {
6479 /* Vector shift (immediate). */
6480 *cost += extra_cost->vect.alu;
6481 }
6482 else
6483 {
6484 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6485 aliases. */
6486 *cost += extra_cost->alu.shift;
6487 }
6488 }
6489
6490 /* We can incorporate zero/sign extend for free. */
6491 if (GET_CODE (op0) == ZERO_EXTEND
6492 || GET_CODE (op0) == SIGN_EXTEND)
6493 op0 = XEXP (op0, 0);
6494
6495 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6496 return true;
6497 }
6498 else
6499 {
6500 if (speed)
6501 {
6502 if (VECTOR_MODE_P (mode))
6503 {
6504 /* Vector shift (register). */
6505 *cost += extra_cost->vect.alu;
6506 }
6507 else
6508 {
6509 /* LSLV. */
6510 *cost += extra_cost->alu.shift_reg;
6511 }
6512 }
6513 return false; /* All arguments need to be in registers. */
6514 }
6515
6516 case ROTATE:
6517 case ROTATERT:
6518 case LSHIFTRT:
6519 case ASHIFTRT:
6520 op0 = XEXP (x, 0);
6521 op1 = XEXP (x, 1);
6522
6523 if (CONST_INT_P (op1))
6524 {
6525 /* ASR (immediate) and friends. */
6526 if (speed)
6527 {
6528 if (VECTOR_MODE_P (mode))
6529 *cost += extra_cost->vect.alu;
6530 else
6531 *cost += extra_cost->alu.shift;
6532 }
6533
6534 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6535 return true;
6536 }
6537 else
6538 {
6539
6540 /* ASR (register) and friends. */
6541 if (speed)
6542 {
6543 if (VECTOR_MODE_P (mode))
6544 *cost += extra_cost->vect.alu;
6545 else
6546 *cost += extra_cost->alu.shift_reg;
6547 }
6548 return false; /* All arguments need to be in registers. */
6549 }
6550
6551 case SYMBOL_REF:
6552
6553 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6554 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6555 {
6556 /* LDR. */
6557 if (speed)
6558 *cost += extra_cost->ldst.load;
6559 }
6560 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6561 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6562 {
6563 /* ADRP, followed by ADD. */
6564 *cost += COSTS_N_INSNS (1);
6565 if (speed)
6566 *cost += 2 * extra_cost->alu.arith;
6567 }
6568 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6569 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6570 {
6571 /* ADR. */
6572 if (speed)
6573 *cost += extra_cost->alu.arith;
6574 }
6575
6576 if (flag_pic)
6577 {
6578 /* One extra load instruction, after accessing the GOT. */
6579 *cost += COSTS_N_INSNS (1);
6580 if (speed)
6581 *cost += extra_cost->ldst.load;
6582 }
6583 return true;
6584
6585 case HIGH:
6586 case LO_SUM:
6587 /* ADRP/ADD (immediate). */
6588 if (speed)
6589 *cost += extra_cost->alu.arith;
6590 return true;
6591
6592 case ZERO_EXTRACT:
6593 case SIGN_EXTRACT:
6594 /* UBFX/SBFX. */
6595 if (speed)
6596 {
6597 if (VECTOR_MODE_P (mode))
6598 *cost += extra_cost->vect.alu;
6599 else
6600 *cost += extra_cost->alu.bfx;
6601 }
6602
6603 /* We can trust that the immediates used will be correct (there
6604 are no by-register forms), so we need only cost op0. */
6605 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6606 return true;
6607
6608 case MULT:
6609 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6610 /* aarch64_rtx_mult_cost always handles recursion to its
6611 operands. */
6612 return true;
6613
6614 case MOD:
6615 case UMOD:
6616 if (speed)
6617 {
6618 if (VECTOR_MODE_P (mode))
6619 *cost += extra_cost->vect.alu;
6620 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6621 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6622 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6623 else if (GET_MODE (x) == DFmode)
6624 *cost += (extra_cost->fp[1].mult
6625 + extra_cost->fp[1].div);
6626 else if (GET_MODE (x) == SFmode)
6627 *cost += (extra_cost->fp[0].mult
6628 + extra_cost->fp[0].div);
6629 }
6630 return false; /* All arguments need to be in registers. */
6631
6632 case DIV:
6633 case UDIV:
6634 case SQRT:
6635 if (speed)
6636 {
6637 if (VECTOR_MODE_P (mode))
6638 *cost += extra_cost->vect.alu;
6639 else if (GET_MODE_CLASS (mode) == MODE_INT)
6640 /* There is no integer SQRT, so only DIV and UDIV can get
6641 here. */
6642 *cost += extra_cost->mult[mode == DImode].idiv;
6643 else
6644 *cost += extra_cost->fp[mode == DFmode].div;
6645 }
6646 return false; /* All arguments need to be in registers. */
6647
6648 case IF_THEN_ELSE:
6649 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6650 XEXP (x, 2), cost, speed);
6651
6652 case EQ:
6653 case NE:
6654 case GT:
6655 case GTU:
6656 case LT:
6657 case LTU:
6658 case GE:
6659 case GEU:
6660 case LE:
6661 case LEU:
6662
6663 return false; /* All arguments must be in registers. */
6664
6665 case FMA:
6666 op0 = XEXP (x, 0);
6667 op1 = XEXP (x, 1);
6668 op2 = XEXP (x, 2);
6669
6670 if (speed)
6671 {
6672 if (VECTOR_MODE_P (mode))
6673 *cost += extra_cost->vect.alu;
6674 else
6675 *cost += extra_cost->fp[mode == DFmode].fma;
6676 }
6677
6678 /* FMSUB, FNMADD, and FNMSUB are free. */
6679 if (GET_CODE (op0) == NEG)
6680 op0 = XEXP (op0, 0);
6681
6682 if (GET_CODE (op2) == NEG)
6683 op2 = XEXP (op2, 0);
6684
6685 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6686 and the by-element operand as operand 0. */
6687 if (GET_CODE (op1) == NEG)
6688 op1 = XEXP (op1, 0);
6689
6690 /* Catch vector-by-element operations. The by-element operand can
6691 either be (vec_duplicate (vec_select (x))) or just
6692 (vec_select (x)), depending on whether we are multiplying by
6693 a vector or a scalar.
6694
6695 Canonicalization is not very good in these cases: FMA4 will put the
6696 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6697 if (GET_CODE (op0) == VEC_DUPLICATE)
6698 op0 = XEXP (op0, 0);
6699 else if (GET_CODE (op1) == VEC_DUPLICATE)
6700 op1 = XEXP (op1, 0);
6701
6702 if (GET_CODE (op0) == VEC_SELECT)
6703 op0 = XEXP (op0, 0);
6704 else if (GET_CODE (op1) == VEC_SELECT)
6705 op1 = XEXP (op1, 0);
6706
6707 /* If the remaining parameters are not registers,
6708 get the cost to put them into registers. */
6709 *cost += rtx_cost (op0, FMA, 0, speed);
6710 *cost += rtx_cost (op1, FMA, 1, speed);
6711 *cost += rtx_cost (op2, FMA, 2, speed);
6712 return true;
6713
6714 case FLOAT:
6715 case UNSIGNED_FLOAT:
6716 if (speed)
6717 *cost += extra_cost->fp[mode == DFmode].fromint;
6718 return false;
6719
6720 case FLOAT_EXTEND:
6721 if (speed)
6722 {
6723 if (VECTOR_MODE_P (mode))
6724 {
6725 /* Vector widening conversion. */
6726 *cost += extra_cost->vect.alu;
6727 }
6728 else
6729 *cost += extra_cost->fp[mode == DFmode].widen;
6730 }
6731 return false;
6732
6733 case FLOAT_TRUNCATE:
6734 if (speed)
6735 {
6736 if (VECTOR_MODE_P (mode))
6737 {
6738 /* Vector narrowing conversion. */
6739 *cost += extra_cost->vect.alu;
6740 }
6741 else
6742 *cost += extra_cost->fp[mode == DFmode].narrow;
6743 }
6744 return false;
6745
6746 case FIX:
6747 case UNSIGNED_FIX:
6748 x = XEXP (x, 0);
6749 /* Strip the rounding part. They will all be implemented
6750 by the fcvt* family of instructions anyway. */
6751 if (GET_CODE (x) == UNSPEC)
6752 {
6753 unsigned int uns_code = XINT (x, 1);
6754
6755 if (uns_code == UNSPEC_FRINTA
6756 || uns_code == UNSPEC_FRINTM
6757 || uns_code == UNSPEC_FRINTN
6758 || uns_code == UNSPEC_FRINTP
6759 || uns_code == UNSPEC_FRINTZ)
6760 x = XVECEXP (x, 0, 0);
6761 }
6762
6763 if (speed)
6764 {
6765 if (VECTOR_MODE_P (mode))
6766 *cost += extra_cost->vect.alu;
6767 else
6768 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6769 }
6770 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6771 return true;
6772
6773 case ABS:
6774 if (VECTOR_MODE_P (mode))
6775 {
6776 /* ABS (vector). */
6777 if (speed)
6778 *cost += extra_cost->vect.alu;
6779 }
6780 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6781 {
6782 op0 = XEXP (x, 0);
6783
6784 /* FABD, which is analogous to FADD. */
6785 if (GET_CODE (op0) == MINUS)
6786 {
6787 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6788 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6789 if (speed)
6790 *cost += extra_cost->fp[mode == DFmode].addsub;
6791
6792 return true;
6793 }
6794 /* Simple FABS is analogous to FNEG. */
6795 if (speed)
6796 *cost += extra_cost->fp[mode == DFmode].neg;
6797 }
6798 else
6799 {
6800 /* Integer ABS will either be split into
6801 two arithmetic instructions, or will be an ABS
6802 (scalar), which we don't model. */
6803 *cost = COSTS_N_INSNS (2);
6804 if (speed)
6805 *cost += 2 * extra_cost->alu.arith;
6806 }
6807 return false;
6808
6809 case SMAX:
6810 case SMIN:
6811 if (speed)
6812 {
6813 if (VECTOR_MODE_P (mode))
6814 *cost += extra_cost->vect.alu;
6815 else
6816 {
6817 /* FMAXNM/FMINNM/FMAX/FMIN.
6818 TODO: This may not be accurate for all implementations, but
6819 we do not model this in the cost tables. */
6820 *cost += extra_cost->fp[mode == DFmode].addsub;
6821 }
6822 }
6823 return false;
6824
6825 case UNSPEC:
6826 /* The floating point round to integer frint* instructions. */
6827 if (aarch64_frint_unspec_p (XINT (x, 1)))
6828 {
6829 if (speed)
6830 *cost += extra_cost->fp[mode == DFmode].roundint;
6831
6832 return false;
6833 }
6834
6835 if (XINT (x, 1) == UNSPEC_RBIT)
6836 {
6837 if (speed)
6838 *cost += extra_cost->alu.rev;
6839
6840 return false;
6841 }
6842 break;
6843
6844 case TRUNCATE:
6845
6846 /* Decompose <su>muldi3_highpart. */
6847 if (/* (truncate:DI */
6848 mode == DImode
6849 /* (lshiftrt:TI */
6850 && GET_MODE (XEXP (x, 0)) == TImode
6851 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6852 /* (mult:TI */
6853 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6854 /* (ANY_EXTEND:TI (reg:DI))
6855 (ANY_EXTEND:TI (reg:DI))) */
6856 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6857 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6858 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6859 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6860 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6861 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6862 /* (const_int 64) */
6863 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6864 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6865 {
6866 /* UMULH/SMULH. */
6867 if (speed)
6868 *cost += extra_cost->mult[mode == DImode].extend;
6869 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6870 MULT, 0, speed);
6871 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6872 MULT, 1, speed);
6873 return true;
6874 }
6875
6876 /* Fall through. */
6877 default:
6878 break;
6879 }
6880
6881 if (dump_file && (dump_flags & TDF_DETAILS))
6882 fprintf (dump_file,
6883 "\nFailed to cost RTX. Assuming default cost.\n");
6884
6885 return true;
6886 }
6887
6888 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
6889 calculated for X. This cost is stored in *COST. Returns true
6890 if the total cost of X was calculated. */
6891 static bool
6892 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6893 int param, int *cost, bool speed)
6894 {
6895 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6896
6897 if (dump_file && (dump_flags & TDF_DETAILS))
6898 {
6899 print_rtl_single (dump_file, x);
6900 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6901 speed ? "Hot" : "Cold",
6902 *cost, result ? "final" : "partial");
6903 }
6904
6905 return result;
6906 }
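
/* A sketch of the dump output produced by aarch64_rtx_costs_wrapper above
   (values illustrative): when detailed dumping is enabled, each costed RTX
   is printed followed by a line such as

   Hot cost: 8 (final)

   where "Hot"/"Cold" reflects SPEED and "final"/"partial" reflects whether
   aarch64_rtx_costs fully costed the expression. */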
6907
6908 static int
6909 aarch64_register_move_cost (machine_mode mode,
6910 reg_class_t from_i, reg_class_t to_i)
6911 {
6912 enum reg_class from = (enum reg_class) from_i;
6913 enum reg_class to = (enum reg_class) to_i;
6914 const struct cpu_regmove_cost *regmove_cost
6915 = aarch64_tune_params.regmove_cost;
6916
6917 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6918 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6919 to = GENERAL_REGS;
6920
6921 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6922 from = GENERAL_REGS;
6923
6924 /* Moving between a GPR and the stack register costs the same as GP2GP. */
6925 if ((from == GENERAL_REGS && to == STACK_REG)
6926 || (to == GENERAL_REGS && from == STACK_REG))
6927 return regmove_cost->GP2GP;
6928
6929 /* To/From the stack register, we move via the gprs. */
6930 if (to == STACK_REG || from == STACK_REG)
6931 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6932 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6933
6934 if (GET_MODE_SIZE (mode) == 16)
6935 {
6936 /* 128-bit operations on general registers require 2 instructions. */
6937 if (from == GENERAL_REGS && to == GENERAL_REGS)
6938 return regmove_cost->GP2GP * 2;
6939 else if (from == GENERAL_REGS)
6940 return regmove_cost->GP2FP * 2;
6941 else if (to == GENERAL_REGS)
6942 return regmove_cost->FP2GP * 2;
6943
6944 /* When AdvSIMD instructions are disabled it is not possible to move
6945 a 128-bit value directly between Q registers. This is handled in
6946 secondary reload. A general register is used as a scratch to move
6947 the upper DI value and the lower DI value is moved directly,
6948 hence the cost is the sum of three moves. */
6949 if (! TARGET_SIMD)
6950 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6951
6952 return regmove_cost->FP2FP;
6953 }
6954
6955 if (from == GENERAL_REGS && to == GENERAL_REGS)
6956 return regmove_cost->GP2GP;
6957 else if (from == GENERAL_REGS)
6958 return regmove_cost->GP2FP;
6959 else if (to == GENERAL_REGS)
6960 return regmove_cost->FP2GP;
6961
6962 return regmove_cost->FP2FP;
6963 }
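
/* Worked example for aarch64_register_move_cost above (illustrative): a
   128-bit value moved between two general registers needs two X-register
   moves and is costed 2 * GP2GP; with !TARGET_SIMD a 128-bit FP-to-FP move
   bounces its upper half through a general register and is costed
   GP2FP + FP2GP + FP2FP. */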
6964
6965 static int
6966 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6967 reg_class_t rclass ATTRIBUTE_UNUSED,
6968 bool in ATTRIBUTE_UNUSED)
6969 {
6970 return aarch64_tune_params.memmov_cost;
6971 }
6972
6973 /* Return the number of instructions that can be issued per cycle. */
6974 static int
6975 aarch64_sched_issue_rate (void)
6976 {
6977 return aarch64_tune_params.issue_rate;
6978 }
6979
6980 static int
6981 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6982 {
6983 int issue_rate = aarch64_sched_issue_rate ();
6984
6985 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6986 }
6987
6988 /* Vectorizer cost model target hooks. */
6989
6990 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6991 static int
6992 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6993 tree vectype,
6994 int misalign ATTRIBUTE_UNUSED)
6995 {
6996 unsigned elements;
6997
6998 switch (type_of_cost)
6999 {
7000 case scalar_stmt:
7001 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7002
7003 case scalar_load:
7004 return aarch64_tune_params.vec_costs->scalar_load_cost;
7005
7006 case scalar_store:
7007 return aarch64_tune_params.vec_costs->scalar_store_cost;
7008
7009 case vector_stmt:
7010 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7011
7012 case vector_load:
7013 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7014
7015 case vector_store:
7016 return aarch64_tune_params.vec_costs->vec_store_cost;
7017
7018 case vec_to_scalar:
7019 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7020
7021 case scalar_to_vec:
7022 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7023
7024 case unaligned_load:
7025 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7026
7027 case unaligned_store:
7028 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7029
7030 case cond_branch_taken:
7031 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7032
7033 case cond_branch_not_taken:
7034 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7035
7036 case vec_perm:
7037 case vec_promote_demote:
7038 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7039
7040 case vec_construct:
7041 elements = TYPE_VECTOR_SUBPARTS (vectype);
7042 return elements / 2 + 1;
7043
7044 default:
7045 gcc_unreachable ();
7046 }
7047 }
7048
7049 /* Implement targetm.vectorize.add_stmt_cost. */
7050 static unsigned
7051 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7052 struct _stmt_vec_info *stmt_info, int misalign,
7053 enum vect_cost_model_location where)
7054 {
7055 unsigned *cost = (unsigned *) data;
7056 unsigned retval = 0;
7057
7058 if (flag_vect_cost_model)
7059 {
7060 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7061 int stmt_cost =
7062 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7063
7064 /* Statements in an inner loop relative to the loop being
7065 vectorized are weighted more heavily. The value here is
7066 a function (linear for now) of the loop nest level. */
7067 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7068 {
7069 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
7070 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
7071 unsigned nest_level = loop_depth (loop);
7072
7073 count *= nest_level;
7074 }
7075
7076 retval = (unsigned) (count * stmt_cost);
7077 cost[where] += retval;
7078 }
7079
7080 return retval;
7081 }
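
/* Worked example for aarch64_add_stmt_cost above (numbers illustrative):
   a statement in the body of a loop at nest depth 2, with COUNT 1 and a
   per-statement cost of 3, contributes 1 * 2 * 3 = 6 to cost[vect_body]. */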
7082
7083 static void initialize_aarch64_code_model (void);
7084
7085 /* Parse the architecture extension string. */
7086
7087 static void
7088 aarch64_parse_extension (char *str)
7089 {
7090 /* The extension string is parsed left to right. */
7091 const struct aarch64_option_extension *opt = NULL;
7092
7093 /* Flag to say whether we are adding or removing an extension. */
7094 int adding_ext = -1;
7095
7096 while (str != NULL && *str != 0)
7097 {
7098 char *ext;
7099 size_t len;
7100
7101 str++;
7102 ext = strchr (str, '+');
7103
7104 if (ext != NULL)
7105 len = ext - str;
7106 else
7107 len = strlen (str);
7108
7109 if (len >= 2 && strncmp (str, "no", 2) == 0)
7110 {
7111 adding_ext = 0;
7112 len -= 2;
7113 str += 2;
7114 }
7115 else if (len > 0)
7116 adding_ext = 1;
7117
7118 if (len == 0)
7119 {
7120 error ("missing feature modifier after %qs", adding_ext ? "+"
7121 : "+no");
7122 return;
7123 }
7124
7125 /* Scan over the extensions table trying to find an exact match. */
7126 for (opt = all_extensions; opt->name != NULL; opt++)
7127 {
7128 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7129 {
7130 /* Add or remove the extension. */
7131 if (adding_ext)
7132 aarch64_isa_flags |= opt->flags_on;
7133 else
7134 aarch64_isa_flags &= ~(opt->flags_off);
7135 break;
7136 }
7137 }
7138
7139 if (opt->name == NULL)
7140 {
7141 /* Extension not found in list. */
7142 error ("unknown feature modifier %qs", str);
7143 return;
7144 }
7145
7146 str = ext;
7147 }
7148
7149 return;
7150 }
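
/* Illustrative example for aarch64_parse_extension above (extension names
   are those defined in the all_extensions table, e.g. "fp" and "crypto"):
   given "+crypto+nofp", the first token ORs the crypto extension's
   flags_on bits into aarch64_isa_flags and the second clears the fp
   extension's flags_off bits. */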
7151
7152 /* Parse the ARCH string. */
7153
7154 static void
7155 aarch64_parse_arch (void)
7156 {
7157 char *ext;
7158 const struct processor *arch;
7159 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7160 size_t len;
7161
7162 strcpy (str, aarch64_arch_string);
7163
7164 ext = strchr (str, '+');
7165
7166 if (ext != NULL)
7167 len = ext - str;
7168 else
7169 len = strlen (str);
7170
7171 if (len == 0)
7172 {
7173 error ("missing arch name in -march=%qs", str);
7174 return;
7175 }
7176
7177 /* Loop through the list of supported ARCHs to find a match. */
7178 for (arch = all_architectures; arch->name != NULL; arch++)
7179 {
7180 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7181 {
7182 selected_arch = arch;
7183 aarch64_isa_flags = selected_arch->flags;
7184
7185 if (!selected_cpu)
7186 selected_cpu = &all_cores[selected_arch->core];
7187
7188 if (ext != NULL)
7189 {
7190 /* ARCH string contains at least one extension. */
7191 aarch64_parse_extension (ext);
7192 }
7193
7194 if (strcmp (selected_arch->arch, selected_cpu->arch))
7195 {
7196 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7197 selected_cpu->name, selected_arch->name);
7198 }
7199
7200 return;
7201 }
7202 }
7203
7204 /* ARCH name not found in list. */
7205 error ("unknown value %qs for -march", str);
7206 return;
7207 }
7208
7209 /* Parse the CPU string. */
7210
7211 static void
7212 aarch64_parse_cpu (void)
7213 {
7214 char *ext;
7215 const struct processor *cpu;
7216 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7217 size_t len;
7218
7219 strcpy (str, aarch64_cpu_string);
7220
7221 ext = strchr (str, '+');
7222
7223 if (ext != NULL)
7224 len = ext - str;
7225 else
7226 len = strlen (str);
7227
7228 if (len == 0)
7229 {
7230 error ("missing cpu name in -mcpu=%qs", str);
7231 return;
7232 }
7233
7234 /* Loop through the list of supported CPUs to find a match. */
7235 for (cpu = all_cores; cpu->name != NULL; cpu++)
7236 {
7237 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7238 {
7239 selected_cpu = cpu;
7240 aarch64_isa_flags = selected_cpu->flags;
7241
7242 if (ext != NULL)
7243 {
7244 /* CPU string contains at least one extension. */
7245 aarch64_parse_extension (ext);
7246 }
7247
7248 return;
7249 }
7250 }
7251
7252 /* CPU name not found in list. */
7253 error ("unknown value %qs for -mcpu", str);
7254 return;
7255 }
7256
7257 /* Parse the TUNE string. */
7258
7259 static void
7260 aarch64_parse_tune (void)
7261 {
7262 const struct processor *cpu;
7263 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7264 strcpy (str, aarch64_tune_string);
7265
7266 /* Loop through the list of supported CPUs to find a match. */
7267 for (cpu = all_cores; cpu->name != NULL; cpu++)
7268 {
7269 if (strcmp (cpu->name, str) == 0)
7270 {
7271 selected_tune = cpu;
7272 return;
7273 }
7274 }
7275
7276 /* CPU name not found in list. */
7277 error ("unknown value %qs for -mtune", str);
7278 return;
7279 }
7280
7281 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7282 described in FLAG. If it is, return the index bit for that fusion type.
7283 If not, error (printing OPTION_NAME) and return zero. */
7284
7285 static unsigned int
7286 aarch64_parse_one_option_token (const char *token,
7287 size_t length,
7288 const struct aarch64_flag_desc *flag,
7289 const char *option_name)
7290 {
7291 for (; flag->name != NULL; flag++)
7292 {
7293 if (length == strlen (flag->name)
7294 && !strncmp (flag->name, token, length))
7295 return flag->flag;
7296 }
7297
7298 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7299 return 0;
7300 }
7301
7302 /* Parse OPTION which is a comma-separated list of flags to enable.
7303 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7304 default state we inherit from the CPU tuning structures. OPTION_NAME
7305 gives the top-level option we are parsing in the -moverride string,
7306 for use in error messages. */
7307
7308 static unsigned int
7309 aarch64_parse_boolean_options (const char *option,
7310 const struct aarch64_flag_desc *flags,
7311 unsigned int initial_state,
7312 const char *option_name)
7313 {
7314 const char separator = '.';
7315 const char* specs = option;
7316 const char* ntoken = option;
7317 unsigned int found_flags = initial_state;
7318
7319 while ((ntoken = strchr (specs, separator)))
7320 {
7321 size_t token_length = ntoken - specs;
7322 unsigned token_ops = aarch64_parse_one_option_token (specs,
7323 token_length,
7324 flags,
7325 option_name);
7326 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7327 in the token stream, reset the supported operations. So:
7328
7329 adrp+add.cmp+branch.none.adrp+add
7330
7331 would have the result of turning on only adrp+add fusion. */
7332 if (!token_ops)
7333 found_flags = 0;
7334
7335 found_flags |= token_ops;
7336 specs = ++ntoken;
7337 }
7338
7339 /* The string was empty or ended with a trailing separator; that is ill-formed. */
7340 if (!(*specs))
7341 {
7342 error ("%s string ill-formed\n", option_name);
7343 return 0;
7344 }
7345
7346 /* We still have one more token to parse. */
7347 size_t token_length = strlen (specs);
7348 unsigned token_ops = aarch64_parse_one_option_token (specs,
7349 token_length,
7350 flags,
7351 option_name);
7352 if (!token_ops)
7353 found_flags = 0;
7354
7355 found_flags |= token_ops;
7356 return found_flags;
7357 }
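
/* Illustrative example for aarch64_parse_boolean_options above: the
   string "adrp+add.cmp+branch" is split on '.' into "adrp+add" and
   "cmp+branch"; each token is looked up via aarch64_parse_one_option_token
   and its flag bit ORed into the result, while a "none" token (or an
   unknown one) resets the accumulated flags first. */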
7358
7359 /* Support for overriding instruction fusion. */
7360
7361 static void
7362 aarch64_parse_fuse_string (const char *fuse_string,
7363 struct tune_params *tune)
7364 {
7365 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7366 aarch64_fusible_pairs,
7367 tune->fusible_ops,
7368 "fuse=");
7369 }
7370
7371 /* Support for overriding other tuning flags. */
7372
7373 static void
7374 aarch64_parse_tune_string (const char *tune_string,
7375 struct tune_params *tune)
7376 {
7377 tune->extra_tuning_flags
7378 = aarch64_parse_boolean_options (tune_string,
7379 aarch64_tuning_flags,
7380 tune->extra_tuning_flags,
7381 "tune=");
7382 }
7383
7384 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7385 we understand. If it is, extract the option string and hand it off to
7386 the appropriate function. */
7387
7388 void
7389 aarch64_parse_one_override_token (const char* token,
7390 size_t length,
7391 struct tune_params *tune)
7392 {
7393 const struct aarch64_tuning_override_function *fn
7394 = aarch64_tuning_override_functions;
7395
7396 const char *option_part = strchr (token, '=');
7397 if (!option_part)
7398 {
7399 error ("tuning string missing in option (%s)", token);
7400 return;
7401 }
7402
7403 /* Get the length of the option name. */
7404 length = option_part - token;
7405 /* Skip the '=' to get to the option string. */
7406 option_part++;
7407
7408 for (; fn->name != NULL; fn++)
7409 {
7410 if (!strncmp (fn->name, token, length))
7411 {
7412 fn->parse_override (option_part, tune);
7413 return;
7414 }
7415 }
7416
7417 error ("unknown tuning option (%s)",token);
7418 return;
7419 }
7420
7421 /* Parse STRING looking for options in the format:
7422 string :: option:string
7423 option :: name=substring
7424 name :: {a-z}
7425 substring :: defined by option. */
7426
7427 static void
7428 aarch64_parse_override_string (const char* input_string,
7429 struct tune_params* tune)
7430 {
7431 const char separator = ':';
7432 size_t string_length = strlen (input_string) + 1;
7433 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7434 char *string = string_root;
7435 strncpy (string, input_string, string_length);
7436 string[string_length - 1] = '\0';
7437
7438 char* ntoken = string;
7439
7440 while ((ntoken = strchr (string, separator)))
7441 {
7442 size_t token_length = ntoken - string;
7443 /* Make this substring look like a string. */
7444 *ntoken = '\0';
7445 aarch64_parse_one_override_token (string, token_length, tune);
7446 string = ++ntoken;
7447 }
7448
7449 /* One last option to parse. */
7450 aarch64_parse_one_override_token (string, strlen (string), tune);
7451 free (string_root);
7452 }
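
/* Illustrative example for aarch64_parse_override_string above: an option
   such as -moverride=fuse=adrp+add:tune=<flag> is split on ':' into
   "fuse=adrp+add" and "tune=<flag>"; each token is then dispatched on the
   text before its '=' by aarch64_parse_one_override_token ("<flag>" stands
   for any name in the aarch64_tuning_flags table). */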
7453
7454 /* Implement TARGET_OPTION_OVERRIDE. */
7455
7456 static void
7457 aarch64_override_options (void)
7458 {
7459 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7460 If either of -march or -mtune is given, it overrides the
7461 corresponding component of -mcpu.
7462
7463 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7464 with -march: if -mcpu is not present on the command line, -march
7465 must set a sensible default CPU. */
7466 if (aarch64_cpu_string)
7467 {
7468 aarch64_parse_cpu ();
7469 }
7470
7471 if (aarch64_arch_string)
7472 {
7473 aarch64_parse_arch ();
7474 }
7475
7476 if (aarch64_tune_string)
7477 {
7478 aarch64_parse_tune ();
7479 }
7480
7481 #ifndef HAVE_AS_MABI_OPTION
7482 /* The compiler may have been configured with 2.23.* binutils, which does
7483 not have support for ILP32. */
7484 if (TARGET_ILP32)
7485 error ("Assembler does not support -mabi=ilp32");
7486 #endif
7487
7488 initialize_aarch64_code_model ();
7489
7490 aarch64_build_bitmask_table ();
7491
7492 /* This target defaults to strict volatile bitfields. */
7493 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7494 flag_strict_volatile_bitfields = 1;
7495
7496 /* If the user did not specify a processor, choose the default
7497 one for them. This will be the CPU set during configuration using
7498 --with-cpu, otherwise it is "generic". */
7499 if (!selected_cpu)
7500 {
7501 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7502 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7503 }
7504
7505 gcc_assert (selected_cpu);
7506
7507 if (!selected_tune)
7508 selected_tune = selected_cpu;
7509
7510 aarch64_tune_flags = selected_tune->flags;
7511 aarch64_tune = selected_tune->core;
7512 /* Make a copy of the tuning parameters attached to the core, which
7513 we may later overwrite. */
7514 aarch64_tune_params = *(selected_tune->tune);
7515 aarch64_architecture_version = selected_cpu->architecture_version;
7516
7517 if (aarch64_override_tune_string)
7518 aarch64_parse_override_string (aarch64_override_tune_string,
7519 &aarch64_tune_params);
7520
7521 if (aarch64_fix_a53_err835769 == 2)
7522 {
7523 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7524 aarch64_fix_a53_err835769 = 1;
7525 #else
7526 aarch64_fix_a53_err835769 = 0;
7527 #endif
7528 }
7529
7530 aarch64_register_fma_steering ();
7531
7532 aarch64_override_options_after_change ();
7533 }
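
/* Illustrative example for aarch64_override_options above (CPU names used
   only as examples): -mcpu=cortex-a57 -mtune=cortex-a53 first selects the
   architecture and ISA flags implied by cortex-a57, then the explicit
   -mtune overrides the tuning component, so the cortex-a53 tuning tables
   are copied into aarch64_tune_params. */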
7534
7535 /* Implement targetm.override_options_after_change. */
7536
7537 static void
7538 aarch64_override_options_after_change (void)
7539 {
7540 if (flag_omit_frame_pointer)
7541 flag_omit_leaf_frame_pointer = false;
7542 else if (flag_omit_leaf_frame_pointer)
7543 flag_omit_frame_pointer = true;
7544
7545 /* If not optimizing for size, set the default
7546 alignment to what the target wants. */
7547 if (!optimize_size)
7548 {
7549 if (align_loops <= 0)
7550 align_loops = aarch64_tune_params.loop_align;
7551 if (align_jumps <= 0)
7552 align_jumps = aarch64_tune_params.jump_align;
7553 if (align_functions <= 0)
7554 align_functions = aarch64_tune_params.function_align;
7555 }
7556 }
7557
7558 static struct machine_function *
7559 aarch64_init_machine_status (void)
7560 {
7561 struct machine_function *machine;
7562 machine = ggc_cleared_alloc<machine_function> ();
7563 return machine;
7564 }
7565
7566 void
7567 aarch64_init_expanders (void)
7568 {
7569 init_machine_status = aarch64_init_machine_status;
7570 }
7571
7572 /* Select and validate the code model to use, taking -fpic/-fPIC into account. */
7573 static void
7574 initialize_aarch64_code_model (void)
7575 {
7576 if (flag_pic)
7577 {
7578 switch (aarch64_cmodel_var)
7579 {
7580 case AARCH64_CMODEL_TINY:
7581 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7582 break;
7583 case AARCH64_CMODEL_SMALL:
7584 aarch64_cmodel = (flag_pic == 2
7585 ? AARCH64_CMODEL_SMALL_PIC
7586 : AARCH64_CMODEL_SMALL_SPIC);
7587 break;
7588 case AARCH64_CMODEL_LARGE:
7589 sorry ("code model %qs with -f%s", "large",
7590 flag_pic > 1 ? "PIC" : "pic");
7591 default:
7592 gcc_unreachable ();
7593 }
7594 }
7595 else
7596 aarch64_cmodel = aarch64_cmodel_var;
7597 }
7598
7599 /* Return true if SYMBOL_REF X binds locally. */
7600
7601 static bool
7602 aarch64_symbol_binds_local_p (const_rtx x)
7603 {
7604 return (SYMBOL_REF_DECL (x)
7605 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7606 : SYMBOL_REF_LOCAL_P (x));
7607 }
7608
7609 /* Return true if SYMBOL_REF X is thread local. */
7610 static bool
7611 aarch64_tls_symbol_p (rtx x)
7612 {
7613 if (! TARGET_HAVE_TLS)
7614 return false;
7615
7616 if (GET_CODE (x) != SYMBOL_REF)
7617 return false;
7618
7619 return SYMBOL_REF_TLS_MODEL (x) != 0;
7620 }
7621
7622 /* Classify a TLS symbol into one of the TLS kinds. */
7623 enum aarch64_symbol_type
7624 aarch64_classify_tls_symbol (rtx x)
7625 {
7626 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7627
7628 switch (tls_kind)
7629 {
7630 case TLS_MODEL_GLOBAL_DYNAMIC:
7631 case TLS_MODEL_LOCAL_DYNAMIC:
7632 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7633
7634 case TLS_MODEL_INITIAL_EXEC:
7635 return SYMBOL_SMALL_GOTTPREL;
7636
7637 case TLS_MODEL_LOCAL_EXEC:
7638 return SYMBOL_SMALL_TPREL;
7639
7640 case TLS_MODEL_EMULATED:
7641 case TLS_MODEL_NONE:
7642 return SYMBOL_FORCE_TO_MEM;
7643
7644 default:
7645 gcc_unreachable ();
7646 }
7647 }
7648
7649 /* Return the method that should be used to access SYMBOL_REF or
7650 LABEL_REF X in context CONTEXT. */
7651
7652 enum aarch64_symbol_type
7653 aarch64_classify_symbol (rtx x, rtx offset,
7654 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7655 {
7656 if (GET_CODE (x) == LABEL_REF)
7657 {
7658 switch (aarch64_cmodel)
7659 {
7660 case AARCH64_CMODEL_LARGE:
7661 return SYMBOL_FORCE_TO_MEM;
7662
7663 case AARCH64_CMODEL_TINY_PIC:
7664 case AARCH64_CMODEL_TINY:
7665 return SYMBOL_TINY_ABSOLUTE;
7666
7667 case AARCH64_CMODEL_SMALL_SPIC:
7668 case AARCH64_CMODEL_SMALL_PIC:
7669 case AARCH64_CMODEL_SMALL:
7670 return SYMBOL_SMALL_ABSOLUTE;
7671
7672 default:
7673 gcc_unreachable ();
7674 }
7675 }
7676
7677 if (GET_CODE (x) == SYMBOL_REF)
7678 {
7679 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7680 return SYMBOL_FORCE_TO_MEM;
7681
7682 if (aarch64_tls_symbol_p (x))
7683 return aarch64_classify_tls_symbol (x);
7684
7685 switch (aarch64_cmodel)
7686 {
7687 case AARCH64_CMODEL_TINY:
7688 /* When we retrieve a symbol + offset address, we have to make sure
7689 the offset does not cause overflow of the final address. But
7690 we have no way of knowing the address of the symbol at compile
7691 time, so we can't accurately say if the distance between the PC
7692 and symbol + offset is outside the addressable range of +/-1M in
7693 the TINY code model. So we rely on images not being greater than
7694 1M, cap the offset at 1M, and anything beyond 1M will have to be
7695 loaded using an alternative mechanism. */
7696 if (SYMBOL_REF_WEAK (x)
7697 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7698 return SYMBOL_FORCE_TO_MEM;
7699 return SYMBOL_TINY_ABSOLUTE;
7700
7701 case AARCH64_CMODEL_SMALL:
7702 /* Same reasoning as the tiny code model, but the offset cap here is
7703 4G. */
7704 if (SYMBOL_REF_WEAK (x)
7705 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7706 HOST_WIDE_INT_C (4294967264)))
7707 return SYMBOL_FORCE_TO_MEM;
7708 return SYMBOL_SMALL_ABSOLUTE;
7709
7710 case AARCH64_CMODEL_TINY_PIC:
7711 if (!aarch64_symbol_binds_local_p (x))
7712 return SYMBOL_TINY_GOT;
7713 return SYMBOL_TINY_ABSOLUTE;
7714
7715 case AARCH64_CMODEL_SMALL_SPIC:
7716 case AARCH64_CMODEL_SMALL_PIC:
7717 if (!aarch64_symbol_binds_local_p (x))
7718 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
7719 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
7720 return SYMBOL_SMALL_ABSOLUTE;
7721
7722 default:
7723 gcc_unreachable ();
7724 }
7725 }
7726
7727 /* By default push everything into the constant pool. */
7728 return SYMBOL_FORCE_TO_MEM;
7729 }
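
/* Illustrative example for aarch64_classify_symbol above: under the TINY
   code model a reference such as sym + 0x200000 exceeds the +/-1M cap on
   the offset, so it is classified SYMBOL_FORCE_TO_MEM and materialised
   via the constant pool instead of a PC-relative ADR. */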
7730
7731 bool
7732 aarch64_constant_address_p (rtx x)
7733 {
7734 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7735 }
7736
7737 bool
7738 aarch64_legitimate_pic_operand_p (rtx x)
7739 {
7740 if (GET_CODE (x) == SYMBOL_REF
7741 || (GET_CODE (x) == CONST
7742 && GET_CODE (XEXP (x, 0)) == PLUS
7743 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7744 return false;
7745
7746 return true;
7747 }
7748
7749 /* Return true if X holds either a quarter-precision or
7750 floating-point +0.0 constant. */
7751 static bool
7752 aarch64_valid_floating_const (machine_mode mode, rtx x)
7753 {
7754 if (!CONST_DOUBLE_P (x))
7755 return false;
7756
7757 if (aarch64_float_const_zero_rtx_p (x))
7758 return true;
7759
7760 /* Beyond +0.0 (accepted above), we only handle SFmode and DFmode. */
7761 if (!(mode == SFmode || mode == DFmode))
7762 return false;
7763
7764 return aarch64_float_const_representable_p (x);
7765 }
7766
7767 static bool
7768 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7769 {
7770 /* Do not allow vector struct mode constants. We could support
7771 0 and -1 easily, but they need support in aarch64-simd.md. */
7772 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7773 return false;
7774
7775 /* This could probably go away because
7776 we now decompose CONST_INTs according to expand_mov_immediate. */
7777 if ((GET_CODE (x) == CONST_VECTOR
7778 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7779 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7780 return !targetm.cannot_force_const_mem (mode, x);
7781
7782 if (GET_CODE (x) == HIGH
7783 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7784 return true;
7785
7786 return aarch64_constant_address_p (x);
7787 }
7788
7789 rtx
7790 aarch64_load_tp (rtx target)
7791 {
7792 if (!target
7793 || GET_MODE (target) != Pmode
7794 || !register_operand (target, Pmode))
7795 target = gen_reg_rtx (Pmode);
7796
7797 /* Can return in any reg. */
7798 emit_insn (gen_aarch64_load_tp_hard (target));
7799 return target;
7800 }
7801
7802 /* On AAPCS systems, this is the "struct __va_list". */
7803 static GTY(()) tree va_list_type;
7804
7805 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7806 Return the type to use as __builtin_va_list.
7807
7808 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7809
7810 struct __va_list
7811 {
7812 void *__stack;
7813 void *__gr_top;
7814 void *__vr_top;
7815 int __gr_offs;
7816 int __vr_offs;
7817 }; */
7818
7819 static tree
7820 aarch64_build_builtin_va_list (void)
7821 {
7822 tree va_list_name;
7823 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7824
7825 /* Create the type. */
7826 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7827 /* Give it the required name. */
7828 va_list_name = build_decl (BUILTINS_LOCATION,
7829 TYPE_DECL,
7830 get_identifier ("__va_list"),
7831 va_list_type);
7832 DECL_ARTIFICIAL (va_list_name) = 1;
7833 TYPE_NAME (va_list_type) = va_list_name;
7834 TYPE_STUB_DECL (va_list_type) = va_list_name;
7835
7836 /* Create the fields. */
7837 f_stack = build_decl (BUILTINS_LOCATION,
7838 FIELD_DECL, get_identifier ("__stack"),
7839 ptr_type_node);
7840 f_grtop = build_decl (BUILTINS_LOCATION,
7841 FIELD_DECL, get_identifier ("__gr_top"),
7842 ptr_type_node);
7843 f_vrtop = build_decl (BUILTINS_LOCATION,
7844 FIELD_DECL, get_identifier ("__vr_top"),
7845 ptr_type_node);
7846 f_groff = build_decl (BUILTINS_LOCATION,
7847 FIELD_DECL, get_identifier ("__gr_offs"),
7848 integer_type_node);
7849 f_vroff = build_decl (BUILTINS_LOCATION,
7850 FIELD_DECL, get_identifier ("__vr_offs"),
7851 integer_type_node);
7852
7853 DECL_ARTIFICIAL (f_stack) = 1;
7854 DECL_ARTIFICIAL (f_grtop) = 1;
7855 DECL_ARTIFICIAL (f_vrtop) = 1;
7856 DECL_ARTIFICIAL (f_groff) = 1;
7857 DECL_ARTIFICIAL (f_vroff) = 1;
7858
7859 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7860 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7861 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7862 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7863 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7864
7865 TYPE_FIELDS (va_list_type) = f_stack;
7866 DECL_CHAIN (f_stack) = f_grtop;
7867 DECL_CHAIN (f_grtop) = f_vrtop;
7868 DECL_CHAIN (f_vrtop) = f_groff;
7869 DECL_CHAIN (f_groff) = f_vroff;
7870
7871 /* Compute its layout. */
7872 layout_type (va_list_type);
7873
7874 return va_list_type;
7875 }
7876
7877 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7878 static void
7879 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7880 {
7881 const CUMULATIVE_ARGS *cum;
7882 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7883 tree stack, grtop, vrtop, groff, vroff;
7884 tree t;
7885 int gr_save_area_size;
7886 int vr_save_area_size;
7887 int vr_offset;
7888
7889 cum = &crtl->args.info;
7890 gr_save_area_size
7891 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7892 vr_save_area_size
7893 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7894
7895 if (!TARGET_FLOAT)
7896 {
7897 gcc_assert (cum->aapcs_nvrn == 0);
7898 vr_save_area_size = 0;
7899 }
7900
7901 f_stack = TYPE_FIELDS (va_list_type_node);
7902 f_grtop = DECL_CHAIN (f_stack);
7903 f_vrtop = DECL_CHAIN (f_grtop);
7904 f_groff = DECL_CHAIN (f_vrtop);
7905 f_vroff = DECL_CHAIN (f_groff);
7906
7907 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7908 NULL_TREE);
7909 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7910 NULL_TREE);
7911 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7912 NULL_TREE);
7913 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7914 NULL_TREE);
7915 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7916 NULL_TREE);
7917
7918 /* Emit code to initialize STACK, which points to the next varargs stack
7919 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7920 by named arguments. STACK is 8-byte aligned. */
7921 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7922 if (cum->aapcs_stack_size > 0)
7923 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7924 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7925 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7926
7927 /* Emit code to initialize GRTOP, the top of the GR save area.
7928 virtual_incoming_args_rtx should have been 16 byte aligned. */
7929 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7930 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7931 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7932
7933 /* Emit code to initialize VRTOP, the top of the VR save area.
7934 This address is gr_save_area_bytes below GRTOP, rounded
7935 down to the next 16-byte boundary. */
7936 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7937 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7938 STACK_BOUNDARY / BITS_PER_UNIT);
7939
7940 if (vr_offset)
7941 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7942 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7943 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7944
7945 /* Emit code to initialize GROFF, the offset from GRTOP of the
7946 next GPR argument. */
7947 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7948 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7949 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7950
7951 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7952 of the next VR argument. */
7953 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7954 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7955 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7956 }
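
/* Worked example for aarch64_expand_builtin_va_start above (assuming the
   usual LP64 values NUM_ARG_REGS == 8 and UNITS_PER_WORD == 8): if three
   of the GPR argument registers were consumed by named arguments,
   gr_save_area_size is (8 - 3) * 8 = 40 bytes and __gr_offs is
   initialised to -40. */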
7957
7958 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7959
7960 static tree
7961 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7962 gimple_seq *post_p ATTRIBUTE_UNUSED)
7963 {
7964 tree addr;
7965 bool indirect_p;
7966 bool is_ha; /* is HFA or HVA. */
7967 bool dw_align; /* double-word align. */
7968 machine_mode ag_mode = VOIDmode;
7969 int nregs;
7970 machine_mode mode;
7971
7972 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7973 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7974 HOST_WIDE_INT size, rsize, adjust, align;
7975 tree t, u, cond1, cond2;
7976
7977 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7978 if (indirect_p)
7979 type = build_pointer_type (type);
7980
7981 mode = TYPE_MODE (type);
7982
7983 f_stack = TYPE_FIELDS (va_list_type_node);
7984 f_grtop = DECL_CHAIN (f_stack);
7985 f_vrtop = DECL_CHAIN (f_grtop);
7986 f_groff = DECL_CHAIN (f_vrtop);
7987 f_vroff = DECL_CHAIN (f_groff);
7988
7989 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7990 f_stack, NULL_TREE);
7991 size = int_size_in_bytes (type);
7992 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7993
7994 dw_align = false;
7995 adjust = 0;
7996 if (aarch64_vfp_is_call_or_return_candidate (mode,
7997 type,
7998 &ag_mode,
7999 &nregs,
8000 &is_ha))
8001 {
8002 /* TYPE passed in fp/simd registers. */
8003 if (!TARGET_FLOAT)
8004 aarch64_err_no_fpadvsimd (mode, "varargs");
8005
8006 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
8007 unshare_expr (valist), f_vrtop, NULL_TREE);
8008 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
8009 unshare_expr (valist), f_vroff, NULL_TREE);
8010
8011 rsize = nregs * UNITS_PER_VREG;
8012
8013 if (is_ha)
8014 {
8015 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
8016 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
8017 }
8018 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
8019 && size < UNITS_PER_VREG)
8020 {
8021 adjust = UNITS_PER_VREG - size;
8022 }
8023 }
8024 else
8025 {
8026 /* TYPE passed in general registers. */
8027 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
8028 unshare_expr (valist), f_grtop, NULL_TREE);
8029 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
8030 unshare_expr (valist), f_groff, NULL_TREE);
8031 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
8032 nregs = rsize / UNITS_PER_WORD;
8033
8034 if (align > 8)
8035 dw_align = true;
8036
8037 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8038 && size < UNITS_PER_WORD)
8039 {
8040 adjust = UNITS_PER_WORD - size;
8041 }
8042 }
8043
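/* In outline, the gimplified code built below is equivalent to the
   following sketch (a simplification: the big-endian padding adjustments
   and the homogeneous-aggregate copy handled later are omitted):

     off = __gr_offs;                      // or __vr_offs
     if (off >= 0)
       goto on_stack;
     [off = (off + 15) & -16;]             // only if double-word aligned
     __gr_offs = off + rsize;
     if (__gr_offs > 0)
       goto on_stack;
     addr = __gr_top + off;                // register save area
     goto done;
   on_stack:
     addr = __stack;
     [addr = (addr + 15) & -16;]           // only if double-word aligned
     __stack = (addr + size + 7) & -8;
   done: ;  */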
8044 /* Get a local temporary for the field value. */
8045 off = get_initialized_tmp_var (f_off, pre_p, NULL);
8046
8047 /* Emit code to branch if off >= 0. */
8048 t = build2 (GE_EXPR, boolean_type_node, off,
8049 build_int_cst (TREE_TYPE (off), 0));
8050 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
8051
8052 if (dw_align)
8053 {
8054 /* Emit: offs = (offs + 15) & -16. */
8055 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8056 build_int_cst (TREE_TYPE (off), 15));
8057 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
8058 build_int_cst (TREE_TYPE (off), -16));
8059 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
8060 }
8061 else
8062 roundup = NULL;
8063
8064 /* Update ap.__[g|v]r_offs */
8065 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8066 build_int_cst (TREE_TYPE (off), rsize));
8067 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
8068
8069 /* String up. */
8070 if (roundup)
8071 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8072
8073 /* [cond2] if (ap.__[g|v]r_offs > 0) */
8074 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
8075 build_int_cst (TREE_TYPE (f_off), 0));
8076 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
8077
8078 /* String up: make sure the assignment happens before the use. */
8079 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
8080 COND_EXPR_ELSE (cond1) = t;
8081
8082 /* Prepare the trees handling the argument that is passed on the stack;
8083 the top-level node will be stored in ON_STACK. */
8084 arg = get_initialized_tmp_var (stack, pre_p, NULL);
8085 if (align > 8)
8086 {
8087 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
8088 t = fold_convert (intDI_type_node, arg);
8089 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8090 build_int_cst (TREE_TYPE (t), 15));
8091 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8092 build_int_cst (TREE_TYPE (t), -16));
8093 t = fold_convert (TREE_TYPE (arg), t);
8094 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
8095 }
8096 else
8097 roundup = NULL;
8098 /* Advance ap.__stack */
8099 t = fold_convert (intDI_type_node, arg);
8100 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8101 build_int_cst (TREE_TYPE (t), size + 7));
8102 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8103 build_int_cst (TREE_TYPE (t), -8));
8104 t = fold_convert (TREE_TYPE (arg), t);
8105 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
8106 /* String up roundup and advance. */
8107 if (roundup)
8108 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8109 /* String up with arg */
8110 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
8111 /* Big-endianness related address adjustment. */
8112 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8113 && size < UNITS_PER_WORD)
8114 {
8115 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
8116 size_int (UNITS_PER_WORD - size));
8117 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
8118 }
8119
8120 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
8121 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
8122
8123 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
8124 t = off;
8125 if (adjust)
8126 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
8127 build_int_cst (TREE_TYPE (off), adjust));
8128
8129 t = fold_convert (sizetype, t);
8130 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
8131
8132 if (is_ha)
8133 {
8134 /* type ha; // treat as "struct {ftype field[n];}"
8135 ... [computing offs]
8136 for (i = 0; i <nregs; ++i, offs += 16)
8137 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
8138 return ha; */
8139 int i;
8140 tree tmp_ha, field_t, field_ptr_t;
8141
8142 /* Declare a local variable. */
8143 tmp_ha = create_tmp_var_raw (type, "ha");
8144 gimple_add_tmp_var (tmp_ha);
8145
8146 /* Establish the base type. */
8147 switch (ag_mode)
8148 {
8149 case SFmode:
8150 field_t = float_type_node;
8151 field_ptr_t = float_ptr_type_node;
8152 break;
8153 case DFmode:
8154 field_t = double_type_node;
8155 field_ptr_t = double_ptr_type_node;
8156 break;
8157 case TFmode:
8158 field_t = long_double_type_node;
8159 field_ptr_t = long_double_ptr_type_node;
8160 break;
8161 /* The half-precision and quad-precision types are not fully supported yet.
8162 Enable the following code after the support is complete. We need to find
8163 the correct type node for __fp16 *. */
8164 #if 0
8165 case HFmode:
8166 field_t = float_type_node;
8167 field_ptr_t = float_ptr_type_node;
8168 break;
8169 #endif
8170 case V2SImode:
8171 case V4SImode:
8172 {
8173 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
8174 field_t = build_vector_type_for_mode (innertype, ag_mode);
8175 field_ptr_t = build_pointer_type (field_t);
8176 }
8177 break;
8178 default:
8179 gcc_assert (0);
8180 }
8181
8182 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
8183 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
8184 addr = t;
8185 t = fold_convert (field_ptr_t, addr);
8186 t = build2 (MODIFY_EXPR, field_t,
8187 build1 (INDIRECT_REF, field_t, tmp_ha),
8188 build1 (INDIRECT_REF, field_t, t));
8189
8190 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
8191 for (i = 1; i < nregs; ++i)
8192 {
8193 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
8194 u = fold_convert (field_ptr_t, addr);
8195 u = build2 (MODIFY_EXPR, field_t,
8196 build2 (MEM_REF, field_t, tmp_ha,
8197 build_int_cst (field_ptr_t,
8198 (i *
8199 int_size_in_bytes (field_t)))),
8200 build1 (INDIRECT_REF, field_t, u));
8201 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
8202 }
8203
8204 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
8205 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
8206 }
8207
8208 COND_EXPR_ELSE (cond2) = t;
8209 addr = fold_convert (build_pointer_type (type), cond1);
8210 addr = build_va_arg_indirect_ref (addr);
8211
8212 if (indirect_p)
8213 addr = build_va_arg_indirect_ref (addr);
8214
8215 return addr;
8216 }
8217
8218 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
8219
8220 static void
8221 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
8222 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8223 int no_rtl)
8224 {
8225 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8226 CUMULATIVE_ARGS local_cum;
8227 int gr_saved, vr_saved;
8228
8229 /* The caller has advanced CUM up to, but not beyond, the last named
8230 argument. Advance a local copy of CUM past the last "real" named
8231 argument, to find out how many registers are left over. */
8232 local_cum = *cum;
8233 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
8234
8235 /* Find out how many registers we need to save. */
8236 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
8237 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
8238
8239 if (!TARGET_FLOAT)
8240 {
8241 gcc_assert (local_cum.aapcs_nvrn == 0);
8242 vr_saved = 0;
8243 }
8244
8245 if (!no_rtl)
8246 {
8247 if (gr_saved > 0)
8248 {
8249 rtx ptr, mem;
8250
8251 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
8252 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
8253 - gr_saved * UNITS_PER_WORD);
8254 mem = gen_frame_mem (BLKmode, ptr);
8255 set_mem_alias_set (mem, get_varargs_alias_set ());
8256
8257 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
8258 mem, gr_saved);
8259 }
8260 if (vr_saved > 0)
8261 {
8262 /* We can't use move_block_from_reg, because it will use
8263 the wrong mode, storing D regs only. */
8264 machine_mode mode = TImode;
8265 int off, i;
8266
8267 /* Set OFF to the offset from virtual_incoming_args_rtx of
8268 the first vector register. The VR save area lies below
8269 the GR one, and is aligned to 16 bytes. */
8270 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8271 STACK_BOUNDARY / BITS_PER_UNIT);
8272 off -= vr_saved * UNITS_PER_VREG;
8273
8274 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
8275 {
8276 rtx ptr, mem;
8277
8278 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
8279 mem = gen_frame_mem (mode, ptr);
8280 set_mem_alias_set (mem, get_varargs_alias_set ());
8281 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
8282 off += UNITS_PER_VREG;
8283 }
8284 }
8285 }
8286
8287 /* We don't save the size into *PRETEND_SIZE because we want to avoid
8288 any complication of having crtl->args.pretend_args_size changed. */
8289 cfun->machine->frame.saved_varargs_size
8290 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8291 STACK_BOUNDARY / BITS_PER_UNIT)
8292 + vr_saved * UNITS_PER_VREG);
8293 }
8294
8295 static void
8296 aarch64_conditional_register_usage (void)
8297 {
8298 int i;
8299 if (!TARGET_FLOAT)
8300 {
8301 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
8302 {
8303 fixed_regs[i] = 1;
8304 call_used_regs[i] = 1;
8305 }
8306 }
8307 }
8308
8309 /* Walk down the type tree of TYPE counting consecutive base elements.
8310 If *MODEP is VOIDmode, then set it to the first valid floating point
8311 type. If a non-floating point type is found, or if a floating point
8312 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
8313 otherwise return the count in the sub-tree. */
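/* As an illustration of the rule above: for struct { float x, y, z; } this
   returns 3 with *MODEP set to SFmode, whereas struct { float f; double d; }
   returns -1 because the two element modes differ.  */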
8314 static int
8315 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8316 {
8317 machine_mode mode;
8318 HOST_WIDE_INT size;
8319
8320 switch (TREE_CODE (type))
8321 {
8322 case REAL_TYPE:
8323 mode = TYPE_MODE (type);
8324 if (mode != DFmode && mode != SFmode && mode != TFmode)
8325 return -1;
8326
8327 if (*modep == VOIDmode)
8328 *modep = mode;
8329
8330 if (*modep == mode)
8331 return 1;
8332
8333 break;
8334
8335 case COMPLEX_TYPE:
8336 mode = TYPE_MODE (TREE_TYPE (type));
8337 if (mode != DFmode && mode != SFmode && mode != TFmode)
8338 return -1;
8339
8340 if (*modep == VOIDmode)
8341 *modep = mode;
8342
8343 if (*modep == mode)
8344 return 2;
8345
8346 break;
8347
8348 case VECTOR_TYPE:
8349 /* Use V2SImode and V4SImode as representatives of all 64-bit
8350 and 128-bit vector types. */
8351 size = int_size_in_bytes (type);
8352 switch (size)
8353 {
8354 case 8:
8355 mode = V2SImode;
8356 break;
8357 case 16:
8358 mode = V4SImode;
8359 break;
8360 default:
8361 return -1;
8362 }
8363
8364 if (*modep == VOIDmode)
8365 *modep = mode;
8366
8367 /* Vector modes are considered to be opaque: two vectors are
8368 equivalent for the purposes of being homogeneous aggregates
8369 if they are the same size. */
8370 if (*modep == mode)
8371 return 1;
8372
8373 break;
8374
8375 case ARRAY_TYPE:
8376 {
8377 int count;
8378 tree index = TYPE_DOMAIN (type);
8379
8380 /* Can't handle incomplete types nor sizes that are not
8381 fixed. */
8382 if (!COMPLETE_TYPE_P (type)
8383 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8384 return -1;
8385
8386 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8387 if (count == -1
8388 || !index
8389 || !TYPE_MAX_VALUE (index)
8390 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8391 || !TYPE_MIN_VALUE (index)
8392 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8393 || count < 0)
8394 return -1;
8395
8396 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8397 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8398
8399 /* There must be no padding. */
8400 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8401 return -1;
8402
8403 return count;
8404 }
8405
8406 case RECORD_TYPE:
8407 {
8408 int count = 0;
8409 int sub_count;
8410 tree field;
8411
8412 /* Can't handle incomplete types nor sizes that are not
8413 fixed. */
8414 if (!COMPLETE_TYPE_P (type)
8415 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8416 return -1;
8417
8418 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8419 {
8420 if (TREE_CODE (field) != FIELD_DECL)
8421 continue;
8422
8423 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8424 if (sub_count < 0)
8425 return -1;
8426 count += sub_count;
8427 }
8428
8429 /* There must be no padding. */
8430 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8431 return -1;
8432
8433 return count;
8434 }
8435
8436 case UNION_TYPE:
8437 case QUAL_UNION_TYPE:
8438 {
8439 /* These aren't very interesting except in a degenerate case. */
8440 int count = 0;
8441 int sub_count;
8442 tree field;
8443
8444 /* Can't handle incomplete types nor sizes that are not
8445 fixed. */
8446 if (!COMPLETE_TYPE_P (type)
8447 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8448 return -1;
8449
8450 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8451 {
8452 if (TREE_CODE (field) != FIELD_DECL)
8453 continue;
8454
8455 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8456 if (sub_count < 0)
8457 return -1;
8458 count = count > sub_count ? count : sub_count;
8459 }
8460
8461 /* There must be no padding. */
8462 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8463 return -1;
8464
8465 return count;
8466 }
8467
8468 default:
8469 break;
8470 }
8471
8472 return -1;
8473 }
8474
8475 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8476 type as described in AAPCS64 \S 4.1.2.
8477
8478 See the comment above aarch64_composite_type_p for the notes on MODE. */
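/* For instance, the 8-byte and 16-byte arm_neon.h vector types (int32x2_t,
   float32x4_t, ...) are short vectors in this sense, while vectors of any
   other size are not.  */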
8479
8480 static bool
8481 aarch64_short_vector_p (const_tree type,
8482 machine_mode mode)
8483 {
8484 HOST_WIDE_INT size = -1;
8485
8486 if (type && TREE_CODE (type) == VECTOR_TYPE)
8487 size = int_size_in_bytes (type);
8488 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8489 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8490 size = GET_MODE_SIZE (mode);
8491
8492 return (size == 8 || size == 16);
8493 }
8494
8495 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8496 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8497 array types. The C99 floating-point complex types are also considered
8498 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8499 types, which are GCC extensions and out of the scope of AAPCS64, are
8500 treated as composite types here as well.
8501
8502 Note that MODE itself is not sufficient in determining whether a type
8503 is such a composite type or not. This is because
8504 stor-layout.c:compute_record_mode may have already changed the MODE
8505 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8506 structure with only one field may have its MODE set to the mode of the
8507 field. Also an integer mode whose size matches the size of the
8508 RECORD_TYPE type may be used to substitute the original mode
8509 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8510 solely relied on. */
8511
8512 static bool
8513 aarch64_composite_type_p (const_tree type,
8514 machine_mode mode)
8515 {
8516 if (aarch64_short_vector_p (type, mode))
8517 return false;
8518
8519 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8520 return true;
8521
8522 if (mode == BLKmode
8523 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8524 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8525 return true;
8526
8527 return false;
8528 }
8529
8530 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8531 shall be passed or returned in simd/fp register(s) (providing these
8532 parameter passing registers are available).
8533
8534 Upon successful return, *COUNT returns the number of needed registers,
8535 *BASE_MODE returns the mode of the individual register and when IS_HA
8536 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8537 floating-point aggregate or a homogeneous short-vector aggregate. */
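/* For example, struct { double d[4]; } is treated as a homogeneous
   floating-point aggregate here: *COUNT is set to 4 and *BASE_MODE to DFmode.
   A _Complex float argument yields *COUNT == 2 and *BASE_MODE == SFmode.  */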
8538
8539 static bool
8540 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8541 const_tree type,
8542 machine_mode *base_mode,
8543 int *count,
8544 bool *is_ha)
8545 {
8546 machine_mode new_mode = VOIDmode;
8547 bool composite_p = aarch64_composite_type_p (type, mode);
8548
8549 if (is_ha != NULL) *is_ha = false;
8550
8551 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8552 || aarch64_short_vector_p (type, mode))
8553 {
8554 *count = 1;
8555 new_mode = mode;
8556 }
8557 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8558 {
8559 if (is_ha != NULL) *is_ha = true;
8560 *count = 2;
8561 new_mode = GET_MODE_INNER (mode);
8562 }
8563 else if (type && composite_p)
8564 {
8565 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8566
8567 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8568 {
8569 if (is_ha != NULL) *is_ha = true;
8570 *count = ag_count;
8571 }
8572 else
8573 return false;
8574 }
8575 else
8576 return false;
8577
8578 *base_mode = new_mode;
8579 return true;
8580 }
8581
8582 /* Implement TARGET_STRUCT_VALUE_RTX. */
8583
8584 static rtx
8585 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8586 int incoming ATTRIBUTE_UNUSED)
8587 {
8588 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8589 }
8590
8591 /* Implements target hook vector_mode_supported_p. */
8592 static bool
8593 aarch64_vector_mode_supported_p (machine_mode mode)
8594 {
8595 if (TARGET_SIMD
8596 && (mode == V4SImode || mode == V8HImode
8597 || mode == V16QImode || mode == V2DImode
8598 || mode == V2SImode || mode == V4HImode
8599 || mode == V8QImode || mode == V2SFmode
8600 || mode == V4SFmode || mode == V2DFmode
8601 || mode == V1DFmode))
8602 return true;
8603
8604 return false;
8605 }
8606
8607 /* Return appropriate SIMD container
8608 for MODE within a vector of WIDTH bits. */
8609 static machine_mode
8610 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8611 {
8612 gcc_assert (width == 64 || width == 128);
8613 if (TARGET_SIMD)
8614 {
8615 if (width == 128)
8616 switch (mode)
8617 {
8618 case DFmode:
8619 return V2DFmode;
8620 case SFmode:
8621 return V4SFmode;
8622 case SImode:
8623 return V4SImode;
8624 case HImode:
8625 return V8HImode;
8626 case QImode:
8627 return V16QImode;
8628 case DImode:
8629 return V2DImode;
8630 default:
8631 break;
8632 }
8633 else
8634 switch (mode)
8635 {
8636 case SFmode:
8637 return V2SFmode;
8638 case SImode:
8639 return V2SImode;
8640 case HImode:
8641 return V4HImode;
8642 case QImode:
8643 return V8QImode;
8644 default:
8645 break;
8646 }
8647 }
8648 return word_mode;
8649 }
8650
8651 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8652 static machine_mode
8653 aarch64_preferred_simd_mode (machine_mode mode)
8654 {
8655 return aarch64_simd_container_mode (mode, 128);
8656 }
8657
8658 /* Return the bitmask of possible vector sizes for the vectorizer
8659 to iterate over. */
8660 static unsigned int
8661 aarch64_autovectorize_vector_sizes (void)
8662 {
8663 return (16 | 8);
8664 }
8665
8666 /* Implement TARGET_MANGLE_TYPE. */
8667
8668 static const char *
8669 aarch64_mangle_type (const_tree type)
8670 {
8671 /* The AArch64 ABI documents say that "__va_list" has to be
8672 mangled as if it is in the "std" namespace.  */
8673 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8674 return "St9__va_list";
8675
8676 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8677 builtin types. */
8678 if (TYPE_NAME (type) != NULL)
8679 return aarch64_mangle_builtin_type (type);
8680
8681 /* Use the default mangling. */
8682 return NULL;
8683 }
8684
8685
8686 /* Return true if the rtx_insn contains a MEM RTX somewhere
8687 in it. */
8688
8689 static bool
8690 has_memory_op (rtx_insn *mem_insn)
8691 {
8692 subrtx_iterator::array_type array;
8693 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8694 if (MEM_P (*iter))
8695 return true;
8696
8697 return false;
8698 }
8699
8700 /* Find the first rtx_insn before insn that will generate an assembly
8701 instruction. */
8702
8703 static rtx_insn *
8704 aarch64_prev_real_insn (rtx_insn *insn)
8705 {
8706 if (!insn)
8707 return NULL;
8708
8709 do
8710 {
8711 insn = prev_real_insn (insn);
8712 }
8713 while (insn && recog_memoized (insn) < 0);
8714
8715 return insn;
8716 }
8717
8718 static bool
8719 is_madd_op (enum attr_type t1)
8720 {
8721 unsigned int i;
8722 /* A number of these may be AArch32 only. */
8723 enum attr_type mlatypes[] = {
8724 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8725 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8726 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8727 };
8728
8729 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8730 {
8731 if (t1 == mlatypes[i])
8732 return true;
8733 }
8734
8735 return false;
8736 }
8737
8738 /* Check if there is a register dependency between a load and the insn
8739 for which we hold recog_data. */
8740
8741 static bool
8742 dep_between_memop_and_curr (rtx memop)
8743 {
8744 rtx load_reg;
8745 int opno;
8746
8747 gcc_assert (GET_CODE (memop) == SET);
8748
8749 if (!REG_P (SET_DEST (memop)))
8750 return false;
8751
8752 load_reg = SET_DEST (memop);
8753 for (opno = 1; opno < recog_data.n_operands; opno++)
8754 {
8755 rtx operand = recog_data.operand[opno];
8756 if (REG_P (operand)
8757 && reg_overlap_mentioned_p (load_reg, operand))
8758 return true;
8759
8760 }
8761 return false;
8762 }
8763
8764
8765 /* When working around the Cortex-A53 erratum 835769,
8766 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8767 instruction and has a preceding memory instruction such that a NOP
8768 should be inserted between them. */
8769
8770 bool
8771 aarch64_madd_needs_nop (rtx_insn* insn)
8772 {
8773 enum attr_type attr_type;
8774 rtx_insn *prev;
8775 rtx body;
8776
8777 if (!aarch64_fix_a53_err835769)
8778 return false;
8779
8780 if (recog_memoized (insn) < 0)
8781 return false;
8782
8783 attr_type = get_attr_type (insn);
8784 if (!is_madd_op (attr_type))
8785 return false;
8786
8787 prev = aarch64_prev_real_insn (insn);
8788 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8789 Restore recog state to INSN to avoid state corruption. */
8790 extract_constrain_insn_cached (insn);
8791
8792 if (!prev || !has_memory_op (prev))
8793 return false;
8794
8795 body = single_set (prev);
8796
8797 /* If the previous insn is a memory op and there is no dependency between
8798 it and the DImode madd, emit a NOP between them. If body is NULL then we
8799 have a complex memory operation, probably a load/store pair.
8800 Be conservative for now and emit a NOP. */
8801 if (GET_MODE (recog_data.operand[0]) == DImode
8802 && (!body || !dep_between_memop_and_curr (body)))
8803 return true;
8804
8805 return false;
8806
8807 }
8808
8809
8810 /* Implement FINAL_PRESCAN_INSN. */
8811
8812 void
8813 aarch64_final_prescan_insn (rtx_insn *insn)
8814 {
8815 if (aarch64_madd_needs_nop (insn))
8816 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8817 }
8818
8819
8820 /* Return the equivalent letter for size. */
8821 static char
8822 sizetochar (int size)
8823 {
8824 switch (size)
8825 {
8826 case 64: return 'd';
8827 case 32: return 's';
8828 case 16: return 'h';
8829 case 8 : return 'b';
8830 default: gcc_unreachable ();
8831 }
8832 }
8833
8834 /* Return true iff X is a uniform vector of floating-point
8835 constants, and the constant can be represented in
8836 quarter-precision form. Note that, as aarch64_float_const_representable_p
8837 rejects both +0.0 and -0.0, this function rejects them too. */
8838 static bool
8839 aarch64_vect_float_const_representable_p (rtx x)
8840 {
8841 int i = 0;
8842 REAL_VALUE_TYPE r0, ri;
8843 rtx x0, xi;
8844
8845 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8846 return false;
8847
8848 x0 = CONST_VECTOR_ELT (x, 0);
8849 if (!CONST_DOUBLE_P (x0))
8850 return false;
8851
8852 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8853
8854 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8855 {
8856 xi = CONST_VECTOR_ELT (x, i);
8857 if (!CONST_DOUBLE_P (xi))
8858 return false;
8859
8860 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8861 if (!REAL_VALUES_EQUAL (r0, ri))
8862 return false;
8863 }
8864
8865 return aarch64_float_const_representable_p (x0);
8866 }
8867
8868 /* Return true if OP is a valid AdvSIMD immediate for MODE, false otherwise; if INFO is nonnull, fill it in with details of how to generate the immediate. */
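/* As an illustration: for a V4SImode vector whose four elements are all
   0x0000ab00, the CHECK sequence below matches with element_width == 32,
   shift == 8 and value 0xab, i.e. an AdvSIMD MOVI that places 0xab shifted
   left by 8 into each 32-bit lane.  */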
8869 bool
8870 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8871 struct simd_immediate_info *info)
8872 {
8873 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8874 matches = 1; \
8875 for (i = 0; i < idx; i += (STRIDE)) \
8876 if (!(TEST)) \
8877 matches = 0; \
8878 if (matches) \
8879 { \
8880 immtype = (CLASS); \
8881 elsize = (ELSIZE); \
8882 eshift = (SHIFT); \
8883 emvn = (NEG); \
8884 break; \
8885 }
8886
8887 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8888 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8889 unsigned char bytes[16];
8890 int immtype = -1, matches;
8891 unsigned int invmask = inverse ? 0xff : 0;
8892 int eshift, emvn;
8893
8894 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8895 {
8896 if (! (aarch64_simd_imm_zero_p (op, mode)
8897 || aarch64_vect_float_const_representable_p (op)))
8898 return false;
8899
8900 if (info)
8901 {
8902 info->value = CONST_VECTOR_ELT (op, 0);
8903 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8904 info->mvn = false;
8905 info->shift = 0;
8906 }
8907
8908 return true;
8909 }
8910
8911 /* Splat vector constant out into a byte vector. */
8912 for (i = 0; i < n_elts; i++)
8913 {
8914 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8915 it must be laid out in the vector register in reverse order. */
8916 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8917 unsigned HOST_WIDE_INT elpart;
8918 unsigned int part, parts;
8919
8920 if (CONST_INT_P (el))
8921 {
8922 elpart = INTVAL (el);
8923 parts = 1;
8924 }
8925 else if (GET_CODE (el) == CONST_DOUBLE)
8926 {
8927 elpart = CONST_DOUBLE_LOW (el);
8928 parts = 2;
8929 }
8930 else
8931 gcc_unreachable ();
8932
8933 for (part = 0; part < parts; part++)
8934 {
8935 unsigned int byte;
8936 for (byte = 0; byte < innersize; byte++)
8937 {
8938 bytes[idx++] = (elpart & 0xff) ^ invmask;
8939 elpart >>= BITS_PER_UNIT;
8940 }
8941 if (GET_CODE (el) == CONST_DOUBLE)
8942 elpart = CONST_DOUBLE_HIGH (el);
8943 }
8944 }
8945
8946 /* Sanity check. */
8947 gcc_assert (idx == GET_MODE_SIZE (mode));
8948
8949 do
8950 {
8951 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8952 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8953
8954 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8955 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8956
8957 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8958 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8959
8960 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8961 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8962
8963 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8964
8965 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8966
8967 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8968 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8969
8970 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8971 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8972
8973 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8974 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8975
8976 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8977 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8978
8979 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8980
8981 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8982
8983 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8984 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8985
8986 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8987 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8988
8989 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8990 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8991
8992 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8993 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8994
8995 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8996
8997 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8998 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8999 }
9000 while (0);
9001
9002 if (immtype == -1)
9003 return false;
9004
9005 if (info)
9006 {
9007 info->element_width = elsize;
9008 info->mvn = emvn != 0;
9009 info->shift = eshift;
9010
9011 unsigned HOST_WIDE_INT imm = 0;
9012
9013 if (immtype >= 12 && immtype <= 15)
9014 info->msl = true;
9015
9016 /* Un-invert bytes of recognized vector, if necessary. */
9017 if (invmask != 0)
9018 for (i = 0; i < idx; i++)
9019 bytes[i] ^= invmask;
9020
9021 if (immtype == 17)
9022 {
9023 /* FIXME: Broken on 32-bit H_W_I hosts. */
9024 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
9025
9026 for (i = 0; i < 8; i++)
9027 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
9028 << (i * BITS_PER_UNIT);
9029
9030
9031 info->value = GEN_INT (imm);
9032 }
9033 else
9034 {
9035 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
9036 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
9037
9038 /* Construct 'abcdefgh' because the assembler cannot handle
9039 generic constants. */
9040 if (info->mvn)
9041 imm = ~imm;
9042 imm = (imm >> info->shift) & 0xff;
9043 info->value = GEN_INT (imm);
9044 }
9045 }
9046
9047 return true;
9048 #undef CHECK
9049 }
9050
9051 /* Check if immediate shift constants are within range. */
9052 bool
9053 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
9054 {
9055 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
9056 if (left)
9057 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
9058 else
9059 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
9060 }
9061
9062 /* Return true if X is a uniform vector where all elements
9063 are either the floating-point constant 0.0 or the
9064 integer constant 0. */
9065 bool
9066 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
9067 {
9068 return x == CONST0_RTX (mode);
9069 }
9070
9071 bool
9072 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
9073 {
9074 HOST_WIDE_INT imm = INTVAL (x);
9075 int i;
9076
9077 for (i = 0; i < 8; i++)
9078 {
9079 unsigned int byte = imm & 0xff;
9080 if (byte != 0xff && byte != 0)
9081 return false;
9082 imm >>= 8;
9083 }
9084
9085 return true;
9086 }
9087
9088 bool
9089 aarch64_mov_operand_p (rtx x,
9090 enum aarch64_symbol_context context,
9091 machine_mode mode)
9092 {
9093 if (GET_CODE (x) == HIGH
9094 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9095 return true;
9096
9097 if (CONST_INT_P (x))
9098 return true;
9099
9100 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
9101 return true;
9102
9103 return aarch64_classify_symbolic_expression (x, context)
9104 == SYMBOL_TINY_ABSOLUTE;
9105 }
9106
9107 /* Return a const_int vector of VAL. */
9108 rtx
9109 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
9110 {
9111 int nunits = GET_MODE_NUNITS (mode);
9112 rtvec v = rtvec_alloc (nunits);
9113 int i;
9114
9115 for (i = 0; i < nunits; i++)
9116 RTVEC_ELT (v, i) = GEN_INT (val);
9117
9118 return gen_rtx_CONST_VECTOR (mode, v);
9119 }
9120
9121 /* Check OP is a legal scalar immediate for the MOVI instruction. */
9122
9123 bool
9124 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
9125 {
9126 machine_mode vmode;
9127
9128 gcc_assert (!VECTOR_MODE_P (mode));
9129 vmode = aarch64_preferred_simd_mode (mode);
9130 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
9131 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
9132 }
9133
9134 /* Construct and return a PARALLEL RTX vector with elements numbering the
9135 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
9136 the vector - from the perspective of the architecture. This does not
9137 line up with GCC's perspective on lane numbers, so we end up with
9138 different masks depending on our target endian-ness. The diagram
9139 below may help. We must draw the distinction when building masks
9140 which select one half of the vector. An instruction selecting
9141 architectural low-lanes for a big-endian target, must be described using
9142 a mask selecting GCC high-lanes.
9143
9144 Big-Endian Little-Endian
9145
9146 GCC 0 1 2 3 3 2 1 0
9147 | x | x | x | x | | x | x | x | x |
9148 Architecture 3 2 1 0 3 2 1 0
9149
9150 Low Mask: { 2, 3 } { 0, 1 }
9151 High Mask: { 0, 1 } { 2, 3 }
9152 */
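/* For example, aarch64_simd_vect_par_cnst_half (V4SImode, true) produces
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   table above.  */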
9153
9154 rtx
9155 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
9156 {
9157 int nunits = GET_MODE_NUNITS (mode);
9158 rtvec v = rtvec_alloc (nunits / 2);
9159 int high_base = nunits / 2;
9160 int low_base = 0;
9161 int base;
9162 rtx t1;
9163 int i;
9164
9165 if (BYTES_BIG_ENDIAN)
9166 base = high ? low_base : high_base;
9167 else
9168 base = high ? high_base : low_base;
9169
9170 for (i = 0; i < nunits / 2; i++)
9171 RTVEC_ELT (v, i) = GEN_INT (base + i);
9172
9173 t1 = gen_rtx_PARALLEL (mode, v);
9174 return t1;
9175 }
9176
9177 /* Check OP for validity as a PARALLEL RTX vector with elements
9178 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
9179 from the perspective of the architecture. See the diagram above
9180 aarch64_simd_vect_par_cnst_half for more details. */
9181
9182 bool
9183 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
9184 bool high)
9185 {
9186 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
9187 HOST_WIDE_INT count_op = XVECLEN (op, 0);
9188 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
9189 int i = 0;
9190
9191 if (!VECTOR_MODE_P (mode))
9192 return false;
9193
9194 if (count_op != count_ideal)
9195 return false;
9196
9197 for (i = 0; i < count_ideal; i++)
9198 {
9199 rtx elt_op = XVECEXP (op, 0, i);
9200 rtx elt_ideal = XVECEXP (ideal, 0, i);
9201
9202 if (!CONST_INT_P (elt_op)
9203 || INTVAL (elt_ideal) != INTVAL (elt_op))
9204 return false;
9205 }
9206 return true;
9207 }
9208
9209 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
9210 HIGH (exclusive). */
9211 void
9212 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
9213 const_tree exp)
9214 {
9215 HOST_WIDE_INT lane;
9216 gcc_assert (CONST_INT_P (operand));
9217 lane = INTVAL (operand);
9218
9219 if (lane < low || lane >= high)
9220 {
9221 if (exp)
9222 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
9223 else
9224 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
9225 }
9226 }
9227
9228 /* Return TRUE if OP is a valid vector addressing mode. */
9229 bool
9230 aarch64_simd_mem_operand_p (rtx op)
9231 {
9232 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
9233 || REG_P (XEXP (op, 0)));
9234 }
9235
9236 /* Emit a register copy from operand to operand, taking care not to
9237 early-clobber source registers in the process.
9238
9239 COUNT is the number of components into which the copy needs to be
9240 decomposed. */
9241 void
9242 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
9243 unsigned int count)
9244 {
9245 unsigned int i;
9246 int rdest = REGNO (operands[0]);
9247 int rsrc = REGNO (operands[1]);
9248
9249 if (!reg_overlap_mentioned_p (operands[0], operands[1])
9250 || rdest < rsrc)
9251 for (i = 0; i < count; i++)
9252 emit_move_insn (gen_rtx_REG (mode, rdest + i),
9253 gen_rtx_REG (mode, rsrc + i));
9254 else
9255 for (i = 0; i < count; i++)
9256 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
9257 gen_rtx_REG (mode, rsrc + count - i - 1));
9258 }
9259
9260 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
9261 one of VSTRUCT modes: OI, CI or XI. */
9262 int
9263 aarch64_simd_attr_length_move (rtx_insn *insn)
9264 {
9265 machine_mode mode;
9266
9267 extract_insn_cached (insn);
9268
9269 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
9270 {
9271 mode = GET_MODE (recog_data.operand[0]);
9272 switch (mode)
9273 {
9274 case OImode:
9275 return 8;
9276 case CImode:
9277 return 12;
9278 case XImode:
9279 return 16;
9280 default:
9281 gcc_unreachable ();
9282 }
9283 }
9284 return 4;
9285 }
9286
9287 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
9288 one of VSTRUCT modes: OI, CI, EI, or XI. */
9289 int
9290 aarch64_simd_attr_length_rglist (enum machine_mode mode)
9291 {
9292 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
9293 }
9294
9295 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
9296 alignment of a vector to 128 bits. */
9297 static HOST_WIDE_INT
9298 aarch64_simd_vector_alignment (const_tree type)
9299 {
9300 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
9301 return MIN (align, 128);
9302 }
9303
9304 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
9305 static bool
9306 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
9307 {
9308 if (is_packed)
9309 return false;
9310
9311 /* We guarantee alignment for vectors up to 128-bits. */
9312 if (tree_int_cst_compare (TYPE_SIZE (type),
9313 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9314 return false;
9315
9316 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9317 return true;
9318 }
9319
9320 /* If VALS is a vector constant that can be loaded into a register
9321 using DUP, generate instructions to do so and return an RTX to
9322 assign to the register. Otherwise return NULL_RTX. */
9323 static rtx
9324 aarch64_simd_dup_constant (rtx vals)
9325 {
9326 machine_mode mode = GET_MODE (vals);
9327 machine_mode inner_mode = GET_MODE_INNER (mode);
9328 int n_elts = GET_MODE_NUNITS (mode);
9329 bool all_same = true;
9330 rtx x;
9331 int i;
9332
9333 if (GET_CODE (vals) != CONST_VECTOR)
9334 return NULL_RTX;
9335
9336 for (i = 1; i < n_elts; ++i)
9337 {
9338 x = CONST_VECTOR_ELT (vals, i);
9339 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9340 all_same = false;
9341 }
9342
9343 if (!all_same)
9344 return NULL_RTX;
9345
9346 /* We can load this constant by using DUP and a constant in a
9347 single scalar register. This will be cheaper than a vector
9348 load. */
9349 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9350 return gen_rtx_VEC_DUPLICATE (mode, x);
9351 }
9352
9353
9354 /* Generate code to load VALS, which is a PARALLEL containing only
9355 constants (for vec_init) or CONST_VECTOR, efficiently into a
9356 register. Returns an RTX to copy into the register, or NULL_RTX
9357 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9358 static rtx
9359 aarch64_simd_make_constant (rtx vals)
9360 {
9361 machine_mode mode = GET_MODE (vals);
9362 rtx const_dup;
9363 rtx const_vec = NULL_RTX;
9364 int n_elts = GET_MODE_NUNITS (mode);
9365 int n_const = 0;
9366 int i;
9367
9368 if (GET_CODE (vals) == CONST_VECTOR)
9369 const_vec = vals;
9370 else if (GET_CODE (vals) == PARALLEL)
9371 {
9372 /* A CONST_VECTOR must contain only CONST_INTs and
9373 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9374 Only store valid constants in a CONST_VECTOR. */
9375 for (i = 0; i < n_elts; ++i)
9376 {
9377 rtx x = XVECEXP (vals, 0, i);
9378 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9379 n_const++;
9380 }
9381 if (n_const == n_elts)
9382 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9383 }
9384 else
9385 gcc_unreachable ();
9386
9387 if (const_vec != NULL_RTX
9388 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9389 /* Load using MOVI/MVNI. */
9390 return const_vec;
9391 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9392 /* Loaded using DUP. */
9393 return const_dup;
9394 else if (const_vec != NULL_RTX)
9395 /* Load from constant pool. We can not take advantage of single-cycle
9396 LD1 because we need a PC-relative addressing mode. */
9397 return const_vec;
9398 else
9399 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9400 We can not construct an initializer. */
9401 return NULL_RTX;
9402 }
9403
9404 void
9405 aarch64_expand_vector_init (rtx target, rtx vals)
9406 {
9407 machine_mode mode = GET_MODE (target);
9408 machine_mode inner_mode = GET_MODE_INNER (mode);
9409 int n_elts = GET_MODE_NUNITS (mode);
9410 int n_var = 0;
9411 rtx any_const = NULL_RTX;
9412 bool all_same = true;
9413
9414 for (int i = 0; i < n_elts; ++i)
9415 {
9416 rtx x = XVECEXP (vals, 0, i);
9417 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9418 ++n_var;
9419 else
9420 any_const = x;
9421
9422 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9423 all_same = false;
9424 }
9425
9426 if (n_var == 0)
9427 {
9428 rtx constant = aarch64_simd_make_constant (vals);
9429 if (constant != NULL_RTX)
9430 {
9431 emit_move_insn (target, constant);
9432 return;
9433 }
9434 }
9435
9436 /* Splat a single non-constant element if we can. */
9437 if (all_same)
9438 {
9439 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9440 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9441 return;
9442 }
9443
9444 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9445 varying fields. Hope that this is more efficient than using the stack. */
9446 if (n_var <= n_elts/2)
9447 {
9448 rtx copy = copy_rtx (vals);
9449
9450 /* Load constant part of vector. We really don't care what goes into the
9451 parts we will overwrite, but we're more likely to be able to load the
9452 constant efficiently if it has fewer, larger, repeating parts
9453 (see aarch64_simd_valid_immediate). */
9454 for (int i = 0; i < n_elts; i++)
9455 {
9456 rtx x = XVECEXP (vals, 0, i);
9457 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9458 continue;
9459 rtx subst = any_const;
9460 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9461 {
9462 /* Look in the copied vector, as more elements are const. */
9463 rtx test = XVECEXP (copy, 0, i ^ bit);
9464 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9465 {
9466 subst = test;
9467 break;
9468 }
9469 }
9470 XVECEXP (copy, 0, i) = subst;
9471 }
9472 aarch64_expand_vector_init (target, copy);
9473
9474 /* Insert variables. */
9475 enum insn_code icode = optab_handler (vec_set_optab, mode);
9476 gcc_assert (icode != CODE_FOR_nothing);
9477
9478 for (int i = 0; i < n_elts; i++)
9479 {
9480 rtx x = XVECEXP (vals, 0, i);
9481 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9482 continue;
9483 x = copy_to_mode_reg (inner_mode, x);
9484 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9485 }
9486 return;
9487 }
9488
9489 /* Construct the vector in memory one field at a time
9490 and load the whole vector. */
9491 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9492 for (int i = 0; i < n_elts; i++)
9493 emit_move_insn (adjust_address_nv (mem, inner_mode,
9494 i * GET_MODE_SIZE (inner_mode)),
9495 XVECEXP (vals, 0, i));
9496 emit_move_insn (target, mem);
9497
9498 }
9499
9500 static unsigned HOST_WIDE_INT
9501 aarch64_shift_truncation_mask (machine_mode mode)
9502 {
9503 return
9504 (aarch64_vector_mode_supported_p (mode)
9505 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9506 }
9507
9508 #ifndef TLS_SECTION_ASM_FLAG
9509 #define TLS_SECTION_ASM_FLAG 'T'
9510 #endif
9511
9512 void
9513 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9514 tree decl ATTRIBUTE_UNUSED)
9515 {
9516 char flagchars[10], *f = flagchars;
9517
9518 /* If we have already declared this section, we can use an
9519 abbreviated form to switch back to it -- unless this section is
9520 part of a COMDAT group, in which case GAS requires the full
9521 declaration every time. */
9522 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9523 && (flags & SECTION_DECLARED))
9524 {
9525 fprintf (asm_out_file, "\t.section\t%s\n", name);
9526 return;
9527 }
9528
9529 if (!(flags & SECTION_DEBUG))
9530 *f++ = 'a';
9531 if (flags & SECTION_WRITE)
9532 *f++ = 'w';
9533 if (flags & SECTION_CODE)
9534 *f++ = 'x';
9535 if (flags & SECTION_SMALL)
9536 *f++ = 's';
9537 if (flags & SECTION_MERGE)
9538 *f++ = 'M';
9539 if (flags & SECTION_STRINGS)
9540 *f++ = 'S';
9541 if (flags & SECTION_TLS)
9542 *f++ = TLS_SECTION_ASM_FLAG;
9543 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9544 *f++ = 'G';
9545 *f = '\0';
9546
9547 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9548
9549 if (!(flags & SECTION_NOTYPE))
9550 {
9551 const char *type;
9552 const char *format;
9553
9554 if (flags & SECTION_BSS)
9555 type = "nobits";
9556 else
9557 type = "progbits";
9558
9559 #ifdef TYPE_OPERAND_FMT
9560 format = "," TYPE_OPERAND_FMT;
9561 #else
9562 format = ",@%s";
9563 #endif
9564
9565 fprintf (asm_out_file, format, type);
9566
9567 if (flags & SECTION_ENTSIZE)
9568 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9569 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9570 {
9571 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9572 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9573 else
9574 fprintf (asm_out_file, ",%s,comdat",
9575 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9576 }
9577 }
9578
9579 putc ('\n', asm_out_file);
9580 }
9581
9582 /* Select a format to encode pointers in exception handling data. */
9583 int
9584 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9585 {
9586 int type;
9587 switch (aarch64_cmodel)
9588 {
9589 case AARCH64_CMODEL_TINY:
9590 case AARCH64_CMODEL_TINY_PIC:
9591 case AARCH64_CMODEL_SMALL:
9592 case AARCH64_CMODEL_SMALL_PIC:
9593 case AARCH64_CMODEL_SMALL_SPIC:
9594 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9595 for everything. */
9596 type = DW_EH_PE_sdata4;
9597 break;
9598 default:
9599 /* No assumptions here. 8-byte relocs required. */
9600 type = DW_EH_PE_sdata8;
9601 break;
9602 }
9603 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9604 }
9605
9606 /* Emit load exclusive. */
9607
9608 static void
9609 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9610 rtx mem, rtx model_rtx)
9611 {
9612 rtx (*gen) (rtx, rtx, rtx);
9613
9614 switch (mode)
9615 {
9616 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9617 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9618 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9619 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9620 default:
9621 gcc_unreachable ();
9622 }
9623
9624 emit_insn (gen (rval, mem, model_rtx));
9625 }
9626
9627 /* Emit store exclusive. */
9628
9629 static void
9630 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9631 rtx rval, rtx mem, rtx model_rtx)
9632 {
9633 rtx (*gen) (rtx, rtx, rtx, rtx);
9634
9635 switch (mode)
9636 {
9637 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9638 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9639 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9640 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9641 default:
9642 gcc_unreachable ();
9643 }
9644
9645 emit_insn (gen (bval, rval, mem, model_rtx));
9646 }
9647
9648 /* Emit the jump INSN and mark it as very unlikely to be taken. */
9649
9650 static void
9651 aarch64_emit_unlikely_jump (rtx insn)
9652 {
9653 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9654
9655 insn = emit_jump_insn (insn);
9656 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9657 }
9658
9659 /* Expand a compare and swap pattern. */
9660
9661 void
9662 aarch64_expand_compare_and_swap (rtx operands[])
9663 {
9664 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9665 machine_mode mode, cmp_mode;
9666 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9667
9668 bval = operands[0];
9669 rval = operands[1];
9670 mem = operands[2];
9671 oldval = operands[3];
9672 newval = operands[4];
9673 is_weak = operands[5];
9674 mod_s = operands[6];
9675 mod_f = operands[7];
9676 mode = GET_MODE (mem);
9677 cmp_mode = mode;
9678
9679 /* Normally the succ memory model must be stronger than fail, but in the
9680 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9681 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9682
9683 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9684 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9685 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9686
9687 switch (mode)
9688 {
9689 case QImode:
9690 case HImode:
9691 /* For short modes, we're going to perform the comparison in SImode,
9692 so do the zero-extension now. */
9693 cmp_mode = SImode;
9694 rval = gen_reg_rtx (SImode);
9695 oldval = convert_modes (SImode, mode, oldval, true);
9696 /* Fall through. */
9697
9698 case SImode:
9699 case DImode:
9700 /* Force the value into a register if needed. */
9701 if (!aarch64_plus_operand (oldval, mode))
9702 oldval = force_reg (cmp_mode, oldval);
9703 break;
9704
9705 default:
9706 gcc_unreachable ();
9707 }
9708
9709 switch (mode)
9710 {
9711 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9712 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9713 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9714 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9715 default:
9716 gcc_unreachable ();
9717 }
9718
9719 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9720
9721 if (mode == QImode || mode == HImode)
9722 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9723
9724 x = gen_rtx_REG (CCmode, CC_REGNUM);
9725 x = gen_rtx_EQ (SImode, x, const0_rtx);
9726 emit_insn (gen_rtx_SET (bval, x));
9727 }
9728
9729 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9730 sequence implementing an atomic operation. */
9731
9732 static void
9733 aarch64_emit_post_barrier (enum memmodel model)
9734 {
9735 const enum memmodel base_model = memmodel_base (model);
9736
9737 if (is_mm_sync (model)
9738 && (base_model == MEMMODEL_ACQUIRE
9739 || base_model == MEMMODEL_ACQ_REL
9740 || base_model == MEMMODEL_SEQ_CST))
9741 {
9742 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9743 }
9744 }
9745
9746 /* Split a compare and swap pattern. */
9747
9748 void
9749 aarch64_split_compare_and_swap (rtx operands[])
9750 {
9751 rtx rval, mem, oldval, newval, scratch;
9752 machine_mode mode;
9753 bool is_weak;
9754 rtx_code_label *label1, *label2;
9755 rtx x, cond;
9756 enum memmodel model;
9757 rtx model_rtx;
9758
9759 rval = operands[0];
9760 mem = operands[1];
9761 oldval = operands[2];
9762 newval = operands[3];
9763 is_weak = (operands[4] != const0_rtx);
9764 model_rtx = operands[5];
9765 scratch = operands[7];
9766 mode = GET_MODE (mem);
9767 model = memmodel_from_int (INTVAL (model_rtx));
9768
9769 label1 = NULL;
9770 if (!is_weak)
9771 {
9772 label1 = gen_label_rtx ();
9773 emit_label (label1);
9774 }
9775 label2 = gen_label_rtx ();
9776
9777 /* The initial load can be relaxed for a __sync operation since a final
9778 barrier will be emitted to stop code hoisting. */
9779 if (is_mm_sync (model))
9780 aarch64_emit_load_exclusive (mode, rval, mem,
9781 GEN_INT (MEMMODEL_RELAXED));
9782 else
9783 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9784
9785 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9786 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9787 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9788 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9789 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9790
9791 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9792
9793 if (!is_weak)
9794 {
9795 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9796 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9797 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9798 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9799 }
9800 else
9801 {
9802 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9803 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9804 emit_insn (gen_rtx_SET (cond, x));
9805 }
9806
9807 emit_label (label2);
9808
9809 /* Emit any final barrier needed for a __sync operation. */
9810 if (is_mm_sync (model))
9811 aarch64_emit_post_barrier (model);
9812 }
9813
9814 /* Split an atomic operation. */
9815
9816 void
9817 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9818 rtx value, rtx model_rtx, rtx cond)
9819 {
9820 machine_mode mode = GET_MODE (mem);
9821 machine_mode wmode = (mode == DImode ? DImode : SImode);
9822 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9823 const bool is_sync = is_mm_sync (model);
9824 rtx_code_label *label;
9825 rtx x;
9826
9827 label = gen_label_rtx ();
9828 emit_label (label);
9829
9830 if (new_out)
9831 new_out = gen_lowpart (wmode, new_out);
9832 if (old_out)
9833 old_out = gen_lowpart (wmode, old_out);
9834 else
9835 old_out = new_out;
9836 value = simplify_gen_subreg (wmode, value, mode, 0);
9837
9838 /* The initial load can be relaxed for a __sync operation since a final
9839 barrier will be emitted to stop code hoisting. */
9840 if (is_sync)
9841 aarch64_emit_load_exclusive (mode, old_out, mem,
9842 GEN_INT (MEMMODEL_RELAXED));
9843 else
9844 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9845
9846 switch (code)
9847 {
9848 case SET:
9849 new_out = value;
9850 break;
9851
9852 case NOT:
9853 x = gen_rtx_AND (wmode, old_out, value);
9854 emit_insn (gen_rtx_SET (new_out, x));
9855 x = gen_rtx_NOT (wmode, new_out);
9856 emit_insn (gen_rtx_SET (new_out, x));
9857 break;
9858
9859 case MINUS:
9860 if (CONST_INT_P (value))
9861 {
9862 value = GEN_INT (-INTVAL (value));
9863 code = PLUS;
9864 }
9865 /* Fall through. */
9866
9867 default:
9868 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9869 emit_insn (gen_rtx_SET (new_out, x));
9870 break;
9871 }
9872
9873 aarch64_emit_store_exclusive (mode, cond, mem,
9874 gen_lowpart (mode, new_out), model_rtx);
9875
9876 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9877 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9878 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9879 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9880
9881 /* Emit any final barrier needed for a __sync operation. */
9882 if (is_sync)
9883 aarch64_emit_post_barrier (model);
9884 }
9885
9886 static void
9887 aarch64_print_extension (void)
9888 {
9889 const struct aarch64_option_extension *opt = NULL;
9890
9891 for (opt = all_extensions; opt->name != NULL; opt++)
9892 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9893 asm_fprintf (asm_out_file, "+%s", opt->name);
9894
9895 asm_fprintf (asm_out_file, "\n");
9896 }
9897
9898 static void
9899 aarch64_start_file (void)
9900 {
9901 if (selected_arch)
9902 {
9903 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9904 aarch64_print_extension ();
9905 }
9906 else if (selected_cpu)
9907 {
9908 const char *truncated_name
9909 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9910 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9911 aarch64_print_extension ();
9912 }
9913 default_file_start ();
9914 }
9915
9916 /* Target hook for c_mode_for_suffix. */
9917 static machine_mode
9918 aarch64_c_mode_for_suffix (char suffix)
9919 {
9920 if (suffix == 'q')
9921 return TFmode;
9922
9923 return VOIDmode;
9924 }
9925
9926 /* We can only represent floating point constants which will fit in
9927 "quarter-precision" values. These values are characterised by
9928 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9929 by:
9930
9931 (-1)^s * (n/16) * 2^r
9932
9933 Where:
9934 's' is the sign bit.
9935 'n' is an integer in the range 16 <= n <= 31.
9936 'r' is an integer in the range -3 <= r <= 4. */
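/* As a worked illustration of the formula above: the smallest positive
   representable value is 16/16 * 2^-3 = 0.125, the largest is
   31/16 * 2^4 = 31.0, and 1.0 is encoded with n = 16, r = 0.  */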
9937
9938 /* Return true iff X can be represented by a quarter-precision
9939 floating point immediate operand. Note, we cannot represent 0.0. */
9940 bool
9941 aarch64_float_const_representable_p (rtx x)
9942 {
9943 /* This represents our current view of how many bits
9944 make up the mantissa. */
9945 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9946 int exponent;
9947 unsigned HOST_WIDE_INT mantissa, mask;
9948 REAL_VALUE_TYPE r, m;
9949 bool fail;
9950
9951 if (!CONST_DOUBLE_P (x))
9952 return false;
9953
9954 if (GET_MODE (x) == VOIDmode)
9955 return false;
9956
9957 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9958
9959 /* We cannot represent infinities, NaNs or +/-zero. We won't
9960 know if we have +zero until we analyse the mantissa, but we
9961 can reject the other invalid values. */
9962 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9963 || REAL_VALUE_MINUS_ZERO (r))
9964 return false;
9965
9966 /* Extract exponent. */
9967 r = real_value_abs (&r);
9968 exponent = REAL_EXP (&r);
9969
9970 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
9971 highest (sign) bit, with a fixed binary point at bit point_pos.
9972 The low element of W holds the low part of the mantissa, the high element the high part.
9973 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9974 bits for the mantissa, this can fail (low bits will be lost). */
9975 real_ldexp (&m, &r, point_pos - exponent);
9976 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9977
9978 /* If the low part of the mantissa has bits set we cannot represent
9979 the value. */
9980 if (w.elt (0) != 0)
9981 return false;
9982 /* We have rejected the lower HOST_WIDE_INT, so update our
9983 understanding of how many bits lie in the mantissa and
9984 look only at the high HOST_WIDE_INT. */
9985 mantissa = w.elt (1);
9986 point_pos -= HOST_BITS_PER_WIDE_INT;
9987
9988 /* We can only represent values with a mantissa of the form 1.xxxx. */
9989 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9990 if ((mantissa & mask) != 0)
9991 return false;
9992
9993 /* Having filtered unrepresentable values, we may now remove all
9994 but the highest 5 bits. */
9995 mantissa >>= point_pos - 5;
9996
9997 /* We cannot represent the value 0.0, so reject it. This is handled
9998 elsewhere. */
9999 if (mantissa == 0)
10000 return false;
10001
10002 /* Then, as bit 4 is always set, we can mask it off, leaving
10003 the mantissa in the range [0, 15]. */
10004 mantissa &= ~(1 << 4);
10005 gcc_assert (mantissa <= 15);
10006
10007 /* GCC internally does not use IEEE754-like encoding (where normalized
10008 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
10009 Our mantissa values are shifted 4 places to the left relative to
10010 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
10011 by 5 places to correct for GCC's representation. */
10012 exponent = 5 - exponent;
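  /* Two worked examples (illustrative): 0.125 is 0.5 * 2^-2, so REAL_EXP
     gave -2 and we now have 5 - (-2) = 7; 31.0 is 0.96875 * 2^5, so
     REAL_EXP gave 5 and we now have 5 - 5 = 0.  Both land in the accepted
     range [0, 7] checked below.  */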
10013
10014 return (exponent >= 0 && exponent <= 7);
10015 }
10016
10017 char*
10018 aarch64_output_simd_mov_immediate (rtx const_vector,
10019 machine_mode mode,
10020 unsigned width)
10021 {
10022 bool is_valid;
10023 static char templ[40];
10024 const char *mnemonic;
10025 const char *shift_op;
10026 unsigned int lane_count = 0;
10027 char element_char;
10028
10029 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
10030
10031 /* This will return true to show that CONST_VECTOR is legal for use as
10032 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
10033 also update INFO to show how the immediate should be generated. */
10034 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
10035 gcc_assert (is_valid);
10036
10037 element_char = sizetochar (info.element_width);
10038 lane_count = width / info.element_width;
10039
10040 mode = GET_MODE_INNER (mode);
10041 if (mode == SFmode || mode == DFmode)
10042 {
10043 gcc_assert (info.shift == 0 && ! info.mvn);
10044 if (aarch64_float_const_zero_rtx_p (info.value))
10045 info.value = GEN_INT (0);
10046 else
10047 {
10048 #define buf_size 20
10049 REAL_VALUE_TYPE r;
10050 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
10051 char float_buf[buf_size] = {'\0'};
10052 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
10053 #undef buf_size
10054
10055 if (lane_count == 1)
10056 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
10057 else
10058 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
10059 lane_count, element_char, float_buf);
10060 return templ;
10061 }
10062 }
10063
10064 mnemonic = info.mvn ? "mvni" : "movi";
10065 shift_op = info.msl ? "msl" : "lsl";
10066
10067 if (lane_count == 1)
10068 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
10069 mnemonic, UINTVAL (info.value));
10070 else if (info.shift)
10071 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
10072 ", %s %d", mnemonic, lane_count, element_char,
10073 UINTVAL (info.value), shift_op, info.shift);
10074 else
10075 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
10076 mnemonic, lane_count, element_char, UINTVAL (info.value));
10077 return templ;
10078 }
10079
10080 char*
10081 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
10082 machine_mode mode)
10083 {
10084 machine_mode vmode;
10085
10086 gcc_assert (!VECTOR_MODE_P (mode));
10087 vmode = aarch64_simd_container_mode (mode, 64);
10088 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
10089 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
10090 }
10091
10092 /* Split operands into moves from op[1] + op[2] into op[0]. */
10093
10094 void
10095 aarch64_split_combinev16qi (rtx operands[3])
10096 {
10097 unsigned int dest = REGNO (operands[0]);
10098 unsigned int src1 = REGNO (operands[1]);
10099 unsigned int src2 = REGNO (operands[2]);
10100 machine_mode halfmode = GET_MODE (operands[1]);
10101 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
10102 rtx destlo, desthi;
10103
10104 gcc_assert (halfmode == V16QImode);
10105
10106 if (src1 == dest && src2 == dest + halfregs)
10107 {
10108 /* No-op move. Can't split to nothing; emit something. */
10109 emit_note (NOTE_INSN_DELETED);
10110 return;
10111 }
10112
10113 /* Preserve register attributes for variable tracking. */
10114 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
10115 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
10116 GET_MODE_SIZE (halfmode));
10117
10118 /* Special case of reversed high/low parts. */
10119 if (reg_overlap_mentioned_p (operands[2], destlo)
10120 && reg_overlap_mentioned_p (operands[1], desthi))
10121 {
10122 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10123 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
10124 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10125 }
10126 else if (!reg_overlap_mentioned_p (operands[2], destlo))
10127 {
10128 /* Try to avoid unnecessary moves if part of the result
10129 is in the right place already. */
10130 if (src1 != dest)
10131 emit_move_insn (destlo, operands[1]);
10132 if (src2 != dest + halfregs)
10133 emit_move_insn (desthi, operands[2]);
10134 }
10135 else
10136 {
10137 if (src2 != dest + halfregs)
10138 emit_move_insn (desthi, operands[2]);
10139 if (src1 != dest)
10140 emit_move_insn (destlo, operands[1]);
10141 }
10142 }
10143
10144 /* vec_perm support. */
10145
10146 #define MAX_VECT_LEN 16
10147
10148 struct expand_vec_perm_d
10149 {
10150 rtx target, op0, op1;
10151 unsigned char perm[MAX_VECT_LEN];
10152 machine_mode vmode;
10153 unsigned char nelt;
10154 bool one_vector_p;
10155 bool testing_p;
10156 };
10157
10158 /* Generate a variable permutation. */
10159
10160 static void
10161 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
10162 {
10163 machine_mode vmode = GET_MODE (target);
10164 bool one_vector_p = rtx_equal_p (op0, op1);
10165
10166 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
10167 gcc_checking_assert (GET_MODE (op0) == vmode);
10168 gcc_checking_assert (GET_MODE (op1) == vmode);
10169 gcc_checking_assert (GET_MODE (sel) == vmode);
10170 gcc_checking_assert (TARGET_SIMD);
10171
10172 if (one_vector_p)
10173 {
10174 if (vmode == V8QImode)
10175 {
10176 /* Expand the argument to a V16QI mode by duplicating it. */
10177 rtx pair = gen_reg_rtx (V16QImode);
10178 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
10179 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10180 }
10181 else
10182 {
10183 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
10184 }
10185 }
10186 else
10187 {
10188 rtx pair;
10189
10190 if (vmode == V8QImode)
10191 {
10192 pair = gen_reg_rtx (V16QImode);
10193 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
10194 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10195 }
10196 else
10197 {
10198 pair = gen_reg_rtx (OImode);
10199 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
10200 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
10201 }
10202 }
10203 }
10204
10205 void
10206 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
10207 {
10208 machine_mode vmode = GET_MODE (target);
10209 unsigned int nelt = GET_MODE_NUNITS (vmode);
10210 bool one_vector_p = rtx_equal_p (op0, op1);
10211 rtx mask;
10212
10213 /* The TBL instruction does not use a modulo index, so we must take care
10214 of that ourselves. */
10215 mask = aarch64_simd_gen_const_vector_dup (vmode,
10216 one_vector_p ? nelt - 1 : 2 * nelt - 1);
10217 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
10218
10219 /* For big-endian, we also need to reverse the index within the vector
10220 (but not which vector). */
10221 if (BYTES_BIG_ENDIAN)
10222 {
10223 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
10224 if (!one_vector_p)
10225 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
10226 sel = expand_simple_binop (vmode, XOR, sel, mask,
10227 NULL, 0, OPTAB_LIB_WIDEN);
10228 }
10229 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
10230 }
10231
10232 /* Recognize patterns suitable for the TRN instructions. */
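/* For example (illustrative), on V4SImode with input vectors A and B,
   TRN1 produces { A[0], B[0], A[2], B[2] }, i.e. the index vector
   { 0, 4, 2, 6 }, and TRN2 produces { 1, 5, 3, 7 }; these are the shapes
   the checks below accept (before any big-endian correction).  */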
10233 static bool
10234 aarch64_evpc_trn (struct expand_vec_perm_d *d)
10235 {
10236 unsigned int i, odd, mask, nelt = d->nelt;
10237 rtx out, in0, in1, x;
10238 rtx (*gen) (rtx, rtx, rtx);
10239 machine_mode vmode = d->vmode;
10240
10241 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10242 return false;
10243
10244 /* Note that these are little-endian tests.
10245 We correct for big-endian later. */
10246 if (d->perm[0] == 0)
10247 odd = 0;
10248 else if (d->perm[0] == 1)
10249 odd = 1;
10250 else
10251 return false;
10252 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10253
10254 for (i = 0; i < nelt; i += 2)
10255 {
10256 if (d->perm[i] != i + odd)
10257 return false;
10258 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
10259 return false;
10260 }
10261
10262 /* Success! */
10263 if (d->testing_p)
10264 return true;
10265
10266 in0 = d->op0;
10267 in1 = d->op1;
10268 if (BYTES_BIG_ENDIAN)
10269 {
10270 x = in0, in0 = in1, in1 = x;
10271 odd = !odd;
10272 }
10273 out = d->target;
10274
10275 if (odd)
10276 {
10277 switch (vmode)
10278 {
10279 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
10280 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
10281 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
10282 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
10283 case V4SImode: gen = gen_aarch64_trn2v4si; break;
10284 case V2SImode: gen = gen_aarch64_trn2v2si; break;
10285 case V2DImode: gen = gen_aarch64_trn2v2di; break;
10286 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
10287 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
10288 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
10289 default:
10290 return false;
10291 }
10292 }
10293 else
10294 {
10295 switch (vmode)
10296 {
10297 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
10298 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
10299 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
10300 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
10301 case V4SImode: gen = gen_aarch64_trn1v4si; break;
10302 case V2SImode: gen = gen_aarch64_trn1v2si; break;
10303 case V2DImode: gen = gen_aarch64_trn1v2di; break;
10304 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
10305 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
10306 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
10307 default:
10308 return false;
10309 }
10310 }
10311
10312 emit_insn (gen (out, in0, in1));
10313 return true;
10314 }
10315
10316 /* Recognize patterns suitable for the UZP instructions. */
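/* For example (illustrative), on V4SImode with input vectors A and B,
   UZP1 produces { A[0], A[2], B[0], B[2] }, i.e. the index vector
   { 0, 2, 4, 6 }, and UZP2 produces { 1, 3, 5, 7 }.  */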
10317 static bool
10318 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10319 {
10320 unsigned int i, odd, mask, nelt = d->nelt;
10321 rtx out, in0, in1, x;
10322 rtx (*gen) (rtx, rtx, rtx);
10323 machine_mode vmode = d->vmode;
10324
10325 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10326 return false;
10327
10328 /* Note that these are little-endian tests.
10329 We correct for big-endian later. */
10330 if (d->perm[0] == 0)
10331 odd = 0;
10332 else if (d->perm[0] == 1)
10333 odd = 1;
10334 else
10335 return false;
10336 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10337
10338 for (i = 0; i < nelt; i++)
10339 {
10340 unsigned elt = (i * 2 + odd) & mask;
10341 if (d->perm[i] != elt)
10342 return false;
10343 }
10344
10345 /* Success! */
10346 if (d->testing_p)
10347 return true;
10348
10349 in0 = d->op0;
10350 in1 = d->op1;
10351 if (BYTES_BIG_ENDIAN)
10352 {
10353 x = in0, in0 = in1, in1 = x;
10354 odd = !odd;
10355 }
10356 out = d->target;
10357
10358 if (odd)
10359 {
10360 switch (vmode)
10361 {
10362 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10363 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10364 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10365 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10366 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10367 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10368 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10369 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10370 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10371 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10372 default:
10373 return false;
10374 }
10375 }
10376 else
10377 {
10378 switch (vmode)
10379 {
10380 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10381 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10382 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10383 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10384 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10385 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10386 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10387 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10388 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10389 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10390 default:
10391 return false;
10392 }
10393 }
10394
10395 emit_insn (gen (out, in0, in1));
10396 return true;
10397 }
10398
10399 /* Recognize patterns suitable for the ZIP instructions. */
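/* For example (illustrative), on V4SImode with input vectors A and B,
   ZIP1 produces { A[0], B[0], A[1], B[1] }, i.e. the index vector
   { 0, 4, 1, 5 }, and ZIP2 produces { 2, 6, 3, 7 }.  */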
10400 static bool
10401 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10402 {
10403 unsigned int i, high, mask, nelt = d->nelt;
10404 rtx out, in0, in1, x;
10405 rtx (*gen) (rtx, rtx, rtx);
10406 machine_mode vmode = d->vmode;
10407
10408 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10409 return false;
10410
10411 /* Note that these are little-endian tests.
10412 We correct for big-endian later. */
10413 high = nelt / 2;
10414 if (d->perm[0] == high)
10415 /* Do Nothing. */
10416 ;
10417 else if (d->perm[0] == 0)
10418 high = 0;
10419 else
10420 return false;
10421 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10422
10423 for (i = 0; i < nelt / 2; i++)
10424 {
10425 unsigned elt = (i + high) & mask;
10426 if (d->perm[i * 2] != elt)
10427 return false;
10428 elt = (elt + nelt) & mask;
10429 if (d->perm[i * 2 + 1] != elt)
10430 return false;
10431 }
10432
10433 /* Success! */
10434 if (d->testing_p)
10435 return true;
10436
10437 in0 = d->op0;
10438 in1 = d->op1;
10439 if (BYTES_BIG_ENDIAN)
10440 {
10441 x = in0, in0 = in1, in1 = x;
10442 high = !high;
10443 }
10444 out = d->target;
10445
10446 if (high)
10447 {
10448 switch (vmode)
10449 {
10450 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10451 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10452 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10453 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10454 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10455 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10456 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10457 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10458 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10459 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10460 default:
10461 return false;
10462 }
10463 }
10464 else
10465 {
10466 switch (vmode)
10467 {
10468 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10469 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10470 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10471 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10472 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10473 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10474 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10475 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10476 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10477 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10478 default:
10479 return false;
10480 }
10481 }
10482
10483 emit_insn (gen (out, in0, in1));
10484 return true;
10485 }
10486
10487 /* Recognize patterns for the EXT insn. */
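/* For example (illustrative), on V4SImode an EXT with offset 1 over
   vectors A and B yields { A[1], A[2], A[3], B[0] }, i.e. the index
   vector { 1, 2, 3, 4 } accepted by the check below.  */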
10488
10489 static bool
10490 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10491 {
10492 unsigned int i, nelt = d->nelt;
10493 rtx (*gen) (rtx, rtx, rtx, rtx);
10494 rtx offset;
10495
10496 unsigned int location = d->perm[0]; /* Always < nelt. */
10497
10498 /* Check if the extracted indices are increasing by one. */
10499 for (i = 1; i < nelt; i++)
10500 {
10501 unsigned int required = location + i;
10502 if (d->one_vector_p)
10503 {
10504 /* We'll pass the same vector in twice, so allow indices to wrap. */
10505 required &= (nelt - 1);
10506 }
10507 if (d->perm[i] != required)
10508 return false;
10509 }
10510
10511 switch (d->vmode)
10512 {
10513 case V16QImode: gen = gen_aarch64_extv16qi; break;
10514 case V8QImode: gen = gen_aarch64_extv8qi; break;
10515 case V4HImode: gen = gen_aarch64_extv4hi; break;
10516 case V8HImode: gen = gen_aarch64_extv8hi; break;
10517 case V2SImode: gen = gen_aarch64_extv2si; break;
10518 case V4SImode: gen = gen_aarch64_extv4si; break;
10519 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10520 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10521 case V2DImode: gen = gen_aarch64_extv2di; break;
10522 case V2DFmode: gen = gen_aarch64_extv2df; break;
10523 default:
10524 return false;
10525 }
10526
10527 /* Success! */
10528 if (d->testing_p)
10529 return true;
10530
10531 /* The case where (location == 0) is a no-op for both big- and little-endian,
10532 and is removed by the mid-end at optimization levels -O1 and higher. */
10533
10534 if (BYTES_BIG_ENDIAN && (location != 0))
10535 {
10536 /* After setup, we want the high elements of the first vector (stored
10537 at the LSB end of the register), and the low elements of the second
10538 vector (stored at the MSB end of the register). So swap. */
10539 std::swap (d->op0, d->op1);
10540 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10541 location = nelt - location;
10542 }
10543
10544 offset = GEN_INT (location);
10545 emit_insn (gen (d->target, d->op0, d->op1, offset));
10546 return true;
10547 }
10548
10549 /* Recognize patterns for the REV insns. */
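/* For example (illustrative), DIFF == 7 on V8QImode corresponds to the
   index vector { 7, 6, 5, 4, 3, 2, 1, 0 }, a byte reversal within each
   64-bit chunk, which maps to REV64; DIFF == 1 on V4HImode corresponds
   to { 1, 0, 3, 2 }, which maps to REV32.  */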
10550
10551 static bool
10552 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10553 {
10554 unsigned int i, j, diff, nelt = d->nelt;
10555 rtx (*gen) (rtx, rtx);
10556
10557 if (!d->one_vector_p)
10558 return false;
10559
10560 diff = d->perm[0];
10561 switch (diff)
10562 {
10563 case 7:
10564 switch (d->vmode)
10565 {
10566 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10567 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10568 default:
10569 return false;
10570 }
10571 break;
10572 case 3:
10573 switch (d->vmode)
10574 {
10575 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10576 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10577 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10578 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10579 default:
10580 return false;
10581 }
10582 break;
10583 case 1:
10584 switch (d->vmode)
10585 {
10586 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10587 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10588 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10589 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10590 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10591 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10592 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10593 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10594 default:
10595 return false;
10596 }
10597 break;
10598 default:
10599 return false;
10600 }
10601
10602 for (i = 0; i < nelt ; i += diff + 1)
10603 for (j = 0; j <= diff; j += 1)
10604 {
10605 /* This is guaranteed to hold because DIFF can only be
10606 7, 3 or 1 here, so the vector always has enough
10607 elements for each group. Seeing a permutation with
10608 any other value of DIFF would mean something went
10609 wrong before we got here. */
10610 gcc_assert (i + j < nelt);
10611 if (d->perm[i + j] != i + diff - j)
10612 return false;
10613 }
10614
10615 /* Success! */
10616 if (d->testing_p)
10617 return true;
10618
10619 emit_insn (gen (d->target, d->op0));
10620 return true;
10621 }
10622
10623 static bool
10624 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10625 {
10626 rtx (*gen) (rtx, rtx, rtx);
10627 rtx out = d->target;
10628 rtx in0;
10629 machine_mode vmode = d->vmode;
10630 unsigned int i, elt, nelt = d->nelt;
10631 rtx lane;
10632
10633 elt = d->perm[0];
10634 for (i = 1; i < nelt; i++)
10635 {
10636 if (elt != d->perm[i])
10637 return false;
10638 }
10639
10640 /* The generic preparation in aarch64_expand_vec_perm_const_1
10641 swaps the operand order and the permute indices if it finds
10642 d->perm[0] to be in the second operand. Thus, we can always
10643 use d->op0 and need not do any extra arithmetic to get the
10644 correct lane number. */
10645 in0 = d->op0;
10646 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10647
10648 switch (vmode)
10649 {
10650 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10651 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10652 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10653 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10654 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10655 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10656 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10657 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10658 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10659 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10660 default:
10661 return false;
10662 }
10663
10664 emit_insn (gen (out, in0, lane));
10665 return true;
10666 }
10667
10668 static bool
10669 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10670 {
10671 rtx rperm[MAX_VECT_LEN], sel;
10672 machine_mode vmode = d->vmode;
10673 unsigned int i, nelt = d->nelt;
10674
10675 if (d->testing_p)
10676 return true;
10677
10678 /* Generic code will try constant permutation twice: once with the
10679 original mode and again with the elements lowered to QImode.
10680 So wait for that and don't do the selector expansion ourselves. */
10681 if (vmode != V8QImode && vmode != V16QImode)
10682 return false;
10683
10684 for (i = 0; i < nelt; ++i)
10685 {
10686 int nunits = GET_MODE_NUNITS (vmode);
10687
10688 /* If big-endian and two vectors we end up with a weird mixed-endian
10689 mode on NEON. Reverse the index within each word but not the word
10690 itself. */
10691 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10692 : d->perm[i]);
10693 }
10694 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10695 sel = force_reg (vmode, sel);
10696
10697 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10698 return true;
10699 }
10700
10701 static bool
10702 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10703 {
10704 /* The pattern matching functions above are written to look for a small
10705 number to begin the sequence (0, 1, N/2). If we begin with an index
10706 from the second operand, we can swap the operands. */
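  /* For example (illustrative), a two-operand V4SImode permutation
     { 4, 5, 6, 7 } becomes { 0, 1, 2, 3 } on the swapped operands.  */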
10707 if (d->perm[0] >= d->nelt)
10708 {
10709 unsigned i, nelt = d->nelt;
10710
10711 gcc_assert (nelt == (nelt & -nelt));
10712 for (i = 0; i < nelt; ++i)
10713 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10714
10715 std::swap (d->op0, d->op1);
10716 }
10717
10718 if (TARGET_SIMD)
10719 {
10720 if (aarch64_evpc_rev (d))
10721 return true;
10722 else if (aarch64_evpc_ext (d))
10723 return true;
10724 else if (aarch64_evpc_dup (d))
10725 return true;
10726 else if (aarch64_evpc_zip (d))
10727 return true;
10728 else if (aarch64_evpc_uzp (d))
10729 return true;
10730 else if (aarch64_evpc_trn (d))
10731 return true;
10732 return aarch64_evpc_tbl (d);
10733 }
10734 return false;
10735 }
10736
10737 /* Expand a vec_perm_const pattern. */
10738
10739 bool
10740 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10741 {
10742 struct expand_vec_perm_d d;
10743 int i, nelt, which;
10744
10745 d.target = target;
10746 d.op0 = op0;
10747 d.op1 = op1;
10748
10749 d.vmode = GET_MODE (target);
10750 gcc_assert (VECTOR_MODE_P (d.vmode));
10751 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10752 d.testing_p = false;
10753
10754 for (i = which = 0; i < nelt; ++i)
10755 {
10756 rtx e = XVECEXP (sel, 0, i);
10757 int ei = INTVAL (e) & (2 * nelt - 1);
10758 which |= (ei < nelt ? 1 : 2);
10759 d.perm[i] = ei;
10760 }
10761
10762 switch (which)
10763 {
10764 default:
10765 gcc_unreachable ();
10766
10767 case 3:
10768 d.one_vector_p = false;
10769 if (!rtx_equal_p (op0, op1))
10770 break;
10771
10772 /* The elements of PERM do not suggest that only the first operand
10773 is used, but both operands are identical. Allow easier matching
10774 of the permutation by folding the permutation into the single
10775 input vector. */
10776 /* Fall Through. */
10777 case 2:
10778 for (i = 0; i < nelt; ++i)
10779 d.perm[i] &= nelt - 1;
10780 d.op0 = op1;
10781 d.one_vector_p = true;
10782 break;
10783
10784 case 1:
10785 d.op1 = op0;
10786 d.one_vector_p = true;
10787 break;
10788 }
10789
10790 return aarch64_expand_vec_perm_const_1 (&d);
10791 }
10792
10793 static bool
10794 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10795 const unsigned char *sel)
10796 {
10797 struct expand_vec_perm_d d;
10798 unsigned int i, nelt, which;
10799 bool ret;
10800
10801 d.vmode = vmode;
10802 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10803 d.testing_p = true;
10804 memcpy (d.perm, sel, nelt);
10805
10806 /* Calculate whether all elements are in one vector. */
10807 for (i = which = 0; i < nelt; ++i)
10808 {
10809 unsigned char e = d.perm[i];
10810 gcc_assert (e < 2 * nelt);
10811 which |= (e < nelt ? 1 : 2);
10812 }
10813
10814 /* If all elements are from the second vector, reindex as if from the
10815 first vector. */
10816 if (which == 2)
10817 for (i = 0; i < nelt; ++i)
10818 d.perm[i] -= nelt;
10819
10820 /* Check whether the mask can be applied to a single vector. */
10821 d.one_vector_p = (which != 3);
10822
10823 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10824 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10825 if (!d.one_vector_p)
10826 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10827
10828 start_sequence ();
10829 ret = aarch64_expand_vec_perm_const_1 (&d);
10830 end_sequence ();
10831
10832 return ret;
10833 }
10834
10835 rtx
10836 aarch64_reverse_mask (enum machine_mode mode)
10837 {
10838 /* We have to reverse each vector because we don't have
10839 a permuted load that can reverse-load according to ABI rules. */
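  /* For example (illustrative), for V4SImode the mask built below is the
     byte vector { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
     i.e. a byte reversal within each 4-byte element.  */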
10840 rtx mask;
10841 rtvec v = rtvec_alloc (16);
10842 int i, j;
10843 int nunits = GET_MODE_NUNITS (mode);
10844 int usize = GET_MODE_UNIT_SIZE (mode);
10845
10846 gcc_assert (BYTES_BIG_ENDIAN);
10847 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10848
10849 for (i = 0; i < nunits; i++)
10850 for (j = 0; j < usize; j++)
10851 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10852 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10853 return force_reg (V16QImode, mask);
10854 }
10855
10856 /* Implement MODES_TIEABLE_P. */
10857
10858 bool
10859 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10860 {
10861 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10862 return true;
10863
10864 /* We specifically want to allow elements of "structure" modes to
10865 be tieable to the structure. This more general condition allows
10866 other rarer situations too. */
10867 if (TARGET_SIMD
10868 && aarch64_vector_mode_p (mode1)
10869 && aarch64_vector_mode_p (mode2))
10870 return true;
10871
10872 return false;
10873 }
10874
10875 /* Return a new RTX holding the result of moving POINTER forward by
10876 AMOUNT bytes. */
10877
10878 static rtx
10879 aarch64_move_pointer (rtx pointer, int amount)
10880 {
10881 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10882
10883 return adjust_automodify_address (pointer, GET_MODE (pointer),
10884 next, amount);
10885 }
10886
10887 /* Return a new RTX holding the result of moving POINTER forward by the
10888 size of the mode it points to. */
10889
10890 static rtx
10891 aarch64_progress_pointer (rtx pointer)
10892 {
10893 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10894
10895 return aarch64_move_pointer (pointer, amount);
10896 }
10897
10898 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10899 the size of MODE. */
10900
10901 static void
10902 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10903 machine_mode mode)
10904 {
10905 rtx reg = gen_reg_rtx (mode);
10906
10907 /* "Cast" the pointers to the correct mode. */
10908 *src = adjust_address (*src, mode, 0);
10909 *dst = adjust_address (*dst, mode, 0);
10910 /* Emit the memcpy. */
10911 emit_move_insn (reg, *src);
10912 emit_move_insn (*dst, reg);
10913 /* Move the pointers forward. */
10914 *src = aarch64_progress_pointer (*src);
10915 *dst = aarch64_progress_pointer (*dst);
10916 }
10917
10918 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10919 we succeed, otherwise return false. */
10920
10921 bool
10922 aarch64_expand_movmem (rtx *operands)
10923 {
10924 unsigned int n;
10925 rtx dst = operands[0];
10926 rtx src = operands[1];
10927 rtx base;
10928 bool speed_p = !optimize_function_for_size_p (cfun);
10929
10930 /* When optimizing for size, give a better estimate of the length of a
10931 memcpy call, but use the default otherwise. */
10932 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10933
10934 /* We can't do anything smart if the amount to copy is not constant. */
10935 if (!CONST_INT_P (operands[2]))
10936 return false;
10937
10938 n = UINTVAL (operands[2]);
10939
10940 /* Try to keep the number of instructions low. For cases below 16 bytes we
10941 need to make at most two moves. For cases above 16 bytes it will be one
10942 move for each 16 byte chunk, then at most two additional moves. */
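  /* For example (illustrative), a 7-byte copy is emitted below as two
     overlapping 4-byte copies (bytes 0-3 and 3-6), and a 35-byte copy as
     two 16-byte copies followed by one 4-byte copy of bytes 31-34.  */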
10943 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10944 return false;
10945
10946 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10947 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10948
10949 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10950 src = adjust_automodify_address (src, VOIDmode, base, 0);
10951
10952 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10953 1-byte chunk. */
10954 if (n < 4)
10955 {
10956 if (n >= 2)
10957 {
10958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10959 n -= 2;
10960 }
10961
10962 if (n == 1)
10963 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10964
10965 return true;
10966 }
10967
10968 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10969 4-byte chunk, partially overlapping with the previously copied chunk. */
10970 if (n < 8)
10971 {
10972 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10973 n -= 4;
10974 if (n > 0)
10975 {
10976 int move = n - 4;
10977
10978 src = aarch64_move_pointer (src, move);
10979 dst = aarch64_move_pointer (dst, move);
10980 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10981 }
10982 return true;
10983 }
10984
10985 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10986 them, then (if applicable) an 8-byte chunk. */
10987 while (n >= 8)
10988 {
10989 if (n / 16)
10990 {
10991 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10992 n -= 16;
10993 }
10994 else
10995 {
10996 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10997 n -= 8;
10998 }
10999 }
11000
11001 /* Finish the final bytes of the copy. We can always do this in one
11002 instruction. We either copy the exact amount we need, or partially
11003 overlap with the previous chunk we copied and copy 8 bytes. */
11004 if (n == 0)
11005 return true;
11006 else if (n == 1)
11007 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
11008 else if (n == 2)
11009 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
11010 else if (n == 4)
11011 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11012 else
11013 {
11014 if (n == 3)
11015 {
11016 src = aarch64_move_pointer (src, -1);
11017 dst = aarch64_move_pointer (dst, -1);
11018 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11019 }
11020 else
11021 {
11022 int move = n - 8;
11023
11024 src = aarch64_move_pointer (src, move);
11025 dst = aarch64_move_pointer (dst, move);
11026 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
11027 }
11028 }
11029
11030 return true;
11031 }
11032
11033 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
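/* The offset returned here is combined with ASan's usual mapping: with the
   default shadow scale of 3, the shadow byte for an address A lives at
   (A >> 3) + (1 << 36).  (Illustrative note; the mapping itself is emitted
   by the generic sanitizer code, not here.)  */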
11034
11035 static unsigned HOST_WIDE_INT
11036 aarch64_asan_shadow_offset (void)
11037 {
11038 return (HOST_WIDE_INT_1 << 36);
11039 }
11040
11041 static bool
11042 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
11043 unsigned int align,
11044 enum by_pieces_operation op,
11045 bool speed_p)
11046 {
11047 /* STORE_BY_PIECES can be used when copying a constant string, but
11048 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
11049 For now we always fail this and let the move_by_pieces code copy
11050 the string from read-only memory. */
11051 if (op == STORE_BY_PIECES)
11052 return false;
11053
11054 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
11055 }
11056
11057 static enum machine_mode
11058 aarch64_code_to_ccmode (enum rtx_code code)
11059 {
11060 switch (code)
11061 {
11062 case NE:
11063 return CC_DNEmode;
11064
11065 case EQ:
11066 return CC_DEQmode;
11067
11068 case LE:
11069 return CC_DLEmode;
11070
11071 case LT:
11072 return CC_DLTmode;
11073
11074 case GE:
11075 return CC_DGEmode;
11076
11077 case GT:
11078 return CC_DGTmode;
11079
11080 case LEU:
11081 return CC_DLEUmode;
11082
11083 case LTU:
11084 return CC_DLTUmode;
11085
11086 case GEU:
11087 return CC_DGEUmode;
11088
11089 case GTU:
11090 return CC_DGTUmode;
11091
11092 default:
11093 return CCmode;
11094 }
11095 }
11096
11097 static rtx
11098 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
11099 int code, tree treeop0, tree treeop1)
11100 {
11101 enum machine_mode op_mode, cmp_mode, cc_mode;
11102 rtx op0, op1, cmp, target;
11103 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11104 enum insn_code icode;
11105 struct expand_operand ops[4];
11106
11107 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
11108 if (cc_mode == CCmode)
11109 return NULL_RTX;
11110
11111 start_sequence ();
11112 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11113
11114 op_mode = GET_MODE (op0);
11115 if (op_mode == VOIDmode)
11116 op_mode = GET_MODE (op1);
11117
11118 switch (op_mode)
11119 {
11120 case QImode:
11121 case HImode:
11122 case SImode:
11123 cmp_mode = SImode;
11124 icode = CODE_FOR_cmpsi;
11125 break;
11126
11127 case DImode:
11128 cmp_mode = DImode;
11129 icode = CODE_FOR_cmpdi;
11130 break;
11131
11132 default:
11133 end_sequence ();
11134 return NULL_RTX;
11135 }
11136
11137 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11138 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11139 if (!op0 || !op1)
11140 {
11141 end_sequence ();
11142 return NULL_RTX;
11143 }
11144 *prep_seq = get_insns ();
11145 end_sequence ();
11146
11147 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
11148 target = gen_rtx_REG (CCmode, CC_REGNUM);
11149
11150 create_output_operand (&ops[0], target, CCmode);
11151 create_fixed_operand (&ops[1], cmp);
11152 create_fixed_operand (&ops[2], op0);
11153 create_fixed_operand (&ops[3], op1);
11154
11155 start_sequence ();
11156 if (!maybe_expand_insn (icode, 4, ops))
11157 {
11158 end_sequence ();
11159 return NULL_RTX;
11160 }
11161 *gen_seq = get_insns ();
11162 end_sequence ();
11163
11164 return gen_rtx_REG (cc_mode, CC_REGNUM);
11165 }
11166
11167 static rtx
11168 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
11169 tree treeop0, tree treeop1, int bit_code)
11170 {
11171 rtx op0, op1, cmp0, cmp1, target;
11172 enum machine_mode op_mode, cmp_mode, cc_mode;
11173 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11174 enum insn_code icode = CODE_FOR_ccmp_andsi;
11175 struct expand_operand ops[6];
11176
11177 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
11178 if (cc_mode == CCmode)
11179 return NULL_RTX;
11180
11181 push_to_sequence ((rtx_insn*) *prep_seq);
11182 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11183
11184 op_mode = GET_MODE (op0);
11185 if (op_mode == VOIDmode)
11186 op_mode = GET_MODE (op1);
11187
11188 switch (op_mode)
11189 {
11190 case QImode:
11191 case HImode:
11192 case SImode:
11193 cmp_mode = SImode;
11194 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
11195 : CODE_FOR_ccmp_iorsi;
11196 break;
11197
11198 case DImode:
11199 cmp_mode = DImode;
11200 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
11201 : CODE_FOR_ccmp_iordi;
11202 break;
11203
11204 default:
11205 end_sequence ();
11206 return NULL_RTX;
11207 }
11208
11209 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11210 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11211 if (!op0 || !op1)
11212 {
11213 end_sequence ();
11214 return NULL_RTX;
11215 }
11216 *prep_seq = get_insns ();
11217 end_sequence ();
11218
11219 target = gen_rtx_REG (cc_mode, CC_REGNUM);
11220 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
11221 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
11222
11223 create_fixed_operand (&ops[0], prev);
11224 create_fixed_operand (&ops[1], target);
11225 create_fixed_operand (&ops[2], op0);
11226 create_fixed_operand (&ops[3], op1);
11227 create_fixed_operand (&ops[4], cmp0);
11228 create_fixed_operand (&ops[5], cmp1);
11229
11230 push_to_sequence ((rtx_insn*) *gen_seq);
11231 if (!maybe_expand_insn (icode, 6, ops))
11232 {
11233 end_sequence ();
11234 return NULL_RTX;
11235 }
11236
11237 *gen_seq = get_insns ();
11238 end_sequence ();
11239
11240 return target;
11241 }
11242
11243 #undef TARGET_GEN_CCMP_FIRST
11244 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
11245
11246 #undef TARGET_GEN_CCMP_NEXT
11247 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
11248
11249 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
11250 instruction fusion of some sort. */
11251
11252 static bool
11253 aarch64_macro_fusion_p (void)
11254 {
11255 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
11256 }
11257
11258
11259 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
11260 should be kept together during scheduling. */
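/* For example (illustrative), on a core whose tuning sets
   AARCH64_FUSE_MOV_MOVK, a pair such as

     mov  x0, 0x1234
     movk x0, 0x5678, lsl 16

   is kept adjacent, and with AARCH64_FUSE_ADRP_ADD so is

     adrp x0, sym
     add  x0, x0, :lo12:sym  */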
11261
11262 static bool
11263 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
11264 {
11265 rtx set_dest;
11266 rtx prev_set = single_set (prev);
11267 rtx curr_set = single_set (curr);
11268 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
11269 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
11270
11271 if (!aarch64_macro_fusion_p ())
11272 return false;
11273
11274 if (simple_sets_p
11275 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
11276 {
11277 /* We are trying to match:
11278 prev (mov) == (set (reg r0) (const_int imm16))
11279 curr (movk) == (set (zero_extract (reg r0)
11280 (const_int 16)
11281 (const_int 16))
11282 (const_int imm16_1)) */
11283
11284 set_dest = SET_DEST (curr_set);
11285
11286 if (GET_CODE (set_dest) == ZERO_EXTRACT
11287 && CONST_INT_P (SET_SRC (curr_set))
11288 && CONST_INT_P (SET_SRC (prev_set))
11289 && CONST_INT_P (XEXP (set_dest, 2))
11290 && INTVAL (XEXP (set_dest, 2)) == 16
11291 && REG_P (XEXP (set_dest, 0))
11292 && REG_P (SET_DEST (prev_set))
11293 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
11294 {
11295 return true;
11296 }
11297 }
11298
11299 if (simple_sets_p
11300 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
11301 {
11302
11303 /* We're trying to match:
11304 prev (adrp) == (set (reg r1)
11305 (high (symbol_ref ("SYM"))))
11306 curr (add) == (set (reg r0)
11307 (lo_sum (reg r1)
11308 (symbol_ref ("SYM"))))
11309 Note that r0 need not necessarily be the same as r1, especially
11310 during pre-regalloc scheduling. */
11311
11312 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11313 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11314 {
11315 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11316 && REG_P (XEXP (SET_SRC (curr_set), 0))
11317 && REGNO (XEXP (SET_SRC (curr_set), 0))
11318 == REGNO (SET_DEST (prev_set))
11319 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11320 XEXP (SET_SRC (curr_set), 1)))
11321 return true;
11322 }
11323 }
11324
11325 if (simple_sets_p
11326 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11327 {
11328
11329 /* We're trying to match:
11330 prev (movk) == (set (zero_extract (reg r0)
11331 (const_int 16)
11332 (const_int 32))
11333 (const_int imm16_1))
11334 curr (movk) == (set (zero_extract (reg r0)
11335 (const_int 16)
11336 (const_int 48))
11337 (const_int imm16_2)) */
11338
11339 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11340 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11341 && REG_P (XEXP (SET_DEST (prev_set), 0))
11342 && REG_P (XEXP (SET_DEST (curr_set), 0))
11343 && REGNO (XEXP (SET_DEST (prev_set), 0))
11344 == REGNO (XEXP (SET_DEST (curr_set), 0))
11345 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11346 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11347 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11348 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11349 && CONST_INT_P (SET_SRC (prev_set))
11350 && CONST_INT_P (SET_SRC (curr_set)))
11351 return true;
11352
11353 }
11354 if (simple_sets_p
11355 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
11356 {
11357 /* We're trying to match:
11358 prev (adrp) == (set (reg r0)
11359 (high (symbol_ref ("SYM"))))
11360 curr (ldr) == (set (reg r1)
11361 (mem (lo_sum (reg r0)
11362 (symbol_ref ("SYM")))))
11363 or
11364 curr (ldr) == (set (reg r1)
11365 (zero_extend (mem
11366 (lo_sum (reg r0)
11367 (symbol_ref ("SYM")))))) */
11368 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11369 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11370 {
11371 rtx curr_src = SET_SRC (curr_set);
11372
11373 if (GET_CODE (curr_src) == ZERO_EXTEND)
11374 curr_src = XEXP (curr_src, 0);
11375
11376 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11377 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11378 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11379 == REGNO (SET_DEST (prev_set))
11380 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11381 XEXP (SET_SRC (prev_set), 0)))
11382 return true;
11383 }
11384 }
11385
11386 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11387 && any_condjump_p (curr))
11388 {
11389 enum attr_type prev_type = get_attr_type (prev);
11390
11391 /* FIXME: this misses some instructions that ThunderX considers
11392 simple arithmetic; in particular, simple shifts are missed here. */
11393 if (prev_type == TYPE_ALUS_SREG
11394 || prev_type == TYPE_ALUS_IMM
11395 || prev_type == TYPE_LOGICS_REG
11396 || prev_type == TYPE_LOGICS_IMM)
11397 return true;
11398 }
11399
11400 return false;
11401 }
11402
11403 /* If MEM is in the form of [base+offset], extract the two parts of the
11404 address and store them in BASE and OFFSET; otherwise return false
11405 after clearing BASE and OFFSET. */
11406
11407 bool
11408 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11409 {
11410 rtx addr;
11411
11412 gcc_assert (MEM_P (mem));
11413
11414 addr = XEXP (mem, 0);
11415
11416 if (REG_P (addr))
11417 {
11418 *base = addr;
11419 *offset = const0_rtx;
11420 return true;
11421 }
11422
11423 if (GET_CODE (addr) == PLUS
11424 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11425 {
11426 *base = XEXP (addr, 0);
11427 *offset = XEXP (addr, 1);
11428 return true;
11429 }
11430
11431 *base = NULL_RTX;
11432 *offset = NULL_RTX;
11433
11434 return false;
11435 }
11436
11437 /* Types for scheduling fusion. */
11438 enum sched_fusion_type
11439 {
11440 SCHED_FUSION_NONE = 0,
11441 SCHED_FUSION_LD_SIGN_EXTEND,
11442 SCHED_FUSION_LD_ZERO_EXTEND,
11443 SCHED_FUSION_LD,
11444 SCHED_FUSION_ST,
11445 SCHED_FUSION_NUM
11446 };
11447
11448 /* If INSN is a load or store whose address is in the form [base+offset],
11449 extract the two parts and store them in BASE and OFFSET. Return the
11450 scheduling fusion type of this INSN. */
11451
11452 static enum sched_fusion_type
11453 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11454 {
11455 rtx x, dest, src;
11456 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11457
11458 gcc_assert (INSN_P (insn));
11459 x = PATTERN (insn);
11460 if (GET_CODE (x) != SET)
11461 return SCHED_FUSION_NONE;
11462
11463 src = SET_SRC (x);
11464 dest = SET_DEST (x);
11465
11466 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11467 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11468 return SCHED_FUSION_NONE;
11469
11470 if (GET_CODE (src) == SIGN_EXTEND)
11471 {
11472 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11473 src = XEXP (src, 0);
11474 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11475 return SCHED_FUSION_NONE;
11476 }
11477 else if (GET_CODE (src) == ZERO_EXTEND)
11478 {
11479 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11480 src = XEXP (src, 0);
11481 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11482 return SCHED_FUSION_NONE;
11483 }
11484
11485 if (GET_CODE (src) == MEM && REG_P (dest))
11486 extract_base_offset_in_addr (src, base, offset);
11487 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11488 {
11489 fusion = SCHED_FUSION_ST;
11490 extract_base_offset_in_addr (dest, base, offset);
11491 }
11492 else
11493 return SCHED_FUSION_NONE;
11494
11495 if (*base == NULL_RTX || *offset == NULL_RTX)
11496 fusion = SCHED_FUSION_NONE;
11497
11498 return fusion;
11499 }
11500
11501 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11502
11503 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11504 and PRI are only calculated for these instructions. For other instructions,
11505 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
11506 other instruction types can be added by returning different priorities.
11507
11508 It's important that irrelevant instructions get the largest FUSION_PRI. */
11509
11510 static void
11511 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11512 int *fusion_pri, int *pri)
11513 {
11514 int tmp, off_val;
11515 rtx base, offset;
11516 enum sched_fusion_type fusion;
11517
11518 gcc_assert (INSN_P (insn));
11519
11520 tmp = max_pri - 1;
11521 fusion = fusion_load_store (insn, &base, &offset);
11522 if (fusion == SCHED_FUSION_NONE)
11523 {
11524 *pri = tmp;
11525 *fusion_pri = tmp;
11526 return;
11527 }
11528
11529 /* Set FUSION_PRI according to fusion type and base register. */
11530 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11531
11532 /* Calculate PRI. */
11533 tmp /= 2;
11534
11535 /* INSN with smaller offset goes first. */
11536 off_val = (int)(INTVAL (offset));
11537 if (off_val >= 0)
11538 tmp -= (off_val & 0xfffff);
11539 else
11540 tmp += ((- off_val) & 0xfffff);
11541
11542 *pri = tmp;
11543 return;
11544 }
11545
11546 /* Given OPERANDS of consecutive load/store, check if we can merge
11547 them into ldp/stp. LOAD is true if they are load instructions.
11548 MODE is the mode of memory operands. */
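
/* For example (illustrative), with MODE == SImode the pair

     ldr  w0, [x2, 4]
     ldr  w1, [x2, 8]

   passes the checks below and can be merged into ldp w0, w1, [x2, 4].  */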
11549
11550 bool
11551 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11552 enum machine_mode mode)
11553 {
11554 HOST_WIDE_INT offval_1, offval_2, msize;
11555 enum reg_class rclass_1, rclass_2;
11556 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11557
11558 if (load)
11559 {
11560 mem_1 = operands[1];
11561 mem_2 = operands[3];
11562 reg_1 = operands[0];
11563 reg_2 = operands[2];
11564 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11565 if (REGNO (reg_1) == REGNO (reg_2))
11566 return false;
11567 }
11568 else
11569 {
11570 mem_1 = operands[0];
11571 mem_2 = operands[2];
11572 reg_1 = operands[1];
11573 reg_2 = operands[3];
11574 }
11575
11576 /* The mems cannot be volatile. */
11577 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11578 return false;
11579
11580 /* Check if the addresses are in the form of [base+offset]. */
11581 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11582 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11583 return false;
11584 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11585 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11586 return false;
11587
11588 /* Check if the bases are the same. */
11589 if (!rtx_equal_p (base_1, base_2))
11590 return false;
11591
11592 offval_1 = INTVAL (offset_1);
11593 offval_2 = INTVAL (offset_2);
11594 msize = GET_MODE_SIZE (mode);
11595 /* Check if the offsets are consecutive. */
11596 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11597 return false;
11598
11599 /* Check if the addresses are clobbered by load. */
11600 if (load)
11601 {
11602 if (reg_mentioned_p (reg_1, mem_1))
11603 return false;
11604
11605 /* In increasing order, the last load can clobber the address. */
11606 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11607 return false;
11608 }
11609
11610 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11611 rclass_1 = FP_REGS;
11612 else
11613 rclass_1 = GENERAL_REGS;
11614
11615 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11616 rclass_2 = FP_REGS;
11617 else
11618 rclass_2 = GENERAL_REGS;
11619
11620 /* Check if the registers are of the same class. */
11621 if (rclass_1 != rclass_2)
11622 return false;
11623
11624 return true;
11625 }
11626
11627 /* Given OPERANDS of consecutive load/store, check if we can merge
11628 them into ldp/stp by adjusting the offset. LOAD is true if they
11629 are load instructions. MODE is the mode of memory operands.
11630
11631 Consider the consecutive stores below:
11632
11633 str w1, [xb, 0x100]
11634 str w1, [xb, 0x104]
11635 str w1, [xb, 0x108]
11636 str w1, [xb, 0x10c]
11637
11638 Though the offsets are out of the range supported by stp, we can
11639 still pair them after adjusting the offset, like:
11640
11641 add scratch, xb, 0x100
11642 stp w1, w1, [scratch]
11643 stp w1, w1, [scratch, 0x8]
11644
11645 The peephole patterns detecting this opportunity should guarantee
11646 the scratch register is available. */
11647
11648 bool
11649 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11650 enum machine_mode mode)
11651 {
11652 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11653 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11654 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11655 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11656
11657 if (load)
11658 {
11659 reg_1 = operands[0];
11660 mem_1 = operands[1];
11661 reg_2 = operands[2];
11662 mem_2 = operands[3];
11663 reg_3 = operands[4];
11664 mem_3 = operands[5];
11665 reg_4 = operands[6];
11666 mem_4 = operands[7];
11667 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11668 && REG_P (reg_3) && REG_P (reg_4));
11669 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11670 return false;
11671 }
11672 else
11673 {
11674 mem_1 = operands[0];
11675 reg_1 = operands[1];
11676 mem_2 = operands[2];
11677 reg_2 = operands[3];
11678 mem_3 = operands[4];
11679 reg_3 = operands[5];
11680 mem_4 = operands[6];
11681 reg_4 = operands[7];
11682 }
11683 /* Skip if the memory operand is by itself valid for ldp/stp. */
11684 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11685 return false;
11686
11687 /* The mems cannot be volatile. */
11688 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11689 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11690 return false;
11691
11692 /* Check if the addresses are in the form of [base+offset]. */
11693 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11694 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11695 return false;
11696 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11697 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11698 return false;
11699 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11700 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11701 return false;
11702 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11703 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11704 return false;
11705
11706 /* Check if the bases are the same. */
11707 if (!rtx_equal_p (base_1, base_2)
11708 || !rtx_equal_p (base_2, base_3)
11709 || !rtx_equal_p (base_3, base_4))
11710 return false;
11711
11712 offval_1 = INTVAL (offset_1);
11713 offval_2 = INTVAL (offset_2);
11714 offval_3 = INTVAL (offset_3);
11715 offval_4 = INTVAL (offset_4);
11716 msize = GET_MODE_SIZE (mode);
11717 /* Check if the offsets are consecutive. */
11718 if ((offval_1 != (offval_2 + msize)
11719 || offval_1 != (offval_3 + msize * 2)
11720 || offval_1 != (offval_4 + msize * 3))
11721 && (offval_4 != (offval_3 + msize)
11722 || offval_4 != (offval_2 + msize * 2)
11723 || offval_4 != (offval_1 + msize * 3)))
11724 return false;
11725
11726 /* Check if the addresses are clobbered by load. */
11727 if (load)
11728 {
11729 if (reg_mentioned_p (reg_1, mem_1)
11730 || reg_mentioned_p (reg_2, mem_2)
11731 || reg_mentioned_p (reg_3, mem_3))
11732 return false;
11733
11734 /* In increasing order, the last load can clobber the address. */
11735 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11736 return false;
11737 }
11738
11739 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11740 rclass_1 = FP_REGS;
11741 else
11742 rclass_1 = GENERAL_REGS;
11743
11744 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11745 rclass_2 = FP_REGS;
11746 else
11747 rclass_2 = GENERAL_REGS;
11748
11749 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11750 rclass_3 = FP_REGS;
11751 else
11752 rclass_3 = GENERAL_REGS;
11753
11754 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11755 rclass_4 = FP_REGS;
11756 else
11757 rclass_4 = GENERAL_REGS;
11758
11759 /* Check if the registers are of the same class. */
11760 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11761 return false;
11762
11763 return true;
11764 }
11765
11766 /* Given OPERANDS of consecutive load/store, this function pairs them
11767 into ldp/stp after adjusting the offset. It depends on the fact
11768 that addresses of load/store instructions are in increasing order.
11769 MODE is the mode of memory operands. CODE is the rtl operator
11770 which should be applied to all memory operands; it is SIGN_EXTEND,
11771 ZERO_EXTEND or UNKNOWN. */
11772
11773 bool
11774 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11775 enum machine_mode mode, RTX_CODE code)
11776 {
11777 rtx base, offset, t1, t2;
11778 rtx mem_1, mem_2, mem_3, mem_4;
11779 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11780
11781 if (load)
11782 {
11783 mem_1 = operands[1];
11784 mem_2 = operands[3];
11785 mem_3 = operands[5];
11786 mem_4 = operands[7];
11787 }
11788 else
11789 {
11790 mem_1 = operands[0];
11791 mem_2 = operands[2];
11792 mem_3 = operands[4];
11793 mem_4 = operands[6];
11794 gcc_assert (code == UNKNOWN);
11795 }
11796
11797 extract_base_offset_in_addr (mem_1, &base, &offset);
11798 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11799
11800 /* Adjust the offset so that it fits in an ldp/stp instruction. */
11801 msize = GET_MODE_SIZE (mode);
11802 stp_off_limit = msize * 0x40;
11803 off_val = INTVAL (offset);
11804 abs_off = (off_val < 0) ? -off_val : off_val;
11805 new_off = abs_off % stp_off_limit;
11806 adj_off = abs_off - new_off;
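  /* Worked example (illustrative), matching the stores shown above
     aarch64_operands_adjust_ok_for_ldpstp: for SImode accesses at
     xb + 0x100 .. xb + 0x10c, msize is 4 and stp_off_limit is 0x100,
     so new_off becomes 0 and adj_off becomes 0x100; the scratch register
     is set to xb + 0x100 and the two ldp/stp offsets become 0 and 8.  */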
11807
11808 /* Further adjust to make sure all offsets are OK. */
11809 if ((new_off + msize * 2) >= stp_off_limit)
11810 {
11811 adj_off += stp_off_limit;
11812 new_off -= stp_off_limit;
11813 }
11814
11815 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11816 if (adj_off >= 0x1000)
11817 return false;
11818
11819 if (off_val < 0)
11820 {
11821 adj_off = -adj_off;
11822 new_off = -new_off;
11823 }
11824
11825 /* Create new memory references. */
11826 mem_1 = change_address (mem_1, VOIDmode,
11827 plus_constant (DImode, operands[8], new_off));
11828
11829 /* Check if the adjusted address is OK for ldp/stp. */
11830 if (!aarch64_mem_pair_operand (mem_1, mode))
11831 return false;
11832
11833 msize = GET_MODE_SIZE (mode);
11834 mem_2 = change_address (mem_2, VOIDmode,
11835 plus_constant (DImode,
11836 operands[8],
11837 new_off + msize));
11838 mem_3 = change_address (mem_3, VOIDmode,
11839 plus_constant (DImode,
11840 operands[8],
11841 new_off + msize * 2));
11842 mem_4 = change_address (mem_4, VOIDmode,
11843 plus_constant (DImode,
11844 operands[8],
11845 new_off + msize * 3));
11846
11847 if (code == ZERO_EXTEND)
11848 {
11849 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11850 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11851 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11852 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11853 }
11854 else if (code == SIGN_EXTEND)
11855 {
11856 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11857 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11858 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11859 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11860 }
11861
11862 if (load)
11863 {
11864 operands[1] = mem_1;
11865 operands[3] = mem_2;
11866 operands[5] = mem_3;
11867 operands[7] = mem_4;
11868 }
11869 else
11870 {
11871 operands[0] = mem_1;
11872 operands[2] = mem_2;
11873 operands[4] = mem_3;
11874 operands[6] = mem_4;
11875 }
11876
11877 /* Emit adjusting instruction. */
11878 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11879 /* Emit ldp/stp instructions. */
11880 t1 = gen_rtx_SET (operands[0], operands[1]);
11881 t2 = gen_rtx_SET (operands[2], operands[3]);
11882 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11883 t1 = gen_rtx_SET (operands[4], operands[5]);
11884 t2 = gen_rtx_SET (operands[6], operands[7]);
11885 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11886 return true;
11887 }
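/* Purely illustrative sketch (not in the original sources) of the effect of
   aarch64_gen_adjusted_ldpstp: for four consecutive DImode loads from
   [x0, #592] .. [x0, #616], with x9 standing for the scratch register passed
   in operands[8], the emitted RTL corresponds roughly to

       add  x9, x0, #512
       ldp  x2, x3, [x9, #80]
       ldp  x4, x5, [x9, #96]

   The destination register names here are arbitrary examples.  */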
11888
11889 /* Return true if a pseudo register should be created and used to hold
11890 the GOT address for PIC code. */
11891
11892 bool
11893 aarch64_use_pseudo_pic_reg (void)
11894 {
11895 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
11896 }
11897
11898 #undef TARGET_ADDRESS_COST
11899 #define TARGET_ADDRESS_COST aarch64_address_cost
11900
11901 /* This hook determines whether unnamed bitfields affect the alignment
11902 of the containing structure. It returns true if the structure
11903 should inherit the alignment requirements of an unnamed bitfield's
11904 type. */
11905 #undef TARGET_ALIGN_ANON_BITFIELD
11906 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
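/* Added illustration (not in the original sources): with this hook returning
   true, a structure such as

       struct s { char c; int : 4; };

   is expected to take the alignment of "int" because of the unnamed
   bit-field, rather than the alignment of "char" alone.  */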
11907
11908 #undef TARGET_ASM_ALIGNED_DI_OP
11909 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11910
11911 #undef TARGET_ASM_ALIGNED_HI_OP
11912 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11913
11914 #undef TARGET_ASM_ALIGNED_SI_OP
11915 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11916
11917 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11918 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11919 hook_bool_const_tree_hwi_hwi_const_tree_true
11920
11921 #undef TARGET_ASM_FILE_START
11922 #define TARGET_ASM_FILE_START aarch64_start_file
11923
11924 #undef TARGET_ASM_OUTPUT_MI_THUNK
11925 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11926
11927 #undef TARGET_ASM_SELECT_RTX_SECTION
11928 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11929
11930 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11931 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11932
11933 #undef TARGET_BUILD_BUILTIN_VA_LIST
11934 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11935
11936 #undef TARGET_CALLEE_COPIES
11937 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11938
11939 #undef TARGET_CAN_ELIMINATE
11940 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11941
11942 #undef TARGET_CANNOT_FORCE_CONST_MEM
11943 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11944
11945 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11946 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11947
11948 /* Only the least significant bit is used for initialization guard
11949 variables. */
11950 #undef TARGET_CXX_GUARD_MASK_BIT
11951 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
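/* Added illustration (not in the original sources): with this hook returning
   true, the check emitted for a guarded local static is conceptually

       if ((guard & 1) == 0)
         __cxa_guard_acquire (&guard);

   i.e. only the least significant bit is tested, rather than the whole
   first byte of the guard variable.  */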
11952
11953 #undef TARGET_C_MODE_FOR_SUFFIX
11954 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11955
11956 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11957 #undef TARGET_DEFAULT_TARGET_FLAGS
11958 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11959 #endif
11960
11961 #undef TARGET_CLASS_MAX_NREGS
11962 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11963
11964 #undef TARGET_BUILTIN_DECL
11965 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11966
11967 #undef TARGET_EXPAND_BUILTIN
11968 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11969
11970 #undef TARGET_EXPAND_BUILTIN_VA_START
11971 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11972
11973 #undef TARGET_FOLD_BUILTIN
11974 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11975
11976 #undef TARGET_FUNCTION_ARG
11977 #define TARGET_FUNCTION_ARG aarch64_function_arg
11978
11979 #undef TARGET_FUNCTION_ARG_ADVANCE
11980 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11981
11982 #undef TARGET_FUNCTION_ARG_BOUNDARY
11983 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11984
11985 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11986 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11987
11988 #undef TARGET_FUNCTION_VALUE
11989 #define TARGET_FUNCTION_VALUE aarch64_function_value
11990
11991 #undef TARGET_FUNCTION_VALUE_REGNO_P
11992 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11993
11994 #undef TARGET_FRAME_POINTER_REQUIRED
11995 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11996
11997 #undef TARGET_GIMPLE_FOLD_BUILTIN
11998 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11999
12000 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
12001 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
12002
12003 #undef TARGET_INIT_BUILTINS
12004 #define TARGET_INIT_BUILTINS aarch64_init_builtins
12005
12006 #undef TARGET_LEGITIMATE_ADDRESS_P
12007 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
12008
12009 #undef TARGET_LEGITIMATE_CONSTANT_P
12010 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
12011
12012 #undef TARGET_LIBGCC_CMP_RETURN_MODE
12013 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
12014
12015 #undef TARGET_LRA_P
12016 #define TARGET_LRA_P hook_bool_void_true
12017
12018 #undef TARGET_MANGLE_TYPE
12019 #define TARGET_MANGLE_TYPE aarch64_mangle_type
12020
12021 #undef TARGET_MEMORY_MOVE_COST
12022 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
12023
12024 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
12025 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
12026
12027 #undef TARGET_MUST_PASS_IN_STACK
12028 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
12029
12030 /* This target hook should return true if accesses to volatile bitfields
12031 should use the narrowest mode possible. It should return false if these
12032 accesses should use the bitfield container type. */
12033 #undef TARGET_NARROW_VOLATILE_BITFIELD
12034 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
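/* Added illustration (not in the original sources): because this hook returns
   false, an access to "f" in

       struct s { volatile int f : 8; };

   is expected to use the 32-bit container type of the bit-field rather than
   the narrowest (single-byte) mode that covers it.  */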
12035
12036 #undef TARGET_OPTION_OVERRIDE
12037 #define TARGET_OPTION_OVERRIDE aarch64_override_options
12038
12039 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
12040 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
12041 aarch64_override_options_after_change
12042
12043 #undef TARGET_PASS_BY_REFERENCE
12044 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
12045
12046 #undef TARGET_PREFERRED_RELOAD_CLASS
12047 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
12048
12049 #undef TARGET_SCHED_REASSOCIATION_WIDTH
12050 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
12051
12052 #undef TARGET_SECONDARY_RELOAD
12053 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
12054
12055 #undef TARGET_SHIFT_TRUNCATION_MASK
12056 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
12057
12058 #undef TARGET_SETUP_INCOMING_VARARGS
12059 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
12060
12061 #undef TARGET_STRUCT_VALUE_RTX
12062 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
12063
12064 #undef TARGET_REGISTER_MOVE_COST
12065 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
12066
12067 #undef TARGET_RETURN_IN_MEMORY
12068 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
12069
12070 #undef TARGET_RETURN_IN_MSB
12071 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
12072
12073 #undef TARGET_RTX_COSTS
12074 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
12075
12076 #undef TARGET_SCHED_ISSUE_RATE
12077 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
12078
12079 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
12080 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
12081 aarch64_sched_first_cycle_multipass_dfa_lookahead
12082
12083 #undef TARGET_TRAMPOLINE_INIT
12084 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
12085
12086 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
12087 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
12088
12089 #undef TARGET_VECTOR_MODE_SUPPORTED_P
12090 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
12091
12092 #undef TARGET_ARRAY_MODE_SUPPORTED_P
12093 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
12094
12095 #undef TARGET_VECTORIZE_ADD_STMT_COST
12096 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
12097
12098 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
12099 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
12100 aarch64_builtin_vectorization_cost
12101
12102 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
12103 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
12104
12105 #undef TARGET_VECTORIZE_BUILTINS
12106 #define TARGET_VECTORIZE_BUILTINS
12107
12108 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
12109 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
12110 aarch64_builtin_vectorized_function
12111
12112 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
12113 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
12114 aarch64_autovectorize_vector_sizes
12115
12116 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
12117 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
12118 aarch64_atomic_assign_expand_fenv
12119
12120 /* Section anchor support. */
12121
12122 #undef TARGET_MIN_ANCHOR_OFFSET
12123 #define TARGET_MIN_ANCHOR_OFFSET -256
12124
12125 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
12126 byte offset; we can do much more for larger data types, but have no way
12127 to determine the size of the access. We assume accesses are aligned. */
12128 #undef TARGET_MAX_ANCHOR_OFFSET
12129 #define TARGET_MAX_ANCHOR_OFFSET 4095
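/* Added illustration (not in the original sources): 4095 is the largest
   unsigned immediate offset for a single-byte access, e.g.

       ldrb  w0, [x1, #4095]

   Wider accesses scale a 12-bit immediate by the access size (up to 16380
   for a 32-bit ldr), but the size of an access through a section anchor is
   not known here, so the conservative byte limit is used.  */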
12130
12131 #undef TARGET_VECTOR_ALIGNMENT
12132 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
12133
12134 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
12135 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
12136 aarch64_simd_vector_alignment_reachable
12137
12138 /* vec_perm support. */
12139
12140 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
12141 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
12142 aarch64_vectorize_vec_perm_const_ok
12143
12144
12145 #undef TARGET_FIXED_CONDITION_CODE_REGS
12146 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
12147
12148 #undef TARGET_FLAGS_REGNUM
12149 #define TARGET_FLAGS_REGNUM CC_REGNUM
12150
12151 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
12152 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
12153
12154 #undef TARGET_ASAN_SHADOW_OFFSET
12155 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
12156
12157 #undef TARGET_LEGITIMIZE_ADDRESS
12158 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
12159
12160 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
12161 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
12162 aarch64_use_by_pieces_infrastructure_p
12163
12164 #undef TARGET_CAN_USE_DOLOOP_P
12165 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
12166
12167 #undef TARGET_SCHED_MACRO_FUSION_P
12168 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
12169
12170 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
12171 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
12172
12173 #undef TARGET_SCHED_FUSION_PRIORITY
12174 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
12175
12176 #undef TARGET_USE_PSEUDO_PIC_REG
12177 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
12178
12179 struct gcc_target targetm = TARGET_INITIALIZER;
12180
12181 #include "gt-aarch64.h"