[AArch64] Fix ICEs with -mgeneral-regs-only / -march=...+nofp
[gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "alias.h"
29 #include "symtab.h"
30 #include "tree.h"
31 #include "fold-const.h"
32 #include "stringpool.h"
33 #include "stor-layout.h"
34 #include "calls.h"
35 #include "varasm.h"
36 #include "regs.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "predict.h"
45 #include "basic-block.h"
46 #include "df.h"
47 #include "hard-reg-set.h"
48 #include "output.h"
49 #include "function.h"
50 #include "flags.h"
51 #include "insn-config.h"
52 #include "expmed.h"
53 #include "dojump.h"
54 #include "explow.h"
55 #include "emit-rtl.h"
56 #include "stmt.h"
57 #include "expr.h"
58 #include "reload.h"
59 #include "toplev.h"
60 #include "target.h"
61 #include "target-def.h"
62 #include "targhooks.h"
63 #include "tm_p.h"
64 #include "recog.h"
65 #include "langhooks.h"
66 #include "diagnostic-core.h"
67 #include "tree-ssa-alias.h"
68 #include "internal-fn.h"
69 #include "gimple-fold.h"
70 #include "tree-eh.h"
71 #include "gimple-expr.h"
72 #include "gimple.h"
73 #include "gimplify.h"
74 #include "optabs.h"
75 #include "dwarf2.h"
76 #include "cfgloop.h"
77 #include "tree-vectorizer.h"
78 #include "aarch64-cost-tables.h"
79 #include "dumpfile.h"
80 #include "builtins.h"
81 #include "rtl-iter.h"
82 #include "tm-constrs.h"
83 #include "sched-int.h"
84 #include "cortex-a57-fma-steering.h"
85
86 /* Defined for convenience. */
87 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
88
89 /* Classifies an address.
90
91 ADDRESS_REG_IMM
92 A simple base register plus immediate offset.
93
94 ADDRESS_REG_WB
95 A base register indexed by immediate offset with writeback.
96
97 ADDRESS_REG_REG
98 A base register indexed by (optionally scaled) register.
99
100 ADDRESS_REG_UXTW
101 A base register indexed by (optionally scaled) zero-extended register.
102
103 ADDRESS_REG_SXTW
104 A base register indexed by (optionally scaled) sign-extended register.
105
106 ADDRESS_LO_SUM
107 A LO_SUM rtx with a base register and "LO12" symbol relocation.
108
109      ADDRESS_SYMBOLIC
110 A constant symbolic address, in pc-relative literal pool. */
111
112 enum aarch64_address_type {
113 ADDRESS_REG_IMM,
114 ADDRESS_REG_WB,
115 ADDRESS_REG_REG,
116 ADDRESS_REG_UXTW,
117 ADDRESS_REG_SXTW,
118 ADDRESS_LO_SUM,
119 ADDRESS_SYMBOLIC
120 };
121
122 struct aarch64_address_info {
123 enum aarch64_address_type type;
124 rtx base;
125 rtx offset;
126 int shift;
127 enum aarch64_symbol_type symbol_type;
128 };
129
130 struct simd_immediate_info
131 {
132 rtx value;
133 int shift;
134 int element_width;
135 bool mvn;
136 bool msl;
137 };
138
139 /* The current code model. */
140 enum aarch64_code_model aarch64_cmodel;
141
142 #ifdef HAVE_AS_TLS
143 #undef TARGET_HAVE_TLS
144 #define TARGET_HAVE_TLS 1
145 #endif
146
147 static bool aarch64_composite_type_p (const_tree, machine_mode);
148 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
149 const_tree,
150 machine_mode *, int *,
151 bool *);
152 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
153 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
154 static void aarch64_override_options_after_change (void);
155 static bool aarch64_vector_mode_supported_p (machine_mode);
156 static unsigned bit_count (unsigned HOST_WIDE_INT);
157 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
158 const unsigned char *sel);
159 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
160
161 /* Major revision number of the ARM Architecture implemented by the target. */
162 unsigned aarch64_architecture_version;
163
164 /* The processor for which instructions should be scheduled. */
165 enum aarch64_processor aarch64_tune = cortexa53;
166
167 /* The current tuning set. */
168 const struct tune_params *aarch64_tune_params;
169
170 /* Mask to specify which instructions we are allowed to generate. */
171 unsigned long aarch64_isa_flags = 0;
172
173 /* Mask to specify which instruction scheduling options should be used. */
174 unsigned long aarch64_tune_flags = 0;
175
176 /* Tuning parameters. */
177
178 static const struct cpu_addrcost_table generic_addrcost_table =
179 {
180 {
181 0, /* hi */
182 0, /* si */
183 0, /* di */
184 0, /* ti */
185 },
186 0, /* pre_modify */
187 0, /* post_modify */
188 0, /* register_offset */
189 0, /* register_extend */
190 0 /* imm_offset */
191 };
192
193 static const struct cpu_addrcost_table cortexa57_addrcost_table =
194 {
195 {
196 1, /* hi */
197 0, /* si */
198 0, /* di */
199 1, /* ti */
200 },
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_extend */
205 0, /* imm_offset */
206 };
207
208 static const struct cpu_addrcost_table xgene1_addrcost_table =
209 {
210 {
211 1, /* hi */
212 0, /* si */
213 0, /* di */
214 1, /* ti */
215 },
216 1, /* pre_modify */
217 0, /* post_modify */
218 0, /* register_offset */
219 1, /* register_extend */
220 0, /* imm_offset */
221 };
222
223 static const struct cpu_regmove_cost generic_regmove_cost =
224 {
225 1, /* GP2GP */
226 /* Avoid the use of slow int<->fp moves for spilling by setting
227 their cost higher than memmov_cost. */
228 5, /* GP2FP */
229 5, /* FP2GP */
230 2 /* FP2FP */
231 };
232
233 static const struct cpu_regmove_cost cortexa57_regmove_cost =
234 {
235 1, /* GP2GP */
236 /* Avoid the use of slow int<->fp moves for spilling by setting
237 their cost higher than memmov_cost. */
238 5, /* GP2FP */
239 5, /* FP2GP */
240 2 /* FP2FP */
241 };
242
243 static const struct cpu_regmove_cost cortexa53_regmove_cost =
244 {
245 1, /* GP2GP */
246 /* Avoid the use of slow int<->fp moves for spilling by setting
247 their cost higher than memmov_cost. */
248 5, /* GP2FP */
249 5, /* FP2GP */
250 2 /* FP2FP */
251 };
252
253 static const struct cpu_regmove_cost thunderx_regmove_cost =
254 {
255 2, /* GP2GP */
256 2, /* GP2FP */
257 6, /* FP2GP */
258 4 /* FP2FP */
259 };
260
261 static const struct cpu_regmove_cost xgene1_regmove_cost =
262 {
263 1, /* GP2GP */
264 /* Avoid the use of slow int<->fp moves for spilling by setting
265 their cost higher than memmov_cost. */
266 8, /* GP2FP */
267 8, /* FP2GP */
268 2 /* FP2FP */
269 };
270
271 /* Generic costs for vector insn classes. */
272 static const struct cpu_vector_cost generic_vector_cost =
273 {
274 1, /* scalar_stmt_cost */
275 1, /* scalar_load_cost */
276 1, /* scalar_store_cost */
277 1, /* vec_stmt_cost */
278 1, /* vec_to_scalar_cost */
279 1, /* scalar_to_vec_cost */
280 1, /* vec_align_load_cost */
281 1, /* vec_unalign_load_cost */
282 1, /* vec_unalign_store_cost */
283 1, /* vec_store_cost */
284 3, /* cond_taken_branch_cost */
285 1 /* cond_not_taken_branch_cost */
286 };
287
288 /* Costs for Cortex-A57 vector insn classes. */
289 static const struct cpu_vector_cost cortexa57_vector_cost =
290 {
291 1, /* scalar_stmt_cost */
292 4, /* scalar_load_cost */
293 1, /* scalar_store_cost */
294 3, /* vec_stmt_cost */
295 8, /* vec_to_scalar_cost */
296 8, /* scalar_to_vec_cost */
297 5, /* vec_align_load_cost */
298 5, /* vec_unalign_load_cost */
299 1, /* vec_unalign_store_cost */
300 1, /* vec_store_cost */
301 1, /* cond_taken_branch_cost */
302 1 /* cond_not_taken_branch_cost */
303 };
304
305 /* Costs for X-Gene 1 vector insn classes. */
306 static const struct cpu_vector_cost xgene1_vector_cost =
307 {
308 1, /* scalar_stmt_cost */
309 5, /* scalar_load_cost */
310 1, /* scalar_store_cost */
311 2, /* vec_stmt_cost */
312 4, /* vec_to_scalar_cost */
313 4, /* scalar_to_vec_cost */
314 10, /* vec_align_load_cost */
315 10, /* vec_unalign_load_cost */
316 2, /* vec_unalign_store_cost */
317 2, /* vec_store_cost */
318 2, /* cond_taken_branch_cost */
319 1 /* cond_not_taken_branch_cost */
320 };
321
322 #define AARCH64_FUSE_NOTHING (0)
323 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
324 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
325 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
326 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
327 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
328
329 /* Generic costs for branch instructions. */
330 static const struct cpu_branch_cost generic_branch_cost =
331 {
332 2, /* Predictable. */
333 2 /* Unpredictable. */
334 };
335
336 static const struct tune_params generic_tunings =
337 {
338 &cortexa57_extra_costs,
339 &generic_addrcost_table,
340 &generic_regmove_cost,
341 &generic_vector_cost,
342 &generic_branch_cost,
343 4, /* memmov_cost */
344 2, /* issue_rate */
345 AARCH64_FUSE_NOTHING, /* fusible_ops */
346 8, /* function_align. */
347 8, /* jump_align. */
348 4, /* loop_align. */
349 2, /* int_reassoc_width. */
350 4, /* fp_reassoc_width. */
351 1, /* vec_reassoc_width. */
352 2, /* min_div_recip_mul_sf. */
353 2 /* min_div_recip_mul_df. */
354 };
355
356 static const struct tune_params cortexa53_tunings =
357 {
358 &cortexa53_extra_costs,
359 &generic_addrcost_table,
360 &cortexa53_regmove_cost,
361 &generic_vector_cost,
362 &generic_branch_cost,
363 4, /* memmov_cost */
364 2, /* issue_rate */
365 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
366 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
367 8, /* function_align. */
368 8, /* jump_align. */
369 4, /* loop_align. */
370 2, /* int_reassoc_width. */
371 4, /* fp_reassoc_width. */
372 1, /* vec_reassoc_width. */
373 2, /* min_div_recip_mul_sf. */
374 2 /* min_div_recip_mul_df. */
375 };
376
377 static const struct tune_params cortexa57_tunings =
378 {
379 &cortexa57_extra_costs,
380 &cortexa57_addrcost_table,
381 &cortexa57_regmove_cost,
382 &cortexa57_vector_cost,
383 &generic_branch_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1, /* vec_reassoc_width. */
394 2, /* min_div_recip_mul_sf. */
395 2 /* min_div_recip_mul_df. */
396 };
397
398 static const struct tune_params thunderx_tunings =
399 {
400 &thunderx_extra_costs,
401 &generic_addrcost_table,
402 &thunderx_regmove_cost,
403 &generic_vector_cost,
404 &generic_branch_cost,
405 6, /* memmov_cost */
406 2, /* issue_rate */
407 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
408 8, /* function_align. */
409 8, /* jump_align. */
410 8, /* loop_align. */
411 2, /* int_reassoc_width. */
412 4, /* fp_reassoc_width. */
413 1, /* vec_reassoc_width. */
414 2, /* min_div_recip_mul_sf. */
415 2 /* min_div_recip_mul_df. */
416 };
417
418 static const struct tune_params xgene1_tunings =
419 {
420 &xgene1_extra_costs,
421 &xgene1_addrcost_table,
422 &xgene1_regmove_cost,
423 &xgene1_vector_cost,
424 &generic_branch_cost,
425 6, /* memmov_cost */
426 4, /* issue_rate */
427 AARCH64_FUSE_NOTHING, /* fusible_ops */
428 16, /* function_align. */
429 8, /* jump_align. */
430 16, /* loop_align. */
431 2, /* int_reassoc_width. */
432 4, /* fp_reassoc_width. */
433 1, /* vec_reassoc_width. */
434 2, /* min_div_recip_mul_sf. */
435 2 /* min_div_recip_mul_df. */
436 };
437
438 /* A processor implementing AArch64. */
439 struct processor
440 {
441 const char *const name;
442 enum aarch64_processor core;
443 const char *arch;
444 unsigned architecture_version;
445 const unsigned long flags;
446 const struct tune_params *const tune;
447 };
448
449 /* Processor cores implementing AArch64. */
450 static const struct processor all_cores[] =
451 {
452 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
453 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
454 #include "aarch64-cores.def"
455 #undef AARCH64_CORE
456 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
457 {NULL, aarch64_none, NULL, 0, 0, NULL}
458 };
459
460 /* Architectures implementing AArch64. */
461 static const struct processor all_architectures[] =
462 {
463 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
464 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
465 #include "aarch64-arches.def"
466 #undef AARCH64_ARCH
467 {NULL, aarch64_none, NULL, 0, 0, NULL}
468 };
469
470 /* Target specification.  These are populated as command-line arguments
471 are processed, or NULL if not specified. */
472 static const struct processor *selected_arch;
473 static const struct processor *selected_cpu;
474 static const struct processor *selected_tune;
475
476 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
477
478 /* An ISA extension in the co-processor and main instruction set space. */
479 struct aarch64_option_extension
480 {
481 const char *const name;
482 const unsigned long flags_on;
483 const unsigned long flags_off;
484 };
485
486 /* ISA extensions in AArch64. */
487 static const struct aarch64_option_extension all_extensions[] =
488 {
489 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
490 {NAME, FLAGS_ON, FLAGS_OFF},
491 #include "aarch64-option-extensions.def"
492 #undef AARCH64_OPT_EXTENSION
493 {NULL, 0, 0}
494 };
495
496 /* Used to track the size of an address when generating a pre/post
497 increment address. */
498 static machine_mode aarch64_memory_reference_mode;
499
500 /* A table of valid AArch64 "bitmask immediate" values for
501 logical instructions. */
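/* Such an immediate consists of a contiguous run of ones within a 2, 4, 8,
   16, 32 or 64-bit element, optionally rotated within the element, and
   replicated across the 64-bit value; 0x00ff00ff00ff00ff is one example.  */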
502
503 #define AARCH64_NUM_BITMASKS 5334
504 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
505
506 typedef enum aarch64_cond_code
507 {
508 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
509 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
510 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
511 }
512 aarch64_cc;
513
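/* The condition codes are laid out in inverse pairs (EQ/NE, CS/CC, MI/PL,
   ...), so flipping the low bit of a code yields its inverse.  */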
514 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
515
516 /* The condition codes of the processor, and the inverse function. */
517 static const char * const aarch64_condition_codes[] =
518 {
519 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
520 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
521 };
522
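/* Report an error: MODE (a floating-point or vector mode) needs the FP/SIMD
   register file, which is unavailable because of -mgeneral-regs-only or the
   +nofp feature modifier.  MSG describes the offending construct.  */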
523 void
524 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
525 {
526 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
527 if (TARGET_GENERAL_REGS_ONLY)
528 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
529 else
530 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
531 }
532
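/* Return the minimum number of divisions by the same divisor that makes it
   worthwhile to replace them with a reciprocal multiplication, taken from
   the current tuning parameters.  */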
533 static unsigned int
534 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
535 {
536 if (GET_MODE_UNIT_SIZE (mode) == 4)
537 return aarch64_tune_params->min_div_recip_mul_sf;
538 return aarch64_tune_params->min_div_recip_mul_df;
539 }
540
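/* Return the reassociation width the current tuning parameters prefer for
   operations in MODE.  */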
541 static int
542 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
543 enum machine_mode mode)
544 {
545 if (VECTOR_MODE_P (mode))
546 return aarch64_tune_params->vec_reassoc_width;
547 if (INTEGRAL_MODE_P (mode))
548 return aarch64_tune_params->int_reassoc_width;
549 if (FLOAT_MODE_P (mode))
550 return aarch64_tune_params->fp_reassoc_width;
551 return 1;
552 }
553
554 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
555 unsigned
556 aarch64_dbx_register_number (unsigned regno)
557 {
558 if (GP_REGNUM_P (regno))
559 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
560 else if (regno == SP_REGNUM)
561 return AARCH64_DWARF_SP;
562 else if (FP_REGNUM_P (regno))
563 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
564
565 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
566 equivalent DWARF register. */
567 return DWARF_FRAME_REGISTERS;
568 }
569
570 /* Return TRUE if MODE is any of the large INT modes. */
571 static bool
572 aarch64_vect_struct_mode_p (machine_mode mode)
573 {
574 return mode == OImode || mode == CImode || mode == XImode;
575 }
576
577 /* Return TRUE if MODE is any of the vector modes. */
578 static bool
579 aarch64_vector_mode_p (machine_mode mode)
580 {
581 return aarch64_vector_mode_supported_p (mode)
582 || aarch64_vect_struct_mode_p (mode);
583 }
584
585 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
586 static bool
587 aarch64_array_mode_supported_p (machine_mode mode,
588 unsigned HOST_WIDE_INT nelems)
589 {
590 if (TARGET_SIMD
591 && AARCH64_VALID_SIMD_QREG_MODE (mode)
592 && (nelems >= 2 && nelems <= 4))
593 return true;
594
595 return false;
596 }
597
598 /* Implement HARD_REGNO_NREGS. */
599
600 int
601 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
602 {
603 switch (aarch64_regno_regclass (regno))
604 {
605 case FP_REGS:
606 case FP_LO_REGS:
607 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
608 default:
609 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
610 }
611 gcc_unreachable ();
612 }
613
614 /* Implement HARD_REGNO_MODE_OK. */
615
616 int
617 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
618 {
619 if (GET_MODE_CLASS (mode) == MODE_CC)
620 return regno == CC_REGNUM;
621
622 if (regno == SP_REGNUM)
623 /* The purpose of comparing with ptr_mode is to support the
624 global register variable associated with the stack pointer
625 register via the syntax of asm ("wsp") in ILP32. */
626 return mode == Pmode || mode == ptr_mode;
627
628 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
629 return mode == Pmode;
630
631 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
632 return 1;
633
634 if (FP_REGNUM_P (regno))
635 {
636 if (aarch64_vect_struct_mode_p (mode))
637 return
638 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
639 else
640 return 1;
641 }
642
643 return 0;
644 }
645
646 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
647 machine_mode
648 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
649 machine_mode mode)
650 {
651 /* Handle modes that fit within single registers. */
652 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
653 {
654 if (GET_MODE_SIZE (mode) >= 4)
655 return mode;
656 else
657 return SImode;
658 }
659 /* Fall back to generic for multi-reg and very large modes. */
660 else
661 return choose_hard_reg_mode (regno, nregs, false);
662 }
663
664 /* Return true if calls to DECL should be treated as
665    long-calls (i.e. called via a register). */
666 static bool
667 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
668 {
669 return false;
670 }
671
672 /* Return true if calls to symbol-ref SYM should be treated as
673    long-calls (i.e. called via a register). */
674 bool
675 aarch64_is_long_call_p (rtx sym)
676 {
677 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
678 }
679
680 /* Return true if the offsets to a zero/sign-extract operation
681 represent an expression that matches an extend operation. The
682    operands represent the parameters from
683
684 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
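/* For example, with MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode, the
   extract is equivalent to zero-extending the low 32 bits of the register
   and shifting the result left by 2, i.e. an extended-register UXTW #2
   operand.  */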
685 bool
686 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
687 rtx extract_imm)
688 {
689 HOST_WIDE_INT mult_val, extract_val;
690
691 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
692 return false;
693
694 mult_val = INTVAL (mult_imm);
695 extract_val = INTVAL (extract_imm);
696
697 if (extract_val > 8
698 && extract_val < GET_MODE_BITSIZE (mode)
699 && exact_log2 (extract_val & ~7) > 0
700 && (extract_val & 7) <= 4
701 && mult_val == (1 << (extract_val & 7)))
702 return true;
703
704 return false;
705 }
706
707 /* Emit an insn that's a simple single-set. Both the operands must be
708 known to be valid. */
709 inline static rtx
710 emit_set_insn (rtx x, rtx y)
711 {
712 return emit_insn (gen_rtx_SET (x, y));
713 }
714
715 /* X and Y are two things to compare using CODE. Emit the compare insn and
716 return the rtx for register 0 in the proper mode. */
717 rtx
718 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
719 {
720 machine_mode mode = SELECT_CC_MODE (code, x, y);
721 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
722
723 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
724 return cc_reg;
725 }
726
727 /* Build the SYMBOL_REF for __tls_get_addr. */
728
729 static GTY(()) rtx tls_get_addr_libfunc;
730
731 rtx
732 aarch64_tls_get_addr (void)
733 {
734 if (!tls_get_addr_libfunc)
735 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
736 return tls_get_addr_libfunc;
737 }
738
739 /* Return the TLS model to use for ADDR. */
740
741 static enum tls_model
742 tls_symbolic_operand_type (rtx addr)
743 {
744 enum tls_model tls_kind = TLS_MODEL_NONE;
745 rtx sym, addend;
746
747 if (GET_CODE (addr) == CONST)
748 {
749 split_const (addr, &sym, &addend);
750 if (GET_CODE (sym) == SYMBOL_REF)
751 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
752 }
753 else if (GET_CODE (addr) == SYMBOL_REF)
754 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
755
756 return tls_kind;
757 }
758
759 /* We'll allow lo_sum's in our legitimate addresses so that combine
760    can take care of combining addresses where necessary, but for
761    generation purposes, we'll generate the address
762    as:
763 RTL Absolute
764 tmp = hi (symbol_ref); adrp x1, foo
765 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
766 nop
767
768 PIC TLS
769 adrp x1, :got:foo adrp tmp, :tlsgd:foo
770 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
771 bl __tls_get_addr
772 nop
773
774 Load TLS symbol, depending on TLS mechanism and TLS access model.
775
776 Global Dynamic - Traditional TLS:
777 adrp tmp, :tlsgd:imm
778 add dest, tmp, #:tlsgd_lo12:imm
779 bl __tls_get_addr
780
781 Global Dynamic - TLS Descriptors:
782 adrp dest, :tlsdesc:imm
783 ldr tmp, [dest, #:tlsdesc_lo12:imm]
784 add dest, dest, #:tlsdesc_lo12:imm
785 blr tmp
786 mrs tp, tpidr_el0
787 add dest, dest, tp
788
789 Initial Exec:
790 mrs tp, tpidr_el0
791 adrp tmp, :gottprel:imm
792 ldr dest, [tmp, #:gottprel_lo12:imm]
793 add dest, dest, tp
794
795 Local Exec:
796 mrs tp, tpidr_el0
797 add t0, tp, #:tprel_hi12:imm, lsl #12
798 add t0, t0, #:tprel_lo12_nc:imm
799 */
800
801 static void
802 aarch64_load_symref_appropriately (rtx dest, rtx imm,
803 enum aarch64_symbol_type type)
804 {
805 switch (type)
806 {
807 case SYMBOL_SMALL_ABSOLUTE:
808 {
809 /* In ILP32, the mode of dest can be either SImode or DImode. */
810 rtx tmp_reg = dest;
811 machine_mode mode = GET_MODE (dest);
812
813 gcc_assert (mode == Pmode || mode == ptr_mode);
814
815 if (can_create_pseudo_p ())
816 tmp_reg = gen_reg_rtx (mode);
817
818 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
819 emit_insn (gen_add_losym (dest, tmp_reg, imm));
820 return;
821 }
822
823 case SYMBOL_TINY_ABSOLUTE:
824 emit_insn (gen_rtx_SET (dest, imm));
825 return;
826
827 case SYMBOL_SMALL_GOT:
828 {
829 /* In ILP32, the mode of dest can be either SImode or DImode,
830 while the got entry is always of SImode size. The mode of
831 dest depends on how dest is used: if dest is assigned to a
832 pointer (e.g. in the memory), it has SImode; it may have
833        DImode if dest is dereferenced to access the memory.
834 This is why we have to handle three different ldr_got_small
835 patterns here (two patterns for ILP32). */
836 rtx tmp_reg = dest;
837 machine_mode mode = GET_MODE (dest);
838
839 if (can_create_pseudo_p ())
840 tmp_reg = gen_reg_rtx (mode);
841
842 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
843 if (mode == ptr_mode)
844 {
845 if (mode == DImode)
846 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
847 else
848 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
849 }
850 else
851 {
852 gcc_assert (mode == Pmode);
853 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
854 }
855
856 return;
857 }
858
859 case SYMBOL_SMALL_TLSGD:
860 {
861 rtx_insn *insns;
862 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
863
864 start_sequence ();
865 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
866 insns = get_insns ();
867 end_sequence ();
868
869 RTL_CONST_CALL_P (insns) = 1;
870 emit_libcall_block (insns, dest, result, imm);
871 return;
872 }
873
874 case SYMBOL_SMALL_TLSDESC:
875 {
876 machine_mode mode = GET_MODE (dest);
877 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
878 rtx tp;
879
880 gcc_assert (mode == Pmode || mode == ptr_mode);
881
882 /* In ILP32, the got entry is always of SImode size. Unlike
883 small GOT, the dest is fixed at reg 0. */
884 if (TARGET_ILP32)
885 emit_insn (gen_tlsdesc_small_si (imm));
886 else
887 emit_insn (gen_tlsdesc_small_di (imm));
888 tp = aarch64_load_tp (NULL);
889
890 if (mode != Pmode)
891 tp = gen_lowpart (mode, tp);
892
893 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
894 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
895 return;
896 }
897
898 case SYMBOL_SMALL_GOTTPREL:
899 {
900 /* In ILP32, the mode of dest can be either SImode or DImode,
901 while the got entry is always of SImode size. The mode of
902 dest depends on how dest is used: if dest is assigned to a
903 pointer (e.g. in the memory), it has SImode; it may have
904        DImode if dest is dereferenced to access the memory.
905 This is why we have to handle three different tlsie_small
906 patterns here (two patterns for ILP32). */
907 machine_mode mode = GET_MODE (dest);
908 rtx tmp_reg = gen_reg_rtx (mode);
909 rtx tp = aarch64_load_tp (NULL);
910
911 if (mode == ptr_mode)
912 {
913 if (mode == DImode)
914 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
915 else
916 {
917 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
918 tp = gen_lowpart (mode, tp);
919 }
920 }
921 else
922 {
923 gcc_assert (mode == Pmode);
924 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
925 }
926
927 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
928 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
929 return;
930 }
931
932 case SYMBOL_SMALL_TPREL:
933 {
934 rtx tp = aarch64_load_tp (NULL);
935
936 if (GET_MODE (dest) != Pmode)
937 tp = gen_lowpart (GET_MODE (dest), tp);
938
939 emit_insn (gen_tlsle_small (dest, tp, imm));
940 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
941 return;
942 }
943
944 case SYMBOL_TINY_GOT:
945 emit_insn (gen_ldr_got_tiny (dest, imm));
946 return;
947
948 default:
949 gcc_unreachable ();
950 }
951 }
952
953 /* Emit a move from SRC to DEST. Assume that the move expanders can
954 handle all moves if !can_create_pseudo_p (). The distinction is
955 important because, unlike emit_move_insn, the move expanders know
956 how to force Pmode objects into the constant pool even when the
957 constant pool address is not itself legitimate. */
958 static rtx
959 aarch64_emit_move (rtx dest, rtx src)
960 {
961 return (can_create_pseudo_p ()
962 ? emit_move_insn (dest, src)
963 : emit_move_insn_1 (dest, src));
964 }
965
966 /* Split a 128-bit move operation into two 64-bit move operations,
967 taking care to handle partial overlap of register to register
968 copies. Special cases are needed when moving between GP regs and
969 FP regs. SRC can be a register, constant or memory; DST a register
970 or memory. If either operand is memory it must not have any side
971 effects. */
972 void
973 aarch64_split_128bit_move (rtx dst, rtx src)
974 {
975 rtx dst_lo, dst_hi;
976 rtx src_lo, src_hi;
977
978 machine_mode mode = GET_MODE (dst);
979
980 gcc_assert (mode == TImode || mode == TFmode);
981 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
982 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
983
984 if (REG_P (dst) && REG_P (src))
985 {
986 int src_regno = REGNO (src);
987 int dst_regno = REGNO (dst);
988
989 /* Handle FP <-> GP regs. */
990 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
991 {
992 src_lo = gen_lowpart (word_mode, src);
993 src_hi = gen_highpart (word_mode, src);
994
995 if (mode == TImode)
996 {
997 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
998 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
999 }
1000 else
1001 {
1002 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1003 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1004 }
1005 return;
1006 }
1007 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1008 {
1009 dst_lo = gen_lowpart (word_mode, dst);
1010 dst_hi = gen_highpart (word_mode, dst);
1011
1012 if (mode == TImode)
1013 {
1014 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1015 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1016 }
1017 else
1018 {
1019 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1020 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1021 }
1022 return;
1023 }
1024 }
1025
1026 dst_lo = gen_lowpart (word_mode, dst);
1027 dst_hi = gen_highpart (word_mode, dst);
1028 src_lo = gen_lowpart (word_mode, src);
1029 src_hi = gen_highpart_mode (word_mode, mode, src);
1030
1031 /* At most one pairing may overlap. */
1032 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1033 {
1034 aarch64_emit_move (dst_hi, src_hi);
1035 aarch64_emit_move (dst_lo, src_lo);
1036 }
1037 else
1038 {
1039 aarch64_emit_move (dst_lo, src_lo);
1040 aarch64_emit_move (dst_hi, src_hi);
1041 }
1042 }
1043
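/* Return true if a 128-bit move from SRC to DST should be split into two
   64-bit moves; only a copy between two FP/SIMD registers can be done with
   a single instruction.  */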
1044 bool
1045 aarch64_split_128bit_move_p (rtx dst, rtx src)
1046 {
1047 return (! REG_P (src)
1048 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1049 }
1050
1051 /* Split a complex SIMD combine. */
1052
1053 void
1054 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1055 {
1056 machine_mode src_mode = GET_MODE (src1);
1057 machine_mode dst_mode = GET_MODE (dst);
1058
1059 gcc_assert (VECTOR_MODE_P (dst_mode));
1060
1061 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1062 {
1063 rtx (*gen) (rtx, rtx, rtx);
1064
1065 switch (src_mode)
1066 {
1067 case V8QImode:
1068 gen = gen_aarch64_simd_combinev8qi;
1069 break;
1070 case V4HImode:
1071 gen = gen_aarch64_simd_combinev4hi;
1072 break;
1073 case V2SImode:
1074 gen = gen_aarch64_simd_combinev2si;
1075 break;
1076 case V2SFmode:
1077 gen = gen_aarch64_simd_combinev2sf;
1078 break;
1079 case DImode:
1080 gen = gen_aarch64_simd_combinedi;
1081 break;
1082 case DFmode:
1083 gen = gen_aarch64_simd_combinedf;
1084 break;
1085 default:
1086 gcc_unreachable ();
1087 }
1088
1089 emit_insn (gen (dst, src1, src2));
1090 return;
1091 }
1092 }
1093
1094 /* Split a complex SIMD move. */
1095
1096 void
1097 aarch64_split_simd_move (rtx dst, rtx src)
1098 {
1099 machine_mode src_mode = GET_MODE (src);
1100 machine_mode dst_mode = GET_MODE (dst);
1101
1102 gcc_assert (VECTOR_MODE_P (dst_mode));
1103
1104 if (REG_P (dst) && REG_P (src))
1105 {
1106 rtx (*gen) (rtx, rtx);
1107
1108 gcc_assert (VECTOR_MODE_P (src_mode));
1109
1110 switch (src_mode)
1111 {
1112 case V16QImode:
1113 gen = gen_aarch64_split_simd_movv16qi;
1114 break;
1115 case V8HImode:
1116 gen = gen_aarch64_split_simd_movv8hi;
1117 break;
1118 case V4SImode:
1119 gen = gen_aarch64_split_simd_movv4si;
1120 break;
1121 case V2DImode:
1122 gen = gen_aarch64_split_simd_movv2di;
1123 break;
1124 case V4SFmode:
1125 gen = gen_aarch64_split_simd_movv4sf;
1126 break;
1127 case V2DFmode:
1128 gen = gen_aarch64_split_simd_movv2df;
1129 break;
1130 default:
1131 gcc_unreachable ();
1132 }
1133
1134 emit_insn (gen (dst, src));
1135 return;
1136 }
1137 }
1138
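/* Return VALUE in a register of mode MODE, creating a new pseudo when
   possible; otherwise move VALUE into the existing register X and
   return X.  */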
1139 static rtx
1140 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1141 {
1142 if (can_create_pseudo_p ())
1143 return force_reg (mode, value);
1144 else
1145 {
1146 x = aarch64_emit_move (x, value);
1147 return x;
1148 }
1149 }
1150
1151
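/* Return an rtx equivalent to REG + OFFSET in mode MODE.  If OFFSET is not
   a legitimate add immediate, load it into the scratch register TEMP
   first.  */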
1152 static rtx
1153 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1154 {
1155 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1156 {
1157 rtx high;
1158 /* Load the full offset into a register. This
1159 might be improvable in the future. */
1160 high = GEN_INT (offset);
1161 offset = 0;
1162 high = aarch64_force_temporary (mode, temp, high);
1163 reg = aarch64_force_temporary (mode, temp,
1164 gen_rtx_PLUS (mode, high, reg));
1165 }
1166 return plus_constant (mode, reg, offset);
1167 }
1168
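/* Build the integer constant IMM of mode MODE into register DEST and return
   the number of instructions needed to do so.  If GENERATE is false, only
   count the instructions without emitting them.  */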
1169 static int
1170 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1171 machine_mode mode)
1172 {
1173 unsigned HOST_WIDE_INT mask;
1174 int i;
1175 bool first;
1176 unsigned HOST_WIDE_INT val;
1177 bool subtargets;
1178 rtx subtarget;
1179 int one_match, zero_match, first_not_ffff_match;
1180 int num_insns = 0;
1181
1182 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1183 {
1184 if (generate)
1185 emit_insn (gen_rtx_SET (dest, imm));
1186 num_insns++;
1187 return num_insns;
1188 }
1189
1190 if (mode == SImode)
1191 {
1192 /* We know we can't do this in 1 insn, and we must be able to do it
1193 in two; so don't mess around looking for sequences that don't buy
1194 us anything. */
1195 if (generate)
1196 {
1197 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1198 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1199 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1200 }
1201 num_insns += 2;
1202 return num_insns;
1203 }
1204
1205 /* Remaining cases are all for DImode. */
1206
1207 val = INTVAL (imm);
1208 subtargets = optimize && can_create_pseudo_p ();
1209
1210 one_match = 0;
1211 zero_match = 0;
1212 mask = 0xffff;
1213 first_not_ffff_match = -1;
1214
1215 for (i = 0; i < 64; i += 16, mask <<= 16)
1216 {
1217 if ((val & mask) == mask)
1218 one_match++;
1219 else
1220 {
1221 if (first_not_ffff_match < 0)
1222 first_not_ffff_match = i;
1223 if ((val & mask) == 0)
1224 zero_match++;
1225 }
1226 }
1227
1228 if (one_match == 2)
1229 {
1230 /* Set one of the quarters and then insert back into result. */
1231 mask = 0xffffll << first_not_ffff_match;
1232 if (generate)
1233 {
1234 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1235 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1236 GEN_INT ((val >> first_not_ffff_match)
1237 & 0xffff)));
1238 }
1239 num_insns += 2;
1240 return num_insns;
1241 }
1242
1243 if (zero_match == 2)
1244 goto simple_sequence;
1245
1246 mask = 0x0ffff0000UL;
1247 for (i = 16; i < 64; i += 16, mask <<= 16)
1248 {
1249 HOST_WIDE_INT comp = mask & ~(mask - 1);
1250
1251 if (aarch64_uimm12_shift (val - (val & mask)))
1252 {
1253 if (generate)
1254 {
1255 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1256 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1257 emit_insn (gen_adddi3 (dest, subtarget,
1258 GEN_INT (val - (val & mask))));
1259 }
1260 num_insns += 2;
1261 return num_insns;
1262 }
1263 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1264 {
1265 if (generate)
1266 {
1267 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1268 emit_insn (gen_rtx_SET (subtarget,
1269 GEN_INT ((val + comp) & mask)));
1270 emit_insn (gen_adddi3 (dest, subtarget,
1271 GEN_INT (val - ((val + comp) & mask))));
1272 }
1273 num_insns += 2;
1274 return num_insns;
1275 }
1276 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1277 {
1278 if (generate)
1279 {
1280 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1281 emit_insn (gen_rtx_SET (subtarget,
1282 GEN_INT ((val - comp) | ~mask)));
1283 emit_insn (gen_adddi3 (dest, subtarget,
1284 GEN_INT (val - ((val - comp) | ~mask))));
1285 }
1286 num_insns += 2;
1287 return num_insns;
1288 }
1289 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1290 {
1291 if (generate)
1292 {
1293 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1294 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1295 emit_insn (gen_adddi3 (dest, subtarget,
1296 GEN_INT (val - (val | ~mask))));
1297 }
1298 num_insns += 2;
1299 return num_insns;
1300 }
1301 }
1302
1303 /* See if we can do it by arithmetically combining two
1304 immediates. */
1305 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1306 {
1307 int j;
1308 mask = 0xffff;
1309
1310 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1311 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1312 {
1313 if (generate)
1314 {
1315 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1316 emit_insn (gen_rtx_SET (subtarget,
1317 GEN_INT (aarch64_bitmasks[i])));
1318 emit_insn (gen_adddi3 (dest, subtarget,
1319 GEN_INT (val - aarch64_bitmasks[i])));
1320 }
1321 num_insns += 2;
1322 return num_insns;
1323 }
1324
1325 for (j = 0; j < 64; j += 16, mask <<= 16)
1326 {
1327 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1328 {
1329 if (generate)
1330 {
1331 emit_insn (gen_rtx_SET (dest,
1332 GEN_INT (aarch64_bitmasks[i])));
1333 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1334 GEN_INT ((val >> j) & 0xffff)));
1335 }
1336 num_insns += 2;
1337 return num_insns;
1338 }
1339 }
1340 }
1341
1342 /* See if we can do it by logically combining two immediates. */
1343 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1344 {
1345 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1346 {
1347 int j;
1348
1349 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1350 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1351 {
1352 if (generate)
1353 {
1354 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1355 emit_insn (gen_rtx_SET (subtarget,
1356 GEN_INT (aarch64_bitmasks[i])));
1357 emit_insn (gen_iordi3 (dest, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 }
1360 num_insns += 2;
1361 return num_insns;
1362 }
1363 }
1364 else if ((val & aarch64_bitmasks[i]) == val)
1365 {
1366 int j;
1367
1368 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1369 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1370 {
1371 if (generate)
1372 {
1373 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1374 emit_insn (gen_rtx_SET (subtarget,
1375 GEN_INT (aarch64_bitmasks[j])));
1376 emit_insn (gen_anddi3 (dest, subtarget,
1377 GEN_INT (aarch64_bitmasks[i])));
1378 }
1379 num_insns += 2;
1380 return num_insns;
1381 }
1382 }
1383 }
1384
1385 if (one_match > zero_match)
1386 {
1387       /* Set either the first three quarters or all but the third.  */
1388 mask = 0xffffll << (16 - first_not_ffff_match);
1389 if (generate)
1390 emit_insn (gen_rtx_SET (dest,
1391 GEN_INT (val | mask | 0xffffffff00000000ull)));
1392 num_insns ++;
1393
1394       /* Now insert the other two quarters.  */
1395 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1396 i < 64; i += 16, mask <<= 16)
1397 {
1398 if ((val & mask) != mask)
1399 {
1400 if (generate)
1401 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1402 GEN_INT ((val >> i) & 0xffff)));
1403 num_insns ++;
1404 }
1405 }
1406 return num_insns;
1407 }
1408
1409 simple_sequence:
1410 first = true;
1411 mask = 0xffff;
1412 for (i = 0; i < 64; i += 16, mask <<= 16)
1413 {
1414 if ((val & mask) != 0)
1415 {
1416 if (first)
1417 {
1418 if (generate)
1419 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1420 num_insns ++;
1421 first = false;
1422 }
1423 else
1424 {
1425 if (generate)
1426 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1427 GEN_INT ((val >> i) & 0xffff)));
1428 num_insns ++;
1429 }
1430 }
1431 }
1432
1433 return num_insns;
1434 }
1435
1436
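/* Expand a move of the integer or symbolic constant IMM into register DEST,
   which must have mode SImode or DImode.  */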
1437 void
1438 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1439 {
1440 machine_mode mode = GET_MODE (dest);
1441
1442 gcc_assert (mode == SImode || mode == DImode);
1443
1444 /* Check on what type of symbol it is. */
1445 if (GET_CODE (imm) == SYMBOL_REF
1446 || GET_CODE (imm) == LABEL_REF
1447 || GET_CODE (imm) == CONST)
1448 {
1449 rtx mem, base, offset;
1450 enum aarch64_symbol_type sty;
1451
1452 /* If we have (const (plus symbol offset)), separate out the offset
1453 before we start classifying the symbol. */
1454 split_const (imm, &base, &offset);
1455
1456 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1457 switch (sty)
1458 {
1459 case SYMBOL_FORCE_TO_MEM:
1460 if (offset != const0_rtx
1461 && targetm.cannot_force_const_mem (mode, imm))
1462 {
1463 gcc_assert (can_create_pseudo_p ());
1464 base = aarch64_force_temporary (mode, dest, base);
1465 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1466 aarch64_emit_move (dest, base);
1467 return;
1468 }
1469 mem = force_const_mem (ptr_mode, imm);
1470 gcc_assert (mem);
1471 if (mode != ptr_mode)
1472 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1473 emit_insn (gen_rtx_SET (dest, mem));
1474 return;
1475
1476 case SYMBOL_SMALL_TLSGD:
1477 case SYMBOL_SMALL_TLSDESC:
1478 case SYMBOL_SMALL_GOTTPREL:
1479 case SYMBOL_SMALL_GOT:
1480 case SYMBOL_TINY_GOT:
1481 if (offset != const0_rtx)
1482 {
1483           gcc_assert (can_create_pseudo_p ());
1484 base = aarch64_force_temporary (mode, dest, base);
1485 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1486 aarch64_emit_move (dest, base);
1487 return;
1488 }
1489 /* FALLTHRU */
1490
1491 case SYMBOL_SMALL_TPREL:
1492 case SYMBOL_SMALL_ABSOLUTE:
1493 case SYMBOL_TINY_ABSOLUTE:
1494 aarch64_load_symref_appropriately (dest, imm, sty);
1495 return;
1496
1497 default:
1498 gcc_unreachable ();
1499 }
1500 }
1501
1502 if (!CONST_INT_P (imm))
1503 {
1504 if (GET_CODE (imm) == HIGH)
1505 emit_insn (gen_rtx_SET (dest, imm));
1506 else
1507 {
1508 rtx mem = force_const_mem (mode, imm);
1509 gcc_assert (mem);
1510 emit_insn (gen_rtx_SET (dest, mem));
1511 }
1512
1513 return;
1514 }
1515
1516 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1517 }
1518
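/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */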
1519 static bool
1520 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1521 tree exp ATTRIBUTE_UNUSED)
1522 {
1523 /* Currently, always true. */
1524 return true;
1525 }
1526
1527 /* Implement TARGET_PASS_BY_REFERENCE. */
1528
1529 static bool
1530 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1531 machine_mode mode,
1532 const_tree type,
1533 bool named ATTRIBUTE_UNUSED)
1534 {
1535 HOST_WIDE_INT size;
1536 machine_mode dummymode;
1537 int nregs;
1538
1539 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1540 size = (mode == BLKmode && type)
1541 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1542
1543 /* Aggregates are passed by reference based on their size. */
1544 if (type && AGGREGATE_TYPE_P (type))
1545 {
1546 size = int_size_in_bytes (type);
1547 }
1548
1549 /* Variable sized arguments are always returned by reference. */
1550 if (size < 0)
1551 return true;
1552
1553 /* Can this be a candidate to be passed in fp/simd register(s)? */
1554 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1555 &dummymode, &nregs,
1556 NULL))
1557 return false;
1558
1559 /* Arguments which are variable sized or larger than 2 registers are
1560      passed by reference unless they are a homogeneous floating-point
1561 aggregate. */
1562 return size > 2 * UNITS_PER_WORD;
1563 }
1564
1565 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1566 static bool
1567 aarch64_return_in_msb (const_tree valtype)
1568 {
1569 machine_mode dummy_mode;
1570 int dummy_int;
1571
1572 /* Never happens in little-endian mode. */
1573 if (!BYTES_BIG_ENDIAN)
1574 return false;
1575
1576 /* Only composite types smaller than or equal to 16 bytes can
1577 be potentially returned in registers. */
1578 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1579 || int_size_in_bytes (valtype) <= 0
1580 || int_size_in_bytes (valtype) > 16)
1581 return false;
1582
1583 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1584 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1585 is always passed/returned in the least significant bits of fp/simd
1586 register(s). */
1587 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1588 &dummy_mode, &dummy_int, NULL))
1589 return false;
1590
1591 return true;
1592 }
1593
1594 /* Implement TARGET_FUNCTION_VALUE.
1595 Define how to find the value returned by a function. */
1596
1597 static rtx
1598 aarch64_function_value (const_tree type, const_tree func,
1599 bool outgoing ATTRIBUTE_UNUSED)
1600 {
1601 machine_mode mode;
1602 int unsignedp;
1603 int count;
1604 machine_mode ag_mode;
1605
1606 mode = TYPE_MODE (type);
1607 if (INTEGRAL_TYPE_P (type))
1608 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1609
1610 if (aarch64_return_in_msb (type))
1611 {
1612 HOST_WIDE_INT size = int_size_in_bytes (type);
1613
1614 if (size % UNITS_PER_WORD != 0)
1615 {
1616 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1617 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1618 }
1619 }
1620
1621 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1622 &ag_mode, &count, NULL))
1623 {
1624 if (!aarch64_composite_type_p (type, mode))
1625 {
1626 gcc_assert (count == 1 && mode == ag_mode);
1627 return gen_rtx_REG (mode, V0_REGNUM);
1628 }
1629 else
1630 {
1631 int i;
1632 rtx par;
1633
1634 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1635 for (i = 0; i < count; i++)
1636 {
1637 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1638 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1639 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1640 XVECEXP (par, 0, i) = tmp;
1641 }
1642 return par;
1643 }
1644 }
1645 else
1646 return gen_rtx_REG (mode, R0_REGNUM);
1647 }
1648
1649 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1650 Return true if REGNO is the number of a hard register in which the values
1651 of called function may come back. */
1652
1653 static bool
1654 aarch64_function_value_regno_p (const unsigned int regno)
1655 {
1656   /* A maximum of 16 bytes can be returned in the general registers.  Examples
1657 of 16-byte return values are: 128-bit integers and 16-byte small
1658 structures (excluding homogeneous floating-point aggregates). */
1659 if (regno == R0_REGNUM || regno == R1_REGNUM)
1660 return true;
1661
1662 /* Up to four fp/simd registers can return a function value, e.g. a
1663 homogeneous floating-point aggregate having four members. */
1664 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1665 return TARGET_FLOAT;
1666
1667 return false;
1668 }
1669
1670 /* Implement TARGET_RETURN_IN_MEMORY.
1671
1672 If the type T of the result of a function is such that
1673 void func (T arg)
1674 would require that arg be passed as a value in a register (or set of
1675 registers) according to the parameter passing rules, then the result
1676 is returned in the same registers as would be used for such an
1677 argument. */
1678
1679 static bool
1680 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1681 {
1682 HOST_WIDE_INT size;
1683 machine_mode ag_mode;
1684 int count;
1685
1686 if (!AGGREGATE_TYPE_P (type)
1687 && TREE_CODE (type) != COMPLEX_TYPE
1688 && TREE_CODE (type) != VECTOR_TYPE)
1689     /* Simple scalar types are always returned in registers.  */
1690 return false;
1691
1692 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1693 type,
1694 &ag_mode,
1695 &count,
1696 NULL))
1697 return false;
1698
1699   /* Types larger than 2 registers are returned in memory.  */
1700 size = int_size_in_bytes (type);
1701 return (size < 0 || size > 2 * UNITS_PER_WORD);
1702 }
1703
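/* Return true if an argument of mode MODE and type TYPE would be passed in
   SIMD/FP registers, setting *NREGS to the number of registers required and
   recording the element mode in the cumulative argument state.  */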
1704 static bool
1705 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1706 const_tree type, int *nregs)
1707 {
1708 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1709 return aarch64_vfp_is_call_or_return_candidate (mode,
1710 type,
1711 &pcum->aapcs_vfp_rmode,
1712 nregs,
1713 NULL);
1714 }
1715
1716 /* Given MODE and TYPE of a function argument, return the alignment in
1717 bits. The idea is to suppress any stronger alignment requested by
1718 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1719 This is a helper function for local use only. */
1720
1721 static unsigned int
1722 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1723 {
1724 unsigned int alignment;
1725
1726 if (type)
1727 {
1728 if (!integer_zerop (TYPE_SIZE (type)))
1729 {
1730 if (TYPE_MODE (type) == mode)
1731 alignment = TYPE_ALIGN (type);
1732 else
1733 alignment = GET_MODE_ALIGNMENT (mode);
1734 }
1735 else
1736 alignment = 0;
1737 }
1738 else
1739 alignment = GET_MODE_ALIGNMENT (mode);
1740
1741 return alignment;
1742 }
1743
1744 /* Layout a function argument according to the AAPCS64 rules. The rule
1745 numbers refer to the rule numbers in the AAPCS64. */
1746
1747 static void
1748 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1749 const_tree type,
1750 bool named ATTRIBUTE_UNUSED)
1751 {
1752 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1753 int ncrn, nvrn, nregs;
1754 bool allocate_ncrn, allocate_nvrn;
1755 HOST_WIDE_INT size;
1756
1757 /* We need to do this once per argument. */
1758 if (pcum->aapcs_arg_processed)
1759 return;
1760
1761 pcum->aapcs_arg_processed = true;
1762
1763   /* Size in bytes, rounded up to the nearest multiple of 8 bytes.  */
1764 size
1765 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1766 UNITS_PER_WORD);
1767
1768 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1769 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1770 mode,
1771 type,
1772 &nregs);
1773
1774   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1775 The following code thus handles passing by SIMD/FP registers first. */
1776
1777 nvrn = pcum->aapcs_nvrn;
1778
1779   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1780      and homogeneous short-vector aggregates (HVA).  */
1781 if (allocate_nvrn)
1782 {
1783 if (!TARGET_FLOAT)
1784 aarch64_err_no_fpadvsimd (mode, "argument");
1785
1786 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1787 {
1788 pcum->aapcs_nextnvrn = nvrn + nregs;
1789 if (!aarch64_composite_type_p (type, mode))
1790 {
1791 gcc_assert (nregs == 1);
1792 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1793 }
1794 else
1795 {
1796 rtx par;
1797 int i;
1798 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1799 for (i = 0; i < nregs; i++)
1800 {
1801 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1802 V0_REGNUM + nvrn + i);
1803 tmp = gen_rtx_EXPR_LIST
1804 (VOIDmode, tmp,
1805 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1806 XVECEXP (par, 0, i) = tmp;
1807 }
1808 pcum->aapcs_reg = par;
1809 }
1810 return;
1811 }
1812 else
1813 {
1814 /* C.3 NSRN is set to 8. */
1815 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1816 goto on_stack;
1817 }
1818 }
1819
1820 ncrn = pcum->aapcs_ncrn;
1821 nregs = size / UNITS_PER_WORD;
1822
1823   /* C6 - C9, though the sign and zero extension semantics are
1824      handled elsewhere.  This is the case where the argument fits
1825      entirely in general registers.  */
1826 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1827 {
1828 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1829
1830 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1831
1832 /* C.8 if the argument has an alignment of 16 then the NGRN is
1833 rounded up to the next even number. */
1834 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1835 {
1836 ++ncrn;
1837 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1838 }
1839 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1840 A reg is still generated for it, but the caller should be smart
1841 enough not to use it. */
1842 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1843 {
1844 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1845 }
1846 else
1847 {
1848 rtx par;
1849 int i;
1850
1851 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1852 for (i = 0; i < nregs; i++)
1853 {
1854 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1855 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1856 GEN_INT (i * UNITS_PER_WORD));
1857 XVECEXP (par, 0, i) = tmp;
1858 }
1859 pcum->aapcs_reg = par;
1860 }
1861
1862 pcum->aapcs_nextncrn = ncrn + nregs;
1863 return;
1864 }
1865
1866 /* C.11 */
1867 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1868
1869 /* The argument is passed on stack; record the needed number of words for
1870 this argument and align the total size if necessary. */
1871 on_stack:
1872 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1873 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1874 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1875 16 / UNITS_PER_WORD);
1876 return;
1877 }
1878
1879 /* Implement TARGET_FUNCTION_ARG. */
1880
1881 static rtx
1882 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1883 const_tree type, bool named)
1884 {
1885 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1886 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1887
1888 if (mode == VOIDmode)
1889 return NULL_RTX;
1890
1891 aarch64_layout_arg (pcum_v, mode, type, named);
1892 return pcum->aapcs_reg;
1893 }
1894
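/* Initialize the cumulative argument state PCUM for a call.  When the
   FP/SIMD registers are unavailable, diagnose a public function whose
   return value would require them.  */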
1895 void
1896 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1897 const_tree fntype ATTRIBUTE_UNUSED,
1898 rtx libname ATTRIBUTE_UNUSED,
1899 const_tree fndecl ATTRIBUTE_UNUSED,
1900 unsigned n_named ATTRIBUTE_UNUSED)
1901 {
1902 pcum->aapcs_ncrn = 0;
1903 pcum->aapcs_nvrn = 0;
1904 pcum->aapcs_nextncrn = 0;
1905 pcum->aapcs_nextnvrn = 0;
1906 pcum->pcs_variant = ARM_PCS_AAPCS64;
1907 pcum->aapcs_reg = NULL_RTX;
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_stack_words = 0;
1910 pcum->aapcs_stack_size = 0;
1911
1912 if (!TARGET_FLOAT
1913 && fndecl && TREE_PUBLIC (fndecl)
1914 && fntype && fntype != error_mark_node)
1915 {
1916 const_tree type = TREE_TYPE (fntype);
1917 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
1918 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
1919 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
1920 &mode, &nregs, NULL))
1921 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
1922 }
1923 return;
1924 }
1925
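/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */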
1926 static void
1927 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1928 machine_mode mode,
1929 const_tree type,
1930 bool named)
1931 {
1932 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1933 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1934 {
1935 aarch64_layout_arg (pcum_v, mode, type, named);
1936 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1937 != (pcum->aapcs_stack_words != 0));
1938 pcum->aapcs_arg_processed = false;
1939 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1940 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1941 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1942 pcum->aapcs_stack_words = 0;
1943 pcum->aapcs_reg = NULL_RTX;
1944 }
1945 }
1946
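/* Implement FUNCTION_ARG_REGNO_P: return true if REGNO can be used to pass
   a function argument.  */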
1947 bool
1948 aarch64_function_arg_regno_p (unsigned regno)
1949 {
1950 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1951 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1952 }
1953
1954 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1955 PARM_BOUNDARY bits of alignment, but will be given anything up
1956 to STACK_BOUNDARY bits if the type requires it. This makes sure
1957 that both before and after the layout of each argument, the Next
1958 Stacked Argument Address (NSAA) will have a minimum alignment of
1959 8 bytes. */
1960
1961 static unsigned int
1962 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1963 {
1964 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1965
1966 if (alignment < PARM_BOUNDARY)
1967 alignment = PARM_BOUNDARY;
1968 if (alignment > STACK_BOUNDARY)
1969 alignment = STACK_BOUNDARY;
1970 return alignment;
1971 }
1972
1973 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1974
1975 Return true if an argument passed on the stack should be padded upwards,
1976 i.e. if the least-significant byte of the stack slot has useful data.
1977
1978 Small aggregate types are placed in the lowest memory address.
1979
1980 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1981
1982 bool
1983 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1984 {
1985 /* On little-endian targets, the least significant byte of every stack
1986 argument is passed at the lowest byte address of the stack slot. */
1987 if (!BYTES_BIG_ENDIAN)
1988 return true;
1989
1990 /* Otherwise, integral, floating-point and pointer types are padded downward:
1991 the least significant byte of a stack argument is passed at the highest
1992 byte address of the stack slot. */
1993 if (type
1994 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1995 || POINTER_TYPE_P (type))
1996 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1997 return false;
1998
1999 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2000 return true;
2001 }
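
/* As an illustration (example types chosen arbitrarily): on a
   big-endian target an int occupying an 8-byte stack slot is padded
   downward, so its value sits in the high-addressed bytes of the
   slot, whereas a 3-byte struct is padded upward and starts at the
   lowest address; on little-endian targets both start at the lowest
   address.  */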
2002
2003 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2004
2005 It specifies padding for the last (possibly the only)
2006 element of a block move between registers and memory. Viewing
2007 the block as if it were in memory, padding upward means that
2008 the last element is padded after its most significant byte,
2009 while with downward padding the last element is padded on its
2010 least significant byte side.
2011
2012 Small aggregates and small complex types are always padded
2013 upwards.
2014
2015 We don't need to worry about homogeneous floating-point or
2016 short-vector aggregates; their move is not affected by the
2017 padding direction determined here. Regardless of endianness,
2018 each element of such an aggregate is put in the least
2019 significant bits of a fp/simd register.
2020
2021 Return true if the least significant byte of the register holds
2022 useful data (i.e. the element should be padded upward), and false
2023 if the most significant byte does. */
2024
2025 bool
2026 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2027 bool first ATTRIBUTE_UNUSED)
2028 {
2029
2030 /* Small composite types are always padded upward. */
2031 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2032 {
2033 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2034 : GET_MODE_SIZE (mode));
2035 if (size < 2 * UNITS_PER_WORD)
2036 return true;
2037 }
2038
2039 /* Otherwise, use the default padding. */
2040 return !BYTES_BIG_ENDIAN;
2041 }
2042
2043 static machine_mode
2044 aarch64_libgcc_cmp_return_mode (void)
2045 {
2046 return SImode;
2047 }
2048
2049 static bool
2050 aarch64_frame_pointer_required (void)
2051 {
2052 /* In aarch64_override_options_after_change
2053 flag_omit_leaf_frame_pointer turns off the frame pointer by
2054 default. Turn it back on now if the function is not a leaf or
2055 if LR is ever live. */
2056 if (flag_omit_leaf_frame_pointer
2057 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2058 return true;
2059
2060 return false;
2061 }
2062
2063 /* Mark the registers that need to be saved by the callee and calculate
2064 the size of the callee-saved registers area and frame record (both FP
2065 and LR may be omitted). */
2066 static void
2067 aarch64_layout_frame (void)
2068 {
2069 HOST_WIDE_INT offset = 0;
2070 int regno;
2071
2072 if (reload_completed && cfun->machine->frame.laid_out)
2073 return;
2074
2075 #define SLOT_NOT_REQUIRED (-2)
2076 #define SLOT_REQUIRED (-1)
2077
2078 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2079 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2080
2081 /* First mark all the registers that really need to be saved... */
2082 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2083 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2084
2085 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2086 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2087
2088 /* ... that includes the eh data registers (if needed)... */
2089 if (crtl->calls_eh_return)
2090 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2091 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2092 = SLOT_REQUIRED;
2093
2094 /* ... and any callee saved register that dataflow says is live. */
2095 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2096 if (df_regs_ever_live_p (regno)
2097 && (regno == R30_REGNUM
2098 || !call_used_regs[regno]))
2099 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2100
2101 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2102 if (df_regs_ever_live_p (regno)
2103 && !call_used_regs[regno])
2104 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2105
2106 if (frame_pointer_needed)
2107 {
2108 /* FP and LR are placed in the linkage record. */
2109 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2110 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2111 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2112 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2113 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2114 offset += 2 * UNITS_PER_WORD;
2115 }
2116
2117 /* Now assign stack slots for them. */
2118 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2119 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2120 {
2121 cfun->machine->frame.reg_offset[regno] = offset;
2122 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2123 cfun->machine->frame.wb_candidate1 = regno;
2124 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2125 cfun->machine->frame.wb_candidate2 = regno;
2126 offset += UNITS_PER_WORD;
2127 }
2128
2129 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2130 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2131 {
2132 cfun->machine->frame.reg_offset[regno] = offset;
2133 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2134 cfun->machine->frame.wb_candidate1 = regno;
2135 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2136 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2137 cfun->machine->frame.wb_candidate2 = regno;
2138 offset += UNITS_PER_WORD;
2139 }
2140
2141 cfun->machine->frame.padding0 =
2142 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2143 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2144
2145 cfun->machine->frame.saved_regs_size = offset;
2146
2147 cfun->machine->frame.hard_fp_offset
2148 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2149 + get_frame_size ()
2150 + cfun->machine->frame.saved_regs_size,
2151 STACK_BOUNDARY / BITS_PER_UNIT);
2152
2153 cfun->machine->frame.frame_size
2154 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2155 + crtl->outgoing_args_size,
2156 STACK_BOUNDARY / BITS_PER_UNIT);
2157
2158 cfun->machine->frame.laid_out = true;
2159 }
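
/* Worked example (register choice arbitrary): a function that needs
   a frame pointer, saves x19 and x20 and uses d8 gets
       reg_offset[R29] = 0,  reg_offset[R30] = 8,
       reg_offset[R19] = 16, reg_offset[R20] = 24,
       reg_offset[V8]  = 32,
   giving offset = 40, padding0 = 8 and saved_regs_size = 48 once
   rounded up to STACK_BOUNDARY / BITS_PER_UNIT (16 bytes);
   hard_fp_offset and frame_size then add the saved varargs area,
   the local variable area and the outgoing argument area, each
   likewise rounded to 16 bytes.  */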
2160
2161 static bool
2162 aarch64_register_saved_on_entry (int regno)
2163 {
2164 return cfun->machine->frame.reg_offset[regno] >= 0;
2165 }
2166
2167 static unsigned
2168 aarch64_next_callee_save (unsigned regno, unsigned limit)
2169 {
2170 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2171 regno ++;
2172 return regno;
2173 }
2174
2175 static void
2176 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2177 HOST_WIDE_INT adjustment)
2178 {
2179 rtx base_rtx = stack_pointer_rtx;
2180 rtx insn, reg, mem;
2181
2182 reg = gen_rtx_REG (mode, regno);
2183 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2184 plus_constant (Pmode, base_rtx, -adjustment));
2185 mem = gen_rtx_MEM (mode, mem);
2186
2187 insn = emit_move_insn (mem, reg);
2188 RTX_FRAME_RELATED_P (insn) = 1;
2189 }
2190
2191 static rtx
2192 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2193 HOST_WIDE_INT adjustment)
2194 {
2195 switch (mode)
2196 {
2197 case DImode:
2198 return gen_storewb_pairdi_di (base, base, reg, reg2,
2199 GEN_INT (-adjustment),
2200 GEN_INT (UNITS_PER_WORD - adjustment));
2201 case DFmode:
2202 return gen_storewb_pairdf_di (base, base, reg, reg2,
2203 GEN_INT (-adjustment),
2204 GEN_INT (UNITS_PER_WORD - adjustment));
2205 default:
2206 gcc_unreachable ();
2207 }
2208 }
2209
2210 static void
2211 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2212 unsigned regno2, HOST_WIDE_INT adjustment)
2213 {
2214 rtx_insn *insn;
2215 rtx reg1 = gen_rtx_REG (mode, regno1);
2216 rtx reg2 = gen_rtx_REG (mode, regno2);
2217
2218 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2219 reg2, adjustment));
2220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2221 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2222 RTX_FRAME_RELATED_P (insn) = 1;
2223 }
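
/* For example (register numbers arbitrary), a call such as
   aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, 16)
   emits the writeback store pair
       stp  x29, x30, [sp, -16]!
   i.e. the stack pointer is pre-decremented by ADJUSTMENT and both
   registers are stored at the new top of the stack.  */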
2224
2225 static rtx
2226 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2227 HOST_WIDE_INT adjustment)
2228 {
2229 switch (mode)
2230 {
2231 case DImode:
2232 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2233 GEN_INT (UNITS_PER_WORD));
2234 case DFmode:
2235 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2236 GEN_INT (UNITS_PER_WORD));
2237 default:
2238 gcc_unreachable ();
2239 }
2240 }
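
/* The matching epilogue form (example registers arbitrary) is a
   post-indexed load pair such as
       ldp  x29, x30, [sp], 16
   which reloads both registers and then increments the stack
   pointer by ADJUSTMENT.  */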
2241
2242 static rtx
2243 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2244 rtx reg2)
2245 {
2246 switch (mode)
2247 {
2248 case DImode:
2249 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2250
2251 case DFmode:
2252 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2253
2254 default:
2255 gcc_unreachable ();
2256 }
2257 }
2258
2259 static rtx
2260 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2261 rtx mem2)
2262 {
2263 switch (mode)
2264 {
2265 case DImode:
2266 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2267
2268 case DFmode:
2269 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2270
2271 default:
2272 gcc_unreachable ();
2273 }
2274 }
2275
2276
2277 static void
2278 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2279 unsigned start, unsigned limit, bool skip_wb)
2280 {
2281 rtx_insn *insn;
2282 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2283 ? gen_frame_mem : gen_rtx_MEM);
2284 unsigned regno;
2285 unsigned regno2;
2286
2287 for (regno = aarch64_next_callee_save (start, limit);
2288 regno <= limit;
2289 regno = aarch64_next_callee_save (regno + 1, limit))
2290 {
2291 rtx reg, mem;
2292 HOST_WIDE_INT offset;
2293
2294 if (skip_wb
2295 && (regno == cfun->machine->frame.wb_candidate1
2296 || regno == cfun->machine->frame.wb_candidate2))
2297 continue;
2298
2299 reg = gen_rtx_REG (mode, regno);
2300 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2301 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2302 offset));
2303
2304 regno2 = aarch64_next_callee_save (regno + 1, limit);
2305
2306 if (regno2 <= limit
2307 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2308 == cfun->machine->frame.reg_offset[regno2]))
2310 {
2311 rtx reg2 = gen_rtx_REG (mode, regno2);
2312 rtx mem2;
2313
2314 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2315 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2316 offset));
2317 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2318 reg2));
2319
2320 /* The first part of a frame-related parallel insn is
2321 always assumed to be relevant to the frame
2322 calculations; subsequent parts are only
2323 frame-related if explicitly marked. */
2324 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2325 regno = regno2;
2326 }
2327 else
2328 insn = emit_move_insn (mem, reg);
2329
2330 RTX_FRAME_RELATED_P (insn) = 1;
2331 }
2332 }
2333
2334 static void
2335 aarch64_restore_callee_saves (machine_mode mode,
2336 HOST_WIDE_INT start_offset, unsigned start,
2337 unsigned limit, bool skip_wb, rtx *cfi_ops)
2338 {
2339 rtx base_rtx = stack_pointer_rtx;
2340 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2341 ? gen_frame_mem : gen_rtx_MEM);
2342 unsigned regno;
2343 unsigned regno2;
2344 HOST_WIDE_INT offset;
2345
2346 for (regno = aarch64_next_callee_save (start, limit);
2347 regno <= limit;
2348 regno = aarch64_next_callee_save (regno + 1, limit))
2349 {
2350 rtx reg, mem;
2351
2352 if (skip_wb
2353 && (regno == cfun->machine->frame.wb_candidate1
2354 || regno == cfun->machine->frame.wb_candidate2))
2355 continue;
2356
2357 reg = gen_rtx_REG (mode, regno);
2358 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2359 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2360
2361 regno2 = aarch64_next_callee_save (regno + 1, limit);
2362
2363 if (regno2 <= limit
2364 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2365 == cfun->machine->frame.reg_offset[regno2]))
2366 {
2367 rtx reg2 = gen_rtx_REG (mode, regno2);
2368 rtx mem2;
2369
2370 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2371 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2372 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2373
2374 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2375 regno = regno2;
2376 }
2377 else
2378 emit_move_insn (reg, mem);
2379 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2380 }
2381 }
2382
2383 /* AArch64 stack frames generated by this compiler look like:
2384
2385 +-------------------------------+
2386 | |
2387 | incoming stack arguments |
2388 | |
2389 +-------------------------------+
2390 | | <-- incoming stack pointer (aligned)
2391 | callee-allocated save area |
2392 | for register varargs |
2393 | |
2394 +-------------------------------+
2395 | local variables | <-- frame_pointer_rtx
2396 | |
2397 +-------------------------------+
2398 | padding0 | \
2399 +-------------------------------+ |
2400 | callee-saved registers | | frame.saved_regs_size
2401 +-------------------------------+ |
2402 | LR' | |
2403 +-------------------------------+ |
2404 | FP' | / <- hard_frame_pointer_rtx (aligned)
2405 +-------------------------------+
2406 | dynamic allocation |
2407 +-------------------------------+
2408 | padding |
2409 +-------------------------------+
2410 | outgoing stack arguments | <-- arg_pointer
2411 | |
2412 +-------------------------------+
2413 | | <-- stack_pointer_rtx (aligned)
2414
2415 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2416 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2417 unchanged. */
2418
2419 /* Generate the prologue instructions for entry into a function.
2420 Establish the stack frame by decreasing the stack pointer with a
2421 properly calculated size and, if necessary, create a frame record
2422 filled with the values of LR and previous frame pointer. The
2423 current FP is also set up if it is in use. */
2424
2425 void
2426 aarch64_expand_prologue (void)
2427 {
2428 /* sub sp, sp, #<frame_size>
2429 stp {fp, lr}, [sp, #<frame_size> - 16]
2430 add fp, sp, #<frame_size> - hardfp_offset
2431 stp {cs_reg}, [fp, #-16] etc.
2432
2433 sub sp, sp, <final_adjustment_if_any>
2434 */
2435 HOST_WIDE_INT frame_size, offset;
2436 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2437 HOST_WIDE_INT hard_fp_offset;
2438 rtx_insn *insn;
2439
2440 aarch64_layout_frame ();
2441
2442 offset = frame_size = cfun->machine->frame.frame_size;
2443 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2444 fp_offset = frame_size - hard_fp_offset;
2445
2446 if (flag_stack_usage_info)
2447 current_function_static_stack_size = frame_size;
2448
2449 /* Store pairs and load pairs have a range of only -512 to 504. */
2450 if (offset >= 512)
2451 {
2452 /* When the frame has a large size, an initial decrease is done on
2453 the stack pointer to jump over the callee-allocated save area for
2454 register varargs, the local variable area and/or the callee-saved
2455 register area. This will allow the pre-index write-back
2456 store pair instructions to be used for setting up the stack frame
2457 efficiently. */
2458 offset = hard_fp_offset;
2459 if (offset >= 512)
2460 offset = cfun->machine->frame.saved_regs_size;
2461
2462 frame_size -= (offset + crtl->outgoing_args_size);
2463 fp_offset = 0;
2464
2465 if (frame_size >= 0x1000000)
2466 {
2467 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2468 emit_move_insn (op0, GEN_INT (-frame_size));
2469 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2470
2471 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2472 gen_rtx_SET (stack_pointer_rtx,
2473 plus_constant (Pmode, stack_pointer_rtx,
2474 -frame_size)));
2475 RTX_FRAME_RELATED_P (insn) = 1;
2476 }
2477 else if (frame_size > 0)
2478 {
2479 int hi_ofs = frame_size & 0xfff000;
2480 int lo_ofs = frame_size & 0x000fff;
2481
2482 if (hi_ofs)
2483 {
2484 insn = emit_insn (gen_add2_insn
2485 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2486 RTX_FRAME_RELATED_P (insn) = 1;
2487 }
2488 if (lo_ofs)
2489 {
2490 insn = emit_insn (gen_add2_insn
2491 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2492 RTX_FRAME_RELATED_P (insn) = 1;
2493 }
2494 }
2495 }
2496 else
2497 frame_size = -1;
2498
2499 if (offset > 0)
2500 {
2501 bool skip_wb = false;
2502
2503 if (frame_pointer_needed)
2504 {
2505 skip_wb = true;
2506
2507 if (fp_offset)
2508 {
2509 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2510 GEN_INT (-offset)));
2511 RTX_FRAME_RELATED_P (insn) = 1;
2512
2513 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2514 R30_REGNUM, false);
2515 }
2516 else
2517 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2518
2519 /* Set up frame pointer to point to the location of the
2520 previous frame pointer on the stack. */
2521 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2522 stack_pointer_rtx,
2523 GEN_INT (fp_offset)));
2524 RTX_FRAME_RELATED_P (insn) = 1;
2525 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2526 }
2527 else
2528 {
2529 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2530 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2531
2532 if (fp_offset
2533 || reg1 == FIRST_PSEUDO_REGISTER
2534 || (reg2 == FIRST_PSEUDO_REGISTER
2535 && offset >= 256))
2536 {
2537 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2538 GEN_INT (-offset)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
2540 }
2541 else
2542 {
2543 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2544
2545 skip_wb = true;
2546
2547 if (reg2 == FIRST_PSEUDO_REGISTER)
2548 aarch64_pushwb_single_reg (mode1, reg1, offset);
2549 else
2550 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2551 }
2552 }
2553
2554 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2555 skip_wb);
2556 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2557 skip_wb);
2558 }
2559
2560 /* When offset >= 512,
2561 sub sp, sp, #<outgoing_args_size> */
2562 if (frame_size > -1)
2563 {
2564 if (crtl->outgoing_args_size > 0)
2565 {
2566 insn = emit_insn (gen_add2_insn
2567 (stack_pointer_rtx,
2568 GEN_INT (- crtl->outgoing_args_size)));
2569 RTX_FRAME_RELATED_P (insn) = 1;
2570 }
2571 }
2572 }
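
/* As an illustration (sizes and registers chosen arbitrarily): for a
   48-byte frame with a frame pointer and x19 as the only other
   callee save, the expansion above typically produces something like
       stp  x29, x30, [sp, -48]!
       add  x29, sp, 0
       str  x19, [sp, 16]
   while frames of 512 bytes or more use the initial
   "sub sp, sp, #<frame_size>" form sketched in the comment at the
   top of this function.  */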
2573
2574 /* Return TRUE if we can use a simple_return insn.
2575
2576 This function checks whether the callee-saved stack is empty, which
2577 means no restore actions are needed. The pro_and_epilogue pass will
2578 use this to check whether the shrink-wrapping optimization is feasible.
2579
2580 bool
2581 aarch64_use_return_insn_p (void)
2582 {
2583 if (!reload_completed)
2584 return false;
2585
2586 if (crtl->profile)
2587 return false;
2588
2589 aarch64_layout_frame ();
2590
2591 return cfun->machine->frame.frame_size == 0;
2592 }
2593
2594 /* Generate the epilogue instructions for returning from a function. */
2595 void
2596 aarch64_expand_epilogue (bool for_sibcall)
2597 {
2598 HOST_WIDE_INT frame_size, offset;
2599 HOST_WIDE_INT fp_offset;
2600 HOST_WIDE_INT hard_fp_offset;
2601 rtx_insn *insn;
2602 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2603 bool need_barrier_p = (get_frame_size () != 0
2604 || cfun->machine->frame.saved_varargs_size);
2605
2606 aarch64_layout_frame ();
2607
2608 offset = frame_size = cfun->machine->frame.frame_size;
2609 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2610 fp_offset = frame_size - hard_fp_offset;
2611
2612 /* Store pairs and load pairs have a range of only -512 to 504. */
2613 if (offset >= 512)
2614 {
2615 offset = hard_fp_offset;
2616 if (offset >= 512)
2617 offset = cfun->machine->frame.saved_regs_size;
2618
2619 frame_size -= (offset + crtl->outgoing_args_size);
2620 fp_offset = 0;
2621 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2622 {
2623 insn = emit_insn (gen_add2_insn
2624 (stack_pointer_rtx,
2625 GEN_INT (crtl->outgoing_args_size)));
2626 RTX_FRAME_RELATED_P (insn) = 1;
2627 }
2628 }
2629 else
2630 frame_size = -1;
2631
2632 /* If there were outgoing arguments or we've done dynamic stack
2633 allocation, then restore the stack pointer from the frame
2634 pointer. This is at most one insn and more efficient than using
2635 GCC's internal mechanism. */
2636 if (frame_pointer_needed
2637 && (crtl->outgoing_args_size || cfun->calls_alloca))
2638 {
2639 if (cfun->calls_alloca)
2640 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641
2642 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2643 hard_frame_pointer_rtx,
2644 GEN_INT (0)));
2645 offset = offset - fp_offset;
2646 }
2647
2648 if (offset > 0)
2649 {
2650 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2651 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2652 bool skip_wb = true;
2653 rtx cfi_ops = NULL;
2654
2655 if (frame_pointer_needed)
2656 fp_offset = 0;
2657 else if (fp_offset
2658 || reg1 == FIRST_PSEUDO_REGISTER
2659 || (reg2 == FIRST_PSEUDO_REGISTER
2660 && offset >= 256))
2661 skip_wb = false;
2662
2663 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2664 skip_wb, &cfi_ops);
2665 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2666 skip_wb, &cfi_ops);
2667
2668 if (need_barrier_p)
2669 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2670
2671 if (skip_wb)
2672 {
2673 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2674 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2675
2676 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2677 if (reg2 == FIRST_PSEUDO_REGISTER)
2678 {
2679 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2680 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2681 mem = gen_rtx_MEM (mode1, mem);
2682 insn = emit_move_insn (rreg1, mem);
2683 }
2684 else
2685 {
2686 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2687
2688 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2689 insn = emit_insn (aarch64_gen_loadwb_pair
2690 (mode1, stack_pointer_rtx, rreg1,
2691 rreg2, offset));
2692 }
2693 }
2694 else
2695 {
2696 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2697 GEN_INT (offset)));
2698 }
2699
2700 /* Reset the CFA to be SP + FRAME_SIZE. */
2701 rtx new_cfa = stack_pointer_rtx;
2702 if (frame_size > 0)
2703 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2704 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2705 REG_NOTES (insn) = cfi_ops;
2706 RTX_FRAME_RELATED_P (insn) = 1;
2707 }
2708
2709 if (frame_size > 0)
2710 {
2711 if (need_barrier_p)
2712 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2713
2714 if (frame_size >= 0x1000000)
2715 {
2716 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2717 emit_move_insn (op0, GEN_INT (frame_size));
2718 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2719 }
2720 else
2721 {
2722 int hi_ofs = frame_size & 0xfff000;
2723 int lo_ofs = frame_size & 0x000fff;
2724
2725 if (hi_ofs && lo_ofs)
2726 {
2727 insn = emit_insn (gen_add2_insn
2728 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2729 RTX_FRAME_RELATED_P (insn) = 1;
2730 frame_size = lo_ofs;
2731 }
2732 insn = emit_insn (gen_add2_insn
2733 (stack_pointer_rtx, GEN_INT (frame_size)));
2734 }
2735
2736 /* Reset the CFA to be SP + 0. */
2737 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2738 RTX_FRAME_RELATED_P (insn) = 1;
2739 }
2740
2741 /* Stack adjustment for exception handler. */
2742 if (crtl->calls_eh_return)
2743 {
2744 /* We need to unwind the stack by the offset computed by
2745 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2746 to be SP; letting the CFA move during this adjustment
2747 is just as correct as retaining the CFA from the body
2748 of the function. Therefore, do nothing special. */
2749 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2750 }
2751
2752 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2753 if (!for_sibcall)
2754 emit_jump_insn (ret_rtx);
2755 }
2756
2757 /* Return the place to copy the exception unwinding return address to.
2758 This will probably be a stack slot, but could (in theory) be the
2759 return register. */
2760 rtx
2761 aarch64_final_eh_return_addr (void)
2762 {
2763 HOST_WIDE_INT fp_offset;
2764
2765 aarch64_layout_frame ();
2766
2767 fp_offset = cfun->machine->frame.frame_size
2768 - cfun->machine->frame.hard_fp_offset;
2769
2770 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2771 return gen_rtx_REG (DImode, LR_REGNUM);
2772
2773 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2774 result in a store to save LR introduced by builtin_eh_return () being
2775 incorrectly deleted because the alias is not detected.
2776 So in the calculation of the address to copy the exception unwinding
2777 return address to, we distinguish two cases.
2778 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2779 we return a SP-relative location since all the addresses are SP-relative
2780 in this case. This prevents the store from being optimized away.
2781 If the fp_offset is not 0, then the addresses will be FP-relative and
2782 therefore we return a FP-relative location. */
2783
2784 if (frame_pointer_needed)
2785 {
2786 if (fp_offset)
2787 return gen_frame_mem (DImode,
2788 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2789 else
2790 return gen_frame_mem (DImode,
2791 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2792 }
2793
2794 /* If FP is not needed, we calculate the location of LR, which would be
2795 at the top of the saved registers block. */
2796
2797 return gen_frame_mem (DImode,
2798 plus_constant (Pmode,
2799 stack_pointer_rtx,
2800 fp_offset
2801 + cfun->machine->frame.saved_regs_size
2802 - 2 * UNITS_PER_WORD));
2803 }
2804
2805 /* Possibly output code to build up a constant in a register. For
2806 the benefit of the costs infrastructure, returns the number of
2807 instructions which would be emitted. GENERATE inhibits or
2808 enables code generation. */
2809
2810 static int
2811 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2812 {
2813 int insns = 0;
2814
2815 if (aarch64_bitmask_imm (val, DImode))
2816 {
2817 if (generate)
2818 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2819 insns = 1;
2820 }
2821 else
2822 {
2823 int i;
2824 int ncount = 0;
2825 int zcount = 0;
2826 HOST_WIDE_INT valp = val >> 16;
2827 HOST_WIDE_INT valm;
2828 HOST_WIDE_INT tval;
2829
2830 for (i = 16; i < 64; i += 16)
2831 {
2832 valm = (valp & 0xffff);
2833
2834 if (valm != 0)
2835 ++ zcount;
2836
2837 if (valm != 0xffff)
2838 ++ ncount;
2839
2840 valp >>= 16;
2841 }
2842
2843 /* zcount contains the number of additional MOVK instructions
2844 required if the constant is built up with an initial MOVZ instruction,
2845 while ncount is the number of MOVK instructions required if starting
2846 with a MOVN instruction. Choose whichever sequence requires fewer
2847 instructions, preferring the MOVZ-based sequence when both require
2848 the same number. */
2849 if (ncount < zcount)
2850 {
2851 if (generate)
2852 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2853 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2854 tval = 0xffff;
2855 insns++;
2856 }
2857 else
2858 {
2859 if (generate)
2860 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2861 GEN_INT (val & 0xffff));
2862 tval = 0;
2863 insns++;
2864 }
2865
2866 val >>= 16;
2867
2868 for (i = 16; i < 64; i += 16)
2869 {
2870 if ((val & 0xffff) != tval)
2871 {
2872 if (generate)
2873 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2874 GEN_INT (i),
2875 GEN_INT (val & 0xffff)));
2876 insns++;
2877 }
2878 val >>= 16;
2879 }
2880 }
2881 return insns;
2882 }
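
/* Worked example (destination register arbitrary): building
   0x123456789abcdef0 takes four instructions,
       movz x0, 0xdef0
       movk x0, 0x9abc, lsl 16
       movk x0, 0x5678, lsl 32
       movk x0, 0x1234, lsl 48
   whereas a mostly-ones value such as 0xffffffffffff1234 is cheaper
   to start from MOVN ("movn x0, 0xedcb" is a single instruction,
   since ~0xedcb == 0xffffffffffff1234).  */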
2883
2884 static void
2885 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2886 {
2887 HOST_WIDE_INT mdelta = delta;
2888 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2889 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2890
2891 if (mdelta < 0)
2892 mdelta = -mdelta;
2893
2894 if (mdelta >= 4096 * 4096)
2895 {
2896 (void) aarch64_build_constant (scratchreg, delta, true);
2897 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2898 }
2899 else if (mdelta > 0)
2900 {
2901 if (mdelta >= 4096)
2902 {
2903 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2904 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2905 if (delta < 0)
2906 emit_insn (gen_rtx_SET (this_rtx,
2907 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2908 else
2909 emit_insn (gen_rtx_SET (this_rtx,
2910 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2911 }
2912 if (mdelta % 4096 != 0)
2913 {
2914 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2915 emit_insn (gen_rtx_SET (this_rtx,
2916 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2917 }
2918 }
2919 }
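
/* As an illustration (register numbers arbitrary): for
   DELTA == 0x12345 the addition is split into
       mov  x1, 18                  // 0x12345 / 4096
       add  x0, x0, x1, lsl 12
       add  x0, x0, 0x345           // 0x12345 % 4096
   while deltas of 4096 * 4096 or more are first materialised in the
   scratch register via aarch64_build_constant and added with a
   single register-register ADD.  */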
2920
2921 /* Output code to add DELTA to the first argument, and then jump
2922 to FUNCTION. Used for C++ multiple inheritance. */
2923 static void
2924 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2925 HOST_WIDE_INT delta,
2926 HOST_WIDE_INT vcall_offset,
2927 tree function)
2928 {
2929 /* The this pointer is always in x0. Note that this differs from
2930 Arm where the this pointer may be bumped to r1 if r0 is required
2931 to return a pointer to an aggregate. On AArch64 a result value
2932 pointer will be in x8. */
2933 int this_regno = R0_REGNUM;
2934 rtx this_rtx, temp0, temp1, addr, funexp;
2935 rtx_insn *insn;
2936
2937 reload_completed = 1;
2938 emit_note (NOTE_INSN_PROLOGUE_END);
2939
2940 if (vcall_offset == 0)
2941 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2942 else
2943 {
2944 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2945
2946 this_rtx = gen_rtx_REG (Pmode, this_regno);
2947 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2948 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2949
2950 addr = this_rtx;
2951 if (delta != 0)
2952 {
2953 if (delta >= -256 && delta < 256)
2954 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2955 plus_constant (Pmode, this_rtx, delta));
2956 else
2957 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2958 }
2959
2960 if (Pmode == ptr_mode)
2961 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2962 else
2963 aarch64_emit_move (temp0,
2964 gen_rtx_ZERO_EXTEND (Pmode,
2965 gen_rtx_MEM (ptr_mode, addr)));
2966
2967 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2968 addr = plus_constant (Pmode, temp0, vcall_offset);
2969 else
2970 {
2971 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2972 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2973 }
2974
2975 if (Pmode == ptr_mode)
2976 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2977 else
2978 aarch64_emit_move (temp1,
2979 gen_rtx_SIGN_EXTEND (Pmode,
2980 gen_rtx_MEM (ptr_mode, addr)));
2981
2982 emit_insn (gen_add2_insn (this_rtx, temp1));
2983 }
2984
2985 /* Generate a tail call to the target function. */
2986 if (!TREE_USED (function))
2987 {
2988 assemble_external (function);
2989 TREE_USED (function) = 1;
2990 }
2991 funexp = XEXP (DECL_RTL (function), 0);
2992 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2993 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2994 SIBLING_CALL_P (insn) = 1;
2995
2996 insn = get_insns ();
2997 shorten_branches (insn);
2998 final_start_function (insn, file, 1);
2999 final (insn, file, 1);
3000 final_end_function ();
3001
3002 /* Stop pretending to be a post-reload pass. */
3003 reload_completed = 0;
3004 }
3005
3006 static bool
3007 aarch64_tls_referenced_p (rtx x)
3008 {
3009 if (!TARGET_HAVE_TLS)
3010 return false;
3011 subrtx_iterator::array_type array;
3012 FOR_EACH_SUBRTX (iter, array, x, ALL)
3013 {
3014 const_rtx x = *iter;
3015 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3016 return true;
3017 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3018 TLS offsets, not real symbol references. */
3019 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3020 iter.skip_subrtxes ();
3021 }
3022 return false;
3023 }
3024
3025
3026 static int
3027 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3028 {
3029 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3030 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3031
3032 if (*imm1 < *imm2)
3033 return -1;
3034 if (*imm1 > *imm2)
3035 return +1;
3036 return 0;
3037 }
3038
3039
3040 static void
3041 aarch64_build_bitmask_table (void)
3042 {
3043 unsigned HOST_WIDE_INT mask, imm;
3044 unsigned int log_e, e, s, r;
3045 unsigned int nimms = 0;
3046
3047 for (log_e = 1; log_e <= 6; log_e++)
3048 {
3049 e = 1 << log_e;
3050 if (e == 64)
3051 mask = ~(HOST_WIDE_INT) 0;
3052 else
3053 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3054 for (s = 1; s < e; s++)
3055 {
3056 for (r = 0; r < e; r++)
3057 {
3058 /* Set S consecutive bits to 1 (S < 64). */
3059 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3060 /* Rotate right by R. */
3061 if (r != 0)
3062 imm = ((imm >> r) | (imm << (e - r))) & mask;
3063 /* Replicate the constant depending on SIMD size. */
3064 switch (log_e) {
3065 case 1: imm |= (imm << 2);
3066 case 2: imm |= (imm << 4);
3067 case 3: imm |= (imm << 8);
3068 case 4: imm |= (imm << 16);
3069 case 5: imm |= (imm << 32);
3070 case 6:
3071 break;
3072 default:
3073 gcc_unreachable ();
3074 }
3075 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3076 aarch64_bitmasks[nimms++] = imm;
3077 }
3078 }
3079 }
3080
3081 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3082 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3083 aarch64_bitmasks_cmp);
3084 }
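
/* As an illustration: with element size e == 16, a run of s == 8
   ones and rotation r == 0 the loop above produces the 16-bit
   pattern 0x00ff, which replication turns into the bitmask
   immediate 0x00ff00ff00ff00ff; rotating the same run by r == 4
   gives 0xf00f and hence 0xf00ff00ff00ff00f.  */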
3085
3086
3087 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3088 a left shift of 0 or 12 bits. */
3089 bool
3090 aarch64_uimm12_shift (HOST_WIDE_INT val)
3091 {
3092 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3093 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3094 );
3095 }
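
/* For example, 0xabc (shift 0) and 0xabc000 (0xabc << 12) are both
   accepted, but 0xabc0 is rejected because its set bits do not fit
   entirely within either 12-bit field.  */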
3096
3097
3098 /* Return true if val is an immediate that can be loaded into a
3099 register by a MOVZ instruction. */
3100 static bool
3101 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3102 {
3103 if (GET_MODE_SIZE (mode) > 4)
3104 {
3105 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3106 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3107 return 1;
3108 }
3109 else
3110 {
3111 /* Ignore sign extension. */
3112 val &= (HOST_WIDE_INT) 0xffffffff;
3113 }
3114 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3115 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3116 }
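
/* As an illustration (register numbers arbitrary): 0x12340000 can be
   loaded with "movz w0, 0x1234, lsl 16", and for DImode
   0x0000567800000000 with "movz x0, 0x5678, lsl 32", but 0x12345678
   is rejected because its set bits span two 16-bit chunks.  */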
3117
3118
3119 /* Return true if val is a valid bitmask immediate. */
3120 bool
3121 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3122 {
3123 if (GET_MODE_SIZE (mode) < 8)
3124 {
3125 /* Replicate bit pattern. */
3126 val &= (HOST_WIDE_INT) 0xffffffff;
3127 val |= val << 32;
3128 }
3129 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3130 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3131 }
3132
3133
3134 /* Return true if val is an immediate that can be loaded into a
3135 register in a single instruction. */
3136 bool
3137 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3138 {
3139 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3140 return 1;
3141 return aarch64_bitmask_imm (val, mode);
3142 }
3143
3144 static bool
3145 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3146 {
3147 rtx base, offset;
3148
3149 if (GET_CODE (x) == HIGH)
3150 return true;
3151
3152 split_const (x, &base, &offset);
3153 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3154 {
3155 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3156 != SYMBOL_FORCE_TO_MEM)
3157 return true;
3158 else
3159 /* Avoid generating a 64-bit relocation in ILP32; leave it
3160 to aarch64_expand_mov_immediate to handle properly. */
3161 return mode != ptr_mode;
3162 }
3163
3164 return aarch64_tls_referenced_p (x);
3165 }
3166
3167 /* Return true if register REGNO is a valid index register.
3168 STRICT_P is true if REG_OK_STRICT is in effect. */
3169
3170 bool
3171 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3172 {
3173 if (!HARD_REGISTER_NUM_P (regno))
3174 {
3175 if (!strict_p)
3176 return true;
3177
3178 if (!reg_renumber)
3179 return false;
3180
3181 regno = reg_renumber[regno];
3182 }
3183 return GP_REGNUM_P (regno);
3184 }
3185
3186 /* Return true if register REGNO is a valid base register for mode MODE.
3187 STRICT_P is true if REG_OK_STRICT is in effect. */
3188
3189 bool
3190 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3191 {
3192 if (!HARD_REGISTER_NUM_P (regno))
3193 {
3194 if (!strict_p)
3195 return true;
3196
3197 if (!reg_renumber)
3198 return false;
3199
3200 regno = reg_renumber[regno];
3201 }
3202
3203 /* The fake registers will be eliminated to either the stack or
3204 hard frame pointer, both of which are usually valid base registers.
3205 Reload deals with the cases where the eliminated form isn't valid. */
3206 return (GP_REGNUM_P (regno)
3207 || regno == SP_REGNUM
3208 || regno == FRAME_POINTER_REGNUM
3209 || regno == ARG_POINTER_REGNUM);
3210 }
3211
3212 /* Return true if X is a valid base register for mode MODE.
3213 STRICT_P is true if REG_OK_STRICT is in effect. */
3214
3215 static bool
3216 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3217 {
3218 if (!strict_p && GET_CODE (x) == SUBREG)
3219 x = SUBREG_REG (x);
3220
3221 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3222 }
3223
3224 /* Return true if address offset is a valid index. If it is, fill in INFO
3225 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3226
3227 static bool
3228 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3229 machine_mode mode, bool strict_p)
3230 {
3231 enum aarch64_address_type type;
3232 rtx index;
3233 int shift;
3234
3235 /* (reg:P) */
3236 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3237 && GET_MODE (x) == Pmode)
3238 {
3239 type = ADDRESS_REG_REG;
3240 index = x;
3241 shift = 0;
3242 }
3243 /* (sign_extend:DI (reg:SI)) */
3244 else if ((GET_CODE (x) == SIGN_EXTEND
3245 || GET_CODE (x) == ZERO_EXTEND)
3246 && GET_MODE (x) == DImode
3247 && GET_MODE (XEXP (x, 0)) == SImode)
3248 {
3249 type = (GET_CODE (x) == SIGN_EXTEND)
3250 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3251 index = XEXP (x, 0);
3252 shift = 0;
3253 }
3254 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3255 else if (GET_CODE (x) == MULT
3256 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3257 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3258 && GET_MODE (XEXP (x, 0)) == DImode
3259 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3260 && CONST_INT_P (XEXP (x, 1)))
3261 {
3262 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3263 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3264 index = XEXP (XEXP (x, 0), 0);
3265 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3266 }
3267 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3268 else if (GET_CODE (x) == ASHIFT
3269 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3270 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3271 && GET_MODE (XEXP (x, 0)) == DImode
3272 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3273 && CONST_INT_P (XEXP (x, 1)))
3274 {
3275 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3276 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3277 index = XEXP (XEXP (x, 0), 0);
3278 shift = INTVAL (XEXP (x, 1));
3279 }
3280 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3281 else if ((GET_CODE (x) == SIGN_EXTRACT
3282 || GET_CODE (x) == ZERO_EXTRACT)
3283 && GET_MODE (x) == DImode
3284 && GET_CODE (XEXP (x, 0)) == MULT
3285 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3286 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3287 {
3288 type = (GET_CODE (x) == SIGN_EXTRACT)
3289 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3290 index = XEXP (XEXP (x, 0), 0);
3291 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3292 if (INTVAL (XEXP (x, 1)) != 32 + shift
3293 || INTVAL (XEXP (x, 2)) != 0)
3294 shift = -1;
3295 }
3296 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3297 (const_int 0xffffffff<<shift)) */
3298 else if (GET_CODE (x) == AND
3299 && GET_MODE (x) == DImode
3300 && GET_CODE (XEXP (x, 0)) == MULT
3301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3302 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3303 && CONST_INT_P (XEXP (x, 1)))
3304 {
3305 type = ADDRESS_REG_UXTW;
3306 index = XEXP (XEXP (x, 0), 0);
3307 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3308 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3309 shift = -1;
3310 }
3311 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3312 else if ((GET_CODE (x) == SIGN_EXTRACT
3313 || GET_CODE (x) == ZERO_EXTRACT)
3314 && GET_MODE (x) == DImode
3315 && GET_CODE (XEXP (x, 0)) == ASHIFT
3316 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3317 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3318 {
3319 type = (GET_CODE (x) == SIGN_EXTRACT)
3320 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3321 index = XEXP (XEXP (x, 0), 0);
3322 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3323 if (INTVAL (XEXP (x, 1)) != 32 + shift
3324 || INTVAL (XEXP (x, 2)) != 0)
3325 shift = -1;
3326 }
3327 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3328 (const_int 0xffffffff<<shift)) */
3329 else if (GET_CODE (x) == AND
3330 && GET_MODE (x) == DImode
3331 && GET_CODE (XEXP (x, 0)) == ASHIFT
3332 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3333 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3334 && CONST_INT_P (XEXP (x, 1)))
3335 {
3336 type = ADDRESS_REG_UXTW;
3337 index = XEXP (XEXP (x, 0), 0);
3338 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3339 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3340 shift = -1;
3341 }
3342 /* (mult:P (reg:P) (const_int scale)) */
3343 else if (GET_CODE (x) == MULT
3344 && GET_MODE (x) == Pmode
3345 && GET_MODE (XEXP (x, 0)) == Pmode
3346 && CONST_INT_P (XEXP (x, 1)))
3347 {
3348 type = ADDRESS_REG_REG;
3349 index = XEXP (x, 0);
3350 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3351 }
3352 /* (ashift:P (reg:P) (const_int shift)) */
3353 else if (GET_CODE (x) == ASHIFT
3354 && GET_MODE (x) == Pmode
3355 && GET_MODE (XEXP (x, 0)) == Pmode
3356 && CONST_INT_P (XEXP (x, 1)))
3357 {
3358 type = ADDRESS_REG_REG;
3359 index = XEXP (x, 0);
3360 shift = INTVAL (XEXP (x, 1));
3361 }
3362 else
3363 return false;
3364
3365 if (GET_CODE (index) == SUBREG)
3366 index = SUBREG_REG (index);
3367
3368 if ((shift == 0 ||
3369 (shift > 0 && shift <= 3
3370 && (1 << shift) == GET_MODE_SIZE (mode)))
3371 && REG_P (index)
3372 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3373 {
3374 info->type = type;
3375 info->offset = index;
3376 info->shift = shift;
3377 return true;
3378 }
3379
3380 return false;
3381 }
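
/* As an illustration (register numbers arbitrary): for a DImode
   access, the index (mult:DI (reg:DI x1) (const_int 8)) is
   classified as ADDRESS_REG_REG with shift == 3, matching an address
   such as [x0, x1, lsl 3], while (sign_extend:DI (reg:SI w1)) gives
   ADDRESS_REG_SXTW with shift == 0, i.e. [x0, w1, sxtw].  */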
3382
3383 bool
3384 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3385 {
3386 return (offset >= -64 * GET_MODE_SIZE (mode)
3387 && offset < 64 * GET_MODE_SIZE (mode)
3388 && offset % GET_MODE_SIZE (mode) == 0);
3389 }
3390
3391 static inline bool
3392 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3393 HOST_WIDE_INT offset)
3394 {
3395 return offset >= -256 && offset < 256;
3396 }
3397
3398 static inline bool
3399 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3400 {
3401 return (offset >= 0
3402 && offset < 4096 * GET_MODE_SIZE (mode)
3403 && offset % GET_MODE_SIZE (mode) == 0);
3404 }
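
/* Illustrative ranges for DImode (8-byte) accesses: the three
   predicates above accept, respectively, multiples of 8 in
   [-512, 504], any offset in [-256, 255], and multiples of 8 in
   [0, 32760] -- corresponding to the LDP/STP, LDUR/STUR and
   unsigned-offset LDR/STR addressing forms.  */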
3405
3406 /* Return true if X is a valid address for machine mode MODE. If it is,
3407 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3408 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3409
3410 static bool
3411 aarch64_classify_address (struct aarch64_address_info *info,
3412 rtx x, machine_mode mode,
3413 RTX_CODE outer_code, bool strict_p)
3414 {
3415 enum rtx_code code = GET_CODE (x);
3416 rtx op0, op1;
3417
3418 /* On BE, we use load/store pair for all large int mode load/stores. */
3419 bool load_store_pair_p = (outer_code == PARALLEL
3420 || (BYTES_BIG_ENDIAN
3421 && aarch64_vect_struct_mode_p (mode)));
3422
3423 bool allow_reg_index_p =
3424 !load_store_pair_p
3425 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3426 && !aarch64_vect_struct_mode_p (mode);
3427
3428 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3429 REG addressing. */
3430 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3431 && (code != POST_INC && code != REG))
3432 return false;
3433
3434 switch (code)
3435 {
3436 case REG:
3437 case SUBREG:
3438 info->type = ADDRESS_REG_IMM;
3439 info->base = x;
3440 info->offset = const0_rtx;
3441 return aarch64_base_register_rtx_p (x, strict_p);
3442
3443 case PLUS:
3444 op0 = XEXP (x, 0);
3445 op1 = XEXP (x, 1);
3446
3447 if (! strict_p
3448 && REG_P (op0)
3449 && (op0 == virtual_stack_vars_rtx
3450 || op0 == frame_pointer_rtx
3451 || op0 == arg_pointer_rtx)
3452 && CONST_INT_P (op1))
3453 {
3454 info->type = ADDRESS_REG_IMM;
3455 info->base = op0;
3456 info->offset = op1;
3457
3458 return true;
3459 }
3460
3461 if (GET_MODE_SIZE (mode) != 0
3462 && CONST_INT_P (op1)
3463 && aarch64_base_register_rtx_p (op0, strict_p))
3464 {
3465 HOST_WIDE_INT offset = INTVAL (op1);
3466
3467 info->type = ADDRESS_REG_IMM;
3468 info->base = op0;
3469 info->offset = op1;
3470
3471 /* TImode and TFmode values are allowed in both pairs of X
3472 registers and individual Q registers. The available
3473 address modes are:
3474 X,X: 7-bit signed scaled offset
3475 Q: 9-bit signed offset
3476 We conservatively require an offset representable in either mode.
3477 */
3478 if (mode == TImode || mode == TFmode)
3479 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3480 && offset_9bit_signed_unscaled_p (mode, offset));
3481
3482 /* A 7-bit offset check because OImode will emit an ldp/stp
3483 instruction (only big endian will get here).
3484 For ldp/stp instructions, the offset is scaled for the size of a
3485 single element of the pair. */
3486 if (mode == OImode)
3487 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3488
3489 /* Three 9/12-bit offset checks because CImode will emit three
3490 ldr/str instructions (only big endian will get here). */
3491 if (mode == CImode)
3492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3493 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3494 || offset_12bit_unsigned_scaled_p (V16QImode,
3495 offset + 32)));
3496
3497 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3498 instructions (only big endian will get here). */
3499 if (mode == XImode)
3500 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3501 && aarch64_offset_7bit_signed_scaled_p (TImode,
3502 offset + 32));
3503
3504 if (load_store_pair_p)
3505 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3506 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3507 else
3508 return (offset_9bit_signed_unscaled_p (mode, offset)
3509 || offset_12bit_unsigned_scaled_p (mode, offset));
3510 }
3511
3512 if (allow_reg_index_p)
3513 {
3514 /* Look for base + (scaled/extended) index register. */
3515 if (aarch64_base_register_rtx_p (op0, strict_p)
3516 && aarch64_classify_index (info, op1, mode, strict_p))
3517 {
3518 info->base = op0;
3519 return true;
3520 }
3521 if (aarch64_base_register_rtx_p (op1, strict_p)
3522 && aarch64_classify_index (info, op0, mode, strict_p))
3523 {
3524 info->base = op1;
3525 return true;
3526 }
3527 }
3528
3529 return false;
3530
3531 case POST_INC:
3532 case POST_DEC:
3533 case PRE_INC:
3534 case PRE_DEC:
3535 info->type = ADDRESS_REG_WB;
3536 info->base = XEXP (x, 0);
3537 info->offset = NULL_RTX;
3538 return aarch64_base_register_rtx_p (info->base, strict_p);
3539
3540 case POST_MODIFY:
3541 case PRE_MODIFY:
3542 info->type = ADDRESS_REG_WB;
3543 info->base = XEXP (x, 0);
3544 if (GET_CODE (XEXP (x, 1)) == PLUS
3545 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3546 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3547 && aarch64_base_register_rtx_p (info->base, strict_p))
3548 {
3549 HOST_WIDE_INT offset;
3550 info->offset = XEXP (XEXP (x, 1), 1);
3551 offset = INTVAL (info->offset);
3552
3553 /* TImode and TFmode values are allowed in both pairs of X
3554 registers and individual Q registers. The available
3555 address modes are:
3556 X,X: 7-bit signed scaled offset
3557 Q: 9-bit signed offset
3558 We conservatively require an offset representable in either mode.
3559 */
3560 if (mode == TImode || mode == TFmode)
3561 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3562 && offset_9bit_signed_unscaled_p (mode, offset));
3563
3564 if (load_store_pair_p)
3565 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3566 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3567 else
3568 return offset_9bit_signed_unscaled_p (mode, offset);
3569 }
3570 return false;
3571
3572 case CONST:
3573 case SYMBOL_REF:
3574 case LABEL_REF:
3575 /* load literal: pc-relative constant pool entry. Only supported
3576 for SI mode or larger. */
3577 info->type = ADDRESS_SYMBOLIC;
3578
3579 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3580 {
3581 rtx sym, addend;
3582
3583 split_const (x, &sym, &addend);
3584 return (GET_CODE (sym) == LABEL_REF
3585 || (GET_CODE (sym) == SYMBOL_REF
3586 && CONSTANT_POOL_ADDRESS_P (sym)));
3587 }
3588 return false;
3589
3590 case LO_SUM:
3591 info->type = ADDRESS_LO_SUM;
3592 info->base = XEXP (x, 0);
3593 info->offset = XEXP (x, 1);
3594 if (allow_reg_index_p
3595 && aarch64_base_register_rtx_p (info->base, strict_p))
3596 {
3597 rtx sym, offs;
3598 split_const (info->offset, &sym, &offs);
3599 if (GET_CODE (sym) == SYMBOL_REF
3600 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3601 == SYMBOL_SMALL_ABSOLUTE))
3602 {
3603 /* The symbol and offset must be aligned to the access size. */
3604 unsigned int align;
3605 unsigned int ref_size;
3606
3607 if (CONSTANT_POOL_ADDRESS_P (sym))
3608 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3609 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3610 {
3611 tree exp = SYMBOL_REF_DECL (sym);
3612 align = TYPE_ALIGN (TREE_TYPE (exp));
3613 align = CONSTANT_ALIGNMENT (exp, align);
3614 }
3615 else if (SYMBOL_REF_DECL (sym))
3616 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3617 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3618 && SYMBOL_REF_BLOCK (sym) != NULL)
3619 align = SYMBOL_REF_BLOCK (sym)->alignment;
3620 else
3621 align = BITS_PER_UNIT;
3622
3623 ref_size = GET_MODE_SIZE (mode);
3624 if (ref_size == 0)
3625 ref_size = GET_MODE_SIZE (DImode);
3626
3627 return ((INTVAL (offs) & (ref_size - 1)) == 0
3628 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3629 }
3630 }
3631 return false;
3632
3633 default:
3634 return false;
3635 }
3636 }
3637
3638 bool
3639 aarch64_symbolic_address_p (rtx x)
3640 {
3641 rtx offset;
3642
3643 split_const (x, &x, &offset);
3644 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3645 }
3646
3647 /* Classify the base of symbolic expression X, given that X appears in
3648 context CONTEXT. */
3649
3650 enum aarch64_symbol_type
3651 aarch64_classify_symbolic_expression (rtx x,
3652 enum aarch64_symbol_context context)
3653 {
3654 rtx offset;
3655
3656 split_const (x, &x, &offset);
3657 return aarch64_classify_symbol (x, offset, context);
3658 }
3659
3660
3661 /* Return TRUE if X is a legitimate address for accessing memory in
3662 mode MODE. */
3663 static bool
3664 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3665 {
3666 struct aarch64_address_info addr;
3667
3668 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3669 }
3670
3671 /* Return TRUE if X is a legitimate address for accessing memory in
3672 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3673 pair operation. */
3674 bool
3675 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3676 RTX_CODE outer_code, bool strict_p)
3677 {
3678 struct aarch64_address_info addr;
3679
3680 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3681 }
3682
3683 /* Return TRUE if rtx X is the immediate constant 0.0. */
3684 bool
3685 aarch64_float_const_zero_rtx_p (rtx x)
3686 {
3687 REAL_VALUE_TYPE r;
3688
3689 if (GET_MODE (x) == VOIDmode)
3690 return false;
3691
3692 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3693 if (REAL_VALUE_MINUS_ZERO (r))
3694 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3695 return REAL_VALUES_EQUAL (r, dconst0);
3696 }
3697
3698 /* Return the fixed registers used for condition codes. */
3699
3700 static bool
3701 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3702 {
3703 *p1 = CC_REGNUM;
3704 *p2 = INVALID_REGNUM;
3705 return true;
3706 }
3707
3708 /* Emit call insn with PAT and do aarch64-specific handling. */
3709
3710 void
3711 aarch64_emit_call_insn (rtx pat)
3712 {
3713 rtx insn = emit_call_insn (pat);
3714
3715 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3716 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3717 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3718 }
3719
3720 machine_mode
3721 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3722 {
3723 /* All floating point compares return CCFP if it is an equality
3724 comparison, and CCFPE otherwise. */
3725 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3726 {
3727 switch (code)
3728 {
3729 case EQ:
3730 case NE:
3731 case UNORDERED:
3732 case ORDERED:
3733 case UNLT:
3734 case UNLE:
3735 case UNGT:
3736 case UNGE:
3737 case UNEQ:
3738 case LTGT:
3739 return CCFPmode;
3740
3741 case LT:
3742 case LE:
3743 case GT:
3744 case GE:
3745 return CCFPEmode;
3746
3747 default:
3748 gcc_unreachable ();
3749 }
3750 }
3751
3752 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3753 && y == const0_rtx
3754 && (code == EQ || code == NE || code == LT || code == GE)
3755 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3756 || GET_CODE (x) == NEG))
3757 return CC_NZmode;
3758
3759 /* A compare with a shifted operand. Because of canonicalization,
3760 the comparison will have to be swapped when we emit the assembly
3761 code. */
3762 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3763 && (REG_P (y) || GET_CODE (y) == SUBREG)
3764 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3765 || GET_CODE (x) == LSHIFTRT
3766 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3767 return CC_SWPmode;
3768
3769 /* Similarly for a negated operand, but we can only do this for
3770 equalities. */
3771 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3772 && (REG_P (y) || GET_CODE (y) == SUBREG)
3773 && (code == EQ || code == NE)
3774 && GET_CODE (x) == NEG)
3775 return CC_Zmode;
3776
3777 /* A compare of a mode narrower than SI mode against zero can be done
3778 by extending the value in the comparison. */
3779 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3780 && y == const0_rtx)
3781 /* Only use sign-extension if we really need it. */
3782 return ((code == GT || code == GE || code == LE || code == LT)
3783 ? CC_SESWPmode : CC_ZESWPmode);
3784
3785 /* For everything else, return CCmode. */
3786 return CCmode;
3787 }
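
/* As an illustration: comparing (plus:DI x y) against zero with EQ
   selects CC_NZmode, since an ADDS/SUBS already sets the N and Z
   flags, while a comparison whose first operand is a shifted value,
   e.g. (compare (ashift:DI x (const_int 2)) (reg:DI y)), selects
   CC_SWPmode because the operands must be swapped when the actual
   compare instruction is emitted.  */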
3788
3789 static int
3790 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3791
3792 int
3793 aarch64_get_condition_code (rtx x)
3794 {
3795 machine_mode mode = GET_MODE (XEXP (x, 0));
3796 enum rtx_code comp_code = GET_CODE (x);
3797
3798 if (GET_MODE_CLASS (mode) != MODE_CC)
3799 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3800 return aarch64_get_condition_code_1 (mode, comp_code);
3801 }
3802
3803 static int
3804 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3805 {
3806 int ne = -1, eq = -1;
3807 switch (mode)
3808 {
3809 case CCFPmode:
3810 case CCFPEmode:
3811 switch (comp_code)
3812 {
3813 case GE: return AARCH64_GE;
3814 case GT: return AARCH64_GT;
3815 case LE: return AARCH64_LS;
3816 case LT: return AARCH64_MI;
3817 case NE: return AARCH64_NE;
3818 case EQ: return AARCH64_EQ;
3819 case ORDERED: return AARCH64_VC;
3820 case UNORDERED: return AARCH64_VS;
3821 case UNLT: return AARCH64_LT;
3822 case UNLE: return AARCH64_LE;
3823 case UNGT: return AARCH64_HI;
3824 case UNGE: return AARCH64_PL;
3825 default: return -1;
3826 }
3827 break;
3828
3829 case CC_DNEmode:
3830 ne = AARCH64_NE;
3831 eq = AARCH64_EQ;
3832 break;
3833
3834 case CC_DEQmode:
3835 ne = AARCH64_EQ;
3836 eq = AARCH64_NE;
3837 break;
3838
3839 case CC_DGEmode:
3840 ne = AARCH64_GE;
3841 eq = AARCH64_LT;
3842 break;
3843
3844 case CC_DLTmode:
3845 ne = AARCH64_LT;
3846 eq = AARCH64_GE;
3847 break;
3848
3849 case CC_DGTmode:
3850 ne = AARCH64_GT;
3851 eq = AARCH64_LE;
3852 break;
3853
3854 case CC_DLEmode:
3855 ne = AARCH64_LE;
3856 eq = AARCH64_GT;
3857 break;
3858
3859 case CC_DGEUmode:
3860 ne = AARCH64_CS;
3861 eq = AARCH64_CC;
3862 break;
3863
3864 case CC_DLTUmode:
3865 ne = AARCH64_CC;
3866 eq = AARCH64_CS;
3867 break;
3868
3869 case CC_DGTUmode:
3870 ne = AARCH64_HI;
3871 eq = AARCH64_LS;
3872 break;
3873
3874 case CC_DLEUmode:
3875 ne = AARCH64_LS;
3876 eq = AARCH64_HI;
3877 break;
3878
3879 case CCmode:
3880 switch (comp_code)
3881 {
3882 case NE: return AARCH64_NE;
3883 case EQ: return AARCH64_EQ;
3884 case GE: return AARCH64_GE;
3885 case GT: return AARCH64_GT;
3886 case LE: return AARCH64_LE;
3887 case LT: return AARCH64_LT;
3888 case GEU: return AARCH64_CS;
3889 case GTU: return AARCH64_HI;
3890 case LEU: return AARCH64_LS;
3891 case LTU: return AARCH64_CC;
3892 default: return -1;
3893 }
3894 break;
3895
3896 case CC_SWPmode:
3897 case CC_ZESWPmode:
3898 case CC_SESWPmode:
3899 switch (comp_code)
3900 {
3901 case NE: return AARCH64_NE;
3902 case EQ: return AARCH64_EQ;
3903 case GE: return AARCH64_LE;
3904 case GT: return AARCH64_LT;
3905 case LE: return AARCH64_GE;
3906 case LT: return AARCH64_GT;
3907 case GEU: return AARCH64_LS;
3908 case GTU: return AARCH64_CC;
3909 case LEU: return AARCH64_CS;
3910 case LTU: return AARCH64_HI;
3911 default: return -1;
3912 }
3913 break;
3914
3915 case CC_NZmode:
3916 switch (comp_code)
3917 {
3918 case NE: return AARCH64_NE;
3919 case EQ: return AARCH64_EQ;
3920 case GE: return AARCH64_PL;
3921 case LT: return AARCH64_MI;
3922 default: return -1;
3923 }
3924 break;
3925
3926 case CC_Zmode:
3927 switch (comp_code)
3928 {
3929 case NE: return AARCH64_NE;
3930 case EQ: return AARCH64_EQ;
3931 default: return -1;
3932 }
3933 break;
3934
3935 default:
3936 return -1;
3937 break;
3938 }
3939
3940 if (comp_code == NE)
3941 return ne;
3942
3943 if (comp_code == EQ)
3944 return eq;
3945
3946 return -1;
3947 }
3948
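/* Return true if X is a CONST_VECTOR of integer mode whose elements are all
   equal and lie within [MINVAL, MAXVAL].  */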
3949 bool
3950 aarch64_const_vec_all_same_in_range_p (rtx x,
3951 HOST_WIDE_INT minval,
3952 HOST_WIDE_INT maxval)
3953 {
3954 HOST_WIDE_INT firstval;
3955 int count, i;
3956
3957 if (GET_CODE (x) != CONST_VECTOR
3958 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3959 return false;
3960
3961 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3962 if (firstval < minval || firstval > maxval)
3963 return false;
3964
3965 count = CONST_VECTOR_NUNITS (x);
3966 for (i = 1; i < count; i++)
3967 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3968 return false;
3969
3970 return true;
3971 }
3972
3973 bool
3974 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3975 {
3976 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3977 }
3978
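/* Return the number of bits set in VALUE.  Each iteration of
   value &= value - 1 clears the lowest set bit, so e.g. 0xf00f takes eight
   iterations.  */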
3979 static unsigned
3980 bit_count (unsigned HOST_WIDE_INT value)
3981 {
3982 unsigned count = 0;
3983
3984 while (value)
3985 {
3986 count++;
3987 value &= value - 1;
3988 }
3989
3990 return count;
3991 }
3992
3993 /* N Z C V. */
3994 #define AARCH64_CC_V 1
3995 #define AARCH64_CC_C (1 << 1)
3996 #define AARCH64_CC_Z (1 << 2)
3997 #define AARCH64_CC_N (1 << 3)
3998
3999 /* N Z C V flags for ccmp. The first value of each pair is used when the
4000 combining op is AND, the second when it is IOR. Indexed by AARCH64_COND_CODE. */
4001 static const int aarch64_nzcv_codes[][2] =
4002 {
4003 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4004 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4005 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4006 {0, AARCH64_CC_C}, /* CC, C == 0. */
4007 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4008 {0, AARCH64_CC_N}, /* PL, N == 0. */
4009 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4010 {0, AARCH64_CC_V}, /* VC, V == 0. */
4011 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4012 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4013 {0, AARCH64_CC_V}, /* GE, N == V. */
4014 {AARCH64_CC_V, 0}, /* LT, N != V. */
4015 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4016 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4017 {0, 0}, /* AL, Any. */
4018 {0, 0}, /* NV, Any. */
4019 };
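/* The 'K' and 'k' operand modifiers below print column 0 (the AND entry) and
   column 1 (the IOR entry) of this table respectively, giving the NZCV
   immediate for a conditional compare, i.e. the flag values that take effect
   when the CCMP condition does not hold.  */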
4020
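/* Return the rtx comparison code tested by conditional-compare mode MODE.  */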
4021 int
4022 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4023 {
4024 switch (mode)
4025 {
4026 case CC_DNEmode:
4027 return NE;
4028
4029 case CC_DEQmode:
4030 return EQ;
4031
4032 case CC_DLEmode:
4033 return LE;
4034
4035 case CC_DGTmode:
4036 return GT;
4037
4038 case CC_DLTmode:
4039 return LT;
4040
4041 case CC_DGEmode:
4042 return GE;
4043
4044 case CC_DLEUmode:
4045 return LEU;
4046
4047 case CC_DGTUmode:
4048 return GTU;
4049
4050 case CC_DLTUmode:
4051 return LTU;
4052
4053 case CC_DGEUmode:
4054 return GEU;
4055
4056 default:
4057 gcc_unreachable ();
4058 }
4059 }
4060
4061
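/* Print operand X to file F, applying the operand modifier CODE (the
   character following '%' in an output template).  CODE == 0 prints the
   operand in its default form.  */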
4062 void
4063 aarch64_print_operand (FILE *f, rtx x, char code)
4064 {
4065 switch (code)
4066 {
4067 /* An integer or symbol address without a preceding # sign. */
4068 case 'c':
4069 switch (GET_CODE (x))
4070 {
4071 case CONST_INT:
4072 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4073 break;
4074
4075 case SYMBOL_REF:
4076 output_addr_const (f, x);
4077 break;
4078
4079 case CONST:
4080 if (GET_CODE (XEXP (x, 0)) == PLUS
4081 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4082 {
4083 output_addr_const (f, x);
4084 break;
4085 }
4086 /* Fall through. */
4087
4088 default:
4089 output_operand_lossage ("unsupported operand for code '%c'", code);
4090 }
4091 break;
4092
4093 case 'e':
4094 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4095 {
4096 int n;
4097
4098 if (!CONST_INT_P (x)
4099 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4100 {
4101 output_operand_lossage ("invalid operand for '%%%c'", code);
4102 return;
4103 }
4104
4105 switch (n)
4106 {
4107 case 3:
4108 fputc ('b', f);
4109 break;
4110 case 4:
4111 fputc ('h', f);
4112 break;
4113 case 5:
4114 fputc ('w', f);
4115 break;
4116 default:
4117 output_operand_lossage ("invalid operand for '%%%c'", code);
4118 return;
4119 }
4120 }
4121 break;
4122
4123 case 'p':
4124 {
4125 int n;
4126
4127 /* Print N such that 2^N == X. */
4128 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4129 {
4130 output_operand_lossage ("invalid operand for '%%%c'", code);
4131 return;
4132 }
4133
4134 asm_fprintf (f, "%d", n);
4135 }
4136 break;
4137
4138 case 'P':
4139 /* Print the number of non-zero bits in X (a const_int). */
4140 if (!CONST_INT_P (x))
4141 {
4142 output_operand_lossage ("invalid operand for '%%%c'", code);
4143 return;
4144 }
4145
4146 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4147 break;
4148
4149 case 'H':
4150 /* Print the higher numbered register of a pair (TImode) of regs. */
4151 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4152 {
4153 output_operand_lossage ("invalid operand for '%%%c'", code);
4154 return;
4155 }
4156
4157 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4158 break;
4159
4160 case 'm':
4161 {
4162 int cond_code;
4163 /* Print a condition (eq, ne, etc). */
4164
4165 /* CONST_TRUE_RTX means always -- that's the default. */
4166 if (x == const_true_rtx)
4167 return;
4168
4169 if (!COMPARISON_P (x))
4170 {
4171 output_operand_lossage ("invalid operand for '%%%c'", code);
4172 return;
4173 }
4174
4175 cond_code = aarch64_get_condition_code (x);
4176 gcc_assert (cond_code >= 0);
4177 fputs (aarch64_condition_codes[cond_code], f);
4178 }
4179 break;
4180
4181 case 'M':
4182 {
4183 int cond_code;
4184 /* Print the inverse of a condition (eq <-> ne, etc). */
4185
4186 /* CONST_TRUE_RTX means never -- that's the default. */
4187 if (x == const_true_rtx)
4188 {
4189 fputs ("nv", f);
4190 return;
4191 }
4192
4193 if (!COMPARISON_P (x))
4194 {
4195 output_operand_lossage ("invalid operand for '%%%c'", code);
4196 return;
4197 }
4198 cond_code = aarch64_get_condition_code (x);
4199 gcc_assert (cond_code >= 0);
4200 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4201 (cond_code)], f);
4202 }
4203 break;
4204
4205 case 'b':
4206 case 'h':
4207 case 's':
4208 case 'd':
4209 case 'q':
4210 /* Print a scalar FP/SIMD register name. */
4211 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4212 {
4213 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4214 return;
4215 }
4216 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4217 break;
4218
4219 case 'S':
4220 case 'T':
4221 case 'U':
4222 case 'V':
4223 /* Print the first FP/SIMD register name in a list. */
4224 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4225 {
4226 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4227 return;
4228 }
4229 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4230 break;
4231
4232 case 'R':
4233 /* Print a scalar FP/SIMD register name + 1. */
4234 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4235 {
4236 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4237 return;
4238 }
4239 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4240 break;
4241
4242 case 'X':
4243 /* Print bottom 16 bits of integer constant in hex. */
4244 if (!CONST_INT_P (x))
4245 {
4246 output_operand_lossage ("invalid operand for '%%%c'", code);
4247 return;
4248 }
4249 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4250 break;
4251
4252 case 'w':
4253 case 'x':
4254 /* Print a general register name or the zero register (32-bit or
4255 64-bit). */
4256 if (x == const0_rtx
4257 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4258 {
4259 asm_fprintf (f, "%czr", code);
4260 break;
4261 }
4262
4263 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4264 {
4265 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4266 break;
4267 }
4268
4269 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4270 {
4271 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4272 break;
4273 }
4274
4275 /* Fall through */
4276
4277 case 0:
4278 /* Print a normal operand, if it's a general register, then we
4279 assume DImode. */
4280 if (x == NULL)
4281 {
4282 output_operand_lossage ("missing operand");
4283 return;
4284 }
4285
4286 switch (GET_CODE (x))
4287 {
4288 case REG:
4289 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4290 break;
4291
4292 case MEM:
4293 aarch64_memory_reference_mode = GET_MODE (x);
4294 output_address (XEXP (x, 0));
4295 break;
4296
4297 case LABEL_REF:
4298 case SYMBOL_REF:
4299 output_addr_const (asm_out_file, x);
4300 break;
4301
4302 case CONST_INT:
4303 asm_fprintf (f, "%wd", INTVAL (x));
4304 break;
4305
4306 case CONST_VECTOR:
4307 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4308 {
4309 gcc_assert (
4310 aarch64_const_vec_all_same_in_range_p (x,
4311 HOST_WIDE_INT_MIN,
4312 HOST_WIDE_INT_MAX));
4313 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4314 }
4315 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4316 {
4317 fputc ('0', f);
4318 }
4319 else
4320 gcc_unreachable ();
4321 break;
4322
4323 case CONST_DOUBLE:
4324 /* CONST_DOUBLE can represent a double-width integer.
4325 In this case, the mode of x is VOIDmode. */
4326 if (GET_MODE (x) == VOIDmode)
4327 ; /* Do Nothing. */
4328 else if (aarch64_float_const_zero_rtx_p (x))
4329 {
4330 fputc ('0', f);
4331 break;
4332 }
4333 else if (aarch64_float_const_representable_p (x))
4334 {
4335 #define buf_size 20
4336 char float_buf[buf_size] = {'\0'};
4337 REAL_VALUE_TYPE r;
4338 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4339 real_to_decimal_for_mode (float_buf, &r,
4340 buf_size, buf_size,
4341 1, GET_MODE (x));
4342 asm_fprintf (asm_out_file, "%s", float_buf);
4343 break;
4344 #undef buf_size
4345 }
4346 output_operand_lossage ("invalid constant");
4347 return;
4348 default:
4349 output_operand_lossage ("invalid operand");
4350 return;
4351 }
4352 break;
4353
4354 case 'A':
4355 if (GET_CODE (x) == HIGH)
4356 x = XEXP (x, 0);
4357
4358 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4359 {
4360 case SYMBOL_SMALL_GOT:
4361 asm_fprintf (asm_out_file, ":got:");
4362 break;
4363
4364 case SYMBOL_SMALL_TLSGD:
4365 asm_fprintf (asm_out_file, ":tlsgd:");
4366 break;
4367
4368 case SYMBOL_SMALL_TLSDESC:
4369 asm_fprintf (asm_out_file, ":tlsdesc:");
4370 break;
4371
4372 case SYMBOL_SMALL_GOTTPREL:
4373 asm_fprintf (asm_out_file, ":gottprel:");
4374 break;
4375
4376 case SYMBOL_SMALL_TPREL:
4377 asm_fprintf (asm_out_file, ":tprel:");
4378 break;
4379
4380 case SYMBOL_TINY_GOT:
4381 gcc_unreachable ();
4382 break;
4383
4384 default:
4385 break;
4386 }
4387 output_addr_const (asm_out_file, x);
4388 break;
4389
4390 case 'L':
4391 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4392 {
4393 case SYMBOL_SMALL_GOT:
4394 asm_fprintf (asm_out_file, ":lo12:");
4395 break;
4396
4397 case SYMBOL_SMALL_TLSGD:
4398 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4399 break;
4400
4401 case SYMBOL_SMALL_TLSDESC:
4402 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4403 break;
4404
4405 case SYMBOL_SMALL_GOTTPREL:
4406 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4407 break;
4408
4409 case SYMBOL_SMALL_TPREL:
4410 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4411 break;
4412
4413 case SYMBOL_TINY_GOT:
4414 asm_fprintf (asm_out_file, ":got:");
4415 break;
4416
4417 default:
4418 break;
4419 }
4420 output_addr_const (asm_out_file, x);
4421 break;
4422
4423 case 'G':
4424
4425 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4426 {
4427 case SYMBOL_SMALL_TPREL:
4428 asm_fprintf (asm_out_file, ":tprel_hi12:");
4429 break;
4430 default:
4431 break;
4432 }
4433 output_addr_const (asm_out_file, x);
4434 break;
4435
4436 case 'K':
4437 {
4438 int cond_code;
4439 /* Print nzcv. */
4440
4441 if (!COMPARISON_P (x))
4442 {
4443 output_operand_lossage ("invalid operand for '%%%c'", code);
4444 return;
4445 }
4446
4447 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4448 gcc_assert (cond_code >= 0);
4449 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4450 }
4451 break;
4452
4453 case 'k':
4454 {
4455 int cond_code;
4456 /* Print nzcv. */
4457
4458 if (!COMPARISON_P (x))
4459 {
4460 output_operand_lossage ("invalid operand for '%%%c'", code);
4461 return;
4462 }
4463
4464 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4465 gcc_assert (cond_code >= 0);
4466 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4467 }
4468 break;
4469
4470 default:
4471 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4472 return;
4473 }
4474 }
4475
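/* Print to F the address rtx X of a memory operand.  For the writeback
   forms the access size comes from aarch64_memory_reference_mode, which was
   recorded when the enclosing MEM was printed.  */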
4476 void
4477 aarch64_print_operand_address (FILE *f, rtx x)
4478 {
4479 struct aarch64_address_info addr;
4480
4481 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4482 MEM, true))
4483 switch (addr.type)
4484 {
4485 case ADDRESS_REG_IMM:
4486 if (addr.offset == const0_rtx)
4487 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4488 else
4489 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4490 INTVAL (addr.offset));
4491 return;
4492
4493 case ADDRESS_REG_REG:
4494 if (addr.shift == 0)
4495 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4496 reg_names [REGNO (addr.offset)]);
4497 else
4498 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4499 reg_names [REGNO (addr.offset)], addr.shift);
4500 return;
4501
4502 case ADDRESS_REG_UXTW:
4503 if (addr.shift == 0)
4504 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4505 REGNO (addr.offset) - R0_REGNUM);
4506 else
4507 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4508 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4509 return;
4510
4511 case ADDRESS_REG_SXTW:
4512 if (addr.shift == 0)
4513 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4514 REGNO (addr.offset) - R0_REGNUM);
4515 else
4516 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4517 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4518 return;
4519
4520 case ADDRESS_REG_WB:
4521 switch (GET_CODE (x))
4522 {
4523 case PRE_INC:
4524 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4525 GET_MODE_SIZE (aarch64_memory_reference_mode));
4526 return;
4527 case POST_INC:
4528 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4529 GET_MODE_SIZE (aarch64_memory_reference_mode));
4530 return;
4531 case PRE_DEC:
4532 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4533 GET_MODE_SIZE (aarch64_memory_reference_mode));
4534 return;
4535 case POST_DEC:
4536 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4537 GET_MODE_SIZE (aarch64_memory_reference_mode));
4538 return;
4539 case PRE_MODIFY:
4540 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4541 INTVAL (addr.offset));
4542 return;
4543 case POST_MODIFY:
4544 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4545 INTVAL (addr.offset));
4546 return;
4547 default:
4548 break;
4549 }
4550 break;
4551
4552 case ADDRESS_LO_SUM:
4553 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4554 output_addr_const (f, addr.offset);
4555 asm_fprintf (f, "]");
4556 return;
4557
4558 case ADDRESS_SYMBOLIC:
4559 break;
4560 }
4561
4562 output_addr_const (f, x);
4563 }
4564
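/* Return true if X or any of its sub-rtxes contains a LABEL_REF, ignoring
   the LABEL_REFs inside UNSPEC_TLS, which are really constant offsets.  */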
4565 bool
4566 aarch64_label_mentioned_p (rtx x)
4567 {
4568 const char *fmt;
4569 int i;
4570
4571 if (GET_CODE (x) == LABEL_REF)
4572 return true;
4573
4574 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4575 referencing instruction, but they are constant offsets, not
4576 symbols. */
4577 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4578 return false;
4579
4580 fmt = GET_RTX_FORMAT (GET_CODE (x));
4581 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4582 {
4583 if (fmt[i] == 'E')
4584 {
4585 int j;
4586
4587 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4588 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4589 return 1;
4590 }
4591 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4592 return 1;
4593 }
4594
4595 return 0;
4596 }
4597
4598 /* Implement REGNO_REG_CLASS. */
4599
4600 enum reg_class
4601 aarch64_regno_regclass (unsigned regno)
4602 {
4603 if (GP_REGNUM_P (regno))
4604 return GENERAL_REGS;
4605
4606 if (regno == SP_REGNUM)
4607 return STACK_REG;
4608
4609 if (regno == FRAME_POINTER_REGNUM
4610 || regno == ARG_POINTER_REGNUM)
4611 return POINTER_REGS;
4612
4613 if (FP_REGNUM_P (regno))
4614 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4615
4616 return NO_REGS;
4617 }
4618
4619 static rtx
4620 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4621 {
4622 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4623 where mask is selected by alignment and size of the offset.
4624 We try to pick as large a range for the offset as possible to
4625 maximize the chance of a CSE. However, for aligned addresses
4626 we limit the range to 4k so that structures with different sized
4627 elements are likely to use the same base. */
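/* For example, an SImode access at X + 0x3458 is rewritten below as
   (X + 0x3000) + 0x458, so that other accesses near the same base can reuse
   the X + 0x3000 temporary.  */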
4628
4629 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4630 {
4631 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4632 HOST_WIDE_INT base_offset;
4633
4634 /* Does it look like we'll need a load/store-pair operation? */
4635 if (GET_MODE_SIZE (mode) > 16
4636 || mode == TImode)
4637 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4638 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4639 /* For offsets that aren't a multiple of the access size, the limit is
4640 -256...255. */
4641 else if (offset & (GET_MODE_SIZE (mode) - 1))
4642 base_offset = (offset + 0x100) & ~0x1ff;
4643 else
4644 base_offset = offset & ~0xfff;
4645
4646 if (base_offset == 0)
4647 return x;
4648
4649 offset -= base_offset;
4650 rtx base_reg = gen_reg_rtx (Pmode);
4651 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4652 NULL_RTX);
4653 emit_move_insn (base_reg, val);
4654 x = plus_constant (Pmode, base_reg, offset);
4655 }
4656
4657 return x;
4658 }
4659
4660 /* Try a machine-dependent way of reloading an illegitimate address
4661 operand. If we find one, push the reload and return the new rtx. */
4662
4663 rtx
4664 aarch64_legitimize_reload_address (rtx *x_p,
4665 machine_mode mode,
4666 int opnum, int type,
4667 int ind_levels ATTRIBUTE_UNUSED)
4668 {
4669 rtx x = *x_p;
4670
4671 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4672 if (aarch64_vect_struct_mode_p (mode)
4673 && GET_CODE (x) == PLUS
4674 && REG_P (XEXP (x, 0))
4675 && CONST_INT_P (XEXP (x, 1)))
4676 {
4677 rtx orig_rtx = x;
4678 x = copy_rtx (x);
4679 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4680 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4681 opnum, (enum reload_type) type);
4682 return x;
4683 }
4684
4685 /* We must recognize output that we have already generated ourselves. */
4686 if (GET_CODE (x) == PLUS
4687 && GET_CODE (XEXP (x, 0)) == PLUS
4688 && REG_P (XEXP (XEXP (x, 0), 0))
4689 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4690 && CONST_INT_P (XEXP (x, 1)))
4691 {
4692 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4693 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4694 opnum, (enum reload_type) type);
4695 return x;
4696 }
4697
4698 /* We wish to handle large displacements off a base register by splitting
4699 the addend across an add and the mem insn. This can cut the number of
4700 extra insns needed from 3 to 1. It is only useful for load/store of a
4701 single register with 12 bit offset field. */
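/* For example, an SImode access at base + 0x12344 is split into
   (base + 0x12000) + 0x344: the 0x12000 part is reloaded into a base
   register (a single ADD, since it fits a shifted 12-bit immediate) while
   0x344 stays in the instruction's offset field.  */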
4702 if (GET_CODE (x) == PLUS
4703 && REG_P (XEXP (x, 0))
4704 && CONST_INT_P (XEXP (x, 1))
4705 && HARD_REGISTER_P (XEXP (x, 0))
4706 && mode != TImode
4707 && mode != TFmode
4708 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4709 {
4710 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4711 HOST_WIDE_INT low = val & 0xfff;
4712 HOST_WIDE_INT high = val - low;
4713 HOST_WIDE_INT offs;
4714 rtx cst;
4715 machine_mode xmode = GET_MODE (x);
4716
4717 /* In ILP32, xmode can be either DImode or SImode. */
4718 gcc_assert (xmode == DImode || xmode == SImode);
4719
4720 /* Punt on BLKmode (zero-size) accesses and leave the offset to the generic
4721 reload code, since we cannot ascertain BLKmode alignment. */
4722 if (GET_MODE_SIZE (mode) == 0)
4723 return NULL_RTX;
4724
4725 offs = low % GET_MODE_SIZE (mode);
4726
4727 /* Align misaligned offset by adjusting high part to compensate. */
4728 if (offs != 0)
4729 {
4730 if (aarch64_uimm12_shift (high + offs))
4731 {
4732 /* Align down. */
4733 low = low - offs;
4734 high = high + offs;
4735 }
4736 else
4737 {
4738 /* Align up. */
4739 offs = GET_MODE_SIZE (mode) - offs;
4740 low = low + offs;
4741 high = high + (low & 0x1000) - offs;
4742 low &= 0xfff;
4743 }
4744 }
4745
4746 /* Check for overflow. */
4747 if (high + low != val)
4748 return NULL_RTX;
4749
4750 cst = GEN_INT (high);
4751 if (!aarch64_uimm12_shift (high))
4752 cst = force_const_mem (xmode, cst);
4753
4754 /* Reload high part into base reg, leaving the low part
4755 in the mem instruction.
4756 Note that replacing this gen_rtx_PLUS with plus_constant is
4757 wrong in this case because we rely on the
4758 (plus (plus reg c1) c2) structure being preserved so that
4759 XEXP (*p, 0) in push_reload below uses the correct term. */
4760 x = gen_rtx_PLUS (xmode,
4761 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4762 GEN_INT (low));
4763
4764 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4765 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4766 opnum, (enum reload_type) type);
4767 return x;
4768 }
4769
4770 return NULL_RTX;
4771 }
4772
4773
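/* Return an intermediate register class required to copy X (of mode MODE)
   into or out of a register of class RCLASS, or NO_REGS if none is needed.
   For the Q-register copies handled below no class is returned; instead
   SRI->icode names a reload pattern that provides the required scratch.  */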
4774 static reg_class_t
4775 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4776 reg_class_t rclass,
4777 machine_mode mode,
4778 secondary_reload_info *sri)
4779 {
4780 /* Without the TARGET_SIMD instructions we cannot move a Q register
4781 to a Q register directly. We need a scratch. */
4782 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4783 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4784 && reg_class_subset_p (rclass, FP_REGS))
4785 {
4786 if (mode == TFmode)
4787 sri->icode = CODE_FOR_aarch64_reload_movtf;
4788 else if (mode == TImode)
4789 sri->icode = CODE_FOR_aarch64_reload_movti;
4790 return NO_REGS;
4791 }
4792
4793 /* A TFmode or TImode memory access should be handled via an FP_REGS
4794 because AArch64 has richer addressing modes for LDR/STR instructions
4795 than LDP/STP instructions. */
4796 if (TARGET_FLOAT && rclass == GENERAL_REGS
4797 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4798 return FP_REGS;
4799
4800 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4801 return GENERAL_REGS;
4802
4803 return NO_REGS;
4804 }
4805
4806 static bool
4807 aarch64_can_eliminate (const int from, const int to)
4808 {
4809 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4810 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4811
4812 if (frame_pointer_needed)
4813 {
4814 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4815 return true;
4816 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4817 return false;
4818 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4819 && !cfun->calls_alloca)
4820 return true;
4821 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4822 return true;
4823
4824 return false;
4825 }
4826 else
4827 {
4828 /* If we decided that we didn't need a leaf frame pointer but then used
4829 LR in the function, then we'll want a frame pointer after all, so
4830 prevent this elimination to ensure a frame pointer is used. */
4831 if (to == STACK_POINTER_REGNUM
4832 && flag_omit_leaf_frame_pointer
4833 && df_regs_ever_live_p (LR_REGNUM))
4834 return false;
4835 }
4836
4837 return true;
4838 }
4839
4840 HOST_WIDE_INT
4841 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4842 {
4843 aarch64_layout_frame ();
4844
4845 if (to == HARD_FRAME_POINTER_REGNUM)
4846 {
4847 if (from == ARG_POINTER_REGNUM)
4848 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4849
4850 if (from == FRAME_POINTER_REGNUM)
4851 return (cfun->machine->frame.hard_fp_offset
4852 - cfun->machine->frame.saved_varargs_size);
4853 }
4854
4855 if (to == STACK_POINTER_REGNUM)
4856 {
4857 if (from == FRAME_POINTER_REGNUM)
4858 return (cfun->machine->frame.frame_size
4859 - cfun->machine->frame.saved_varargs_size);
4860 }
4861
4862 return cfun->machine->frame.frame_size;
4863 }
4864
4865 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4866 previous frame. */
4867
4868 rtx
4869 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4870 {
4871 if (count != 0)
4872 return const0_rtx;
4873 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4874 }
4875
4876
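/* The trampoline laid out below is: two literal loads (the target address
   into IP1 and the static chain value into STATIC_CHAIN_REGNUM), a BR through
   IP1, 4 bytes of zero padding so the code portion is tramp_code_sz (16)
   bytes, and then two pointer-sized literal slots which
   aarch64_trampoline_init fills in.  */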
4877 static void
4878 aarch64_asm_trampoline_template (FILE *f)
4879 {
4880 if (TARGET_ILP32)
4881 {
4882 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4883 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4884 }
4885 else
4886 {
4887 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4888 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4889 }
4890 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4891 assemble_aligned_integer (4, const0_rtx);
4892 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4893 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4894 }
4895
4896 static void
4897 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4898 {
4899 rtx fnaddr, mem, a_tramp;
4900 const int tramp_code_sz = 16;
4901
4902 /* Don't need to copy the trailing D-words; we fill those in below. */
4903 emit_block_move (m_tramp, assemble_trampoline_template (),
4904 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4905 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4906 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4907 if (GET_MODE (fnaddr) != ptr_mode)
4908 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4909 emit_move_insn (mem, fnaddr);
4910
4911 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4912 emit_move_insn (mem, chain_value);
4913
4914 /* XXX We should really define a "clear_cache" pattern and use
4915 gen_clear_cache(). */
4916 a_tramp = XEXP (m_tramp, 0);
4917 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4918 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4919 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4920 ptr_mode);
4921 }
4922
4923 static unsigned char
4924 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4925 {
4926 switch (regclass)
4927 {
4928 case CALLER_SAVE_REGS:
4929 case POINTER_REGS:
4930 case GENERAL_REGS:
4931 case ALL_REGS:
4932 case FP_REGS:
4933 case FP_LO_REGS:
4934 return
4935 aarch64_vector_mode_p (mode)
4936 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
4937 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4938 case STACK_REG:
4939 return 1;
4940
4941 case NO_REGS:
4942 return 0;
4943
4944 default:
4945 break;
4946 }
4947 gcc_unreachable ();
4948 }
4949
4950 static reg_class_t
4951 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4952 {
4953 if (regclass == POINTER_REGS)
4954 return GENERAL_REGS;
4955
4956 if (regclass == STACK_REG)
4957 {
4958 if (REG_P (x)
4959 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4960 return regclass;
4961
4962 return NO_REGS;
4963 }
4964
4965 /* If it's an integer immediate that MOVI can't handle, then
4966 FP_REGS is not an option, so we return NO_REGS instead. */
4967 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4968 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4969 return NO_REGS;
4970
4971 /* Register elimination can result in a request for
4972 SP+constant->FP_REGS. We cannot support such operations, which
4973 use SP as source and an FP_REG as destination, so reject such
4974 requests outright. */
4975 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4976 {
4977 rtx lhs = XEXP (x, 0);
4978
4979 /* Look through a possible SUBREG introduced by ILP32. */
4980 if (GET_CODE (lhs) == SUBREG)
4981 lhs = SUBREG_REG (lhs);
4982
4983 gcc_assert (REG_P (lhs));
4984 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4985 POINTER_REGS));
4986 return NO_REGS;
4987 }
4988
4989 return regclass;
4990 }
4991
4992 void
4993 aarch64_asm_output_labelref (FILE* f, const char *name)
4994 {
4995 asm_fprintf (f, "%U%s", name);
4996 }
4997
4998 static void
4999 aarch64_elf_asm_constructor (rtx symbol, int priority)
5000 {
5001 if (priority == DEFAULT_INIT_PRIORITY)
5002 default_ctor_section_asm_out_constructor (symbol, priority);
5003 else
5004 {
5005 section *s;
5006 char buf[18];
5007 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5008 s = get_section (buf, SECTION_WRITE, NULL);
5009 switch_to_section (s);
5010 assemble_align (POINTER_SIZE);
5011 assemble_aligned_integer (POINTER_BYTES, symbol);
5012 }
5013 }
5014
5015 static void
5016 aarch64_elf_asm_destructor (rtx symbol, int priority)
5017 {
5018 if (priority == DEFAULT_INIT_PRIORITY)
5019 default_dtor_section_asm_out_destructor (symbol, priority);
5020 else
5021 {
5022 section *s;
5023 char buf[18];
5024 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5025 s = get_section (buf, SECTION_WRITE, NULL);
5026 switch_to_section (s);
5027 assemble_align (POINTER_SIZE);
5028 assemble_aligned_integer (POINTER_BYTES, symbol);
5029 }
5030 }
5031
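/* Output the dispatch sequence for a casesi jump table: load the table entry
   for index %1 from the table at %0, materialize the address of the label
   that follows the branch into %4, add the entry scaled by 4, and branch to
   the result.  */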
5032 const char*
5033 aarch64_output_casesi (rtx *operands)
5034 {
5035 char buf[100];
5036 char label[100];
5037 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5038 int index;
5039 static const char *const patterns[4][2] =
5040 {
5041 {
5042 "ldrb\t%w3, [%0,%w1,uxtw]",
5043 "add\t%3, %4, %w3, sxtb #2"
5044 },
5045 {
5046 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5047 "add\t%3, %4, %w3, sxth #2"
5048 },
5049 {
5050 "ldr\t%w3, [%0,%w1,uxtw #2]",
5051 "add\t%3, %4, %w3, sxtw #2"
5052 },
5053 /* We assume that DImode is only generated when not optimizing and
5054 that we don't really need 64-bit address offsets. That would
5055 imply an object file with 8GB of code in a single function! */
5056 {
5057 "ldr\t%w3, [%0,%w1,uxtw #2]",
5058 "add\t%3, %4, %w3, sxtw #2"
5059 }
5060 };
5061
5062 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5063
5064 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5065
5066 gcc_assert (index >= 0 && index <= 3);
5067
5068 /* Need to implement table size reduction, by changing the code below. */
5069 output_asm_insn (patterns[index][0], operands);
5070 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5071 snprintf (buf, sizeof (buf),
5072 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5073 output_asm_insn (buf, operands);
5074 output_asm_insn (patterns[index][1], operands);
5075 output_asm_insn ("br\t%3", operands);
5076 assemble_label (asm_out_file, label);
5077 return "";
5078 }
5079
5080
5081 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5082 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5083 operator. */
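/* For example, aarch64_uxt_size (2, 0x3fc) is 8, since 0x3fc == 0xff << 2
   and so matches a UXTB scaled by 4; a mask that is not an 8/16/32-bit
   all-ones value shifted left by 0..3 yields 0.  */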
5084
5085 int
5086 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5087 {
5088 if (shift >= 0 && shift <= 3)
5089 {
5090 int size;
5091 for (size = 8; size <= 32; size *= 2)
5092 {
5093 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5094 if (mask == bits << shift)
5095 return size;
5096 }
5097 }
5098 return 0;
5099 }
5100
5101 static bool
5102 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5103 const_rtx x ATTRIBUTE_UNUSED)
5104 {
5105 /* We can't use blocks for constants when we're using a per-function
5106 constant pool. */
5107 return false;
5108 }
5109
5110 static section *
5111 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5112 rtx x ATTRIBUTE_UNUSED,
5113 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5114 {
5115 /* Force all constant pool entries into the current function section. */
5116 return function_section (current_function_decl);
5117 }
5118
5119
5120 /* Costs. */
5121
5122 /* Helper function for rtx cost calculation. Strip a shift expression
5123 from X. Returns the inner operand if successful, or the original
5124 expression on failure. */
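/* E.g. both (ashift (reg X) (const_int 3)) and the equivalent canonical form
   (mult (reg X) (const_int 8)) strip to (reg X).  */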
5125 static rtx
5126 aarch64_strip_shift (rtx x)
5127 {
5128 rtx op = x;
5129
5130 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5131 we can convert both to ROR during final output. */
5132 if ((GET_CODE (op) == ASHIFT
5133 || GET_CODE (op) == ASHIFTRT
5134 || GET_CODE (op) == LSHIFTRT
5135 || GET_CODE (op) == ROTATERT
5136 || GET_CODE (op) == ROTATE)
5137 && CONST_INT_P (XEXP (op, 1)))
5138 return XEXP (op, 0);
5139
5140 if (GET_CODE (op) == MULT
5141 && CONST_INT_P (XEXP (op, 1))
5142 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5143 return XEXP (op, 0);
5144
5145 return x;
5146 }
5147
5148 /* Helper function for rtx cost calculation. Strip an extend
5149 expression from X. Returns the inner operand if successful, or the
5150 original expression on failure. We deal with a number of possible
5151 canonicalization variations here. */
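/* E.g. (zero_extend (reg X)) strips to (reg X), as does
   (ashift (sign_extend (reg X)) (const_int 2)), since the optional left shift
   by 1..4 is removed as well.  */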
5152 static rtx
5153 aarch64_strip_extend (rtx x)
5154 {
5155 rtx op = x;
5156
5157 /* Zero and sign extraction of a widened value. */
5158 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5159 && XEXP (op, 2) == const0_rtx
5160 && GET_CODE (XEXP (op, 0)) == MULT
5161 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5162 XEXP (op, 1)))
5163 return XEXP (XEXP (op, 0), 0);
5164
5165 /* It can also be represented (for zero-extend) as an AND with an
5166 immediate. */
5167 if (GET_CODE (op) == AND
5168 && GET_CODE (XEXP (op, 0)) == MULT
5169 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5170 && CONST_INT_P (XEXP (op, 1))
5171 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5172 INTVAL (XEXP (op, 1))) != 0)
5173 return XEXP (XEXP (op, 0), 0);
5174
5175 /* Now handle extended register, as this may also have an optional
5176 left shift by 1..4. */
5177 if (GET_CODE (op) == ASHIFT
5178 && CONST_INT_P (XEXP (op, 1))
5179 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5180 op = XEXP (op, 0);
5181
5182 if (GET_CODE (op) == ZERO_EXTEND
5183 || GET_CODE (op) == SIGN_EXTEND)
5184 op = XEXP (op, 0);
5185
5186 if (op != x)
5187 return op;
5188
5189 return x;
5190 }
5191
5192 /* Return true iff CODE is a shift supported in combination
5193 with arithmetic instructions. */
5194
5195 static bool
5196 aarch64_shift_p (enum rtx_code code)
5197 {
5198 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5199 }
5200
5201 /* Helper function for rtx cost calculation. Calculate the cost of
5202 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5203 Return the calculated cost of the expression, recursing manually in to
5204 operands where needed. */
5205
5206 static int
5207 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5208 {
5209 rtx op0, op1;
5210 const struct cpu_cost_table *extra_cost
5211 = aarch64_tune_params->insn_extra_cost;
5212 int cost = 0;
5213 bool compound_p = (outer == PLUS || outer == MINUS);
5214 machine_mode mode = GET_MODE (x);
5215
5216 gcc_checking_assert (code == MULT);
5217
5218 op0 = XEXP (x, 0);
5219 op1 = XEXP (x, 1);
5220
5221 if (VECTOR_MODE_P (mode))
5222 mode = GET_MODE_INNER (mode);
5223
5224 /* Integer multiply/fma. */
5225 if (GET_MODE_CLASS (mode) == MODE_INT)
5226 {
5227 /* The multiply will be canonicalized as a shift, cost it as such. */
5228 if (aarch64_shift_p (GET_CODE (x))
5229 || (CONST_INT_P (op1)
5230 && exact_log2 (INTVAL (op1)) > 0))
5231 {
5232 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5233 || GET_CODE (op0) == SIGN_EXTEND;
5234 if (speed)
5235 {
5236 if (compound_p)
5237 {
5238 if (REG_P (op1))
5239 /* ARITH + shift-by-register. */
5240 cost += extra_cost->alu.arith_shift_reg;
5241 else if (is_extend)
5242 /* ARITH + extended register. We don't have a cost field
5243 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5244 cost += extra_cost->alu.extend_arith;
5245 else
5246 /* ARITH + shift-by-immediate. */
5247 cost += extra_cost->alu.arith_shift;
5248 }
5249 else
5250 /* LSL (immediate). */
5251 cost += extra_cost->alu.shift;
5252
5253 }
5254 /* Strip extends as we will have costed them in the case above. */
5255 if (is_extend)
5256 op0 = aarch64_strip_extend (op0);
5257
5258 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5259
5260 return cost;
5261 }
5262
5263 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5264 compound and let the below cases handle it. After all, MNEG is a
5265 special-case alias of MSUB. */
5266 if (GET_CODE (op0) == NEG)
5267 {
5268 op0 = XEXP (op0, 0);
5269 compound_p = true;
5270 }
5271
5272 /* Integer multiplies or FMAs have zero/sign extending variants. */
5273 if ((GET_CODE (op0) == ZERO_EXTEND
5274 && GET_CODE (op1) == ZERO_EXTEND)
5275 || (GET_CODE (op0) == SIGN_EXTEND
5276 && GET_CODE (op1) == SIGN_EXTEND))
5277 {
5278 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5279 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5280
5281 if (speed)
5282 {
5283 if (compound_p)
5284 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5285 cost += extra_cost->mult[0].extend_add;
5286 else
5287 /* MUL/SMULL/UMULL. */
5288 cost += extra_cost->mult[0].extend;
5289 }
5290
5291 return cost;
5292 }
5293
5294 /* This is either an integer multiply or a MADD. In both cases
5295 we want to recurse and cost the operands. */
5296 cost += rtx_cost (op0, MULT, 0, speed)
5297 + rtx_cost (op1, MULT, 1, speed);
5298
5299 if (speed)
5300 {
5301 if (compound_p)
5302 /* MADD/MSUB. */
5303 cost += extra_cost->mult[mode == DImode].add;
5304 else
5305 /* MUL. */
5306 cost += extra_cost->mult[mode == DImode].simple;
5307 }
5308
5309 return cost;
5310 }
5311 else
5312 {
5313 if (speed)
5314 {
5315 /* Floating-point FMA/FMUL can also support negations of the
5316 operands. */
5317 if (GET_CODE (op0) == NEG)
5318 op0 = XEXP (op0, 0);
5319 if (GET_CODE (op1) == NEG)
5320 op1 = XEXP (op1, 0);
5321
5322 if (compound_p)
5323 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5324 cost += extra_cost->fp[mode == DFmode].fma;
5325 else
5326 /* FMUL/FNMUL. */
5327 cost += extra_cost->fp[mode == DFmode].mult;
5328 }
5329
5330 cost += rtx_cost (op0, MULT, 0, speed)
5331 + rtx_cost (op1, MULT, 1, speed);
5332 return cost;
5333 }
5334 }
5335
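/* Compute the cost of memory address X when used to access MODE, in the
   units of the per-CPU addr_cost tables.  */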
5336 static int
5337 aarch64_address_cost (rtx x,
5338 machine_mode mode,
5339 addr_space_t as ATTRIBUTE_UNUSED,
5340 bool speed)
5341 {
5342 enum rtx_code c = GET_CODE (x);
5343 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5344 struct aarch64_address_info info;
5345 int cost = 0;
5346 info.shift = 0;
5347
5348 if (!aarch64_classify_address (&info, x, mode, c, false))
5349 {
5350 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5351 {
5352 /* This is a CONST or SYMBOL ref which will be split
5353 in a different way depending on the code model in use.
5354 Cost it through the generic infrastructure. */
5355 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5356 /* Divide through by the cost of one instruction to
5357 bring it to the same units as the address costs. */
5358 cost_symbol_ref /= COSTS_N_INSNS (1);
5359 /* The cost is then the cost of preparing the address,
5360 followed by an immediate (possibly 0) offset. */
5361 return cost_symbol_ref + addr_cost->imm_offset;
5362 }
5363 else
5364 {
5365 /* This is most likely a jump table from a case
5366 statement. */
5367 return addr_cost->register_offset;
5368 }
5369 }
5370
5371 switch (info.type)
5372 {
5373 case ADDRESS_LO_SUM:
5374 case ADDRESS_SYMBOLIC:
5375 case ADDRESS_REG_IMM:
5376 cost += addr_cost->imm_offset;
5377 break;
5378
5379 case ADDRESS_REG_WB:
5380 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5381 cost += addr_cost->pre_modify;
5382 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5383 cost += addr_cost->post_modify;
5384 else
5385 gcc_unreachable ();
5386
5387 break;
5388
5389 case ADDRESS_REG_REG:
5390 cost += addr_cost->register_offset;
5391 break;
5392
5393 case ADDRESS_REG_UXTW:
5394 case ADDRESS_REG_SXTW:
5395 cost += addr_cost->register_extend;
5396 break;
5397
5398 default:
5399 gcc_unreachable ();
5400 }
5401
5402
5403 if (info.shift > 0)
5404 {
5405 /* For the sake of calculating the cost of the shifted register
5406 component, we can treat same sized modes in the same way. */
5407 switch (GET_MODE_BITSIZE (mode))
5408 {
5409 case 16:
5410 cost += addr_cost->addr_scale_costs.hi;
5411 break;
5412
5413 case 32:
5414 cost += addr_cost->addr_scale_costs.si;
5415 break;
5416
5417 case 64:
5418 cost += addr_cost->addr_scale_costs.di;
5419 break;
5420
5421 /* We can't tell, or this is a 128-bit vector. */
5422 default:
5423 cost += addr_cost->addr_scale_costs.ti;
5424 break;
5425 }
5426 }
5427
5428 return cost;
5429 }
5430
5431 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5432 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5433 to be taken. */
5434
5435 int
5436 aarch64_branch_cost (bool speed_p, bool predictable_p)
5437 {
5438 /* Use the unpredictable-branch cost only when optimizing for speed and the branch is not predicted to be taken. */
5439 const struct cpu_branch_cost *branch_costs =
5440 aarch64_tune_params->branch_costs;
5441
5442 if (!speed_p || predictable_p)
5443 return branch_costs->predictable;
5444 else
5445 return branch_costs->unpredictable;
5446 }
5447
5448 /* Return true if the RTX X in mode MODE is a zero or sign extract
5449 usable in an ADD or SUB (extended register) instruction. */
5450 static bool
5451 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5452 {
5453 /* Catch add with a sign extract.
5454 This is add_<optab><mode>_multp2. */
5455 if (GET_CODE (x) == SIGN_EXTRACT
5456 || GET_CODE (x) == ZERO_EXTRACT)
5457 {
5458 rtx op0 = XEXP (x, 0);
5459 rtx op1 = XEXP (x, 1);
5460 rtx op2 = XEXP (x, 2);
5461
5462 if (GET_CODE (op0) == MULT
5463 && CONST_INT_P (op1)
5464 && op2 == const0_rtx
5465 && CONST_INT_P (XEXP (op0, 1))
5466 && aarch64_is_extend_from_extract (mode,
5467 XEXP (op0, 1),
5468 op1))
5469 {
5470 return true;
5471 }
5472 }
5473
5474 return false;
5475 }
5476
5477 static bool
5478 aarch64_frint_unspec_p (unsigned int u)
5479 {
5480 switch (u)
5481 {
5482 case UNSPEC_FRINTZ:
5483 case UNSPEC_FRINTP:
5484 case UNSPEC_FRINTM:
5485 case UNSPEC_FRINTA:
5486 case UNSPEC_FRINTN:
5487 case UNSPEC_FRINTX:
5488 case UNSPEC_FRINTI:
5489 return true;
5490
5491 default:
5492 return false;
5493 }
5494 }
5495
5496 /* Return true iff X is an rtx that will match an extr instruction
5497 i.e. as described in the *extr<mode>5_insn family of patterns.
5498 OP0 and OP1 will be set to the operands of the shifts involved
5499 on success and will be NULL_RTX otherwise. */
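/* For example, in DImode
   (ior (ashift (reg A) (const_int 48)) (lshiftrt (reg B) (const_int 16)))
   matches, setting *RES_OP0 to A and *RES_OP1 to B, because 48 + 16 == 64.  */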
5500
5501 static bool
5502 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5503 {
5504 rtx op0, op1;
5505 machine_mode mode = GET_MODE (x);
5506
5507 *res_op0 = NULL_RTX;
5508 *res_op1 = NULL_RTX;
5509
5510 if (GET_CODE (x) != IOR)
5511 return false;
5512
5513 op0 = XEXP (x, 0);
5514 op1 = XEXP (x, 1);
5515
5516 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5517 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5518 {
5519 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5520 if (GET_CODE (op1) == ASHIFT)
5521 std::swap (op0, op1);
5522
5523 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5524 return false;
5525
5526 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5527 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5528
5529 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5530 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5531 {
5532 *res_op0 = XEXP (op0, 0);
5533 *res_op1 = XEXP (op1, 0);
5534 return true;
5535 }
5536 }
5537
5538 return false;
5539 }
5540
5541 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5542 storing it in *COST. Result is true if the total cost of the operation
5543 has now been calculated. */
5544 static bool
5545 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5546 {
5547 rtx inner;
5548 rtx comparator;
5549 enum rtx_code cmpcode;
5550
5551 if (COMPARISON_P (op0))
5552 {
5553 inner = XEXP (op0, 0);
5554 comparator = XEXP (op0, 1);
5555 cmpcode = GET_CODE (op0);
5556 }
5557 else
5558 {
5559 inner = op0;
5560 comparator = const0_rtx;
5561 cmpcode = NE;
5562 }
5563
5564 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5565 {
5566 /* Conditional branch. */
5567 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5568 return true;
5569 else
5570 {
5571 if (cmpcode == NE || cmpcode == EQ)
5572 {
5573 if (comparator == const0_rtx)
5574 {
5575 /* TBZ/TBNZ/CBZ/CBNZ. */
5576 if (GET_CODE (inner) == ZERO_EXTRACT)
5577 /* TBZ/TBNZ. */
5578 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5579 0, speed);
5580 else
5581 /* CBZ/CBNZ. */
5582 *cost += rtx_cost (inner, cmpcode, 0, speed);
5583
5584 return true;
5585 }
5586 }
5587 else if (cmpcode == LT || cmpcode == GE)
5588 {
5589 /* TBZ/TBNZ. */
5590 if (comparator == const0_rtx)
5591 return true;
5592 }
5593 }
5594 }
5595 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5596 {
5597 /* It's a conditional operation based on the status flags,
5598 so it must be some flavor of CSEL. */
5599
5600 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5601 if (GET_CODE (op1) == NEG
5602 || GET_CODE (op1) == NOT
5603 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5604 op1 = XEXP (op1, 0);
5605
5606 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5607 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5608 return true;
5609 }
5610
5611 /* We don't know what this is, cost all operands. */
5612 return false;
5613 }
5614
5615 /* Calculate the cost of calculating X, storing it in *COST. Result
5616 is true if the total cost of the operation has now been calculated. */
5617 static bool
5618 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5619 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5620 {
5621 rtx op0, op1, op2;
5622 const struct cpu_cost_table *extra_cost
5623 = aarch64_tune_params->insn_extra_cost;
5624 machine_mode mode = GET_MODE (x);
5625
5626 /* By default, assume that everything has equivalent cost to the
5627 cheapest instruction. Any additional costs are applied as a delta
5628 above this default. */
5629 *cost = COSTS_N_INSNS (1);
5630
5631 switch (code)
5632 {
5633 case SET:
5634 /* The cost depends entirely on the operands to SET. */
5635 *cost = 0;
5636 op0 = SET_DEST (x);
5637 op1 = SET_SRC (x);
5638
5639 switch (GET_CODE (op0))
5640 {
5641 case MEM:
5642 if (speed)
5643 {
5644 rtx address = XEXP (op0, 0);
5645 if (VECTOR_MODE_P (mode))
5646 *cost += extra_cost->ldst.storev;
5647 else if (GET_MODE_CLASS (mode) == MODE_INT)
5648 *cost += extra_cost->ldst.store;
5649 else if (mode == SFmode)
5650 *cost += extra_cost->ldst.storef;
5651 else if (mode == DFmode)
5652 *cost += extra_cost->ldst.stored;
5653
5654 *cost +=
5655 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5656 0, speed));
5657 }
5658
5659 *cost += rtx_cost (op1, SET, 1, speed);
5660 return true;
5661
5662 case SUBREG:
5663 if (! REG_P (SUBREG_REG (op0)))
5664 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5665
5666 /* Fall through. */
5667 case REG:
5668 /* The cost is one per vector-register copied. */
5669 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5670 {
5671 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5672 / GET_MODE_SIZE (V4SImode);
5673 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5674 }
5675 /* const0_rtx is in general free, but we will use an
5676 instruction to set a register to 0. */
5677 else if (REG_P (op1) || op1 == const0_rtx)
5678 {
5679 /* The cost is 1 per register copied. */
5680 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5681 / UNITS_PER_WORD;
5682 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5683 }
5684 else
5685 /* Cost is just the cost of the RHS of the set. */
5686 *cost += rtx_cost (op1, SET, 1, speed);
5687 return true;
5688
5689 case ZERO_EXTRACT:
5690 case SIGN_EXTRACT:
5691 /* Bit-field insertion. Strip any redundant widening of
5692 the RHS to meet the width of the target. */
5693 if (GET_CODE (op1) == SUBREG)
5694 op1 = SUBREG_REG (op1);
5695 if ((GET_CODE (op1) == ZERO_EXTEND
5696 || GET_CODE (op1) == SIGN_EXTEND)
5697 && CONST_INT_P (XEXP (op0, 1))
5698 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5699 >= INTVAL (XEXP (op0, 1))))
5700 op1 = XEXP (op1, 0);
5701
5702 if (CONST_INT_P (op1))
5703 {
5704 /* MOV immediate is assumed to always be cheap. */
5705 *cost = COSTS_N_INSNS (1);
5706 }
5707 else
5708 {
5709 /* BFM. */
5710 if (speed)
5711 *cost += extra_cost->alu.bfi;
5712 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5713 }
5714
5715 return true;
5716
5717 default:
5718 /* We can't make sense of this, assume default cost. */
5719 *cost = COSTS_N_INSNS (1);
5720 return false;
5721 }
5722 return false;
5723
5724 case CONST_INT:
5725 /* If an instruction can incorporate a constant within the
5726 instruction, the instruction's expression avoids calling
5727 rtx_cost() on the constant. If rtx_cost() is called on a
5728 constant, then it is usually because the constant must be
5729 moved into a register by one or more instructions.
5730
5731 The exception is constant 0, which can be expressed
5732 as XZR/WZR and is therefore free. The caveat is that a
5733 (set (reg) (const0_rtx)) still costs a move, but we catch
5734 that when we cost the SET, so we don't need to consider
5735 it here. */
5736 if (x == const0_rtx)
5737 *cost = 0;
5738 else
5739 {
5740 /* To an approximation, building any other constant is
5741 proportionally expensive to the number of instructions
5742 required to build that constant. This is true whether we
5743 are compiling for SPEED or otherwise. */
5744 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5745 (NULL_RTX, x, false, mode));
5746 }
5747 return true;
5748
5749 case CONST_DOUBLE:
5750 if (speed)
5751 {
5752 /* mov[df,sf]_aarch64. */
5753 if (aarch64_float_const_representable_p (x))
5754 /* FMOV (scalar immediate). */
5755 *cost += extra_cost->fp[mode == DFmode].fpconst;
5756 else if (!aarch64_float_const_zero_rtx_p (x))
5757 {
5758 /* This will be a load from memory. */
5759 if (mode == DFmode)
5760 *cost += extra_cost->ldst.loadd;
5761 else
5762 *cost += extra_cost->ldst.loadf;
5763 }
5764 else
5765 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5766 or MOV v0.s[0], wzr - neither of which are modeled by the
5767 cost tables. Just use the default cost. */
5768 {
5769 }
5770 }
5771
5772 return true;
5773
5774 case MEM:
5775 if (speed)
5776 {
5777 /* For loads we want the base cost of a load, plus an
5778 approximation for the additional cost of the addressing
5779 mode. */
5780 rtx address = XEXP (x, 0);
5781 if (VECTOR_MODE_P (mode))
5782 *cost += extra_cost->ldst.loadv;
5783 else if (GET_MODE_CLASS (mode) == MODE_INT)
5784 *cost += extra_cost->ldst.load;
5785 else if (mode == SFmode)
5786 *cost += extra_cost->ldst.loadf;
5787 else if (mode == DFmode)
5788 *cost += extra_cost->ldst.loadd;
5789
5790 *cost +=
5791 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5792 0, speed));
5793 }
5794
5795 return true;
5796
5797 case NEG:
5798 op0 = XEXP (x, 0);
5799
5800 if (VECTOR_MODE_P (mode))
5801 {
5802 if (speed)
5803 {
5804 /* FNEG. */
5805 *cost += extra_cost->vect.alu;
5806 }
5807 return false;
5808 }
5809
5810 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5811 {
5812 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5813 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5814 {
5815 /* CSETM. */
5816 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5817 return true;
5818 }
5819
5820 /* Cost this as SUB wzr, X. */
5821 op0 = CONST0_RTX (GET_MODE (x));
5822 op1 = XEXP (x, 0);
5823 goto cost_minus;
5824 }
5825
5826 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5827 {
5828 /* Support (neg(fma...)) as a single instruction only if
5829 sign of zeros is unimportant. This matches the decision
5830 making in aarch64.md. */
5831 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5832 {
5833 /* FNMADD. */
5834 *cost = rtx_cost (op0, NEG, 0, speed);
5835 return true;
5836 }
5837 if (speed)
5838 /* FNEG. */
5839 *cost += extra_cost->fp[mode == DFmode].neg;
5840 return false;
5841 }
5842
5843 return false;
5844
5845 case CLRSB:
5846 case CLZ:
5847 if (speed)
5848 {
5849 if (VECTOR_MODE_P (mode))
5850 *cost += extra_cost->vect.alu;
5851 else
5852 *cost += extra_cost->alu.clz;
5853 }
5854
5855 return false;
5856
5857 case COMPARE:
5858 op0 = XEXP (x, 0);
5859 op1 = XEXP (x, 1);
5860
5861 if (op1 == const0_rtx
5862 && GET_CODE (op0) == AND)
5863 {
5864 x = op0;
5865 goto cost_logic;
5866 }
5867
5868 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5869 {
5870 /* TODO: A write to the CC flags possibly costs extra; this
5871 needs encoding in the cost tables. */
5872
5873 /* CC_ZESWPmode supports zero extend for free. */
5874 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5875 op0 = XEXP (op0, 0);
5876
5877 /* ANDS. */
5878 if (GET_CODE (op0) == AND)
5879 {
5880 x = op0;
5881 goto cost_logic;
5882 }
5883
5884 if (GET_CODE (op0) == PLUS)
5885 {
5886 /* ADDS (and CMN alias). */
5887 x = op0;
5888 goto cost_plus;
5889 }
5890
5891 if (GET_CODE (op0) == MINUS)
5892 {
5893 /* SUBS. */
5894 x = op0;
5895 goto cost_minus;
5896 }
5897
5898 if (GET_CODE (op1) == NEG)
5899 {
5900 /* CMN. */
5901 if (speed)
5902 *cost += extra_cost->alu.arith;
5903
5904 *cost += rtx_cost (op0, COMPARE, 0, speed);
5905 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5906 return true;
5907 }
5908
5909 /* CMP.
5910
5911 Compare can freely swap the order of operands, and
5912 canonicalization puts the more complex operation first.
5913 But the integer MINUS logic expects the shift/extend
5914 operation in op1. */
5915 if (! (REG_P (op0)
5916 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5917 {
5918 op0 = XEXP (x, 1);
5919 op1 = XEXP (x, 0);
5920 }
5921 goto cost_minus;
5922 }
5923
5924 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5925 {
5926 /* FCMP. */
5927 if (speed)
5928 *cost += extra_cost->fp[mode == DFmode].compare;
5929
5930 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5931 {
5932 *cost += rtx_cost (op0, COMPARE, 0, speed);
5933 /* FCMP supports constant 0.0 at no extra cost. */
5934 return true;
5935 }
5936 return false;
5937 }
5938
5939 if (VECTOR_MODE_P (mode))
5940 {
5941 /* Vector compare. */
5942 if (speed)
5943 *cost += extra_cost->vect.alu;
5944
5945 if (aarch64_float_const_zero_rtx_p (op1))
5946 {
5947 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 at no extra
5948 cost. */
5949 return true;
5950 }
5951 return false;
5952 }
5953 return false;
5954
5955 case MINUS:
5956 {
5957 op0 = XEXP (x, 0);
5958 op1 = XEXP (x, 1);
5959
5960 cost_minus:
5961 *cost += rtx_cost (op0, MINUS, 0, speed);
5962
5963 /* Detect valid immediates. */
5964 if ((GET_MODE_CLASS (mode) == MODE_INT
5965 || (GET_MODE_CLASS (mode) == MODE_CC
5966 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5967 && CONST_INT_P (op1)
5968 && aarch64_uimm12_shift (INTVAL (op1)))
5969 {
5970 if (speed)
5971 /* SUB(S) (immediate). */
5972 *cost += extra_cost->alu.arith;
5973 return true;
5974 }
5975
5976 /* Look for SUB (extended register). */
5977 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5978 {
5979 if (speed)
5980 *cost += extra_cost->alu.extend_arith;
5981
5982 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5983 (enum rtx_code) GET_CODE (op1),
5984 0, speed);
5985 return true;
5986 }
5987
5988 rtx new_op1 = aarch64_strip_extend (op1);
5989
5990 /* Cost this as an FMA-alike operation. */
5991 if ((GET_CODE (new_op1) == MULT
5992 || aarch64_shift_p (GET_CODE (new_op1)))
5993 && code != COMPARE)
5994 {
5995 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5996 (enum rtx_code) code,
5997 speed);
5998 return true;
5999 }
6000
6001 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6002
6003 if (speed)
6004 {
6005 if (VECTOR_MODE_P (mode))
6006 {
6007 /* Vector SUB. */
6008 *cost += extra_cost->vect.alu;
6009 }
6010 else if (GET_MODE_CLASS (mode) == MODE_INT)
6011 {
6012 /* SUB(S). */
6013 *cost += extra_cost->alu.arith;
6014 }
6015 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6016 {
6017 /* FSUB. */
6018 *cost += extra_cost->fp[mode == DFmode].addsub;
6019 }
6020 }
6021 return true;
6022 }
6023
6024 case PLUS:
6025 {
6026 rtx new_op0;
6027
6028 op0 = XEXP (x, 0);
6029 op1 = XEXP (x, 1);
6030
6031 cost_plus:
6032 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6033 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6034 {
6035 /* CSINC. */
6036 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6037 *cost += rtx_cost (op1, PLUS, 1, speed);
6038 return true;
6039 }
6040
6041 if (GET_MODE_CLASS (mode) == MODE_INT
6042 && CONST_INT_P (op1)
6043 && aarch64_uimm12_shift (INTVAL (op1)))
6044 {
6045 *cost += rtx_cost (op0, PLUS, 0, speed);
6046
6047 if (speed)
6048 /* ADD (immediate). */
6049 *cost += extra_cost->alu.arith;
6050 return true;
6051 }
6052
6053 *cost += rtx_cost (op1, PLUS, 1, speed);
6054
6055 /* Look for ADD (extended register). */
6056 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6057 {
6058 if (speed)
6059 *cost += extra_cost->alu.extend_arith;
6060
6061 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6062 (enum rtx_code) GET_CODE (op0),
6063 0, speed);
6064 return true;
6065 }
6066
6067 /* Strip any extend, leave shifts behind as we will
6068 cost them through mult_cost. */
6069 new_op0 = aarch64_strip_extend (op0);
6070
6071 if (GET_CODE (new_op0) == MULT
6072 || aarch64_shift_p (GET_CODE (new_op0)))
6073 {
6074 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6075 speed);
6076 return true;
6077 }
6078
6079 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6080
6081 if (speed)
6082 {
6083 if (VECTOR_MODE_P (mode))
6084 {
6085 /* Vector ADD. */
6086 *cost += extra_cost->vect.alu;
6087 }
6088 else if (GET_MODE_CLASS (mode) == MODE_INT)
6089 {
6090 /* ADD. */
6091 *cost += extra_cost->alu.arith;
6092 }
6093 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6094 {
6095 /* FADD. */
6096 *cost += extra_cost->fp[mode == DFmode].addsub;
6097 }
6098 }
6099 return true;
6100 }
6101
6102 case BSWAP:
6103 *cost = COSTS_N_INSNS (1);
6104
6105 if (speed)
6106 {
6107 if (VECTOR_MODE_P (mode))
6108 *cost += extra_cost->vect.alu;
6109 else
6110 *cost += extra_cost->alu.rev;
6111 }
6112 return false;
6113
6114 case IOR:
6115 if (aarch_rev16_p (x))
6116 {
6117 *cost = COSTS_N_INSNS (1);
6118
6119 if (speed)
6120 {
6121 if (VECTOR_MODE_P (mode))
6122 *cost += extra_cost->vect.alu;
6123 else
6124 *cost += extra_cost->alu.rev;
6125 }
6126 return true;
6127 }
6128
6129 if (aarch64_extr_rtx_p (x, &op0, &op1))
6130 {
6131 *cost += rtx_cost (op0, IOR, 0, speed)
6132 + rtx_cost (op1, IOR, 1, speed);
6133 if (speed)
6134 *cost += extra_cost->alu.shift;
6135
6136 return true;
6137 }
6138 /* Fall through. */
6139 case XOR:
6140 case AND:
6141 cost_logic:
6142 op0 = XEXP (x, 0);
6143 op1 = XEXP (x, 1);
6144
6145 if (VECTOR_MODE_P (mode))
6146 {
6147 if (speed)
6148 *cost += extra_cost->vect.alu;
6149 return true;
6150 }
6151
6152 if (code == AND
6153 && GET_CODE (op0) == MULT
6154 && CONST_INT_P (XEXP (op0, 1))
6155 && CONST_INT_P (op1)
6156 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6157 INTVAL (op1)) != 0)
6158 {
6159 /* This is a UBFM/SBFM. */
6160 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6161 if (speed)
6162 *cost += extra_cost->alu.bfx;
6163 return true;
6164 }
6165
6166 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6167 {
6168 /* We possibly get the immediate for free; this is not
6169 modelled. */
6170 if (CONST_INT_P (op1)
6171 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6172 {
6173 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6174
6175 if (speed)
6176 *cost += extra_cost->alu.logical;
6177
6178 return true;
6179 }
6180 else
6181 {
6182 rtx new_op0 = op0;
6183
6184 /* Handle ORN, EON, or BIC. */
6185 if (GET_CODE (op0) == NOT)
6186 op0 = XEXP (op0, 0);
6187
6188 new_op0 = aarch64_strip_shift (op0);
6189
6190 /* If we had a shift on op0 then this is a logical-shift-
6191 by-register/immediate operation. Otherwise, this is just
6192 a logical operation. */
6193 if (speed)
6194 {
6195 if (new_op0 != op0)
6196 {
6197 /* Shift by immediate. */
6198 if (CONST_INT_P (XEXP (op0, 1)))
6199 *cost += extra_cost->alu.log_shift;
6200 else
6201 *cost += extra_cost->alu.log_shift_reg;
6202 }
6203 else
6204 *cost += extra_cost->alu.logical;
6205 }
6206
6207 /* In both cases we want to cost both operands. */
6208 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6209 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6210
6211 return true;
6212 }
6213 }
6214 return false;
6215
6216 case NOT:
6217 x = XEXP (x, 0);
6218 op0 = aarch64_strip_shift (x);
6219
6220 if (VECTOR_MODE_P (mode))
6221 {
6222 /* Vector NOT. */
6223 *cost += extra_cost->vect.alu;
6224 return false;
6225 }
6226
6227 /* MVN-shifted-reg. */
6228 if (op0 != x)
6229 {
6230 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6231
6232 if (speed)
6233 *cost += extra_cost->alu.log_shift;
6234
6235 return true;
6236 }
6237 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6238 Handle the second form here taking care that 'a' in the above can
6239 be a shift. */
6240 else if (GET_CODE (op0) == XOR)
6241 {
6242 rtx newop0 = XEXP (op0, 0);
6243 rtx newop1 = XEXP (op0, 1);
6244 rtx op0_stripped = aarch64_strip_shift (newop0);
6245
6246 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6247 + rtx_cost (op0_stripped, XOR, 0, speed);
6248
6249 if (speed)
6250 {
6251 if (op0_stripped != newop0)
6252 *cost += extra_cost->alu.log_shift;
6253 else
6254 *cost += extra_cost->alu.logical;
6255 }
6256
6257 return true;
6258 }
6259 /* MVN. */
6260 if (speed)
6261 *cost += extra_cost->alu.logical;
6262
6263 return false;
6264
6265 case ZERO_EXTEND:
6266
6267 op0 = XEXP (x, 0);
6268 /* If a value is written in SI mode, then zero extended to DI
6269 mode, the operation will in general be free as a write to
6270 a 'w' register implicitly zeroes the upper bits of an 'x'
6271 register. However, if this is
6272
6273 (set (reg) (zero_extend (reg)))
6274
6275 we must cost the explicit register move. */
6276 if (mode == DImode
6277 && GET_MODE (op0) == SImode
6278 && outer == SET)
6279 {
6280 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6281
6282 if (!op_cost && speed)
6283 /* MOV. */
6284 *cost += extra_cost->alu.extend;
6285 else
6286 /* Free; the cost is that of the SI mode operation. */
6287 *cost = op_cost;
6288
6289 return true;
6290 }
6291 else if (MEM_P (XEXP (x, 0)))
6292 {
6293 /* All loads can zero extend to any size for free. */
6294 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6295 return true;
6296 }
6297
6298 if (speed)
6299 {
6300 if (VECTOR_MODE_P (mode))
6301 {
6302 /* UMOV. */
6303 *cost += extra_cost->vect.alu;
6304 }
6305 else
6306 {
6307 /* UXTB/UXTH. */
6308 *cost += extra_cost->alu.extend;
6309 }
6310 }
6311 return false;
6312
6313 case SIGN_EXTEND:
6314 if (MEM_P (XEXP (x, 0)))
6315 {
6316 /* LDRSH. */
6317 if (speed)
6318 {
6319 rtx address = XEXP (XEXP (x, 0), 0);
6320 *cost += extra_cost->ldst.load_sign_extend;
6321
6322 *cost +=
6323 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6324 0, speed));
6325 }
6326 return true;
6327 }
6328
6329 if (speed)
6330 {
6331 if (VECTOR_MODE_P (mode))
6332 *cost += extra_cost->vect.alu;
6333 else
6334 *cost += extra_cost->alu.extend;
6335 }
6336 return false;
6337
6338 case ASHIFT:
6339 op0 = XEXP (x, 0);
6340 op1 = XEXP (x, 1);
6341
6342 if (CONST_INT_P (op1))
6343 {
6344 if (speed)
6345 {
6346 if (VECTOR_MODE_P (mode))
6347 {
6348 /* Vector shift (immediate). */
6349 *cost += extra_cost->vect.alu;
6350 }
6351 else
6352 {
6353 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6354 aliases. */
6355 *cost += extra_cost->alu.shift;
6356 }
6357 }
6358
6359 /* We can incorporate zero/sign extend for free. */
6360 if (GET_CODE (op0) == ZERO_EXTEND
6361 || GET_CODE (op0) == SIGN_EXTEND)
6362 op0 = XEXP (op0, 0);
6363
6364 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6365 return true;
6366 }
6367 else
6368 {
6369 if (speed)
6370 {
6371 if (VECTOR_MODE_P (mode))
6372 {
6373 /* Vector shift (register). */
6374 *cost += extra_cost->vect.alu;
6375 }
6376 else
6377 {
6378 /* LSLV. */
6379 *cost += extra_cost->alu.shift_reg;
6380 }
6381 }
6382 return false; /* All arguments need to be in registers. */
6383 }
6384
6385 case ROTATE:
6386 case ROTATERT:
6387 case LSHIFTRT:
6388 case ASHIFTRT:
6389 op0 = XEXP (x, 0);
6390 op1 = XEXP (x, 1);
6391
6392 if (CONST_INT_P (op1))
6393 {
6394 /* ASR (immediate) and friends. */
6395 if (speed)
6396 {
6397 if (VECTOR_MODE_P (mode))
6398 *cost += extra_cost->vect.alu;
6399 else
6400 *cost += extra_cost->alu.shift;
6401 }
6402
6403 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6404 return true;
6405 }
6406 else
6407 {
6408
6409 /* ASR (register) and friends. */
6410 if (speed)
6411 {
6412 if (VECTOR_MODE_P (mode))
6413 *cost += extra_cost->vect.alu;
6414 else
6415 *cost += extra_cost->alu.shift_reg;
6416 }
6417 return false; /* All arguments need to be in registers. */
6418 }
6419
6420 case SYMBOL_REF:
6421
6422 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6423 {
6424 /* LDR. */
6425 if (speed)
6426 *cost += extra_cost->ldst.load;
6427 }
6428 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6429 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6430 {
6431 /* ADRP, followed by ADD. */
6432 *cost += COSTS_N_INSNS (1);
6433 if (speed)
6434 *cost += 2 * extra_cost->alu.arith;
6435 }
6436 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6437 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6438 {
6439 /* ADR. */
6440 if (speed)
6441 *cost += extra_cost->alu.arith;
6442 }
6443
6444 if (flag_pic)
6445 {
6446 /* One extra load instruction, after accessing the GOT. */
6447 *cost += COSTS_N_INSNS (1);
6448 if (speed)
6449 *cost += extra_cost->ldst.load;
6450 }
6451 return true;
6452
6453 case HIGH:
6454 case LO_SUM:
6455 /* ADRP/ADD (immediate). */
6456 if (speed)
6457 *cost += extra_cost->alu.arith;
6458 return true;
6459
6460 case ZERO_EXTRACT:
6461 case SIGN_EXTRACT:
6462 /* UBFX/SBFX. */
6463 if (speed)
6464 {
6465 if (VECTOR_MODE_P (mode))
6466 *cost += extra_cost->vect.alu;
6467 else
6468 *cost += extra_cost->alu.bfx;
6469 }
6470
6471 /* We can trust that the immediates used will be correct (there
6472 are no by-register forms), so we need only cost op0. */
6473 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6474 return true;
6475
6476 case MULT:
6477 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6478 /* aarch64_rtx_mult_cost always handles recursion to its
6479 operands. */
6480 return true;
6481
6482 case MOD:
6483 case UMOD:
6484 if (speed)
6485 {
6486 if (VECTOR_MODE_P (mode))
6487 *cost += extra_cost->vect.alu;
6488 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6489 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6490 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6491 else if (GET_MODE (x) == DFmode)
6492 *cost += (extra_cost->fp[1].mult
6493 + extra_cost->fp[1].div);
6494 else if (GET_MODE (x) == SFmode)
6495 *cost += (extra_cost->fp[0].mult
6496 + extra_cost->fp[0].div);
6497 }
6498 return false; /* All arguments need to be in registers. */
6499
6500 case DIV:
6501 case UDIV:
6502 case SQRT:
6503 if (speed)
6504 {
6505 if (VECTOR_MODE_P (mode))
6506 *cost += extra_cost->vect.alu;
6507 else if (GET_MODE_CLASS (mode) == MODE_INT)
6508 /* There is no integer SQRT, so only DIV and UDIV can get
6509 here. */
6510 *cost += extra_cost->mult[mode == DImode].idiv;
6511 else
6512 *cost += extra_cost->fp[mode == DFmode].div;
6513 }
6514 return false; /* All arguments need to be in registers. */
6515
6516 case IF_THEN_ELSE:
6517 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6518 XEXP (x, 2), cost, speed);
6519
6520 case EQ:
6521 case NE:
6522 case GT:
6523 case GTU:
6524 case LT:
6525 case LTU:
6526 case GE:
6527 case GEU:
6528 case LE:
6529 case LEU:
6530
6531 return false; /* All arguments must be in registers. */
6532
6533 case FMA:
6534 op0 = XEXP (x, 0);
6535 op1 = XEXP (x, 1);
6536 op2 = XEXP (x, 2);
6537
6538 if (speed)
6539 {
6540 if (VECTOR_MODE_P (mode))
6541 *cost += extra_cost->vect.alu;
6542 else
6543 *cost += extra_cost->fp[mode == DFmode].fma;
6544 }
6545
6546 /* FMSUB, FNMADD, and FNMSUB are free. */
6547 if (GET_CODE (op0) == NEG)
6548 op0 = XEXP (op0, 0);
6549
6550 if (GET_CODE (op2) == NEG)
6551 op2 = XEXP (op2, 0);
6552
6553 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6554 and the by-element operand as operand 0. */
6555 if (GET_CODE (op1) == NEG)
6556 op1 = XEXP (op1, 0);
6557
6558 /* Catch vector-by-element operations. The by-element operand can
6559 either be (vec_duplicate (vec_select (x))) or just
6560 (vec_select (x)), depending on whether we are multiplying by
6561 a vector or a scalar.
6562
6563 Canonicalization is not very good in these cases, FMA4 will put the
6564 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6565 if (GET_CODE (op0) == VEC_DUPLICATE)
6566 op0 = XEXP (op0, 0);
6567 else if (GET_CODE (op1) == VEC_DUPLICATE)
6568 op1 = XEXP (op1, 0);
6569
6570 if (GET_CODE (op0) == VEC_SELECT)
6571 op0 = XEXP (op0, 0);
6572 else if (GET_CODE (op1) == VEC_SELECT)
6573 op1 = XEXP (op1, 0);
6574
6575 /* If the remaining parameters are not registers,
6576 get the cost to put them into registers. */
6577 *cost += rtx_cost (op0, FMA, 0, speed);
6578 *cost += rtx_cost (op1, FMA, 1, speed);
6579 *cost += rtx_cost (op2, FMA, 2, speed);
6580 return true;
6581
6582 case FLOAT:
6583 case UNSIGNED_FLOAT:
6584 if (speed)
6585 *cost += extra_cost->fp[mode == DFmode].fromint;
6586 return false;
6587
6588 case FLOAT_EXTEND:
6589 if (speed)
6590 {
6591 if (VECTOR_MODE_P (mode))
6592 {
6593 /* Vector widening conversion. */
6594 *cost += extra_cost->vect.alu;
6595 }
6596 else
6597 *cost += extra_cost->fp[mode == DFmode].widen;
6598 }
6599 return false;
6600
6601 case FLOAT_TRUNCATE:
6602 if (speed)
6603 {
6604 if (VECTOR_MODE_P (mode))
6605 {
6606 /* Vector narrowing conversion. */
6607 *cost += extra_cost->vect.alu;
6608 }
6609 else
6610 *cost += extra_cost->fp[mode == DFmode].narrow;
6611 }
6612 return false;
6613
6614 case FIX:
6615 case UNSIGNED_FIX:
6616 x = XEXP (x, 0);
6617 /* Strip the rounding part. They will all be implemented
6618 by the fcvt* family of instructions anyway. */
6619 if (GET_CODE (x) == UNSPEC)
6620 {
6621 unsigned int uns_code = XINT (x, 1);
6622
6623 if (uns_code == UNSPEC_FRINTA
6624 || uns_code == UNSPEC_FRINTM
6625 || uns_code == UNSPEC_FRINTN
6626 || uns_code == UNSPEC_FRINTP
6627 || uns_code == UNSPEC_FRINTZ)
6628 x = XVECEXP (x, 0, 0);
6629 }
6630
6631 if (speed)
6632 {
6633 if (VECTOR_MODE_P (mode))
6634 *cost += extra_cost->vect.alu;
6635 else
6636 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6637 }
6638 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6639 return true;
6640
6641 case ABS:
6642 if (VECTOR_MODE_P (mode))
6643 {
6644 /* ABS (vector). */
6645 if (speed)
6646 *cost += extra_cost->vect.alu;
6647 }
6648 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6649 {
6650 op0 = XEXP (x, 0);
6651
6652 /* FABD, which is analogous to FADD. */
6653 if (GET_CODE (op0) == MINUS)
6654 {
6655 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6656 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6657 if (speed)
6658 *cost += extra_cost->fp[mode == DFmode].addsub;
6659
6660 return true;
6661 }
6662 /* Simple FABS is analogous to FNEG. */
6663 if (speed)
6664 *cost += extra_cost->fp[mode == DFmode].neg;
6665 }
6666 else
6667 {
6668 /* Integer ABS will either be split to
6669 two arithmetic instructions, or will be an ABS
6670 (scalar), which we don't model. */
6671 *cost = COSTS_N_INSNS (2);
6672 if (speed)
6673 *cost += 2 * extra_cost->alu.arith;
6674 }
6675 return false;
6676
6677 case SMAX:
6678 case SMIN:
6679 if (speed)
6680 {
6681 if (VECTOR_MODE_P (mode))
6682 *cost += extra_cost->vect.alu;
6683 else
6684 {
6685 /* FMAXNM/FMINNM/FMAX/FMIN.
6686 TODO: This may not be accurate for all implementations, but
6687 we do not model this in the cost tables. */
6688 *cost += extra_cost->fp[mode == DFmode].addsub;
6689 }
6690 }
6691 return false;
6692
6693 case UNSPEC:
6694 /* The floating point round to integer frint* instructions. */
6695 if (aarch64_frint_unspec_p (XINT (x, 1)))
6696 {
6697 if (speed)
6698 *cost += extra_cost->fp[mode == DFmode].roundint;
6699
6700 return false;
6701 }
6702
6703 if (XINT (x, 1) == UNSPEC_RBIT)
6704 {
6705 if (speed)
6706 *cost += extra_cost->alu.rev;
6707
6708 return false;
6709 }
6710 break;
6711
6712 case TRUNCATE:
6713
6714 /* Decompose <su>muldi3_highpart. */
6715 if (/* (truncate:DI */
6716 mode == DImode
6717 /* (lshiftrt:TI */
6718 && GET_MODE (XEXP (x, 0)) == TImode
6719 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6720 /* (mult:TI */
6721 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6722 /* (ANY_EXTEND:TI (reg:DI))
6723 (ANY_EXTEND:TI (reg:DI))) */
6724 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6725 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6726 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6727 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6728 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6729 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6730 /* (const_int 64) */
6731 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6732 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6733 {
6734 /* UMULH/SMULH. */
6735 if (speed)
6736 *cost += extra_cost->mult[mode == DImode].extend;
6737 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6738 MULT, 0, speed);
6739 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6740 MULT, 1, speed);
6741 return true;
6742 }
6743
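/* For reference, this RTL shape typically arises from a C-level high-part
   multiply such as the following (illustrative source, not from this file):

     long long
     mulh (long long a, long long b)
     {
       return (long long) (((__int128) a * b) >> 64);
     }

   which expands through the <su>muldi3_highpart patterns.  */
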
6744 /* Fall through. */
6745 default:
6746 break;
6747 }
6748
6749 if (dump_file && (dump_flags & TDF_DETAILS))
6750 fprintf (dump_file,
6751 "\nFailed to cost RTX. Assuming default cost.\n");
6752
6753 return true;
6754 }
6755
6756 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6757 calculated for X. This cost is stored in *COST. Returns true
6758 if the total cost of X was calculated. */
6759 static bool
6760 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6761 int param, int *cost, bool speed)
6762 {
6763 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6764
6765 if (dump_file && (dump_flags & TDF_DETAILS))
6766 {
6767 print_rtl_single (dump_file, x);
6768 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6769 speed ? "Hot" : "Cold",
6770 *cost, result ? "final" : "partial");
6771 }
6772
6773 return result;
6774 }
6775
6776 static int
6777 aarch64_register_move_cost (machine_mode mode,
6778 reg_class_t from_i, reg_class_t to_i)
6779 {
6780 enum reg_class from = (enum reg_class) from_i;
6781 enum reg_class to = (enum reg_class) to_i;
6782 const struct cpu_regmove_cost *regmove_cost
6783 = aarch64_tune_params->regmove_cost;
6784
6785 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6786 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6787 to = GENERAL_REGS;
6788
6789 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6790 from = GENERAL_REGS;
6791
6792 /* Moving between GPR and stack cost is the same as GP2GP. */
6793 if ((from == GENERAL_REGS && to == STACK_REG)
6794 || (to == GENERAL_REGS && from == STACK_REG))
6795 return regmove_cost->GP2GP;
6796
6797 /* To/From the stack register, we move via the gprs. */
6798 if (to == STACK_REG || from == STACK_REG)
6799 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6800 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6801
6802 if (GET_MODE_SIZE (mode) == 16)
6803 {
6804 /* 128-bit operations on general registers require 2 instructions. */
6805 if (from == GENERAL_REGS && to == GENERAL_REGS)
6806 return regmove_cost->GP2GP * 2;
6807 else if (from == GENERAL_REGS)
6808 return regmove_cost->GP2FP * 2;
6809 else if (to == GENERAL_REGS)
6810 return regmove_cost->FP2GP * 2;
6811
6812 /* When AdvSIMD instructions are disabled it is not possible to move
6813 a 128-bit value directly between Q registers. This is handled in
6814 secondary reload. A general register is used as a scratch to move
6815 the upper DI value and the lower DI value is moved directly,
6816 hence the cost is the sum of three moves. */
6817 if (! TARGET_SIMD)
6818 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6819
6820 return regmove_cost->FP2FP;
6821 }
6822
6823 if (from == GENERAL_REGS && to == GENERAL_REGS)
6824 return regmove_cost->GP2GP;
6825 else if (from == GENERAL_REGS)
6826 return regmove_cost->GP2FP;
6827 else if (to == GENERAL_REGS)
6828 return regmove_cost->FP2GP;
6829
6830 return regmove_cost->FP2FP;
6831 }
6832
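/* Worked example of the hook above (illustrative): for a 128-bit (TImode)
   move between FP/SIMD registers with !TARGET_SIMD, the returned cost is
   GP2FP + FP2GP + FP2FP, matching the scratch-GPR sequence described in the
   comment; with TARGET_SIMD it is simply FP2FP, and a 128-bit GP<->GP move
   costs 2 * GP2GP.  */
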
6833 static int
6834 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6835 reg_class_t rclass ATTRIBUTE_UNUSED,
6836 bool in ATTRIBUTE_UNUSED)
6837 {
6838 return aarch64_tune_params->memmov_cost;
6839 }
6840
6841 /* Return the number of instructions that can be issued per cycle. */
6842 static int
6843 aarch64_sched_issue_rate (void)
6844 {
6845 return aarch64_tune_params->issue_rate;
6846 }
6847
6848 static int
6849 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6850 {
6851 int issue_rate = aarch64_sched_issue_rate ();
6852
6853 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6854 }
6855
6856 /* Vectorizer cost model target hooks. */
6857
6858 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6859 static int
6860 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6861 tree vectype,
6862 int misalign ATTRIBUTE_UNUSED)
6863 {
6864 unsigned elements;
6865
6866 switch (type_of_cost)
6867 {
6868 case scalar_stmt:
6869 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6870
6871 case scalar_load:
6872 return aarch64_tune_params->vec_costs->scalar_load_cost;
6873
6874 case scalar_store:
6875 return aarch64_tune_params->vec_costs->scalar_store_cost;
6876
6877 case vector_stmt:
6878 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6879
6880 case vector_load:
6881 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6882
6883 case vector_store:
6884 return aarch64_tune_params->vec_costs->vec_store_cost;
6885
6886 case vec_to_scalar:
6887 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6888
6889 case scalar_to_vec:
6890 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6891
6892 case unaligned_load:
6893 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6894
6895 case unaligned_store:
6896 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6897
6898 case cond_branch_taken:
6899 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6900
6901 case cond_branch_not_taken:
6902 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6903
6904 case vec_perm:
6905 case vec_promote_demote:
6906 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6907
6908 case vec_construct:
6909 elements = TYPE_VECTOR_SUBPARTS (vectype);
6910 return elements / 2 + 1;
6911
6912 default:
6913 gcc_unreachable ();
6914 }
6915 }
6916
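/* Worked example (illustrative): for a vec_construct of a four-element
   vector type, TYPE_VECTOR_SUBPARTS is 4, so the hook above returns
   4 / 2 + 1 = 3; all other entries simply come from the tuning table.  */
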
6917 /* Implement targetm.vectorize.add_stmt_cost. */
6918 static unsigned
6919 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6920 struct _stmt_vec_info *stmt_info, int misalign,
6921 enum vect_cost_model_location where)
6922 {
6923 unsigned *cost = (unsigned *) data;
6924 unsigned retval = 0;
6925
6926 if (flag_vect_cost_model)
6927 {
6928 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6929 int stmt_cost =
6930 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6931
6932 /* Statements in an inner loop relative to the loop being
6933 vectorized are weighted more heavily. The value here is
6934 a function (linear for now) of the loop nest level. */
6935 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6936 {
6937 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6938 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6939 unsigned nest_level = loop_depth (loop);
6940
6941 count *= nest_level;
6942 }
6943
6944 retval = (unsigned) (count * stmt_cost);
6945 cost[where] += retval;
6946 }
6947
6948 return retval;
6949 }
6950
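/* Worked example (illustrative): with count = 2, a per-statement cost of 3
   from aarch64_builtin_vectorization_cost, and the statement sitting in a
   loop of depth 2 inside the loop being vectorized, count is scaled to 4
   and 4 * 3 = 12 is added to cost[vect_body].  */
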
6951 static void initialize_aarch64_code_model (void);
6952
6953 /* Parse the architecture extension string. */
6954
6955 static void
6956 aarch64_parse_extension (char *str)
6957 {
6958 /* The extension string is parsed left to right. */
6959 const struct aarch64_option_extension *opt = NULL;
6960
6961 /* Flag to say whether we are adding or removing an extension. */
6962 int adding_ext = -1;
6963
6964 while (str != NULL && *str != 0)
6965 {
6966 char *ext;
6967 size_t len;
6968
6969 str++;
6970 ext = strchr (str, '+');
6971
6972 if (ext != NULL)
6973 len = ext - str;
6974 else
6975 len = strlen (str);
6976
6977 if (len >= 2 && strncmp (str, "no", 2) == 0)
6978 {
6979 adding_ext = 0;
6980 len -= 2;
6981 str += 2;
6982 }
6983 else if (len > 0)
6984 adding_ext = 1;
6985
6986 if (len == 0)
6987 {
6988 error ("missing feature modifier after %qs", adding_ext ? "+"
6989 : "+no");
6990 return;
6991 }
6992
6993 /* Scan over the extensions table trying to find an exact match. */
6994 for (opt = all_extensions; opt->name != NULL; opt++)
6995 {
6996 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6997 {
6998 /* Add or remove the extension. */
6999 if (adding_ext)
7000 aarch64_isa_flags |= opt->flags_on;
7001 else
7002 aarch64_isa_flags &= ~(opt->flags_off);
7003 break;
7004 }
7005 }
7006
7007 if (opt->name == NULL)
7008 {
7009 /* Extension not found in list. */
7010 error ("unknown feature modifier %qs", str);
7011 return;
7012 }
7013
7014 str = ext;
7015 }
7016
7017 return;
7018 }
7019
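/* Illustrative sketch (not compiled): a minimal, self-contained model of the
   "+ext"/"+noext" parsing loop above, using a hypothetical two-entry feature
   table.  It mirrors the same left-to-right scan: a leading "no" selects
   flags_off, otherwise flags_on is ORed in.  */
#if 0
#include <string.h>

struct ext_entry { const char *name; unsigned flags_on, flags_off; };
static const struct ext_entry ext_table[] =
  { { "fp", 0x1, 0x1 }, { "simd", 0x2, 0x3 }, { NULL, 0, 0 } };

static unsigned
parse_ext_sketch (const char *str, unsigned flags)
{
  /* STR points at the first '+', e.g. "+simd+nofp".  */
  while (str != NULL && *str != 0)
    {
      const char *end;
      size_t len;
      int adding = 1;

      str++;		/* Skip the '+'.  */
      end = strchr (str, '+');
      len = end ? (size_t) (end - str) : strlen (str);

      if (len >= 2 && strncmp (str, "no", 2) == 0)
	{
	  adding = 0;
	  str += 2;
	  len -= 2;
	}

      for (const struct ext_entry *e = ext_table; e->name != NULL; e++)
	if (strlen (e->name) == len && strncmp (e->name, str, len) == 0)
	  {
	    flags = adding ? (flags | e->flags_on) : (flags & ~e->flags_off);
	    break;
	  }

      str = end;
    }
  return flags;
}
#endif
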
7020 /* Parse the ARCH string. */
7021
7022 static void
7023 aarch64_parse_arch (void)
7024 {
7025 char *ext;
7026 const struct processor *arch;
7027 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7028 size_t len;
7029
7030 strcpy (str, aarch64_arch_string);
7031
7032 ext = strchr (str, '+');
7033
7034 if (ext != NULL)
7035 len = ext - str;
7036 else
7037 len = strlen (str);
7038
7039 if (len == 0)
7040 {
7041 error ("missing arch name in -march=%qs", str);
7042 return;
7043 }
7044
7045 /* Loop through the list of supported ARCHs to find a match. */
7046 for (arch = all_architectures; arch->name != NULL; arch++)
7047 {
7048 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7049 {
7050 selected_arch = arch;
7051 aarch64_isa_flags = selected_arch->flags;
7052
7053 if (!selected_cpu)
7054 selected_cpu = &all_cores[selected_arch->core];
7055
7056 if (ext != NULL)
7057 {
7058 /* ARCH string contains at least one extension. */
7059 aarch64_parse_extension (ext);
7060 }
7061
7062 if (strcmp (selected_arch->arch, selected_cpu->arch))
7063 {
7064 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7065 selected_cpu->name, selected_arch->name);
7066 }
7067
7068 return;
7069 }
7070 }
7071
7072 /* ARCH name not found in list. */
7073 error ("unknown value %qs for -march", str);
7074 return;
7075 }
7076
7077 /* Parse the CPU string. */
7078
7079 static void
7080 aarch64_parse_cpu (void)
7081 {
7082 char *ext;
7083 const struct processor *cpu;
7084 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7085 size_t len;
7086
7087 strcpy (str, aarch64_cpu_string);
7088
7089 ext = strchr (str, '+');
7090
7091 if (ext != NULL)
7092 len = ext - str;
7093 else
7094 len = strlen (str);
7095
7096 if (len == 0)
7097 {
7098 error ("missing cpu name in -mcpu=%qs", str);
7099 return;
7100 }
7101
7102 /* Loop through the list of supported CPUs to find a match. */
7103 for (cpu = all_cores; cpu->name != NULL; cpu++)
7104 {
7105 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7106 {
7107 selected_cpu = cpu;
7108 aarch64_isa_flags = selected_cpu->flags;
7109
7110 if (ext != NULL)
7111 {
7112 /* CPU string contains at least one extension. */
7113 aarch64_parse_extension (ext);
7114 }
7115
7116 return;
7117 }
7118 }
7119
7120 /* CPU name not found in list. */
7121 error ("unknown value %qs for -mcpu", str);
7122 return;
7123 }
7124
7125 /* Parse the TUNE string. */
7126
7127 static void
7128 aarch64_parse_tune (void)
7129 {
7130 const struct processor *cpu;
7131 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7132 strcpy (str, aarch64_tune_string);
7133
7134 /* Loop through the list of supported CPUs to find a match. */
7135 for (cpu = all_cores; cpu->name != NULL; cpu++)
7136 {
7137 if (strcmp (cpu->name, str) == 0)
7138 {
7139 selected_tune = cpu;
7140 return;
7141 }
7142 }
7143
7144 /* CPU name not found in list. */
7145 error ("unknown value %qs for -mtune", str);
7146 return;
7147 }
7148
7149
7150 /* Implement TARGET_OPTION_OVERRIDE. */
7151
7152 static void
7153 aarch64_override_options (void)
7154 {
7155 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7156 If either of -march or -mtune is given, they override their
7157 respective component of -mcpu.
7158
7159 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7160 with -march: if -mcpu is not present on the command line, -march
7161 must set a sensible default CPU. */
7162 if (aarch64_cpu_string)
7163 {
7164 aarch64_parse_cpu ();
7165 }
7166
7167 if (aarch64_arch_string)
7168 {
7169 aarch64_parse_arch ();
7170 }
7171
7172 if (aarch64_tune_string)
7173 {
7174 aarch64_parse_tune ();
7175 }
7176
7177 #ifndef HAVE_AS_MABI_OPTION
7178 /* The compiler may have been configured with 2.23.* binutils, which does
7179 not have support for ILP32. */
7180 if (TARGET_ILP32)
7181 error ("Assembler does not support -mabi=ilp32");
7182 #endif
7183
7184 initialize_aarch64_code_model ();
7185
7186 aarch64_build_bitmask_table ();
7187
7188 /* This target defaults to strict volatile bitfields. */
7189 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7190 flag_strict_volatile_bitfields = 1;
7191
7192 /* If the user did not specify a processor, choose the default
7193 one for them. This will be the CPU set during configuration using
7194 --with-cpu, otherwise it is "generic". */
7195 if (!selected_cpu)
7196 {
7197 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7198 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7199 }
7200
7201 gcc_assert (selected_cpu);
7202
7203 if (!selected_tune)
7204 selected_tune = selected_cpu;
7205
7206 aarch64_tune_flags = selected_tune->flags;
7207 aarch64_tune = selected_tune->core;
7208 aarch64_tune_params = selected_tune->tune;
7209 aarch64_architecture_version = selected_cpu->architecture_version;
7210
7211 if (aarch64_fix_a53_err835769 == 2)
7212 {
7213 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7214 aarch64_fix_a53_err835769 = 1;
7215 #else
7216 aarch64_fix_a53_err835769 = 0;
7217 #endif
7218 }
7219
7220 aarch64_register_fma_steering ();
7221
7222 aarch64_override_options_after_change ();
7223 }
7224
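/* Worked example of the precedence above (illustrative): with
   "-mcpu=cortex-a57 -mtune=cortex-a53", aarch64_parse_cpu selects the
   Cortex-A57 core and its ISA flags, after which aarch64_parse_tune
   overrides only the tuning tables with those of the Cortex-A53.  */
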
7225 /* Implement targetm.override_options_after_change. */
7226
7227 static void
7228 aarch64_override_options_after_change (void)
7229 {
7230 if (flag_omit_frame_pointer)
7231 flag_omit_leaf_frame_pointer = false;
7232 else if (flag_omit_leaf_frame_pointer)
7233 flag_omit_frame_pointer = true;
7234
7235 /* If not optimizing for size, set the default
7236 alignment to what the target wants */
7237 if (!optimize_size)
7238 {
7239 if (align_loops <= 0)
7240 align_loops = aarch64_tune_params->loop_align;
7241 if (align_jumps <= 0)
7242 align_jumps = aarch64_tune_params->jump_align;
7243 if (align_functions <= 0)
7244 align_functions = aarch64_tune_params->function_align;
7245 }
7246 }
7247
7248 static struct machine_function *
7249 aarch64_init_machine_status (void)
7250 {
7251 struct machine_function *machine;
7252 machine = ggc_cleared_alloc<machine_function> ();
7253 return machine;
7254 }
7255
7256 void
7257 aarch64_init_expanders (void)
7258 {
7259 init_machine_status = aarch64_init_machine_status;
7260 }
7261
7262 /* Select the code model to use, checking that it is compatible with the PIC options. */
7263 static void
7264 initialize_aarch64_code_model (void)
7265 {
7266 if (flag_pic)
7267 {
7268 switch (aarch64_cmodel_var)
7269 {
7270 case AARCH64_CMODEL_TINY:
7271 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7272 break;
7273 case AARCH64_CMODEL_SMALL:
7274 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7275 break;
7276 case AARCH64_CMODEL_LARGE:
7277 sorry ("code model %qs with -f%s", "large",
7278 flag_pic > 1 ? "PIC" : "pic");
7279 default:
7280 gcc_unreachable ();
7281 }
7282 }
7283 else
7284 aarch64_cmodel = aarch64_cmodel_var;
7285 }
7286
7287 /* Return true if SYMBOL_REF X binds locally. */
7288
7289 static bool
7290 aarch64_symbol_binds_local_p (const_rtx x)
7291 {
7292 return (SYMBOL_REF_DECL (x)
7293 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7294 : SYMBOL_REF_LOCAL_P (x));
7295 }
7296
7297 /* Return true if SYMBOL_REF X is thread local. */
7298 static bool
7299 aarch64_tls_symbol_p (rtx x)
7300 {
7301 if (! TARGET_HAVE_TLS)
7302 return false;
7303
7304 if (GET_CODE (x) != SYMBOL_REF)
7305 return false;
7306
7307 return SYMBOL_REF_TLS_MODEL (x) != 0;
7308 }
7309
7310 /* Classify a TLS symbol into one of the TLS kinds. */
7311 enum aarch64_symbol_type
7312 aarch64_classify_tls_symbol (rtx x)
7313 {
7314 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7315
7316 switch (tls_kind)
7317 {
7318 case TLS_MODEL_GLOBAL_DYNAMIC:
7319 case TLS_MODEL_LOCAL_DYNAMIC:
7320 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7321
7322 case TLS_MODEL_INITIAL_EXEC:
7323 return SYMBOL_SMALL_GOTTPREL;
7324
7325 case TLS_MODEL_LOCAL_EXEC:
7326 return SYMBOL_SMALL_TPREL;
7327
7328 case TLS_MODEL_EMULATED:
7329 case TLS_MODEL_NONE:
7330 return SYMBOL_FORCE_TO_MEM;
7331
7332 default:
7333 gcc_unreachable ();
7334 }
7335 }
7336
7337 /* Return the method that should be used to access SYMBOL_REF or
7338 LABEL_REF X in context CONTEXT. */
7339
7340 enum aarch64_symbol_type
7341 aarch64_classify_symbol (rtx x, rtx offset,
7342 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7343 {
7344 if (GET_CODE (x) == LABEL_REF)
7345 {
7346 switch (aarch64_cmodel)
7347 {
7348 case AARCH64_CMODEL_LARGE:
7349 return SYMBOL_FORCE_TO_MEM;
7350
7351 case AARCH64_CMODEL_TINY_PIC:
7352 case AARCH64_CMODEL_TINY:
7353 return SYMBOL_TINY_ABSOLUTE;
7354
7355 case AARCH64_CMODEL_SMALL_PIC:
7356 case AARCH64_CMODEL_SMALL:
7357 return SYMBOL_SMALL_ABSOLUTE;
7358
7359 default:
7360 gcc_unreachable ();
7361 }
7362 }
7363
7364 if (GET_CODE (x) == SYMBOL_REF)
7365 {
7366 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7367 return SYMBOL_FORCE_TO_MEM;
7368
7369 if (aarch64_tls_symbol_p (x))
7370 return aarch64_classify_tls_symbol (x);
7371
7372 switch (aarch64_cmodel)
7373 {
7374 case AARCH64_CMODEL_TINY:
7375 /* When we retrieve a symbol + offset address, we have to make sure
7376 the offset does not cause overflow of the final address. But
7377 we have no way of knowing the address of the symbol at compile time,
7378 so we can't accurately say if the distance between the PC and
7379 symbol + offset is outside the addressable range of +/-1M in the
7380 TINY code model. So we rely on images not being greater than
7381 1M, cap the offset at 1M, and anything beyond 1M will have to
7382 be loaded using an alternative mechanism. */
7383 if (SYMBOL_REF_WEAK (x)
7384 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7385 return SYMBOL_FORCE_TO_MEM;
7386 return SYMBOL_TINY_ABSOLUTE;
7387
7388 case AARCH64_CMODEL_SMALL:
7389 /* Same reasoning as the tiny code model, but the offset cap here is
7390 4G. */
7391 if (SYMBOL_REF_WEAK (x)
7392 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7393 HOST_WIDE_INT_C (4294967264)))
7394 return SYMBOL_FORCE_TO_MEM;
7395 return SYMBOL_SMALL_ABSOLUTE;
7396
7397 case AARCH64_CMODEL_TINY_PIC:
7398 if (!aarch64_symbol_binds_local_p (x))
7399 return SYMBOL_TINY_GOT;
7400 return SYMBOL_TINY_ABSOLUTE;
7401
7402 case AARCH64_CMODEL_SMALL_PIC:
7403 if (!aarch64_symbol_binds_local_p (x))
7404 return SYMBOL_SMALL_GOT;
7405 return SYMBOL_SMALL_ABSOLUTE;
7406
7407 default:
7408 gcc_unreachable ();
7409 }
7410 }
7411
7412 /* By default push everything into the constant pool. */
7413 return SYMBOL_FORCE_TO_MEM;
7414 }
7415
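/* Worked example of the offset capping above (illustrative): in the tiny
   code model a non-weak reference to sym + 0x1000 stays
   SYMBOL_TINY_ABSOLUTE, whereas sym + 0x200000 (2M) exceeds the +/-1M cap
   and is classified as SYMBOL_FORCE_TO_MEM.  */
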
7416 bool
7417 aarch64_constant_address_p (rtx x)
7418 {
7419 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7420 }
7421
7422 bool
7423 aarch64_legitimate_pic_operand_p (rtx x)
7424 {
7425 if (GET_CODE (x) == SYMBOL_REF
7426 || (GET_CODE (x) == CONST
7427 && GET_CODE (XEXP (x, 0)) == PLUS
7428 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7429 return false;
7430
7431 return true;
7432 }
7433
7434 /* Return true if X holds either a constant representable in the
7435 quarter-precision FP immediate encoding or floating-point +0.0. */
7436 static bool
7437 aarch64_valid_floating_const (machine_mode mode, rtx x)
7438 {
7439 if (!CONST_DOUBLE_P (x))
7440 return false;
7441
7442 if (aarch64_float_const_zero_rtx_p (x))
7443 return true;
7444
7445 /* Only the 0.0 case above is handled for TFmode; otherwise require SFmode or DFmode. */
7446 if (!(mode == SFmode || mode == DFmode))
7447 return false;
7448
7449 return aarch64_float_const_representable_p (x);
7450 }
7451
7452 static bool
7453 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7454 {
7455 /* Do not allow vector struct mode constants. We could support
7456 0 and -1 easily, but they need support in aarch64-simd.md. */
7457 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7458 return false;
7459
7460 /* This could probably go away because
7461 we now decompose CONST_INTs according to expand_mov_immediate. */
7462 if ((GET_CODE (x) == CONST_VECTOR
7463 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7464 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7465 return !targetm.cannot_force_const_mem (mode, x);
7466
7467 if (GET_CODE (x) == HIGH
7468 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7469 return true;
7470
7471 return aarch64_constant_address_p (x);
7472 }
7473
7474 rtx
7475 aarch64_load_tp (rtx target)
7476 {
7477 if (!target
7478 || GET_MODE (target) != Pmode
7479 || !register_operand (target, Pmode))
7480 target = gen_reg_rtx (Pmode);
7481
7482 /* Can return in any reg. */
7483 emit_insn (gen_aarch64_load_tp_hard (target));
7484 return target;
7485 }
7486
7487 /* On AAPCS systems, this is the "struct __va_list". */
7488 static GTY(()) tree va_list_type;
7489
7490 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7491 Return the type to use as __builtin_va_list.
7492
7493 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7494
7495 struct __va_list
7496 {
7497 void *__stack;
7498 void *__gr_top;
7499 void *__vr_top;
7500 int __gr_offs;
7501 int __vr_offs;
7502 }; */
7503
7504 static tree
7505 aarch64_build_builtin_va_list (void)
7506 {
7507 tree va_list_name;
7508 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7509
7510 /* Create the type. */
7511 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7512 /* Give it the required name. */
7513 va_list_name = build_decl (BUILTINS_LOCATION,
7514 TYPE_DECL,
7515 get_identifier ("__va_list"),
7516 va_list_type);
7517 DECL_ARTIFICIAL (va_list_name) = 1;
7518 TYPE_NAME (va_list_type) = va_list_name;
7519 TYPE_STUB_DECL (va_list_type) = va_list_name;
7520
7521 /* Create the fields. */
7522 f_stack = build_decl (BUILTINS_LOCATION,
7523 FIELD_DECL, get_identifier ("__stack"),
7524 ptr_type_node);
7525 f_grtop = build_decl (BUILTINS_LOCATION,
7526 FIELD_DECL, get_identifier ("__gr_top"),
7527 ptr_type_node);
7528 f_vrtop = build_decl (BUILTINS_LOCATION,
7529 FIELD_DECL, get_identifier ("__vr_top"),
7530 ptr_type_node);
7531 f_groff = build_decl (BUILTINS_LOCATION,
7532 FIELD_DECL, get_identifier ("__gr_offs"),
7533 integer_type_node);
7534 f_vroff = build_decl (BUILTINS_LOCATION,
7535 FIELD_DECL, get_identifier ("__vr_offs"),
7536 integer_type_node);
7537
7538 DECL_ARTIFICIAL (f_stack) = 1;
7539 DECL_ARTIFICIAL (f_grtop) = 1;
7540 DECL_ARTIFICIAL (f_vrtop) = 1;
7541 DECL_ARTIFICIAL (f_groff) = 1;
7542 DECL_ARTIFICIAL (f_vroff) = 1;
7543
7544 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7545 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7546 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7547 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7548 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7549
7550 TYPE_FIELDS (va_list_type) = f_stack;
7551 DECL_CHAIN (f_stack) = f_grtop;
7552 DECL_CHAIN (f_grtop) = f_vrtop;
7553 DECL_CHAIN (f_vrtop) = f_groff;
7554 DECL_CHAIN (f_groff) = f_vroff;
7555
7556 /* Compute its layout. */
7557 layout_type (va_list_type);
7558
7559 return va_list_type;
7560 }
7561
7562 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7563 static void
7564 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7565 {
7566 const CUMULATIVE_ARGS *cum;
7567 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7568 tree stack, grtop, vrtop, groff, vroff;
7569 tree t;
7570 int gr_save_area_size;
7571 int vr_save_area_size;
7572 int vr_offset;
7573
7574 cum = &crtl->args.info;
7575 gr_save_area_size
7576 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7577 vr_save_area_size
7578 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7579
7580 if (!TARGET_FLOAT)
7581 {
7582 gcc_assert (cum->aapcs_nvrn == 0);
7583 vr_save_area_size = 0;
7584 }
7585
7586 f_stack = TYPE_FIELDS (va_list_type_node);
7587 f_grtop = DECL_CHAIN (f_stack);
7588 f_vrtop = DECL_CHAIN (f_grtop);
7589 f_groff = DECL_CHAIN (f_vrtop);
7590 f_vroff = DECL_CHAIN (f_groff);
7591
7592 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7593 NULL_TREE);
7594 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7595 NULL_TREE);
7596 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7597 NULL_TREE);
7598 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7599 NULL_TREE);
7600 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7601 NULL_TREE);
7602
7603 /* Emit code to initialize STACK, which points to the next varargs stack
7604 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7605 by named arguments. STACK is 8-byte aligned. */
7606 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7607 if (cum->aapcs_stack_size > 0)
7608 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7609 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7610 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7611
7612 /* Emit code to initialize GRTOP, the top of the GR save area.
7613 virtual_incoming_args_rtx should have been 16 byte aligned. */
7614 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7615 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7616 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7617
7618 /* Emit code to initialize VRTOP, the top of the VR save area.
7619 This address is gr_save_area_bytes below GRTOP, rounded
7620 down to the next 16-byte boundary. */
7621 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7622 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7623 STACK_BOUNDARY / BITS_PER_UNIT);
7624
7625 if (vr_offset)
7626 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7627 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7628 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7629
7630 /* Emit code to initialize GROFF, the offset from GRTOP of the
7631 next GPR argument. */
7632 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7633 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7634 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7635
7636 /* Likewise emit code to initialize VROFF, the offset from FTOP
7637 of the next VR argument. */
7638 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7639 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7640 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7641 }
7642
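/* Worked example of the initial va_list values set up above (illustrative):
   for a variadic callee with three named integer arguments and one named FP
   argument, gr_save_area_size is (8 - 3) * 8 = 40 and vr_save_area_size is
   (8 - 1) * 16 = 112, so va_start stores __gr_offs = -40, __vr_offs = -112,
   __gr_top at virtual_incoming_args_rtx and __vr_top 48 bytes (40 rounded up
   to 16) below it.  */
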
7643 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7644
7645 static tree
7646 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7647 gimple_seq *post_p ATTRIBUTE_UNUSED)
7648 {
7649 tree addr;
7650 bool indirect_p;
7651 bool is_ha; /* is HFA or HVA. */
7652 bool dw_align; /* double-word align. */
7653 machine_mode ag_mode = VOIDmode;
7654 int nregs;
7655 machine_mode mode;
7656
7657 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7658 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7659 HOST_WIDE_INT size, rsize, adjust, align;
7660 tree t, u, cond1, cond2;
7661
7662 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7663 if (indirect_p)
7664 type = build_pointer_type (type);
7665
7666 mode = TYPE_MODE (type);
7667
7668 f_stack = TYPE_FIELDS (va_list_type_node);
7669 f_grtop = DECL_CHAIN (f_stack);
7670 f_vrtop = DECL_CHAIN (f_grtop);
7671 f_groff = DECL_CHAIN (f_vrtop);
7672 f_vroff = DECL_CHAIN (f_groff);
7673
7674 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7675 f_stack, NULL_TREE);
7676 size = int_size_in_bytes (type);
7677 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7678
7679 dw_align = false;
7680 adjust = 0;
7681 if (aarch64_vfp_is_call_or_return_candidate (mode,
7682 type,
7683 &ag_mode,
7684 &nregs,
7685 &is_ha))
7686 {
7687 /* TYPE passed in fp/simd registers. */
7688 if (!TARGET_FLOAT)
7689 aarch64_err_no_fpadvsimd (mode, "varargs");
7690
7691 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7692 unshare_expr (valist), f_vrtop, NULL_TREE);
7693 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7694 unshare_expr (valist), f_vroff, NULL_TREE);
7695
7696 rsize = nregs * UNITS_PER_VREG;
7697
7698 if (is_ha)
7699 {
7700 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7701 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7702 }
7703 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7704 && size < UNITS_PER_VREG)
7705 {
7706 adjust = UNITS_PER_VREG - size;
7707 }
7708 }
7709 else
7710 {
7711 /* TYPE passed in general registers. */
7712 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7713 unshare_expr (valist), f_grtop, NULL_TREE);
7714 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7715 unshare_expr (valist), f_groff, NULL_TREE);
7716 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7717 nregs = rsize / UNITS_PER_WORD;
7718
7719 if (align > 8)
7720 dw_align = true;
7721
7722 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7723 && size < UNITS_PER_WORD)
7724 {
7725 adjust = UNITS_PER_WORD - size;
7726 }
7727 }
7728
7729 /* Get a local temporary for the field value. */
7730 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7731
7732 /* Emit code to branch if off >= 0. */
7733 t = build2 (GE_EXPR, boolean_type_node, off,
7734 build_int_cst (TREE_TYPE (off), 0));
7735 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7736
7737 if (dw_align)
7738 {
7739 /* Emit: offs = (offs + 15) & -16. */
7740 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7741 build_int_cst (TREE_TYPE (off), 15));
7742 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7743 build_int_cst (TREE_TYPE (off), -16));
7744 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7745 }
7746 else
7747 roundup = NULL;
7748
7749 /* Update ap.__[g|v]r_offs */
7750 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7751 build_int_cst (TREE_TYPE (off), rsize));
7752 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7753
7754 /* String up. */
7755 if (roundup)
7756 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7757
7758 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7759 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7760 build_int_cst (TREE_TYPE (f_off), 0));
7761 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7762
7763 /* String up: make sure the assignment happens before the use. */
7764 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7765 COND_EXPR_ELSE (cond1) = t;
7766
7767 /* Prepare the trees handling the argument that is passed on the stack;
7768 the top-level node will be stored in ON_STACK. */
7769 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7770 if (align > 8)
7771 {
7772 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7773 t = fold_convert (intDI_type_node, arg);
7774 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7775 build_int_cst (TREE_TYPE (t), 15));
7776 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7777 build_int_cst (TREE_TYPE (t), -16));
7778 t = fold_convert (TREE_TYPE (arg), t);
7779 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7780 }
7781 else
7782 roundup = NULL;
7783 /* Advance ap.__stack */
7784 t = fold_convert (intDI_type_node, arg);
7785 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7786 build_int_cst (TREE_TYPE (t), size + 7));
7787 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7788 build_int_cst (TREE_TYPE (t), -8));
7789 t = fold_convert (TREE_TYPE (arg), t);
7790 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7791 /* String up roundup and advance. */
7792 if (roundup)
7793 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7794 /* String up with arg */
7795 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7796 /* Big-endianness related address adjustment. */
7797 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7798 && size < UNITS_PER_WORD)
7799 {
7800 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7801 size_int (UNITS_PER_WORD - size));
7802 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7803 }
7804
7805 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7806 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7807
7808 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7809 t = off;
7810 if (adjust)
7811 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7812 build_int_cst (TREE_TYPE (off), adjust));
7813
7814 t = fold_convert (sizetype, t);
7815 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7816
7817 if (is_ha)
7818 {
7819 /* type ha; // treat as "struct {ftype field[n];}"
7820 ... [computing offs]
7821 for (i = 0; i < nregs; ++i, offs += 16)
7822 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7823 return ha; */
7824 int i;
7825 tree tmp_ha, field_t, field_ptr_t;
7826
7827 /* Declare a local variable. */
7828 tmp_ha = create_tmp_var_raw (type, "ha");
7829 gimple_add_tmp_var (tmp_ha);
7830
7831 /* Establish the base type. */
7832 switch (ag_mode)
7833 {
7834 case SFmode:
7835 field_t = float_type_node;
7836 field_ptr_t = float_ptr_type_node;
7837 break;
7838 case DFmode:
7839 field_t = double_type_node;
7840 field_ptr_t = double_ptr_type_node;
7841 break;
7842 case TFmode:
7843 field_t = long_double_type_node;
7844 field_ptr_t = long_double_ptr_type_node;
7845 break;
7846 /* Half precision and quad precision are not fully supported yet. Enable
7847 the following code once that support is complete; the correct type node
7848 for __fp16 * still needs to be found. */
7849 #if 0
7850 case HFmode:
7851 field_t = float_type_node;
7852 field_ptr_t = float_ptr_type_node;
7853 break;
7854 #endif
7855 case V2SImode:
7856 case V4SImode:
7857 {
7858 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7859 field_t = build_vector_type_for_mode (innertype, ag_mode);
7860 field_ptr_t = build_pointer_type (field_t);
7861 }
7862 break;
7863 default:
7864 gcc_assert (0);
7865 }
7866
7867 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7868 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7869 addr = t;
7870 t = fold_convert (field_ptr_t, addr);
7871 t = build2 (MODIFY_EXPR, field_t,
7872 build1 (INDIRECT_REF, field_t, tmp_ha),
7873 build1 (INDIRECT_REF, field_t, t));
7874
7875 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7876 for (i = 1; i < nregs; ++i)
7877 {
7878 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7879 u = fold_convert (field_ptr_t, addr);
7880 u = build2 (MODIFY_EXPR, field_t,
7881 build2 (MEM_REF, field_t, tmp_ha,
7882 build_int_cst (field_ptr_t,
7883 (i *
7884 int_size_in_bytes (field_t)))),
7885 build1 (INDIRECT_REF, field_t, u));
7886 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7887 }
7888
7889 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7890 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7891 }
7892
7893 COND_EXPR_ELSE (cond2) = t;
7894 addr = fold_convert (build_pointer_type (type), cond1);
7895 addr = build_va_arg_indirect_ref (addr);
7896
7897 if (indirect_p)
7898 addr = build_va_arg_indirect_ref (addr);
7899
7900 return addr;
7901 }
7902
7903 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7904
7905 static void
7906 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7907 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7908 int no_rtl)
7909 {
7910 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7911 CUMULATIVE_ARGS local_cum;
7912 int gr_saved, vr_saved;
7913
7914 /* The caller has advanced CUM up to, but not beyond, the last named
7915 argument. Advance a local copy of CUM past the last "real" named
7916 argument, to find out how many registers are left over. */
7917 local_cum = *cum;
7918 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7919
7920 /* Find out how many registers we need to save. */
7921 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7922 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7923
7924 if (!TARGET_FLOAT)
7925 {
7926 gcc_assert (local_cum.aapcs_nvrn == 0);
7927 vr_saved = 0;
7928 }
7929
7930 if (!no_rtl)
7931 {
7932 if (gr_saved > 0)
7933 {
7934 rtx ptr, mem;
7935
7936 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7937 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7938 - gr_saved * UNITS_PER_WORD);
7939 mem = gen_frame_mem (BLKmode, ptr);
7940 set_mem_alias_set (mem, get_varargs_alias_set ());
7941
7942 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7943 mem, gr_saved);
7944 }
7945 if (vr_saved > 0)
7946 {
7947 /* We can't use move_block_from_reg, because it will use
7948 the wrong mode, storing D regs only. */
7949 machine_mode mode = TImode;
7950 int off, i;
7951
7952 /* Set OFF to the offset from virtual_incoming_args_rtx of
7953 the first vector register. The VR save area lies below
7954 the GR one, and is aligned to 16 bytes. */
7955 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7956 STACK_BOUNDARY / BITS_PER_UNIT);
7957 off -= vr_saved * UNITS_PER_VREG;
7958
7959 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7960 {
7961 rtx ptr, mem;
7962
7963 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7964 mem = gen_frame_mem (mode, ptr);
7965 set_mem_alias_set (mem, get_varargs_alias_set ());
7966 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7967 off += UNITS_PER_VREG;
7968 }
7969 }
7970 }
7971
7972 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7973 any complication of having crtl->args.pretend_args_size changed. */
7974 cfun->machine->frame.saved_varargs_size
7975 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7976 STACK_BOUNDARY / BITS_PER_UNIT)
7977 + vr_saved * UNITS_PER_VREG);
7978 }
7979
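/* Illustrative layout of the register save area built above, with
   virtual_incoming_args_rtx at the top and addresses decreasing downwards:

	virtual_incoming_args_rtx:	incoming stack arguments
	- gr_saved * 8:			GR save area (x registers)
	- round up to 16 bytes
	- vr_saved * 16:		VR save area (q registers, saved
					as TImode)

   saved_varargs_size records the total size of both areas.  */
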
7980 static void
7981 aarch64_conditional_register_usage (void)
7982 {
7983 int i;
7984 if (!TARGET_FLOAT)
7985 {
7986 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7987 {
7988 fixed_regs[i] = 1;
7989 call_used_regs[i] = 1;
7990 }
7991 }
7992 }
7993
7994 /* Walk down the type tree of TYPE counting consecutive base elements.
7995 If *MODEP is VOIDmode, then set it to the first valid floating point
7996 type. If a non-floating point type is found, or if a floating point
7997 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7998 otherwise return the count in the sub-tree. */
7999 static int
8000 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8001 {
8002 machine_mode mode;
8003 HOST_WIDE_INT size;
8004
8005 switch (TREE_CODE (type))
8006 {
8007 case REAL_TYPE:
8008 mode = TYPE_MODE (type);
8009 if (mode != DFmode && mode != SFmode && mode != TFmode)
8010 return -1;
8011
8012 if (*modep == VOIDmode)
8013 *modep = mode;
8014
8015 if (*modep == mode)
8016 return 1;
8017
8018 break;
8019
8020 case COMPLEX_TYPE:
8021 mode = TYPE_MODE (TREE_TYPE (type));
8022 if (mode != DFmode && mode != SFmode && mode != TFmode)
8023 return -1;
8024
8025 if (*modep == VOIDmode)
8026 *modep = mode;
8027
8028 if (*modep == mode)
8029 return 2;
8030
8031 break;
8032
8033 case VECTOR_TYPE:
8034 /* Use V2SImode and V4SImode as representatives of all 64-bit
8035 and 128-bit vector types. */
8036 size = int_size_in_bytes (type);
8037 switch (size)
8038 {
8039 case 8:
8040 mode = V2SImode;
8041 break;
8042 case 16:
8043 mode = V4SImode;
8044 break;
8045 default:
8046 return -1;
8047 }
8048
8049 if (*modep == VOIDmode)
8050 *modep = mode;
8051
8052 /* Vector modes are considered to be opaque: two vectors are
8053 equivalent for the purposes of being homogeneous aggregates
8054 if they are the same size. */
8055 if (*modep == mode)
8056 return 1;
8057
8058 break;
8059
8060 case ARRAY_TYPE:
8061 {
8062 int count;
8063 tree index = TYPE_DOMAIN (type);
8064
8065 /* Can't handle incomplete types nor sizes that are not
8066 fixed. */
8067 if (!COMPLETE_TYPE_P (type)
8068 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8069 return -1;
8070
8071 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8072 if (count == -1
8073 || !index
8074 || !TYPE_MAX_VALUE (index)
8075 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8076 || !TYPE_MIN_VALUE (index)
8077 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8078 || count < 0)
8079 return -1;
8080
8081 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8082 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8083
8084 /* There must be no padding. */
8085 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8086 return -1;
8087
8088 return count;
8089 }
8090
8091 case RECORD_TYPE:
8092 {
8093 int count = 0;
8094 int sub_count;
8095 tree field;
8096
8097 /* Can't handle incomplete types nor sizes that are not
8098 fixed. */
8099 if (!COMPLETE_TYPE_P (type)
8100 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8101 return -1;
8102
8103 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8104 {
8105 if (TREE_CODE (field) != FIELD_DECL)
8106 continue;
8107
8108 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8109 if (sub_count < 0)
8110 return -1;
8111 count += sub_count;
8112 }
8113
8114 /* There must be no padding. */
8115 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8116 return -1;
8117
8118 return count;
8119 }
8120
8121 case UNION_TYPE:
8122 case QUAL_UNION_TYPE:
8123 {
8124 /* These aren't very interesting except in a degenerate case. */
8125 int count = 0;
8126 int sub_count;
8127 tree field;
8128
8129 /* Can't handle incomplete types nor sizes that are not
8130 fixed. */
8131 if (!COMPLETE_TYPE_P (type)
8132 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8133 return -1;
8134
8135 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8136 {
8137 if (TREE_CODE (field) != FIELD_DECL)
8138 continue;
8139
8140 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8141 if (sub_count < 0)
8142 return -1;
8143 count = count > sub_count ? count : sub_count;
8144 }
8145
8146 /* There must be no padding. */
8147 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8148 return -1;
8149
8150 return count;
8151 }
8152
8153 default:
8154 break;
8155 }
8156
8157 return -1;
8158 }
8159
8160 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8161 type as described in AAPCS64 \S 4.1.2.
8162
8163 See the comment above aarch64_composite_type_p for the notes on MODE. */
8164
8165 static bool
8166 aarch64_short_vector_p (const_tree type,
8167 machine_mode mode)
8168 {
8169 HOST_WIDE_INT size = -1;
8170
8171 if (type && TREE_CODE (type) == VECTOR_TYPE)
8172 size = int_size_in_bytes (type);
8173 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8174 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8175 size = GET_MODE_SIZE (mode);
8176
8177 return (size == 8 || size == 16);
8178 }
8179
8180 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8181 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8182 array types. The C99 floating-point complex types are also considered
8183 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8184 types, which are GCC extensions and out of the scope of AAPCS64, are
8185 treated as composite types here as well.
8186
8187 Note that MODE itself is not sufficient in determining whether a type
8188 is such a composite type or not. This is because
8189 stor-layout.c:compute_record_mode may have already changed the MODE
8190 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8191 structure with only one field may have its MODE set to the mode of the
8192 field. Also an integer mode whose size matches the size of the
8193 RECORD_TYPE type may be used to substitute the original mode
8194 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8195 solely relied on. */
8196
8197 static bool
8198 aarch64_composite_type_p (const_tree type,
8199 machine_mode mode)
8200 {
8201 if (aarch64_short_vector_p (type, mode))
8202 return false;
8203
8204 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8205 return true;
8206
8207 if (mode == BLKmode
8208 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8210 return true;
8211
8212 return false;
8213 }
8214
8215 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8216 shall be passed or returned in simd/fp register(s) (providing these
8217 parameter passing registers are available).
8218
8219 Upon successful return, *COUNT returns the number of needed registers,
8220 *BASE_MODE returns the mode of the individual register and when IS_HA
8221 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8222 floating-point aggregate or a homogeneous short-vector aggregate. */
8223
8224 static bool
8225 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8226 const_tree type,
8227 machine_mode *base_mode,
8228 int *count,
8229 bool *is_ha)
8230 {
8231 machine_mode new_mode = VOIDmode;
8232 bool composite_p = aarch64_composite_type_p (type, mode);
8233
8234 if (is_ha != NULL) *is_ha = false;
8235
8236 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8237 || aarch64_short_vector_p (type, mode))
8238 {
8239 *count = 1;
8240 new_mode = mode;
8241 }
8242 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8243 {
8244 if (is_ha != NULL) *is_ha = true;
8245 *count = 2;
8246 new_mode = GET_MODE_INNER (mode);
8247 }
8248 else if (type && composite_p)
8249 {
8250 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8251
8252 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8253 {
8254 if (is_ha != NULL) *is_ha = true;
8255 *count = ag_count;
8256 }
8257 else
8258 return false;
8259 }
8260 else
8261 return false;
8262
8263 *base_mode = new_mode;
8264 return true;
8265 }
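
/* For example (a sketch of the expected results rather than anything
   exercised here): struct { float x, y, z; } yields *count == 3,
   *base_mode == SFmode and *is_ha == true; _Complex double yields
   *count == 2, *base_mode == DFmode and *is_ha == true; a lone 128-bit
   short vector yields *count == 1 with *base_mode set to the vector
   mode itself.  */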
8266
8267 /* Implement TARGET_STRUCT_VALUE_RTX. */
8268
8269 static rtx
8270 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8271 int incoming ATTRIBUTE_UNUSED)
8272 {
8273 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8274 }
8275
8276 /* Implements target hook vector_mode_supported_p. */
8277 static bool
8278 aarch64_vector_mode_supported_p (machine_mode mode)
8279 {
8280 if (TARGET_SIMD
8281 && (mode == V4SImode || mode == V8HImode
8282 || mode == V16QImode || mode == V2DImode
8283 || mode == V2SImode || mode == V4HImode
8284 || mode == V8QImode || mode == V2SFmode
8285 || mode == V4SFmode || mode == V2DFmode
8286 || mode == V1DFmode))
8287 return true;
8288
8289 return false;
8290 }
8291
8292 /* Return appropriate SIMD container
8293 for MODE within a vector of WIDTH bits. */
8294 static machine_mode
8295 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8296 {
8297 gcc_assert (width == 64 || width == 128);
8298 if (TARGET_SIMD)
8299 {
8300 if (width == 128)
8301 switch (mode)
8302 {
8303 case DFmode:
8304 return V2DFmode;
8305 case SFmode:
8306 return V4SFmode;
8307 case SImode:
8308 return V4SImode;
8309 case HImode:
8310 return V8HImode;
8311 case QImode:
8312 return V16QImode;
8313 case DImode:
8314 return V2DImode;
8315 default:
8316 break;
8317 }
8318 else
8319 switch (mode)
8320 {
8321 case SFmode:
8322 return V2SFmode;
8323 case SImode:
8324 return V2SImode;
8325 case HImode:
8326 return V4HImode;
8327 case QImode:
8328 return V8QImode;
8329 default:
8330 break;
8331 }
8332 }
8333 return word_mode;
8334 }
8335
8336 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8337 static machine_mode
8338 aarch64_preferred_simd_mode (machine_mode mode)
8339 {
8340 return aarch64_simd_container_mode (mode, 128);
8341 }
8342
8343 /* Return the bitmask of possible vector sizes for the vectorizer
8344 to iterate over. */
8345 static unsigned int
8346 aarch64_autovectorize_vector_sizes (void)
8347 {
8348 return (16 | 8);
8349 }
8350
8351 /* Implement TARGET_MANGLE_TYPE. */
8352
8353 static const char *
8354 aarch64_mangle_type (const_tree type)
8355 {
8356 /* The AArch64 ABI documents say that "__va_list" has to be
8357 mangled as if it is in the "std" namespace.
8358 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8359 return "St9__va_list";
8360
8361 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8362 builtin types. */
8363 if (TYPE_NAME (type) != NULL)
8364 return aarch64_mangle_builtin_type (type);
8365
8366 /* Use the default mangling. */
8367 return NULL;
8368 }
8369
8370
8371 /* Return true if the rtx_insn contains a MEM RTX somewhere
8372 in it. */
8373
8374 static bool
8375 has_memory_op (rtx_insn *mem_insn)
8376 {
8377 subrtx_iterator::array_type array;
8378 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8379 if (MEM_P (*iter))
8380 return true;
8381
8382 return false;
8383 }
8384
8385 /* Find the first rtx_insn before insn that will generate an assembly
8386 instruction. */
8387
8388 static rtx_insn *
8389 aarch64_prev_real_insn (rtx_insn *insn)
8390 {
8391 if (!insn)
8392 return NULL;
8393
8394 do
8395 {
8396 insn = prev_real_insn (insn);
8397 }
8398 while (insn && recog_memoized (insn) < 0);
8399
8400 return insn;
8401 }
8402
8403 static bool
8404 is_madd_op (enum attr_type t1)
8405 {
8406 unsigned int i;
8407 /* A number of these may be AArch32 only. */
8408 enum attr_type mlatypes[] = {
8409 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8410 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8411 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8412 };
8413
8414 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8415 {
8416 if (t1 == mlatypes[i])
8417 return true;
8418 }
8419
8420 return false;
8421 }
8422
8423 /* Check if there is a register dependency between a load and the insn
8424 for which we hold recog_data. */
8425
8426 static bool
8427 dep_between_memop_and_curr (rtx memop)
8428 {
8429 rtx load_reg;
8430 int opno;
8431
8432 gcc_assert (GET_CODE (memop) == SET);
8433
8434 if (!REG_P (SET_DEST (memop)))
8435 return false;
8436
8437 load_reg = SET_DEST (memop);
8438 for (opno = 1; opno < recog_data.n_operands; opno++)
8439 {
8440 rtx operand = recog_data.operand[opno];
8441 if (REG_P (operand)
8442 && reg_overlap_mentioned_p (load_reg, operand))
8443 return true;
8444
8445 }
8446 return false;
8447 }
8448
8449
8450 /* When working around the Cortex-A53 erratum 835769,
8451 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8452 instruction and has a preceding memory instruction such that a NOP
8453 should be inserted between them. */
8454
8455 bool
8456 aarch64_madd_needs_nop (rtx_insn* insn)
8457 {
8458 enum attr_type attr_type;
8459 rtx_insn *prev;
8460 rtx body;
8461
8462 if (!aarch64_fix_a53_err835769)
8463 return false;
8464
8465 if (recog_memoized (insn) < 0)
8466 return false;
8467
8468 attr_type = get_attr_type (insn);
8469 if (!is_madd_op (attr_type))
8470 return false;
8471
8472 prev = aarch64_prev_real_insn (insn);
8473 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8474 Restore recog state to INSN to avoid state corruption. */
8475 extract_constrain_insn_cached (insn);
8476
8477 if (!prev || !has_memory_op (prev))
8478 return false;
8479
8480 body = single_set (prev);
8481
8482 /* If the previous insn is a memory op and there is no dependency between
8483 it and the DImode madd, emit a NOP between them. If body is NULL then we
8484 have a complex memory operation, probably a load/store pair.
8485 Be conservative for now and emit a NOP. */
8486 if (GET_MODE (recog_data.operand[0]) == DImode
8487 && (!body || !dep_between_memop_and_curr (body)))
8488 return true;
8489
8490 return false;
8491
8492 }
8493
8494
8495 /* Implement FINAL_PRESCAN_INSN. */
8496
8497 void
8498 aarch64_final_prescan_insn (rtx_insn *insn)
8499 {
8500 if (aarch64_madd_needs_nop (insn))
8501 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8502 }
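
/* An illustrative example (register choices are arbitrary): with
   -mfix-cortex-a53-835769 enabled, a sequence such as

	ldr	x2, [x0]
	madd	x3, x4, x5, x6

   is emitted as

	ldr	x2, [x0]
	nop	// between mem op and mult-accumulate
	madd	x3, x4, x5, x6

   so that the 64-bit multiply-accumulate no longer immediately follows
   the memory operation.  */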
8503
8504
8505 /* Return the equivalent letter for size. */
8506 static char
8507 sizetochar (int size)
8508 {
8509 switch (size)
8510 {
8511 case 64: return 'd';
8512 case 32: return 's';
8513 case 16: return 'h';
8514 case 8 : return 'b';
8515 default: gcc_unreachable ();
8516 }
8517 }
8518
8519 /* Return true iff x is a uniform vector of floating-point
8520 constants, and the constant can be represented in
8521 quarter-precision form. Note, as aarch64_float_const_representable_p
8522 rejects both +0.0 and -0.0, we will also reject them here. */
8523 static bool
8524 aarch64_vect_float_const_representable_p (rtx x)
8525 {
8526 int i = 0;
8527 REAL_VALUE_TYPE r0, ri;
8528 rtx x0, xi;
8529
8530 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8531 return false;
8532
8533 x0 = CONST_VECTOR_ELT (x, 0);
8534 if (!CONST_DOUBLE_P (x0))
8535 return false;
8536
8537 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8538
8539 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8540 {
8541 xi = CONST_VECTOR_ELT (x, i);
8542 if (!CONST_DOUBLE_P (xi))
8543 return false;
8544
8545 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8546 if (!REAL_VALUES_EQUAL (r0, ri))
8547 return false;
8548 }
8549
8550 return aarch64_float_const_representable_p (x0);
8551 }
8552
8553 /* Return true for valid and false for invalid. */
8554 bool
8555 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8556 struct simd_immediate_info *info)
8557 {
8558 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8559 matches = 1; \
8560 for (i = 0; i < idx; i += (STRIDE)) \
8561 if (!(TEST)) \
8562 matches = 0; \
8563 if (matches) \
8564 { \
8565 immtype = (CLASS); \
8566 elsize = (ELSIZE); \
8567 eshift = (SHIFT); \
8568 emvn = (NEG); \
8569 break; \
8570 }
8571
8572 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8573 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8574 unsigned char bytes[16];
8575 int immtype = -1, matches;
8576 unsigned int invmask = inverse ? 0xff : 0;
8577 int eshift, emvn;
8578
8579 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8580 {
8581 if (! (aarch64_simd_imm_zero_p (op, mode)
8582 || aarch64_vect_float_const_representable_p (op)))
8583 return false;
8584
8585 if (info)
8586 {
8587 info->value = CONST_VECTOR_ELT (op, 0);
8588 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8589 info->mvn = false;
8590 info->shift = 0;
8591 }
8592
8593 return true;
8594 }
8595
8596 /* Splat vector constant out into a byte vector. */
8597 for (i = 0; i < n_elts; i++)
8598 {
8599 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8600 it must be laid out in the vector register in reverse order. */
8601 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8602 unsigned HOST_WIDE_INT elpart;
8603 unsigned int part, parts;
8604
8605 if (CONST_INT_P (el))
8606 {
8607 elpart = INTVAL (el);
8608 parts = 1;
8609 }
8610 else if (GET_CODE (el) == CONST_DOUBLE)
8611 {
8612 elpart = CONST_DOUBLE_LOW (el);
8613 parts = 2;
8614 }
8615 else
8616 gcc_unreachable ();
8617
8618 for (part = 0; part < parts; part++)
8619 {
8620 unsigned int byte;
8621 for (byte = 0; byte < innersize; byte++)
8622 {
8623 bytes[idx++] = (elpart & 0xff) ^ invmask;
8624 elpart >>= BITS_PER_UNIT;
8625 }
8626 if (GET_CODE (el) == CONST_DOUBLE)
8627 elpart = CONST_DOUBLE_HIGH (el);
8628 }
8629 }
8630
8631 /* Sanity check. */
8632 gcc_assert (idx == GET_MODE_SIZE (mode));
8633
8634 do
8635 {
8636 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8637 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8638
8639 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8640 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8641
8642 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8643 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8644
8645 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8646 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8647
8648 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8649
8650 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8651
8652 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8653 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8654
8655 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8656 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8657
8658 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8659 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8660
8661 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8662 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8663
8664 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8665
8666 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8667
8668 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8669 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8670
8671 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8672 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8673
8674 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8675 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8676
8677 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8678 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8679
8680 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8681
8682 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8683 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8684 }
8685 while (0);
8686
8687 if (immtype == -1)
8688 return false;
8689
8690 if (info)
8691 {
8692 info->element_width = elsize;
8693 info->mvn = emvn != 0;
8694 info->shift = eshift;
8695
8696 unsigned HOST_WIDE_INT imm = 0;
8697
8698 if (immtype >= 12 && immtype <= 15)
8699 info->msl = true;
8700
8701 /* Un-invert bytes of recognized vector, if necessary. */
8702 if (invmask != 0)
8703 for (i = 0; i < idx; i++)
8704 bytes[i] ^= invmask;
8705
8706 if (immtype == 17)
8707 {
8708 /* FIXME: Broken on 32-bit H_W_I hosts. */
8709 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8710
8711 for (i = 0; i < 8; i++)
8712 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8713 << (i * BITS_PER_UNIT);
8714
8715
8716 info->value = GEN_INT (imm);
8717 }
8718 else
8719 {
8720 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8721 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8722
8723 /* Construct 'abcdefgh' because the assembler cannot handle
8724 generic constants. */
8725 if (info->mvn)
8726 imm = ~imm;
8727 imm = (imm >> info->shift) & 0xff;
8728 info->value = GEN_INT (imm);
8729 }
8730 }
8731
8732 return true;
8733 #undef CHECK
8734 }
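
/* A worked example (values chosen for illustration): the V4SImode
   constant { 0x00ab0000, 0x00ab0000, 0x00ab0000, 0x00ab0000 } splats
   each element to the bytes { 0x00, 0x00, 0xab, 0x00 }, which matches
   the immtype 2 CHECK above.  That gives element_width == 32,
   shift == 16, mvn == false and value == 0xab, i.e. the constant can
   be emitted as "movi v0.4s, 0xab, lsl 16".  */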
8735
8736 /* Check if immediate shift constants are within range. */
8737 bool
8738 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8739 {
8740 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8741 if (left)
8742 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8743 else
8744 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8745 }
8746
8747 /* Return true if X is a uniform vector where all elements
8748 are either the floating-point constant 0.0 or the
8749 integer constant 0. */
8750 bool
8751 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8752 {
8753 return x == CONST0_RTX (mode);
8754 }
8755
8756 bool
8757 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8758 {
8759 HOST_WIDE_INT imm = INTVAL (x);
8760 int i;
8761
8762 for (i = 0; i < 8; i++)
8763 {
8764 unsigned int byte = imm & 0xff;
8765 if (byte != 0xff && byte != 0)
8766 return false;
8767 imm >>= 8;
8768 }
8769
8770 return true;
8771 }
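
/* For instance, 0xff00ffff00ff0000 is accepted here because every byte
   is either 0x00 or 0xff, whereas 0x0000000000000012 is rejected
   because of the 0x12 byte.  */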
8772
8773 bool
8774 aarch64_mov_operand_p (rtx x,
8775 enum aarch64_symbol_context context,
8776 machine_mode mode)
8777 {
8778 if (GET_CODE (x) == HIGH
8779 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8780 return true;
8781
8782 if (CONST_INT_P (x))
8783 return true;
8784
8785 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8786 return true;
8787
8788 return aarch64_classify_symbolic_expression (x, context)
8789 == SYMBOL_TINY_ABSOLUTE;
8790 }
8791
8792 /* Return a const_int vector of VAL. */
8793 rtx
8794 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8795 {
8796 int nunits = GET_MODE_NUNITS (mode);
8797 rtvec v = rtvec_alloc (nunits);
8798 int i;
8799
8800 for (i = 0; i < nunits; i++)
8801 RTVEC_ELT (v, i) = GEN_INT (val);
8802
8803 return gen_rtx_CONST_VECTOR (mode, v);
8804 }
8805
8806 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8807
8808 bool
8809 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8810 {
8811 machine_mode vmode;
8812
8813 gcc_assert (!VECTOR_MODE_P (mode));
8814 vmode = aarch64_preferred_simd_mode (mode);
8815 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8816 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8817 }
8818
8819 /* Construct and return a PARALLEL RTX vector with elements numbering the
8820 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8821 the vector - from the perspective of the architecture. This does not
8822 line up with GCC's perspective on lane numbers, so we end up with
8823 different masks depending on our target endian-ness. The diagram
8824 below may help. We must draw the distinction when building masks
8825 which select one half of the vector. An instruction selecting
8826 architectural low-lanes for a big-endian target must be described using
8827 a mask selecting GCC high-lanes.
8828
8829 Big-Endian Little-Endian
8830
8831 GCC 0 1 2 3 3 2 1 0
8832 | x | x | x | x | | x | x | x | x |
8833 Architecture 3 2 1 0 3 2 1 0
8834
8835 Low Mask: { 2, 3 } { 0, 1 }
8836 High Mask: { 0, 1 } { 2, 3 }
8837 */
8838
8839 rtx
8840 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8841 {
8842 int nunits = GET_MODE_NUNITS (mode);
8843 rtvec v = rtvec_alloc (nunits / 2);
8844 int high_base = nunits / 2;
8845 int low_base = 0;
8846 int base;
8847 rtx t1;
8848 int i;
8849
8850 if (BYTES_BIG_ENDIAN)
8851 base = high ? low_base : high_base;
8852 else
8853 base = high ? high_base : low_base;
8854
8855 for (i = 0; i < nunits / 2; i++)
8856 RTVEC_ELT (v, i) = GEN_INT (base + i);
8857
8858 t1 = gen_rtx_PARALLEL (mode, v);
8859 return t1;
8860 }
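
/* For example, for V4SImode and HIGH == true this returns
   (parallel [(const_int 2) (const_int 3)]) on little-endian and
   (parallel [(const_int 0) (const_int 1)]) on big-endian, matching the
   diagram above.  */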
8861
8862 /* Check OP for validity as a PARALLEL RTX vector with elements
8863 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8864 from the perspective of the architecture. See the diagram above
8865 aarch64_simd_vect_par_cnst_half for more details. */
8866
8867 bool
8868 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8869 bool high)
8870 {
8871 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8872 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8873 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8874 int i = 0;
8875
8876 if (!VECTOR_MODE_P (mode))
8877 return false;
8878
8879 if (count_op != count_ideal)
8880 return false;
8881
8882 for (i = 0; i < count_ideal; i++)
8883 {
8884 rtx elt_op = XVECEXP (op, 0, i);
8885 rtx elt_ideal = XVECEXP (ideal, 0, i);
8886
8887 if (!CONST_INT_P (elt_op)
8888 || INTVAL (elt_ideal) != INTVAL (elt_op))
8889 return false;
8890 }
8891 return true;
8892 }
8893
8894 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8895 HIGH (exclusive). */
8896 void
8897 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8898 const_tree exp)
8899 {
8900 HOST_WIDE_INT lane;
8901 gcc_assert (CONST_INT_P (operand));
8902 lane = INTVAL (operand);
8903
8904 if (lane < low || lane >= high)
8905 {
8906 if (exp)
8907 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
8908 else
8909 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
8910 }
8911 }
8912
8913 /* Return TRUE if OP is a valid vector addressing mode. */
8914 bool
8915 aarch64_simd_mem_operand_p (rtx op)
8916 {
8917 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8918 || REG_P (XEXP (op, 0)));
8919 }
8920
8921 /* Emit a register copy from operand to operand, taking care not to
8922 early-clobber source registers in the process.
8923
8924 COUNT is the number of components into which the copy needs to be
8925 decomposed. */
8926 void
8927 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8928 unsigned int count)
8929 {
8930 unsigned int i;
8931 int rdest = REGNO (operands[0]);
8932 int rsrc = REGNO (operands[1]);
8933
8934 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8935 || rdest < rsrc)
8936 for (i = 0; i < count; i++)
8937 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8938 gen_rtx_REG (mode, rsrc + i));
8939 else
8940 for (i = 0; i < count; i++)
8941 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8942 gen_rtx_REG (mode, rsrc + count - i - 1));
8943 }
8944
8945 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8946 one of VSTRUCT modes: OI, CI or XI. */
8947 int
8948 aarch64_simd_attr_length_move (rtx_insn *insn)
8949 {
8950 machine_mode mode;
8951
8952 extract_insn_cached (insn);
8953
8954 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8955 {
8956 mode = GET_MODE (recog_data.operand[0]);
8957 switch (mode)
8958 {
8959 case OImode:
8960 return 8;
8961 case CImode:
8962 return 12;
8963 case XImode:
8964 return 16;
8965 default:
8966 gcc_unreachable ();
8967 }
8968 }
8969 return 4;
8970 }
8971
8972 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8973 one of VSTRUCT modes: OI, CI, EI, or XI. */
8974 int
8975 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8976 {
8977 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8978 }
8979
8980 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8981 alignment of a vector to 128 bits. */
8982 static HOST_WIDE_INT
8983 aarch64_simd_vector_alignment (const_tree type)
8984 {
8985 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8986 return MIN (align, 128);
8987 }
8988
8989 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8990 static bool
8991 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8992 {
8993 if (is_packed)
8994 return false;
8995
8996 /* We guarantee alignment for vectors up to 128-bits. */
8997 if (tree_int_cst_compare (TYPE_SIZE (type),
8998 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8999 return false;
9000
9001 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9002 return true;
9003 }
9004
9005 /* If VALS is a vector constant that can be loaded into a register
9006 using DUP, generate instructions to do so and return an RTX to
9007 assign to the register. Otherwise return NULL_RTX. */
9008 static rtx
9009 aarch64_simd_dup_constant (rtx vals)
9010 {
9011 machine_mode mode = GET_MODE (vals);
9012 machine_mode inner_mode = GET_MODE_INNER (mode);
9013 int n_elts = GET_MODE_NUNITS (mode);
9014 bool all_same = true;
9015 rtx x;
9016 int i;
9017
9018 if (GET_CODE (vals) != CONST_VECTOR)
9019 return NULL_RTX;
9020
9021 for (i = 1; i < n_elts; ++i)
9022 {
9023 x = CONST_VECTOR_ELT (vals, i);
9024 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9025 all_same = false;
9026 }
9027
9028 if (!all_same)
9029 return NULL_RTX;
9030
9031 /* We can load this constant by using DUP and a constant in a
9032 single ARM register. This will be cheaper than a vector
9033 load. */
9034 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9035 return gen_rtx_VEC_DUPLICATE (mode, x);
9036 }
9037
9038
9039 /* Generate code to load VALS, which is a PARALLEL containing only
9040 constants (for vec_init) or CONST_VECTOR, efficiently into a
9041 register. Returns an RTX to copy into the register, or NULL_RTX
9042 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9043 static rtx
9044 aarch64_simd_make_constant (rtx vals)
9045 {
9046 machine_mode mode = GET_MODE (vals);
9047 rtx const_dup;
9048 rtx const_vec = NULL_RTX;
9049 int n_elts = GET_MODE_NUNITS (mode);
9050 int n_const = 0;
9051 int i;
9052
9053 if (GET_CODE (vals) == CONST_VECTOR)
9054 const_vec = vals;
9055 else if (GET_CODE (vals) == PARALLEL)
9056 {
9057 /* A CONST_VECTOR must contain only CONST_INTs and
9058 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9059 Only store valid constants in a CONST_VECTOR. */
9060 for (i = 0; i < n_elts; ++i)
9061 {
9062 rtx x = XVECEXP (vals, 0, i);
9063 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9064 n_const++;
9065 }
9066 if (n_const == n_elts)
9067 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9068 }
9069 else
9070 gcc_unreachable ();
9071
9072 if (const_vec != NULL_RTX
9073 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9074 /* Load using MOVI/MVNI. */
9075 return const_vec;
9076 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9077 /* Loaded using DUP. */
9078 return const_dup;
9079 else if (const_vec != NULL_RTX)
9080 /* Load from constant pool. We can not take advantage of single-cycle
9081 LD1 because we need a PC-relative addressing mode. */
9082 return const_vec;
9083 else
9084 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9085 We can not construct an initializer. */
9086 return NULL_RTX;
9087 }
9088
9089 void
9090 aarch64_expand_vector_init (rtx target, rtx vals)
9091 {
9092 machine_mode mode = GET_MODE (target);
9093 machine_mode inner_mode = GET_MODE_INNER (mode);
9094 int n_elts = GET_MODE_NUNITS (mode);
9095 int n_var = 0;
9096 rtx any_const = NULL_RTX;
9097 bool all_same = true;
9098
9099 for (int i = 0; i < n_elts; ++i)
9100 {
9101 rtx x = XVECEXP (vals, 0, i);
9102 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9103 ++n_var;
9104 else
9105 any_const = x;
9106
9107 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9108 all_same = false;
9109 }
9110
9111 if (n_var == 0)
9112 {
9113 rtx constant = aarch64_simd_make_constant (vals);
9114 if (constant != NULL_RTX)
9115 {
9116 emit_move_insn (target, constant);
9117 return;
9118 }
9119 }
9120
9121 /* Splat a single non-constant element if we can. */
9122 if (all_same)
9123 {
9124 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9125 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9126 return;
9127 }
9128
9129 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9130 varying fields. Hope that this is more efficient than using the stack. */
9131 if (n_var <= n_elts/2)
9132 {
9133 rtx copy = copy_rtx (vals);
9134
9135 /* Load constant part of vector. We really don't care what goes into the
9136 parts we will overwrite, but we're more likely to be able to load the
9137 constant efficiently if it has fewer, larger, repeating parts
9138 (see aarch64_simd_valid_immediate). */
9139 for (int i = 0; i < n_elts; i++)
9140 {
9141 rtx x = XVECEXP (vals, 0, i);
9142 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9143 continue;
9144 rtx subst = any_const;
9145 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9146 {
9147 /* Look in the copied vector, as more elements are const. */
9148 rtx test = XVECEXP (copy, 0, i ^ bit);
9149 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9150 {
9151 subst = test;
9152 break;
9153 }
9154 }
9155 XVECEXP (copy, 0, i) = subst;
9156 }
9157 aarch64_expand_vector_init (target, copy);
9158
9159 /* Insert variables. */
9160 enum insn_code icode = optab_handler (vec_set_optab, mode);
9161 gcc_assert (icode != CODE_FOR_nothing);
9162
9163 for (int i = 0; i < n_elts; i++)
9164 {
9165 rtx x = XVECEXP (vals, 0, i);
9166 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9167 continue;
9168 x = copy_to_mode_reg (inner_mode, x);
9169 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9170 }
9171 return;
9172 }
9173
9174 /* Construct the vector in memory one field at a time
9175 and load the whole vector. */
9176 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9177 for (int i = 0; i < n_elts; i++)
9178 emit_move_insn (adjust_address_nv (mem, inner_mode,
9179 i * GET_MODE_SIZE (inner_mode)),
9180 XVECEXP (vals, 0, i));
9181 emit_move_insn (target, mem);
9182
9183 }
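
/* A sketch of the "half constant" path above (values are illustrative
   only): initializing a V4SImode vector with { x, 1, 2, 3 }, where x is
   not constant, first loads the constant vector { 2, 1, 2, 3 } (lane 0
   is filled from another, constant lane of the copy) and then inserts x
   into lane 0 via the vec_set pattern.  */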
9184
9185 static unsigned HOST_WIDE_INT
9186 aarch64_shift_truncation_mask (machine_mode mode)
9187 {
9188 return
9189 (aarch64_vector_mode_supported_p (mode)
9190 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9191 }
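
/* E.g. this returns 31 for SImode and 63 for DImode, but 0 for vector
   modes such as V4SImode, i.e. no shift-amount truncation is assumed
   for the SIMD modes.  */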
9192
9193 #ifndef TLS_SECTION_ASM_FLAG
9194 #define TLS_SECTION_ASM_FLAG 'T'
9195 #endif
9196
9197 void
9198 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9199 tree decl ATTRIBUTE_UNUSED)
9200 {
9201 char flagchars[10], *f = flagchars;
9202
9203 /* If we have already declared this section, we can use an
9204 abbreviated form to switch back to it -- unless this section is
9205 part of a COMDAT group, in which case GAS requires the full
9206 declaration every time. */
9207 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9208 && (flags & SECTION_DECLARED))
9209 {
9210 fprintf (asm_out_file, "\t.section\t%s\n", name);
9211 return;
9212 }
9213
9214 if (!(flags & SECTION_DEBUG))
9215 *f++ = 'a';
9216 if (flags & SECTION_WRITE)
9217 *f++ = 'w';
9218 if (flags & SECTION_CODE)
9219 *f++ = 'x';
9220 if (flags & SECTION_SMALL)
9221 *f++ = 's';
9222 if (flags & SECTION_MERGE)
9223 *f++ = 'M';
9224 if (flags & SECTION_STRINGS)
9225 *f++ = 'S';
9226 if (flags & SECTION_TLS)
9227 *f++ = TLS_SECTION_ASM_FLAG;
9228 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9229 *f++ = 'G';
9230 *f = '\0';
9231
9232 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9233
9234 if (!(flags & SECTION_NOTYPE))
9235 {
9236 const char *type;
9237 const char *format;
9238
9239 if (flags & SECTION_BSS)
9240 type = "nobits";
9241 else
9242 type = "progbits";
9243
9244 #ifdef TYPE_OPERAND_FMT
9245 format = "," TYPE_OPERAND_FMT;
9246 #else
9247 format = ",@%s";
9248 #endif
9249
9250 fprintf (asm_out_file, format, type);
9251
9252 if (flags & SECTION_ENTSIZE)
9253 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9254 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9255 {
9256 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9257 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9258 else
9259 fprintf (asm_out_file, ",%s,comdat",
9260 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9261 }
9262 }
9263
9264 putc ('\n', asm_out_file);
9265 }
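
/* A typical result (illustrative, assuming the default ",@%s" type
   syntax): a not-yet-declared writable data section ".foo" is emitted
   as

	.section	.foo,"aw",@progbits

   while switching back to an already-declared non-COMDAT section only
   prints the short ".section .foo" form.  */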
9266
9267 /* Select a format to encode pointers in exception handling data. */
9268 int
9269 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9270 {
9271 int type;
9272 switch (aarch64_cmodel)
9273 {
9274 case AARCH64_CMODEL_TINY:
9275 case AARCH64_CMODEL_TINY_PIC:
9276 case AARCH64_CMODEL_SMALL:
9277 case AARCH64_CMODEL_SMALL_PIC:
9278 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9279 for everything. */
9280 type = DW_EH_PE_sdata4;
9281 break;
9282 default:
9283 /* No assumptions here. 8-byte relocs required. */
9284 type = DW_EH_PE_sdata8;
9285 break;
9286 }
9287 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9288 }
9289
9290 /* Emit load exclusive. */
9291
9292 static void
9293 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9294 rtx mem, rtx model_rtx)
9295 {
9296 rtx (*gen) (rtx, rtx, rtx);
9297
9298 switch (mode)
9299 {
9300 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9301 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9302 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9303 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9304 default:
9305 gcc_unreachable ();
9306 }
9307
9308 emit_insn (gen (rval, mem, model_rtx));
9309 }
9310
9311 /* Emit store exclusive. */
9312
9313 static void
9314 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9315 rtx rval, rtx mem, rtx model_rtx)
9316 {
9317 rtx (*gen) (rtx, rtx, rtx, rtx);
9318
9319 switch (mode)
9320 {
9321 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9322 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9323 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9324 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9325 default:
9326 gcc_unreachable ();
9327 }
9328
9329 emit_insn (gen (bval, rval, mem, model_rtx));
9330 }
9331
9332 /* Mark the previous jump instruction as unlikely. */
9333
9334 static void
9335 aarch64_emit_unlikely_jump (rtx insn)
9336 {
9337 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9338
9339 insn = emit_jump_insn (insn);
9340 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9341 }
9342
9343 /* Expand a compare and swap pattern. */
9344
9345 void
9346 aarch64_expand_compare_and_swap (rtx operands[])
9347 {
9348 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9349 machine_mode mode, cmp_mode;
9350 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9351
9352 bval = operands[0];
9353 rval = operands[1];
9354 mem = operands[2];
9355 oldval = operands[3];
9356 newval = operands[4];
9357 is_weak = operands[5];
9358 mod_s = operands[6];
9359 mod_f = operands[7];
9360 mode = GET_MODE (mem);
9361 cmp_mode = mode;
9362
9363 /* Normally the succ memory model must be stronger than fail, but in the
9364 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9365 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9366
9367 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9368 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9369 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9370
9371 switch (mode)
9372 {
9373 case QImode:
9374 case HImode:
9375 /* For short modes, we're going to perform the comparison in SImode,
9376 so do the zero-extension now. */
9377 cmp_mode = SImode;
9378 rval = gen_reg_rtx (SImode);
9379 oldval = convert_modes (SImode, mode, oldval, true);
9380 /* Fall through. */
9381
9382 case SImode:
9383 case DImode:
9384 /* Force the value into a register if needed. */
9385 if (!aarch64_plus_operand (oldval, mode))
9386 oldval = force_reg (cmp_mode, oldval);
9387 break;
9388
9389 default:
9390 gcc_unreachable ();
9391 }
9392
9393 switch (mode)
9394 {
9395 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9396 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9397 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9398 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9399 default:
9400 gcc_unreachable ();
9401 }
9402
9403 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9404
9405 if (mode == QImode || mode == HImode)
9406 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9407
9408 x = gen_rtx_REG (CCmode, CC_REGNUM);
9409 x = gen_rtx_EQ (SImode, x, const0_rtx);
9410 emit_insn (gen_rtx_SET (bval, x));
9411 }
9412
9413 /* Emit a barrier appropriate for memory model MODEL at the end of a
9414 sequence implementing an atomic operation. */
9415
9416 static void
9417 aarch64_emit_post_barrier (enum memmodel model)
9418 {
9419 const enum memmodel base_model = memmodel_base (model);
9420
9421 if (is_mm_sync (model)
9422 && (base_model == MEMMODEL_ACQUIRE
9423 || base_model == MEMMODEL_ACQ_REL
9424 || base_model == MEMMODEL_SEQ_CST))
9425 {
9426 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9427 }
9428 }
9429
9430 /* Split a compare and swap pattern. */
9431
9432 void
9433 aarch64_split_compare_and_swap (rtx operands[])
9434 {
9435 rtx rval, mem, oldval, newval, scratch;
9436 machine_mode mode;
9437 bool is_weak;
9438 rtx_code_label *label1, *label2;
9439 rtx x, cond;
9440 enum memmodel model;
9441 rtx model_rtx;
9442
9443 rval = operands[0];
9444 mem = operands[1];
9445 oldval = operands[2];
9446 newval = operands[3];
9447 is_weak = (operands[4] != const0_rtx);
9448 model_rtx = operands[5];
9449 scratch = operands[7];
9450 mode = GET_MODE (mem);
9451 model = memmodel_from_int (INTVAL (model_rtx));
9452
9453 label1 = NULL;
9454 if (!is_weak)
9455 {
9456 label1 = gen_label_rtx ();
9457 emit_label (label1);
9458 }
9459 label2 = gen_label_rtx ();
9460
9461 /* The initial load can be relaxed for a __sync operation since a final
9462 barrier will be emitted to stop code hoisting. */
9463 if (is_mm_sync (model))
9464 aarch64_emit_load_exclusive (mode, rval, mem,
9465 GEN_INT (MEMMODEL_RELAXED));
9466 else
9467 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9468
9469 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9470 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9471 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9472 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9473 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9474
9475 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9476
9477 if (!is_weak)
9478 {
9479 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9480 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9481 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9482 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9483 }
9484 else
9485 {
9486 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9487 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9488 emit_insn (gen_rtx_SET (cond, x));
9489 }
9490
9491 emit_label (label2);
9492
9493 /* Emit any final barrier needed for a __sync operation. */
9494 if (is_mm_sync (model))
9495 aarch64_emit_post_barrier (model);
9496 }
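
/* Roughly speaking (an illustrative sketch, not exact compiler output),
   a strong sequentially-consistent SImode compare-and-swap splits into
   a loop of the form

   1:	ldaxr	w0, [x1]
	cmp	w0, w2
	bne	2f
	stlxr	w4, w3, [x1]
	cbnz	w4, 1b
   2:

   whereas the weak variant drops the backward branch and instead
   compares the store-exclusive status register against zero.  */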
9497
9498 /* Split an atomic operation. */
9499
9500 void
9501 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9502 rtx value, rtx model_rtx, rtx cond)
9503 {
9504 machine_mode mode = GET_MODE (mem);
9505 machine_mode wmode = (mode == DImode ? DImode : SImode);
9506 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9507 const bool is_sync = is_mm_sync (model);
9508 rtx_code_label *label;
9509 rtx x;
9510
9511 label = gen_label_rtx ();
9512 emit_label (label);
9513
9514 if (new_out)
9515 new_out = gen_lowpart (wmode, new_out);
9516 if (old_out)
9517 old_out = gen_lowpart (wmode, old_out);
9518 else
9519 old_out = new_out;
9520 value = simplify_gen_subreg (wmode, value, mode, 0);
9521
9522 /* The initial load can be relaxed for a __sync operation since a final
9523 barrier will be emitted to stop code hoisting. */
9524 if (is_sync)
9525 aarch64_emit_load_exclusive (mode, old_out, mem,
9526 GEN_INT (MEMMODEL_RELAXED));
9527 else
9528 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9529
9530 switch (code)
9531 {
9532 case SET:
9533 new_out = value;
9534 break;
9535
9536 case NOT:
9537 x = gen_rtx_AND (wmode, old_out, value);
9538 emit_insn (gen_rtx_SET (new_out, x));
9539 x = gen_rtx_NOT (wmode, new_out);
9540 emit_insn (gen_rtx_SET (new_out, x));
9541 break;
9542
9543 case MINUS:
9544 if (CONST_INT_P (value))
9545 {
9546 value = GEN_INT (-INTVAL (value));
9547 code = PLUS;
9548 }
9549 /* Fall through. */
9550
9551 default:
9552 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9553 emit_insn (gen_rtx_SET (new_out, x));
9554 break;
9555 }
9556
9557 aarch64_emit_store_exclusive (mode, cond, mem,
9558 gen_lowpart (mode, new_out), model_rtx);
9559
9560 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9561 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9562 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9563 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9564
9565 /* Emit any final barrier needed for a __sync operation. */
9566 if (is_sync)
9567 aarch64_emit_post_barrier (model);
9568 }
9569
9570 static void
9571 aarch64_print_extension (void)
9572 {
9573 const struct aarch64_option_extension *opt = NULL;
9574
9575 for (opt = all_extensions; opt->name != NULL; opt++)
9576 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9577 asm_fprintf (asm_out_file, "+%s", opt->name);
9578
9579 asm_fprintf (asm_out_file, "\n");
9580 }
9581
9582 static void
9583 aarch64_start_file (void)
9584 {
9585 if (selected_arch)
9586 {
9587 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9588 aarch64_print_extension ();
9589 }
9590 else if (selected_cpu)
9591 {
9592 const char *truncated_name
9593 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9594 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9595 aarch64_print_extension ();
9596 }
9597 default_file_start ();
9598 }
9599
9600 /* Target hook for c_mode_for_suffix. */
9601 static machine_mode
9602 aarch64_c_mode_for_suffix (char suffix)
9603 {
9604 if (suffix == 'q')
9605 return TFmode;
9606
9607 return VOIDmode;
9608 }
9609
9610 /* We can only represent floating point constants which will fit in
9611 "quarter-precision" values. These values are characterised by
9612 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
9613 by:
9614
9615 (-1)^s * (n/16) * 2^r
9616
9617 Where:
9618 's' is the sign bit.
9619 'n' is an integer in the range 16 <= n <= 31.
9620 'r' is an integer in the range -3 <= r <= 4. */
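
/* For example, 0.25 is representable as (16/16) * 2^-2 and 1.5 as
   (24/16) * 2^0, while 0.2 is not, since it has no finite binary
   mantissa of the required form.  The representable magnitudes run
   from 0.125 (n == 16, r == -3) up to 31.0 (n == 31, r == 4).  */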
9621
9622 /* Return true iff X can be represented by a quarter-precision
9623 floating point immediate operand. Note, we cannot represent 0.0. */
9624 bool
9625 aarch64_float_const_representable_p (rtx x)
9626 {
9627 /* This represents our current view of how many bits
9628 make up the mantissa. */
9629 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9630 int exponent;
9631 unsigned HOST_WIDE_INT mantissa, mask;
9632 REAL_VALUE_TYPE r, m;
9633 bool fail;
9634
9635 if (!CONST_DOUBLE_P (x))
9636 return false;
9637
9638 if (GET_MODE (x) == VOIDmode)
9639 return false;
9640
9641 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9642
9643 /* We cannot represent infinities, NaNs or +/-zero. We won't
9644 know if we have +zero until we analyse the mantissa, but we
9645 can reject the other invalid values. */
9646 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9647 || REAL_VALUE_MINUS_ZERO (r))
9648 return false;
9649
9650 /* Extract exponent. */
9651 r = real_value_abs (&r);
9652 exponent = REAL_EXP (&r);
9653
9654 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9655 highest (sign) bit, with a fixed binary point at bit point_pos.
9656 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9657 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9658 bits for the mantissa, this can fail (low bits will be lost). */
9659 real_ldexp (&m, &r, point_pos - exponent);
9660 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9661
9662 /* If the low part of the mantissa has bits set we cannot represent
9663 the value. */
9664 if (w.elt (0) != 0)
9665 return false;
9666 /* We have rejected the lower HOST_WIDE_INT, so update our
9667 understanding of how many bits lie in the mantissa and
9668 look only at the high HOST_WIDE_INT. */
9669 mantissa = w.elt (1);
9670 point_pos -= HOST_BITS_PER_WIDE_INT;
9671
9672 /* We can only represent values with a mantissa of the form 1.xxxx. */
9673 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9674 if ((mantissa & mask) != 0)
9675 return false;
9676
9677 /* Having filtered unrepresentable values, we may now remove all
9678 but the highest 5 bits. */
9679 mantissa >>= point_pos - 5;
9680
9681 /* We cannot represent the value 0.0, so reject it. This is handled
9682 elsewhere. */
9683 if (mantissa == 0)
9684 return false;
9685
9686 /* Then, as bit 4 is always set, we can mask it off, leaving
9687 the mantissa in the range [0, 15]. */
9688 mantissa &= ~(1 << 4);
9689 gcc_assert (mantissa <= 15);
9690
9691 /* GCC internally does not use IEEE754-like encoding (where normalized
9692 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9693 Our mantissa values are shifted 4 places to the left relative to
9694 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9695 by 5 places to correct for GCC's representation. */
9696 exponent = 5 - exponent;
9697
9698 return (exponent >= 0 && exponent <= 7);
9699 }
9700
9701 char*
9702 aarch64_output_simd_mov_immediate (rtx const_vector,
9703 machine_mode mode,
9704 unsigned width)
9705 {
9706 bool is_valid;
9707 static char templ[40];
9708 const char *mnemonic;
9709 const char *shift_op;
9710 unsigned int lane_count = 0;
9711 char element_char;
9712
9713 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9714
9715 /* This will return true to show CONST_VECTOR is legal for use as an
9716 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9717 also update INFO to show how the immediate should be generated. */
9718 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9719 gcc_assert (is_valid);
9720
9721 element_char = sizetochar (info.element_width);
9722 lane_count = width / info.element_width;
9723
9724 mode = GET_MODE_INNER (mode);
9725 if (mode == SFmode || mode == DFmode)
9726 {
9727 gcc_assert (info.shift == 0 && ! info.mvn);
9728 if (aarch64_float_const_zero_rtx_p (info.value))
9729 info.value = GEN_INT (0);
9730 else
9731 {
9732 #define buf_size 20
9733 REAL_VALUE_TYPE r;
9734 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9735 char float_buf[buf_size] = {'\0'};
9736 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9737 #undef buf_size
9738
9739 if (lane_count == 1)
9740 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9741 else
9742 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9743 lane_count, element_char, float_buf);
9744 return templ;
9745 }
9746 }
9747
9748 mnemonic = info.mvn ? "mvni" : "movi";
9749 shift_op = info.msl ? "msl" : "lsl";
9750
9751 if (lane_count == 1)
9752 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9753 mnemonic, UINTVAL (info.value));
9754 else if (info.shift)
9755 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9756 ", %s %d", mnemonic, lane_count, element_char,
9757 UINTVAL (info.value), shift_op, info.shift);
9758 else
9759 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9760 mnemonic, lane_count, element_char, UINTVAL (info.value));
9761 return templ;
9762 }
9763
9764 char*
9765 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9766 machine_mode mode)
9767 {
9768 machine_mode vmode;
9769
9770 gcc_assert (!VECTOR_MODE_P (mode));
9771 vmode = aarch64_simd_container_mode (mode, 64);
9772 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9773 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9774 }
9775
9776 /* Split operands into moves from op[1] + op[2] into op[0]. */
9777
9778 void
9779 aarch64_split_combinev16qi (rtx operands[3])
9780 {
9781 unsigned int dest = REGNO (operands[0]);
9782 unsigned int src1 = REGNO (operands[1]);
9783 unsigned int src2 = REGNO (operands[2]);
9784 machine_mode halfmode = GET_MODE (operands[1]);
9785 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9786 rtx destlo, desthi;
9787
9788 gcc_assert (halfmode == V16QImode);
9789
9790 if (src1 == dest && src2 == dest + halfregs)
9791 {
9792 /* No-op move. Can't split to nothing; emit something. */
9793 emit_note (NOTE_INSN_DELETED);
9794 return;
9795 }
9796
9797 /* Preserve register attributes for variable tracking. */
9798 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9799 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9800 GET_MODE_SIZE (halfmode));
9801
9802 /* Special case of reversed high/low parts. */
9803 if (reg_overlap_mentioned_p (operands[2], destlo)
9804 && reg_overlap_mentioned_p (operands[1], desthi))
9805 {
9806 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9807 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9808 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9809 }
9810 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9811 {
9812 /* Try to avoid unnecessary moves if part of the result
9813 is in the right place already. */
9814 if (src1 != dest)
9815 emit_move_insn (destlo, operands[1]);
9816 if (src2 != dest + halfregs)
9817 emit_move_insn (desthi, operands[2]);
9818 }
9819 else
9820 {
9821 if (src2 != dest + halfregs)
9822 emit_move_insn (desthi, operands[2]);
9823 if (src1 != dest)
9824 emit_move_insn (destlo, operands[1]);
9825 }
9826 }
9827
9828 /* vec_perm support. */
9829
9830 #define MAX_VECT_LEN 16
9831
9832 struct expand_vec_perm_d
9833 {
9834 rtx target, op0, op1;
9835 unsigned char perm[MAX_VECT_LEN];
9836 machine_mode vmode;
9837 unsigned char nelt;
9838 bool one_vector_p;
9839 bool testing_p;
9840 };
9841
9842 /* Generate a variable permutation. */
9843
9844 static void
9845 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9846 {
9847 machine_mode vmode = GET_MODE (target);
9848 bool one_vector_p = rtx_equal_p (op0, op1);
9849
9850 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9851 gcc_checking_assert (GET_MODE (op0) == vmode);
9852 gcc_checking_assert (GET_MODE (op1) == vmode);
9853 gcc_checking_assert (GET_MODE (sel) == vmode);
9854 gcc_checking_assert (TARGET_SIMD);
9855
9856 if (one_vector_p)
9857 {
9858 if (vmode == V8QImode)
9859 {
9860 /* Expand the argument to a V16QI mode by duplicating it. */
9861 rtx pair = gen_reg_rtx (V16QImode);
9862 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9863 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9864 }
9865 else
9866 {
9867 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9868 }
9869 }
9870 else
9871 {
9872 rtx pair;
9873
9874 if (vmode == V8QImode)
9875 {
9876 pair = gen_reg_rtx (V16QImode);
9877 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9878 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9879 }
9880 else
9881 {
9882 pair = gen_reg_rtx (OImode);
9883 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9884 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9885 }
9886 }
9887 }
9888
9889 void
9890 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9891 {
9892 machine_mode vmode = GET_MODE (target);
9893 unsigned int nelt = GET_MODE_NUNITS (vmode);
9894 bool one_vector_p = rtx_equal_p (op0, op1);
9895 rtx mask;
9896
9897 /* The TBL instruction does not use a modulo index, so we must take care
9898 of that ourselves. */
9899 mask = aarch64_simd_gen_const_vector_dup (vmode,
9900 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9901 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9902
9903 /* For big-endian, we also need to reverse the index within the vector
9904 (but not which vector). */
9905 if (BYTES_BIG_ENDIAN)
9906 {
9907 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9908 if (!one_vector_p)
9909 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9910 sel = expand_simple_binop (vmode, XOR, sel, mask,
9911 NULL, 0, OPTAB_LIB_WIDEN);
9912 }
9913 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9914 }
9915
9916 /* Recognize patterns suitable for the TRN instructions. */
9917 static bool
9918 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9919 {
9920 unsigned int i, odd, mask, nelt = d->nelt;
9921 rtx out, in0, in1, x;
9922 rtx (*gen) (rtx, rtx, rtx);
9923 machine_mode vmode = d->vmode;
9924
9925 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9926 return false;
9927
9928 /* Note that these are little-endian tests.
9929 We correct for big-endian later. */
9930 if (d->perm[0] == 0)
9931 odd = 0;
9932 else if (d->perm[0] == 1)
9933 odd = 1;
9934 else
9935 return false;
9936 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9937
9938 for (i = 0; i < nelt; i += 2)
9939 {
9940 if (d->perm[i] != i + odd)
9941 return false;
9942 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9943 return false;
9944 }
9945
9946 /* Success! */
9947 if (d->testing_p)
9948 return true;
9949
9950 in0 = d->op0;
9951 in1 = d->op1;
9952 if (BYTES_BIG_ENDIAN)
9953 {
9954 x = in0, in0 = in1, in1 = x;
9955 odd = !odd;
9956 }
9957 out = d->target;
9958
9959 if (odd)
9960 {
9961 switch (vmode)
9962 {
9963 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9964 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9965 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9966 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9967 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9968 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9969 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9970 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9971 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9972 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9973 default:
9974 return false;
9975 }
9976 }
9977 else
9978 {
9979 switch (vmode)
9980 {
9981 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9982 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9983 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9984 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9985 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9986 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9987 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9988 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9989 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9990 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9991 default:
9992 return false;
9993 }
9994 }
9995
9996 emit_insn (gen (out, in0, in1));
9997 return true;
9998 }
9999
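/* Illustrative TRN example: for V4SImode with inputs a = {a0,a1,a2,a3}
   and b = {b0,b1,b2,b3}, the index vector {0,4,2,6} matches odd == 0
   and emits TRN1, giving {a0,b0,a2,b2}; {1,5,3,7} matches odd == 1
   and emits TRN2, giving {a1,b1,a3,b3}.  */
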
10000 /* Recognize patterns suitable for the UZP instructions. */
10001 static bool
10002 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10003 {
10004 unsigned int i, odd, mask, nelt = d->nelt;
10005 rtx out, in0, in1, x;
10006 rtx (*gen) (rtx, rtx, rtx);
10007 machine_mode vmode = d->vmode;
10008
10009 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10010 return false;
10011
10012 /* Note that these are little-endian tests.
10013 We correct for big-endian later. */
10014 if (d->perm[0] == 0)
10015 odd = 0;
10016 else if (d->perm[0] == 1)
10017 odd = 1;
10018 else
10019 return false;
10020 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10021
10022 for (i = 0; i < nelt; i++)
10023 {
10024 unsigned elt = (i * 2 + odd) & mask;
10025 if (d->perm[i] != elt)
10026 return false;
10027 }
10028
10029 /* Success! */
10030 if (d->testing_p)
10031 return true;
10032
10033 in0 = d->op0;
10034 in1 = d->op1;
10035 if (BYTES_BIG_ENDIAN)
10036 {
10037 x = in0, in0 = in1, in1 = x;
10038 odd = !odd;
10039 }
10040 out = d->target;
10041
10042 if (odd)
10043 {
10044 switch (vmode)
10045 {
10046 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10047 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10048 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10049 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10050 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10051 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10052 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10053 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10054 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10055 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10056 default:
10057 return false;
10058 }
10059 }
10060 else
10061 {
10062 switch (vmode)
10063 {
10064 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10065 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10066 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10067 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10068 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10069 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10070 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10071 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10072 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10073 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10074 default:
10075 return false;
10076 }
10077 }
10078
10079 emit_insn (gen (out, in0, in1));
10080 return true;
10081 }
10082
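/* Illustrative UZP example: for V4SImode inputs a and b, the index
   vector {0,2,4,6} matches odd == 0 and emits UZP1, giving
   {a0,a2,b0,b2}; {1,3,5,7} matches odd == 1 and emits UZP2, giving
   {a1,a3,b1,b3}.  */
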
10083 /* Recognize patterns suitable for the ZIP instructions. */
10084 static bool
10085 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10086 {
10087 unsigned int i, high, mask, nelt = d->nelt;
10088 rtx out, in0, in1, x;
10089 rtx (*gen) (rtx, rtx, rtx);
10090 machine_mode vmode = d->vmode;
10091
10092 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10093 return false;
10094
10095 /* Note that these are little-endian tests.
10096 We correct for big-endian later. */
10097 high = nelt / 2;
10098 if (d->perm[0] == high)
10099 /* Do Nothing. */
10100 ;
10101 else if (d->perm[0] == 0)
10102 high = 0;
10103 else
10104 return false;
10105 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10106
10107 for (i = 0; i < nelt / 2; i++)
10108 {
10109 unsigned elt = (i + high) & mask;
10110 if (d->perm[i * 2] != elt)
10111 return false;
10112 elt = (elt + nelt) & mask;
10113 if (d->perm[i * 2 + 1] != elt)
10114 return false;
10115 }
10116
10117 /* Success! */
10118 if (d->testing_p)
10119 return true;
10120
10121 in0 = d->op0;
10122 in1 = d->op1;
10123 if (BYTES_BIG_ENDIAN)
10124 {
10125 x = in0, in0 = in1, in1 = x;
10126 high = !high;
10127 }
10128 out = d->target;
10129
10130 if (high)
10131 {
10132 switch (vmode)
10133 {
10134 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10135 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10136 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10137 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10138 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10139 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10140 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10141 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10142 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10143 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10144 default:
10145 return false;
10146 }
10147 }
10148 else
10149 {
10150 switch (vmode)
10151 {
10152 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10153 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10154 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10155 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10156 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10157 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10158 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10159 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10160 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10161 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10162 default:
10163 return false;
10164 }
10165 }
10166
10167 emit_insn (gen (out, in0, in1));
10168 return true;
10169 }
10170
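/* Illustrative ZIP example: for V4SImode inputs a and b, the index
   vector {0,4,1,5} matches high == 0 and emits ZIP1, giving
   {a0,b0,a1,b1}; {2,6,3,7} matches high == nelt / 2 and emits ZIP2,
   giving {a2,b2,a3,b3}.  */
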
10171 /* Recognize patterns for the EXT insn. */
10172
10173 static bool
10174 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10175 {
10176 unsigned int i, nelt = d->nelt;
10177 rtx (*gen) (rtx, rtx, rtx, rtx);
10178 rtx offset;
10179
10180 unsigned int location = d->perm[0]; /* Always < nelt. */
10181
10182 /* Check if the extracted indices are increasing by one. */
10183 for (i = 1; i < nelt; i++)
10184 {
10185 unsigned int required = location + i;
10186 if (d->one_vector_p)
10187 {
10188 /* We'll pass the same vector in twice, so allow indices to wrap. */
10189 required &= (nelt - 1);
10190 }
10191 if (d->perm[i] != required)
10192 return false;
10193 }
10194
10195 switch (d->vmode)
10196 {
10197 case V16QImode: gen = gen_aarch64_extv16qi; break;
10198 case V8QImode: gen = gen_aarch64_extv8qi; break;
10199 case V4HImode: gen = gen_aarch64_extv4hi; break;
10200 case V8HImode: gen = gen_aarch64_extv8hi; break;
10201 case V2SImode: gen = gen_aarch64_extv2si; break;
10202 case V4SImode: gen = gen_aarch64_extv4si; break;
10203 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10204 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10205 case V2DImode: gen = gen_aarch64_extv2di; break;
10206 case V2DFmode: gen = gen_aarch64_extv2df; break;
10207 default:
10208 return false;
10209 }
10210
10211 /* Success! */
10212 if (d->testing_p)
10213 return true;
10214
10215 /* The case where (location == 0) is a no-op for both big- and little-endian,
10216 and is removed by the mid-end at optimization levels -O1 and higher. */
10217
10218 if (BYTES_BIG_ENDIAN && (location != 0))
10219 {
10220 /* After setup, we want the high elements of the first vector (stored
10221 at the LSB end of the register), and the low elements of the second
10222 vector (stored at the MSB end of the register). So swap. */
10223 std::swap (d->op0, d->op1);
10224 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10225 location = nelt - location;
10226 }
10227
10228 offset = GEN_INT (location);
10229 emit_insn (gen (d->target, d->op0, d->op1, offset));
10230 return true;
10231 }
10232
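/* Illustrative EXT example: for V4SImode with two distinct inputs a
   and b, the index vector {1,2,3,4} has location == 1 and strictly
   consecutive indices, so it emits EXT with an element offset of 1,
   giving {a1,a2,a3,b0}.  On big-endian the operands are swapped and
   the offset becomes nelt - location == 3, as in the code above.  */
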
10233 /* Recognize patterns for the REV insns. */
10234
10235 static bool
10236 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10237 {
10238 unsigned int i, j, diff, nelt = d->nelt;
10239 rtx (*gen) (rtx, rtx);
10240
10241 if (!d->one_vector_p)
10242 return false;
10243
10244 diff = d->perm[0];
10245 switch (diff)
10246 {
10247 case 7:
10248 switch (d->vmode)
10249 {
10250 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10251 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10252 default:
10253 return false;
10254 }
10255 break;
10256 case 3:
10257 switch (d->vmode)
10258 {
10259 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10260 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10261 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10262 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10263 default:
10264 return false;
10265 }
10266 break;
10267 case 1:
10268 switch (d->vmode)
10269 {
10270 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10271 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10272 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10273 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10274 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10275 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10276 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10277 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10278 default:
10279 return false;
10280 }
10281 break;
10282 default:
10283 return false;
10284 }
10285
10286 for (i = 0; i < nelt ; i += diff + 1)
10287 for (j = 0; j <= diff; j += 1)
10288 {
10289 /* This is guaranteed to be true as the value of diff
10290 is 7, 3 or 1, and we should have enough elements in the
10291 queue to generate this. Getting a vector mask with a
10292 value of diff other than these values implies that
10293 something is wrong by the time we get here. */
10294 gcc_assert (i + j < nelt);
10295 if (d->perm[i + j] != i + diff - j)
10296 return false;
10297 }
10298
10299 /* Success! */
10300 if (d->testing_p)
10301 return true;
10302
10303 emit_insn (gen (d->target, d->op0));
10304 return true;
10305 }
10306
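/* Illustrative REV example: for V4SImode the single-vector index
   vector {1,0,3,2} has diff == 1 and emits REV64, swapping adjacent
   32-bit elements within each 64-bit chunk; for V16QImode,
   {7,6,...,0,15,14,...,8} has diff == 7 and likewise maps to REV64.  */
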
10307 static bool
10308 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10309 {
10310 rtx (*gen) (rtx, rtx, rtx);
10311 rtx out = d->target;
10312 rtx in0;
10313 machine_mode vmode = d->vmode;
10314 unsigned int i, elt, nelt = d->nelt;
10315 rtx lane;
10316
10317 elt = d->perm[0];
10318 for (i = 1; i < nelt; i++)
10319 {
10320 if (elt != d->perm[i])
10321 return false;
10322 }
10323
10324 /* The generic preparation in aarch64_expand_vec_perm_const_1
10325 swaps the operand order and the permute indices if it finds
10326 d->perm[0] to be in the second operand. Thus, we can always
10327 use d->op0 and need not do any extra arithmetic to get the
10328 correct lane number. */
10329 in0 = d->op0;
10330 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10331
10332 switch (vmode)
10333 {
10334 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10335 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10336 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10337 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10338 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10339 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10340 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10341 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10342 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10343 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10344 default:
10345 return false;
10346 }
10347
10348 emit_insn (gen (out, in0, lane));
10349 return true;
10350 }
10351
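/* Illustrative DUP example: for V4SImode the index vector {2,2,2,2}
   broadcasts lane 2 of OP0 into every element of the result.  */
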
10352 static bool
10353 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10354 {
10355 rtx rperm[MAX_VECT_LEN], sel;
10356 machine_mode vmode = d->vmode;
10357 unsigned int i, nelt = d->nelt;
10358
10359 if (d->testing_p)
10360 return true;
10361
10362 /* Generic code will try constant permutation twice: once with the
10363 original mode and again with the elements lowered to QImode.
10364 So wait and don't do the selector expansion ourselves. */
10365 if (vmode != V8QImode && vmode != V16QImode)
10366 return false;
10367
10368 for (i = 0; i < nelt; ++i)
10369 {
10370 int nunits = GET_MODE_NUNITS (vmode);
10371
10372 /* If big-endian and using two vectors, we end up with a weird mixed-endian
10373 mode on NEON. Reverse the index within each word but not the word
10374 itself. */
10375 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10376 : d->perm[i]);
10377 }
10378 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10379 sel = force_reg (vmode, sel);
10380
10381 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10382 return true;
10383 }
10384
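/* Illustrative TBL example: for V16QImode (nunits == 16) on big-endian,
   each selector byte i is rewritten as i ^ 15 before the constant
   vector is forced into a register, so an index of 0 becomes 15, 1
   becomes 14, and so on, while the choice of input vector is kept.  */
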
10385 static bool
10386 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10387 {
10388 /* The pattern matching functions above are written to look for a small
10389 number to begin the sequence (0, 1, N/2). If we begin with an index
10390 from the second operand, we can swap the operands. */
10391 if (d->perm[0] >= d->nelt)
10392 {
10393 unsigned i, nelt = d->nelt;
10394
10395 gcc_assert (nelt == (nelt & -nelt));
10396 for (i = 0; i < nelt; ++i)
10397 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10398
10399 std::swap (d->op0, d->op1);
10400 }
10401
10402 if (TARGET_SIMD)
10403 {
10404 if (aarch64_evpc_rev (d))
10405 return true;
10406 else if (aarch64_evpc_ext (d))
10407 return true;
10408 else if (aarch64_evpc_dup (d))
10409 return true;
10410 else if (aarch64_evpc_zip (d))
10411 return true;
10412 else if (aarch64_evpc_uzp (d))
10413 return true;
10414 else if (aarch64_evpc_trn (d))
10415 return true;
10416 return aarch64_evpc_tbl (d);
10417 }
10418 return false;
10419 }
10420
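/* Illustrative example of the operand swap above: for V4SImode the
   index vector {5,1,7,3} starts in the second operand, so each index
   is XORed with nelt (4) to give {1,5,3,7} and the operands are
   swapped; the result is then matched as TRN2 on the swapped inputs.  */
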
10421 /* Expand a vec_perm_const pattern. */
10422
10423 bool
10424 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10425 {
10426 struct expand_vec_perm_d d;
10427 int i, nelt, which;
10428
10429 d.target = target;
10430 d.op0 = op0;
10431 d.op1 = op1;
10432
10433 d.vmode = GET_MODE (target);
10434 gcc_assert (VECTOR_MODE_P (d.vmode));
10435 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10436 d.testing_p = false;
10437
10438 for (i = which = 0; i < nelt; ++i)
10439 {
10440 rtx e = XVECEXP (sel, 0, i);
10441 int ei = INTVAL (e) & (2 * nelt - 1);
10442 which |= (ei < nelt ? 1 : 2);
10443 d.perm[i] = ei;
10444 }
10445
10446 switch (which)
10447 {
10448 default:
10449 gcc_unreachable ();
10450
10451 case 3:
10452 d.one_vector_p = false;
10453 if (!rtx_equal_p (op0, op1))
10454 break;
10455
10456 /* The elements of PERM do not suggest that only the first operand
10457 is used, but both operands are identical. Allow easier matching
10458 of the permutation by folding the permutation into the single
10459 input vector. */
10460 /* Fall Through. */
10461 case 2:
10462 for (i = 0; i < nelt; ++i)
10463 d.perm[i] &= nelt - 1;
10464 d.op0 = op1;
10465 d.one_vector_p = true;
10466 break;
10467
10468 case 1:
10469 d.op1 = op0;
10470 d.one_vector_p = true;
10471 break;
10472 }
10473
10474 return aarch64_expand_vec_perm_const_1 (&d);
10475 }
10476
10477 static bool
10478 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10479 const unsigned char *sel)
10480 {
10481 struct expand_vec_perm_d d;
10482 unsigned int i, nelt, which;
10483 bool ret;
10484
10485 d.vmode = vmode;
10486 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10487 d.testing_p = true;
10488 memcpy (d.perm, sel, nelt);
10489
10490 /* Calculate whether all elements are in one vector. */
10491 for (i = which = 0; i < nelt; ++i)
10492 {
10493 unsigned char e = d.perm[i];
10494 gcc_assert (e < 2 * nelt);
10495 which |= (e < nelt ? 1 : 2);
10496 }
10497
10498 /* If all elements are from the second vector, reindex as if from the
10499 first vector. */
10500 if (which == 2)
10501 for (i = 0; i < nelt; ++i)
10502 d.perm[i] -= nelt;
10503
10504 /* Check whether the mask can be applied to a single vector. */
10505 d.one_vector_p = (which != 3);
10506
10507 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10508 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10509 if (!d.one_vector_p)
10510 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10511
10512 start_sequence ();
10513 ret = aarch64_expand_vec_perm_const_1 (&d);
10514 end_sequence ();
10515
10516 return ret;
10517 }
10518
10519 rtx
10520 aarch64_reverse_mask (enum machine_mode mode)
10521 {
10522 /* We have to reverse each vector because we don't have
10523 a permuted load that can reverse-load according to ABI rules. */
10524 rtx mask;
10525 rtvec v = rtvec_alloc (16);
10526 int i, j;
10527 int nunits = GET_MODE_NUNITS (mode);
10528 int usize = GET_MODE_UNIT_SIZE (mode);
10529
10530 gcc_assert (BYTES_BIG_ENDIAN);
10531 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10532
10533 for (i = 0; i < nunits; i++)
10534 for (j = 0; j < usize; j++)
10535 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10536 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10537 return force_reg (V16QImode, mask);
10538 }
10539
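/* Illustrative example: for V4SImode (four 4-byte units) the V16QImode
   mask built above is {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12},
   i.e. the bytes of each element are reversed while the elements
   themselves stay in place.  */
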
10540 /* Implement MODES_TIEABLE_P. */
10541
10542 bool
10543 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10544 {
10545 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10546 return true;
10547
10548 /* We specifically want to allow elements of "structure" modes to
10549 be tieable to the structure. This more general condition allows
10550 other rarer situations too. */
10551 if (TARGET_SIMD
10552 && aarch64_vector_mode_p (mode1)
10553 && aarch64_vector_mode_p (mode2))
10554 return true;
10555
10556 return false;
10557 }
10558
10559 /* Return a new RTX holding the result of moving POINTER forward by
10560 AMOUNT bytes. */
10561
10562 static rtx
10563 aarch64_move_pointer (rtx pointer, int amount)
10564 {
10565 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10566
10567 return adjust_automodify_address (pointer, GET_MODE (pointer),
10568 next, amount);
10569 }
10570
10571 /* Return a new RTX holding the result of moving POINTER forward by the
10572 size of the mode it points to. */
10573
10574 static rtx
10575 aarch64_progress_pointer (rtx pointer)
10576 {
10577 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10578
10579 return aarch64_move_pointer (pointer, amount);
10580 }
10581
10582 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10583 MODE bytes. */
10584
10585 static void
10586 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10587 machine_mode mode)
10588 {
10589 rtx reg = gen_reg_rtx (mode);
10590
10591 /* "Cast" the pointers to the correct mode. */
10592 *src = adjust_address (*src, mode, 0);
10593 *dst = adjust_address (*dst, mode, 0);
10594 /* Emit the memcpy. */
10595 emit_move_insn (reg, *src);
10596 emit_move_insn (*dst, reg);
10597 /* Move the pointers forward. */
10598 *src = aarch64_progress_pointer (*src);
10599 *dst = aarch64_progress_pointer (*dst);
10600 }
10601
10602 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10603 we succeed, otherwise return false. */
10604
10605 bool
10606 aarch64_expand_movmem (rtx *operands)
10607 {
10608 unsigned int n;
10609 rtx dst = operands[0];
10610 rtx src = operands[1];
10611 rtx base;
10612 bool speed_p = !optimize_function_for_size_p (cfun);
10613
10614 /* When optimizing for size, give a better estimate of the length of a
10615 memcpy call, but use the default otherwise. */
10616 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10617
10618 /* We can't do anything smart if the amount to copy is not constant. */
10619 if (!CONST_INT_P (operands[2]))
10620 return false;
10621
10622 n = UINTVAL (operands[2]);
10623
10624 /* Try to keep the number of instructions low. For cases below 16 bytes we
10625 need to make at most two moves. For cases above 16 bytes it will be one
10626 move for each 16 byte chunk, then at most two additional moves. */
10627 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10628 return false;
10629
10630 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10631 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10632
10633 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10634 src = adjust_automodify_address (src, VOIDmode, base, 0);
10635
10636 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10637 1-byte chunk. */
10638 if (n < 4)
10639 {
10640 if (n >= 2)
10641 {
10642 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10643 n -= 2;
10644 }
10645
10646 if (n == 1)
10647 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10648
10649 return true;
10650 }
10651
10652 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10653 4-byte chunk, partially overlapping with the previously copied chunk. */
10654 if (n < 8)
10655 {
10656 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10657 n -= 4;
10658 if (n > 0)
10659 {
10660 int move = n - 4;
10661
10662 src = aarch64_move_pointer (src, move);
10663 dst = aarch64_move_pointer (dst, move);
10664 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10665 }
10666 return true;
10667 }
10668
10669 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10670 them, then (if applicable) an 8-byte chunk. */
10671 while (n >= 8)
10672 {
10673 if (n / 16)
10674 {
10675 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10676 n -= 16;
10677 }
10678 else
10679 {
10680 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10681 n -= 8;
10682 }
10683 }
10684
10685 /* Finish the final bytes of the copy. We can always do this in one
10686 instruction. We either copy the exact amount we need, or partially
10687 overlap with the previous chunk we copied and copy 8 bytes. */
10688 if (n == 0)
10689 return true;
10690 else if (n == 1)
10691 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10692 else if (n == 2)
10693 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10694 else if (n == 4)
10695 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10696 else
10697 {
10698 if (n == 3)
10699 {
10700 src = aarch64_move_pointer (src, -1);
10701 dst = aarch64_move_pointer (dst, -1);
10702 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10703 }
10704 else
10705 {
10706 int move = n - 8;
10707
10708 src = aarch64_move_pointer (src, move);
10709 dst = aarch64_move_pointer (dst, move);
10710 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10711 }
10712 }
10713
10714 return true;
10715 }
10716
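/* Illustrative example of the chunking above: for a constant copy of
   15 bytes, the loop emits one 8-byte (DImode) load/store pair for
   bytes 0-7, leaving n == 7; the tail code then moves both pointers
   back by one byte and emits a second, overlapping 8-byte pair
   covering bytes 7-14, for a total of two load/store pairs.  */
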
10717 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10718
10719 static unsigned HOST_WIDE_INT
10720 aarch64_asan_shadow_offset (void)
10721 {
10722 return (HOST_WIDE_INT_1 << 36);
10723 }
10724
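/* Illustrative note: assuming the default ASan shadow granularity of
   8 bytes (shadow scale 3), this offset corresponds to the mapping
   shadow_addr = (addr >> 3) + (1 << 36).  */
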
10725 static bool
10726 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10727 unsigned int align,
10728 enum by_pieces_operation op,
10729 bool speed_p)
10730 {
10731 /* STORE_BY_PIECES can be used when copying a constant string, but
10732 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10733 For now we always fail this and let the move_by_pieces code copy
10734 the string from read-only memory. */
10735 if (op == STORE_BY_PIECES)
10736 return false;
10737
10738 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10739 }
10740
10741 static enum machine_mode
10742 aarch64_code_to_ccmode (enum rtx_code code)
10743 {
10744 switch (code)
10745 {
10746 case NE:
10747 return CC_DNEmode;
10748
10749 case EQ:
10750 return CC_DEQmode;
10751
10752 case LE:
10753 return CC_DLEmode;
10754
10755 case LT:
10756 return CC_DLTmode;
10757
10758 case GE:
10759 return CC_DGEmode;
10760
10761 case GT:
10762 return CC_DGTmode;
10763
10764 case LEU:
10765 return CC_DLEUmode;
10766
10767 case LTU:
10768 return CC_DLTUmode;
10769
10770 case GEU:
10771 return CC_DGEUmode;
10772
10773 case GTU:
10774 return CC_DGTUmode;
10775
10776 default:
10777 return CCmode;
10778 }
10779 }
10780
10781 static rtx
10782 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10783 int code, tree treeop0, tree treeop1)
10784 {
10785 enum machine_mode op_mode, cmp_mode, cc_mode;
10786 rtx op0, op1, cmp, target;
10787 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10788 enum insn_code icode;
10789 struct expand_operand ops[4];
10790
10791 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10792 if (cc_mode == CCmode)
10793 return NULL_RTX;
10794
10795 start_sequence ();
10796 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10797
10798 op_mode = GET_MODE (op0);
10799 if (op_mode == VOIDmode)
10800 op_mode = GET_MODE (op1);
10801
10802 switch (op_mode)
10803 {
10804 case QImode:
10805 case HImode:
10806 case SImode:
10807 cmp_mode = SImode;
10808 icode = CODE_FOR_cmpsi;
10809 break;
10810
10811 case DImode:
10812 cmp_mode = DImode;
10813 icode = CODE_FOR_cmpdi;
10814 break;
10815
10816 default:
10817 end_sequence ();
10818 return NULL_RTX;
10819 }
10820
10821 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10822 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10823 if (!op0 || !op1)
10824 {
10825 end_sequence ();
10826 return NULL_RTX;
10827 }
10828 *prep_seq = get_insns ();
10829 end_sequence ();
10830
10831 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10832 target = gen_rtx_REG (CCmode, CC_REGNUM);
10833
10834 create_output_operand (&ops[0], target, CCmode);
10835 create_fixed_operand (&ops[1], cmp);
10836 create_fixed_operand (&ops[2], op0);
10837 create_fixed_operand (&ops[3], op1);
10838
10839 start_sequence ();
10840 if (!maybe_expand_insn (icode, 4, ops))
10841 {
10842 end_sequence ();
10843 return NULL_RTX;
10844 }
10845 *gen_seq = get_insns ();
10846 end_sequence ();
10847
10848 return gen_rtx_REG (cc_mode, CC_REGNUM);
10849 }
10850
10851 static rtx
10852 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10853 tree treeop0, tree treeop1, int bit_code)
10854 {
10855 rtx op0, op1, cmp0, cmp1, target;
10856 enum machine_mode op_mode, cmp_mode, cc_mode;
10857 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10858 enum insn_code icode = CODE_FOR_ccmp_andsi;
10859 struct expand_operand ops[6];
10860
10861 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10862 if (cc_mode == CCmode)
10863 return NULL_RTX;
10864
10865 push_to_sequence ((rtx_insn*) *prep_seq);
10866 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10867
10868 op_mode = GET_MODE (op0);
10869 if (op_mode == VOIDmode)
10870 op_mode = GET_MODE (op1);
10871
10872 switch (op_mode)
10873 {
10874 case QImode:
10875 case HImode:
10876 case SImode:
10877 cmp_mode = SImode;
10878 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10879 : CODE_FOR_ccmp_iorsi;
10880 break;
10881
10882 case DImode:
10883 cmp_mode = DImode;
10884 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10885 : CODE_FOR_ccmp_iordi;
10886 break;
10887
10888 default:
10889 end_sequence ();
10890 return NULL_RTX;
10891 }
10892
10893 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10894 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10895 if (!op0 || !op1)
10896 {
10897 end_sequence ();
10898 return NULL_RTX;
10899 }
10900 *prep_seq = get_insns ();
10901 end_sequence ();
10902
10903 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10904 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10905 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10906
10907 create_fixed_operand (&ops[0], prev);
10908 create_fixed_operand (&ops[1], target);
10909 create_fixed_operand (&ops[2], op0);
10910 create_fixed_operand (&ops[3], op1);
10911 create_fixed_operand (&ops[4], cmp0);
10912 create_fixed_operand (&ops[5], cmp1);
10913
10914 push_to_sequence ((rtx_insn*) *gen_seq);
10915 if (!maybe_expand_insn (icode, 6, ops))
10916 {
10917 end_sequence ();
10918 return NULL_RTX;
10919 }
10920
10921 *gen_seq = get_insns ();
10922 end_sequence ();
10923
10924 return target;
10925 }
10926
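/* Illustrative example of the conditional-compare expansion: a test
   such as (a == 0 && b == 3) can be emitted roughly as

     cmp  w0, #0
     ccmp w1, #3, #0, eq
     b.eq <target>

   where the first comparison comes from aarch64_gen_ccmp_first and the
   CCMP from aarch64_gen_ccmp_next; the #0 NZCV immediate makes the
   final EQ test fail when the first comparison already failed.  The
   exact registers and immediates are only for illustration.  */
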
10927 #undef TARGET_GEN_CCMP_FIRST
10928 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10929
10930 #undef TARGET_GEN_CCMP_NEXT
10931 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10932
10933 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10934 instruction fusion of some sort. */
10935
10936 static bool
10937 aarch64_macro_fusion_p (void)
10938 {
10939 return aarch64_tune_params->fusible_ops != AARCH64_FUSE_NOTHING;
10940 }
10941
10942
10943 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10944 should be kept together during scheduling. */
10945
10946 static bool
10947 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10948 {
10949 rtx set_dest;
10950 rtx prev_set = single_set (prev);
10951 rtx curr_set = single_set (curr);
10952 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10953 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10954
10955 if (!aarch64_macro_fusion_p ())
10956 return false;
10957
10958 if (simple_sets_p
10959 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOV_MOVK))
10960 {
10961 /* We are trying to match:
10962 prev (mov) == (set (reg r0) (const_int imm16))
10963 curr (movk) == (set (zero_extract (reg r0)
10964 (const_int 16)
10965 (const_int 16))
10966 (const_int imm16_1)) */
10967
10968 set_dest = SET_DEST (curr_set);
10969
10970 if (GET_CODE (set_dest) == ZERO_EXTRACT
10971 && CONST_INT_P (SET_SRC (curr_set))
10972 && CONST_INT_P (SET_SRC (prev_set))
10973 && CONST_INT_P (XEXP (set_dest, 2))
10974 && INTVAL (XEXP (set_dest, 2)) == 16
10975 && REG_P (XEXP (set_dest, 0))
10976 && REG_P (SET_DEST (prev_set))
10977 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10978 {
10979 return true;
10980 }
10981 }
10982
10983 if (simple_sets_p
10984 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_ADD))
10985 {
10986
10987 /* We're trying to match:
10988 prev (adrp) == (set (reg r1)
10989 (high (symbol_ref ("SYM"))))
10990 curr (add) == (set (reg r0)
10991 (lo_sum (reg r1)
10992 (symbol_ref ("SYM"))))
10993 Note that r0 need not necessarily be the same as r1, especially
10994 during pre-regalloc scheduling. */
10995
10996 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10997 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10998 {
10999 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11000 && REG_P (XEXP (SET_SRC (curr_set), 0))
11001 && REGNO (XEXP (SET_SRC (curr_set), 0))
11002 == REGNO (SET_DEST (prev_set))
11003 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11004 XEXP (SET_SRC (curr_set), 1)))
11005 return true;
11006 }
11007 }
11008
11009 if (simple_sets_p
11010 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11011 {
11012
11013 /* We're trying to match:
11014 prev (movk) == (set (zero_extract (reg r0)
11015 (const_int 16)
11016 (const_int 32))
11017 (const_int imm16_1))
11018 curr (movk) == (set (zero_extract (reg r0)
11019 (const_int 16)
11020 (const_int 48))
11021 (const_int imm16_2)) */
11022
11023 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11024 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11025 && REG_P (XEXP (SET_DEST (prev_set), 0))
11026 && REG_P (XEXP (SET_DEST (curr_set), 0))
11027 && REGNO (XEXP (SET_DEST (prev_set), 0))
11028 == REGNO (XEXP (SET_DEST (curr_set), 0))
11029 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11030 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11031 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11032 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11033 && CONST_INT_P (SET_SRC (prev_set))
11034 && CONST_INT_P (SET_SRC (curr_set)))
11035 return true;
11036
11037 }
11038 if (simple_sets_p
11039 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_LDR))
11040 {
11041 /* We're trying to match:
11042 prev (adrp) == (set (reg r0)
11043 (high (symbol_ref ("SYM"))))
11044 curr (ldr) == (set (reg r1)
11045 (mem (lo_sum (reg r0)
11046 (symbol_ref ("SYM")))))
11047 or
11048 curr (ldr) == (set (reg r1)
11049 (zero_extend (mem
11050 (lo_sum (reg r0)
11051 (symbol_ref ("SYM")))))) */
11052 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11053 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11054 {
11055 rtx curr_src = SET_SRC (curr_set);
11056
11057 if (GET_CODE (curr_src) == ZERO_EXTEND)
11058 curr_src = XEXP (curr_src, 0);
11059
11060 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11061 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11062 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11063 == REGNO (SET_DEST (prev_set))
11064 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11065 XEXP (SET_SRC (prev_set), 0)))
11066 return true;
11067 }
11068 }
11069
11070 if ((aarch64_tune_params->fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11071 && any_condjump_p (curr))
11072 {
11073 enum attr_type prev_type = get_attr_type (prev);
11074
11075 /* FIXME: this misses some instructions which are considered simple
11076 arithmetic by ThunderX. Simple shifts are also missed here. */
11077 if (prev_type == TYPE_ALUS_SREG
11078 || prev_type == TYPE_ALUS_IMM
11079 || prev_type == TYPE_LOGICS_REG
11080 || prev_type == TYPE_LOGICS_IMM)
11081 return true;
11082 }
11083
11084 return false;
11085 }
11086
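/* Illustrative instruction pairs matched above (the assembly is only
   for illustration):

     mov/movk:   mov  x0, #0x1234
                 movk x0, #0x5678, lsl #16
     adrp/add:   adrp x1, sym
                 add  x0, x1, :lo12:sym
     movk/movk:  movk x0, #0x1234, lsl #32
                 movk x0, #0x5678, lsl #48
     adrp/ldr:   adrp x0, sym
                 ldr  w1, [x0, #:lo12:sym]
     cmp/branch: cmp  w0, #0
                 b.ne label  */
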
11087 /* If MEM is in the form of [base+offset], extract the two parts
11088 of the address and store them in BASE and OFFSET; otherwise return false
11089 after clearing BASE and OFFSET. */
11090
11091 bool
11092 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11093 {
11094 rtx addr;
11095
11096 gcc_assert (MEM_P (mem));
11097
11098 addr = XEXP (mem, 0);
11099
11100 if (REG_P (addr))
11101 {
11102 *base = addr;
11103 *offset = const0_rtx;
11104 return true;
11105 }
11106
11107 if (GET_CODE (addr) == PLUS
11108 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11109 {
11110 *base = XEXP (addr, 0);
11111 *offset = XEXP (addr, 1);
11112 return true;
11113 }
11114
11115 *base = NULL_RTX;
11116 *offset = NULL_RTX;
11117
11118 return false;
11119 }
11120
11121 /* Types for scheduling fusion. */
11122 enum sched_fusion_type
11123 {
11124 SCHED_FUSION_NONE = 0,
11125 SCHED_FUSION_LD_SIGN_EXTEND,
11126 SCHED_FUSION_LD_ZERO_EXTEND,
11127 SCHED_FUSION_LD,
11128 SCHED_FUSION_ST,
11129 SCHED_FUSION_NUM
11130 };
11131
11132 /* If INSN is a load or store whose address is of the form [base+offset],
11133 extract the two parts and store them in BASE and OFFSET. Return the
11134 scheduling fusion type of this INSN. */
11135
11136 static enum sched_fusion_type
11137 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11138 {
11139 rtx x, dest, src;
11140 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11141
11142 gcc_assert (INSN_P (insn));
11143 x = PATTERN (insn);
11144 if (GET_CODE (x) != SET)
11145 return SCHED_FUSION_NONE;
11146
11147 src = SET_SRC (x);
11148 dest = SET_DEST (x);
11149
11150 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11151 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11152 return SCHED_FUSION_NONE;
11153
11154 if (GET_CODE (src) == SIGN_EXTEND)
11155 {
11156 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11157 src = XEXP (src, 0);
11158 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11159 return SCHED_FUSION_NONE;
11160 }
11161 else if (GET_CODE (src) == ZERO_EXTEND)
11162 {
11163 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11164 src = XEXP (src, 0);
11165 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11166 return SCHED_FUSION_NONE;
11167 }
11168
11169 if (GET_CODE (src) == MEM && REG_P (dest))
11170 extract_base_offset_in_addr (src, base, offset);
11171 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11172 {
11173 fusion = SCHED_FUSION_ST;
11174 extract_base_offset_in_addr (dest, base, offset);
11175 }
11176 else
11177 return SCHED_FUSION_NONE;
11178
11179 if (*base == NULL_RTX || *offset == NULL_RTX)
11180 fusion = SCHED_FUSION_NONE;
11181
11182 return fusion;
11183 }
11184
11185 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11186
11187 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11188 and PRI are only calculated for these instructions. For other instructions,
11189 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
11190 other instruction types can be added by returning different priorities.
11191
11192 It's important that irrelevant instructions get the largest FUSION_PRI. */
11193
11194 static void
11195 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11196 int *fusion_pri, int *pri)
11197 {
11198 int tmp, off_val;
11199 rtx base, offset;
11200 enum sched_fusion_type fusion;
11201
11202 gcc_assert (INSN_P (insn));
11203
11204 tmp = max_pri - 1;
11205 fusion = fusion_load_store (insn, &base, &offset);
11206 if (fusion == SCHED_FUSION_NONE)
11207 {
11208 *pri = tmp;
11209 *fusion_pri = tmp;
11210 return;
11211 }
11212
11213 /* Set FUSION_PRI according to fusion type and base register. */
11214 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11215
11216 /* Calculate PRI. */
11217 tmp /= 2;
11218
11219 /* INSN with smaller offset goes first. */
11220 off_val = (int)(INTVAL (offset));
11221 if (off_val >= 0)
11222 tmp -= (off_val & 0xfffff);
11223 else
11224 tmp += ((- off_val) & 0xfffff);
11225
11226 *pri = tmp;
11227 return;
11228 }
11229
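/* Illustrative example: two SImode loads from the same base register
   at offsets 4 and 8 get the same FUSION_PRI (same fusion type and
   same base), while the load at offset 4 gets the larger PRI and is
   therefore placed first, which lets the pair be merged into an LDP
   later.  */
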
11230 /* Given OPERANDS of consecutive load/store, check if we can merge
11231 them into ldp/stp. LOAD is true if they are load instructions.
11232 MODE is the mode of memory operands. */
11233
11234 bool
11235 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11236 enum machine_mode mode)
11237 {
11238 HOST_WIDE_INT offval_1, offval_2, msize;
11239 enum reg_class rclass_1, rclass_2;
11240 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11241
11242 if (load)
11243 {
11244 mem_1 = operands[1];
11245 mem_2 = operands[3];
11246 reg_1 = operands[0];
11247 reg_2 = operands[2];
11248 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11249 if (REGNO (reg_1) == REGNO (reg_2))
11250 return false;
11251 }
11252 else
11253 {
11254 mem_1 = operands[0];
11255 mem_2 = operands[2];
11256 reg_1 = operands[1];
11257 reg_2 = operands[3];
11258 }
11259
11260 /* The mems cannot be volatile. */
11261 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11262 return false;
11263
11264 /* Check if the addresses are in the form of [base+offset]. */
11265 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11266 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11267 return false;
11268 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11269 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11270 return false;
11271
11272 /* Check if the bases are the same. */
11273 if (!rtx_equal_p (base_1, base_2))
11274 return false;
11275
11276 offval_1 = INTVAL (offset_1);
11277 offval_2 = INTVAL (offset_2);
11278 msize = GET_MODE_SIZE (mode);
11279 /* Check if the offsets are consecutive. */
11280 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11281 return false;
11282
11283 /* Check if the addresses are clobbered by load. */
11284 if (load)
11285 {
11286 if (reg_mentioned_p (reg_1, mem_1))
11287 return false;
11288
11289 /* In increasing order, the last load can clobber the address. */
11290 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11291 return false;
11292 }
11293
11294 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11295 rclass_1 = FP_REGS;
11296 else
11297 rclass_1 = GENERAL_REGS;
11298
11299 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11300 rclass_2 = FP_REGS;
11301 else
11302 rclass_2 = GENERAL_REGS;
11303
11304 /* Check if the registers are of the same class. */
11305 if (rclass_1 != rclass_2)
11306 return false;
11307
11308 return true;
11309 }
11310
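/* Illustrative example: the consecutive loads

     ldr w0, [x2]
     ldr w1, [x2, #4]

   satisfy the checks above (same base, consecutive SImode offsets,
   same register class) and can be merged into "ldp w0, w1, [x2]".
   They would be rejected if, for example, the first destination also
   appeared in its own address, or if one destination were an FP
   register while the other was a general register.  */
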
11311 /* Given OPERANDS of consecutive load/store, check if we can merge
11312 them into ldp/stp by adjusting the offset. LOAD is true if they
11313 are load instructions. MODE is the mode of memory operands.
11314
11315 Given the following consecutive stores:
11316
11317 str w1, [xb, 0x100]
11318 str w1, [xb, 0x104]
11319 str w1, [xb, 0x108]
11320 str w1, [xb, 0x10c]
11321
11322 Though the offsets are out of the range supported by stp, we can
11323 still pair them after adjusting the offset, like:
11324
11325 add scratch, xb, 0x100
11326 stp w1, w1, [scratch]
11327 stp w1, w1, [scratch, 0x8]
11328
11329 The peephole patterns detecting this opportunity should guarantee
11330 the scratch register is available. */
11331
11332 bool
11333 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11334 enum machine_mode mode)
11335 {
11336 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11337 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11338 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11339 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11340
11341 if (load)
11342 {
11343 reg_1 = operands[0];
11344 mem_1 = operands[1];
11345 reg_2 = operands[2];
11346 mem_2 = operands[3];
11347 reg_3 = operands[4];
11348 mem_3 = operands[5];
11349 reg_4 = operands[6];
11350 mem_4 = operands[7];
11351 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11352 && REG_P (reg_3) && REG_P (reg_4));
11353 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11354 return false;
11355 }
11356 else
11357 {
11358 mem_1 = operands[0];
11359 reg_1 = operands[1];
11360 mem_2 = operands[2];
11361 reg_2 = operands[3];
11362 mem_3 = operands[4];
11363 reg_3 = operands[5];
11364 mem_4 = operands[6];
11365 reg_4 = operands[7];
11366 }
11367 /* Skip if the memory operand is by itself valid for ldp/stp. */
11368 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11369 return false;
11370
11371 /* The mems cannot be volatile. */
11372 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11373 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11374 return false;
11375
11376 /* Check if the addresses are in the form of [base+offset]. */
11377 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11378 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11379 return false;
11380 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11381 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11382 return false;
11383 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11384 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11385 return false;
11386 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11387 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11388 return false;
11389
11390 /* Check if the bases are the same. */
11391 if (!rtx_equal_p (base_1, base_2)
11392 || !rtx_equal_p (base_2, base_3)
11393 || !rtx_equal_p (base_3, base_4))
11394 return false;
11395
11396 offval_1 = INTVAL (offset_1);
11397 offval_2 = INTVAL (offset_2);
11398 offval_3 = INTVAL (offset_3);
11399 offval_4 = INTVAL (offset_4);
11400 msize = GET_MODE_SIZE (mode);
11401 /* Check if the offsets are consecutive. */
11402 if ((offval_1 != (offval_2 + msize)
11403 || offval_1 != (offval_3 + msize * 2)
11404 || offval_1 != (offval_4 + msize * 3))
11405 && (offval_4 != (offval_3 + msize)
11406 || offval_4 != (offval_2 + msize * 2)
11407 || offval_4 != (offval_1 + msize * 3)))
11408 return false;
11409
11410 /* Check if the addresses are clobbered by load. */
11411 if (load)
11412 {
11413 if (reg_mentioned_p (reg_1, mem_1)
11414 || reg_mentioned_p (reg_2, mem_2)
11415 || reg_mentioned_p (reg_3, mem_3))
11416 return false;
11417
11418 /* In increasing order, the last load can clobber the address. */
11419 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11420 return false;
11421 }
11422
11423 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11424 rclass_1 = FP_REGS;
11425 else
11426 rclass_1 = GENERAL_REGS;
11427
11428 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11429 rclass_2 = FP_REGS;
11430 else
11431 rclass_2 = GENERAL_REGS;
11432
11433 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11434 rclass_3 = FP_REGS;
11435 else
11436 rclass_3 = GENERAL_REGS;
11437
11438 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11439 rclass_4 = FP_REGS;
11440 else
11441 rclass_4 = GENERAL_REGS;
11442
11443 /* Check if the registers are of the same class. */
11444 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11445 return false;
11446
11447 return true;
11448 }
11449
11450 /* Given OPERANDS of consecutive load/store, this function pairs them
11451 into ldp/stp after adjusting the offset. It depends on the fact
11452 that addresses of load/store instructions are in increasing order.
11453 MODE is the mode of memory operands. CODE is the rtl operator
11454 which should be applied to all memory operands; it is SIGN_EXTEND,
11455 ZERO_EXTEND or UNKNOWN. */
11456
11457 bool
11458 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11459 enum machine_mode mode, RTX_CODE code)
11460 {
11461 rtx base, offset, t1, t2;
11462 rtx mem_1, mem_2, mem_3, mem_4;
11463 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11464
11465 if (load)
11466 {
11467 mem_1 = operands[1];
11468 mem_2 = operands[3];
11469 mem_3 = operands[5];
11470 mem_4 = operands[7];
11471 }
11472 else
11473 {
11474 mem_1 = operands[0];
11475 mem_2 = operands[2];
11476 mem_3 = operands[4];
11477 mem_4 = operands[6];
11478 gcc_assert (code == UNKNOWN);
11479 }
11480
11481 extract_base_offset_in_addr (mem_1, &base, &offset);
11482 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11483
11484 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11485 msize = GET_MODE_SIZE (mode);
11486 stp_off_limit = msize * 0x40;
11487 off_val = INTVAL (offset);
11488 abs_off = (off_val < 0) ? -off_val : off_val;
11489 new_off = abs_off % stp_off_limit;
11490 adj_off = abs_off - new_off;
11491
11492 /* Further adjust to make sure all offsets are OK. */
11493 if ((new_off + msize * 2) >= stp_off_limit)
11494 {
11495 adj_off += stp_off_limit;
11496 new_off -= stp_off_limit;
11497 }
11498
11499 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11500 if (adj_off >= 0x1000)
11501 return false;
11502
11503 if (off_val < 0)
11504 {
11505 adj_off = -adj_off;
11506 new_off = -new_off;
11507 }
11508
11509 /* Create new memory references. */
11510 mem_1 = change_address (mem_1, VOIDmode,
11511 plus_constant (DImode, operands[8], new_off));
11512
11513 /* Check if the adjusted address is OK for ldp/stp. */
11514 if (!aarch64_mem_pair_operand (mem_1, mode))
11515 return false;
11516
11517 msize = GET_MODE_SIZE (mode);
11518 mem_2 = change_address (mem_2, VOIDmode,
11519 plus_constant (DImode,
11520 operands[8],
11521 new_off + msize));
11522 mem_3 = change_address (mem_3, VOIDmode,
11523 plus_constant (DImode,
11524 operands[8],
11525 new_off + msize * 2));
11526 mem_4 = change_address (mem_4, VOIDmode,
11527 plus_constant (DImode,
11528 operands[8],
11529 new_off + msize * 3));
11530
11531 if (code == ZERO_EXTEND)
11532 {
11533 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11534 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11535 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11536 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11537 }
11538 else if (code == SIGN_EXTEND)
11539 {
11540 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11541 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11542 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11543 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11544 }
11545
11546 if (load)
11547 {
11548 operands[1] = mem_1;
11549 operands[3] = mem_2;
11550 operands[5] = mem_3;
11551 operands[7] = mem_4;
11552 }
11553 else
11554 {
11555 operands[0] = mem_1;
11556 operands[2] = mem_2;
11557 operands[4] = mem_3;
11558 operands[6] = mem_4;
11559 }
11560
11561 /* Emit adjusting instruction. */
11562 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11563 /* Emit ldp/stp instructions. */
11564 t1 = gen_rtx_SET (operands[0], operands[1]);
11565 t2 = gen_rtx_SET (operands[2], operands[3]);
11566 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11567 t1 = gen_rtx_SET (operands[4], operands[5]);
11568 t2 = gen_rtx_SET (operands[6], operands[7]);
11569 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11570 return true;
11571 }
11572
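/* Illustrative example of the offset adjustment above: for SImode
   (msize == 4) the limit is 4 * 0x40 == 256, so four stores at
   base + 0x108 ... base + 0x114 give new_off == 8 and adj_off == 256;
   the emitted sequence is then roughly

     add  scratch, base, #256
     stp  w?, w?, [scratch, #8]
     stp  w?, w?, [scratch, #16]

   with the scratch register supplied by operands[8].  */
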
11573 #undef TARGET_ADDRESS_COST
11574 #define TARGET_ADDRESS_COST aarch64_address_cost
11575
11576 /* This hook determines whether unnamed bitfields affect the alignment
11577 of the containing structure. The hook returns true if the structure
11578 should inherit the alignment requirements of an unnamed bitfield's
11579 type. */
11580 #undef TARGET_ALIGN_ANON_BITFIELD
11581 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11582
11583 #undef TARGET_ASM_ALIGNED_DI_OP
11584 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11585
11586 #undef TARGET_ASM_ALIGNED_HI_OP
11587 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11588
11589 #undef TARGET_ASM_ALIGNED_SI_OP
11590 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11591
11592 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11593 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11594 hook_bool_const_tree_hwi_hwi_const_tree_true
11595
11596 #undef TARGET_ASM_FILE_START
11597 #define TARGET_ASM_FILE_START aarch64_start_file
11598
11599 #undef TARGET_ASM_OUTPUT_MI_THUNK
11600 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11601
11602 #undef TARGET_ASM_SELECT_RTX_SECTION
11603 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11604
11605 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11606 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11607
11608 #undef TARGET_BUILD_BUILTIN_VA_LIST
11609 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11610
11611 #undef TARGET_CALLEE_COPIES
11612 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11613
11614 #undef TARGET_CAN_ELIMINATE
11615 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11616
11617 #undef TARGET_CANNOT_FORCE_CONST_MEM
11618 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11619
11620 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11621 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11622
11623 /* Only the least significant bit is used for initialization guard
11624 variables. */
11625 #undef TARGET_CXX_GUARD_MASK_BIT
11626 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11627
11628 #undef TARGET_C_MODE_FOR_SUFFIX
11629 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11630
11631 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11632 #undef TARGET_DEFAULT_TARGET_FLAGS
11633 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11634 #endif
11635
11636 #undef TARGET_CLASS_MAX_NREGS
11637 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11638
11639 #undef TARGET_BUILTIN_DECL
11640 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11641
11642 #undef TARGET_EXPAND_BUILTIN
11643 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11644
11645 #undef TARGET_EXPAND_BUILTIN_VA_START
11646 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11647
11648 #undef TARGET_FOLD_BUILTIN
11649 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11650
11651 #undef TARGET_FUNCTION_ARG
11652 #define TARGET_FUNCTION_ARG aarch64_function_arg
11653
11654 #undef TARGET_FUNCTION_ARG_ADVANCE
11655 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11656
11657 #undef TARGET_FUNCTION_ARG_BOUNDARY
11658 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11659
11660 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11661 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11662
11663 #undef TARGET_FUNCTION_VALUE
11664 #define TARGET_FUNCTION_VALUE aarch64_function_value
11665
11666 #undef TARGET_FUNCTION_VALUE_REGNO_P
11667 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11668
11669 #undef TARGET_FRAME_POINTER_REQUIRED
11670 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11671
11672 #undef TARGET_GIMPLE_FOLD_BUILTIN
11673 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11674
11675 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11676 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11677
11678 #undef TARGET_INIT_BUILTINS
11679 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11680
11681 #undef TARGET_LEGITIMATE_ADDRESS_P
11682 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11683
11684 #undef TARGET_LEGITIMATE_CONSTANT_P
11685 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11686
11687 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11688 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11689
11690 #undef TARGET_LRA_P
11691 #define TARGET_LRA_P hook_bool_void_true
11692
11693 #undef TARGET_MANGLE_TYPE
11694 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11695
11696 #undef TARGET_MEMORY_MOVE_COST
11697 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11698
11699 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11700 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11701
11702 #undef TARGET_MUST_PASS_IN_STACK
11703 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11704
11705 /* This target hook should return true if accesses to volatile bitfields
11706 should use the narrowest mode possible. It should return false if these
11707 accesses should use the bitfield container type. */
11708 #undef TARGET_NARROW_VOLATILE_BITFIELD
11709 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11710
11711 #undef TARGET_OPTION_OVERRIDE
11712 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11713
11714 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11715 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11716 aarch64_override_options_after_change
11717
11718 #undef TARGET_PASS_BY_REFERENCE
11719 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11720
11721 #undef TARGET_PREFERRED_RELOAD_CLASS
11722 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11723
11724 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11725 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11726
11727 #undef TARGET_SECONDARY_RELOAD
11728 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11729
11730 #undef TARGET_SHIFT_TRUNCATION_MASK
11731 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11732
11733 #undef TARGET_SETUP_INCOMING_VARARGS
11734 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11735
11736 #undef TARGET_STRUCT_VALUE_RTX
11737 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11738
11739 #undef TARGET_REGISTER_MOVE_COST
11740 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11741
11742 #undef TARGET_RETURN_IN_MEMORY
11743 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11744
11745 #undef TARGET_RETURN_IN_MSB
11746 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11747
11748 #undef TARGET_RTX_COSTS
11749 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11750
11751 #undef TARGET_SCHED_ISSUE_RATE
11752 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11753
11754 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11755 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11756 aarch64_sched_first_cycle_multipass_dfa_lookahead
11757
11758 #undef TARGET_TRAMPOLINE_INIT
11759 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11760
11761 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11762 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11763
11764 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11765 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11766
11767 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11768 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11769
11770 #undef TARGET_VECTORIZE_ADD_STMT_COST
11771 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11772
11773 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11774 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11775 aarch64_builtin_vectorization_cost
11776
11777 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11778 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11779
11780 #undef TARGET_VECTORIZE_BUILTINS
11781 #define TARGET_VECTORIZE_BUILTINS
11782
11783 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11784 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11785 aarch64_builtin_vectorized_function
11786
11787 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11788 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11789 aarch64_autovectorize_vector_sizes
11790
11791 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11792 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11793 aarch64_atomic_assign_expand_fenv
11794
11795 /* Section anchor support. */
11796
11797 #undef TARGET_MIN_ANCHOR_OFFSET
11798 #define TARGET_MIN_ANCHOR_OFFSET -256
11799
11800 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11801 byte offset; we can do much more for larger data types, but have no way
11802 to determine the size of the access. We assume accesses are aligned. */
11803 #undef TARGET_MAX_ANCHOR_OFFSET
11804 #define TARGET_MAX_ANCHOR_OFFSET 4095
11805
11806 #undef TARGET_VECTOR_ALIGNMENT
11807 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11808
11809 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11810 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11811 aarch64_simd_vector_alignment_reachable
11812
11813 /* vec_perm support. */
11814
11815 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11816 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11817 aarch64_vectorize_vec_perm_const_ok
11818
11819
11820 #undef TARGET_FIXED_CONDITION_CODE_REGS
11821 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11822
11823 #undef TARGET_FLAGS_REGNUM
11824 #define TARGET_FLAGS_REGNUM CC_REGNUM
11825
11826 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11827 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11828
11829 #undef TARGET_ASAN_SHADOW_OFFSET
11830 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11831
11832 #undef TARGET_LEGITIMIZE_ADDRESS
11833 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11834
11835 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11836 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11837 aarch64_use_by_pieces_infrastructure_p
11838
11839 #undef TARGET_CAN_USE_DOLOOP_P
11840 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11841
11842 #undef TARGET_SCHED_MACRO_FUSION_P
11843 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11844
11845 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11846 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11847
11848 #undef TARGET_SCHED_FUSION_PRIORITY
11849 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11850
11851 struct gcc_target targetm = TARGET_INITIALIZER;
11852
11853 #include "gt-aarch64.h"